1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #include "spdk/bdev.h" 10 11 #include "spdk/accel.h" 12 #include "spdk/config.h" 13 #include "spdk/env.h" 14 #include "spdk/thread.h" 15 #include "spdk/likely.h" 16 #include "spdk/queue.h" 17 #include "spdk/nvme_spec.h" 18 #include "spdk/scsi_spec.h" 19 #include "spdk/notify.h" 20 #include "spdk/util.h" 21 #include "spdk/trace.h" 22 #include "spdk/dma.h" 23 24 #include "spdk/bdev_module.h" 25 #include "spdk/log.h" 26 #include "spdk/string.h" 27 28 #include "bdev_internal.h" 29 #include "spdk_internal/trace_defs.h" 30 #include "spdk_internal/assert.h" 31 32 #ifdef SPDK_CONFIG_VTUNE 33 #include "ittnotify.h" 34 #include "ittnotify_types.h" 35 int __itt_init_ittlib(const char *, __itt_group_id); 36 #endif 37 38 #define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1) 39 #define SPDK_BDEV_IO_CACHE_SIZE 256 40 #define SPDK_BDEV_AUTO_EXAMINE true 41 #define BUF_SMALL_CACHE_SIZE 128 42 #define BUF_LARGE_CACHE_SIZE 16 43 #define NOMEM_THRESHOLD_COUNT 8 44 45 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 46 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 47 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512 48 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000 49 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024) 50 #define SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC (UINT64_MAX / (1024 * 1024)) 51 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX 52 #define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000 53 54 /* The maximum number of children requests for a UNMAP or WRITE ZEROES command 55 * when splitting into children requests at a time. 56 */ 57 #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8) 58 #define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000 59 60 /* The maximum number of children requests for a COPY command 61 * when splitting into children requests at a time. 
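 * As an illustrative reading of this constant (not a statement of the exact splitting
 * algorithm): a copy that must be split into, say, 32 child requests because of the
 * bdev's max_copy limit keeps at most 8 of those children outstanding at once, submitting
 * the next child as an earlier one completes, mirroring the UNMAP/WRITE ZEROES limit above.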
62 */ 63 #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8) 64 65 #define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \ 66 log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev) 67 #ifdef DEBUG 68 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \ 69 log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev) 70 #else 71 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0) 72 #endif 73 74 static void log_already_claimed(enum spdk_log_level level, const int line, const char *func, 75 const char *detail, struct spdk_bdev *bdev); 76 77 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 78 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 79 }; 80 81 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 82 83 RB_HEAD(bdev_name_tree, spdk_bdev_name); 84 85 static int 86 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2) 87 { 88 return strcmp(name1->name, name2->name); 89 } 90 91 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp); 92 93 struct spdk_bdev_mgr { 94 struct spdk_mempool *bdev_io_pool; 95 96 void *zero_buffer; 97 98 TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 99 100 struct spdk_bdev_list bdevs; 101 struct bdev_name_tree bdev_names; 102 103 bool init_complete; 104 bool module_init_complete; 105 106 struct spdk_spinlock spinlock; 107 108 TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens; 109 110 #ifdef SPDK_CONFIG_VTUNE 111 __itt_domain *domain; 112 #endif 113 }; 114 115 static struct spdk_bdev_mgr g_bdev_mgr = { 116 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 117 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 118 .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names), 119 .init_complete = false, 120 .module_init_complete = false, 121 .async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens), 122 }; 123 124 static void 125 __attribute__((constructor)) 126 _bdev_init(void) 127 { 128 spdk_spin_init(&g_bdev_mgr.spinlock); 129 } 130 131 typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status); 132 133 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc); 134 135 struct lba_range { 136 struct spdk_bdev *bdev; 137 uint64_t offset; 138 uint64_t length; 139 bool quiesce; 140 void *locked_ctx; 141 struct spdk_thread *owner_thread; 142 struct spdk_bdev_channel *owner_ch; 143 TAILQ_ENTRY(lba_range) tailq; 144 TAILQ_ENTRY(lba_range) tailq_module; 145 }; 146 147 static struct spdk_bdev_opts g_bdev_opts = { 148 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 149 .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, 150 .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, 151 .iobuf_small_cache_size = BUF_SMALL_CACHE_SIZE, 152 .iobuf_large_cache_size = BUF_LARGE_CACHE_SIZE, 153 }; 154 155 static spdk_bdev_init_cb g_init_cb_fn = NULL; 156 static void *g_init_cb_arg = NULL; 157 158 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 159 static void *g_fini_cb_arg = NULL; 160 static struct spdk_thread *g_fini_thread = NULL; 161 162 struct spdk_bdev_qos_limit { 163 /** IOs or bytes allowed per second (i.e., 1s). */ 164 uint64_t limit; 165 166 /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). 167 * For remaining bytes, allowed to run negative if an I/O is submitted when 168 * some bytes are remaining, but the I/O is bigger than that amount. The 169 * excess will be deducted from the next timeslice. 170 */ 171 int64_t remaining_this_timeslice; 172 173 /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). 
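 * Hedged example of how the per-timeslice budget relates to the limit above: with the
 * 1 ms timeslice, an assigned limit of 10000 IO/s works out to roughly 10000 / 1000 = 10
 * I/Os per timeslice, while a limit low enough to compute to a zero budget is clamped up
 * to this minimum so the queue can still make forward progress.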
*/ 174 uint32_t min_per_timeslice; 175 176 /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 177 uint32_t max_per_timeslice; 178 179 /** Function to check whether to queue the IO. 180 * If The IO is allowed to pass, the quota will be reduced correspondingly. 181 */ 182 bool (*queue_io)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 183 184 /** Function to rewind the quota once the IO was allowed to be sent by this 185 * limit but queued due to one of the further limits. 186 */ 187 void (*rewind_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 188 }; 189 190 struct spdk_bdev_qos { 191 /** Types of structure of rate limits. */ 192 struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 193 194 /** The channel that all I/O are funneled through. */ 195 struct spdk_bdev_channel *ch; 196 197 /** The thread on which the poller is running. */ 198 struct spdk_thread *thread; 199 200 /** Size of a timeslice in tsc ticks. */ 201 uint64_t timeslice_size; 202 203 /** Timestamp of start of last timeslice. */ 204 uint64_t last_timeslice; 205 206 /** Poller that processes queued I/O commands each time slice. */ 207 struct spdk_poller *poller; 208 }; 209 210 struct spdk_bdev_mgmt_channel { 211 /* 212 * Each thread keeps a cache of bdev_io - this allows 213 * bdev threads which are *not* DPDK threads to still 214 * benefit from a per-thread bdev_io cache. Without 215 * this, non-DPDK threads fetching from the mempool 216 * incur a cmpxchg on get and put. 217 */ 218 bdev_io_stailq_t per_thread_cache; 219 uint32_t per_thread_cache_count; 220 uint32_t bdev_io_cache_size; 221 222 struct spdk_iobuf_channel iobuf; 223 224 TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; 225 TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; 226 }; 227 228 /* 229 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device 230 * will queue here their IO that awaits retry. It makes it possible to retry sending 231 * IO to one bdev after IO from other bdev completes. 232 */ 233 struct spdk_bdev_shared_resource { 234 /* The bdev management channel */ 235 struct spdk_bdev_mgmt_channel *mgmt_ch; 236 237 /* 238 * Count of I/O submitted to bdev module and waiting for completion. 239 * Incremented before submit_request() is called on an spdk_bdev_io. 240 */ 241 uint64_t io_outstanding; 242 243 /* 244 * Queue of IO awaiting retry because of a previous NOMEM status returned 245 * on this channel. 246 */ 247 bdev_io_tailq_t nomem_io; 248 249 /* 250 * Threshold which io_outstanding must drop to before retrying nomem_io. 251 */ 252 uint64_t nomem_threshold; 253 254 /* I/O channel allocated by a bdev module */ 255 struct spdk_io_channel *shared_ch; 256 257 struct spdk_poller *nomem_poller; 258 259 /* Refcount of bdev channels using this resource */ 260 uint32_t ref; 261 262 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 263 }; 264 265 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 266 #define BDEV_CH_QOS_ENABLED (1 << 1) 267 268 struct spdk_bdev_channel { 269 struct spdk_bdev *bdev; 270 271 /* The channel for the underlying device */ 272 struct spdk_io_channel *channel; 273 274 /* Accel channel */ 275 struct spdk_io_channel *accel_channel; 276 277 /* Per io_device per thread data */ 278 struct spdk_bdev_shared_resource *shared_resource; 279 280 struct spdk_bdev_io_stat *stat; 281 282 /* 283 * Count of I/O submitted to the underlying dev module through this channel 284 * and waiting for completion. 
285 */ 286 uint64_t io_outstanding; 287 288 /* 289 * List of all submitted I/Os including I/O that are generated via splitting. 290 */ 291 bdev_io_tailq_t io_submitted; 292 293 /* 294 * List of spdk_bdev_io that are currently queued because they write to a locked 295 * LBA range. 296 */ 297 bdev_io_tailq_t io_locked; 298 299 /* List of I/Os with accel sequence being currently executed */ 300 bdev_io_tailq_t io_accel_exec; 301 302 /* List of I/Os doing memory domain pull/push */ 303 bdev_io_tailq_t io_memory_domain; 304 305 uint32_t flags; 306 307 /* Counts number of bdev_io in the io_submitted TAILQ */ 308 uint16_t queue_depth; 309 310 uint16_t trace_id; 311 312 struct spdk_histogram_data *histogram; 313 314 #ifdef SPDK_CONFIG_VTUNE 315 uint64_t start_tsc; 316 uint64_t interval_tsc; 317 __itt_string_handle *handle; 318 struct spdk_bdev_io_stat *prev_stat; 319 #endif 320 321 lba_range_tailq_t locked_ranges; 322 323 /** List of I/Os queued by QoS. */ 324 bdev_io_tailq_t qos_queued_io; 325 }; 326 327 struct media_event_entry { 328 struct spdk_bdev_media_event event; 329 TAILQ_ENTRY(media_event_entry) tailq; 330 }; 331 332 #define MEDIA_EVENT_POOL_SIZE 64 333 334 struct spdk_bdev_desc { 335 struct spdk_bdev *bdev; 336 struct spdk_thread *thread; 337 struct { 338 spdk_bdev_event_cb_t event_fn; 339 void *ctx; 340 } callback; 341 bool closed; 342 bool write; 343 bool memory_domains_supported; 344 bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES]; 345 struct spdk_spinlock spinlock; 346 uint32_t refs; 347 TAILQ_HEAD(, media_event_entry) pending_media_events; 348 TAILQ_HEAD(, media_event_entry) free_media_events; 349 struct media_event_entry *media_events_buffer; 350 TAILQ_ENTRY(spdk_bdev_desc) link; 351 352 uint64_t timeout_in_sec; 353 spdk_bdev_io_timeout_cb cb_fn; 354 void *cb_arg; 355 struct spdk_poller *io_timeout_poller; 356 struct spdk_bdev_module_claim *claim; 357 }; 358 359 struct spdk_bdev_iostat_ctx { 360 struct spdk_bdev_io_stat *stat; 361 enum spdk_bdev_reset_stat_mode reset_mode; 362 spdk_bdev_get_device_stat_cb cb; 363 void *cb_arg; 364 }; 365 366 struct set_qos_limit_ctx { 367 void (*cb_fn)(void *cb_arg, int status); 368 void *cb_arg; 369 struct spdk_bdev *bdev; 370 }; 371 372 struct spdk_bdev_channel_iter { 373 spdk_bdev_for_each_channel_msg fn; 374 spdk_bdev_for_each_channel_done cpl; 375 struct spdk_io_channel_iter *i; 376 void *ctx; 377 }; 378 379 struct spdk_bdev_io_error_stat { 380 uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS]; 381 }; 382 383 enum bdev_io_retry_state { 384 BDEV_IO_RETRY_STATE_INVALID, 385 BDEV_IO_RETRY_STATE_PULL, 386 BDEV_IO_RETRY_STATE_PULL_MD, 387 BDEV_IO_RETRY_STATE_SUBMIT, 388 BDEV_IO_RETRY_STATE_PUSH, 389 BDEV_IO_RETRY_STATE_PUSH_MD, 390 }; 391 392 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 393 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 394 #define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch)) 395 #define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch)) 396 397 static inline void bdev_io_complete(void *ctx); 398 static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io); 399 static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io); 400 static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io); 401 402 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 403 static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io); 404 405 
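/*
 * Aside on the __bdev_to_io_dev()/__bdev_from_io_dev() macros defined above: they shift
 * the bdev pointer by one byte so that the handle registered with the io_device layer is
 * distinct from the raw bdev pointer while still being losslessly recoverable. A minimal
 * sketch of the round trip (illustration only, not code used by the library; "Malloc0" is
 * just an example bdev name):
 *
 *     struct spdk_bdev *bdev = spdk_bdev_get_by_name("Malloc0");
 *     void *io_dev = __bdev_to_io_dev(bdev);
 *
 *     assert(io_dev != (void *)bdev);
 *     assert(__bdev_from_io_dev(io_dev) == bdev);
 */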
static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 406 struct spdk_io_channel *ch, void *_ctx); 407 static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status); 408 409 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 410 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 411 uint64_t num_blocks, 412 struct spdk_memory_domain *domain, void *domain_ctx, 413 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 414 spdk_bdev_io_completion_cb cb, void *cb_arg); 415 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 416 struct iovec *iov, int iovcnt, void *md_buf, 417 uint64_t offset_blocks, uint64_t num_blocks, 418 struct spdk_memory_domain *domain, void *domain_ctx, 419 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 420 uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw, 421 spdk_bdev_io_completion_cb cb, void *cb_arg); 422 423 static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 424 uint64_t offset, uint64_t length, 425 lock_range_cb cb_fn, void *cb_arg); 426 427 static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 428 uint64_t offset, uint64_t length, 429 lock_range_cb cb_fn, void *cb_arg); 430 431 static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort); 432 static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort); 433 434 static bool claim_type_is_v2(enum spdk_bdev_claim_type type); 435 static void bdev_desc_release_claims(struct spdk_bdev_desc *desc); 436 static void claim_reset(struct spdk_bdev *bdev); 437 438 static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch); 439 440 static bool bdev_io_should_split(struct spdk_bdev_io *bdev_io); 441 442 #define bdev_get_ext_io_opt(opts, field, defval) \ 443 ((opts) != NULL ? SPDK_GET_FIELD(opts, field, defval) : (defval)) 444 445 static inline void 446 bdev_ch_add_to_io_submitted(struct spdk_bdev_io *bdev_io) 447 { 448 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 449 bdev_io->internal.ch->queue_depth++; 450 } 451 452 static inline void 453 bdev_ch_remove_from_io_submitted(struct spdk_bdev_io *bdev_io) 454 { 455 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 456 bdev_io->internal.ch->queue_depth--; 457 } 458 459 void 460 spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size) 461 { 462 if (!opts) { 463 SPDK_ERRLOG("opts should not be NULL\n"); 464 return; 465 } 466 467 if (!opts_size) { 468 SPDK_ERRLOG("opts_size should not be zero value\n"); 469 return; 470 } 471 472 opts->opts_size = opts_size; 473 474 #define SET_FIELD(field) \ 475 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \ 476 opts->field = g_bdev_opts.field; \ 477 } \ 478 479 SET_FIELD(bdev_io_pool_size); 480 SET_FIELD(bdev_io_cache_size); 481 SET_FIELD(bdev_auto_examine); 482 SET_FIELD(iobuf_small_cache_size); 483 SET_FIELD(iobuf_large_cache_size); 484 485 /* Do not remove this statement, you should always update this statement when you adding a new field, 486 * and do not forget to add the SET_FIELD statement for your added field. 
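 * (Sketch of why opts_size drives this: SET_FIELD only copies a field when
 * offsetof() + sizeof() for that field fits inside the caller-provided opts_size, so a
 * caller compiled against an older, smaller struct spdk_bdev_opts simply never has the
 * newer trailing fields read or written. The static assert below is the reminder to
 * revisit both the expected size and the SET_FIELD list whenever the struct grows.)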
*/ 487 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size"); 488 489 #undef SET_FIELD 490 } 491 492 int 493 spdk_bdev_set_opts(struct spdk_bdev_opts *opts) 494 { 495 uint32_t min_pool_size; 496 497 if (!opts) { 498 SPDK_ERRLOG("opts cannot be NULL\n"); 499 return -1; 500 } 501 502 if (!opts->opts_size) { 503 SPDK_ERRLOG("opts_size inside opts cannot be zero value\n"); 504 return -1; 505 } 506 507 /* 508 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem 509 * initialization. A second mgmt_ch will be created on the same thread when the application starts 510 * but before the deferred put_io_channel event is executed for the first mgmt_ch. 511 */ 512 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 513 if (opts->bdev_io_pool_size < min_pool_size) { 514 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 515 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 516 spdk_thread_get_count()); 517 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 518 return -1; 519 } 520 521 #define SET_FIELD(field) \ 522 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 523 g_bdev_opts.field = opts->field; \ 524 } \ 525 526 SET_FIELD(bdev_io_pool_size); 527 SET_FIELD(bdev_io_cache_size); 528 SET_FIELD(bdev_auto_examine); 529 SET_FIELD(iobuf_small_cache_size); 530 SET_FIELD(iobuf_large_cache_size); 531 532 g_bdev_opts.opts_size = opts->opts_size; 533 534 #undef SET_FIELD 535 536 return 0; 537 } 538 539 static struct spdk_bdev * 540 bdev_get_by_name(const char *bdev_name) 541 { 542 struct spdk_bdev_name find; 543 struct spdk_bdev_name *res; 544 545 find.name = (char *)bdev_name; 546 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 547 if (res != NULL) { 548 return res->bdev; 549 } 550 551 return NULL; 552 } 553 554 struct spdk_bdev * 555 spdk_bdev_get_by_name(const char *bdev_name) 556 { 557 struct spdk_bdev *bdev; 558 559 spdk_spin_lock(&g_bdev_mgr.spinlock); 560 bdev = bdev_get_by_name(bdev_name); 561 spdk_spin_unlock(&g_bdev_mgr.spinlock); 562 563 return bdev; 564 } 565 566 struct bdev_io_status_string { 567 enum spdk_bdev_io_status status; 568 const char *str; 569 }; 570 571 static const struct bdev_io_status_string bdev_io_status_strings[] = { 572 { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" }, 573 { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" }, 574 { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" }, 575 { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" }, 576 { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" }, 577 { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" }, 578 { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" }, 579 { SPDK_BDEV_IO_STATUS_FAILED, "failed" }, 580 { SPDK_BDEV_IO_STATUS_PENDING, "pending" }, 581 { SPDK_BDEV_IO_STATUS_SUCCESS, "success" }, 582 }; 583 584 static const char * 585 bdev_io_status_get_string(enum spdk_bdev_io_status status) 586 { 587 uint32_t i; 588 589 for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) { 590 if (bdev_io_status_strings[i].status == status) { 591 return bdev_io_status_strings[i].str; 592 } 593 } 594 595 return "reserved"; 596 } 597 598 struct spdk_bdev_wait_for_examine_ctx { 599 struct spdk_poller *poller; 600 spdk_bdev_wait_for_examine_cb cb_fn; 601 void *cb_arg; 602 }; 603 604 static bool bdev_module_all_actions_completed(void); 605 606 static int 607 bdev_wait_for_examine_cb(void *arg) 608 { 609 struct 
spdk_bdev_wait_for_examine_ctx *ctx = arg; 610 611 if (!bdev_module_all_actions_completed()) { 612 return SPDK_POLLER_IDLE; 613 } 614 615 spdk_poller_unregister(&ctx->poller); 616 ctx->cb_fn(ctx->cb_arg); 617 free(ctx); 618 619 return SPDK_POLLER_BUSY; 620 } 621 622 int 623 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 624 { 625 struct spdk_bdev_wait_for_examine_ctx *ctx; 626 627 ctx = calloc(1, sizeof(*ctx)); 628 if (ctx == NULL) { 629 return -ENOMEM; 630 } 631 ctx->cb_fn = cb_fn; 632 ctx->cb_arg = cb_arg; 633 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 634 635 return 0; 636 } 637 638 struct spdk_bdev_examine_item { 639 char *name; 640 TAILQ_ENTRY(spdk_bdev_examine_item) link; 641 }; 642 643 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); 644 645 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 646 g_bdev_examine_allowlist); 647 648 static inline bool 649 bdev_examine_allowlist_check(const char *name) 650 { 651 struct spdk_bdev_examine_item *item; 652 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 653 if (strcmp(name, item->name) == 0) { 654 return true; 655 } 656 } 657 return false; 658 } 659 660 static inline void 661 bdev_examine_allowlist_remove(const char *name) 662 { 663 struct spdk_bdev_examine_item *item; 664 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 665 if (strcmp(name, item->name) == 0) { 666 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 667 free(item->name); 668 free(item); 669 break; 670 } 671 } 672 } 673 674 static inline void 675 bdev_examine_allowlist_free(void) 676 { 677 struct spdk_bdev_examine_item *item; 678 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 679 item = TAILQ_FIRST(&g_bdev_examine_allowlist); 680 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 681 free(item->name); 682 free(item); 683 } 684 } 685 686 static inline bool 687 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 688 { 689 struct spdk_bdev_alias *tmp; 690 if (bdev_examine_allowlist_check(bdev->name)) { 691 return true; 692 } 693 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 694 if (bdev_examine_allowlist_check(tmp->alias.name)) { 695 return true; 696 } 697 } 698 return false; 699 } 700 701 static inline bool 702 bdev_ok_to_examine(struct spdk_bdev *bdev) 703 { 704 /* Some bdevs may not support the READ command. 705 * Do not try to examine them. 
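 * (Presumably because examine callbacks generally need to read on-disk metadata, such as
 * superblocks, to decide whether to claim the bdev; without READ support there is nothing
 * useful for them to do.)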
706 */ 707 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_READ)) { 708 return false; 709 } 710 711 if (g_bdev_opts.bdev_auto_examine) { 712 return true; 713 } else { 714 return bdev_in_examine_allowlist(bdev); 715 } 716 } 717 718 static void 719 bdev_examine(struct spdk_bdev *bdev) 720 { 721 struct spdk_bdev_module *module; 722 struct spdk_bdev_module_claim *claim, *tmpclaim; 723 uint32_t action; 724 725 if (!bdev_ok_to_examine(bdev)) { 726 return; 727 } 728 729 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 730 if (module->examine_config) { 731 spdk_spin_lock(&module->internal.spinlock); 732 action = module->internal.action_in_progress; 733 module->internal.action_in_progress++; 734 spdk_spin_unlock(&module->internal.spinlock); 735 module->examine_config(bdev); 736 if (action != module->internal.action_in_progress) { 737 SPDK_ERRLOG("examine_config for module %s did not call " 738 "spdk_bdev_module_examine_done()\n", module->name); 739 } 740 } 741 } 742 743 spdk_spin_lock(&bdev->internal.spinlock); 744 745 switch (bdev->internal.claim_type) { 746 case SPDK_BDEV_CLAIM_NONE: 747 /* Examine by all bdev modules */ 748 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 749 if (module->examine_disk) { 750 spdk_spin_lock(&module->internal.spinlock); 751 module->internal.action_in_progress++; 752 spdk_spin_unlock(&module->internal.spinlock); 753 spdk_spin_unlock(&bdev->internal.spinlock); 754 module->examine_disk(bdev); 755 spdk_spin_lock(&bdev->internal.spinlock); 756 } 757 } 758 break; 759 case SPDK_BDEV_CLAIM_EXCL_WRITE: 760 /* Examine by the one bdev module with a v1 claim */ 761 module = bdev->internal.claim.v1.module; 762 if (module->examine_disk) { 763 spdk_spin_lock(&module->internal.spinlock); 764 module->internal.action_in_progress++; 765 spdk_spin_unlock(&module->internal.spinlock); 766 spdk_spin_unlock(&bdev->internal.spinlock); 767 module->examine_disk(bdev); 768 return; 769 } 770 break; 771 default: 772 /* Examine by all bdev modules with a v2 claim */ 773 assert(claim_type_is_v2(bdev->internal.claim_type)); 774 /* 775 * Removal of tailq nodes while iterating can cause the iteration to jump out of the 776 * list, perhaps accessing freed memory. Without protection, this could happen 777 * while the lock is dropped during the examine callback. 778 */ 779 bdev->internal.examine_in_progress++; 780 781 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 782 module = claim->module; 783 784 if (module == NULL) { 785 /* This is a vestigial claim, held by examine_count */ 786 continue; 787 } 788 789 if (module->examine_disk == NULL) { 790 continue; 791 } 792 793 spdk_spin_lock(&module->internal.spinlock); 794 module->internal.action_in_progress++; 795 spdk_spin_unlock(&module->internal.spinlock); 796 797 /* Call examine_disk without holding internal.spinlock. 
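 * (The callback may claim the bdev or call other bdev APIs that take this same
 * non-recursive spinlock, so holding it across the call could self-deadlock; the
 * examine_in_progress counter taken above is what keeps the v2 claim list stable while
 * the lock is dropped.)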
*/ 798 spdk_spin_unlock(&bdev->internal.spinlock); 799 module->examine_disk(bdev); 800 spdk_spin_lock(&bdev->internal.spinlock); 801 } 802 803 assert(bdev->internal.examine_in_progress > 0); 804 bdev->internal.examine_in_progress--; 805 if (bdev->internal.examine_in_progress == 0) { 806 /* Remove any claims that were released during examine_disk */ 807 TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) { 808 if (claim->desc != NULL) { 809 continue; 810 } 811 812 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link); 813 free(claim); 814 } 815 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 816 claim_reset(bdev); 817 } 818 } 819 } 820 821 spdk_spin_unlock(&bdev->internal.spinlock); 822 } 823 824 int 825 spdk_bdev_examine(const char *name) 826 { 827 struct spdk_bdev *bdev; 828 struct spdk_bdev_examine_item *item; 829 struct spdk_thread *thread = spdk_get_thread(); 830 831 if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) { 832 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread, 833 thread ? spdk_thread_get_name(thread) : "null"); 834 return -EINVAL; 835 } 836 837 if (g_bdev_opts.bdev_auto_examine) { 838 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n"); 839 return -EINVAL; 840 } 841 842 if (bdev_examine_allowlist_check(name)) { 843 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 844 return -EEXIST; 845 } 846 847 item = calloc(1, sizeof(*item)); 848 if (!item) { 849 return -ENOMEM; 850 } 851 item->name = strdup(name); 852 if (!item->name) { 853 free(item); 854 return -ENOMEM; 855 } 856 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 857 858 bdev = spdk_bdev_get_by_name(name); 859 if (bdev) { 860 bdev_examine(bdev); 861 } 862 return 0; 863 } 864 865 static inline void 866 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 867 { 868 struct spdk_bdev_examine_item *item; 869 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 870 spdk_json_write_object_begin(w); 871 spdk_json_write_named_string(w, "method", "bdev_examine"); 872 spdk_json_write_named_object_begin(w, "params"); 873 spdk_json_write_named_string(w, "name", item->name); 874 spdk_json_write_object_end(w); 875 spdk_json_write_object_end(w); 876 } 877 } 878 879 struct spdk_bdev * 880 spdk_bdev_first(void) 881 { 882 struct spdk_bdev *bdev; 883 884 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 885 if (bdev) { 886 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 887 } 888 889 return bdev; 890 } 891 892 struct spdk_bdev * 893 spdk_bdev_next(struct spdk_bdev *prev) 894 { 895 struct spdk_bdev *bdev; 896 897 bdev = TAILQ_NEXT(prev, internal.link); 898 if (bdev) { 899 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 900 } 901 902 return bdev; 903 } 904 905 static struct spdk_bdev * 906 _bdev_next_leaf(struct spdk_bdev *bdev) 907 { 908 while (bdev != NULL) { 909 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 910 return bdev; 911 } else { 912 bdev = TAILQ_NEXT(bdev, internal.link); 913 } 914 } 915 916 return bdev; 917 } 918 919 struct spdk_bdev * 920 spdk_bdev_first_leaf(void) 921 { 922 struct spdk_bdev *bdev; 923 924 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 925 926 if (bdev) { 927 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 928 } 929 930 return bdev; 931 } 932 933 struct spdk_bdev * 934 spdk_bdev_next_leaf(struct spdk_bdev *prev) 935 { 936 struct spdk_bdev *bdev; 937 938 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 939 940 if 
(bdev) { 941 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 942 } 943 944 return bdev; 945 } 946 947 static inline bool 948 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 949 { 950 return bdev_io->internal.f.has_memory_domain; 951 } 952 953 static inline bool 954 bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io) 955 { 956 return bdev_io->internal.f.has_accel_sequence; 957 } 958 959 static inline void 960 bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource, 961 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 962 { 963 /* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io. 964 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth 965 * channels we will instead wait for half to complete. 966 */ 967 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 968 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 969 970 assert(state != BDEV_IO_RETRY_STATE_INVALID); 971 bdev_io->internal.retry_state = state; 972 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 973 } 974 975 static inline void 976 bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource, 977 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 978 { 979 /* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while 980 * the queue isn't empty, so we don't need to update the nomem_threshold here */ 981 assert(!TAILQ_EMPTY(&shared_resource->nomem_io)); 982 983 assert(state != BDEV_IO_RETRY_STATE_INVALID); 984 bdev_io->internal.retry_state = state; 985 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 986 } 987 988 void 989 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 990 { 991 struct iovec *iovs; 992 993 if (bdev_io->u.bdev.iovs == NULL) { 994 bdev_io->u.bdev.iovs = &bdev_io->iov; 995 bdev_io->u.bdev.iovcnt = 1; 996 } 997 998 iovs = bdev_io->u.bdev.iovs; 999 1000 assert(iovs != NULL); 1001 assert(bdev_io->u.bdev.iovcnt >= 1); 1002 1003 iovs[0].iov_base = buf; 1004 iovs[0].iov_len = len; 1005 } 1006 1007 void 1008 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 1009 { 1010 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 1011 bdev_io->u.bdev.md_buf = md_buf; 1012 } 1013 1014 static bool 1015 _is_buf_allocated(const struct iovec *iovs) 1016 { 1017 if (iovs == NULL) { 1018 return false; 1019 } 1020 1021 return iovs[0].iov_base != NULL; 1022 } 1023 1024 static bool 1025 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 1026 { 1027 int i; 1028 uintptr_t iov_base; 1029 1030 if (spdk_likely(alignment == 1)) { 1031 return true; 1032 } 1033 1034 for (i = 0; i < iovcnt; i++) { 1035 iov_base = (uintptr_t)iovs[i].iov_base; 1036 if ((iov_base & (alignment - 1)) != 0) { 1037 return false; 1038 } 1039 } 1040 1041 return true; 1042 } 1043 1044 static inline bool 1045 bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 1046 { 1047 if (!bdev_io_use_accel_sequence(bdev_io)) { 1048 return false; 1049 } 1050 1051 /* For now, we don't allow splitting IOs with an accel sequence and will treat them as if 1052 * bdev module didn't support accel sequences */ 1053 return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.f.split; 1054 } 1055 1056 static inline void 1057 bdev_io_increment_outstanding(struct 
spdk_bdev_channel *bdev_ch, 1058 struct spdk_bdev_shared_resource *shared_resource) 1059 { 1060 bdev_ch->io_outstanding++; 1061 shared_resource->io_outstanding++; 1062 } 1063 1064 static inline void 1065 bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1066 struct spdk_bdev_shared_resource *shared_resource) 1067 { 1068 assert(bdev_ch->io_outstanding > 0); 1069 assert(shared_resource->io_outstanding > 0); 1070 bdev_ch->io_outstanding--; 1071 shared_resource->io_outstanding--; 1072 } 1073 1074 static void 1075 bdev_io_submit_sequence_cb(void *ctx, int status) 1076 { 1077 struct spdk_bdev_io *bdev_io = ctx; 1078 1079 assert(bdev_io_use_accel_sequence(bdev_io)); 1080 1081 bdev_io->u.bdev.accel_sequence = NULL; 1082 bdev_io->internal.f.has_accel_sequence = false; 1083 1084 if (spdk_unlikely(status != 0)) { 1085 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 1086 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1087 bdev_io_complete_unsubmitted(bdev_io); 1088 return; 1089 } 1090 1091 bdev_io_submit(bdev_io); 1092 } 1093 1094 static void 1095 bdev_io_exec_sequence_cb(void *ctx, int status) 1096 { 1097 struct spdk_bdev_io *bdev_io = ctx; 1098 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1099 1100 TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1101 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1102 1103 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1104 bdev_ch_retry_io(ch); 1105 } 1106 1107 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1108 } 1109 1110 static void 1111 bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status)) 1112 { 1113 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1114 1115 assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1116 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1117 assert(bdev_io_use_accel_sequence(bdev_io)); 1118 1119 /* Since the operations are appended during submission, they're in the opposite order than 1120 * how we want to execute them for reads (i.e. we need to execute the most recently added 1121 * operation first), so reverse the sequence before executing it. 
1122 */ 1123 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1124 spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence); 1125 } 1126 1127 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1128 bdev_io_increment_outstanding(ch, ch->shared_resource); 1129 bdev_io->internal.data_transfer_cpl = cb_fn; 1130 1131 spdk_accel_sequence_finish(bdev_io->internal.accel_sequence, 1132 bdev_io_exec_sequence_cb, bdev_io); 1133 } 1134 1135 static void 1136 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 1137 { 1138 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 1139 void *buf; 1140 1141 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1142 buf = bdev_io->internal.buf.ptr; 1143 bdev_io->internal.buf.ptr = NULL; 1144 bdev_io->internal.f.has_buf = false; 1145 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 1146 bdev_io->internal.get_aux_buf_cb = NULL; 1147 } else { 1148 assert(bdev_io->internal.get_buf_cb != NULL); 1149 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 1150 bdev_io->internal.get_buf_cb = NULL; 1151 } 1152 } 1153 1154 static void 1155 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 1156 { 1157 struct spdk_bdev_io *bdev_io = ctx; 1158 1159 if (rc) { 1160 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 1161 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1162 } 1163 bdev_io_get_buf_complete(bdev_io, !rc); 1164 } 1165 1166 static void 1167 bdev_io_pull_md_buf_done(void *ctx, int status) 1168 { 1169 struct spdk_bdev_io *bdev_io = ctx; 1170 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1171 1172 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1173 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1174 1175 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1176 bdev_ch_retry_io(ch); 1177 } 1178 1179 assert(bdev_io->internal.data_transfer_cpl); 1180 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1181 } 1182 1183 static void 1184 bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io) 1185 { 1186 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1187 int rc = 0; 1188 1189 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1190 assert(bdev_io->internal.f.has_bounce_buf); 1191 if (bdev_io_use_memory_domain(bdev_io)) { 1192 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1193 bdev_io_increment_outstanding(ch, ch->shared_resource); 1194 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1195 bdev_io->internal.memory_domain_ctx, 1196 &bdev_io->internal.bounce_buf.orig_md_iov, 1, 1197 &bdev_io->internal.bounce_buf.md_iov, 1, 1198 bdev_io_pull_md_buf_done, bdev_io); 1199 if (rc == 0) { 1200 /* Continue to submit IO in completion callback */ 1201 return; 1202 } 1203 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1204 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1205 if (rc != -ENOMEM) { 1206 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 1207 spdk_memory_domain_get_dma_device_id( 1208 bdev_io->internal.memory_domain), rc); 1209 } 1210 } else { 1211 memcpy(bdev_io->internal.bounce_buf.md_iov.iov_base, 1212 bdev_io->internal.bounce_buf.orig_md_iov.iov_base, 1213 bdev_io->internal.bounce_buf.orig_md_iov.iov_len); 1214 } 1215 } 1216 1217 if (spdk_unlikely(rc == -ENOMEM)) { 1218 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD); 1219 } else { 1220 assert(bdev_io->internal.data_transfer_cpl); 1221 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1222 } 
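	/* Reaching this point means the metadata pull either completed synchronously
	 * (memcpy path, or nothing to pull for reads), failed, or was re-queued above with
	 * BDEV_IO_RETRY_STATE_PULL_MD, in which case bdev_shared_ch_retry_io() re-enters
	 * this function once resources free up. */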
1223 } 1224 1225 static void 1226 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 1227 { 1228 assert(bdev_io->internal.f.has_bounce_buf); 1229 1230 /* save original md_buf */ 1231 bdev_io->internal.bounce_buf.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 1232 bdev_io->internal.bounce_buf.orig_md_iov.iov_len = len; 1233 bdev_io->internal.bounce_buf.md_iov.iov_base = md_buf; 1234 bdev_io->internal.bounce_buf.md_iov.iov_len = len; 1235 /* set bounce md_buf */ 1236 bdev_io->u.bdev.md_buf = md_buf; 1237 1238 bdev_io_pull_md_buf(bdev_io); 1239 } 1240 1241 static void 1242 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 1243 { 1244 struct spdk_bdev *bdev = bdev_io->bdev; 1245 uint64_t md_len; 1246 void *buf; 1247 1248 if (spdk_bdev_is_md_separate(bdev)) { 1249 assert(!bdev_io_use_accel_sequence(bdev_io)); 1250 1251 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 1252 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 1253 1254 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 1255 1256 if (bdev_io->u.bdev.md_buf != NULL) { 1257 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 1258 return; 1259 } else { 1260 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 1261 } 1262 } 1263 1264 bdev_io_get_buf_complete(bdev_io, true); 1265 } 1266 1267 static inline void 1268 bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc) 1269 { 1270 if (rc) { 1271 SPDK_ERRLOG("Failed to get data buffer\n"); 1272 assert(bdev_io->internal.data_transfer_cpl); 1273 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1274 return; 1275 } 1276 1277 _bdev_io_set_md_buf(bdev_io); 1278 } 1279 1280 static void 1281 bdev_io_pull_data_done_and_track(void *ctx, int status) 1282 { 1283 struct spdk_bdev_io *bdev_io = ctx; 1284 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1285 1286 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1287 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1288 1289 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1290 bdev_ch_retry_io(ch); 1291 } 1292 1293 bdev_io_pull_data_done(bdev_io, status); 1294 } 1295 1296 static void 1297 bdev_io_pull_data(struct spdk_bdev_io *bdev_io) 1298 { 1299 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1300 int rc = 0; 1301 1302 /* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a 1303 * sequence, append a copy operation making accel change the src/dst buffers of the previous 1304 * operation */ 1305 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io) || 1306 (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) { 1307 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1308 assert(bdev_io_use_accel_sequence(bdev_io)); 1309 assert(bdev_io->internal.f.has_bounce_buf); 1310 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1311 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1312 NULL, NULL, 1313 bdev_io->internal.bounce_buf.orig_iovs, 1314 bdev_io->internal.bounce_buf.orig_iovcnt, 1315 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 1316 bdev_io_use_memory_domain(bdev_io) ? 
bdev_io->internal.memory_domain_ctx : NULL, 1317 NULL, NULL); 1318 } else { 1319 /* We need to reverse the src/dst for reads */ 1320 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1321 assert(bdev_io_use_accel_sequence(bdev_io)); 1322 assert(bdev_io->internal.f.has_bounce_buf); 1323 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1324 bdev_io->internal.bounce_buf.orig_iovs, 1325 bdev_io->internal.bounce_buf.orig_iovcnt, 1326 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 1327 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL, 1328 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1329 NULL, NULL, NULL, NULL); 1330 } 1331 1332 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 1333 SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n", 1334 bdev_io->internal.accel_sequence); 1335 } 1336 } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1337 /* if this is write path, copy data from original buffer to bounce buffer */ 1338 if (bdev_io_use_memory_domain(bdev_io)) { 1339 assert(bdev_io->internal.f.has_bounce_buf); 1340 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1341 bdev_io_increment_outstanding(ch, ch->shared_resource); 1342 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1343 bdev_io->internal.memory_domain_ctx, 1344 bdev_io->internal.bounce_buf.orig_iovs, 1345 (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt, 1346 bdev_io->u.bdev.iovs, 1, 1347 bdev_io_pull_data_done_and_track, 1348 bdev_io); 1349 if (rc == 0) { 1350 /* Continue to submit IO in completion callback */ 1351 return; 1352 } 1353 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1354 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1355 if (rc != -ENOMEM) { 1356 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 1357 spdk_memory_domain_get_dma_device_id( 1358 bdev_io->internal.memory_domain)); 1359 } 1360 } else { 1361 assert(bdev_io->u.bdev.iovcnt == 1); 1362 assert(bdev_io->internal.f.has_bounce_buf); 1363 spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base, 1364 bdev_io->u.bdev.iovs[0].iov_len, 1365 bdev_io->internal.bounce_buf.orig_iovs, 1366 bdev_io->internal.bounce_buf.orig_iovcnt); 1367 } 1368 } 1369 1370 if (spdk_unlikely(rc == -ENOMEM)) { 1371 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1372 } else { 1373 bdev_io_pull_data_done(bdev_io, rc); 1374 } 1375 } 1376 1377 static void 1378 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 1379 bdev_copy_bounce_buffer_cpl cpl_cb) 1380 { 1381 struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource; 1382 1383 assert(bdev_io->internal.f.has_bounce_buf == false); 1384 1385 bdev_io->internal.data_transfer_cpl = cpl_cb; 1386 bdev_io->internal.f.has_bounce_buf = true; 1387 /* save original iovec */ 1388 bdev_io->internal.bounce_buf.orig_iovs = bdev_io->u.bdev.iovs; 1389 bdev_io->internal.bounce_buf.orig_iovcnt = bdev_io->u.bdev.iovcnt; 1390 /* zero the other data members */ 1391 bdev_io->internal.bounce_buf.iov.iov_base = NULL; 1392 bdev_io->internal.bounce_buf.md_iov.iov_base = NULL; 1393 bdev_io->internal.bounce_buf.orig_md_iov.iov_base = NULL; 1394 /* set bounce iov */ 1395 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_buf.iov; 1396 bdev_io->u.bdev.iovcnt = 1; 1397 /* set bounce buffer for this operation */ 1398 bdev_io->u.bdev.iovs[0].iov_base = buf; 1399 bdev_io->u.bdev.iovs[0].iov_len = len; 1400 /* Now we 
use 1 iov, the split condition could have been changed */ 1401 bdev_io->internal.f.split = bdev_io_should_split(bdev_io); 1402 1403 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1404 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1405 } else { 1406 bdev_io_pull_data(bdev_io); 1407 } 1408 } 1409 1410 static void 1411 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) 1412 { 1413 struct spdk_bdev *bdev = bdev_io->bdev; 1414 bool buf_allocated; 1415 uint64_t alignment; 1416 void *aligned_buf; 1417 1418 bdev_io->internal.buf.ptr = buf; 1419 bdev_io->internal.f.has_buf = true; 1420 1421 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1422 bdev_io_get_buf_complete(bdev_io, true); 1423 return; 1424 } 1425 1426 alignment = spdk_bdev_get_buf_align(bdev); 1427 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 1428 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 1429 1430 if (buf_allocated) { 1431 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 1432 /* Continue in completion callback */ 1433 return; 1434 } else { 1435 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 1436 } 1437 1438 _bdev_io_set_md_buf(bdev_io); 1439 } 1440 1441 static inline uint64_t 1442 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len) 1443 { 1444 struct spdk_bdev *bdev = bdev_io->bdev; 1445 uint64_t md_len, alignment; 1446 1447 md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1448 1449 /* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */ 1450 alignment = spdk_bdev_get_buf_align(bdev) - 1; 1451 1452 return len + alignment + md_len; 1453 } 1454 1455 static void 1456 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 1457 { 1458 struct spdk_bdev_mgmt_channel *ch; 1459 1460 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1461 spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len)); 1462 } 1463 1464 static void 1465 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 1466 { 1467 assert(bdev_io->internal.f.has_buf); 1468 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf.ptr, bdev_io->internal.buf.len); 1469 bdev_io->internal.buf.ptr = NULL; 1470 bdev_io->internal.f.has_buf = false; 1471 } 1472 1473 SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_put_aux_buf, 1474 "spdk_bdev_io_put_aux_buf is deprecated", "v25.01", 0); 1475 1476 void 1477 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1478 { 1479 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1480 1481 SPDK_LOG_DEPRECATED(spdk_bdev_io_put_aux_buf); 1482 1483 assert(buf != NULL); 1484 _bdev_io_put_buf(bdev_io, buf, len); 1485 } 1486 1487 static inline void 1488 bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch, 1489 struct spdk_bdev_io *bdev_io) 1490 { 1491 /* After a request is submitted to a bdev module, the ownership of an accel sequence 1492 * associated with that bdev_io is transferred to the bdev module. So, clear the internal 1493 * sequence pointer to make sure we won't touch it anymore. 
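 * Only the internal has_accel_sequence flag is dropped here; u.bdev.accel_sequence is
 * left intact so the module that now owns the sequence can finish or abort it (hedged
 * description based on the ownership hand-off noted above).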
*/ 1494 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || 1495 bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) { 1496 assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1497 bdev_io->internal.f.has_accel_sequence = false; 1498 } 1499 1500 bdev->fn_table->submit_request(ioch, bdev_io); 1501 } 1502 1503 static inline void 1504 bdev_ch_resubmit_io(struct spdk_bdev_shared_resource *shared_resource, struct spdk_bdev_io *bdev_io) 1505 { 1506 struct spdk_bdev *bdev = bdev_io->bdev; 1507 1508 bdev_io_increment_outstanding(bdev_io->internal.ch, shared_resource); 1509 bdev_io->internal.error.nvme.cdw0 = 0; 1510 bdev_io->num_retries++; 1511 bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 1512 } 1513 1514 static void 1515 bdev_shared_ch_retry_io(struct spdk_bdev_shared_resource *shared_resource) 1516 { 1517 struct spdk_bdev_io *bdev_io; 1518 1519 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 1520 /* 1521 * Allow some more I/O to complete before retrying the nomem_io queue. 1522 * Some drivers (such as nvme) cannot immediately take a new I/O in 1523 * the context of a completion, because the resources for the I/O are 1524 * not released until control returns to the bdev poller. Also, we 1525 * may require several small I/O to complete before a larger I/O 1526 * (that requires splitting) can be submitted. 1527 */ 1528 return; 1529 } 1530 1531 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1532 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 1533 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 1534 1535 switch (bdev_io->internal.retry_state) { 1536 case BDEV_IO_RETRY_STATE_SUBMIT: 1537 bdev_ch_resubmit_io(shared_resource, bdev_io); 1538 break; 1539 case BDEV_IO_RETRY_STATE_PULL: 1540 bdev_io_pull_data(bdev_io); 1541 break; 1542 case BDEV_IO_RETRY_STATE_PULL_MD: 1543 bdev_io_pull_md_buf(bdev_io); 1544 break; 1545 case BDEV_IO_RETRY_STATE_PUSH: 1546 bdev_io_push_bounce_data(bdev_io); 1547 break; 1548 case BDEV_IO_RETRY_STATE_PUSH_MD: 1549 bdev_io_push_bounce_md_buf(bdev_io); 1550 break; 1551 default: 1552 assert(0 && "invalid retry state"); 1553 break; 1554 } 1555 1556 if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) { 1557 /* This IO completed again with NOMEM status, so break the loop and 1558 * don't try anymore. Note that a bdev_io that fails with NOMEM 1559 * always gets requeued at the front of the list, to maintain 1560 * ordering. 
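 * That is also why comparing against TAILQ_FIRST() is a sufficient test here:
 * _bdev_io_handle_no_mem() re-inserts a NOMEM completion at the head via
 * bdev_queue_nomem_io_head(), so seeing the same bdev_io back at the front means the
 * retry immediately hit NOMEM again.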
1561 */ 1562 break; 1563 } 1564 } 1565 } 1566 1567 static void 1568 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 1569 { 1570 bdev_shared_ch_retry_io(bdev_ch->shared_resource); 1571 } 1572 1573 static int 1574 bdev_no_mem_poller(void *ctx) 1575 { 1576 struct spdk_bdev_shared_resource *shared_resource = ctx; 1577 1578 spdk_poller_unregister(&shared_resource->nomem_poller); 1579 1580 if (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1581 bdev_shared_ch_retry_io(shared_resource); 1582 } 1583 /* the retry cb may re-register the poller so double check */ 1584 if (!TAILQ_EMPTY(&shared_resource->nomem_io) && 1585 shared_resource->io_outstanding == 0 && shared_resource->nomem_poller == NULL) { 1586 /* No IOs were submitted, try again */ 1587 shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource, 1588 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10); 1589 } 1590 1591 return SPDK_POLLER_BUSY; 1592 } 1593 1594 static inline bool 1595 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 1596 { 1597 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1598 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1599 1600 if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) { 1601 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1602 bdev_queue_nomem_io_head(shared_resource, bdev_io, state); 1603 1604 if (shared_resource->io_outstanding == 0 && !shared_resource->nomem_poller) { 1605 /* Special case when we have nomem IOs and no outstanding IOs which completions 1606 * could trigger retry of queued IOs 1607 * Any IOs submitted may trigger retry of queued IOs. This poller handles a case when no 1608 * new IOs submitted, e.g. qd==1 */ 1609 shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource, 1610 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10); 1611 } 1612 /* If bdev module completed an I/O that has an accel sequence with NOMEM status, the 1613 * ownership of that sequence is transferred back to the bdev layer, so we need to 1614 * restore internal.accel_sequence to make sure that the sequence is handled 1615 * correctly in case the I/O is later aborted. */ 1616 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 1617 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) { 1618 assert(!bdev_io_use_accel_sequence(bdev_io)); 1619 bdev_io->internal.f.has_accel_sequence = true; 1620 bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence; 1621 } 1622 1623 return true; 1624 } 1625 1626 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1627 bdev_ch_retry_io(bdev_ch); 1628 } 1629 1630 return false; 1631 } 1632 1633 static void 1634 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1635 { 1636 struct spdk_bdev_io *bdev_io = ctx; 1637 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1638 1639 if (rc) { 1640 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1641 } 1642 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1643 * to waiting for the conditional free of internal.buf.ptr in spdk_bdev_free_io()). 
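 * Returning it now via bdev_io_put_buf() hands the buffer back to the per-thread iobuf
 * cache/pool, so it can be reused by other I/O (possibly the nomem_io retries kicked off
 * just below) before this bdev_io itself is freed.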
1644 */ 1645 bdev_io_put_buf(bdev_io); 1646 1647 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1648 bdev_ch_retry_io(ch); 1649 } 1650 1651 /* Continue with IO completion flow */ 1652 bdev_io_complete(bdev_io); 1653 } 1654 1655 static void 1656 bdev_io_push_bounce_md_buf_done(void *ctx, int rc) 1657 { 1658 struct spdk_bdev_io *bdev_io = ctx; 1659 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1660 1661 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1662 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1663 bdev_io->internal.f.has_bounce_buf = false; 1664 1665 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1666 bdev_ch_retry_io(ch); 1667 } 1668 1669 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1670 } 1671 1672 static inline void 1673 bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io) 1674 { 1675 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1676 int rc = 0; 1677 1678 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1679 assert(bdev_io->internal.f.has_bounce_buf); 1680 1681 /* do the same for metadata buffer */ 1682 if (spdk_unlikely(bdev_io->internal.bounce_buf.orig_md_iov.iov_base != NULL)) { 1683 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1684 1685 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1686 if (bdev_io_use_memory_domain(bdev_io)) { 1687 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1688 bdev_io_increment_outstanding(ch, ch->shared_resource); 1689 /* If memory domain is used then we need to call async push function */ 1690 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1691 bdev_io->internal.memory_domain_ctx, 1692 &bdev_io->internal.bounce_buf.orig_md_iov, 1693 (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt, 1694 &bdev_io->internal.bounce_buf.md_iov, 1, 1695 bdev_io_push_bounce_md_buf_done, 1696 bdev_io); 1697 if (rc == 0) { 1698 /* Continue IO completion in async callback */ 1699 return; 1700 } 1701 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1702 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1703 if (rc != -ENOMEM) { 1704 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1705 spdk_memory_domain_get_dma_device_id( 1706 bdev_io->internal.memory_domain)); 1707 } 1708 } else { 1709 memcpy(bdev_io->internal.bounce_buf.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1710 bdev_io->internal.bounce_buf.orig_md_iov.iov_len); 1711 } 1712 } 1713 } 1714 1715 if (spdk_unlikely(rc == -ENOMEM)) { 1716 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD); 1717 } else { 1718 assert(bdev_io->internal.data_transfer_cpl); 1719 bdev_io->internal.f.has_bounce_buf = false; 1720 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1721 } 1722 } 1723 1724 static inline void 1725 bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc) 1726 { 1727 assert(bdev_io->internal.data_transfer_cpl); 1728 if (rc) { 1729 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1730 return; 1731 } 1732 1733 /* set original buffer for this io */ 1734 bdev_io->u.bdev.iovcnt = bdev_io->internal.bounce_buf.orig_iovcnt; 1735 bdev_io->u.bdev.iovs = bdev_io->internal.bounce_buf.orig_iovs; 1736 1737 /* We don't set bdev_io->internal.f.has_bounce_buf to false here because 1738 * we still need to clear the md buf */ 1739 1740 bdev_io_push_bounce_md_buf(bdev_io); 1741 } 1742 1743 static void 1744 bdev_io_push_bounce_data_done_and_track(void *ctx, int status) 1745 { 1746 struct spdk_bdev_io *bdev_io = ctx; 1747 struct 
spdk_bdev_channel *ch = bdev_io->internal.ch; 1748 1749 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1750 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1751 1752 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1753 bdev_ch_retry_io(ch); 1754 } 1755 1756 bdev_io_push_bounce_data_done(bdev_io, status); 1757 } 1758 1759 static inline void 1760 bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io) 1761 { 1762 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1763 int rc = 0; 1764 1765 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1766 assert(!bdev_io_use_accel_sequence(bdev_io)); 1767 assert(bdev_io->internal.f.has_bounce_buf); 1768 1769 /* if this is read path, copy data from bounce buffer to original buffer */ 1770 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1771 if (bdev_io_use_memory_domain(bdev_io)) { 1772 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1773 bdev_io_increment_outstanding(ch, ch->shared_resource); 1774 /* If memory domain is used then we need to call async push function */ 1775 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1776 bdev_io->internal.memory_domain_ctx, 1777 bdev_io->internal.bounce_buf.orig_iovs, 1778 (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt, 1779 &bdev_io->internal.bounce_buf.iov, 1, 1780 bdev_io_push_bounce_data_done_and_track, 1781 bdev_io); 1782 if (rc == 0) { 1783 /* Continue IO completion in async callback */ 1784 return; 1785 } 1786 1787 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1788 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1789 if (rc != -ENOMEM) { 1790 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1791 spdk_memory_domain_get_dma_device_id( 1792 bdev_io->internal.memory_domain)); 1793 } 1794 } else { 1795 spdk_copy_buf_to_iovs(bdev_io->internal.bounce_buf.orig_iovs, 1796 bdev_io->internal.bounce_buf.orig_iovcnt, 1797 bdev_io->internal.bounce_buf.iov.iov_base, 1798 bdev_io->internal.bounce_buf.iov.iov_len); 1799 } 1800 } 1801 1802 if (spdk_unlikely(rc == -ENOMEM)) { 1803 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH); 1804 } else { 1805 bdev_io_push_bounce_data_done(bdev_io, rc); 1806 } 1807 } 1808 1809 static inline void 1810 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1811 { 1812 bdev_io->internal.data_transfer_cpl = cpl_cb; 1813 bdev_io_push_bounce_data(bdev_io); 1814 } 1815 1816 static void 1817 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1818 { 1819 struct spdk_bdev_io *bdev_io; 1820 1821 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1822 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf.len); 1823 } 1824 1825 static void 1826 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1827 { 1828 struct spdk_bdev_mgmt_channel *mgmt_ch; 1829 uint64_t max_len; 1830 void *buf; 1831 1832 assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1833 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1834 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1835 1836 if (spdk_unlikely(max_len > mgmt_ch->iobuf.cache[0].large.bufsize)) { 1837 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1838 bdev_io_get_buf_complete(bdev_io, false); 1839 return; 1840 } 1841 1842 bdev_io->internal.buf.len = len; 1843 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1844 bdev_io_get_iobuf_cb); 1845 if (buf != NULL) { 1846 
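/* A buffer was available synchronously from the iobuf cache/pool; otherwise bdev_io_get_iobuf_cb() is invoked later, once a buffer of sufficient size is returned to the pool. */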
_bdev_io_set_buf(bdev_io, buf, len); 1847 } 1848 } 1849 1850 void 1851 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1852 { 1853 struct spdk_bdev *bdev = bdev_io->bdev; 1854 uint64_t alignment; 1855 1856 assert(cb != NULL); 1857 bdev_io->internal.get_buf_cb = cb; 1858 1859 alignment = spdk_bdev_get_buf_align(bdev); 1860 1861 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1862 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1863 /* Buffer already present and aligned */ 1864 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1865 return; 1866 } 1867 1868 bdev_io_get_buf(bdev_io, len); 1869 } 1870 1871 static void 1872 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1873 bool success) 1874 { 1875 if (!success) { 1876 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1877 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1878 bdev_io_complete_unsubmitted(bdev_io); 1879 return; 1880 } 1881 1882 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 1883 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1884 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 1885 return; 1886 } 1887 /* For reads we'll execute the sequence after the data is read, so, for now, only 1888 * clear out accel_sequence pointer and submit the IO */ 1889 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1890 bdev_io->u.bdev.accel_sequence = NULL; 1891 } 1892 1893 bdev_io_submit(bdev_io); 1894 } 1895 1896 static void 1897 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1898 uint64_t len) 1899 { 1900 assert(cb != NULL); 1901 bdev_io->internal.get_buf_cb = cb; 1902 1903 bdev_io_get_buf(bdev_io, len); 1904 } 1905 1906 1907 SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_get_aux_buf, 1908 "spdk_bdev_io_get_aux_buf is deprecated", "v25.01", 0); 1909 1910 void 1911 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1912 { 1913 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1914 1915 SPDK_LOG_DEPRECATED(spdk_bdev_io_get_aux_buf); 1916 1917 assert(cb != NULL); 1918 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1919 bdev_io->internal.get_aux_buf_cb = cb; 1920 bdev_io_get_buf(bdev_io, len); 1921 } 1922 1923 static int 1924 bdev_module_get_max_ctx_size(void) 1925 { 1926 struct spdk_bdev_module *bdev_module; 1927 int max_bdev_module_size = 0; 1928 1929 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1930 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1931 max_bdev_module_size = bdev_module->get_ctx_size(); 1932 } 1933 } 1934 1935 return max_bdev_module_size; 1936 } 1937 1938 static void 1939 bdev_enable_histogram_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1940 { 1941 if (!bdev->internal.histogram_enabled) { 1942 return; 1943 } 1944 1945 spdk_json_write_object_begin(w); 1946 spdk_json_write_named_string(w, "method", "bdev_enable_histogram"); 1947 1948 spdk_json_write_named_object_begin(w, "params"); 1949 spdk_json_write_named_string(w, "name", bdev->name); 1950 1951 spdk_json_write_named_bool(w, "enable", bdev->internal.histogram_enabled); 1952 1953 if (bdev->internal.histogram_io_type) { 1954 spdk_json_write_named_string(w, "opc", 1955 spdk_bdev_get_io_type_name(bdev->internal.histogram_io_type)); 1956 } 1957 1958 spdk_json_write_object_end(w); 1959 1960 spdk_json_write_object_end(w); 1961 } 1962 
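/*
 * Illustrative example of the RPC object written by bdev_enable_histogram_config_json()
 * above (the bdev name is hypothetical; "opc" is only emitted when a specific I/O type
 * was selected):
 *
 *   {
 *     "method": "bdev_enable_histogram",
 *     "params": { "name": "Nvme0n1", "enable": true, "opc": "read" }
 *   }
 */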
1963 static void 1964 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1965 { 1966 int i; 1967 struct spdk_bdev_qos *qos = bdev->internal.qos; 1968 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1969 1970 if (!qos) { 1971 return; 1972 } 1973 1974 spdk_bdev_get_qos_rate_limits(bdev, limits); 1975 1976 spdk_json_write_object_begin(w); 1977 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1978 1979 spdk_json_write_named_object_begin(w, "params"); 1980 spdk_json_write_named_string(w, "name", bdev->name); 1981 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1982 if (limits[i] > 0) { 1983 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1984 } 1985 } 1986 spdk_json_write_object_end(w); 1987 1988 spdk_json_write_object_end(w); 1989 } 1990 1991 void 1992 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1993 { 1994 struct spdk_bdev_module *bdev_module; 1995 struct spdk_bdev *bdev; 1996 1997 assert(w != NULL); 1998 1999 spdk_json_write_array_begin(w); 2000 2001 spdk_json_write_object_begin(w); 2002 spdk_json_write_named_string(w, "method", "bdev_set_options"); 2003 spdk_json_write_named_object_begin(w, "params"); 2004 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 2005 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 2006 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 2007 spdk_json_write_named_uint32(w, "iobuf_small_cache_size", g_bdev_opts.iobuf_small_cache_size); 2008 spdk_json_write_named_uint32(w, "iobuf_large_cache_size", g_bdev_opts.iobuf_large_cache_size); 2009 spdk_json_write_object_end(w); 2010 spdk_json_write_object_end(w); 2011 2012 bdev_examine_allowlist_config_json(w); 2013 2014 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2015 if (bdev_module->config_json) { 2016 bdev_module->config_json(w); 2017 } 2018 } 2019 2020 spdk_spin_lock(&g_bdev_mgr.spinlock); 2021 2022 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 2023 if (bdev->fn_table->write_config_json) { 2024 bdev->fn_table->write_config_json(bdev, w); 2025 } 2026 2027 bdev_qos_config_json(bdev, w); 2028 bdev_enable_histogram_config_json(bdev, w); 2029 } 2030 2031 spdk_spin_unlock(&g_bdev_mgr.spinlock); 2032 2033 /* This has to be last RPC in array to make sure all bdevs finished examine */ 2034 spdk_json_write_object_begin(w); 2035 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 2036 spdk_json_write_object_end(w); 2037 2038 spdk_json_write_array_end(w); 2039 } 2040 2041 static void 2042 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 2043 { 2044 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 2045 struct spdk_bdev_io *bdev_io; 2046 2047 spdk_iobuf_channel_fini(&ch->iobuf); 2048 2049 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 2050 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2051 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2052 ch->per_thread_cache_count--; 2053 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2054 } 2055 2056 assert(ch->per_thread_cache_count == 0); 2057 } 2058 2059 static int 2060 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 2061 { 2062 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 2063 struct spdk_bdev_io *bdev_io; 2064 uint32_t i; 2065 int rc; 2066 2067 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", 2068 g_bdev_opts.iobuf_small_cache_size, 2069 g_bdev_opts.iobuf_large_cache_size); 2070 if (rc != 0) { 2071 
SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 2072 return -1; 2073 } 2074 2075 STAILQ_INIT(&ch->per_thread_cache); 2076 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 2077 2078 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 2079 ch->per_thread_cache_count = 0; 2080 for (i = 0; i < ch->bdev_io_cache_size; i++) { 2081 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2082 if (bdev_io == NULL) { 2083 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 2084 assert(false); 2085 bdev_mgmt_channel_destroy(io_device, ctx_buf); 2086 return -1; 2087 } 2088 ch->per_thread_cache_count++; 2089 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2090 } 2091 2092 TAILQ_INIT(&ch->shared_resources); 2093 TAILQ_INIT(&ch->io_wait_queue); 2094 2095 return 0; 2096 } 2097 2098 static void 2099 bdev_init_complete(int rc) 2100 { 2101 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 2102 void *cb_arg = g_init_cb_arg; 2103 struct spdk_bdev_module *m; 2104 2105 g_bdev_mgr.init_complete = true; 2106 g_init_cb_fn = NULL; 2107 g_init_cb_arg = NULL; 2108 2109 /* 2110 * For modules that need to know when subsystem init is complete, 2111 * inform them now. 2112 */ 2113 if (rc == 0) { 2114 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2115 if (m->init_complete) { 2116 m->init_complete(); 2117 } 2118 } 2119 } 2120 2121 cb_fn(cb_arg, rc); 2122 } 2123 2124 static bool 2125 bdev_module_all_actions_completed(void) 2126 { 2127 struct spdk_bdev_module *m; 2128 2129 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2130 if (m->internal.action_in_progress > 0) { 2131 return false; 2132 } 2133 } 2134 return true; 2135 } 2136 2137 static void 2138 bdev_module_action_complete(void) 2139 { 2140 /* 2141 * Don't finish bdev subsystem initialization if 2142 * module pre-initialization is still in progress, or 2143 * the subsystem been already initialized. 2144 */ 2145 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 2146 return; 2147 } 2148 2149 /* 2150 * Check all bdev modules for inits/examinations in progress. If any 2151 * exist, return immediately since we cannot finish bdev subsystem 2152 * initialization until all are completed. 2153 */ 2154 if (!bdev_module_all_actions_completed()) { 2155 return; 2156 } 2157 2158 /* 2159 * Modules already finished initialization - now that all 2160 * the bdev modules have finished their asynchronous I/O 2161 * processing, the entire bdev layer can be marked as complete. 
2162 */ 2163 bdev_init_complete(0); 2164 } 2165 2166 static void 2167 bdev_module_action_done(struct spdk_bdev_module *module) 2168 { 2169 spdk_spin_lock(&module->internal.spinlock); 2170 assert(module->internal.action_in_progress > 0); 2171 module->internal.action_in_progress--; 2172 spdk_spin_unlock(&module->internal.spinlock); 2173 bdev_module_action_complete(); 2174 } 2175 2176 void 2177 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 2178 { 2179 assert(module->async_init); 2180 bdev_module_action_done(module); 2181 } 2182 2183 void 2184 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 2185 { 2186 bdev_module_action_done(module); 2187 } 2188 2189 /** The last initialized bdev module */ 2190 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 2191 2192 static void 2193 bdev_init_failed(void *cb_arg) 2194 { 2195 struct spdk_bdev_module *module = cb_arg; 2196 2197 spdk_spin_lock(&module->internal.spinlock); 2198 assert(module->internal.action_in_progress > 0); 2199 module->internal.action_in_progress--; 2200 spdk_spin_unlock(&module->internal.spinlock); 2201 bdev_init_complete(-1); 2202 } 2203 2204 static int 2205 bdev_modules_init(void) 2206 { 2207 struct spdk_bdev_module *module; 2208 int rc = 0; 2209 2210 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2211 g_resume_bdev_module = module; 2212 if (module->async_init) { 2213 spdk_spin_lock(&module->internal.spinlock); 2214 module->internal.action_in_progress = 1; 2215 spdk_spin_unlock(&module->internal.spinlock); 2216 } 2217 rc = module->module_init(); 2218 if (rc != 0) { 2219 /* Bump action_in_progress to prevent other modules from completion of modules_init 2220 * Send message to defer application shutdown until resources are cleaned up */ 2221 spdk_spin_lock(&module->internal.spinlock); 2222 module->internal.action_in_progress = 1; 2223 spdk_spin_unlock(&module->internal.spinlock); 2224 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 2225 return rc; 2226 } 2227 } 2228 2229 g_resume_bdev_module = NULL; 2230 return 0; 2231 } 2232 2233 void 2234 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 2235 { 2236 int rc = 0; 2237 char mempool_name[32]; 2238 2239 assert(cb_fn != NULL); 2240 2241 g_init_cb_fn = cb_fn; 2242 g_init_cb_arg = cb_arg; 2243 2244 spdk_notify_type_register("bdev_register"); 2245 spdk_notify_type_register("bdev_unregister"); 2246 2247 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2248 2249 rc = spdk_iobuf_register_module("bdev"); 2250 if (rc != 0) { 2251 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2252 bdev_init_complete(-1); 2253 return; 2254 } 2255 2256 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2257 g_bdev_opts.bdev_io_pool_size, 2258 sizeof(struct spdk_bdev_io) + 2259 bdev_module_get_max_ctx_size(), 2260 0, 2261 SPDK_ENV_NUMA_ID_ANY); 2262 2263 if (g_bdev_mgr.bdev_io_pool == NULL) { 2264 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2265 bdev_init_complete(-1); 2266 return; 2267 } 2268 2269 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2270 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2271 if (!g_bdev_mgr.zero_buffer) { 2272 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2273 bdev_init_complete(-1); 2274 return; 2275 } 2276 2277 #ifdef SPDK_CONFIG_VTUNE 2278 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2279 #endif 2280 2281 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 2282 
bdev_mgmt_channel_destroy, 2283 sizeof(struct spdk_bdev_mgmt_channel), 2284 "bdev_mgr"); 2285 2286 rc = bdev_modules_init(); 2287 g_bdev_mgr.module_init_complete = true; 2288 if (rc != 0) { 2289 SPDK_ERRLOG("bdev modules init failed\n"); 2290 return; 2291 } 2292 2293 bdev_module_action_complete(); 2294 } 2295 2296 static void 2297 bdev_mgr_unregister_cb(void *io_device) 2298 { 2299 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2300 2301 if (g_bdev_mgr.bdev_io_pool) { 2302 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2303 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2304 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2305 g_bdev_opts.bdev_io_pool_size); 2306 } 2307 2308 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2309 } 2310 2311 spdk_free(g_bdev_mgr.zero_buffer); 2312 2313 bdev_examine_allowlist_free(); 2314 2315 cb_fn(g_fini_cb_arg); 2316 g_fini_cb_fn = NULL; 2317 g_fini_cb_arg = NULL; 2318 g_bdev_mgr.init_complete = false; 2319 g_bdev_mgr.module_init_complete = false; 2320 } 2321 2322 static void 2323 bdev_module_fini_iter(void *arg) 2324 { 2325 struct spdk_bdev_module *bdev_module; 2326 2327 /* FIXME: Handling initialization failures is broken now, 2328 * so we won't even try cleaning up after successfully 2329 * initialized modules. if module_init_complete is false, 2330 * just call spdk_bdev_mgr_unregister_cb 2331 */ 2332 if (!g_bdev_mgr.module_init_complete) { 2333 bdev_mgr_unregister_cb(NULL); 2334 return; 2335 } 2336 2337 /* Start iterating from the last touched module */ 2338 if (!g_resume_bdev_module) { 2339 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2340 } else { 2341 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2342 internal.tailq); 2343 } 2344 2345 while (bdev_module) { 2346 if (bdev_module->async_fini) { 2347 /* Save our place so we can resume later. We must 2348 * save the variable here, before calling module_fini() 2349 * below, because in some cases the module may immediately 2350 * call spdk_bdev_module_fini_done() and re-enter 2351 * this function to continue iterating. */ 2352 g_resume_bdev_module = bdev_module; 2353 } 2354 2355 if (bdev_module->module_fini) { 2356 bdev_module->module_fini(); 2357 } 2358 2359 if (bdev_module->async_fini) { 2360 return; 2361 } 2362 2363 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2364 internal.tailq); 2365 } 2366 2367 g_resume_bdev_module = NULL; 2368 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2369 } 2370 2371 void 2372 spdk_bdev_module_fini_done(void) 2373 { 2374 if (spdk_get_thread() != g_fini_thread) { 2375 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2376 } else { 2377 bdev_module_fini_iter(NULL); 2378 } 2379 } 2380 2381 static void 2382 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2383 { 2384 struct spdk_bdev *bdev = cb_arg; 2385 2386 if (bdeverrno && bdev) { 2387 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2388 bdev->name); 2389 2390 /* 2391 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2392 * bdev; try to continue by manually removing this bdev from the list and continue 2393 * with the next bdev in the list. 
2394 */ 2395 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2396 } 2397 2398 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2399 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2400 /* 2401 * Bdev module finish need to be deferred as we might be in the middle of some context 2402 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2403 * after returning. 2404 */ 2405 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2406 return; 2407 } 2408 2409 /* 2410 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 2411 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2412 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2413 * base bdevs. 2414 * 2415 * Also, walk the list in the reverse order. 2416 */ 2417 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2418 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2419 spdk_spin_lock(&bdev->internal.spinlock); 2420 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2421 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2422 spdk_spin_unlock(&bdev->internal.spinlock); 2423 continue; 2424 } 2425 spdk_spin_unlock(&bdev->internal.spinlock); 2426 2427 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2428 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2429 return; 2430 } 2431 2432 /* 2433 * If any bdev fails to unclaim underlying bdev properly, we may face the 2434 * case of bdev list consisting of claimed bdevs only (if claims are managed 2435 * correctly, this would mean there's a loop in the claims graph which is 2436 * clearly impossible). Warn and unregister last bdev on the list then. 2437 */ 2438 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2439 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2440 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2441 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2442 return; 2443 } 2444 } 2445 2446 static void 2447 bdev_module_fini_start_iter(void *arg) 2448 { 2449 struct spdk_bdev_module *bdev_module; 2450 2451 if (!g_resume_bdev_module) { 2452 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2453 } else { 2454 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2455 } 2456 2457 while (bdev_module) { 2458 if (bdev_module->async_fini_start) { 2459 /* Save our place so we can resume later. We must 2460 * save the variable here, before calling fini_start() 2461 * below, because in some cases the module may immediately 2462 * call spdk_bdev_module_fini_start_done() and re-enter 2463 * this function to continue iterating. 
*/ 2464 g_resume_bdev_module = bdev_module; 2465 } 2466 2467 if (bdev_module->fini_start) { 2468 bdev_module->fini_start(); 2469 } 2470 2471 if (bdev_module->async_fini_start) { 2472 return; 2473 } 2474 2475 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2476 } 2477 2478 g_resume_bdev_module = NULL; 2479 2480 bdev_finish_unregister_bdevs_iter(NULL, 0); 2481 } 2482 2483 void 2484 spdk_bdev_module_fini_start_done(void) 2485 { 2486 if (spdk_get_thread() != g_fini_thread) { 2487 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2488 } else { 2489 bdev_module_fini_start_iter(NULL); 2490 } 2491 } 2492 2493 static void 2494 bdev_finish_wait_for_examine_done(void *cb_arg) 2495 { 2496 bdev_module_fini_start_iter(NULL); 2497 } 2498 2499 static void bdev_open_async_fini(void); 2500 2501 void 2502 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2503 { 2504 int rc; 2505 2506 assert(cb_fn != NULL); 2507 2508 g_fini_thread = spdk_get_thread(); 2509 2510 g_fini_cb_fn = cb_fn; 2511 g_fini_cb_arg = cb_arg; 2512 2513 bdev_open_async_fini(); 2514 2515 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2516 if (rc != 0) { 2517 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2518 bdev_finish_wait_for_examine_done(NULL); 2519 } 2520 } 2521 2522 struct spdk_bdev_io * 2523 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2524 { 2525 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2526 struct spdk_bdev_io *bdev_io; 2527 2528 if (ch->per_thread_cache_count > 0) { 2529 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2530 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2531 ch->per_thread_cache_count--; 2532 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2533 /* 2534 * Don't try to look for bdev_ios in the global pool if there are 2535 * waiters on bdev_ios - we don't want this caller to jump the line. 2536 */ 2537 bdev_io = NULL; 2538 } else { 2539 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2540 } 2541 2542 return bdev_io; 2543 } 2544 2545 void 2546 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2547 { 2548 struct spdk_bdev_mgmt_channel *ch; 2549 2550 assert(bdev_io != NULL); 2551 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2552 2553 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2554 2555 if (bdev_io->internal.f.has_buf) { 2556 bdev_io_put_buf(bdev_io); 2557 } 2558 2559 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2560 ch->per_thread_cache_count++; 2561 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2562 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2563 struct spdk_bdev_io_wait_entry *entry; 2564 2565 entry = TAILQ_FIRST(&ch->io_wait_queue); 2566 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2567 entry->cb_fn(entry->cb_arg); 2568 } 2569 } else { 2570 /* We should never have a full cache with entries on the io wait queue. 
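 * Whenever an I/O is returned to a non-full cache above, any waiters are serviced immediately, so a full cache here implies the wait queue is empty.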
*/ 2571 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2572 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2573 } 2574 } 2575 2576 static bool 2577 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2578 { 2579 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2580 2581 switch (limit) { 2582 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2583 return true; 2584 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2585 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2586 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2587 return false; 2588 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2589 default: 2590 return false; 2591 } 2592 } 2593 2594 static bool 2595 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2596 { 2597 switch (bdev_io->type) { 2598 case SPDK_BDEV_IO_TYPE_NVME_IO: 2599 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2600 case SPDK_BDEV_IO_TYPE_READ: 2601 case SPDK_BDEV_IO_TYPE_WRITE: 2602 return true; 2603 case SPDK_BDEV_IO_TYPE_ZCOPY: 2604 if (bdev_io->u.bdev.zcopy.start) { 2605 return true; 2606 } else { 2607 return false; 2608 } 2609 default: 2610 return false; 2611 } 2612 } 2613 2614 static bool 2615 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2616 { 2617 switch (bdev_io->type) { 2618 case SPDK_BDEV_IO_TYPE_NVME_IO: 2619 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2620 /* Bit 1 (0x2) set for read operation */ 2621 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2622 return true; 2623 } else { 2624 return false; 2625 } 2626 case SPDK_BDEV_IO_TYPE_READ: 2627 return true; 2628 case SPDK_BDEV_IO_TYPE_ZCOPY: 2629 /* Populate to read from disk */ 2630 if (bdev_io->u.bdev.zcopy.populate) { 2631 return true; 2632 } else { 2633 return false; 2634 } 2635 default: 2636 return false; 2637 } 2638 } 2639 2640 static uint64_t 2641 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2642 { 2643 struct spdk_bdev *bdev = bdev_io->bdev; 2644 2645 switch (bdev_io->type) { 2646 case SPDK_BDEV_IO_TYPE_NVME_IO: 2647 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2648 return bdev_io->u.nvme_passthru.nbytes; 2649 case SPDK_BDEV_IO_TYPE_READ: 2650 case SPDK_BDEV_IO_TYPE_WRITE: 2651 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2652 case SPDK_BDEV_IO_TYPE_ZCOPY: 2653 /* Track the data in the start phase only */ 2654 if (bdev_io->u.bdev.zcopy.start) { 2655 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2656 } else { 2657 return 0; 2658 } 2659 default: 2660 return 0; 2661 } 2662 } 2663 2664 static inline bool 2665 bdev_qos_rw_queue_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2666 { 2667 int64_t remaining_this_timeslice; 2668 2669 if (!limit->max_per_timeslice) { 2670 /* The QoS is disabled */ 2671 return false; 2672 } 2673 2674 remaining_this_timeslice = __atomic_sub_fetch(&limit->remaining_this_timeslice, delta, 2675 __ATOMIC_RELAXED); 2676 if (remaining_this_timeslice + (int64_t)delta > 0) { 2677 /* There was still a quota for this delta -> the IO shouldn't be queued 2678 * 2679 * We allow a slight quota overrun here so an IO bigger than the per-timeslice 2680 * quota can be allowed once a while. Such overrun then taken into account in 2681 * the QoS poller, where the next timeslice quota is calculated. 2682 */ 2683 return false; 2684 } 2685 2686 /* There was no quota for this delta -> the IO should be queued 2687 * The remaining_this_timeslice must be rewinded so it reflects the real 2688 * amount of IOs or bytes allowed. 
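 * The speculative subtraction above is therefore undone with a matching add before returning true, since the queued I/O has not actually consumed any quota.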
2689 */ 2690 __atomic_add_fetch( 2691 &limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2692 return true; 2693 } 2694 2695 static inline void 2696 bdev_qos_rw_rewind_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2697 { 2698 __atomic_add_fetch(&limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2699 } 2700 2701 static bool 2702 bdev_qos_rw_iops_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2703 { 2704 return bdev_qos_rw_queue_io(limit, io, 1); 2705 } 2706 2707 static void 2708 bdev_qos_rw_iops_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2709 { 2710 bdev_qos_rw_rewind_io(limit, io, 1); 2711 } 2712 2713 static bool 2714 bdev_qos_rw_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2715 { 2716 return bdev_qos_rw_queue_io(limit, io, bdev_get_io_size_in_byte(io)); 2717 } 2718 2719 static void 2720 bdev_qos_rw_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2721 { 2722 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2723 } 2724 2725 static bool 2726 bdev_qos_r_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2727 { 2728 if (bdev_is_read_io(io) == false) { 2729 return false; 2730 } 2731 2732 return bdev_qos_rw_bps_queue(limit, io); 2733 } 2734 2735 static void 2736 bdev_qos_r_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2737 { 2738 if (bdev_is_read_io(io) != false) { 2739 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2740 } 2741 } 2742 2743 static bool 2744 bdev_qos_w_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2745 { 2746 if (bdev_is_read_io(io) == true) { 2747 return false; 2748 } 2749 2750 return bdev_qos_rw_bps_queue(limit, io); 2751 } 2752 2753 static void 2754 bdev_qos_w_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2755 { 2756 if (bdev_is_read_io(io) != true) { 2757 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2758 } 2759 } 2760 2761 static void 2762 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2763 { 2764 int i; 2765 2766 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2767 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2768 qos->rate_limits[i].queue_io = NULL; 2769 continue; 2770 } 2771 2772 switch (i) { 2773 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2774 qos->rate_limits[i].queue_io = bdev_qos_rw_iops_queue; 2775 qos->rate_limits[i].rewind_quota = bdev_qos_rw_iops_rewind_quota; 2776 break; 2777 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2778 qos->rate_limits[i].queue_io = bdev_qos_rw_bps_queue; 2779 qos->rate_limits[i].rewind_quota = bdev_qos_rw_bps_rewind_quota; 2780 break; 2781 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2782 qos->rate_limits[i].queue_io = bdev_qos_r_bps_queue; 2783 qos->rate_limits[i].rewind_quota = bdev_qos_r_bps_rewind_quota; 2784 break; 2785 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2786 qos->rate_limits[i].queue_io = bdev_qos_w_bps_queue; 2787 qos->rate_limits[i].rewind_quota = bdev_qos_w_bps_rewind_quota; 2788 break; 2789 default: 2790 break; 2791 } 2792 } 2793 } 2794 2795 static void 2796 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2797 struct spdk_bdev_io *bdev_io, 2798 enum spdk_bdev_io_status status) 2799 { 2800 bdev_io->internal.f.in_submit_request = true; 2801 bdev_io_increment_outstanding(bdev_ch, bdev_ch->shared_resource); 2802 spdk_bdev_io_complete(bdev_io, status); 2803 bdev_io->internal.f.in_submit_request = false; 
2804 } 2805 2806 static inline void 2807 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2808 { 2809 struct spdk_bdev *bdev = bdev_io->bdev; 2810 struct spdk_io_channel *ch = bdev_ch->channel; 2811 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2812 2813 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2814 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2815 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2816 2817 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2818 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2819 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2820 SPDK_BDEV_IO_STATUS_SUCCESS); 2821 return; 2822 } 2823 } 2824 2825 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2826 bdev_io->bdev->split_on_write_unit && 2827 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2828 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2829 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2830 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2831 return; 2832 } 2833 2834 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2835 bdev_io_increment_outstanding(bdev_ch, shared_resource); 2836 bdev_io->internal.f.in_submit_request = true; 2837 bdev_submit_request(bdev, ch, bdev_io); 2838 bdev_io->internal.f.in_submit_request = false; 2839 } else { 2840 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT); 2841 if (shared_resource->nomem_threshold == 0 && shared_resource->io_outstanding == 0) { 2842 /* Special case when we have nomem IOs and no outstanding IOs which completions 2843 * could trigger retry of queued IOs */ 2844 bdev_shared_ch_retry_io(shared_resource); 2845 } 2846 } 2847 } 2848 2849 static bool 2850 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2851 { 2852 int i; 2853 2854 if (bdev_qos_io_to_limit(bdev_io) == true) { 2855 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2856 if (!qos->rate_limits[i].queue_io) { 2857 continue; 2858 } 2859 2860 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2861 bdev_io) == true) { 2862 for (i -= 1; i >= 0 ; i--) { 2863 if (!qos->rate_limits[i].queue_io) { 2864 continue; 2865 } 2866 2867 qos->rate_limits[i].rewind_quota(&qos->rate_limits[i], bdev_io); 2868 } 2869 return true; 2870 } 2871 } 2872 } 2873 2874 return false; 2875 } 2876 2877 static int 2878 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2879 { 2880 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2881 int submitted_ios = 0; 2882 2883 TAILQ_FOREACH_SAFE(bdev_io, &ch->qos_queued_io, internal.link, tmp) { 2884 if (!bdev_qos_queue_io(qos, bdev_io)) { 2885 TAILQ_REMOVE(&ch->qos_queued_io, bdev_io, internal.link); 2886 bdev_io_do_submit(ch, bdev_io); 2887 2888 submitted_ios++; 2889 } 2890 } 2891 2892 return submitted_ios; 2893 } 2894 2895 static void 2896 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2897 { 2898 int rc; 2899 2900 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2901 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2902 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2903 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2904 &bdev_io->internal.waitq_entry); 2905 if (rc != 0) { 2906 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2907 bdev_io->internal.status = 
SPDK_BDEV_IO_STATUS_FAILED; 2908 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2909 } 2910 } 2911 2912 static bool 2913 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2914 { 2915 uint32_t io_boundary; 2916 struct spdk_bdev *bdev = bdev_io->bdev; 2917 uint32_t max_segment_size = bdev->max_segment_size; 2918 uint32_t max_size = bdev->max_rw_size; 2919 int max_segs = bdev->max_num_segments; 2920 2921 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2922 io_boundary = bdev->write_unit_size; 2923 } else if (bdev->split_on_optimal_io_boundary) { 2924 io_boundary = bdev->optimal_io_boundary; 2925 } else { 2926 io_boundary = 0; 2927 } 2928 2929 if (spdk_likely(!io_boundary && !max_segs && !max_segment_size && !max_size)) { 2930 return false; 2931 } 2932 2933 if (io_boundary) { 2934 uint64_t start_stripe, end_stripe; 2935 2936 start_stripe = bdev_io->u.bdev.offset_blocks; 2937 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2938 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 2939 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2940 start_stripe >>= spdk_u32log2(io_boundary); 2941 end_stripe >>= spdk_u32log2(io_boundary); 2942 } else { 2943 start_stripe /= io_boundary; 2944 end_stripe /= io_boundary; 2945 } 2946 2947 if (start_stripe != end_stripe) { 2948 return true; 2949 } 2950 } 2951 2952 if (max_segs) { 2953 if (bdev_io->u.bdev.iovcnt > max_segs) { 2954 return true; 2955 } 2956 } 2957 2958 if (max_segment_size) { 2959 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2960 if (bdev_io->u.bdev.iovs[i].iov_len > max_segment_size) { 2961 return true; 2962 } 2963 } 2964 } 2965 2966 if (max_size) { 2967 if (bdev_io->u.bdev.num_blocks > max_size) { 2968 return true; 2969 } 2970 } 2971 2972 return false; 2973 } 2974 2975 static bool 2976 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2977 { 2978 uint32_t num_unmap_segments; 2979 2980 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2981 return false; 2982 } 2983 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2984 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2985 return true; 2986 } 2987 2988 return false; 2989 } 2990 2991 static bool 2992 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2993 { 2994 if (!bdev_io->bdev->max_write_zeroes) { 2995 return false; 2996 } 2997 2998 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2999 return true; 3000 } 3001 3002 return false; 3003 } 3004 3005 static bool 3006 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 3007 { 3008 if (bdev_io->bdev->max_copy != 0 && 3009 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 3010 return true; 3011 } 3012 3013 return false; 3014 } 3015 3016 static bool 3017 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 3018 { 3019 switch (bdev_io->type) { 3020 case SPDK_BDEV_IO_TYPE_READ: 3021 case SPDK_BDEV_IO_TYPE_WRITE: 3022 return bdev_rw_should_split(bdev_io); 3023 case SPDK_BDEV_IO_TYPE_UNMAP: 3024 return bdev_unmap_should_split(bdev_io); 3025 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3026 return bdev_write_zeroes_should_split(bdev_io); 3027 case SPDK_BDEV_IO_TYPE_COPY: 3028 return bdev_copy_should_split(bdev_io); 3029 default: 3030 return false; 3031 } 3032 } 3033 3034 static uint32_t 3035 _to_next_boundary(uint64_t offset, uint32_t boundary) 3036 { 3037 return (boundary - (offset % boundary)); 3038 } 3039 3040 static void 
bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 3041 3042 static void _bdev_rw_split(void *_bdev_io); 3043 3044 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 3045 3046 static void 3047 _bdev_unmap_split(void *_bdev_io) 3048 { 3049 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 3050 } 3051 3052 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 3053 3054 static void 3055 _bdev_write_zeroes_split(void *_bdev_io) 3056 { 3057 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 3058 } 3059 3060 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 3061 3062 static void 3063 _bdev_copy_split(void *_bdev_io) 3064 { 3065 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 3066 } 3067 3068 static int 3069 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 3070 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 3071 { 3072 int rc; 3073 uint64_t current_offset, current_remaining, current_src_offset; 3074 spdk_bdev_io_wait_cb io_wait_fn; 3075 3076 current_offset = *offset; 3077 current_remaining = *remaining; 3078 3079 assert(bdev_io->internal.f.split); 3080 3081 bdev_io->internal.split.outstanding++; 3082 3083 io_wait_fn = _bdev_rw_split; 3084 switch (bdev_io->type) { 3085 case SPDK_BDEV_IO_TYPE_READ: 3086 assert(bdev_io->u.bdev.accel_sequence == NULL); 3087 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 3088 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3089 iov, iovcnt, md_buf, current_offset, 3090 num_blocks, 3091 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 3092 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL, 3093 NULL, 3094 bdev_io->u.bdev.dif_check_flags, 3095 bdev_io_split_done, bdev_io); 3096 break; 3097 case SPDK_BDEV_IO_TYPE_WRITE: 3098 assert(bdev_io->u.bdev.accel_sequence == NULL); 3099 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 3100 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3101 iov, iovcnt, md_buf, current_offset, 3102 num_blocks, 3103 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 3104 bdev_io_use_memory_domain(bdev_io) ? 
bdev_io->internal.memory_domain_ctx : NULL, 3105 NULL, 3106 bdev_io->u.bdev.dif_check_flags, 3107 bdev_io->u.bdev.nvme_cdw12.raw, 3108 bdev_io->u.bdev.nvme_cdw13.raw, 3109 bdev_io_split_done, bdev_io); 3110 break; 3111 case SPDK_BDEV_IO_TYPE_UNMAP: 3112 io_wait_fn = _bdev_unmap_split; 3113 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 3114 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3115 current_offset, num_blocks, 3116 bdev_io_split_done, bdev_io); 3117 break; 3118 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3119 io_wait_fn = _bdev_write_zeroes_split; 3120 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 3121 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3122 current_offset, num_blocks, 3123 bdev_io_split_done, bdev_io); 3124 break; 3125 case SPDK_BDEV_IO_TYPE_COPY: 3126 io_wait_fn = _bdev_copy_split; 3127 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 3128 (current_offset - bdev_io->u.bdev.offset_blocks); 3129 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 3130 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3131 current_offset, current_src_offset, num_blocks, 3132 bdev_io_split_done, bdev_io); 3133 break; 3134 default: 3135 assert(false); 3136 rc = -EINVAL; 3137 break; 3138 } 3139 3140 if (rc == 0) { 3141 current_offset += num_blocks; 3142 current_remaining -= num_blocks; 3143 bdev_io->internal.split.current_offset_blocks = current_offset; 3144 bdev_io->internal.split.remaining_num_blocks = current_remaining; 3145 *offset = current_offset; 3146 *remaining = current_remaining; 3147 } else { 3148 bdev_io->internal.split.outstanding--; 3149 if (rc == -ENOMEM) { 3150 if (bdev_io->internal.split.outstanding == 0) { 3151 /* No I/O is outstanding. Hence we should wait here. */ 3152 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 3153 } 3154 } else { 3155 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3156 if (bdev_io->internal.split.outstanding == 0) { 3157 bdev_ch_remove_from_io_submitted(bdev_io); 3158 spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id, 3159 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx, 3160 bdev_io->internal.ch->queue_depth); 3161 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3162 } 3163 } 3164 } 3165 3166 return rc; 3167 } 3168 3169 static void 3170 _bdev_rw_split(void *_bdev_io) 3171 { 3172 struct iovec *parent_iov, *iov; 3173 struct spdk_bdev_io *bdev_io = _bdev_io; 3174 struct spdk_bdev *bdev = bdev_io->bdev; 3175 uint64_t parent_offset, current_offset, remaining; 3176 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 3177 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 3178 uint32_t iovcnt, iov_len, child_iovsize; 3179 uint32_t blocklen = bdev->blocklen; 3180 uint32_t io_boundary; 3181 uint32_t max_segment_size = bdev->max_segment_size; 3182 uint32_t max_child_iovcnt = bdev->max_num_segments; 3183 uint32_t max_size = bdev->max_rw_size; 3184 void *md_buf = NULL; 3185 int rc; 3186 3187 max_size = max_size ? max_size : UINT32_MAX; 3188 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 3189 max_child_iovcnt = max_child_iovcnt ? 
spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 3190 SPDK_BDEV_IO_NUM_CHILD_IOV; 3191 3192 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 3193 io_boundary = bdev->write_unit_size; 3194 } else if (bdev->split_on_optimal_io_boundary) { 3195 io_boundary = bdev->optimal_io_boundary; 3196 } else { 3197 io_boundary = UINT32_MAX; 3198 } 3199 3200 assert(bdev_io->internal.f.split); 3201 3202 remaining = bdev_io->internal.split.remaining_num_blocks; 3203 current_offset = bdev_io->internal.split.current_offset_blocks; 3204 parent_offset = bdev_io->u.bdev.offset_blocks; 3205 parent_iov_offset = (current_offset - parent_offset) * blocklen; 3206 parent_iovcnt = bdev_io->u.bdev.iovcnt; 3207 3208 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 3209 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3210 if (parent_iov_offset < parent_iov->iov_len) { 3211 break; 3212 } 3213 parent_iov_offset -= parent_iov->iov_len; 3214 } 3215 3216 child_iovcnt = 0; 3217 while (remaining > 0 && parent_iovpos < parent_iovcnt && 3218 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 3219 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3220 to_next_boundary = spdk_min(remaining, to_next_boundary); 3221 to_next_boundary = spdk_min(max_size, to_next_boundary); 3222 to_next_boundary_bytes = to_next_boundary * blocklen; 3223 3224 iov = &bdev_io->child_iov[child_iovcnt]; 3225 iovcnt = 0; 3226 3227 if (bdev_io->u.bdev.md_buf) { 3228 md_buf = (char *)bdev_io->u.bdev.md_buf + 3229 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3230 } 3231 3232 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3233 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3234 iovcnt < child_iovsize) { 3235 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3236 iov_len = parent_iov->iov_len - parent_iov_offset; 3237 3238 iov_len = spdk_min(iov_len, max_segment_size); 3239 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 3240 to_next_boundary_bytes -= iov_len; 3241 3242 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 3243 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 3244 3245 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 3246 parent_iov_offset += iov_len; 3247 } else { 3248 parent_iovpos++; 3249 parent_iov_offset = 0; 3250 } 3251 child_iovcnt++; 3252 iovcnt++; 3253 } 3254 3255 if (to_next_boundary_bytes > 0) { 3256 /* We had to stop this child I/O early because we ran out of 3257 * child_iov space or were limited by max_num_segments. 3258 * Ensure the iovs to be aligned with block size and 3259 * then adjust to_next_boundary before starting the 3260 * child I/O. 
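 * Any partial tail that does not fill a whole block is trimmed from the last child iovs and the parent position is rewound, so those bytes are picked up by the next child I/O.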
3261 */ 3262 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || 3263 iovcnt == child_iovsize); 3264 to_last_block_bytes = to_next_boundary_bytes % blocklen; 3265 if (to_last_block_bytes != 0) { 3266 uint32_t child_iovpos = child_iovcnt - 1; 3267 /* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV 3268 * so the loop will naturally end 3269 */ 3270 3271 to_last_block_bytes = blocklen - to_last_block_bytes; 3272 to_next_boundary_bytes += to_last_block_bytes; 3273 while (to_last_block_bytes > 0 && iovcnt > 0) { 3274 iov_len = spdk_min(to_last_block_bytes, 3275 bdev_io->child_iov[child_iovpos].iov_len); 3276 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 3277 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 3278 child_iovpos--; 3279 if (--iovcnt == 0) { 3280 /* If the child IO is less than a block size, just return. 3281 * If the first child IO of any split round is less than 3282 * a block size, exit with an error. 3283 */ 3284 if (bdev_io->internal.split.outstanding == 0) { 3285 SPDK_ERRLOG("The first child io was less than a block size\n"); 3286 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3287 bdev_ch_remove_from_io_submitted(bdev_io); 3288 spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id, 3289 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx, 3290 bdev_io->internal.ch->queue_depth); 3291 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3292 } 3293 3294 return; 3295 } 3296 } 3297 3298 to_last_block_bytes -= iov_len; 3299 3300 if (parent_iov_offset == 0) { 3301 parent_iovpos--; 3302 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 3303 } 3304 parent_iov_offset -= iov_len; 3305 } 3306 3307 assert(to_last_block_bytes == 0); 3308 } 3309 to_next_boundary -= to_next_boundary_bytes / blocklen; 3310 } 3311 3312 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 3313 &current_offset, &remaining); 3314 if (spdk_unlikely(rc)) { 3315 return; 3316 } 3317 } 3318 } 3319 3320 static void 3321 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 3322 { 3323 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 3324 uint32_t num_children_reqs = 0; 3325 int rc; 3326 3327 assert(bdev_io->internal.f.split); 3328 3329 offset = bdev_io->internal.split.current_offset_blocks; 3330 remaining = bdev_io->internal.split.remaining_num_blocks; 3331 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 3332 3333 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3334 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 3335 3336 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 3337 &offset, &remaining); 3338 if (spdk_likely(rc == 0)) { 3339 num_children_reqs++; 3340 } else { 3341 return; 3342 } 3343 } 3344 } 3345 3346 static void 3347 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 3348 { 3349 uint64_t offset, write_zeroes_blocks, remaining; 3350 uint32_t num_children_reqs = 0; 3351 int rc; 3352 3353 assert(bdev_io->internal.f.split); 3354 3355 offset = bdev_io->internal.split.current_offset_blocks; 3356 remaining = bdev_io->internal.split.remaining_num_blocks; 3357 3358 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3359 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 3360 3361 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 3362 &offset, &remaining); 3363 if (spdk_likely(rc == 0)) { 3364 num_children_reqs++; 3365 } else {
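/* Child submission did not succeed; splitting either resumes later (via io_wait or a child completion) or the parent has been failed, so stop issuing further children. */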
3366 return; 3367 } 3368 } 3369 } 3370 3371 static void 3372 bdev_copy_split(struct spdk_bdev_io *bdev_io) 3373 { 3374 uint64_t offset, copy_blocks, remaining; 3375 uint32_t num_children_reqs = 0; 3376 int rc; 3377 3378 assert(bdev_io->internal.f.split); 3379 3380 offset = bdev_io->internal.split.current_offset_blocks; 3381 remaining = bdev_io->internal.split.remaining_num_blocks; 3382 3383 assert(bdev_io->bdev->max_copy != 0); 3384 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3385 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3386 3387 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3388 &offset, &remaining); 3389 if (spdk_likely(rc == 0)) { 3390 num_children_reqs++; 3391 } else { 3392 return; 3393 } 3394 } 3395 } 3396 3397 static void 3398 parent_bdev_io_complete(void *ctx, int rc) 3399 { 3400 struct spdk_bdev_io *parent_io = ctx; 3401 3402 if (rc) { 3403 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3404 } 3405 3406 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3407 parent_io->internal.caller_ctx); 3408 } 3409 3410 static void 3411 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3412 { 3413 struct spdk_bdev_io *bdev_io = ctx; 3414 3415 /* u.bdev.accel_sequence should have already been cleared at this point */ 3416 assert(bdev_io->u.bdev.accel_sequence == NULL); 3417 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3418 bdev_io->internal.f.has_accel_sequence = false; 3419 3420 if (spdk_unlikely(status != 0)) { 3421 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3422 } 3423 3424 parent_bdev_io_complete(bdev_io, status); 3425 } 3426 3427 static void 3428 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3429 { 3430 struct spdk_bdev_io *parent_io = cb_arg; 3431 3432 spdk_bdev_free_io(bdev_io); 3433 3434 assert(parent_io->internal.f.split); 3435 3436 if (!success) { 3437 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3438 /* If any child I/O failed, stop further splitting process. */ 3439 parent_io->internal.split.current_offset_blocks += parent_io->internal.split.remaining_num_blocks; 3440 parent_io->internal.split.remaining_num_blocks = 0; 3441 } 3442 parent_io->internal.split.outstanding--; 3443 if (parent_io->internal.split.outstanding != 0) { 3444 return; 3445 } 3446 3447 /* 3448 * Parent I/O finishes when all blocks are consumed. 3449 */ 3450 if (parent_io->internal.split.remaining_num_blocks == 0) { 3451 assert(parent_io->internal.cb != bdev_io_split_done); 3452 bdev_ch_remove_from_io_submitted(parent_io); 3453 spdk_trace_record(TRACE_BDEV_IO_DONE, parent_io->internal.ch->trace_id, 3454 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx, 3455 parent_io->internal.ch->queue_depth); 3456 3457 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3458 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3459 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3460 return; 3461 } else if (parent_io->internal.f.has_bounce_buf && 3462 !bdev_io_use_accel_sequence(bdev_io)) { 3463 /* bdev IO will be completed in the callback */ 3464 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3465 return; 3466 } 3467 } 3468 3469 parent_bdev_io_complete(parent_io, 0); 3470 return; 3471 } 3472 3473 /* 3474 * Continue with the splitting process. This function will complete the parent I/O if the 3475 * splitting is done. 
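 * Each routine below issues the next batch of child I/Os and is re-entered from this callback until remaining_num_blocks reaches zero.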
3476 */ 3477 switch (parent_io->type) { 3478 case SPDK_BDEV_IO_TYPE_READ: 3479 case SPDK_BDEV_IO_TYPE_WRITE: 3480 _bdev_rw_split(parent_io); 3481 break; 3482 case SPDK_BDEV_IO_TYPE_UNMAP: 3483 bdev_unmap_split(parent_io); 3484 break; 3485 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3486 bdev_write_zeroes_split(parent_io); 3487 break; 3488 case SPDK_BDEV_IO_TYPE_COPY: 3489 bdev_copy_split(parent_io); 3490 break; 3491 default: 3492 assert(false); 3493 break; 3494 } 3495 } 3496 3497 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3498 bool success); 3499 3500 static void 3501 bdev_io_split(struct spdk_bdev_io *bdev_io) 3502 { 3503 assert(bdev_io_should_split(bdev_io)); 3504 assert(bdev_io->internal.f.split); 3505 3506 bdev_io->internal.split.current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3507 bdev_io->internal.split.remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3508 bdev_io->internal.split.outstanding = 0; 3509 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3510 3511 switch (bdev_io->type) { 3512 case SPDK_BDEV_IO_TYPE_READ: 3513 case SPDK_BDEV_IO_TYPE_WRITE: 3514 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3515 _bdev_rw_split(bdev_io); 3516 } else { 3517 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3518 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3519 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3520 } 3521 break; 3522 case SPDK_BDEV_IO_TYPE_UNMAP: 3523 bdev_unmap_split(bdev_io); 3524 break; 3525 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3526 bdev_write_zeroes_split(bdev_io); 3527 break; 3528 case SPDK_BDEV_IO_TYPE_COPY: 3529 bdev_copy_split(bdev_io); 3530 break; 3531 default: 3532 assert(false); 3533 break; 3534 } 3535 } 3536 3537 static void 3538 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3539 { 3540 if (!success) { 3541 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3542 return; 3543 } 3544 3545 _bdev_rw_split(bdev_io); 3546 } 3547 3548 static inline void 3549 _bdev_io_submit(struct spdk_bdev_io *bdev_io) 3550 { 3551 struct spdk_bdev *bdev = bdev_io->bdev; 3552 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3553 3554 if (spdk_likely(bdev_ch->flags == 0)) { 3555 bdev_io_do_submit(bdev_ch, bdev_io); 3556 return; 3557 } 3558 3559 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3560 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3561 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3562 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3563 bdev_abort_queued_io(&bdev_ch->qos_queued_io, bdev_io->u.abort.bio_to_abort)) { 3564 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3565 } else { 3566 TAILQ_INSERT_TAIL(&bdev_ch->qos_queued_io, bdev_io, internal.link); 3567 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3568 } 3569 } else { 3570 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3571 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3572 } 3573 } 3574 3575 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3576 3577 bool 3578 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3579 { 3580 if (range1->length == 0 || range2->length == 0) { 3581 return false; 3582 } 3583 3584 if (range1->offset + range1->length <= range2->offset) { 3585 return false; 3586 } 3587 3588 if (range2->offset + range2->length <= range1->offset) { 3589 return false; 3590 } 3591 3592 
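/* Neither range is empty and neither ends at or before the other's start, so the ranges overlap. */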
return true; 3593 } 3594 3595 static bool 3596 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3597 { 3598 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3599 struct lba_range r; 3600 3601 switch (bdev_io->type) { 3602 case SPDK_BDEV_IO_TYPE_NVME_IO: 3603 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3604 /* Don't try to decode the NVMe command - just assume worst-case and that 3605 * it overlaps a locked range. 3606 */ 3607 return true; 3608 case SPDK_BDEV_IO_TYPE_READ: 3609 if (!range->quiesce) { 3610 return false; 3611 } 3612 /* fallthrough */ 3613 case SPDK_BDEV_IO_TYPE_WRITE: 3614 case SPDK_BDEV_IO_TYPE_UNMAP: 3615 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3616 case SPDK_BDEV_IO_TYPE_ZCOPY: 3617 case SPDK_BDEV_IO_TYPE_COPY: 3618 r.offset = bdev_io->u.bdev.offset_blocks; 3619 r.length = bdev_io->u.bdev.num_blocks; 3620 if (!bdev_lba_range_overlapped(range, &r)) { 3621 /* This I/O doesn't overlap the specified LBA range. */ 3622 return false; 3623 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3624 /* This I/O overlaps, but the I/O is on the same channel that locked this 3625 * range, and the caller_ctx is the same as the locked_ctx. This means 3626 * that this I/O is associated with the lock, and is allowed to execute. 3627 */ 3628 return false; 3629 } else { 3630 return true; 3631 } 3632 default: 3633 return false; 3634 } 3635 } 3636 3637 void 3638 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3639 { 3640 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3641 3642 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3643 3644 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3645 struct lba_range *range; 3646 3647 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3648 if (bdev_io_range_is_locked(bdev_io, range)) { 3649 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3650 return; 3651 } 3652 } 3653 } 3654 3655 bdev_ch_add_to_io_submitted(bdev_io); 3656 3657 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3658 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 3659 ch->trace_id, bdev_io->u.bdev.num_blocks, 3660 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3661 bdev_io->u.bdev.offset_blocks, ch->queue_depth); 3662 3663 if (bdev_io->internal.f.split) { 3664 bdev_io_split(bdev_io); 3665 return; 3666 } 3667 3668 _bdev_io_submit(bdev_io); 3669 } 3670 3671 static inline void 3672 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3673 { 3674 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 3675 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3676 * For write operation we need to pull buffers from memory domain before submitting IO. 
3677 * Once read operation completes, we need to use memory_domain push functionality to 3678 * update data in original memory domain IO buffer 3679 * This IO request will go through a regular IO flow, so clear memory domains pointers */ 3680 assert(bdev_io->internal.f.has_memory_domain); 3681 bdev_io->u.bdev.memory_domain = NULL; 3682 bdev_io->u.bdev.memory_domain_ctx = NULL; 3683 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3684 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3685 } 3686 3687 static inline void 3688 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3689 { 3690 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3691 bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io); 3692 3693 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3694 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3695 bdev_io_complete_unsubmitted(bdev_io); 3696 return; 3697 } 3698 3699 /* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does 3700 * support them, but we need to execute an accel sequence and the data buffer is from accel 3701 * memory domain (to avoid doing a push/pull from that domain). 3702 */ 3703 if (bdev_io_use_memory_domain(bdev_io)) { 3704 if (!desc->memory_domains_supported || 3705 (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3706 _bdev_io_ext_use_bounce_buffer(bdev_io); 3707 return; 3708 } 3709 } 3710 3711 if (needs_exec) { 3712 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3713 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3714 return; 3715 } 3716 /* For reads we'll execute the sequence after the data is read, so, for now, only 3717 * clear out accel_sequence pointer and submit the IO */ 3718 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3719 bdev_io->u.bdev.accel_sequence = NULL; 3720 } 3721 3722 bdev_io_submit(bdev_io); 3723 } 3724 3725 static void 3726 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3727 { 3728 struct spdk_bdev *bdev = bdev_io->bdev; 3729 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3730 struct spdk_io_channel *ch = bdev_ch->channel; 3731 3732 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3733 3734 bdev_io->internal.f.in_submit_request = true; 3735 bdev_submit_request(bdev, ch, bdev_io); 3736 bdev_io->internal.f.in_submit_request = false; 3737 } 3738 3739 void 3740 bdev_io_init(struct spdk_bdev_io *bdev_io, 3741 struct spdk_bdev *bdev, void *cb_arg, 3742 spdk_bdev_io_completion_cb cb) 3743 { 3744 bdev_io->bdev = bdev; 3745 bdev_io->internal.f.raw = 0; 3746 bdev_io->internal.caller_ctx = cb_arg; 3747 bdev_io->internal.cb = cb; 3748 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3749 bdev_io->internal.f.in_submit_request = false; 3750 bdev_io->internal.error.nvme.cdw0 = 0; 3751 bdev_io->num_retries = 0; 3752 bdev_io->internal.get_buf_cb = NULL; 3753 bdev_io->internal.get_aux_buf_cb = NULL; 3754 bdev_io->internal.data_transfer_cpl = NULL; 3755 bdev_io->internal.f.split = bdev_io_should_split(bdev_io); 3756 } 3757 3758 static bool 3759 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3760 { 3761 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3762 } 3763 3764 bool 3765 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3766 { 3767 bool supported; 3768 3769 supported = bdev_io_type_supported(bdev, io_type); 3770 3771 if (!supported) { 3772 switch (io_type) { 3773 
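		/* Check whether the bdev layer can emulate the requested I/O type on top of
		 * another operation that the module does support.
		 */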
case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3774 /* The bdev layer will emulate write zeroes as long as write is supported. */ 3775 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3776 break; 3777 default: 3778 break; 3779 } 3780 } 3781 3782 return supported; 3783 } 3784 3785 static const char *g_io_type_strings[] = { 3786 [SPDK_BDEV_IO_TYPE_READ] = "read", 3787 [SPDK_BDEV_IO_TYPE_WRITE] = "write", 3788 [SPDK_BDEV_IO_TYPE_UNMAP] = "unmap", 3789 [SPDK_BDEV_IO_TYPE_FLUSH] = "flush", 3790 [SPDK_BDEV_IO_TYPE_RESET] = "reset", 3791 [SPDK_BDEV_IO_TYPE_NVME_ADMIN] = "nvme_admin", 3792 [SPDK_BDEV_IO_TYPE_NVME_IO] = "nvme_io", 3793 [SPDK_BDEV_IO_TYPE_NVME_IO_MD] = "nvme_io_md", 3794 [SPDK_BDEV_IO_TYPE_WRITE_ZEROES] = "write_zeroes", 3795 [SPDK_BDEV_IO_TYPE_ZCOPY] = "zcopy", 3796 [SPDK_BDEV_IO_TYPE_GET_ZONE_INFO] = "get_zone_info", 3797 [SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT] = "zone_management", 3798 [SPDK_BDEV_IO_TYPE_ZONE_APPEND] = "zone_append", 3799 [SPDK_BDEV_IO_TYPE_COMPARE] = "compare", 3800 [SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE] = "compare_and_write", 3801 [SPDK_BDEV_IO_TYPE_ABORT] = "abort", 3802 [SPDK_BDEV_IO_TYPE_SEEK_HOLE] = "seek_hole", 3803 [SPDK_BDEV_IO_TYPE_SEEK_DATA] = "seek_data", 3804 [SPDK_BDEV_IO_TYPE_COPY] = "copy", 3805 [SPDK_BDEV_IO_TYPE_NVME_IOV_MD] = "nvme_iov_md", 3806 }; 3807 3808 const char * 3809 spdk_bdev_get_io_type_name(enum spdk_bdev_io_type io_type) 3810 { 3811 if (io_type <= SPDK_BDEV_IO_TYPE_INVALID || io_type >= SPDK_BDEV_NUM_IO_TYPES) { 3812 return NULL; 3813 } 3814 3815 return g_io_type_strings[io_type]; 3816 } 3817 3818 int 3819 spdk_bdev_get_io_type(const char *io_type_string) 3820 { 3821 int i; 3822 3823 for (i = SPDK_BDEV_IO_TYPE_READ; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 3824 if (!strcmp(io_type_string, g_io_type_strings[i])) { 3825 return i; 3826 } 3827 } 3828 3829 return -1; 3830 } 3831 3832 uint64_t 3833 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3834 { 3835 return bdev_io->internal.submit_tsc; 3836 } 3837 3838 int 3839 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3840 { 3841 if (bdev->fn_table->dump_info_json) { 3842 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3843 } 3844 3845 return 0; 3846 } 3847 3848 static void 3849 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3850 { 3851 uint32_t max_per_timeslice = 0; 3852 int i; 3853 3854 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3855 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3856 qos->rate_limits[i].max_per_timeslice = 0; 3857 continue; 3858 } 3859 3860 max_per_timeslice = qos->rate_limits[i].limit * 3861 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3862 3863 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3864 qos->rate_limits[i].min_per_timeslice); 3865 3866 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 3867 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELEASE); 3868 } 3869 3870 bdev_qos_set_ops(qos); 3871 } 3872 3873 static void 3874 bdev_channel_submit_qos_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3875 struct spdk_io_channel *io_ch, void *ctx) 3876 { 3877 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3878 int status; 3879 3880 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3881 3882 /* if all IOs were sent then continue the iteration, otherwise - stop it */ 3883 /* TODO: channels round robing */ 3884 status = TAILQ_EMPTY(&bdev_ch->qos_queued_io) ? 
		 0 : 1;

	spdk_bdev_for_each_channel_continue(i, status);
}


static void
bdev_channel_submit_qos_io_done(struct spdk_bdev *bdev, void *ctx, int status)
{

}

static int
bdev_channel_poll_qos(void *arg)
{
	struct spdk_bdev *bdev = arg;
	struct spdk_bdev_qos *qos = bdev->internal.qos;
	uint64_t now = spdk_get_ticks();
	int i;
	int64_t remaining_last_timeslice;

	if (spdk_unlikely(qos->thread == NULL)) {
		/* The old QoS structure was unbound so that it can be destroyed, and the
		 * new QoS has not been enabled yet. */
		return SPDK_POLLER_IDLE;
	}

	if (now < (qos->last_timeslice + qos->timeslice_size)) {
		/* We received our callback earlier than expected - return
		 * immediately and wait to do accounting until at least one
		 * timeslice has actually expired. This should never happen
		 * with a well-behaved timer implementation.
		 */
		return SPDK_POLLER_IDLE;
	}

	/* Reset for the next round of rate limiting */
	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		/* We may have allowed the IOs or bytes to slightly overrun in the last
		 * timeslice. remaining_this_timeslice is signed, so if it's negative
		 * here, we'll account for the overrun so that the next timeslice will
		 * be appropriately reduced.
		 */
		remaining_last_timeslice = __atomic_exchange_n(&qos->rate_limits[i].remaining_this_timeslice,
					   0, __ATOMIC_RELAXED);
		if (remaining_last_timeslice < 0) {
			/* There could be a race condition here as both bdev_qos_rw_queue_io() and bdev_channel_poll_qos()
			 * potentially use 2 atomic ops each, so they can intertwine.
			 * This race can potentially cause the limits to be a little fuzzy but won't cause any real damage.
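			 * Writing the negative remainder back below preserves that deficit, so the
			 * overrun is charged against the next timeslice when max_per_timeslice is added.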
3932 */ 3933 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 3934 remaining_last_timeslice, __ATOMIC_RELAXED); 3935 } 3936 } 3937 3938 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3939 qos->last_timeslice += qos->timeslice_size; 3940 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3941 __atomic_add_fetch(&qos->rate_limits[i].remaining_this_timeslice, 3942 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELAXED); 3943 } 3944 } 3945 3946 spdk_bdev_for_each_channel(bdev, bdev_channel_submit_qos_io, qos, 3947 bdev_channel_submit_qos_io_done); 3948 3949 return SPDK_POLLER_BUSY; 3950 } 3951 3952 static void 3953 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3954 { 3955 struct spdk_bdev_shared_resource *shared_resource; 3956 struct lba_range *range; 3957 3958 bdev_free_io_stat(ch->stat); 3959 #ifdef SPDK_CONFIG_VTUNE 3960 bdev_free_io_stat(ch->prev_stat); 3961 #endif 3962 3963 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3964 range = TAILQ_FIRST(&ch->locked_ranges); 3965 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3966 free(range); 3967 } 3968 3969 spdk_put_io_channel(ch->channel); 3970 spdk_put_io_channel(ch->accel_channel); 3971 3972 shared_resource = ch->shared_resource; 3973 3974 assert(TAILQ_EMPTY(&ch->io_locked)); 3975 assert(TAILQ_EMPTY(&ch->io_submitted)); 3976 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 3977 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 3978 assert(ch->io_outstanding == 0); 3979 assert(shared_resource->ref > 0); 3980 shared_resource->ref--; 3981 if (shared_resource->ref == 0) { 3982 assert(shared_resource->io_outstanding == 0); 3983 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3984 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3985 spdk_poller_unregister(&shared_resource->nomem_poller); 3986 free(shared_resource); 3987 } 3988 } 3989 3990 static void 3991 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3992 { 3993 struct spdk_bdev_qos *qos = bdev->internal.qos; 3994 int i; 3995 3996 assert(spdk_spin_held(&bdev->internal.spinlock)); 3997 3998 /* Rate limiting on this bdev enabled */ 3999 if (qos) { 4000 if (qos->ch == NULL) { 4001 struct spdk_io_channel *io_ch; 4002 4003 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 4004 bdev->name, spdk_get_thread()); 4005 4006 /* No qos channel has been selected, so set one up */ 4007 4008 /* Take another reference to ch */ 4009 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 4010 assert(io_ch != NULL); 4011 qos->ch = ch; 4012 4013 qos->thread = spdk_io_channel_get_thread(io_ch); 4014 4015 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4016 if (bdev_qos_is_iops_rate_limit(i) == true) { 4017 qos->rate_limits[i].min_per_timeslice = 4018 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 4019 } else { 4020 qos->rate_limits[i].min_per_timeslice = 4021 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 4022 } 4023 4024 if (qos->rate_limits[i].limit == 0) { 4025 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 4026 } 4027 } 4028 bdev_qos_update_max_quota_per_timeslice(qos); 4029 qos->timeslice_size = 4030 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 4031 qos->last_timeslice = spdk_get_ticks(); 4032 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 4033 bdev, 4034 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 4035 } 4036 4037 ch->flags |= BDEV_CH_QOS_ENABLED; 4038 } 4039 } 4040 4041 struct poll_timeout_ctx { 4042 struct spdk_bdev_desc 
*desc; 4043 uint64_t timeout_in_sec; 4044 spdk_bdev_io_timeout_cb cb_fn; 4045 void *cb_arg; 4046 }; 4047 4048 static void 4049 bdev_desc_free(struct spdk_bdev_desc *desc) 4050 { 4051 spdk_spin_destroy(&desc->spinlock); 4052 free(desc->media_events_buffer); 4053 free(desc); 4054 } 4055 4056 static void 4057 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 4058 { 4059 struct poll_timeout_ctx *ctx = _ctx; 4060 struct spdk_bdev_desc *desc = ctx->desc; 4061 4062 free(ctx); 4063 4064 spdk_spin_lock(&desc->spinlock); 4065 desc->refs--; 4066 if (desc->closed == true && desc->refs == 0) { 4067 spdk_spin_unlock(&desc->spinlock); 4068 bdev_desc_free(desc); 4069 return; 4070 } 4071 spdk_spin_unlock(&desc->spinlock); 4072 } 4073 4074 static void 4075 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4076 struct spdk_io_channel *io_ch, void *_ctx) 4077 { 4078 struct poll_timeout_ctx *ctx = _ctx; 4079 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4080 struct spdk_bdev_desc *desc = ctx->desc; 4081 struct spdk_bdev_io *bdev_io; 4082 uint64_t now; 4083 4084 spdk_spin_lock(&desc->spinlock); 4085 if (desc->closed == true) { 4086 spdk_spin_unlock(&desc->spinlock); 4087 spdk_bdev_for_each_channel_continue(i, -1); 4088 return; 4089 } 4090 spdk_spin_unlock(&desc->spinlock); 4091 4092 now = spdk_get_ticks(); 4093 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 4094 /* Exclude any I/O that are generated via splitting. */ 4095 if (bdev_io->internal.cb == bdev_io_split_done) { 4096 continue; 4097 } 4098 4099 /* Once we find an I/O that has not timed out, we can immediately 4100 * exit the loop. 4101 */ 4102 if (now < (bdev_io->internal.submit_tsc + 4103 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 4104 goto end; 4105 } 4106 4107 if (bdev_io->internal.desc == desc) { 4108 ctx->cb_fn(ctx->cb_arg, bdev_io); 4109 } 4110 } 4111 4112 end: 4113 spdk_bdev_for_each_channel_continue(i, 0); 4114 } 4115 4116 static int 4117 bdev_poll_timeout_io(void *arg) 4118 { 4119 struct spdk_bdev_desc *desc = arg; 4120 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4121 struct poll_timeout_ctx *ctx; 4122 4123 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 4124 if (!ctx) { 4125 SPDK_ERRLOG("failed to allocate memory\n"); 4126 return SPDK_POLLER_BUSY; 4127 } 4128 ctx->desc = desc; 4129 ctx->cb_arg = desc->cb_arg; 4130 ctx->cb_fn = desc->cb_fn; 4131 ctx->timeout_in_sec = desc->timeout_in_sec; 4132 4133 /* Take a ref on the descriptor in case it gets closed while we are checking 4134 * all of the channels. 
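	 * The reference is dropped in bdev_channel_poll_timeout_io_done() once every channel
	 * has been checked.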
4135 */ 4136 spdk_spin_lock(&desc->spinlock); 4137 desc->refs++; 4138 spdk_spin_unlock(&desc->spinlock); 4139 4140 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 4141 bdev_channel_poll_timeout_io_done); 4142 4143 return SPDK_POLLER_BUSY; 4144 } 4145 4146 int 4147 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 4148 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 4149 { 4150 assert(desc->thread == spdk_get_thread()); 4151 4152 spdk_poller_unregister(&desc->io_timeout_poller); 4153 4154 if (timeout_in_sec) { 4155 assert(cb_fn != NULL); 4156 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 4157 desc, 4158 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 4159 1000); 4160 if (desc->io_timeout_poller == NULL) { 4161 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 4162 return -1; 4163 } 4164 } 4165 4166 desc->cb_fn = cb_fn; 4167 desc->cb_arg = cb_arg; 4168 desc->timeout_in_sec = timeout_in_sec; 4169 4170 return 0; 4171 } 4172 4173 static int 4174 bdev_channel_create(void *io_device, void *ctx_buf) 4175 { 4176 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 4177 struct spdk_bdev_channel *ch = ctx_buf; 4178 struct spdk_io_channel *mgmt_io_ch; 4179 struct spdk_bdev_mgmt_channel *mgmt_ch; 4180 struct spdk_bdev_shared_resource *shared_resource; 4181 struct lba_range *range; 4182 4183 ch->bdev = bdev; 4184 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 4185 if (!ch->channel) { 4186 return -1; 4187 } 4188 4189 ch->accel_channel = spdk_accel_get_io_channel(); 4190 if (!ch->accel_channel) { 4191 spdk_put_io_channel(ch->channel); 4192 return -1; 4193 } 4194 4195 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, bdev->internal.trace_id, 0, 0, 4196 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4197 4198 assert(ch->histogram == NULL); 4199 if (bdev->internal.histogram_enabled) { 4200 ch->histogram = spdk_histogram_data_alloc(); 4201 if (ch->histogram == NULL) { 4202 SPDK_ERRLOG("Could not allocate histogram\n"); 4203 } 4204 } 4205 4206 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 4207 if (!mgmt_io_ch) { 4208 spdk_put_io_channel(ch->channel); 4209 spdk_put_io_channel(ch->accel_channel); 4210 return -1; 4211 } 4212 4213 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 4214 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 4215 if (shared_resource->shared_ch == ch->channel) { 4216 spdk_put_io_channel(mgmt_io_ch); 4217 shared_resource->ref++; 4218 break; 4219 } 4220 } 4221 4222 if (shared_resource == NULL) { 4223 shared_resource = calloc(1, sizeof(*shared_resource)); 4224 if (shared_resource == NULL) { 4225 spdk_put_io_channel(ch->channel); 4226 spdk_put_io_channel(ch->accel_channel); 4227 spdk_put_io_channel(mgmt_io_ch); 4228 return -1; 4229 } 4230 4231 shared_resource->mgmt_ch = mgmt_ch; 4232 shared_resource->io_outstanding = 0; 4233 TAILQ_INIT(&shared_resource->nomem_io); 4234 shared_resource->nomem_threshold = 0; 4235 shared_resource->shared_ch = ch->channel; 4236 shared_resource->ref = 1; 4237 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 4238 } 4239 4240 ch->io_outstanding = 0; 4241 TAILQ_INIT(&ch->locked_ranges); 4242 TAILQ_INIT(&ch->qos_queued_io); 4243 ch->flags = 0; 4244 ch->trace_id = bdev->internal.trace_id; 4245 ch->shared_resource = shared_resource; 4246 4247 TAILQ_INIT(&ch->io_submitted); 4248 TAILQ_INIT(&ch->io_locked); 4249 TAILQ_INIT(&ch->io_accel_exec); 4250 TAILQ_INIT(&ch->io_memory_domain); 4251 4252 ch->stat = bdev_alloc_io_stat(false); 
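	/* Per-channel statistics; they are folded into the bdev-level stats in
	 * bdev_channel_destroy() so nothing is lost when the channel goes away.
	 */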
4253 if (ch->stat == NULL) { 4254 bdev_channel_destroy_resource(ch); 4255 return -1; 4256 } 4257 4258 ch->stat->ticks_rate = spdk_get_ticks_hz(); 4259 4260 #ifdef SPDK_CONFIG_VTUNE 4261 { 4262 char *name; 4263 __itt_init_ittlib(NULL, 0); 4264 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 4265 if (!name) { 4266 bdev_channel_destroy_resource(ch); 4267 return -1; 4268 } 4269 ch->handle = __itt_string_handle_create(name); 4270 free(name); 4271 ch->start_tsc = spdk_get_ticks(); 4272 ch->interval_tsc = spdk_get_ticks_hz() / 100; 4273 ch->prev_stat = bdev_alloc_io_stat(false); 4274 if (ch->prev_stat == NULL) { 4275 bdev_channel_destroy_resource(ch); 4276 return -1; 4277 } 4278 } 4279 #endif 4280 4281 spdk_spin_lock(&bdev->internal.spinlock); 4282 bdev_enable_qos(bdev, ch); 4283 4284 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 4285 struct lba_range *new_range; 4286 4287 new_range = calloc(1, sizeof(*new_range)); 4288 if (new_range == NULL) { 4289 spdk_spin_unlock(&bdev->internal.spinlock); 4290 bdev_channel_destroy_resource(ch); 4291 return -1; 4292 } 4293 new_range->length = range->length; 4294 new_range->offset = range->offset; 4295 new_range->locked_ctx = range->locked_ctx; 4296 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 4297 } 4298 4299 spdk_spin_unlock(&bdev->internal.spinlock); 4300 4301 return 0; 4302 } 4303 4304 static int 4305 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 4306 void *cb_ctx) 4307 { 4308 struct spdk_bdev_channel *bdev_ch = cb_ctx; 4309 struct spdk_bdev_io *bdev_io; 4310 uint64_t buf_len; 4311 4312 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4313 if (bdev_io->internal.ch == bdev_ch) { 4314 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len); 4315 spdk_iobuf_entry_abort(ch, entry, buf_len); 4316 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4317 } 4318 4319 return 0; 4320 } 4321 4322 /* 4323 * Abort I/O that are waiting on a data buffer. 4324 */ 4325 static void 4326 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4327 { 4328 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, bdev_abort_all_buf_io_cb, ch); 4329 } 4330 4331 /* 4332 * Abort I/O that are queued waiting for submission. These types of I/O are 4333 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4334 */ 4335 static void 4336 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4337 { 4338 struct spdk_bdev_io *bdev_io, *tmp; 4339 4340 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4341 if (bdev_io->internal.ch == ch) { 4342 TAILQ_REMOVE(queue, bdev_io, internal.link); 4343 /* 4344 * spdk_bdev_io_complete() assumes that the completed I/O had 4345 * been submitted to the bdev module. Since in this case it 4346 * hadn't, bump io_outstanding to account for the decrement 4347 * that spdk_bdev_io_complete() will do. 
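			 * Reset I/O are not tracked via io_outstanding, which is why they are
			 * excluded from this adjustment.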
4348 */ 4349 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4350 bdev_io_increment_outstanding(ch, ch->shared_resource); 4351 } 4352 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4353 } 4354 } 4355 } 4356 4357 static bool 4358 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4359 { 4360 struct spdk_bdev_io *bdev_io; 4361 4362 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4363 if (bdev_io == bio_to_abort) { 4364 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4365 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4366 return true; 4367 } 4368 } 4369 4370 return false; 4371 } 4372 4373 static int 4374 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4375 { 4376 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4377 uint64_t buf_len; 4378 4379 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4380 if (bdev_io == bio_to_abort) { 4381 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len); 4382 spdk_iobuf_entry_abort(ch, entry, buf_len); 4383 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4384 return 1; 4385 } 4386 4387 return 0; 4388 } 4389 4390 static bool 4391 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4392 { 4393 int rc; 4394 4395 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, bdev_abort_buf_io_cb, bio_to_abort); 4396 return rc == 1; 4397 } 4398 4399 static void 4400 bdev_qos_channel_destroy(void *cb_arg) 4401 { 4402 struct spdk_bdev_qos *qos = cb_arg; 4403 4404 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4405 spdk_poller_unregister(&qos->poller); 4406 4407 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4408 4409 free(qos); 4410 } 4411 4412 static int 4413 bdev_qos_destroy(struct spdk_bdev *bdev) 4414 { 4415 int i; 4416 4417 /* 4418 * Cleanly shutting down the QoS poller is tricky, because 4419 * during the asynchronous operation the user could open 4420 * a new descriptor and create a new channel, spawning 4421 * a new QoS poller. 4422 * 4423 * The strategy is to create a new QoS structure here and swap it 4424 * in. The shutdown path then continues to refer to the old one 4425 * until it completes and then releases it. 4426 */ 4427 struct spdk_bdev_qos *new_qos, *old_qos; 4428 4429 old_qos = bdev->internal.qos; 4430 4431 new_qos = calloc(1, sizeof(*new_qos)); 4432 if (!new_qos) { 4433 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4434 return -ENOMEM; 4435 } 4436 4437 /* Copy the old QoS data into the newly allocated structure */ 4438 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4439 4440 /* Zero out the key parts of the QoS structure */ 4441 new_qos->ch = NULL; 4442 new_qos->thread = NULL; 4443 new_qos->poller = NULL; 4444 /* 4445 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4446 * It will be used later for the new QoS structure. 4447 */ 4448 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4449 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4450 new_qos->rate_limits[i].min_per_timeslice = 0; 4451 new_qos->rate_limits[i].max_per_timeslice = 0; 4452 } 4453 4454 bdev->internal.qos = new_qos; 4455 4456 if (old_qos->thread == NULL) { 4457 free(old_qos); 4458 } else { 4459 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4460 } 4461 4462 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4463 * been destroyed yet. 
The destruction path will end up waiting for the final 4464 * channel to be put before it releases resources. */ 4465 4466 return 0; 4467 } 4468 4469 void 4470 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4471 { 4472 total->bytes_read += add->bytes_read; 4473 total->num_read_ops += add->num_read_ops; 4474 total->bytes_written += add->bytes_written; 4475 total->num_write_ops += add->num_write_ops; 4476 total->bytes_unmapped += add->bytes_unmapped; 4477 total->num_unmap_ops += add->num_unmap_ops; 4478 total->bytes_copied += add->bytes_copied; 4479 total->num_copy_ops += add->num_copy_ops; 4480 total->read_latency_ticks += add->read_latency_ticks; 4481 total->write_latency_ticks += add->write_latency_ticks; 4482 total->unmap_latency_ticks += add->unmap_latency_ticks; 4483 total->copy_latency_ticks += add->copy_latency_ticks; 4484 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4485 total->max_read_latency_ticks = add->max_read_latency_ticks; 4486 } 4487 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4488 total->min_read_latency_ticks = add->min_read_latency_ticks; 4489 } 4490 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4491 total->max_write_latency_ticks = add->max_write_latency_ticks; 4492 } 4493 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4494 total->min_write_latency_ticks = add->min_write_latency_ticks; 4495 } 4496 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4497 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4498 } 4499 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4500 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4501 } 4502 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4503 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4504 } 4505 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4506 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4507 } 4508 } 4509 4510 static void 4511 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4512 { 4513 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4514 4515 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4516 memcpy(to_stat->io_error, from_stat->io_error, 4517 sizeof(struct spdk_bdev_io_error_stat)); 4518 } 4519 } 4520 4521 void 4522 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4523 { 4524 if (mode == SPDK_BDEV_RESET_STAT_NONE) { 4525 return; 4526 } 4527 4528 stat->max_read_latency_ticks = 0; 4529 stat->min_read_latency_ticks = UINT64_MAX; 4530 stat->max_write_latency_ticks = 0; 4531 stat->min_write_latency_ticks = UINT64_MAX; 4532 stat->max_unmap_latency_ticks = 0; 4533 stat->min_unmap_latency_ticks = UINT64_MAX; 4534 stat->max_copy_latency_ticks = 0; 4535 stat->min_copy_latency_ticks = UINT64_MAX; 4536 4537 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4538 return; 4539 } 4540 4541 stat->bytes_read = 0; 4542 stat->num_read_ops = 0; 4543 stat->bytes_written = 0; 4544 stat->num_write_ops = 0; 4545 stat->bytes_unmapped = 0; 4546 stat->num_unmap_ops = 0; 4547 stat->bytes_copied = 0; 4548 stat->num_copy_ops = 0; 4549 stat->read_latency_ticks = 0; 4550 stat->write_latency_ticks = 0; 4551 stat->unmap_latency_ticks = 0; 4552 stat->copy_latency_ticks = 0; 4553 4554 if (stat->io_error != NULL) { 4555 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 
4556 } 4557 } 4558 4559 struct spdk_bdev_io_stat * 4560 bdev_alloc_io_stat(bool io_error_stat) 4561 { 4562 struct spdk_bdev_io_stat *stat; 4563 4564 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4565 if (stat == NULL) { 4566 return NULL; 4567 } 4568 4569 if (io_error_stat) { 4570 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4571 if (stat->io_error == NULL) { 4572 free(stat); 4573 return NULL; 4574 } 4575 } else { 4576 stat->io_error = NULL; 4577 } 4578 4579 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4580 4581 return stat; 4582 } 4583 4584 void 4585 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4586 { 4587 if (stat != NULL) { 4588 free(stat->io_error); 4589 free(stat); 4590 } 4591 } 4592 4593 void 4594 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4595 { 4596 int i; 4597 4598 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4599 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4600 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4601 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4602 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4603 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4604 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4605 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4606 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4607 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4608 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4609 stat->min_read_latency_ticks != UINT64_MAX ? 4610 stat->min_read_latency_ticks : 0); 4611 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4612 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4613 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4614 stat->min_write_latency_ticks != UINT64_MAX ? 4615 stat->min_write_latency_ticks : 0); 4616 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4617 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4618 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4619 stat->min_unmap_latency_ticks != UINT64_MAX ? 4620 stat->min_unmap_latency_ticks : 0); 4621 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4622 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4623 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4624 stat->min_copy_latency_ticks != UINT64_MAX ? 
4625 stat->min_copy_latency_ticks : 0); 4626 4627 if (stat->io_error != NULL) { 4628 spdk_json_write_named_object_begin(w, "io_error"); 4629 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4630 if (stat->io_error->error_status[i] != 0) { 4631 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4632 stat->io_error->error_status[i]); 4633 } 4634 } 4635 spdk_json_write_object_end(w); 4636 } 4637 } 4638 4639 static void 4640 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4641 { 4642 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4643 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4644 4645 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4646 bdev_abort_all_buf_io(mgmt_ch, ch); 4647 } 4648 4649 static void 4650 bdev_channel_destroy(void *io_device, void *ctx_buf) 4651 { 4652 struct spdk_bdev_channel *ch = ctx_buf; 4653 4654 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4655 spdk_get_thread()); 4656 4657 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, ch->bdev->internal.trace_id, 0, 0, 4658 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4659 4660 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4661 spdk_spin_lock(&ch->bdev->internal.spinlock); 4662 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4663 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4664 4665 bdev_channel_abort_queued_ios(ch); 4666 4667 if (ch->histogram) { 4668 spdk_histogram_data_free(ch->histogram); 4669 } 4670 4671 bdev_channel_destroy_resource(ch); 4672 } 4673 4674 /* 4675 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4676 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
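 * If a duplicate already exists, the freshly strdup()'d name is freed and -EEXIST is returned.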
4677 */ 4678 static int 4679 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4680 { 4681 struct spdk_bdev_name *tmp; 4682 4683 bdev_name->name = strdup(name); 4684 if (bdev_name->name == NULL) { 4685 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4686 return -ENOMEM; 4687 } 4688 4689 bdev_name->bdev = bdev; 4690 4691 spdk_spin_lock(&g_bdev_mgr.spinlock); 4692 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4693 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4694 4695 if (tmp != NULL) { 4696 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4697 free(bdev_name->name); 4698 return -EEXIST; 4699 } 4700 4701 return 0; 4702 } 4703 4704 static void 4705 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4706 { 4707 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4708 free(bdev_name->name); 4709 } 4710 4711 static void 4712 bdev_name_del(struct spdk_bdev_name *bdev_name) 4713 { 4714 spdk_spin_lock(&g_bdev_mgr.spinlock); 4715 bdev_name_del_unsafe(bdev_name); 4716 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4717 } 4718 4719 int 4720 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4721 { 4722 struct spdk_bdev_alias *tmp; 4723 int ret; 4724 4725 if (alias == NULL) { 4726 SPDK_ERRLOG("Empty alias passed\n"); 4727 return -EINVAL; 4728 } 4729 4730 tmp = calloc(1, sizeof(*tmp)); 4731 if (tmp == NULL) { 4732 SPDK_ERRLOG("Unable to allocate alias\n"); 4733 return -ENOMEM; 4734 } 4735 4736 ret = bdev_name_add(&tmp->alias, bdev, alias); 4737 if (ret != 0) { 4738 free(tmp); 4739 return ret; 4740 } 4741 4742 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4743 4744 return 0; 4745 } 4746 4747 static int 4748 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4749 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4750 { 4751 struct spdk_bdev_alias *tmp; 4752 4753 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4754 if (strcmp(alias, tmp->alias.name) == 0) { 4755 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4756 alias_del_fn(&tmp->alias); 4757 free(tmp); 4758 return 0; 4759 } 4760 } 4761 4762 return -ENOENT; 4763 } 4764 4765 int 4766 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4767 { 4768 int rc; 4769 4770 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4771 if (rc == -ENOENT) { 4772 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4773 } 4774 4775 return rc; 4776 } 4777 4778 void 4779 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4780 { 4781 struct spdk_bdev_alias *p, *tmp; 4782 4783 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4784 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4785 bdev_name_del(&p->alias); 4786 free(p); 4787 } 4788 } 4789 4790 struct spdk_io_channel * 4791 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4792 { 4793 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4794 } 4795 4796 void * 4797 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4798 { 4799 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4800 void *ctx = NULL; 4801 4802 if (bdev->fn_table->get_module_ctx) { 4803 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4804 } 4805 4806 return ctx; 4807 } 4808 4809 const char * 4810 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4811 { 4812 return bdev->module->name; 4813 } 4814 4815 const char * 4816 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4817 { 4818 return bdev->name; 4819 } 4820 4821 const char * 4822 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4823 { 4824 return bdev->product_name; 4825 } 4826 4827 
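/*
 * The accessors below expose read-only bdev properties. A minimal usage sketch, assuming
 * "desc" is a descriptor the caller has already opened:
 *
 *	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
 *	uint32_t blocklen = spdk_bdev_get_block_size(bdev);
 *	uint64_t size_bytes = spdk_bdev_get_num_blocks(bdev) * (uint64_t)blocklen;
 */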
const struct spdk_bdev_aliases_list * 4828 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4829 { 4830 return &bdev->aliases; 4831 } 4832 4833 uint32_t 4834 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4835 { 4836 return bdev->blocklen; 4837 } 4838 4839 uint32_t 4840 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4841 { 4842 return bdev->write_unit_size; 4843 } 4844 4845 uint64_t 4846 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4847 { 4848 return bdev->blockcnt; 4849 } 4850 4851 const char * 4852 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4853 { 4854 return qos_rpc_type[type]; 4855 } 4856 4857 void 4858 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4859 { 4860 int i; 4861 4862 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4863 4864 spdk_spin_lock(&bdev->internal.spinlock); 4865 if (bdev->internal.qos) { 4866 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4867 if (bdev->internal.qos->rate_limits[i].limit != 4868 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4869 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4870 if (bdev_qos_is_iops_rate_limit(i) == false) { 4871 /* Change from Byte to Megabyte which is user visible. */ 4872 limits[i] = limits[i] / 1024 / 1024; 4873 } 4874 } 4875 } 4876 } 4877 spdk_spin_unlock(&bdev->internal.spinlock); 4878 } 4879 4880 size_t 4881 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4882 { 4883 return 1 << bdev->required_alignment; 4884 } 4885 4886 uint32_t 4887 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4888 { 4889 return bdev->optimal_io_boundary; 4890 } 4891 4892 bool 4893 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4894 { 4895 return bdev->write_cache; 4896 } 4897 4898 const struct spdk_uuid * 4899 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4900 { 4901 return &bdev->uuid; 4902 } 4903 4904 uint16_t 4905 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4906 { 4907 return bdev->acwu; 4908 } 4909 4910 uint32_t 4911 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4912 { 4913 return bdev->md_len; 4914 } 4915 4916 bool 4917 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4918 { 4919 return (bdev->md_len != 0) && bdev->md_interleave; 4920 } 4921 4922 bool 4923 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4924 { 4925 return (bdev->md_len != 0) && !bdev->md_interleave; 4926 } 4927 4928 bool 4929 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4930 { 4931 return bdev->zoned; 4932 } 4933 4934 uint32_t 4935 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4936 { 4937 if (spdk_bdev_is_md_interleaved(bdev)) { 4938 return bdev->blocklen - bdev->md_len; 4939 } else { 4940 return bdev->blocklen; 4941 } 4942 } 4943 4944 uint32_t 4945 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4946 { 4947 return bdev->phys_blocklen; 4948 } 4949 4950 static uint32_t 4951 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4952 { 4953 if (!spdk_bdev_is_md_interleaved(bdev)) { 4954 return bdev->blocklen + bdev->md_len; 4955 } else { 4956 return bdev->blocklen; 4957 } 4958 } 4959 4960 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4961 typedef enum spdk_dif_type spdk_dif_type_t; 4962 typedef enum spdk_dif_pi_format spdk_dif_pi_format_t; 4963 4964 spdk_dif_type_t 4965 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4966 { 4967 if (bdev->md_len != 0) { 4968 return bdev->dif_type; 4969 } else { 4970 return SPDK_DIF_DISABLE; 4971 } 4972 } 4973 4974 spdk_dif_pi_format_t 4975 spdk_bdev_get_dif_pi_format(const struct spdk_bdev *bdev) 4976 { 4977 return bdev->dif_pi_format; 4978 } 4979 4980 bool 4981 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4982 { 4983 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4984 return bdev->dif_is_head_of_md; 4985 } else { 4986 return false; 4987 } 4988 } 4989 4990 bool 4991 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4992 enum spdk_dif_check_type check_type) 4993 { 4994 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4995 return false; 4996 } 4997 4998 switch (check_type) { 4999 case SPDK_DIF_CHECK_TYPE_REFTAG: 5000 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 5001 case SPDK_DIF_CHECK_TYPE_APPTAG: 5002 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 5003 case SPDK_DIF_CHECK_TYPE_GUARD: 5004 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 5005 default: 5006 return false; 5007 } 5008 } 5009 5010 static uint32_t 5011 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 5012 { 5013 uint64_t aligned_length, max_write_blocks; 5014 5015 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 5016 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 5017 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 5018 5019 return max_write_blocks; 5020 } 5021 5022 uint32_t 5023 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 5024 { 5025 return bdev->max_copy; 5026 } 5027 5028 uint64_t 5029 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 5030 { 5031 return bdev->internal.measured_queue_depth; 5032 } 5033 5034 uint64_t 5035 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 5036 { 5037 return bdev->internal.period; 5038 } 5039 5040 uint64_t 5041 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 5042 { 5043 return bdev->internal.weighted_io_time; 5044 } 5045 5046 uint64_t 5047 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 5048 { 5049 return bdev->internal.io_time; 5050 } 5051 5052 union spdk_bdev_nvme_ctratt spdk_bdev_get_nvme_ctratt(struct spdk_bdev *bdev) 5053 { 5054 return bdev->ctratt; 5055 } 5056 5057 uint32_t 5058 spdk_bdev_get_nvme_nsid(struct spdk_bdev *bdev) 5059 { 5060 return bdev->nsid; 5061 } 5062 5063 static void bdev_update_qd_sampling_period(void *ctx); 5064 5065 static void 5066 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 5067 { 5068 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 5069 5070 if (bdev->internal.measured_queue_depth) { 5071 bdev->internal.io_time += bdev->internal.period; 5072 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 5073 } 5074 5075 bdev->internal.qd_poll_in_progress = false; 5076 5077 bdev_update_qd_sampling_period(bdev); 5078 } 5079 5080 static void 5081 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5082 struct spdk_io_channel *io_ch, void *_ctx) 5083 { 5084 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 5085 5086 bdev->internal.temporary_queue_depth += ch->io_outstanding; 5087 spdk_bdev_for_each_channel_continue(i, 0); 5088 
} 5089 5090 static int 5091 bdev_calculate_measured_queue_depth(void *ctx) 5092 { 5093 struct spdk_bdev *bdev = ctx; 5094 5095 bdev->internal.qd_poll_in_progress = true; 5096 bdev->internal.temporary_queue_depth = 0; 5097 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 5098 return SPDK_POLLER_BUSY; 5099 } 5100 5101 static void 5102 bdev_update_qd_sampling_period(void *ctx) 5103 { 5104 struct spdk_bdev *bdev = ctx; 5105 5106 if (bdev->internal.period == bdev->internal.new_period) { 5107 return; 5108 } 5109 5110 if (bdev->internal.qd_poll_in_progress) { 5111 return; 5112 } 5113 5114 bdev->internal.period = bdev->internal.new_period; 5115 5116 spdk_poller_unregister(&bdev->internal.qd_poller); 5117 if (bdev->internal.period != 0) { 5118 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5119 bdev, bdev->internal.period); 5120 } else { 5121 spdk_bdev_close(bdev->internal.qd_desc); 5122 bdev->internal.qd_desc = NULL; 5123 } 5124 } 5125 5126 static void 5127 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 5128 { 5129 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 5130 } 5131 5132 void 5133 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 5134 { 5135 int rc; 5136 5137 if (bdev->internal.new_period == period) { 5138 return; 5139 } 5140 5141 bdev->internal.new_period = period; 5142 5143 if (bdev->internal.qd_desc != NULL) { 5144 assert(bdev->internal.period != 0); 5145 5146 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 5147 bdev_update_qd_sampling_period, bdev); 5148 return; 5149 } 5150 5151 assert(bdev->internal.period == 0); 5152 5153 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 5154 NULL, &bdev->internal.qd_desc); 5155 if (rc != 0) { 5156 return; 5157 } 5158 5159 bdev->internal.period = period; 5160 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5161 bdev, period); 5162 } 5163 5164 struct bdev_get_current_qd_ctx { 5165 uint64_t current_qd; 5166 spdk_bdev_get_current_qd_cb cb_fn; 5167 void *cb_arg; 5168 }; 5169 5170 static void 5171 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 5172 { 5173 struct bdev_get_current_qd_ctx *ctx = _ctx; 5174 5175 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 5176 5177 free(ctx); 5178 } 5179 5180 static void 5181 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5182 struct spdk_io_channel *io_ch, void *_ctx) 5183 { 5184 struct bdev_get_current_qd_ctx *ctx = _ctx; 5185 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 5186 5187 ctx->current_qd += bdev_ch->io_outstanding; 5188 5189 spdk_bdev_for_each_channel_continue(i, 0); 5190 } 5191 5192 void 5193 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 5194 void *cb_arg) 5195 { 5196 struct bdev_get_current_qd_ctx *ctx; 5197 5198 assert(cb_fn != NULL); 5199 5200 ctx = calloc(1, sizeof(*ctx)); 5201 if (ctx == NULL) { 5202 cb_fn(bdev, 0, cb_arg, -ENOMEM); 5203 return; 5204 } 5205 5206 ctx->cb_fn = cb_fn; 5207 ctx->cb_arg = cb_arg; 5208 5209 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 5210 } 5211 5212 static void 5213 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 5214 { 5215 assert(desc->thread == spdk_get_thread()); 5216 5217 spdk_spin_lock(&desc->spinlock); 5218 desc->refs--; 5219 if (!desc->closed) { 5220 
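		/* The descriptor is still open - release the lock before invoking the user's
		 * event callback.
		 */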
spdk_spin_unlock(&desc->spinlock); 5221 desc->callback.event_fn(type, 5222 desc->bdev, 5223 desc->callback.ctx); 5224 return; 5225 } else if (desc->refs == 0) { 5226 /* This descriptor was closed after this event_notify message was sent. 5227 * spdk_bdev_close() could not free the descriptor since this message was 5228 * in flight, so we free it now using bdev_desc_free(). 5229 */ 5230 spdk_spin_unlock(&desc->spinlock); 5231 bdev_desc_free(desc); 5232 return; 5233 } 5234 spdk_spin_unlock(&desc->spinlock); 5235 } 5236 5237 static void 5238 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 5239 { 5240 spdk_spin_lock(&desc->spinlock); 5241 desc->refs++; 5242 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 5243 spdk_spin_unlock(&desc->spinlock); 5244 } 5245 5246 static void 5247 _resize_notify(void *ctx) 5248 { 5249 struct spdk_bdev_desc *desc = ctx; 5250 5251 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 5252 } 5253 5254 int 5255 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 5256 { 5257 struct spdk_bdev_desc *desc; 5258 int ret; 5259 5260 if (size == bdev->blockcnt) { 5261 return 0; 5262 } 5263 5264 spdk_spin_lock(&bdev->internal.spinlock); 5265 5266 /* bdev has open descriptors */ 5267 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 5268 bdev->blockcnt > size) { 5269 ret = -EBUSY; 5270 } else { 5271 bdev->blockcnt = size; 5272 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 5273 event_notify(desc, _resize_notify); 5274 } 5275 ret = 0; 5276 } 5277 5278 spdk_spin_unlock(&bdev->internal.spinlock); 5279 5280 return ret; 5281 } 5282 5283 /* 5284 * Convert I/O offset and length from bytes to blocks. 5285 * 5286 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 5287 */ 5288 static uint64_t 5289 bdev_bytes_to_blocks(struct spdk_bdev_desc *desc, uint64_t offset_bytes, 5290 uint64_t *offset_blocks, uint64_t num_bytes, uint64_t *num_blocks) 5291 { 5292 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5293 uint32_t block_size = bdev->blocklen; 5294 uint8_t shift_cnt; 5295 5296 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
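	 * For power-of-two block sizes the divisibility check below reduces to testing the
	 * low-order bits.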
*/ 5297 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 5298 shift_cnt = spdk_u32log2(block_size); 5299 *offset_blocks = offset_bytes >> shift_cnt; 5300 *num_blocks = num_bytes >> shift_cnt; 5301 return (offset_bytes - (*offset_blocks << shift_cnt)) | 5302 (num_bytes - (*num_blocks << shift_cnt)); 5303 } else { 5304 *offset_blocks = offset_bytes / block_size; 5305 *num_blocks = num_bytes / block_size; 5306 return (offset_bytes % block_size) | (num_bytes % block_size); 5307 } 5308 } 5309 5310 static bool 5311 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 5312 { 5313 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 5314 * has been an overflow and hence the offset has been wrapped around */ 5315 if (offset_blocks + num_blocks < offset_blocks) { 5316 return false; 5317 } 5318 5319 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5320 if (offset_blocks + num_blocks > bdev->blockcnt) { 5321 return false; 5322 } 5323 5324 return true; 5325 } 5326 5327 static void 5328 bdev_seek_complete_cb(void *ctx) 5329 { 5330 struct spdk_bdev_io *bdev_io = ctx; 5331 5332 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5333 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5334 } 5335 5336 static int 5337 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5338 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5339 spdk_bdev_io_completion_cb cb, void *cb_arg) 5340 { 5341 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5342 struct spdk_bdev_io *bdev_io; 5343 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5344 5345 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5346 5347 /* Check if offset_blocks is valid looking at the validity of one block */ 5348 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5349 return -EINVAL; 5350 } 5351 5352 bdev_io = bdev_channel_get_io(channel); 5353 if (!bdev_io) { 5354 return -ENOMEM; 5355 } 5356 5357 bdev_io->internal.ch = channel; 5358 bdev_io->internal.desc = desc; 5359 bdev_io->type = io_type; 5360 bdev_io->u.bdev.offset_blocks = offset_blocks; 5361 bdev_io->u.bdev.memory_domain = NULL; 5362 bdev_io->u.bdev.memory_domain_ctx = NULL; 5363 bdev_io->u.bdev.accel_sequence = NULL; 5364 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5365 5366 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5367 /* In case bdev doesn't support seek to next data/hole offset, 5368 * it is assumed that only data and no holes are present */ 5369 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5370 bdev_io->u.bdev.seek.offset = offset_blocks; 5371 } else { 5372 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5373 } 5374 5375 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5376 return 0; 5377 } 5378 5379 bdev_io_submit(bdev_io); 5380 return 0; 5381 } 5382 5383 int 5384 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5385 uint64_t offset_blocks, 5386 spdk_bdev_io_completion_cb cb, void *cb_arg) 5387 { 5388 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5389 } 5390 5391 int 5392 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5393 uint64_t offset_blocks, 5394 spdk_bdev_io_completion_cb cb, void *cb_arg) 5395 { 5396 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5397 } 5398 5399 uint64_t 5400 spdk_bdev_io_get_seek_offset(const struct 
spdk_bdev_io *bdev_io) 5401 { 5402 return bdev_io->u.bdev.seek.offset; 5403 } 5404 5405 static int 5406 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5407 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5408 spdk_bdev_io_completion_cb cb, void *cb_arg) 5409 { 5410 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5411 struct spdk_bdev_io *bdev_io; 5412 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5413 5414 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5415 return -EINVAL; 5416 } 5417 5418 bdev_io = bdev_channel_get_io(channel); 5419 if (!bdev_io) { 5420 return -ENOMEM; 5421 } 5422 5423 bdev_io->internal.ch = channel; 5424 bdev_io->internal.desc = desc; 5425 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5426 bdev_io->u.bdev.iovs = &bdev_io->iov; 5427 bdev_io->u.bdev.iovs[0].iov_base = buf; 5428 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5429 bdev_io->u.bdev.iovcnt = 1; 5430 bdev_io->u.bdev.md_buf = md_buf; 5431 bdev_io->u.bdev.num_blocks = num_blocks; 5432 bdev_io->u.bdev.offset_blocks = offset_blocks; 5433 bdev_io->u.bdev.memory_domain = NULL; 5434 bdev_io->u.bdev.memory_domain_ctx = NULL; 5435 bdev_io->u.bdev.accel_sequence = NULL; 5436 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5437 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5438 5439 bdev_io_submit(bdev_io); 5440 return 0; 5441 } 5442 5443 int 5444 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5445 void *buf, uint64_t offset, uint64_t nbytes, 5446 spdk_bdev_io_completion_cb cb, void *cb_arg) 5447 { 5448 uint64_t offset_blocks, num_blocks; 5449 5450 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 5451 return -EINVAL; 5452 } 5453 5454 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5455 } 5456 5457 int 5458 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5459 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5460 spdk_bdev_io_completion_cb cb, void *cb_arg) 5461 { 5462 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5463 } 5464 5465 int 5466 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5467 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5468 spdk_bdev_io_completion_cb cb, void *cb_arg) 5469 { 5470 struct iovec iov = { 5471 .iov_base = buf, 5472 }; 5473 5474 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5475 return -EINVAL; 5476 } 5477 5478 if (md_buf && !_is_buf_allocated(&iov)) { 5479 return -EINVAL; 5480 } 5481 5482 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5483 cb, cb_arg); 5484 } 5485 5486 int 5487 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5488 struct iovec *iov, int iovcnt, 5489 uint64_t offset, uint64_t nbytes, 5490 spdk_bdev_io_completion_cb cb, void *cb_arg) 5491 { 5492 uint64_t offset_blocks, num_blocks; 5493 5494 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 5495 return -EINVAL; 5496 } 5497 5498 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5499 } 5500 5501 static int 5502 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5503 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 5504 uint64_t num_blocks, struct spdk_memory_domain *domain, 
void *domain_ctx,
			  struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
			  spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);

	if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) {
		return -EINVAL;
	}

	bdev_io = bdev_channel_get_io(channel);
	if (spdk_unlikely(!bdev_io)) {
		return -ENOMEM;
	}

	bdev_io->internal.ch = channel;
	bdev_io->internal.desc = desc;
	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
	bdev_io->u.bdev.iovs = iov;
	bdev_io->u.bdev.iovcnt = iovcnt;
	bdev_io->u.bdev.md_buf = md_buf;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	bdev_io_init(bdev_io, bdev, cb_arg, cb);

	if (seq != NULL) {
		bdev_io->internal.f.has_accel_sequence = true;
		bdev_io->internal.accel_sequence = seq;
	}

	if (domain != NULL) {
		bdev_io->internal.f.has_memory_domain = true;
		bdev_io->internal.memory_domain = domain;
		bdev_io->internal.memory_domain_ctx = domain_ctx;
	}

	bdev_io->u.bdev.memory_domain = domain;
	bdev_io->u.bdev.memory_domain_ctx = domain_ctx;
	bdev_io->u.bdev.accel_sequence = seq;
	bdev_io->u.bdev.dif_check_flags = dif_check_flags;

	_bdev_io_submit_ext(desc, bdev_io);

	return 0;
}

int
spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       struct iovec *iov, int iovcnt,
		       uint64_t offset_blocks, uint64_t num_blocks,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);

	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
					 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg);
}

int
spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			       struct iovec *iov, int iovcnt, void *md_buf,
			       uint64_t offset_blocks, uint64_t num_blocks,
			       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);

	if (md_buf && !spdk_bdev_is_md_separate(bdev)) {
		return -EINVAL;
	}

	if (md_buf && !_is_buf_allocated(iov)) {
		return -EINVAL;
	}

	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
					 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg);
}

static inline bool
_bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov)
{
	/*
	 * We check that the opts size is at least as large as it was when
	 * spdk_bdev_ext_io_opts was first introduced (ac6f2bdd8d), since accesses
	 * to those members are not checked internally.
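	 */

	/*
	 * Illustrative usage sketch, not part of the original sources: how a caller
	 * might fill in struct spdk_bdev_ext_io_opts before spdk_bdev_readv_blocks_ext().
	 * Setting opts.size = sizeof(opts) is what the check above relies on for
	 * forward/backward compatibility. The example_* names are hypothetical and
	 * the block is compiled out with #if 0; it is documentation only.
	 */
#if 0
static void
example_readv_ext_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	/* Hypothetical completion callback: always release the bdev_io. */
	spdk_bdev_free_io(bdev_io);
}

static int
example_readv_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		  struct iovec *iov, int iovcnt)
{
	struct spdk_bdev_ext_io_opts opts = {};

	/* The size field is what _bdev_io_check_opts() validates. */
	opts.size = sizeof(opts);
	opts.memory_domain = NULL;	/* plain host memory in this sketch */

	/* Read one block at LBA 0 into the caller-provided iovec. */
	return spdk_bdev_readv_blocks_ext(desc, ch, iov, iovcnt, 0, 1,
					  example_readv_ext_done, NULL, &opts);
}
#endif

	/*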
5591 */ 5592 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5593 sizeof(opts->metadata) && 5594 opts->size <= sizeof(*opts) && 5595 /* When memory domain is used, the user must provide data buffers */ 5596 (!opts->memory_domain || (iov && iov[0].iov_base)); 5597 } 5598 5599 int 5600 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5601 struct iovec *iov, int iovcnt, 5602 uint64_t offset_blocks, uint64_t num_blocks, 5603 spdk_bdev_io_completion_cb cb, void *cb_arg, 5604 struct spdk_bdev_ext_io_opts *opts) 5605 { 5606 struct spdk_memory_domain *domain = NULL; 5607 struct spdk_accel_sequence *seq = NULL; 5608 void *domain_ctx = NULL, *md = NULL; 5609 uint32_t dif_check_flags = 0; 5610 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5611 5612 if (opts) { 5613 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5614 return -EINVAL; 5615 } 5616 5617 md = opts->metadata; 5618 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5619 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5620 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5621 if (md) { 5622 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5623 return -EINVAL; 5624 } 5625 5626 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5627 return -EINVAL; 5628 } 5629 5630 if (spdk_unlikely(seq != NULL)) { 5631 return -EINVAL; 5632 } 5633 } 5634 } 5635 5636 dif_check_flags = bdev->dif_check_flags & 5637 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5638 5639 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5640 num_blocks, domain, domain_ctx, seq, dif_check_flags, cb, cb_arg); 5641 } 5642 5643 static int 5644 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5645 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5646 spdk_bdev_io_completion_cb cb, void *cb_arg) 5647 { 5648 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5649 struct spdk_bdev_io *bdev_io; 5650 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5651 5652 if (!desc->write) { 5653 return -EBADF; 5654 } 5655 5656 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5657 return -EINVAL; 5658 } 5659 5660 bdev_io = bdev_channel_get_io(channel); 5661 if (!bdev_io) { 5662 return -ENOMEM; 5663 } 5664 5665 bdev_io->internal.ch = channel; 5666 bdev_io->internal.desc = desc; 5667 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5668 bdev_io->u.bdev.iovs = &bdev_io->iov; 5669 bdev_io->u.bdev.iovs[0].iov_base = buf; 5670 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5671 bdev_io->u.bdev.iovcnt = 1; 5672 bdev_io->u.bdev.md_buf = md_buf; 5673 bdev_io->u.bdev.num_blocks = num_blocks; 5674 bdev_io->u.bdev.offset_blocks = offset_blocks; 5675 bdev_io->u.bdev.memory_domain = NULL; 5676 bdev_io->u.bdev.memory_domain_ctx = NULL; 5677 bdev_io->u.bdev.accel_sequence = NULL; 5678 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5679 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5680 5681 bdev_io_submit(bdev_io); 5682 return 0; 5683 } 5684 5685 int 5686 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5687 void *buf, uint64_t offset, uint64_t nbytes, 5688 spdk_bdev_io_completion_cb cb, void *cb_arg) 5689 { 5690 uint64_t offset_blocks, num_blocks; 5691 5692 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 5693 return -EINVAL; 5694 } 5695 5696 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, 
cb_arg); 5697 } 5698 5699 int 5700 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5701 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5702 spdk_bdev_io_completion_cb cb, void *cb_arg) 5703 { 5704 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5705 cb, cb_arg); 5706 } 5707 5708 int 5709 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5710 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5711 spdk_bdev_io_completion_cb cb, void *cb_arg) 5712 { 5713 struct iovec iov = { 5714 .iov_base = buf, 5715 }; 5716 5717 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5718 return -EINVAL; 5719 } 5720 5721 if (md_buf && !_is_buf_allocated(&iov)) { 5722 return -EINVAL; 5723 } 5724 5725 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5726 cb, cb_arg); 5727 } 5728 5729 static int 5730 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5731 struct iovec *iov, int iovcnt, void *md_buf, 5732 uint64_t offset_blocks, uint64_t num_blocks, 5733 struct spdk_memory_domain *domain, void *domain_ctx, 5734 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 5735 uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw, 5736 spdk_bdev_io_completion_cb cb, void *cb_arg) 5737 { 5738 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5739 struct spdk_bdev_io *bdev_io; 5740 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5741 5742 if (spdk_unlikely(!desc->write)) { 5743 return -EBADF; 5744 } 5745 5746 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 5747 return -EINVAL; 5748 } 5749 5750 bdev_io = bdev_channel_get_io(channel); 5751 if (spdk_unlikely(!bdev_io)) { 5752 return -ENOMEM; 5753 } 5754 5755 bdev_io->internal.ch = channel; 5756 bdev_io->internal.desc = desc; 5757 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5758 bdev_io->u.bdev.iovs = iov; 5759 bdev_io->u.bdev.iovcnt = iovcnt; 5760 bdev_io->u.bdev.md_buf = md_buf; 5761 bdev_io->u.bdev.num_blocks = num_blocks; 5762 bdev_io->u.bdev.offset_blocks = offset_blocks; 5763 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5764 if (seq != NULL) { 5765 bdev_io->internal.f.has_accel_sequence = true; 5766 bdev_io->internal.accel_sequence = seq; 5767 } 5768 5769 if (domain != NULL) { 5770 bdev_io->internal.f.has_memory_domain = true; 5771 bdev_io->internal.memory_domain = domain; 5772 bdev_io->internal.memory_domain_ctx = domain_ctx; 5773 } 5774 5775 bdev_io->u.bdev.memory_domain = domain; 5776 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5777 bdev_io->u.bdev.accel_sequence = seq; 5778 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 5779 bdev_io->u.bdev.nvme_cdw12.raw = nvme_cdw12_raw; 5780 bdev_io->u.bdev.nvme_cdw13.raw = nvme_cdw13_raw; 5781 5782 _bdev_io_submit_ext(desc, bdev_io); 5783 5784 return 0; 5785 } 5786 5787 int 5788 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5789 struct iovec *iov, int iovcnt, 5790 uint64_t offset, uint64_t len, 5791 spdk_bdev_io_completion_cb cb, void *cb_arg) 5792 { 5793 uint64_t offset_blocks, num_blocks; 5794 5795 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, len, &num_blocks) != 0) { 5796 return -EINVAL; 5797 } 5798 5799 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5800 } 5801 5802 int 5803 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5804 struct iovec *iov, 
int iovcnt, 5805 uint64_t offset_blocks, uint64_t num_blocks, 5806 spdk_bdev_io_completion_cb cb, void *cb_arg) 5807 { 5808 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5809 5810 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5811 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 5812 cb, cb_arg); 5813 } 5814 5815 int 5816 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5817 struct iovec *iov, int iovcnt, void *md_buf, 5818 uint64_t offset_blocks, uint64_t num_blocks, 5819 spdk_bdev_io_completion_cb cb, void *cb_arg) 5820 { 5821 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5822 5823 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 5824 return -EINVAL; 5825 } 5826 5827 if (md_buf && !_is_buf_allocated(iov)) { 5828 return -EINVAL; 5829 } 5830 5831 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5832 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 5833 cb, cb_arg); 5834 } 5835 5836 int 5837 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5838 struct iovec *iov, int iovcnt, 5839 uint64_t offset_blocks, uint64_t num_blocks, 5840 spdk_bdev_io_completion_cb cb, void *cb_arg, 5841 struct spdk_bdev_ext_io_opts *opts) 5842 { 5843 struct spdk_memory_domain *domain = NULL; 5844 struct spdk_accel_sequence *seq = NULL; 5845 void *domain_ctx = NULL, *md = NULL; 5846 uint32_t dif_check_flags = 0; 5847 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5848 uint32_t nvme_cdw12_raw = 0; 5849 uint32_t nvme_cdw13_raw = 0; 5850 5851 if (opts) { 5852 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5853 return -EINVAL; 5854 } 5855 md = opts->metadata; 5856 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5857 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5858 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5859 nvme_cdw12_raw = bdev_get_ext_io_opt(opts, nvme_cdw12.raw, 0); 5860 nvme_cdw13_raw = bdev_get_ext_io_opt(opts, nvme_cdw13.raw, 0); 5861 if (md) { 5862 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5863 return -EINVAL; 5864 } 5865 5866 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5867 return -EINVAL; 5868 } 5869 5870 if (spdk_unlikely(seq != NULL)) { 5871 return -EINVAL; 5872 } 5873 } 5874 } 5875 5876 dif_check_flags = bdev->dif_check_flags & 5877 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5878 5879 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5880 domain, domain_ctx, seq, dif_check_flags, 5881 nvme_cdw12_raw, nvme_cdw13_raw, cb, cb_arg); 5882 } 5883 5884 static void 5885 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5886 { 5887 struct spdk_bdev_io *parent_io = cb_arg; 5888 struct spdk_bdev *bdev = parent_io->bdev; 5889 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5890 int i, rc = 0; 5891 5892 if (!success) { 5893 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5894 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5895 spdk_bdev_free_io(bdev_io); 5896 return; 5897 } 5898 5899 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5900 rc = memcmp(read_buf, 5901 parent_io->u.bdev.iovs[i].iov_base, 5902 parent_io->u.bdev.iovs[i].iov_len); 5903 if (rc) { 5904 break; 5905 } 5906 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5907 } 5908 5909 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 5910 rc = 
memcmp(bdev_io->u.bdev.md_buf, 5911 parent_io->u.bdev.md_buf, 5912 spdk_bdev_get_md_size(bdev)); 5913 } 5914 5915 spdk_bdev_free_io(bdev_io); 5916 5917 if (rc == 0) { 5918 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5919 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5920 } else { 5921 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5922 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5923 } 5924 } 5925 5926 static void 5927 bdev_compare_do_read(void *_bdev_io) 5928 { 5929 struct spdk_bdev_io *bdev_io = _bdev_io; 5930 int rc; 5931 5932 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5933 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5934 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5935 bdev_compare_do_read_done, bdev_io); 5936 5937 if (rc == -ENOMEM) { 5938 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5939 } else if (rc != 0) { 5940 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5941 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5942 } 5943 } 5944 5945 static int 5946 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5947 struct iovec *iov, int iovcnt, void *md_buf, 5948 uint64_t offset_blocks, uint64_t num_blocks, 5949 spdk_bdev_io_completion_cb cb, void *cb_arg) 5950 { 5951 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5952 struct spdk_bdev_io *bdev_io; 5953 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5954 5955 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5956 return -EINVAL; 5957 } 5958 5959 bdev_io = bdev_channel_get_io(channel); 5960 if (!bdev_io) { 5961 return -ENOMEM; 5962 } 5963 5964 bdev_io->internal.ch = channel; 5965 bdev_io->internal.desc = desc; 5966 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5967 bdev_io->u.bdev.iovs = iov; 5968 bdev_io->u.bdev.iovcnt = iovcnt; 5969 bdev_io->u.bdev.md_buf = md_buf; 5970 bdev_io->u.bdev.num_blocks = num_blocks; 5971 bdev_io->u.bdev.offset_blocks = offset_blocks; 5972 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5973 bdev_io->u.bdev.memory_domain = NULL; 5974 bdev_io->u.bdev.memory_domain_ctx = NULL; 5975 bdev_io->u.bdev.accel_sequence = NULL; 5976 5977 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5978 bdev_io_submit(bdev_io); 5979 return 0; 5980 } 5981 5982 bdev_compare_do_read(bdev_io); 5983 5984 return 0; 5985 } 5986 5987 int 5988 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5989 struct iovec *iov, int iovcnt, 5990 uint64_t offset_blocks, uint64_t num_blocks, 5991 spdk_bdev_io_completion_cb cb, void *cb_arg) 5992 { 5993 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5994 num_blocks, cb, cb_arg); 5995 } 5996 5997 int 5998 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5999 struct iovec *iov, int iovcnt, void *md_buf, 6000 uint64_t offset_blocks, uint64_t num_blocks, 6001 spdk_bdev_io_completion_cb cb, void *cb_arg) 6002 { 6003 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 6004 return -EINVAL; 6005 } 6006 6007 if (md_buf && !_is_buf_allocated(iov)) { 6008 return -EINVAL; 6009 } 6010 6011 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 6012 num_blocks, cb, cb_arg); 6013 } 6014 6015 static int 6016 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6017 void *buf, void *md_buf, uint64_t offset_blocks, 
uint64_t num_blocks, 6018 spdk_bdev_io_completion_cb cb, void *cb_arg) 6019 { 6020 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6021 struct spdk_bdev_io *bdev_io; 6022 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6023 6024 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6025 return -EINVAL; 6026 } 6027 6028 bdev_io = bdev_channel_get_io(channel); 6029 if (!bdev_io) { 6030 return -ENOMEM; 6031 } 6032 6033 bdev_io->internal.ch = channel; 6034 bdev_io->internal.desc = desc; 6035 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 6036 bdev_io->u.bdev.iovs = &bdev_io->iov; 6037 bdev_io->u.bdev.iovs[0].iov_base = buf; 6038 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 6039 bdev_io->u.bdev.iovcnt = 1; 6040 bdev_io->u.bdev.md_buf = md_buf; 6041 bdev_io->u.bdev.num_blocks = num_blocks; 6042 bdev_io->u.bdev.offset_blocks = offset_blocks; 6043 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6044 bdev_io->u.bdev.memory_domain = NULL; 6045 bdev_io->u.bdev.memory_domain_ctx = NULL; 6046 bdev_io->u.bdev.accel_sequence = NULL; 6047 6048 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 6049 bdev_io_submit(bdev_io); 6050 return 0; 6051 } 6052 6053 bdev_compare_do_read(bdev_io); 6054 6055 return 0; 6056 } 6057 6058 int 6059 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6060 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 6061 spdk_bdev_io_completion_cb cb, void *cb_arg) 6062 { 6063 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 6064 cb, cb_arg); 6065 } 6066 6067 int 6068 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6069 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 6070 spdk_bdev_io_completion_cb cb, void *cb_arg) 6071 { 6072 struct iovec iov = { 6073 .iov_base = buf, 6074 }; 6075 6076 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 6077 return -EINVAL; 6078 } 6079 6080 if (md_buf && !_is_buf_allocated(&iov)) { 6081 return -EINVAL; 6082 } 6083 6084 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 6085 cb, cb_arg); 6086 } 6087 6088 static void 6089 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status) 6090 { 6091 struct spdk_bdev_io *bdev_io = ctx; 6092 6093 if (unlock_status) { 6094 SPDK_ERRLOG("LBA range unlock failed\n"); 6095 } 6096 6097 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 6098 false, bdev_io->internal.caller_ctx); 6099 } 6100 6101 static void 6102 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 6103 { 6104 bdev_io->internal.status = status; 6105 6106 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 6107 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6108 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 6109 } 6110 6111 static void 6112 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6113 { 6114 struct spdk_bdev_io *parent_io = cb_arg; 6115 6116 if (!success) { 6117 SPDK_ERRLOG("Compare and write operation failed\n"); 6118 } 6119 6120 spdk_bdev_free_io(bdev_io); 6121 6122 bdev_comparev_and_writev_blocks_unlock(parent_io, 6123 success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 6124 } 6125 6126 static void 6127 bdev_compare_and_write_do_write(void *_bdev_io) 6128 { 6129 struct spdk_bdev_io *bdev_io = _bdev_io; 6130 int rc; 6131 6132 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 6133 spdk_io_channel_from_ctx(bdev_io->internal.ch), 6134 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 6135 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6136 bdev_compare_and_write_do_write_done, bdev_io); 6137 6138 6139 if (rc == -ENOMEM) { 6140 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 6141 } else if (rc != 0) { 6142 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6143 } 6144 } 6145 6146 static void 6147 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6148 { 6149 struct spdk_bdev_io *parent_io = cb_arg; 6150 6151 spdk_bdev_free_io(bdev_io); 6152 6153 if (!success) { 6154 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 6155 return; 6156 } 6157 6158 bdev_compare_and_write_do_write(parent_io); 6159 } 6160 6161 static void 6162 bdev_compare_and_write_do_compare(void *_bdev_io) 6163 { 6164 struct spdk_bdev_io *bdev_io = _bdev_io; 6165 int rc; 6166 6167 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 6168 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 6169 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6170 bdev_compare_and_write_do_compare_done, bdev_io); 6171 6172 if (rc == -ENOMEM) { 6173 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 6174 } else if (rc != 0) { 6175 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 6176 } 6177 } 6178 6179 static void 6180 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status) 6181 { 6182 struct spdk_bdev_io *bdev_io = ctx; 6183 6184 if (status) { 6185 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 6186 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6187 return; 6188 } 6189 6190 bdev_compare_and_write_do_compare(bdev_io); 6191 } 6192 6193 int 6194 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6195 struct iovec *compare_iov, int compare_iovcnt, 6196 struct iovec *write_iov, int write_iovcnt, 6197 uint64_t offset_blocks, uint64_t num_blocks, 6198 spdk_bdev_io_completion_cb cb, void *cb_arg) 6199 { 6200 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6201 struct spdk_bdev_io *bdev_io; 6202 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6203 6204 if (!desc->write) { 6205 return -EBADF; 6206 } 6207 6208 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6209 return -EINVAL; 6210 } 6211 6212 if (num_blocks > bdev->acwu) { 6213 return -EINVAL; 6214 } 6215 6216 bdev_io = bdev_channel_get_io(channel); 6217 if (!bdev_io) { 6218 return -ENOMEM; 6219 } 6220 6221 bdev_io->internal.ch = channel; 6222 bdev_io->internal.desc = desc; 6223 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 6224 bdev_io->u.bdev.iovs = compare_iov; 6225 bdev_io->u.bdev.iovcnt = compare_iovcnt; 6226 bdev_io->u.bdev.fused_iovs = write_iov; 6227 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 6228 bdev_io->u.bdev.md_buf = NULL; 6229 bdev_io->u.bdev.num_blocks = num_blocks; 6230 bdev_io->u.bdev.offset_blocks = offset_blocks; 6231 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6232 
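
/*
 * Illustrative usage sketch, not part of the original sources: how a caller
 * might issue the fused compare-and-write implemented by the surrounding
 * spdk_bdev_comparev_and_writev_blocks(). num_blocks must not exceed the
 * bdev's atomic compare-and-write unit (bdev->acwu). The example_* names and
 * parameters are hypothetical; the block is compiled out with #if 0.
 */
#if 0
static void
example_caw_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	/* A miscompare completes with success == false (status MISCOMPARE). */
	spdk_bdev_free_io(bdev_io);
}

static int
example_compare_and_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			  void *expected, void *new_data, uint64_t lba, uint32_t block_size)
{
	struct iovec cmp_iov = { .iov_base = expected, .iov_len = block_size };
	struct iovec wr_iov = { .iov_base = new_data, .iov_len = block_size };

	return spdk_bdev_comparev_and_writev_blocks(desc, ch, &cmp_iov, 1, &wr_iov, 1,
						    lba, 1, example_caw_done, NULL);
}
#endif
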
bdev_io->u.bdev.memory_domain = NULL; 6233 bdev_io->u.bdev.memory_domain_ctx = NULL; 6234 bdev_io->u.bdev.accel_sequence = NULL; 6235 6236 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 6237 bdev_io_submit(bdev_io); 6238 return 0; 6239 } 6240 6241 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 6242 bdev_comparev_and_writev_blocks_locked, bdev_io); 6243 } 6244 6245 int 6246 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6247 struct iovec *iov, int iovcnt, 6248 uint64_t offset_blocks, uint64_t num_blocks, 6249 bool populate, 6250 spdk_bdev_io_completion_cb cb, void *cb_arg) 6251 { 6252 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6253 struct spdk_bdev_io *bdev_io; 6254 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6255 6256 if (!desc->write) { 6257 return -EBADF; 6258 } 6259 6260 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6261 return -EINVAL; 6262 } 6263 6264 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 6265 return -ENOTSUP; 6266 } 6267 6268 bdev_io = bdev_channel_get_io(channel); 6269 if (!bdev_io) { 6270 return -ENOMEM; 6271 } 6272 6273 bdev_io->internal.ch = channel; 6274 bdev_io->internal.desc = desc; 6275 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 6276 bdev_io->u.bdev.num_blocks = num_blocks; 6277 bdev_io->u.bdev.offset_blocks = offset_blocks; 6278 bdev_io->u.bdev.iovs = iov; 6279 bdev_io->u.bdev.iovcnt = iovcnt; 6280 bdev_io->u.bdev.md_buf = NULL; 6281 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 6282 bdev_io->u.bdev.zcopy.commit = 0; 6283 bdev_io->u.bdev.zcopy.start = 1; 6284 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6285 bdev_io->u.bdev.memory_domain = NULL; 6286 bdev_io->u.bdev.memory_domain_ctx = NULL; 6287 bdev_io->u.bdev.accel_sequence = NULL; 6288 6289 bdev_io_submit(bdev_io); 6290 6291 return 0; 6292 } 6293 6294 int 6295 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 6296 spdk_bdev_io_completion_cb cb, void *cb_arg) 6297 { 6298 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 6299 return -EINVAL; 6300 } 6301 6302 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0;
	bdev_io->u.bdev.zcopy.start = 0;
	bdev_io->internal.caller_ctx = cb_arg;
	bdev_io->internal.cb = cb;
	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;

	bdev_io_submit(bdev_io);

	return 0;
}

int
spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       uint64_t offset, uint64_t len,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, len, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			      uint64_t offset_blocks, uint64_t num_blocks,
			      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) &&
	    !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) {
		return -ENOTSUP;
	}

	bdev_io = bdev_channel_get_io(channel);

	if (!bdev_io) {
		return -ENOMEM;
	}

	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
	bdev_io->internal.ch = channel;
	bdev_io->internal.desc = desc;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io_init(bdev_io, bdev, cb_arg, cb);
	bdev_io->u.bdev.memory_domain = NULL;
	bdev_io->u.bdev.memory_domain_ctx = NULL;
	bdev_io->u.bdev.accel_sequence = NULL;

	/* If the write_zeroes size is large and should be split, use the generic split
	 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported or not.
	 *
	 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported
	 * or emulate it using regular write requests otherwise.
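	 */

	/*
	 * Illustrative usage sketch, not part of the original sources: zeroing a
	 * region through spdk_bdev_write_zeroes_blocks(). The bdev layer either
	 * issues a native WRITE_ZEROES or, as described above, emulates it with
	 * regular writes from an internal zero buffer. The example_* names are
	 * hypothetical; the block is compiled out with #if 0.
	 */
#if 0
static void
example_write_zeroes_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	spdk_bdev_free_io(bdev_io);
}

static int
example_zero_first_mib(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	uint64_t num_blocks = (1024 * 1024) / spdk_bdev_get_block_size(bdev);

	/* Zero the first MiB of the bdev; the descriptor must be writable. */
	return spdk_bdev_write_zeroes_blocks(desc, ch, 0, num_blocks,
					     example_write_zeroes_done, NULL);
}
#endif

	/*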
6370 */ 6371 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 6372 bdev_io->internal.f.split) { 6373 bdev_io_submit(bdev_io); 6374 return 0; 6375 } 6376 6377 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6378 6379 return bdev_write_zero_buffer(bdev_io); 6380 } 6381 6382 int 6383 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6384 uint64_t offset, uint64_t nbytes, 6385 spdk_bdev_io_completion_cb cb, void *cb_arg) 6386 { 6387 uint64_t offset_blocks, num_blocks; 6388 6389 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 6390 return -EINVAL; 6391 } 6392 6393 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6394 } 6395 6396 static void 6397 bdev_io_complete_cb(void *ctx) 6398 { 6399 struct spdk_bdev_io *bdev_io = ctx; 6400 6401 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6402 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 6403 } 6404 6405 int 6406 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6407 uint64_t offset_blocks, uint64_t num_blocks, 6408 spdk_bdev_io_completion_cb cb, void *cb_arg) 6409 { 6410 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6411 struct spdk_bdev_io *bdev_io; 6412 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6413 6414 if (!desc->write) { 6415 return -EBADF; 6416 } 6417 6418 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6419 return -EINVAL; 6420 } 6421 6422 bdev_io = bdev_channel_get_io(channel); 6423 if (!bdev_io) { 6424 return -ENOMEM; 6425 } 6426 6427 bdev_io->internal.ch = channel; 6428 bdev_io->internal.desc = desc; 6429 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6430 6431 bdev_io->u.bdev.iovs = &bdev_io->iov; 6432 bdev_io->u.bdev.iovs[0].iov_base = NULL; 6433 bdev_io->u.bdev.iovs[0].iov_len = 0; 6434 bdev_io->u.bdev.iovcnt = 1; 6435 6436 bdev_io->u.bdev.offset_blocks = offset_blocks; 6437 bdev_io->u.bdev.num_blocks = num_blocks; 6438 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6439 bdev_io->u.bdev.memory_domain = NULL; 6440 bdev_io->u.bdev.memory_domain_ctx = NULL; 6441 bdev_io->u.bdev.accel_sequence = NULL; 6442 6443 if (num_blocks == 0) { 6444 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 6445 return 0; 6446 } 6447 6448 bdev_io_submit(bdev_io); 6449 return 0; 6450 } 6451 6452 int 6453 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6454 uint64_t offset, uint64_t length, 6455 spdk_bdev_io_completion_cb cb, void *cb_arg) 6456 { 6457 uint64_t offset_blocks, num_blocks; 6458 6459 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, length, &num_blocks) != 0) { 6460 return -EINVAL; 6461 } 6462 6463 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6464 } 6465 6466 int 6467 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6468 uint64_t offset_blocks, uint64_t num_blocks, 6469 spdk_bdev_io_completion_cb cb, void *cb_arg) 6470 { 6471 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6472 struct spdk_bdev_io *bdev_io; 6473 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6474 6475 if (!desc->write) { 6476 return -EBADF; 6477 } 6478 6479 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH))) { 6480 return -ENOTSUP; 6481 } 6482 6483 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6484 return -EINVAL; 6485 } 6486 6487 bdev_io = bdev_channel_get_io(channel); 6488 if (!bdev_io) { 6489 return 
-ENOMEM; 6490 } 6491 6492 bdev_io->internal.ch = channel; 6493 bdev_io->internal.desc = desc; 6494 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 6495 bdev_io->u.bdev.iovs = NULL; 6496 bdev_io->u.bdev.iovcnt = 0; 6497 bdev_io->u.bdev.offset_blocks = offset_blocks; 6498 bdev_io->u.bdev.num_blocks = num_blocks; 6499 bdev_io->u.bdev.memory_domain = NULL; 6500 bdev_io->u.bdev.memory_domain_ctx = NULL; 6501 bdev_io->u.bdev.accel_sequence = NULL; 6502 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6503 6504 bdev_io_submit(bdev_io); 6505 return 0; 6506 } 6507 6508 static int bdev_reset_poll_for_outstanding_io(void *ctx); 6509 6510 static void 6511 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 6512 { 6513 struct spdk_bdev_io *bdev_io = _ctx; 6514 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 6515 6516 if (status == -EBUSY) { 6517 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 6518 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 6519 bdev_io, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 6520 } else { 6521 if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) { 6522 /* If outstanding IOs are still present and reset_io_drain_timeout 6523 * seconds passed, start the reset. */ 6524 bdev_io_submit_reset(bdev_io); 6525 } else { 6526 /* We still have in progress memory domain pull/push or we're 6527 * executing accel sequence. Since we cannot abort either of those 6528 * operations, fail the reset request. */ 6529 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6530 } 6531 } 6532 } else { 6533 SPDK_DEBUGLOG(bdev, 6534 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 6535 ch->bdev->name); 6536 /* Mark the completion status as a SUCCESS and complete the reset. */ 6537 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 6538 } 6539 } 6540 6541 static void 6542 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6543 struct spdk_io_channel *io_ch, void *_ctx) 6544 { 6545 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 6546 int status = 0; 6547 6548 if (cur_ch->io_outstanding > 0 || 6549 !TAILQ_EMPTY(&cur_ch->io_memory_domain) || 6550 !TAILQ_EMPTY(&cur_ch->io_accel_exec)) { 6551 /* If a channel has outstanding IO, set status to -EBUSY code. This will stop 6552 * further iteration over the rest of the channels and pass non-zero status 6553 * to the callback function. 
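	 */

	/*
	 * Illustrative sketch, not part of the original sources: the same
	 * spdk_bdev_for_each_channel() / spdk_bdev_for_each_channel_continue()
	 * pattern used above, from the point of view of a module that wants to
	 * know whether any channel still has I/O outstanding. The example_* names
	 * are hypothetical; the block is compiled out with #if 0.
	 */
#if 0
static void
example_idle_check(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
		   struct spdk_io_channel *io_ch, void *ctx)
{
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch);
	int status = 0;

	if (ch->io_outstanding > 0) {
		/* A non-zero status stops the walk early, exactly like the
		 * -EBUSY handling in bdev_reset_check_outstanding_io(). */
		status = -EBUSY;
	}

	spdk_bdev_for_each_channel_continue(i, status);
}

static void
example_idle_check_done(struct spdk_bdev *bdev, void *ctx, int status)
{
	/* status == 0 means every channel was idle, -EBUSY otherwise. */
}

static void
example_check_idle(struct spdk_bdev *bdev)
{
	spdk_bdev_for_each_channel(bdev, example_idle_check, NULL, example_idle_check_done);
}
#endif

	/*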
*/ 6554 status = -EBUSY; 6555 } 6556 spdk_bdev_for_each_channel_continue(i, status); 6557 } 6558 6559 static int 6560 bdev_reset_poll_for_outstanding_io(void *ctx) 6561 { 6562 struct spdk_bdev_io *bdev_io = ctx; 6563 6564 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6565 spdk_bdev_for_each_channel(bdev_io->bdev, bdev_reset_check_outstanding_io, bdev_io, 6566 bdev_reset_check_outstanding_io_done); 6567 6568 return SPDK_POLLER_BUSY; 6569 } 6570 6571 static void 6572 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6573 { 6574 struct spdk_bdev_io *bdev_io = _ctx; 6575 6576 if (bdev->reset_io_drain_timeout == 0) { 6577 bdev_io_submit_reset(bdev_io); 6578 return; 6579 } 6580 6581 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6582 (bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6583 6584 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6585 * submit the reset to the underlying module only if outstanding I/O 6586 * remain after reset_io_drain_timeout seconds have passed. */ 6587 spdk_bdev_for_each_channel(bdev, bdev_reset_check_outstanding_io, bdev_io, 6588 bdev_reset_check_outstanding_io_done); 6589 } 6590 6591 static void 6592 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6593 struct spdk_io_channel *ch, void *_ctx) 6594 { 6595 struct spdk_bdev_channel *channel; 6596 struct spdk_bdev_mgmt_channel *mgmt_channel; 6597 struct spdk_bdev_shared_resource *shared_resource; 6598 bdev_io_tailq_t tmp_queued; 6599 6600 TAILQ_INIT(&tmp_queued); 6601 6602 channel = __io_ch_to_bdev_ch(ch); 6603 shared_resource = channel->shared_resource; 6604 mgmt_channel = shared_resource->mgmt_ch; 6605 6606 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6607 6608 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6609 TAILQ_SWAP(&channel->qos_queued_io, &tmp_queued, spdk_bdev_io, internal.link); 6610 } 6611 6612 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6613 bdev_abort_all_buf_io(mgmt_channel, channel); 6614 bdev_abort_all_queued_io(&tmp_queued, channel); 6615 6616 spdk_bdev_for_each_channel_continue(i, 0); 6617 } 6618 6619 static void 6620 bdev_start_reset(struct spdk_bdev_io *bdev_io) 6621 { 6622 struct spdk_bdev *bdev = bdev_io->bdev; 6623 bool freeze_channel = false; 6624 6625 bdev_ch_add_to_io_submitted(bdev_io); 6626 6627 /** 6628 * Take a channel reference for the target bdev for the life of this 6629 * reset. This guards against the channel getting destroyed before 6630 * the reset is completed. We will release the reference when this 6631 * reset is completed. 
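	 */

	/*
	 * Illustrative usage sketch, not part of the original sources: issuing a
	 * reset from an application. Queued I/O on all channels is aborted first,
	 * and with a non-zero reset_io_drain_timeout the reset may be skipped
	 * entirely if the outstanding I/O drains in time (see
	 * bdev_reset_check_outstanding_io_done() above). The example_* names are
	 * hypothetical; the block is compiled out with #if 0.
	 */
#if 0
static void
example_reset_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	spdk_bdev_free_io(bdev_io);
}

static int
example_reset_bdev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch)
{
	return spdk_bdev_reset(desc, ch, example_reset_done, NULL);
}
#endif

	/*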
6632 */ 6633 bdev_io->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6634 6635 spdk_spin_lock(&bdev->internal.spinlock); 6636 if (bdev->internal.reset_in_progress == NULL) { 6637 bdev->internal.reset_in_progress = bdev_io; 6638 freeze_channel = true; 6639 } else { 6640 TAILQ_INSERT_TAIL(&bdev->internal.queued_resets, bdev_io, internal.link); 6641 } 6642 spdk_spin_unlock(&bdev->internal.spinlock); 6643 6644 if (freeze_channel) { 6645 spdk_bdev_for_each_channel(bdev, bdev_reset_freeze_channel, bdev_io, 6646 bdev_reset_freeze_channel_done); 6647 } 6648 } 6649 6650 int 6651 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6652 spdk_bdev_io_completion_cb cb, void *cb_arg) 6653 { 6654 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6655 struct spdk_bdev_io *bdev_io; 6656 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6657 6658 bdev_io = bdev_channel_get_io(channel); 6659 if (!bdev_io) { 6660 return -ENOMEM; 6661 } 6662 6663 bdev_io->internal.ch = channel; 6664 bdev_io->internal.desc = desc; 6665 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6666 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6667 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6668 6669 bdev_start_reset(bdev_io); 6670 return 0; 6671 } 6672 6673 void 6674 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6675 struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode reset_mode) 6676 { 6677 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6678 6679 bdev_get_io_stat(stat, channel->stat); 6680 spdk_bdev_reset_io_stat(channel->stat, reset_mode); 6681 } 6682 6683 static void 6684 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6685 { 6686 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6687 6688 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6689 bdev_iostat_ctx->cb_arg, 0); 6690 free(bdev_iostat_ctx); 6691 } 6692 6693 static void 6694 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6695 struct spdk_io_channel *ch, void *_ctx) 6696 { 6697 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6698 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6699 6700 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6701 spdk_bdev_reset_io_stat(channel->stat, bdev_iostat_ctx->reset_mode); 6702 spdk_bdev_for_each_channel_continue(i, 0); 6703 } 6704 6705 void 6706 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6707 enum spdk_bdev_reset_stat_mode reset_mode, spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6708 { 6709 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6710 6711 assert(bdev != NULL); 6712 assert(stat != NULL); 6713 assert(cb != NULL); 6714 6715 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6716 if (bdev_iostat_ctx == NULL) { 6717 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6718 cb(bdev, stat, cb_arg, -ENOMEM); 6719 return; 6720 } 6721 6722 bdev_iostat_ctx->stat = stat; 6723 bdev_iostat_ctx->cb = cb; 6724 bdev_iostat_ctx->cb_arg = cb_arg; 6725 bdev_iostat_ctx->reset_mode = reset_mode; 6726 6727 /* Start with the statistics from previously deleted channels. */ 6728 spdk_spin_lock(&bdev->internal.spinlock); 6729 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6730 spdk_bdev_reset_io_stat(bdev->internal.stat, reset_mode); 6731 spdk_spin_unlock(&bdev->internal.spinlock); 6732 6733 /* Then iterate and add the statistics from each existing channel. 
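	 */

	/*
	 * Illustrative usage sketch, not part of the original sources: collecting
	 * aggregate per-bdev statistics with spdk_bdev_get_device_stat(). The
	 * SPDK_BDEV_RESET_STAT_NONE value is assumed to be the "do not reset the
	 * counters" member of enum spdk_bdev_reset_stat_mode; the example_* names
	 * are hypothetical and the block is compiled out with #if 0.
	 */
#if 0
static void
example_stat_done(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
		  void *cb_arg, int rc)
{
	if (rc == 0) {
		SPDK_NOTICELOG("%s: %" PRIu64 " bytes read so far\n",
			       spdk_bdev_get_name(bdev), stat->bytes_read);
	}
}

static void
example_dump_stats(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat)
{
	/* The callback runs once the per-channel statistics have been merged
	 * into *stat, which the caller must keep valid until then. */
	spdk_bdev_get_device_stat(bdev, stat, SPDK_BDEV_RESET_STAT_NONE,
				  example_stat_done, NULL);
}
#endif

	/*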
*/ 6734 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6735 bdev_get_device_stat_done); 6736 } 6737 6738 struct bdev_iostat_reset_ctx { 6739 enum spdk_bdev_reset_stat_mode mode; 6740 bdev_reset_device_stat_cb cb; 6741 void *cb_arg; 6742 }; 6743 6744 static void 6745 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6746 { 6747 struct bdev_iostat_reset_ctx *ctx = _ctx; 6748 6749 ctx->cb(bdev, ctx->cb_arg, 0); 6750 6751 free(ctx); 6752 } 6753 6754 static void 6755 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6756 struct spdk_io_channel *ch, void *_ctx) 6757 { 6758 struct bdev_iostat_reset_ctx *ctx = _ctx; 6759 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6760 6761 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6762 6763 spdk_bdev_for_each_channel_continue(i, 0); 6764 } 6765 6766 void 6767 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6768 bdev_reset_device_stat_cb cb, void *cb_arg) 6769 { 6770 struct bdev_iostat_reset_ctx *ctx; 6771 6772 assert(bdev != NULL); 6773 assert(cb != NULL); 6774 6775 ctx = calloc(1, sizeof(*ctx)); 6776 if (ctx == NULL) { 6777 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6778 cb(bdev, cb_arg, -ENOMEM); 6779 return; 6780 } 6781 6782 ctx->mode = mode; 6783 ctx->cb = cb; 6784 ctx->cb_arg = cb_arg; 6785 6786 spdk_spin_lock(&bdev->internal.spinlock); 6787 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6788 spdk_spin_unlock(&bdev->internal.spinlock); 6789 6790 spdk_bdev_for_each_channel(bdev, 6791 bdev_reset_each_channel_stat, 6792 ctx, 6793 bdev_reset_device_stat_done); 6794 } 6795 6796 int 6797 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6798 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6799 spdk_bdev_io_completion_cb cb, void *cb_arg) 6800 { 6801 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6802 struct spdk_bdev_io *bdev_io; 6803 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6804 6805 if (!desc->write) { 6806 return -EBADF; 6807 } 6808 6809 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6810 return -ENOTSUP; 6811 } 6812 6813 bdev_io = bdev_channel_get_io(channel); 6814 if (!bdev_io) { 6815 return -ENOMEM; 6816 } 6817 6818 bdev_io->internal.ch = channel; 6819 bdev_io->internal.desc = desc; 6820 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6821 bdev_io->u.nvme_passthru.cmd = *cmd; 6822 bdev_io->u.nvme_passthru.buf = buf; 6823 bdev_io->u.nvme_passthru.nbytes = nbytes; 6824 bdev_io->u.nvme_passthru.md_buf = NULL; 6825 bdev_io->u.nvme_passthru.md_len = 0; 6826 6827 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6828 6829 bdev_io_submit(bdev_io); 6830 return 0; 6831 } 6832 6833 int 6834 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6835 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6836 spdk_bdev_io_completion_cb cb, void *cb_arg) 6837 { 6838 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6839 struct spdk_bdev_io *bdev_io; 6840 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6841 6842 if (!desc->write) { 6843 /* 6844 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6845 * to easily determine if the command is a read or write, but for now just 6846 * do not allow io_passthru with a read-only descriptor. 
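	 */

	/*
	 * Illustrative usage sketch, not part of the original sources: sending an
	 * NVMe IDENTIFY CONTROLLER command through spdk_bdev_nvme_admin_passthru()
	 * (defined above). The descriptor must have been opened writable and buf
	 * must be 4096 bytes of DMA-able memory. The example_* names are
	 * hypothetical; the block is compiled out with #if 0.
	 */
#if 0
static void
example_identify_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	spdk_bdev_free_io(bdev_io);
}

static int
example_identify_ctrlr(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf)
{
	struct spdk_nvme_cmd cmd = {};

	cmd.opc = SPDK_NVME_OPC_IDENTIFY;
	cmd.cdw10 = 1;	/* CNS 0x01: identify controller */

	return spdk_bdev_nvme_admin_passthru(desc, ch, &cmd, buf, 4096,
					     example_identify_done, NULL);
}
#endif

	/*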
6847 */ 6848 return -EBADF; 6849 } 6850 6851 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6852 return -ENOTSUP; 6853 } 6854 6855 bdev_io = bdev_channel_get_io(channel); 6856 if (!bdev_io) { 6857 return -ENOMEM; 6858 } 6859 6860 bdev_io->internal.ch = channel; 6861 bdev_io->internal.desc = desc; 6862 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6863 bdev_io->u.nvme_passthru.cmd = *cmd; 6864 bdev_io->u.nvme_passthru.buf = buf; 6865 bdev_io->u.nvme_passthru.nbytes = nbytes; 6866 bdev_io->u.nvme_passthru.md_buf = NULL; 6867 bdev_io->u.nvme_passthru.md_len = 0; 6868 6869 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6870 6871 bdev_io_submit(bdev_io); 6872 return 0; 6873 } 6874 6875 int 6876 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6877 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6878 spdk_bdev_io_completion_cb cb, void *cb_arg) 6879 { 6880 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6881 struct spdk_bdev_io *bdev_io; 6882 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6883 6884 if (!desc->write) { 6885 /* 6886 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6887 * to easily determine if the command is a read or write, but for now just 6888 * do not allow io_passthru with a read-only descriptor. 6889 */ 6890 return -EBADF; 6891 } 6892 6893 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6894 return -ENOTSUP; 6895 } 6896 6897 bdev_io = bdev_channel_get_io(channel); 6898 if (!bdev_io) { 6899 return -ENOMEM; 6900 } 6901 6902 bdev_io->internal.ch = channel; 6903 bdev_io->internal.desc = desc; 6904 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6905 bdev_io->u.nvme_passthru.cmd = *cmd; 6906 bdev_io->u.nvme_passthru.buf = buf; 6907 bdev_io->u.nvme_passthru.nbytes = nbytes; 6908 bdev_io->u.nvme_passthru.md_buf = md_buf; 6909 bdev_io->u.nvme_passthru.md_len = md_len; 6910 6911 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6912 6913 bdev_io_submit(bdev_io); 6914 return 0; 6915 } 6916 6917 int 6918 spdk_bdev_nvme_iov_passthru_md(struct spdk_bdev_desc *desc, 6919 struct spdk_io_channel *ch, 6920 const struct spdk_nvme_cmd *cmd, 6921 struct iovec *iov, int iovcnt, size_t nbytes, 6922 void *md_buf, size_t md_len, 6923 spdk_bdev_io_completion_cb cb, void *cb_arg) 6924 { 6925 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6926 struct spdk_bdev_io *bdev_io; 6927 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6928 6929 if (!desc->write) { 6930 /* 6931 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6932 * to easily determine if the command is a read or write, but for now just 6933 * do not allow io_passthru with a read-only descriptor. 
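	 */

	/*
	 * Illustrative usage sketch, not part of the original sources: a raw NVMe
	 * READ with a separate metadata buffer via spdk_bdev_nvme_io_passthru_md()
	 * (defined above). The LBA fields (cdw10/cdw11) and the zero-based block
	 * count in cdw12 follow the NVMe command set; the example_* names and the
	 * chosen namespace are hypothetical. Compiled out with #if 0.
	 */
#if 0
static int
example_nvme_read_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		     void *data, uint32_t data_len, void *md, uint32_t md_len,
		     spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_nvme_cmd cmd = {};

	cmd.opc = SPDK_NVME_OPC_READ;
	cmd.nsid = 1;
	cmd.cdw10 = 0;	/* starting LBA, lower 32 bits */
	cmd.cdw11 = 0;	/* starting LBA, upper 32 bits */
	cmd.cdw12 = 0;	/* number of blocks - 1 */

	return spdk_bdev_nvme_io_passthru_md(desc, ch, &cmd, data, data_len,
					     md, md_len, cb, cb_arg);
}
#endif

	/*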
6934 */ 6935 return -EBADF; 6936 } 6937 6938 if (md_buf && spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6939 return -ENOTSUP; 6940 } else if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6941 return -ENOTSUP; 6942 } 6943 6944 bdev_io = bdev_channel_get_io(channel); 6945 if (!bdev_io) { 6946 return -ENOMEM; 6947 } 6948 6949 bdev_io->internal.ch = channel; 6950 bdev_io->internal.desc = desc; 6951 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IOV_MD; 6952 bdev_io->u.nvme_passthru.cmd = *cmd; 6953 bdev_io->u.nvme_passthru.iovs = iov; 6954 bdev_io->u.nvme_passthru.iovcnt = iovcnt; 6955 bdev_io->u.nvme_passthru.nbytes = nbytes; 6956 bdev_io->u.nvme_passthru.md_buf = md_buf; 6957 bdev_io->u.nvme_passthru.md_len = md_len; 6958 6959 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6960 6961 bdev_io_submit(bdev_io); 6962 return 0; 6963 } 6964 6965 static void bdev_abort_retry(void *ctx); 6966 static void bdev_abort(struct spdk_bdev_io *parent_io); 6967 6968 static void 6969 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6970 { 6971 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6972 struct spdk_bdev_io *parent_io = cb_arg; 6973 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6974 6975 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6976 6977 spdk_bdev_free_io(bdev_io); 6978 6979 if (!success) { 6980 /* Check if the target I/O completed in the meantime. */ 6981 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6982 if (tmp_io == bio_to_abort) { 6983 break; 6984 } 6985 } 6986 6987 /* If the target I/O still exists, set the parent to failed. */ 6988 if (tmp_io != NULL) { 6989 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6990 } 6991 } 6992 6993 assert(parent_io->internal.f.split); 6994 6995 parent_io->internal.split.outstanding--; 6996 if (parent_io->internal.split.outstanding == 0) { 6997 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6998 bdev_abort_retry(parent_io); 6999 } else { 7000 bdev_io_complete(parent_io); 7001 } 7002 } 7003 } 7004 7005 static int 7006 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 7007 struct spdk_bdev_io *bio_to_abort, 7008 spdk_bdev_io_completion_cb cb, void *cb_arg) 7009 { 7010 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7011 struct spdk_bdev_io *bdev_io; 7012 7013 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 7014 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 7015 /* TODO: Abort reset or abort request. */ 7016 return -ENOTSUP; 7017 } 7018 7019 bdev_io = bdev_channel_get_io(channel); 7020 if (bdev_io == NULL) { 7021 return -ENOMEM; 7022 } 7023 7024 bdev_io->internal.ch = channel; 7025 bdev_io->internal.desc = desc; 7026 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7027 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7028 7029 if (bio_to_abort->internal.f.split) { 7030 assert(bdev_io_should_split(bio_to_abort)); 7031 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 7032 7033 /* Parent abort request is not submitted directly, but to manage its 7034 * execution add it to the submitted list here. 7035 */ 7036 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7037 bdev_ch_add_to_io_submitted(bdev_io); 7038 7039 bdev_abort(bdev_io); 7040 7041 return 0; 7042 } 7043 7044 bdev_io->u.abort.bio_to_abort = bio_to_abort; 7045 7046 /* Submit the abort request to the underlying bdev module. 
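	 */

	/*
	 * Illustrative usage sketch, not part of the original sources: aborting all
	 * I/O previously submitted on a channel with a given cb_arg, using the
	 * public spdk_bdev_abort() defined further below. Each matching I/O becomes
	 * one child abort handled by bdev_abort_io(). The example_* names are
	 * hypothetical; the block is compiled out with #if 0.
	 */
#if 0
static void
example_abort_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	/* success is false if any matched I/O could not be aborted. */
	spdk_bdev_free_io(bdev_io);
}

static int
example_abort_by_tag(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *io_tag)
{
	return spdk_bdev_abort(desc, ch, io_tag, example_abort_done, NULL);
}
#endif

	/*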
*/ 7047 bdev_io_submit(bdev_io); 7048 7049 return 0; 7050 } 7051 7052 static bool 7053 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq) 7054 { 7055 struct spdk_bdev_io *iter; 7056 7057 TAILQ_FOREACH(iter, tailq, internal.link) { 7058 if (iter == bdev_io) { 7059 return true; 7060 } 7061 } 7062 7063 return false; 7064 } 7065 7066 static uint32_t 7067 _bdev_abort(struct spdk_bdev_io *parent_io) 7068 { 7069 struct spdk_bdev_desc *desc = parent_io->internal.desc; 7070 struct spdk_bdev_channel *channel = parent_io->internal.ch; 7071 void *bio_cb_arg; 7072 struct spdk_bdev_io *bio_to_abort; 7073 uint32_t matched_ios; 7074 int rc; 7075 7076 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 7077 7078 /* matched_ios is returned and will be kept by the caller. 7079 * 7080 * This function will be used for two cases, 1) the same cb_arg is used for 7081 * multiple I/Os, 2) a single large I/O is split into smaller ones. 7082 * Incrementing split_outstanding directly here may confuse readers especially 7083 * for the 1st case. 7084 * 7085 * Completion of I/O abort is processed after stack unwinding. Hence this trick 7086 * works as expected. 7087 */ 7088 matched_ios = 0; 7089 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 7090 7091 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 7092 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 7093 continue; 7094 } 7095 7096 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 7097 /* Any I/O which was submitted after this abort command should be excluded. */ 7098 continue; 7099 } 7100 7101 /* We can't abort a request that's being pushed/pulled or executed by accel */ 7102 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) || 7103 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) { 7104 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7105 break; 7106 } 7107 7108 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 7109 if (rc != 0) { 7110 if (rc == -ENOMEM) { 7111 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 7112 } else { 7113 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7114 } 7115 break; 7116 } 7117 matched_ios++; 7118 } 7119 7120 return matched_ios; 7121 } 7122 7123 static void 7124 bdev_abort_retry(void *ctx) 7125 { 7126 struct spdk_bdev_io *parent_io = ctx; 7127 uint32_t matched_ios; 7128 7129 matched_ios = _bdev_abort(parent_io); 7130 7131 if (matched_ios == 0) { 7132 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7133 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 7134 } else { 7135 /* For retry, the case that no target I/O was found is success 7136 * because it means target I/Os completed in the meantime. 7137 */ 7138 bdev_io_complete(parent_io); 7139 } 7140 return; 7141 } 7142 7143 /* Use split_outstanding to manage the progress of aborting I/Os. */ 7144 parent_io->internal.f.split = true; 7145 parent_io->internal.split.outstanding = matched_ios; 7146 } 7147 7148 static void 7149 bdev_abort(struct spdk_bdev_io *parent_io) 7150 { 7151 uint32_t matched_ios; 7152 7153 matched_ios = _bdev_abort(parent_io); 7154 7155 if (matched_ios == 0) { 7156 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7157 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 7158 } else { 7159 /* The case the no target I/O was found is failure. 
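	 */

	/*
	 * Illustrative usage sketch, not part of the original sources: the -ENOMEM
	 * retry pattern that bdev_queue_io_wait_with_cb() implements internally,
	 * written the way an application would use the public
	 * spdk_bdev_queue_io_wait() (defined further below). The example_* names
	 * are hypothetical; the block is compiled out with #if 0.
	 */
#if 0
struct example_retry_ctx {
	struct spdk_bdev_desc		*desc;
	struct spdk_io_channel		*ch;
	void				*buf;
	struct spdk_bdev_io_wait_entry	wait_entry;
};

static void
example_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	spdk_bdev_free_io(bdev_io);
}

static void
example_submit_read(void *arg)
{
	struct example_retry_ctx *ctx = arg;
	int rc;

	rc = spdk_bdev_read_blocks(ctx->desc, ctx->ch, ctx->buf, 0, 1,
				   example_read_done, ctx);
	if (rc == -ENOMEM) {
		/* No spdk_bdev_io is available right now; ask to be called back
		 * once one is freed and then resubmit. */
		ctx->wait_entry.bdev = spdk_bdev_desc_get_bdev(ctx->desc);
		ctx->wait_entry.cb_fn = example_submit_read;
		ctx->wait_entry.cb_arg = ctx;
		spdk_bdev_queue_io_wait(ctx->wait_entry.bdev, ctx->ch, &ctx->wait_entry);
	}
}
#endif

	/*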
*/ 7160 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7161 bdev_io_complete(parent_io); 7162 } 7163 return; 7164 } 7165 7166 /* Use split_outstanding to manage the progress of aborting I/Os. */ 7167 parent_io->internal.f.split = true; 7168 parent_io->internal.split.outstanding = matched_ios; 7169 } 7170 7171 int 7172 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 7173 void *bio_cb_arg, 7174 spdk_bdev_io_completion_cb cb, void *cb_arg) 7175 { 7176 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7177 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7178 struct spdk_bdev_io *bdev_io; 7179 7180 if (bio_cb_arg == NULL) { 7181 return -EINVAL; 7182 } 7183 7184 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 7185 return -ENOTSUP; 7186 } 7187 7188 bdev_io = bdev_channel_get_io(channel); 7189 if (bdev_io == NULL) { 7190 return -ENOMEM; 7191 } 7192 7193 bdev_io->internal.ch = channel; 7194 bdev_io->internal.desc = desc; 7195 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7196 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7197 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7198 7199 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 7200 7201 /* Parent abort request is not submitted directly, but to manage its execution, 7202 * add it to the submitted list here. 7203 */ 7204 bdev_ch_add_to_io_submitted(bdev_io); 7205 7206 bdev_abort(bdev_io); 7207 7208 return 0; 7209 } 7210 7211 int 7212 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 7213 struct spdk_bdev_io_wait_entry *entry) 7214 { 7215 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7216 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 7217 7218 if (bdev != entry->bdev) { 7219 SPDK_ERRLOG("bdevs do not match\n"); 7220 return -EINVAL; 7221 } 7222 7223 if (mgmt_ch->per_thread_cache_count > 0) { 7224 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 7225 return -EINVAL; 7226 } 7227 7228 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 7229 return 0; 7230 } 7231 7232 static inline void 7233 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 7234 { 7235 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 7236 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 7237 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 7238 uint32_t blocklen = bdev_io->bdev->blocklen; 7239 7240 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7241 switch (bdev_io->type) { 7242 case SPDK_BDEV_IO_TYPE_READ: 7243 io_stat->bytes_read += num_blocks * blocklen; 7244 io_stat->num_read_ops++; 7245 io_stat->read_latency_ticks += tsc_diff; 7246 if (io_stat->max_read_latency_ticks < tsc_diff) { 7247 io_stat->max_read_latency_ticks = tsc_diff; 7248 } 7249 if (io_stat->min_read_latency_ticks > tsc_diff) { 7250 io_stat->min_read_latency_ticks = tsc_diff; 7251 } 7252 break; 7253 case SPDK_BDEV_IO_TYPE_WRITE: 7254 io_stat->bytes_written += num_blocks * blocklen; 7255 io_stat->num_write_ops++; 7256 io_stat->write_latency_ticks += tsc_diff; 7257 if (io_stat->max_write_latency_ticks < tsc_diff) { 7258 io_stat->max_write_latency_ticks = tsc_diff; 7259 } 7260 if (io_stat->min_write_latency_ticks > tsc_diff) { 7261 io_stat->min_write_latency_ticks = tsc_diff; 7262 } 7263 break; 7264 case SPDK_BDEV_IO_TYPE_UNMAP: 7265 io_stat->bytes_unmapped += num_blocks * blocklen; 7266 io_stat->num_unmap_ops++; 7267 io_stat->unmap_latency_ticks += tsc_diff; 7268 if 
(io_stat->max_unmap_latency_ticks < tsc_diff) { 7269 io_stat->max_unmap_latency_ticks = tsc_diff; 7270 } 7271 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 7272 io_stat->min_unmap_latency_ticks = tsc_diff; 7273 } 7274 break; 7275 case SPDK_BDEV_IO_TYPE_ZCOPY: 7276 /* Track the data in the start phase only */ 7277 if (bdev_io->u.bdev.zcopy.start) { 7278 if (bdev_io->u.bdev.zcopy.populate) { 7279 io_stat->bytes_read += num_blocks * blocklen; 7280 io_stat->num_read_ops++; 7281 io_stat->read_latency_ticks += tsc_diff; 7282 if (io_stat->max_read_latency_ticks < tsc_diff) { 7283 io_stat->max_read_latency_ticks = tsc_diff; 7284 } 7285 if (io_stat->min_read_latency_ticks > tsc_diff) { 7286 io_stat->min_read_latency_ticks = tsc_diff; 7287 } 7288 } else { 7289 io_stat->bytes_written += num_blocks * blocklen; 7290 io_stat->num_write_ops++; 7291 io_stat->write_latency_ticks += tsc_diff; 7292 if (io_stat->max_write_latency_ticks < tsc_diff) { 7293 io_stat->max_write_latency_ticks = tsc_diff; 7294 } 7295 if (io_stat->min_write_latency_ticks > tsc_diff) { 7296 io_stat->min_write_latency_ticks = tsc_diff; 7297 } 7298 } 7299 } 7300 break; 7301 case SPDK_BDEV_IO_TYPE_COPY: 7302 io_stat->bytes_copied += num_blocks * blocklen; 7303 io_stat->num_copy_ops++; 7304 bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; 7305 if (io_stat->max_copy_latency_ticks < tsc_diff) { 7306 io_stat->max_copy_latency_ticks = tsc_diff; 7307 } 7308 if (io_stat->min_copy_latency_ticks > tsc_diff) { 7309 io_stat->min_copy_latency_ticks = tsc_diff; 7310 } 7311 break; 7312 default: 7313 break; 7314 } 7315 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 7316 io_stat = bdev_io->bdev->internal.stat; 7317 assert(io_stat->io_error != NULL); 7318 7319 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 7320 io_stat->io_error->error_status[-io_status - 1]++; 7321 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 7322 } 7323 7324 #ifdef SPDK_CONFIG_VTUNE 7325 uint64_t now_tsc = spdk_get_ticks(); 7326 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 7327 uint64_t data[5]; 7328 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 7329 7330 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 7331 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 7332 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 7333 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 7334 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 
7335 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 7336 7337 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 7338 __itt_metadata_u64, 5, data); 7339 7340 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 7341 bdev_io->internal.ch->start_tsc = now_tsc; 7342 } 7343 #endif 7344 } 7345 7346 static inline void 7347 _bdev_io_complete(void *ctx) 7348 { 7349 struct spdk_bdev_io *bdev_io = ctx; 7350 7351 if (spdk_unlikely(bdev_io_use_accel_sequence(bdev_io))) { 7352 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7353 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 7354 } 7355 7356 assert(bdev_io->internal.cb != NULL); 7357 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 7358 7359 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 7360 bdev_io->internal.caller_ctx); 7361 } 7362 7363 static inline void 7364 bdev_io_complete(void *ctx) 7365 { 7366 struct spdk_bdev_io *bdev_io = ctx; 7367 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7368 uint64_t tsc, tsc_diff; 7369 7370 if (spdk_unlikely(bdev_io->internal.f.in_submit_request)) { 7371 /* 7372 * Defer completion to avoid potential infinite recursion if the 7373 * user's completion callback issues a new I/O. 7374 */ 7375 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7376 bdev_io_complete, bdev_io); 7377 return; 7378 } 7379 7380 tsc = spdk_get_ticks(); 7381 tsc_diff = tsc - bdev_io->internal.submit_tsc; 7382 7383 bdev_ch_remove_from_io_submitted(bdev_io); 7384 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, bdev_ch->trace_id, 0, (uintptr_t)bdev_io, 7385 bdev_io->internal.caller_ctx, bdev_ch->queue_depth); 7386 7387 if (bdev_ch->histogram) { 7388 if (bdev_io->bdev->internal.histogram_io_type == 0 || 7389 bdev_io->bdev->internal.histogram_io_type == bdev_io->type) { 7390 /* 7391 * Tally all I/O types if the histogram_io_type is set to 0. 7392 */ 7393 spdk_histogram_data_tally(bdev_ch->histogram, tsc_diff); 7394 } 7395 } 7396 7397 bdev_io_update_io_stat(bdev_io, tsc_diff); 7398 _bdev_io_complete(bdev_io); 7399 } 7400 7401 /* The difference between this function and bdev_io_complete() is that this should be called to 7402 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 7403 * io_submitted list and don't have submit_tsc updated. 7404 */ 7405 static inline void 7406 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 7407 { 7408 /* Since the IO hasn't been submitted it's bound to be failed */ 7409 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7410 7411 /* At this point we don't know if the IO is completed from submission context or not, but, 7412 * since this is an error path, we can always do an spdk_thread_send_msg(). */ 7413 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7414 _bdev_io_complete, bdev_io); 7415 } 7416 7417 static void bdev_destroy_cb(void *io_device); 7418 7419 static inline void 7420 _bdev_reset_complete(void *ctx) 7421 { 7422 struct spdk_bdev_io *bdev_io = ctx; 7423 7424 /* Put the channel reference we got in submission. 
*/ 7425 assert(bdev_io->u.reset.ch_ref != NULL); 7426 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7427 bdev_io->u.reset.ch_ref = NULL; 7428 7429 bdev_io_complete(bdev_io); 7430 } 7431 7432 static void 7433 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7434 { 7435 struct spdk_bdev_io *bdev_io = _ctx; 7436 bdev_io_tailq_t queued_resets; 7437 struct spdk_bdev_io *queued_reset; 7438 7439 assert(bdev_io == bdev->internal.reset_in_progress); 7440 7441 TAILQ_INIT(&queued_resets); 7442 7443 spdk_spin_lock(&bdev->internal.spinlock); 7444 TAILQ_SWAP(&bdev->internal.queued_resets, &queued_resets, 7445 spdk_bdev_io, internal.link); 7446 bdev->internal.reset_in_progress = NULL; 7447 spdk_spin_unlock(&bdev->internal.spinlock); 7448 7449 while (!TAILQ_EMPTY(&queued_resets)) { 7450 queued_reset = TAILQ_FIRST(&queued_resets); 7451 TAILQ_REMOVE(&queued_resets, queued_reset, internal.link); 7452 queued_reset->internal.status = bdev_io->internal.status; 7453 spdk_thread_send_msg(spdk_bdev_io_get_thread(queued_reset), 7454 _bdev_reset_complete, queued_reset); 7455 } 7456 7457 _bdev_reset_complete(bdev_io); 7458 7459 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7460 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7461 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7462 } 7463 } 7464 7465 static void 7466 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7467 struct spdk_io_channel *_ch, void *_ctx) 7468 { 7469 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7470 7471 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7472 7473 spdk_bdev_for_each_channel_continue(i, 0); 7474 } 7475 7476 static void 7477 bdev_io_complete_sequence_cb(void *ctx, int status) 7478 { 7479 struct spdk_bdev_io *bdev_io = ctx; 7480 7481 /* u.bdev.accel_sequence should have already been cleared at this point */ 7482 assert(bdev_io->u.bdev.accel_sequence == NULL); 7483 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7484 bdev_io->internal.f.has_accel_sequence = false; 7485 7486 if (spdk_unlikely(status != 0)) { 7487 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7488 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7489 } 7490 7491 bdev_io_complete(bdev_io); 7492 } 7493 7494 void 7495 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7496 { 7497 struct spdk_bdev *bdev = bdev_io->bdev; 7498 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7499 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7500 7501 if (spdk_unlikely(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING)) { 7502 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7503 spdk_bdev_get_module_name(bdev), 7504 bdev_io_status_get_string(bdev_io->internal.status)); 7505 assert(false); 7506 } 7507 bdev_io->internal.status = status; 7508 7509 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7510 assert(bdev_io == bdev->internal.reset_in_progress); 7511 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 7512 bdev_reset_complete); 7513 return; 7514 } else { 7515 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7516 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7517 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7518 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7519 return; 7520 } else if (spdk_unlikely(bdev_io->internal.f.has_bounce_buf && 7521 
!bdev_io_use_accel_sequence(bdev_io))) { 7522 _bdev_io_push_bounce_data_buffer(bdev_io, 7523 _bdev_io_complete_push_bounce_done); 7524 /* bdev IO will be completed in the callback */ 7525 return; 7526 } 7527 } 7528 7529 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7530 return; 7531 } 7532 } 7533 7534 bdev_io_complete(bdev_io); 7535 } 7536 7537 void 7538 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 7539 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7540 { 7541 enum spdk_bdev_io_status status; 7542 7543 if (sc == SPDK_SCSI_STATUS_GOOD) { 7544 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7545 } else { 7546 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7547 bdev_io->internal.error.scsi.sc = sc; 7548 bdev_io->internal.error.scsi.sk = sk; 7549 bdev_io->internal.error.scsi.asc = asc; 7550 bdev_io->internal.error.scsi.ascq = ascq; 7551 } 7552 7553 spdk_bdev_io_complete(bdev_io, status); 7554 } 7555 7556 void 7557 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7558 int *sc, int *sk, int *asc, int *ascq) 7559 { 7560 assert(sc != NULL); 7561 assert(sk != NULL); 7562 assert(asc != NULL); 7563 assert(ascq != NULL); 7564 7565 switch (bdev_io->internal.status) { 7566 case SPDK_BDEV_IO_STATUS_SUCCESS: 7567 *sc = SPDK_SCSI_STATUS_GOOD; 7568 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7569 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7570 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7571 break; 7572 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7573 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7574 break; 7575 case SPDK_BDEV_IO_STATUS_MISCOMPARE: 7576 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7577 *sk = SPDK_SCSI_SENSE_MISCOMPARE; 7578 *asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION; 7579 *ascq = bdev_io->internal.error.scsi.ascq; 7580 break; 7581 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7582 *sc = bdev_io->internal.error.scsi.sc; 7583 *sk = bdev_io->internal.error.scsi.sk; 7584 *asc = bdev_io->internal.error.scsi.asc; 7585 *ascq = bdev_io->internal.error.scsi.ascq; 7586 break; 7587 default: 7588 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7589 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7590 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7591 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7592 break; 7593 } 7594 } 7595 7596 void 7597 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7598 { 7599 enum spdk_bdev_io_status status; 7600 7601 if (aio_result == 0) { 7602 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7603 } else { 7604 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7605 } 7606 7607 bdev_io->internal.error.aio_result = aio_result; 7608 7609 spdk_bdev_io_complete(bdev_io, status); 7610 } 7611 7612 void 7613 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7614 { 7615 assert(aio_result != NULL); 7616 7617 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7618 *aio_result = bdev_io->internal.error.aio_result; 7619 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7620 *aio_result = 0; 7621 } else { 7622 *aio_result = -EIO; 7623 } 7624 } 7625 7626 void 7627 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7628 { 7629 enum spdk_bdev_io_status status; 7630 7631 if (spdk_likely(sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS)) { 7632 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7633 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7634 status = SPDK_BDEV_IO_STATUS_ABORTED; 7635 
} else { 7636 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7637 } 7638 7639 bdev_io->internal.error.nvme.cdw0 = cdw0; 7640 bdev_io->internal.error.nvme.sct = sct; 7641 bdev_io->internal.error.nvme.sc = sc; 7642 7643 spdk_bdev_io_complete(bdev_io, status); 7644 } 7645 7646 void 7647 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7648 { 7649 assert(sct != NULL); 7650 assert(sc != NULL); 7651 assert(cdw0 != NULL); 7652 7653 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7654 *sct = SPDK_NVME_SCT_GENERIC; 7655 *sc = SPDK_NVME_SC_SUCCESS; 7656 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7657 *cdw0 = 0; 7658 } else { 7659 *cdw0 = 1U; 7660 } 7661 return; 7662 } 7663 7664 if (spdk_likely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7665 *sct = SPDK_NVME_SCT_GENERIC; 7666 *sc = SPDK_NVME_SC_SUCCESS; 7667 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7668 *sct = bdev_io->internal.error.nvme.sct; 7669 *sc = bdev_io->internal.error.nvme.sc; 7670 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7671 *sct = SPDK_NVME_SCT_GENERIC; 7672 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7673 } else { 7674 *sct = SPDK_NVME_SCT_GENERIC; 7675 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7676 } 7677 7678 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7679 } 7680 7681 void 7682 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7683 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7684 { 7685 assert(first_sct != NULL); 7686 assert(first_sc != NULL); 7687 assert(second_sct != NULL); 7688 assert(second_sc != NULL); 7689 assert(cdw0 != NULL); 7690 7691 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7692 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7693 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7694 *first_sct = bdev_io->internal.error.nvme.sct; 7695 *first_sc = bdev_io->internal.error.nvme.sc; 7696 *second_sct = SPDK_NVME_SCT_GENERIC; 7697 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7698 } else { 7699 *first_sct = SPDK_NVME_SCT_GENERIC; 7700 *first_sc = SPDK_NVME_SC_SUCCESS; 7701 *second_sct = bdev_io->internal.error.nvme.sct; 7702 *second_sc = bdev_io->internal.error.nvme.sc; 7703 } 7704 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7705 *first_sct = SPDK_NVME_SCT_GENERIC; 7706 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7707 *second_sct = SPDK_NVME_SCT_GENERIC; 7708 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7709 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7710 *first_sct = SPDK_NVME_SCT_GENERIC; 7711 *first_sc = SPDK_NVME_SC_SUCCESS; 7712 *second_sct = SPDK_NVME_SCT_GENERIC; 7713 *second_sc = SPDK_NVME_SC_SUCCESS; 7714 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7715 *first_sct = SPDK_NVME_SCT_GENERIC; 7716 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7717 *second_sct = SPDK_NVME_SCT_GENERIC; 7718 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7719 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7720 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7721 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 7722 *second_sct = SPDK_NVME_SCT_GENERIC; 7723 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7724 } else { 7725 *first_sct = SPDK_NVME_SCT_GENERIC; 7726 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7727 *second_sct = SPDK_NVME_SCT_GENERIC; 7728 *second_sc = 
SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7729 } 7730 7731 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7732 } 7733 7734 void 7735 spdk_bdev_io_complete_base_io_status(struct spdk_bdev_io *bdev_io, 7736 const struct spdk_bdev_io *base_io) 7737 { 7738 switch (base_io->internal.status) { 7739 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7740 spdk_bdev_io_complete_nvme_status(bdev_io, 7741 base_io->internal.error.nvme.cdw0, 7742 base_io->internal.error.nvme.sct, 7743 base_io->internal.error.nvme.sc); 7744 break; 7745 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7746 spdk_bdev_io_complete_scsi_status(bdev_io, 7747 base_io->internal.error.scsi.sc, 7748 base_io->internal.error.scsi.sk, 7749 base_io->internal.error.scsi.asc, 7750 base_io->internal.error.scsi.ascq); 7751 break; 7752 case SPDK_BDEV_IO_STATUS_AIO_ERROR: 7753 spdk_bdev_io_complete_aio_status(bdev_io, base_io->internal.error.aio_result); 7754 break; 7755 default: 7756 spdk_bdev_io_complete(bdev_io, base_io->internal.status); 7757 break; 7758 } 7759 } 7760 7761 struct spdk_thread * 7762 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7763 { 7764 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7765 } 7766 7767 struct spdk_io_channel * 7768 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7769 { 7770 return bdev_io->internal.ch->channel; 7771 } 7772 7773 static int 7774 bdev_register(struct spdk_bdev *bdev) 7775 { 7776 char *bdev_name; 7777 char uuid[SPDK_UUID_STRING_LEN]; 7778 struct spdk_iobuf_opts iobuf_opts; 7779 int ret; 7780 7781 assert(bdev->module != NULL); 7782 7783 if (!bdev->name) { 7784 SPDK_ERRLOG("Bdev name is NULL\n"); 7785 return -EINVAL; 7786 } 7787 7788 if (!strlen(bdev->name)) { 7789 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7790 return -EINVAL; 7791 } 7792 7793 /* Users often register their own I/O devices using the bdev name. In 7794 * order to avoid conflicts, prepend bdev_. */ 7795 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7796 if (!bdev_name) { 7797 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7798 return -ENOMEM; 7799 } 7800 7801 bdev->internal.stat = bdev_alloc_io_stat(true); 7802 if (!bdev->internal.stat) { 7803 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7804 free(bdev_name); 7805 return -ENOMEM; 7806 } 7807 7808 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7809 bdev->internal.measured_queue_depth = UINT64_MAX; 7810 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7811 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7812 bdev->internal.qd_poller = NULL; 7813 bdev->internal.qos = NULL; 7814 7815 TAILQ_INIT(&bdev->internal.open_descs); 7816 TAILQ_INIT(&bdev->internal.locked_ranges); 7817 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7818 TAILQ_INIT(&bdev->internal.queued_resets); 7819 TAILQ_INIT(&bdev->aliases); 7820 7821 /* UUID may be specified by the user or defined by bdev itself. 7822 * Otherwise it will be generated here, so this field will never be empty. 
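 * As an illustration, a module that persists its UUID could pre-populate the
 * field before calling spdk_bdev_register(), e.g. with
 * spdk_uuid_parse(&bdev->uuid, uuid_str) for a hypothetical stored uuid_str
 * string; the spdk_uuid_is_null() check below then skips generation.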
*/ 7823 if (spdk_uuid_is_null(&bdev->uuid)) { 7824 spdk_uuid_generate(&bdev->uuid); 7825 } 7826 7827 /* Add the UUID alias only if it's different than the name */ 7828 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7829 if (strcmp(bdev->name, uuid) != 0) { 7830 ret = spdk_bdev_alias_add(bdev, uuid); 7831 if (ret != 0) { 7832 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7833 bdev_free_io_stat(bdev->internal.stat); 7834 free(bdev_name); 7835 return ret; 7836 } 7837 } 7838 7839 spdk_iobuf_get_opts(&iobuf_opts, sizeof(iobuf_opts)); 7840 if (spdk_bdev_get_buf_align(bdev) > 1) { 7841 bdev->max_rw_size = spdk_min(bdev->max_rw_size ? bdev->max_rw_size : UINT32_MAX, 7842 iobuf_opts.large_bufsize / bdev->blocklen); 7843 } 7844 7845 /* If the user didn't specify a write unit size, set it to one. */ 7846 if (bdev->write_unit_size == 0) { 7847 bdev->write_unit_size = 1; 7848 } 7849 7850 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7851 if (bdev->acwu == 0) { 7852 bdev->acwu = bdev->write_unit_size; 7853 } 7854 7855 if (bdev->phys_blocklen == 0) { 7856 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7857 } 7858 7859 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 7860 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 7861 } 7862 7863 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 7864 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 7865 } 7866 7867 bdev->internal.reset_in_progress = NULL; 7868 bdev->internal.qd_poll_in_progress = false; 7869 bdev->internal.period = 0; 7870 bdev->internal.new_period = 0; 7871 bdev->internal.trace_id = spdk_trace_register_owner(OWNER_TYPE_BDEV, bdev_name); 7872 7873 /* 7874 * Initialize spinlock before registering IO device because spinlock is used in 7875 * bdev_channel_create 7876 */ 7877 spdk_spin_init(&bdev->internal.spinlock); 7878 7879 spdk_io_device_register(__bdev_to_io_dev(bdev), 7880 bdev_channel_create, bdev_channel_destroy, 7881 sizeof(struct spdk_bdev_channel), 7882 bdev_name); 7883 7884 /* 7885 * Register bdev name only after the bdev object is ready. 7886 * After bdev_name_add returns, it is possible for other threads to start using the bdev, 7887 * create IO channels... 
7888 */ 7889 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 7890 if (ret != 0) { 7891 spdk_io_device_unregister(__bdev_to_io_dev(bdev), NULL); 7892 bdev_free_io_stat(bdev->internal.stat); 7893 spdk_spin_destroy(&bdev->internal.spinlock); 7894 free(bdev_name); 7895 return ret; 7896 } 7897 7898 free(bdev_name); 7899 7900 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 7901 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 7902 7903 return 0; 7904 } 7905 7906 static void 7907 bdev_destroy_cb(void *io_device) 7908 { 7909 int rc; 7910 struct spdk_bdev *bdev; 7911 spdk_bdev_unregister_cb cb_fn; 7912 void *cb_arg; 7913 7914 bdev = __bdev_from_io_dev(io_device); 7915 7916 if (bdev->internal.unregister_td != spdk_get_thread()) { 7917 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 7918 return; 7919 } 7920 7921 cb_fn = bdev->internal.unregister_cb; 7922 cb_arg = bdev->internal.unregister_ctx; 7923 7924 spdk_spin_destroy(&bdev->internal.spinlock); 7925 free(bdev->internal.qos); 7926 bdev_free_io_stat(bdev->internal.stat); 7927 spdk_trace_unregister_owner(bdev->internal.trace_id); 7928 7929 rc = bdev->fn_table->destruct(bdev->ctxt); 7930 if (rc < 0) { 7931 SPDK_ERRLOG("destruct failed\n"); 7932 } 7933 if (rc <= 0 && cb_fn != NULL) { 7934 cb_fn(cb_arg, rc); 7935 } 7936 } 7937 7938 void 7939 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 7940 { 7941 if (bdev->internal.unregister_cb != NULL) { 7942 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 7943 } 7944 } 7945 7946 static void 7947 _remove_notify(void *arg) 7948 { 7949 struct spdk_bdev_desc *desc = arg; 7950 7951 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 7952 } 7953 7954 /* returns: 0 - bdev removed and ready to be destructed. 7955 * -EBUSY - bdev can't be destructed yet. */ 7956 static int 7957 bdev_unregister_unsafe(struct spdk_bdev *bdev) 7958 { 7959 struct spdk_bdev_desc *desc, *tmp; 7960 struct spdk_bdev_alias *alias; 7961 int rc = 0; 7962 char uuid[SPDK_UUID_STRING_LEN]; 7963 7964 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 7965 assert(spdk_spin_held(&bdev->internal.spinlock)); 7966 7967 /* Notify each descriptor about hotremoval */ 7968 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 7969 rc = -EBUSY; 7970 /* 7971 * Defer invocation of the event_cb to a separate message that will 7972 * run later on its thread. This ensures this context unwinds and 7973 * we don't recursively unregister this bdev again if the event_cb 7974 * immediately closes its descriptor. 7975 */ 7976 event_notify(desc, _remove_notify); 7977 } 7978 7979 /* If there are no descriptors, proceed removing the bdev */ 7980 if (rc == 0) { 7981 bdev_examine_allowlist_remove(bdev->name); 7982 TAILQ_FOREACH(alias, &bdev->aliases, tailq) { 7983 bdev_examine_allowlist_remove(alias->alias.name); 7984 } 7985 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 7986 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 7987 7988 /* Delete the name and the UUID alias */ 7989 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7990 bdev_name_del_unsafe(&bdev->internal.bdev_name); 7991 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 7992 7993 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 7994 7995 if (bdev->internal.reset_in_progress != NULL) { 7996 /* If reset is in progress, let the completion callback for reset 7997 * unregister the bdev. 
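 * In that case, bdev_reset_complete() performs the deferred
 * spdk_io_device_unregister() once the open_descs list is empty (see the
 * SPDK_BDEV_STATUS_REMOVING check there).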
7998 */ 7999 rc = -EBUSY; 8000 } 8001 } 8002 8003 return rc; 8004 } 8005 8006 static void 8007 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8008 struct spdk_io_channel *io_ch, void *_ctx) 8009 { 8010 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 8011 8012 bdev_channel_abort_queued_ios(bdev_ch); 8013 spdk_bdev_for_each_channel_continue(i, 0); 8014 } 8015 8016 static void 8017 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 8018 { 8019 int rc; 8020 8021 spdk_spin_lock(&g_bdev_mgr.spinlock); 8022 spdk_spin_lock(&bdev->internal.spinlock); 8023 /* 8024 * Set the status to REMOVING only after aborting the channels has completed. Otherwise, 8025 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 8026 * spdk_bdev_for_each_channel() is still executing, and spdk_io_device_unregister() 8027 * may fail. 8028 */ 8029 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 8030 rc = bdev_unregister_unsafe(bdev); 8031 spdk_spin_unlock(&bdev->internal.spinlock); 8032 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8033 8034 if (rc == 0) { 8035 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8036 } 8037 } 8038 8039 void 8040 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 8041 { 8042 struct spdk_thread *thread; 8043 8044 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 8045 8046 thread = spdk_get_thread(); 8047 if (!thread) { 8048 /* The user called this from a non-SPDK thread. */ 8049 if (cb_fn != NULL) { 8050 cb_fn(cb_arg, -ENOTSUP); 8051 } 8052 return; 8053 } 8054 8055 spdk_spin_lock(&g_bdev_mgr.spinlock); 8056 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 8057 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8058 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8059 if (cb_fn) { 8060 cb_fn(cb_arg, -EBUSY); 8061 } 8062 return; 8063 } 8064 8065 spdk_spin_lock(&bdev->internal.spinlock); 8066 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 8067 bdev->internal.unregister_cb = cb_fn; 8068 bdev->internal.unregister_ctx = cb_arg; 8069 bdev->internal.unregister_td = thread; 8070 spdk_spin_unlock(&bdev->internal.spinlock); 8071 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8072 8073 spdk_bdev_set_qd_sampling_period(bdev, 0); 8074 8075 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 8076 bdev_unregister); 8077 } 8078 8079 int 8080 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 8081 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 8082 { 8083 struct spdk_bdev_desc *desc; 8084 struct spdk_bdev *bdev; 8085 int rc; 8086 8087 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 8088 if (rc != 0) { 8089 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 8090 return rc; 8091 } 8092 8093 bdev = spdk_bdev_desc_get_bdev(desc); 8094 8095 if (bdev->module != module) { 8096 spdk_bdev_close(desc); 8097 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 8098 bdev_name); 8099 return -ENODEV; 8100 } 8101 8102 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 8103 8104 spdk_bdev_close(desc); 8105 8106 return 0; 8107 } 8108 8109 static int 8110 bdev_start_qos(struct spdk_bdev *bdev) 8111 { 8112 struct set_qos_limit_ctx *ctx; 8113 8114 /* Enable QoS */ 8115 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 8116 ctx = calloc(1, sizeof(*ctx)); 8117 if (ctx == NULL) { 8118 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 8119 return -ENOMEM; 8120 } 8121
ctx->bdev = bdev; 8122 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 8123 } 8124 8125 return 0; 8126 } 8127 8128 static void 8129 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 8130 struct spdk_bdev *bdev) 8131 { 8132 enum spdk_bdev_claim_type type; 8133 const char *typename, *modname; 8134 extern struct spdk_log_flag SPDK_LOG_bdev; 8135 8136 assert(spdk_spin_held(&bdev->internal.spinlock)); 8137 8138 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 8139 return; 8140 } 8141 8142 type = bdev->internal.claim_type; 8143 typename = spdk_bdev_claim_get_name(type); 8144 8145 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 8146 modname = bdev->internal.claim.v1.module->name; 8147 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 8148 bdev->name, detail, typename, modname); 8149 return; 8150 } 8151 8152 if (claim_type_is_v2(type)) { 8153 struct spdk_bdev_module_claim *claim; 8154 8155 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 8156 modname = claim->module->name; 8157 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 8158 bdev->name, detail, typename, modname); 8159 } 8160 return; 8161 } 8162 8163 assert(false); 8164 } 8165 8166 static int 8167 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 8168 { 8169 struct spdk_thread *thread; 8170 int rc = 0; 8171 8172 thread = spdk_get_thread(); 8173 if (!thread) { 8174 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 8175 return -ENOTSUP; 8176 } 8177 8178 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8179 spdk_get_thread()); 8180 8181 desc->bdev = bdev; 8182 desc->thread = thread; 8183 desc->write = write; 8184 8185 spdk_spin_lock(&bdev->internal.spinlock); 8186 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 8187 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8188 spdk_spin_unlock(&bdev->internal.spinlock); 8189 return -ENODEV; 8190 } 8191 8192 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8193 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8194 spdk_spin_unlock(&bdev->internal.spinlock); 8195 return -EPERM; 8196 } 8197 8198 rc = bdev_start_qos(bdev); 8199 if (rc != 0) { 8200 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 8201 spdk_spin_unlock(&bdev->internal.spinlock); 8202 return rc; 8203 } 8204 8205 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 8206 8207 spdk_spin_unlock(&bdev->internal.spinlock); 8208 8209 return 0; 8210 } 8211 8212 static int 8213 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 8214 struct spdk_bdev_desc **_desc) 8215 { 8216 struct spdk_bdev_desc *desc; 8217 unsigned int i; 8218 8219 desc = calloc(1, sizeof(*desc)); 8220 if (desc == NULL) { 8221 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 8222 return -ENOMEM; 8223 } 8224 8225 TAILQ_INIT(&desc->pending_media_events); 8226 TAILQ_INIT(&desc->free_media_events); 8227 8228 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 8229 desc->callback.event_fn = event_cb; 8230 desc->callback.ctx = event_ctx; 8231 spdk_spin_init(&desc->spinlock); 8232 8233 if (bdev->media_events) { 8234 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 8235 sizeof(*desc->media_events_buffer)); 8236 if (desc->media_events_buffer == NULL) { 8237 SPDK_ERRLOG("Failed to initialize media event pool\n"); 8238 
bdev_desc_free(desc); 8239 return -ENOMEM; 8240 } 8241 8242 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 8243 TAILQ_INSERT_TAIL(&desc->free_media_events, 8244 &desc->media_events_buffer[i], tailq); 8245 } 8246 } 8247 8248 if (bdev->fn_table->accel_sequence_supported != NULL) { 8249 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 8250 desc->accel_sequence_supported[i] = 8251 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 8252 (enum spdk_bdev_io_type)i); 8253 } 8254 } 8255 8256 *_desc = desc; 8257 8258 return 0; 8259 } 8260 8261 static int 8262 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8263 void *event_ctx, struct spdk_bdev_desc **_desc) 8264 { 8265 struct spdk_bdev_desc *desc; 8266 struct spdk_bdev *bdev; 8267 int rc; 8268 8269 bdev = bdev_get_by_name(bdev_name); 8270 8271 if (bdev == NULL) { 8272 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 8273 return -ENODEV; 8274 } 8275 8276 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 8277 if (rc != 0) { 8278 return rc; 8279 } 8280 8281 rc = bdev_open(bdev, write, desc); 8282 if (rc != 0) { 8283 bdev_desc_free(desc); 8284 desc = NULL; 8285 } 8286 8287 *_desc = desc; 8288 8289 return rc; 8290 } 8291 8292 int 8293 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8294 void *event_ctx, struct spdk_bdev_desc **_desc) 8295 { 8296 int rc; 8297 8298 if (event_cb == NULL) { 8299 SPDK_ERRLOG("Missing event callback function\n"); 8300 return -EINVAL; 8301 } 8302 8303 spdk_spin_lock(&g_bdev_mgr.spinlock); 8304 rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, _desc); 8305 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8306 8307 return rc; 8308 } 8309 8310 struct spdk_bdev_open_async_ctx { 8311 char *bdev_name; 8312 spdk_bdev_event_cb_t event_cb; 8313 void *event_ctx; 8314 bool write; 8315 int rc; 8316 spdk_bdev_open_async_cb_t cb_fn; 8317 void *cb_arg; 8318 struct spdk_bdev_desc *desc; 8319 struct spdk_bdev_open_async_opts opts; 8320 uint64_t start_ticks; 8321 struct spdk_thread *orig_thread; 8322 struct spdk_poller *poller; 8323 TAILQ_ENTRY(spdk_bdev_open_async_ctx) tailq; 8324 }; 8325 8326 static void 8327 bdev_open_async_done(void *arg) 8328 { 8329 struct spdk_bdev_open_async_ctx *ctx = arg; 8330 8331 ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg); 8332 8333 free(ctx->bdev_name); 8334 free(ctx); 8335 } 8336 8337 static void 8338 bdev_open_async_cancel(void *arg) 8339 { 8340 struct spdk_bdev_open_async_ctx *ctx = arg; 8341 8342 assert(ctx->rc == -ESHUTDOWN); 8343 8344 spdk_poller_unregister(&ctx->poller); 8345 8346 bdev_open_async_done(ctx); 8347 } 8348 8349 /* This is called when the bdev library finishes at shutdown. */ 8350 static void 8351 bdev_open_async_fini(void) 8352 { 8353 struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx; 8354 8355 spdk_spin_lock(&g_bdev_mgr.spinlock); 8356 TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) { 8357 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8358 /* 8359 * We have to move to ctx->orig_thread to unregister ctx->poller. 8360 * However, there is a chance that ctx->poller is executed before 8361 * the message is executed, which could result in bdev_open_async_done() 8362 * being called twice. To avoid such a race condition, set ctx->rc to 8363 * -ESHUTDOWN.
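 * _bdev_open_async() checks ctx->rc for -ESHUTDOWN under g_bdev_mgr.spinlock
 * and returns without completing the context, so only bdev_open_async_cancel()
 * finishes it.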
8364 */ 8365 ctx->rc = -ESHUTDOWN; 8366 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx); 8367 } 8368 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8369 } 8370 8371 static int bdev_open_async(void *arg); 8372 8373 static void 8374 _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx) 8375 { 8376 uint64_t timeout_ticks; 8377 8378 if (ctx->rc == -ESHUTDOWN) { 8379 /* This context is being canceled. Do nothing. */ 8380 return; 8381 } 8382 8383 ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx, 8384 &ctx->desc); 8385 if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) { 8386 goto exit; 8387 } 8388 8389 timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull; 8390 if (spdk_get_ticks() >= timeout_ticks) { 8391 SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name); 8392 ctx->rc = -ETIMEDOUT; 8393 goto exit; 8394 } 8395 8396 return; 8397 8398 exit: 8399 spdk_poller_unregister(&ctx->poller); 8400 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8401 8402 /* Completion callback is processed after stack unwinding. */ 8403 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx); 8404 } 8405 8406 static int 8407 bdev_open_async(void *arg) 8408 { 8409 struct spdk_bdev_open_async_ctx *ctx = arg; 8410 8411 spdk_spin_lock(&g_bdev_mgr.spinlock); 8412 8413 _bdev_open_async(ctx); 8414 8415 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8416 8417 return SPDK_POLLER_BUSY; 8418 } 8419 8420 static void 8421 bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts, 8422 struct spdk_bdev_open_async_opts *opts_src, 8423 size_t size) 8424 { 8425 assert(opts); 8426 assert(opts_src); 8427 8428 opts->size = size; 8429 8430 #define SET_FIELD(field) \ 8431 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8432 opts->field = opts_src->field; \ 8433 } \ 8434 8435 SET_FIELD(timeout_ms); 8436 8437 /* Do not remove this statement. Always update it when you add a new field, 8438 * and do not forget to add a SET_FIELD statement for your added field.
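 * For example, if a hypothetical 'uint32_t flags' member were appended to
 * struct spdk_bdev_open_async_opts, this function would gain a matching
 * SET_FIELD(flags); line, bdev_open_async_opts_get_default() below would gain
 * SET_FIELD(flags, 0);, and the sizeof value checked by the SPDK_STATIC_ASSERT
 * would be updated accordingly.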
*/ 8439 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size"); 8440 8441 #undef SET_FIELD 8442 } 8443 8444 static void 8445 bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size) 8446 { 8447 assert(opts); 8448 8449 opts->size = size; 8450 8451 #define SET_FIELD(field, value) \ 8452 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8453 opts->field = value; \ 8454 } \ 8455 8456 SET_FIELD(timeout_ms, 0); 8457 8458 #undef SET_FIELD 8459 } 8460 8461 int 8462 spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8463 void *event_ctx, struct spdk_bdev_open_async_opts *opts, 8464 spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg) 8465 { 8466 struct spdk_bdev_open_async_ctx *ctx; 8467 8468 if (event_cb == NULL) { 8469 SPDK_ERRLOG("Missing event callback function\n"); 8470 return -EINVAL; 8471 } 8472 8473 if (open_cb == NULL) { 8474 SPDK_ERRLOG("Missing open callback function\n"); 8475 return -EINVAL; 8476 } 8477 8478 if (opts != NULL && opts->size == 0) { 8479 SPDK_ERRLOG("size in the options structure should not be zero\n"); 8480 return -EINVAL; 8481 } 8482 8483 ctx = calloc(1, sizeof(*ctx)); 8484 if (ctx == NULL) { 8485 SPDK_ERRLOG("Failed to allocate open context\n"); 8486 return -ENOMEM; 8487 } 8488 8489 ctx->bdev_name = strdup(bdev_name); 8490 if (ctx->bdev_name == NULL) { 8491 SPDK_ERRLOG("Failed to duplicate bdev_name\n"); 8492 free(ctx); 8493 return -ENOMEM; 8494 } 8495 8496 ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000); 8497 if (ctx->poller == NULL) { 8498 SPDK_ERRLOG("Failed to register bdev_open_async poller\n"); 8499 free(ctx->bdev_name); 8500 free(ctx); 8501 return -ENOMEM; 8502 } 8503 8504 ctx->cb_fn = open_cb; 8505 ctx->cb_arg = open_cb_arg; 8506 ctx->write = write; 8507 ctx->event_cb = event_cb; 8508 ctx->event_ctx = event_ctx; 8509 ctx->orig_thread = spdk_get_thread(); 8510 ctx->start_ticks = spdk_get_ticks(); 8511 8512 bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts)); 8513 if (opts != NULL) { 8514 bdev_open_async_opts_copy(&ctx->opts, opts, opts->size); 8515 } 8516 8517 spdk_spin_lock(&g_bdev_mgr.spinlock); 8518 8519 TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8520 _bdev_open_async(ctx); 8521 8522 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8523 8524 return 0; 8525 } 8526 8527 static void 8528 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 8529 { 8530 int rc; 8531 8532 spdk_spin_lock(&bdev->internal.spinlock); 8533 spdk_spin_lock(&desc->spinlock); 8534 8535 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 8536 8537 desc->closed = true; 8538 8539 if (desc->claim != NULL) { 8540 bdev_desc_release_claims(desc); 8541 } 8542 8543 if (0 == desc->refs) { 8544 spdk_spin_unlock(&desc->spinlock); 8545 bdev_desc_free(desc); 8546 } else { 8547 spdk_spin_unlock(&desc->spinlock); 8548 } 8549 8550 /* If no more descriptors, kill QoS channel */ 8551 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8552 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 8553 bdev->name, spdk_get_thread()); 8554 8555 if (bdev_qos_destroy(bdev)) { 8556 /* There isn't anything we can do to recover here. Just let the 8557 * old QoS poller keep running. The QoS handling won't change 8558 * cores when the user allocates a new channel, but it won't break. */ 8559 SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); 8560 } 8561 } 8562 8563 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8564 rc = bdev_unregister_unsafe(bdev); 8565 spdk_spin_unlock(&bdev->internal.spinlock); 8566 8567 if (rc == 0) { 8568 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8569 } 8570 } else { 8571 spdk_spin_unlock(&bdev->internal.spinlock); 8572 } 8573 } 8574 8575 void 8576 spdk_bdev_close(struct spdk_bdev_desc *desc) 8577 { 8578 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8579 8580 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8581 spdk_get_thread()); 8582 8583 assert(desc->thread == spdk_get_thread()); 8584 8585 spdk_poller_unregister(&desc->io_timeout_poller); 8586 8587 spdk_spin_lock(&g_bdev_mgr.spinlock); 8588 8589 bdev_close(bdev, desc); 8590 8591 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8592 } 8593 8594 int32_t 8595 spdk_bdev_get_numa_id(struct spdk_bdev *bdev) 8596 { 8597 if (bdev->numa.id_valid) { 8598 return bdev->numa.id; 8599 } else { 8600 return SPDK_ENV_NUMA_ID_ANY; 8601 } 8602 } 8603 8604 static void 8605 bdev_register_finished(void *arg) 8606 { 8607 struct spdk_bdev_desc *desc = arg; 8608 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8609 8610 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 8611 8612 spdk_spin_lock(&g_bdev_mgr.spinlock); 8613 8614 bdev_close(bdev, desc); 8615 8616 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8617 } 8618 8619 int 8620 spdk_bdev_register(struct spdk_bdev *bdev) 8621 { 8622 struct spdk_bdev_desc *desc; 8623 struct spdk_thread *thread = spdk_get_thread(); 8624 int rc; 8625 8626 if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) { 8627 SPDK_ERRLOG("Cannot register bdev %s on thread %p (%s)\n", bdev->name, thread, 8628 thread ? 
spdk_thread_get_name(thread) : "null"); 8629 return -EINVAL; 8630 } 8631 8632 rc = bdev_register(bdev); 8633 if (rc != 0) { 8634 return rc; 8635 } 8636 8637 /* A descriptor is opened to prevent bdev deletion during examination */ 8638 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8639 if (rc != 0) { 8640 spdk_bdev_unregister(bdev, NULL, NULL); 8641 return rc; 8642 } 8643 8644 rc = bdev_open(bdev, false, desc); 8645 if (rc != 0) { 8646 bdev_desc_free(desc); 8647 spdk_bdev_unregister(bdev, NULL, NULL); 8648 return rc; 8649 } 8650 8651 /* Examine configuration before initializing I/O */ 8652 bdev_examine(bdev); 8653 8654 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 8655 if (rc != 0) { 8656 bdev_close(bdev, desc); 8657 spdk_bdev_unregister(bdev, NULL, NULL); 8658 } 8659 8660 return rc; 8661 } 8662 8663 int 8664 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 8665 struct spdk_bdev_module *module) 8666 { 8667 spdk_spin_lock(&bdev->internal.spinlock); 8668 8669 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8670 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8671 spdk_spin_unlock(&bdev->internal.spinlock); 8672 return -EPERM; 8673 } 8674 8675 if (desc && !desc->write) { 8676 desc->write = true; 8677 } 8678 8679 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 8680 bdev->internal.claim.v1.module = module; 8681 8682 spdk_spin_unlock(&bdev->internal.spinlock); 8683 return 0; 8684 } 8685 8686 void 8687 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 8688 { 8689 spdk_spin_lock(&bdev->internal.spinlock); 8690 8691 assert(bdev->internal.claim.v1.module != NULL); 8692 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 8693 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8694 bdev->internal.claim.v1.module = NULL; 8695 8696 spdk_spin_unlock(&bdev->internal.spinlock); 8697 } 8698 8699 /* 8700 * Start claims v2 8701 */ 8702 8703 const char * 8704 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 8705 { 8706 switch (type) { 8707 case SPDK_BDEV_CLAIM_NONE: 8708 return "not_claimed"; 8709 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8710 return "exclusive_write"; 8711 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8712 return "read_many_write_one"; 8713 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8714 return "read_many_write_none"; 8715 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8716 return "read_many_write_many"; 8717 default: 8718 break; 8719 } 8720 return "invalid_claim"; 8721 } 8722 8723 static bool 8724 claim_type_is_v2(enum spdk_bdev_claim_type type) 8725 { 8726 switch (type) { 8727 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8728 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8729 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8730 return true; 8731 default: 8732 break; 8733 } 8734 return false; 8735 } 8736 8737 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
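 * For example, per the switch below, taking a SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE
 * or SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED claim on a read-only descriptor makes
 * the descriptor writable, while SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE leaves it
 * read-only.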
*/ 8738 static bool 8739 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 8740 { 8741 switch (type) { 8742 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8743 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8744 return true; 8745 default: 8746 break; 8747 } 8748 return false; 8749 } 8750 8751 void 8752 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 8753 { 8754 if (opts == NULL) { 8755 SPDK_ERRLOG("opts should not be NULL\n"); 8756 assert(opts != NULL); 8757 return; 8758 } 8759 if (size == 0) { 8760 SPDK_ERRLOG("size should not be zero\n"); 8761 assert(size != 0); 8762 return; 8763 } 8764 8765 memset(opts, 0, size); 8766 opts->opts_size = size; 8767 8768 #define FIELD_OK(field) \ 8769 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 8770 8771 #define SET_FIELD(field, value) \ 8772 if (FIELD_OK(field)) { \ 8773 opts->field = value; \ 8774 } \ 8775 8776 SET_FIELD(shared_claim_key, 0); 8777 8778 #undef FIELD_OK 8779 #undef SET_FIELD 8780 } 8781 8782 static int 8783 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 8784 { 8785 if (src->opts_size == 0) { 8786 SPDK_ERRLOG("size should not be zero\n"); 8787 return -1; 8788 } 8789 8790 memset(dst, 0, sizeof(*dst)); 8791 dst->opts_size = src->opts_size; 8792 8793 #define FIELD_OK(field) \ 8794 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 8795 8796 #define SET_FIELD(field) \ 8797 if (FIELD_OK(field)) { \ 8798 dst->field = src->field; \ 8799 } \ 8800 8801 if (FIELD_OK(name)) { 8802 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 8803 } 8804 8805 SET_FIELD(shared_claim_key); 8806 8807 /* You should not remove this statement, but need to update the assert statement 8808 * if you add a new field, and also add a corresponding SET_FIELD statement */ 8809 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 8810 8811 #undef FIELD_OK 8812 #undef SET_FIELD 8813 return 0; 8814 } 8815 8816 /* Returns 0 if a read-write-once claim can be taken. */ 8817 static int 8818 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8819 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8820 { 8821 struct spdk_bdev *bdev = desc->bdev; 8822 struct spdk_bdev_desc *open_desc; 8823 8824 assert(spdk_spin_held(&bdev->internal.spinlock)); 8825 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 8826 8827 if (opts->shared_claim_key != 0) { 8828 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 8829 bdev->name); 8830 return -EINVAL; 8831 } 8832 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8833 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8834 return -EPERM; 8835 } 8836 if (desc->claim != NULL) { 8837 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 8838 bdev->name, desc->claim->module->name); 8839 return -EPERM; 8840 } 8841 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8842 if (desc != open_desc && open_desc->write) { 8843 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 8844 "another descriptor is open for writing\n", 8845 bdev->name); 8846 return -EPERM; 8847 } 8848 } 8849 8850 return 0; 8851 } 8852 8853 /* Returns 0 if a read-only-many claim can be taken. 
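 * The claim is refused if this descriptor was opened for writing, if a
 * shared_claim_key was supplied, or, when the bdev is not yet claimed, if any
 * other descriptor is currently open for writing.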
*/ 8854 static int 8855 claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8856 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8857 { 8858 struct spdk_bdev *bdev = desc->bdev; 8859 struct spdk_bdev_desc *open_desc; 8860 8861 assert(spdk_spin_held(&bdev->internal.spinlock)); 8862 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE); 8863 assert(desc->claim == NULL); 8864 8865 if (desc->write) { 8866 SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n", 8867 bdev->name); 8868 return -EINVAL; 8869 } 8870 if (opts->shared_claim_key != 0) { 8871 SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name); 8872 return -EINVAL; 8873 } 8874 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8875 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8876 if (open_desc->write) { 8877 SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while " 8878 "another descriptor is open for writing\n", 8879 bdev->name); 8880 return -EPERM; 8881 } 8882 } 8883 } 8884 8885 return 0; 8886 } 8887 8888 /* Returns 0 if a read-write-many claim can be taken. */ 8889 static int 8890 claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8891 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8892 { 8893 struct spdk_bdev *bdev = desc->bdev; 8894 struct spdk_bdev_desc *open_desc; 8895 8896 assert(spdk_spin_held(&bdev->internal.spinlock)); 8897 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED); 8898 assert(desc->claim == NULL); 8899 8900 if (opts->shared_claim_key == 0) { 8901 SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n", 8902 bdev->name); 8903 return -EINVAL; 8904 } 8905 switch (bdev->internal.claim_type) { 8906 case SPDK_BDEV_CLAIM_NONE: 8907 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8908 if (open_desc == desc) { 8909 continue; 8910 } 8911 if (open_desc->write) { 8912 SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while " 8913 "another descriptor is open for writing without a " 8914 "claim\n", bdev->name); 8915 return -EPERM; 8916 } 8917 } 8918 break; 8919 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8920 if (opts->shared_claim_key != bdev->internal.claim.v2.key) { 8921 LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev); 8922 return -EPERM; 8923 } 8924 break; 8925 default: 8926 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8927 return -EBUSY; 8928 } 8929 8930 return 0; 8931 } 8932 8933 /* Updates desc and its bdev with a v2 claim.
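 * A minimal caller-side sketch of taking such a claim through the public API
 * (hypothetical module variable my_module and an already-open descriptor desc;
 * error handling omitted):
 *   struct spdk_bdev_claim_opts opts;
 *   spdk_bdev_claim_opts_init(&opts, sizeof(opts));
 *   int rc = spdk_bdev_module_claim_bdev_desc(desc, SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE,
 *                                             &opts, &my_module);
 * The claim is released again when the descriptor is closed, via
 * bdev_desc_release_claims().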
*/ 8934 static int 8935 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8936 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8937 { 8938 struct spdk_bdev *bdev = desc->bdev; 8939 struct spdk_bdev_module_claim *claim; 8940 8941 assert(spdk_spin_held(&bdev->internal.spinlock)); 8942 assert(claim_type_is_v2(type)); 8943 assert(desc->claim == NULL); 8944 8945 claim = calloc(1, sizeof(*desc->claim)); 8946 if (claim == NULL) { 8947 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 8948 return -ENOMEM; 8949 } 8950 claim->module = module; 8951 claim->desc = desc; 8952 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 8953 memcpy(claim->name, opts->name, sizeof(claim->name)); 8954 desc->claim = claim; 8955 8956 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8957 bdev->internal.claim_type = type; 8958 TAILQ_INIT(&bdev->internal.claim.v2.claims); 8959 bdev->internal.claim.v2.key = opts->shared_claim_key; 8960 } 8961 assert(type == bdev->internal.claim_type); 8962 8963 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 8964 8965 if (!desc->write && claim_type_promotes_to_write(type)) { 8966 desc->write = true; 8967 } 8968 8969 return 0; 8970 } 8971 8972 int 8973 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8974 struct spdk_bdev_claim_opts *_opts, 8975 struct spdk_bdev_module *module) 8976 { 8977 struct spdk_bdev *bdev; 8978 struct spdk_bdev_claim_opts opts; 8979 int rc = 0; 8980 8981 if (desc == NULL) { 8982 SPDK_ERRLOG("descriptor must not be NULL\n"); 8983 return -EINVAL; 8984 } 8985 8986 bdev = desc->bdev; 8987 8988 if (_opts == NULL) { 8989 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 8990 } else if (claim_opts_copy(_opts, &opts) != 0) { 8991 return -EINVAL; 8992 } 8993 8994 spdk_spin_lock(&bdev->internal.spinlock); 8995 8996 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 8997 bdev->internal.claim_type != type) { 8998 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8999 spdk_spin_unlock(&bdev->internal.spinlock); 9000 return -EPERM; 9001 } 9002 9003 if (claim_type_is_v2(type) && desc->claim != NULL) { 9004 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 9005 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 9006 spdk_spin_unlock(&bdev->internal.spinlock); 9007 return -EPERM; 9008 } 9009 9010 switch (type) { 9011 case SPDK_BDEV_CLAIM_EXCL_WRITE: 9012 spdk_spin_unlock(&bdev->internal.spinlock); 9013 return spdk_bdev_module_claim_bdev(bdev, desc, module); 9014 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 9015 rc = claim_verify_rwo(desc, type, &opts, module); 9016 break; 9017 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 9018 rc = claim_verify_rom(desc, type, &opts, module); 9019 break; 9020 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 9021 rc = claim_verify_rwm(desc, type, &opts, module); 9022 break; 9023 default: 9024 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 9025 rc = -ENOTSUP; 9026 } 9027 9028 if (rc == 0) { 9029 rc = claim_bdev(desc, type, &opts, module); 9030 } 9031 9032 spdk_spin_unlock(&bdev->internal.spinlock); 9033 return rc; 9034 } 9035 9036 static void 9037 claim_reset(struct spdk_bdev *bdev) 9038 { 9039 assert(spdk_spin_held(&bdev->internal.spinlock)); 9040 assert(claim_type_is_v2(bdev->internal.claim_type)); 9041 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 9042 9043 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 9044 
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 9045 } 9046 9047 static void 9048 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 9049 { 9050 struct spdk_bdev *bdev = desc->bdev; 9051 9052 assert(spdk_spin_held(&bdev->internal.spinlock)); 9053 assert(claim_type_is_v2(bdev->internal.claim_type)); 9054 9055 if (bdev->internal.examine_in_progress == 0) { 9056 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 9057 free(desc->claim); 9058 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 9059 claim_reset(bdev); 9060 } 9061 } else { 9062 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 9063 desc->claim->module = NULL; 9064 desc->claim->desc = NULL; 9065 } 9066 desc->claim = NULL; 9067 } 9068 9069 /* 9070 * End claims v2 9071 */ 9072 9073 struct spdk_bdev * 9074 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 9075 { 9076 assert(desc != NULL); 9077 return desc->bdev; 9078 } 9079 9080 int 9081 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 9082 { 9083 struct spdk_bdev *bdev, *tmp; 9084 struct spdk_bdev_desc *desc; 9085 int rc = 0; 9086 9087 assert(fn != NULL); 9088 9089 spdk_spin_lock(&g_bdev_mgr.spinlock); 9090 bdev = spdk_bdev_first(); 9091 while (bdev != NULL) { 9092 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 9093 if (rc != 0) { 9094 break; 9095 } 9096 rc = bdev_open(bdev, false, desc); 9097 if (rc != 0) { 9098 bdev_desc_free(desc); 9099 if (rc == -ENODEV) { 9100 /* Ignore the error and move to the next bdev. */ 9101 rc = 0; 9102 bdev = spdk_bdev_next(bdev); 9103 continue; 9104 } 9105 break; 9106 } 9107 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9108 9109 rc = fn(ctx, bdev); 9110 9111 spdk_spin_lock(&g_bdev_mgr.spinlock); 9112 tmp = spdk_bdev_next(bdev); 9113 bdev_close(bdev, desc); 9114 if (rc != 0) { 9115 break; 9116 } 9117 bdev = tmp; 9118 } 9119 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9120 9121 return rc; 9122 } 9123 9124 int 9125 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 9126 { 9127 struct spdk_bdev *bdev, *tmp; 9128 struct spdk_bdev_desc *desc; 9129 int rc = 0; 9130 9131 assert(fn != NULL); 9132 9133 spdk_spin_lock(&g_bdev_mgr.spinlock); 9134 bdev = spdk_bdev_first_leaf(); 9135 while (bdev != NULL) { 9136 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 9137 if (rc != 0) { 9138 break; 9139 } 9140 rc = bdev_open(bdev, false, desc); 9141 if (rc != 0) { 9142 bdev_desc_free(desc); 9143 if (rc == -ENODEV) { 9144 /* Ignore the error and move to the next bdev. 
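 * (bdev_open() returns -ENODEV when the bdev entered the UNREGISTERING or
 * REMOVING state between the lookup and the open, which is not a caller error.)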
*/ 9145 rc = 0; 9146 bdev = spdk_bdev_next_leaf(bdev); 9147 continue; 9148 } 9149 break; 9150 } 9151 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9152 9153 rc = fn(ctx, bdev); 9154 9155 spdk_spin_lock(&g_bdev_mgr.spinlock); 9156 tmp = spdk_bdev_next_leaf(bdev); 9157 bdev_close(bdev, desc); 9158 if (rc != 0) { 9159 break; 9160 } 9161 bdev = tmp; 9162 } 9163 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9164 9165 return rc; 9166 } 9167 9168 void 9169 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 9170 { 9171 struct iovec *iovs; 9172 int iovcnt; 9173 9174 if (bdev_io == NULL) { 9175 return; 9176 } 9177 9178 switch (bdev_io->type) { 9179 case SPDK_BDEV_IO_TYPE_READ: 9180 case SPDK_BDEV_IO_TYPE_WRITE: 9181 case SPDK_BDEV_IO_TYPE_ZCOPY: 9182 iovs = bdev_io->u.bdev.iovs; 9183 iovcnt = bdev_io->u.bdev.iovcnt; 9184 break; 9185 default: 9186 iovs = NULL; 9187 iovcnt = 0; 9188 break; 9189 } 9190 9191 if (iovp) { 9192 *iovp = iovs; 9193 } 9194 if (iovcntp) { 9195 *iovcntp = iovcnt; 9196 } 9197 } 9198 9199 void * 9200 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 9201 { 9202 if (bdev_io == NULL) { 9203 return NULL; 9204 } 9205 9206 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 9207 return NULL; 9208 } 9209 9210 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 9211 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 9212 return bdev_io->u.bdev.md_buf; 9213 } 9214 9215 return NULL; 9216 } 9217 9218 void * 9219 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 9220 { 9221 if (bdev_io == NULL) { 9222 assert(false); 9223 return NULL; 9224 } 9225 9226 return bdev_io->internal.caller_ctx; 9227 } 9228 9229 void 9230 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 9231 { 9232 9233 if (spdk_bdev_module_list_find(bdev_module->name)) { 9234 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 9235 assert(false); 9236 } 9237 9238 spdk_spin_init(&bdev_module->internal.spinlock); 9239 TAILQ_INIT(&bdev_module->internal.quiesced_ranges); 9240 9241 /* 9242 * Modules with examine callbacks must be initialized first, so they are 9243 * ready to handle examine callbacks from later modules that will 9244 * register physical bdevs. 
9245 */ 9246 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 9247 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9248 } else { 9249 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9250 } 9251 } 9252 9253 struct spdk_bdev_module * 9254 spdk_bdev_module_list_find(const char *name) 9255 { 9256 struct spdk_bdev_module *bdev_module; 9257 9258 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 9259 if (strcmp(name, bdev_module->name) == 0) { 9260 break; 9261 } 9262 } 9263 9264 return bdev_module; 9265 } 9266 9267 static int 9268 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 9269 { 9270 uint64_t num_blocks; 9271 void *md_buf = NULL; 9272 9273 num_blocks = bdev_io->u.bdev.num_blocks; 9274 9275 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 9276 md_buf = (char *)g_bdev_mgr.zero_buffer + 9277 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 9278 } 9279 9280 return bdev_write_blocks_with_md(bdev_io->internal.desc, 9281 spdk_io_channel_from_ctx(bdev_io->internal.ch), 9282 g_bdev_mgr.zero_buffer, md_buf, 9283 bdev_io->u.bdev.offset_blocks, num_blocks, 9284 bdev_write_zero_buffer_done, bdev_io); 9285 } 9286 9287 static void 9288 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9289 { 9290 struct spdk_bdev_io *parent_io = cb_arg; 9291 9292 spdk_bdev_free_io(bdev_io); 9293 9294 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 9295 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 9296 } 9297 9298 static void 9299 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 9300 { 9301 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9302 ctx->bdev->internal.qos_mod_in_progress = false; 9303 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9304 9305 if (ctx->cb_fn) { 9306 ctx->cb_fn(ctx->cb_arg, status); 9307 } 9308 free(ctx); 9309 } 9310 9311 static void 9312 bdev_disable_qos_done(void *cb_arg) 9313 { 9314 struct set_qos_limit_ctx *ctx = cb_arg; 9315 struct spdk_bdev *bdev = ctx->bdev; 9316 struct spdk_bdev_qos *qos; 9317 9318 spdk_spin_lock(&bdev->internal.spinlock); 9319 qos = bdev->internal.qos; 9320 bdev->internal.qos = NULL; 9321 spdk_spin_unlock(&bdev->internal.spinlock); 9322 9323 if (qos->thread != NULL) { 9324 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 9325 spdk_poller_unregister(&qos->poller); 9326 } 9327 9328 free(qos); 9329 9330 bdev_set_qos_limit_done(ctx, 0); 9331 } 9332 9333 static void 9334 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 9335 { 9336 struct set_qos_limit_ctx *ctx = _ctx; 9337 struct spdk_thread *thread; 9338 9339 spdk_spin_lock(&bdev->internal.spinlock); 9340 thread = bdev->internal.qos->thread; 9341 spdk_spin_unlock(&bdev->internal.spinlock); 9342 9343 if (thread != NULL) { 9344 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 9345 } else { 9346 bdev_disable_qos_done(ctx); 9347 } 9348 } 9349 9350 static void 9351 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9352 struct spdk_io_channel *ch, void *_ctx) 9353 { 9354 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9355 struct spdk_bdev_io *bdev_io; 9356 9357 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 9358 9359 while (!TAILQ_EMPTY(&bdev_ch->qos_queued_io)) { 9360 /* Re-submit the queued I/O. 
*/ 9361 bdev_io = TAILQ_FIRST(&bdev_ch->qos_queued_io); 9362 TAILQ_REMOVE(&bdev_ch->qos_queued_io, bdev_io, internal.link); 9363 _bdev_io_submit(bdev_io); 9364 } 9365 9366 spdk_bdev_for_each_channel_continue(i, 0); 9367 } 9368 9369 static void 9370 bdev_update_qos_rate_limit_msg(void *cb_arg) 9371 { 9372 struct set_qos_limit_ctx *ctx = cb_arg; 9373 struct spdk_bdev *bdev = ctx->bdev; 9374 9375 spdk_spin_lock(&bdev->internal.spinlock); 9376 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 9377 spdk_spin_unlock(&bdev->internal.spinlock); 9378 9379 bdev_set_qos_limit_done(ctx, 0); 9380 } 9381 9382 static void 9383 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9384 struct spdk_io_channel *ch, void *_ctx) 9385 { 9386 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9387 9388 spdk_spin_lock(&bdev->internal.spinlock); 9389 bdev_enable_qos(bdev, bdev_ch); 9390 spdk_spin_unlock(&bdev->internal.spinlock); 9391 spdk_bdev_for_each_channel_continue(i, 0); 9392 } 9393 9394 static void 9395 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 9396 { 9397 struct set_qos_limit_ctx *ctx = _ctx; 9398 9399 bdev_set_qos_limit_done(ctx, status); 9400 } 9401 9402 static void 9403 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 9404 { 9405 int i; 9406 9407 assert(bdev->internal.qos != NULL); 9408 9409 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9410 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9411 bdev->internal.qos->rate_limits[i].limit = limits[i]; 9412 9413 if (limits[i] == 0) { 9414 bdev->internal.qos->rate_limits[i].limit = 9415 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 9416 } 9417 } 9418 } 9419 } 9420 9421 void 9422 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 9423 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 9424 { 9425 struct set_qos_limit_ctx *ctx; 9426 uint32_t limit_set_complement; 9427 uint64_t min_limit_per_sec; 9428 int i; 9429 bool disable_rate_limit = true; 9430 9431 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9432 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9433 continue; 9434 } 9435 9436 if (limits[i] > 0) { 9437 disable_rate_limit = false; 9438 } 9439 9440 if (bdev_qos_is_iops_rate_limit(i) == true) { 9441 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 9442 } else { 9443 if (limits[i] > SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC) { 9444 SPDK_WARNLOG("Requested rate limit %" PRIu64 " will result in uint64_t overflow, " 9445 "reset to %" PRIu64 "\n", limits[i], SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC); 9446 limits[i] = SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC; 9447 } 9448 /* Change from megabyte to byte rate limit */ 9449 limits[i] = limits[i] * 1024 * 1024; 9450 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 9451 } 9452 9453 limit_set_complement = limits[i] % min_limit_per_sec; 9454 if (limit_set_complement) { 9455 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 9456 limits[i], min_limit_per_sec); 9457 limits[i] += min_limit_per_sec - limit_set_complement; 9458 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 9459 } 9460 } 9461 9462 ctx = calloc(1, sizeof(*ctx)); 9463 if (ctx == NULL) { 9464 cb_fn(cb_arg, -ENOMEM); 9465 return; 9466 } 9467 9468 ctx->cb_fn = cb_fn; 9469 ctx->cb_arg = cb_arg; 9470 ctx->bdev = bdev; 9471 9472 spdk_spin_lock(&bdev->internal.spinlock); 9473 if (bdev->internal.qos_mod_in_progress) { 9474 spdk_spin_unlock(&bdev->internal.spinlock); 9475 free(ctx); 9476 cb_fn(cb_arg, 
-EAGAIN); 9477 return; 9478 } 9479 bdev->internal.qos_mod_in_progress = true; 9480 9481 if (disable_rate_limit == true && bdev->internal.qos) { 9482 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9483 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 9484 (bdev->internal.qos->rate_limits[i].limit > 0 && 9485 bdev->internal.qos->rate_limits[i].limit != 9486 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 9487 disable_rate_limit = false; 9488 break; 9489 } 9490 } 9491 } 9492 9493 if (disable_rate_limit == false) { 9494 if (bdev->internal.qos == NULL) { 9495 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 9496 if (!bdev->internal.qos) { 9497 spdk_spin_unlock(&bdev->internal.spinlock); 9498 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 9499 bdev_set_qos_limit_done(ctx, -ENOMEM); 9500 return; 9501 } 9502 } 9503 9504 if (bdev->internal.qos->thread == NULL) { 9505 /* Enabling */ 9506 bdev_set_qos_rate_limits(bdev, limits); 9507 9508 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 9509 bdev_enable_qos_done); 9510 } else { 9511 /* Updating */ 9512 bdev_set_qos_rate_limits(bdev, limits); 9513 9514 spdk_thread_send_msg(bdev->internal.qos->thread, 9515 bdev_update_qos_rate_limit_msg, ctx); 9516 } 9517 } else { 9518 if (bdev->internal.qos != NULL) { 9519 bdev_set_qos_rate_limits(bdev, limits); 9520 9521 /* Disabling */ 9522 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 9523 bdev_disable_qos_msg_done); 9524 } else { 9525 spdk_spin_unlock(&bdev->internal.spinlock); 9526 bdev_set_qos_limit_done(ctx, 0); 9527 return; 9528 } 9529 } 9530 9531 spdk_spin_unlock(&bdev->internal.spinlock); 9532 } 9533 9534 struct spdk_bdev_histogram_ctx { 9535 spdk_bdev_histogram_status_cb cb_fn; 9536 void *cb_arg; 9537 struct spdk_bdev *bdev; 9538 int status; 9539 }; 9540 9541 static void 9542 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9543 { 9544 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9545 9546 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9547 ctx->bdev->internal.histogram_in_progress = false; 9548 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9549 ctx->cb_fn(ctx->cb_arg, ctx->status); 9550 free(ctx); 9551 } 9552 9553 static void 9554 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9555 struct spdk_io_channel *_ch, void *_ctx) 9556 { 9557 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9558 9559 if (ch->histogram != NULL) { 9560 spdk_histogram_data_free(ch->histogram); 9561 ch->histogram = NULL; 9562 } 9563 spdk_bdev_for_each_channel_continue(i, 0); 9564 } 9565 9566 static void 9567 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9568 { 9569 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9570 9571 if (status != 0) { 9572 ctx->status = status; 9573 ctx->bdev->internal.histogram_enabled = false; 9574 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 9575 bdev_histogram_disable_channel_cb); 9576 } else { 9577 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9578 ctx->bdev->internal.histogram_in_progress = false; 9579 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9580 ctx->cb_fn(ctx->cb_arg, ctx->status); 9581 free(ctx); 9582 } 9583 } 9584 9585 static void 9586 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9587 struct spdk_io_channel *_ch, void *_ctx) 9588 { 9589 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9590 int status = 0; 9591 9592 if (ch->histogram == NULL) { 9593 
ch->histogram = spdk_histogram_data_alloc(); 9594 if (ch->histogram == NULL) { 9595 status = -ENOMEM; 9596 } 9597 } 9598 9599 spdk_bdev_for_each_channel_continue(i, status); 9600 } 9601 9602 void 9603 spdk_bdev_histogram_enable_ext(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9604 void *cb_arg, bool enable, struct spdk_bdev_enable_histogram_opts *opts) 9605 { 9606 struct spdk_bdev_histogram_ctx *ctx; 9607 9608 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 9609 if (ctx == NULL) { 9610 cb_fn(cb_arg, -ENOMEM); 9611 return; 9612 } 9613 9614 ctx->bdev = bdev; 9615 ctx->status = 0; 9616 ctx->cb_fn = cb_fn; 9617 ctx->cb_arg = cb_arg; 9618 9619 spdk_spin_lock(&bdev->internal.spinlock); 9620 if (bdev->internal.histogram_in_progress) { 9621 spdk_spin_unlock(&bdev->internal.spinlock); 9622 free(ctx); 9623 cb_fn(cb_arg, -EAGAIN); 9624 return; 9625 } 9626 9627 bdev->internal.histogram_in_progress = true; 9628 spdk_spin_unlock(&bdev->internal.spinlock); 9629 9630 bdev->internal.histogram_enabled = enable; 9631 bdev->internal.histogram_io_type = opts->io_type; 9632 9633 if (enable) { 9634 /* Allocate histogram for each channel */ 9635 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 9636 bdev_histogram_enable_channel_cb); 9637 } else { 9638 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 9639 bdev_histogram_disable_channel_cb); 9640 } 9641 } 9642 9643 void 9644 spdk_bdev_enable_histogram_opts_init(struct spdk_bdev_enable_histogram_opts *opts, size_t size) 9645 { 9646 if (opts == NULL) { 9647 SPDK_ERRLOG("opts should not be NULL\n"); 9648 assert(opts != NULL); 9649 return; 9650 } 9651 if (size == 0) { 9652 SPDK_ERRLOG("size should not be zero\n"); 9653 assert(size != 0); 9654 return; 9655 } 9656 9657 memset(opts, 0, size); 9658 opts->size = size; 9659 9660 #define FIELD_OK(field) \ 9661 offsetof(struct spdk_bdev_enable_histogram_opts, field) + sizeof(opts->field) <= size 9662 9663 #define SET_FIELD(field, value) \ 9664 if (FIELD_OK(field)) { \ 9665 opts->field = value; \ 9666 } \ 9667 9668 SET_FIELD(io_type, 0); 9669 9670 /* You should not remove this statement, but need to update the assert statement 9671 * if you add a new field, and also add a corresponding SET_FIELD statement */ 9672 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_enable_histogram_opts) == 9, "Incorrect size"); 9673 9674 #undef FIELD_OK 9675 #undef SET_FIELD 9676 } 9677 9678 void 9679 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9680 void *cb_arg, bool enable) 9681 { 9682 struct spdk_bdev_enable_histogram_opts opts; 9683 9684 spdk_bdev_enable_histogram_opts_init(&opts, sizeof(opts)); 9685 spdk_bdev_histogram_enable_ext(bdev, cb_fn, cb_arg, enable, &opts); 9686 } 9687 9688 struct spdk_bdev_histogram_data_ctx { 9689 spdk_bdev_histogram_data_cb cb_fn; 9690 void *cb_arg; 9691 struct spdk_bdev *bdev; 9692 /** merged histogram data from all channels */ 9693 struct spdk_histogram_data *histogram; 9694 }; 9695 9696 static void 9697 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9698 { 9699 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9700 9701 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 9702 free(ctx); 9703 } 9704 9705 static void 9706 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9707 struct spdk_io_channel *_ch, void *_ctx) 9708 { 9709 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9710 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9711 int 
status = 0; 9712 9713 if (ch->histogram == NULL) { 9714 status = -EFAULT; 9715 } else { 9716 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 9717 } 9718 9719 spdk_bdev_for_each_channel_continue(i, status); 9720 } 9721 9722 void 9723 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 9724 spdk_bdev_histogram_data_cb cb_fn, 9725 void *cb_arg) 9726 { 9727 struct spdk_bdev_histogram_data_ctx *ctx; 9728 9729 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 9730 if (ctx == NULL) { 9731 cb_fn(cb_arg, -ENOMEM, NULL); 9732 return; 9733 } 9734 9735 ctx->bdev = bdev; 9736 ctx->cb_fn = cb_fn; 9737 ctx->cb_arg = cb_arg; 9738 9739 ctx->histogram = histogram; 9740 9741 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 9742 bdev_histogram_get_channel_cb); 9743 } 9744 9745 void 9746 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 9747 void *cb_arg) 9748 { 9749 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9750 int status = 0; 9751 9752 assert(cb_fn != NULL); 9753 9754 if (bdev_ch->histogram == NULL) { 9755 status = -EFAULT; 9756 } 9757 cb_fn(cb_arg, status, bdev_ch->histogram); 9758 } 9759 9760 size_t 9761 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 9762 size_t max_events) 9763 { 9764 struct media_event_entry *entry; 9765 size_t num_events = 0; 9766 9767 for (; num_events < max_events; ++num_events) { 9768 entry = TAILQ_FIRST(&desc->pending_media_events); 9769 if (entry == NULL) { 9770 break; 9771 } 9772 9773 events[num_events] = entry->event; 9774 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 9775 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 9776 } 9777 9778 return num_events; 9779 } 9780 9781 int 9782 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 9783 size_t num_events) 9784 { 9785 struct spdk_bdev_desc *desc; 9786 struct media_event_entry *entry; 9787 size_t event_id; 9788 int rc = 0; 9789 9790 assert(bdev->media_events); 9791 9792 spdk_spin_lock(&bdev->internal.spinlock); 9793 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9794 if (desc->write) { 9795 break; 9796 } 9797 } 9798 9799 if (desc == NULL || desc->media_events_buffer == NULL) { 9800 rc = -ENODEV; 9801 goto out; 9802 } 9803 9804 for (event_id = 0; event_id < num_events; ++event_id) { 9805 entry = TAILQ_FIRST(&desc->free_media_events); 9806 if (entry == NULL) { 9807 break; 9808 } 9809 9810 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 9811 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 9812 entry->event = events[event_id]; 9813 } 9814 9815 rc = event_id; 9816 out: 9817 spdk_spin_unlock(&bdev->internal.spinlock); 9818 return rc; 9819 } 9820 9821 static void 9822 _media_management_notify(void *arg) 9823 { 9824 struct spdk_bdev_desc *desc = arg; 9825 9826 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 9827 } 9828 9829 void 9830 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 9831 { 9832 struct spdk_bdev_desc *desc; 9833 9834 spdk_spin_lock(&bdev->internal.spinlock); 9835 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9836 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 9837 event_notify(desc, _media_management_notify); 9838 } 9839 } 9840 spdk_spin_unlock(&bdev->internal.spinlock); 9841 } 9842 9843 struct locked_lba_range_ctx { 9844 struct lba_range range; 9845 struct lba_range *current_range; 9846 struct lba_range *owner_range; 9847 
struct spdk_poller *poller; 9848 lock_range_cb cb_fn; 9849 void *cb_arg; 9850 }; 9851 9852 static void 9853 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9854 { 9855 struct locked_lba_range_ctx *ctx = _ctx; 9856 9857 ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM); 9858 free(ctx); 9859 } 9860 9861 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 9862 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 9863 9864 static void 9865 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9866 { 9867 struct locked_lba_range_ctx *ctx = _ctx; 9868 9869 if (status == -ENOMEM) { 9870 /* One of the channels could not allocate a range object. 9871 * So we have to go back and clean up any ranges that were 9872 * allocated successfully before we return error status to 9873 * the caller. We can reuse the unlock function to do that 9874 * clean up. 9875 */ 9876 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9877 bdev_lock_error_cleanup_cb); 9878 return; 9879 } 9880 9881 /* All channels have locked this range and no I/O overlapping the range 9882 * is outstanding! Set the owner_ch for the range object for the 9883 * locking channel, so that this channel will know that it is allowed 9884 * to write to this range. 9885 */ 9886 if (ctx->owner_range != NULL) { 9887 ctx->owner_range->owner_ch = ctx->range.owner_ch; 9888 } 9889 9890 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 9891 9892 /* Don't free the ctx here. Its range is in the bdev's global list of 9893 * locked ranges still, and will be removed and freed when this range 9894 * is later unlocked. 9895 */ 9896 } 9897 9898 static int 9899 bdev_lock_lba_range_check_io(void *_i) 9900 { 9901 struct spdk_bdev_channel_iter *i = _i; 9902 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 9903 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9904 struct locked_lba_range_ctx *ctx = i->ctx; 9905 struct lba_range *range = ctx->current_range; 9906 struct spdk_bdev_io *bdev_io; 9907 9908 spdk_poller_unregister(&ctx->poller); 9909 9910 /* The range is now in the locked_ranges, so no new IO can be submitted to this 9911 * range. But we need to wait until any outstanding IO overlapping with this range 9912 * is completed. 9913 */ 9914 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 9915 if (bdev_io_range_is_locked(bdev_io, range)) { 9916 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 9917 return SPDK_POLLER_BUSY; 9918 } 9919 } 9920 9921 spdk_bdev_for_each_channel_continue(i, 0); 9922 return SPDK_POLLER_BUSY; 9923 } 9924 9925 static void 9926 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9927 struct spdk_io_channel *_ch, void *_ctx) 9928 { 9929 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9930 struct locked_lba_range_ctx *ctx = _ctx; 9931 struct lba_range *range; 9932 9933 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9934 if (range->length == ctx->range.length && 9935 range->offset == ctx->range.offset && 9936 range->locked_ctx == ctx->range.locked_ctx) { 9937 /* This range already exists on this channel, so don't add 9938 * it again. This can happen when a new channel is created 9939 * while the for_each_channel operation is in progress. 9940 * Do not check for outstanding I/O in that case, since the 9941 * range was locked before any I/O could be submitted to the 9942 * new channel.
9943 */ 9944 spdk_bdev_for_each_channel_continue(i, 0); 9945 return; 9946 } 9947 } 9948 9949 range = calloc(1, sizeof(*range)); 9950 if (range == NULL) { 9951 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 9952 return; 9953 } 9954 9955 range->length = ctx->range.length; 9956 range->offset = ctx->range.offset; 9957 range->locked_ctx = ctx->range.locked_ctx; 9958 range->quiesce = ctx->range.quiesce; 9959 ctx->current_range = range; 9960 if (ctx->range.owner_ch == ch) { 9961 /* This is the range object for the channel that will hold 9962 * the lock. Store it in the ctx object so that we can easily 9963 * set its owner_ch after the lock is finally acquired. 9964 */ 9965 ctx->owner_range = range; 9966 } 9967 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 9968 bdev_lock_lba_range_check_io(i); 9969 } 9970 9971 static void 9972 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 9973 { 9974 assert(spdk_get_thread() == ctx->range.owner_thread); 9975 assert(ctx->range.owner_ch == NULL || 9976 spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread); 9977 9978 /* We will add a copy of this range to each channel now. */ 9979 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 9980 bdev_lock_lba_range_cb); 9981 } 9982 9983 static bool 9984 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 9985 { 9986 struct lba_range *r; 9987 9988 TAILQ_FOREACH(r, tailq, tailq) { 9989 if (bdev_lba_range_overlapped(range, r)) { 9990 return true; 9991 } 9992 } 9993 return false; 9994 } 9995 9996 static void bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status); 9997 9998 static int 9999 _bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch, 10000 uint64_t offset, uint64_t length, 10001 lock_range_cb cb_fn, void *cb_arg) 10002 { 10003 struct locked_lba_range_ctx *ctx; 10004 10005 ctx = calloc(1, sizeof(*ctx)); 10006 if (ctx == NULL) { 10007 return -ENOMEM; 10008 } 10009 10010 ctx->range.offset = offset; 10011 ctx->range.length = length; 10012 ctx->range.owner_thread = spdk_get_thread(); 10013 ctx->range.owner_ch = ch; 10014 ctx->range.locked_ctx = cb_arg; 10015 ctx->range.bdev = bdev; 10016 ctx->range.quiesce = (cb_fn == bdev_quiesce_range_locked); 10017 ctx->cb_fn = cb_fn; 10018 ctx->cb_arg = cb_arg; 10019 10020 spdk_spin_lock(&bdev->internal.spinlock); 10021 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 10022 /* There is an active lock overlapping with this range. 10023 * Put it on the pending list until this range no 10024 * longer overlaps with another. 
10025 */ 10026 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 10027 } else { 10028 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 10029 bdev_lock_lba_range_ctx(bdev, ctx); 10030 } 10031 spdk_spin_unlock(&bdev->internal.spinlock); 10032 return 0; 10033 } 10034 10035 static int 10036 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 10037 uint64_t offset, uint64_t length, 10038 lock_range_cb cb_fn, void *cb_arg) 10039 { 10040 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10041 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10042 10043 if (cb_arg == NULL) { 10044 SPDK_ERRLOG("cb_arg must not be NULL\n"); 10045 return -EINVAL; 10046 } 10047 10048 return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg); 10049 } 10050 10051 static void 10052 bdev_lock_lba_range_ctx_msg(void *_ctx) 10053 { 10054 struct locked_lba_range_ctx *ctx = _ctx; 10055 10056 bdev_lock_lba_range_ctx(ctx->range.bdev, ctx); 10057 } 10058 10059 static void 10060 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 10061 { 10062 struct locked_lba_range_ctx *ctx = _ctx; 10063 struct locked_lba_range_ctx *pending_ctx; 10064 struct lba_range *range, *tmp; 10065 10066 spdk_spin_lock(&bdev->internal.spinlock); 10067 /* Check if there are any pending locked ranges that overlap with this range 10068 * that was just unlocked. If there are, check that it doesn't overlap with any 10069 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 10070 * the lock process. 10071 */ 10072 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 10073 if (bdev_lba_range_overlapped(range, &ctx->range) && 10074 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 10075 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 10076 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 10077 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 10078 spdk_thread_send_msg(pending_ctx->range.owner_thread, 10079 bdev_lock_lba_range_ctx_msg, pending_ctx); 10080 } 10081 } 10082 spdk_spin_unlock(&bdev->internal.spinlock); 10083 10084 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 10085 free(ctx); 10086 } 10087 10088 static void 10089 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10090 struct spdk_io_channel *_ch, void *_ctx) 10091 { 10092 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10093 struct locked_lba_range_ctx *ctx = _ctx; 10094 TAILQ_HEAD(, spdk_bdev_io) io_locked; 10095 struct spdk_bdev_io *bdev_io; 10096 struct lba_range *range; 10097 10098 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10099 if (ctx->range.offset == range->offset && 10100 ctx->range.length == range->length && 10101 ctx->range.locked_ctx == range->locked_ctx) { 10102 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 10103 free(range); 10104 break; 10105 } 10106 } 10107 10108 /* Note: we should almost always be able to assert that the range specified 10109 * was found. But there are some very rare corner cases where a new channel 10110 * gets created simultaneously with a range unlock, where this function 10111 * would execute on that new channel and wouldn't have the range. 10112 * We also use this to clean up range allocations when a later allocation 10113 * fails in the locking path. 10114 * So we can't actually assert() here. 
10115 */ 10116 10117 /* Swap the locked IO into a temporary list, and then try to submit them again. 10118 * We could hyper-optimize this to only resubmit locked I/O that overlap 10119 * with the range that was just unlocked, but this isn't a performance path so 10120 * we go for simplicity here. 10121 */ 10122 TAILQ_INIT(&io_locked); 10123 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 10124 while (!TAILQ_EMPTY(&io_locked)) { 10125 bdev_io = TAILQ_FIRST(&io_locked); 10126 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 10127 bdev_io_submit(bdev_io); 10128 } 10129 10130 spdk_bdev_for_each_channel_continue(i, 0); 10131 } 10132 10133 static int 10134 _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length, 10135 lock_range_cb cb_fn, void *cb_arg) 10136 { 10137 struct locked_lba_range_ctx *ctx; 10138 struct lba_range *range; 10139 10140 spdk_spin_lock(&bdev->internal.spinlock); 10141 /* To start the unlock the process, we find the range in the bdev's locked_ranges 10142 * and remove it. This ensures new channels don't inherit the locked range. 10143 * Then we will send a message to each channel to remove the range from its 10144 * per-channel list. 10145 */ 10146 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 10147 if (range->offset == offset && range->length == length && 10148 (range->owner_ch == NULL || range->locked_ctx == cb_arg)) { 10149 break; 10150 } 10151 } 10152 if (range == NULL) { 10153 assert(false); 10154 spdk_spin_unlock(&bdev->internal.spinlock); 10155 return -EINVAL; 10156 } 10157 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 10158 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 10159 spdk_spin_unlock(&bdev->internal.spinlock); 10160 10161 ctx->cb_fn = cb_fn; 10162 ctx->cb_arg = cb_arg; 10163 10164 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 10165 bdev_unlock_lba_range_cb); 10166 return 0; 10167 } 10168 10169 static int 10170 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 10171 uint64_t offset, uint64_t length, 10172 lock_range_cb cb_fn, void *cb_arg) 10173 { 10174 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10175 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10176 struct lba_range *range; 10177 bool range_found = false; 10178 10179 /* Let's make sure the specified channel actually has a lock on 10180 * the specified range. Note that the range must match exactly. 
10181 */ 10182 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10183 if (range->offset == offset && range->length == length && 10184 range->owner_ch == ch && range->locked_ctx == cb_arg) { 10185 range_found = true; 10186 break; 10187 } 10188 } 10189 10190 if (!range_found) { 10191 return -EINVAL; 10192 } 10193 10194 return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg); 10195 } 10196 10197 struct bdev_quiesce_ctx { 10198 spdk_bdev_quiesce_cb cb_fn; 10199 void *cb_arg; 10200 }; 10201 10202 static void 10203 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status) 10204 { 10205 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 10206 10207 if (quiesce_ctx->cb_fn != NULL) { 10208 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 10209 } 10210 10211 free(quiesce_ctx); 10212 } 10213 10214 static void 10215 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status) 10216 { 10217 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 10218 struct spdk_bdev_module *module = range->bdev->module; 10219 10220 if (status != 0) { 10221 if (quiesce_ctx->cb_fn != NULL) { 10222 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 10223 } 10224 free(quiesce_ctx); 10225 return; 10226 } 10227 10228 spdk_spin_lock(&module->internal.spinlock); 10229 TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module); 10230 spdk_spin_unlock(&module->internal.spinlock); 10231 10232 if (quiesce_ctx->cb_fn != NULL) { 10233 /* copy the context in case the range is unlocked by the callback */ 10234 struct bdev_quiesce_ctx tmp = *quiesce_ctx; 10235 10236 quiesce_ctx->cb_fn = NULL; 10237 quiesce_ctx->cb_arg = NULL; 10238 10239 tmp.cb_fn(tmp.cb_arg, status); 10240 } 10241 /* quiesce_ctx will be freed on unquiesce */ 10242 } 10243 10244 static int 10245 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10246 uint64_t offset, uint64_t length, 10247 spdk_bdev_quiesce_cb cb_fn, void *cb_arg, 10248 bool unquiesce) 10249 { 10250 struct bdev_quiesce_ctx *quiesce_ctx; 10251 int rc; 10252 10253 if (module != bdev->module) { 10254 SPDK_ERRLOG("Bdev does not belong to specified module.\n"); 10255 return -EINVAL; 10256 } 10257 10258 if (!bdev_io_valid_blocks(bdev, offset, length)) { 10259 return -EINVAL; 10260 } 10261 10262 if (unquiesce) { 10263 struct lba_range *range; 10264 10265 /* Make sure the specified range is actually quiesced in the specified module and 10266 * then remove it from the list. Note that the range must match exactly. 
10267 */ 10268 spdk_spin_lock(&module->internal.spinlock); 10269 TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) { 10270 if (range->bdev == bdev && range->offset == offset && range->length == length) { 10271 TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module); 10272 break; 10273 } 10274 } 10275 spdk_spin_unlock(&module->internal.spinlock); 10276 10277 if (range == NULL) { 10278 SPDK_ERRLOG("The range to unquiesce was not found.\n"); 10279 return -EINVAL; 10280 } 10281 10282 quiesce_ctx = range->locked_ctx; 10283 quiesce_ctx->cb_fn = cb_fn; 10284 quiesce_ctx->cb_arg = cb_arg; 10285 10286 rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx); 10287 } else { 10288 quiesce_ctx = malloc(sizeof(*quiesce_ctx)); 10289 if (quiesce_ctx == NULL) { 10290 return -ENOMEM; 10291 } 10292 10293 quiesce_ctx->cb_fn = cb_fn; 10294 quiesce_ctx->cb_arg = cb_arg; 10295 10296 rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx); 10297 if (rc != 0) { 10298 free(quiesce_ctx); 10299 } 10300 } 10301 10302 return rc; 10303 } 10304 10305 int 10306 spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10307 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10308 { 10309 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false); 10310 } 10311 10312 int 10313 spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10314 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10315 { 10316 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true); 10317 } 10318 10319 int 10320 spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10321 uint64_t offset, uint64_t length, 10322 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10323 { 10324 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false); 10325 } 10326 10327 int 10328 spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10329 uint64_t offset, uint64_t length, 10330 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10331 { 10332 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true); 10333 } 10334 10335 int 10336 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 10337 int array_size) 10338 { 10339 if (!bdev) { 10340 return -EINVAL; 10341 } 10342 10343 if (bdev->fn_table->get_memory_domains) { 10344 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 10345 } 10346 10347 return 0; 10348 } 10349 10350 struct spdk_bdev_for_each_io_ctx { 10351 void *ctx; 10352 spdk_bdev_io_fn fn; 10353 spdk_bdev_for_each_io_cb cb; 10354 }; 10355 10356 static void 10357 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10358 struct spdk_io_channel *io_ch, void *_ctx) 10359 { 10360 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10361 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 10362 struct spdk_bdev_io *bdev_io; 10363 int rc = 0; 10364 10365 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 10366 rc = ctx->fn(ctx->ctx, bdev_io); 10367 if (rc != 0) { 10368 break; 10369 } 10370 } 10371 10372 spdk_bdev_for_each_channel_continue(i, rc); 10373 } 10374 10375 static void 10376 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 10377 { 10378 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10379 10380 ctx->cb(ctx->ctx, status); 10381 10382 free(ctx); 10383 } 10384 10385 void 10386 
spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 10387 spdk_bdev_for_each_io_cb cb) 10388 { 10389 struct spdk_bdev_for_each_io_ctx *ctx; 10390 10391 assert(fn != NULL && cb != NULL); 10392 10393 ctx = calloc(1, sizeof(*ctx)); 10394 if (ctx == NULL) { 10395 SPDK_ERRLOG("Failed to allocate context.\n"); 10396 cb(_ctx, -ENOMEM); 10397 return; 10398 } 10399 10400 ctx->ctx = _ctx; 10401 ctx->fn = fn; 10402 ctx->cb = cb; 10403 10404 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 10405 bdev_for_each_io_done); 10406 } 10407 10408 void 10409 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 10410 { 10411 spdk_for_each_channel_continue(iter->i, status); 10412 } 10413 10414 static struct spdk_bdev * 10415 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 10416 { 10417 void *io_device = spdk_io_channel_iter_get_io_device(i); 10418 10419 return __bdev_from_io_dev(io_device); 10420 } 10421 10422 static void 10423 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 10424 { 10425 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10426 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10427 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 10428 10429 iter->i = i; 10430 iter->fn(iter, bdev, ch, iter->ctx); 10431 } 10432 10433 static void 10434 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 10435 { 10436 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10437 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10438 10439 iter->i = i; 10440 iter->cpl(bdev, iter->ctx, status); 10441 10442 free(iter); 10443 } 10444 10445 void 10446 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 10447 void *ctx, spdk_bdev_for_each_channel_done cpl) 10448 { 10449 struct spdk_bdev_channel_iter *iter; 10450 10451 assert(bdev != NULL && fn != NULL && ctx != NULL); 10452 10453 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 10454 if (iter == NULL) { 10455 SPDK_ERRLOG("Unable to allocate iterator\n"); 10456 assert(false); 10457 return; 10458 } 10459 10460 iter->fn = fn; 10461 iter->cpl = cpl; 10462 iter->ctx = ctx; 10463 10464 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 10465 iter, bdev_each_channel_cpl); 10466 } 10467 10468 static void 10469 bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10470 { 10471 struct spdk_bdev_io *parent_io = cb_arg; 10472 10473 spdk_bdev_free_io(bdev_io); 10474 10475 /* Check return status of write */ 10476 parent_io->internal.status = success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 10477 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 10478 } 10479 10480 static void 10481 bdev_copy_do_write(void *_bdev_io) 10482 { 10483 struct spdk_bdev_io *bdev_io = _bdev_io; 10484 int rc; 10485 10486 /* Write blocks */ 10487 rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc, 10488 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10489 bdev_io->u.bdev.iovs[0].iov_base, 10490 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks, 10491 bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io); 10492 10493 if (rc == -ENOMEM) { 10494 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write); 10495 } else if (rc != 0) { 10496 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10497 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10498 } 10499 } 10500 10501 static void 10502 bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10503 { 10504 struct spdk_bdev_io *parent_io = cb_arg; 10505 10506 spdk_bdev_free_io(bdev_io); 10507 10508 /* Check return status of read */ 10509 if (!success) { 10510 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10511 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 10512 return; 10513 } 10514 10515 /* Do write */ 10516 bdev_copy_do_write(parent_io); 10517 } 10518 10519 static void 10520 bdev_copy_do_read(void *_bdev_io) 10521 { 10522 struct spdk_bdev_io *bdev_io = _bdev_io; 10523 int rc; 10524 10525 /* Read blocks */ 10526 rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc, 10527 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10528 bdev_io->u.bdev.iovs[0].iov_base, 10529 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks, 10530 bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io); 10531 10532 if (rc == -ENOMEM) { 10533 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read); 10534 } else if (rc != 0) { 10535 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10536 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10537 } 10538 } 10539 10540 static void 10541 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 10542 { 10543 if (!success) { 10544 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10545 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10546 return; 10547 } 10548 10549 bdev_copy_do_read(bdev_io); 10550 } 10551 10552 int 10553 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 10554 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 10555 spdk_bdev_io_completion_cb cb, void *cb_arg) 10556 { 10557 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10558 struct spdk_bdev_io *bdev_io; 10559 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 10560 10561 if (!desc->write) { 10562 return -EBADF; 10563 } 10564 10565 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 10566 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 10567 SPDK_DEBUGLOG(bdev, 10568 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 10569 dst_offset_blocks, src_offset_blocks, num_blocks); 10570 return -EINVAL; 10571 } 10572 10573 bdev_io = bdev_channel_get_io(channel); 10574 if (!bdev_io) { 10575 return -ENOMEM; 10576 } 10577 10578 bdev_io->internal.ch = channel; 10579 bdev_io->internal.desc = desc; 10580 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 10581 10582 
bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 10583 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 10584 bdev_io->u.bdev.num_blocks = num_blocks; 10585 bdev_io->u.bdev.memory_domain = NULL; 10586 bdev_io->u.bdev.memory_domain_ctx = NULL; 10587 bdev_io->u.bdev.iovs = NULL; 10588 bdev_io->u.bdev.iovcnt = 0; 10589 bdev_io->u.bdev.md_buf = NULL; 10590 bdev_io->u.bdev.accel_sequence = NULL; 10591 bdev_io_init(bdev_io, bdev, cb_arg, cb); 10592 10593 if (dst_offset_blocks == src_offset_blocks || num_blocks == 0) { 10594 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 10595 return 0; 10596 } 10597 10598 10599 /* If the copy size is large and should be split, use the generic split logic 10600 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not. 10601 * 10602 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or 10603 * emulate it using regular read and write requests otherwise. 10604 */ 10605 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) || 10606 bdev_io->internal.f.split) { 10607 bdev_io_submit(bdev_io); 10608 return 0; 10609 } 10610 10611 spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev)); 10612 10613 return 0; 10614 } 10615 10616 SPDK_LOG_REGISTER_COMPONENT(bdev) 10617 10618 static void 10619 bdev_trace(void) 10620 { 10621 struct spdk_trace_tpoint_opts opts[] = { 10622 { 10623 "BDEV_IO_START", TRACE_BDEV_IO_START, 10624 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 1, 10625 { 10626 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10627 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 10628 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10629 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 10630 } 10631 }, 10632 { 10633 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 10634 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 0, 10635 { 10636 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 10637 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 10638 } 10639 }, 10640 { 10641 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 10642 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 10643 { 10644 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 10645 } 10646 }, 10647 { 10648 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 10649 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 10650 { 10651 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 10652 } 10653 }, 10654 }; 10655 10656 10657 spdk_trace_register_owner_type(OWNER_TYPE_BDEV, 'b'); 10658 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 10659 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 10660 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 10661 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 10662 spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_START, OBJECT_BDEV_IO, 0); 10663 spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_COMPLETE, OBJECT_BDEV_IO, 0); 10664 spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_START, OBJECT_BDEV_IO, 0); 10665 spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_DONE, OBJECT_BDEV_IO, 0); 10666 } 10667 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 10668
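/*
 * Illustrative sketch, not part of the upstream sources: how a caller might
 * enumerate bdevs with spdk_for_each_bdev(). The iterator opens each bdev
 * around the callback, so the bdev may be queried safely; returning non-zero
 * from the callback stops the walk. Must run on an SPDK thread. The
 * example_* names below are hypothetical.
 */
#if 0
struct example_bdev_count_ctx {
	uint32_t num_bdevs;
};

static int
example_count_one_bdev(void *_ctx, struct spdk_bdev *bdev)
{
	struct example_bdev_count_ctx *ctx = _ctx;

	SPDK_NOTICELOG("bdev %s: %" PRIu64 " blocks of %" PRIu32 " bytes\n",
		       spdk_bdev_get_name(bdev), spdk_bdev_get_num_blocks(bdev),
		       spdk_bdev_get_block_size(bdev));
	ctx->num_bdevs++;

	/* Return 0 to continue the iteration; any other value stops it. */
	return 0;
}

static void
example_count_bdevs(void)
{
	struct example_bdev_count_ctx ctx = { .num_bdevs = 0 };

	spdk_for_each_bdev(&ctx, example_count_one_bdev);
	SPDK_NOTICELOG("found %" PRIu32 " bdevs\n", ctx.num_bdevs);
}
#endif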
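/*
 * Illustrative sketch, not part of the upstream sources: taking a v2 claim on
 * an open descriptor with spdk_bdev_module_claim_bdev_desc(). A real caller
 * would pass its own registered bdev module; closing the descriptor later
 * releases the claim. The example_* names are hypothetical.
 */
#if 0
static void
example_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx)
{
	/* A real module would close its descriptor on SPDK_BDEV_EVENT_REMOVE. */
}

static int
example_claim_bdev(const char *bdev_name, struct spdk_bdev_module *module,
		   struct spdk_bdev_desc **out_desc)
{
	struct spdk_bdev_claim_opts opts;
	struct spdk_bdev_desc *desc;
	int rc;

	rc = spdk_bdev_open_ext(bdev_name, false, example_event_cb, NULL, &desc);
	if (rc != 0) {
		return rc;
	}

	spdk_bdev_claim_opts_init(&opts, sizeof(opts));
	snprintf(opts.name, sizeof(opts.name), "example-claim");

	/* A READ_MANY_WRITE_ONE claim promotes the descriptor to writable on success. */
	rc = spdk_bdev_module_claim_bdev_desc(desc, SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE,
					      &opts, module);
	if (rc != 0) {
		spdk_bdev_close(desc);
		return rc;
	}

	*out_desc = desc;
	return 0;
}
#endif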
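/*
 * Illustrative sketch, not part of the upstream sources: quiescing a whole
 * bdev around a critical update and then resuming I/O. Only the module that
 * registered the bdev may quiesce it. The example_* names are hypothetical.
 */
#if 0
static void
example_unquiesce_done(void *cb_arg, int status)
{
	SPDK_NOTICELOG("I/O resumed, status %d\n", status);
}

static void
example_quiesce_done(void *cb_arg, int status)
{
	struct spdk_bdev *bdev = cb_arg;

	if (status != 0) {
		SPDK_ERRLOG("quiesce failed: %d\n", status);
		return;
	}

	/* At this point no new I/O is accepted and no I/O is outstanding on
	 * the bdev; perform the critical update here, then resume I/O.
	 */
	spdk_bdev_unquiesce(bdev, bdev->module, example_unquiesce_done, NULL);
}

static int
example_quiesce(struct spdk_bdev *bdev)
{
	return spdk_bdev_quiesce(bdev, bdev->module, example_quiesce_done, bdev);
}
#endif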
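/*
 * Illustrative sketch, not part of the upstream sources: copying a block range
 * within one bdev with spdk_bdev_copy_blocks(). The descriptor must be
 * writable and the channel must belong to the calling thread; if the backing
 * module does not support SPDK_BDEV_IO_TYPE_COPY, the request is emulated
 * with a read followed by a write. The example_* names are hypothetical.
 */
#if 0
static void
example_copy_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_io_channel *ch = cb_arg;

	SPDK_NOTICELOG("copy %s\n", success ? "succeeded" : "failed");
	spdk_bdev_free_io(bdev_io);
	spdk_put_io_channel(ch);
}

static int
example_copy(struct spdk_bdev_desc *desc, uint64_t dst_offset_blocks,
	     uint64_t src_offset_blocks, uint64_t num_blocks)
{
	struct spdk_io_channel *ch;
	int rc;

	ch = spdk_bdev_get_io_channel(desc);
	if (ch == NULL) {
		return -ENOMEM;
	}

	/* The channel is released in the completion callback once the copy finishes. */
	rc = spdk_bdev_copy_blocks(desc, ch, dst_offset_blocks, src_offset_blocks,
				   num_blocks, example_copy_done, ch);
	if (rc != 0) {
		spdk_put_io_channel(ch);
	}

	return rc;
}
#endif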
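/*
 * Illustrative sketch, not part of the upstream sources: enabling per-channel
 * latency histograms and collecting the merged result with
 * spdk_bdev_histogram_enable() and spdk_bdev_histogram_get(). The caller owns
 * the spdk_histogram_data handle. The example_* names are hypothetical.
 */
#if 0
static void
example_histogram_data_cb(void *cb_arg, int status, struct spdk_histogram_data *histogram)
{
	if (status == 0) {
		/* Inspect the merged histogram here, e.g. with spdk_histogram_data_iterate(). */
	}

	spdk_histogram_data_free(histogram);
}

static void
example_histogram_enabled_cb(void *cb_arg, int status)
{
	struct spdk_bdev *bdev = cb_arg;
	struct spdk_histogram_data *histogram;

	if (status != 0) {
		SPDK_ERRLOG("failed to enable histogram: %d\n", status);
		return;
	}

	histogram = spdk_histogram_data_alloc();
	if (histogram == NULL) {
		return;
	}

	/* Merge the per-channel histograms into the caller-provided structure. */
	spdk_bdev_histogram_get(bdev, histogram, example_histogram_data_cb, NULL);
}

static void
example_histogram_start(struct spdk_bdev *bdev)
{
	spdk_bdev_histogram_enable(bdev, example_histogram_enabled_cb, bdev, true);
}
#endif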