/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"
#include "spdk/dma.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"
#include "spdk_internal/assert.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define SPDK_BDEV_AUTO_EXAMINE			true
#define BUF_SMALL_CACHE_SIZE			128
#define BUF_LARGE_CACHE_SIZE			16
#define NOMEM_THRESHOLD_COUNT			8

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC	(UINT64_MAX / (1024 * 1024))
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000

/* The maximum number of children requests for a UNMAP or WRITE ZEROES command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000

/* The maximum number of children requests for a COPY command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8)

#define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \
	log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev)
#ifdef DEBUG
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \
	log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev)
#else
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0)
#endif

static void log_already_claimed(enum spdk_log_level level, const int line, const char *func,
				const char *detail, struct spdk_bdev *bdev);

static const char *qos_rpc_type[] = {"rw_ios_per_sec",
				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
				    };

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

RB_HEAD(bdev_name_tree, spdk_bdev_name);

static int
bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
{
	return strcmp(name1->name, name2->name);
}

RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	void *zero_buffer;

	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

	struct spdk_bdev_list bdevs;
	struct bdev_name_tree bdev_names;

	bool init_complete;
	bool module_init_complete;

	struct spdk_spinlock spinlock;

	TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain *domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
	.init_complete = false,
	.module_init_complete = false,
	.async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens),
};

static void
__attribute__((constructor))
_bdev_init(void)
{
	spdk_spin_init(&g_bdev_mgr.spinlock);
}

typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status);

typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);

struct lba_range {
	struct spdk_bdev		*bdev;
	uint64_t			offset;
	uint64_t			length;
	bool				quiesce;
	void				*locked_ctx;
	struct spdk_thread		*owner_thread;
	struct spdk_bdev_channel	*owner_ch;
	TAILQ_ENTRY(lba_range)		tailq;
	TAILQ_ENTRY(lba_range)		tailq_module;
};

static struct spdk_bdev_opts g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
	.iobuf_small_cache_size = BUF_SMALL_CACHE_SIZE,
	.iobuf_large_cache_size = BUF_LARGE_CACHE_SIZE,
};

static spdk_bdev_init_cb	g_init_cb_fn = NULL;
static void			*g_init_cb_arg = NULL;

static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
static void			*g_fini_cb_arg = NULL;
static struct spdk_thread	*g_fini_thread = NULL;

struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
	 * For remaining bytes, allowed to run negative if an I/O is submitted when
	 * some bytes are remaining, but the I/O is bigger than that amount. The
	 * excess will be deducted from the next timeslice.
	 */
	int64_t remaining_this_timeslice;

	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO.
	 * If the IO is allowed to pass, the quota will be reduced correspondingly.
	 */
	bool (*queue_io)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to rewind the quota once the IO was allowed to be sent by this
	 * limit but queued due to one of the further limits.
	 */
	void (*rewind_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};
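
/*
 * Illustrative reading of the fields above: limits are expressed per second,
 * while accounting happens per SPDK_BDEV_QOS_TIMESLICE_IN_USEC (1 ms)
 * timeslice. An rw_ios_per_sec limit of 1000 therefore works out to roughly
 * one I/O per timeslice, which is consistent with the
 * SPDK_BDEV_QOS_MIN_IOS_PER_SEC and SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE
 * minimums defined earlier.
 */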

struct spdk_bdev_qos {
	/** Rate limits, one per rate limit type. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache.  Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t per_thread_cache_count;
	uint32_t bdev_io_cache_size;

	struct spdk_iobuf_channel iobuf;

	TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * will queue here their IO that awaits retry. It makes it possible to retry sending
 * IO to one bdev after IO from other bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel *shared_ch;

	struct spdk_poller *nomem_poller;

	/* Refcount of bdev channels using this resource */
	uint32_t ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev *bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel *channel;

	/* Accel channel */
	struct spdk_io_channel *accel_channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat *stat;

	/*
	 * Count of I/O submitted to the underlying dev module through this channel
	 * and waiting for completion.
	 */
	uint64_t io_outstanding;

	/*
	 * List of all submitted I/Os including I/O that are generated via splitting.
	 */
	bdev_io_tailq_t io_submitted;

	/*
	 * List of spdk_bdev_io that are currently queued because they write to a locked
	 * LBA range.
	 */
	bdev_io_tailq_t io_locked;

	/* List of I/Os with accel sequence being currently executed */
	bdev_io_tailq_t io_accel_exec;

	/* List of I/Os doing memory domain pull/push */
	bdev_io_tailq_t io_memory_domain;

	uint32_t flags;

	/* Counts number of bdev_io in the io_submitted TAILQ */
	uint16_t queue_depth;

	uint16_t trace_id;

	struct spdk_histogram_data *histogram;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t start_tsc;
	uint64_t interval_tsc;
	__itt_string_handle *handle;
	struct spdk_bdev_io_stat *prev_stat;
#endif

	lba_range_tailq_t locked_ranges;

	/** List of I/Os queued by QoS. */
	bdev_io_tailq_t qos_queued_io;
};

struct media_event_entry {
	struct spdk_bdev_media_event event;
	TAILQ_ENTRY(media_event_entry) tailq;
};

#define MEDIA_EVENT_POOL_SIZE 64

struct spdk_bdev_desc {
	struct spdk_bdev *bdev;
	struct spdk_thread *thread;
	struct {
		spdk_bdev_event_cb_t event_fn;
		void *ctx;
	} callback;
	bool closed;
	bool write;
	bool memory_domains_supported;
	bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES];
	struct spdk_spinlock spinlock;
	uint32_t refs;
	TAILQ_HEAD(, media_event_entry) pending_media_events;
	TAILQ_HEAD(, media_event_entry) free_media_events;
	struct media_event_entry *media_events_buffer;
	TAILQ_ENTRY(spdk_bdev_desc) link;

	uint64_t timeout_in_sec;
	spdk_bdev_io_timeout_cb cb_fn;
	void *cb_arg;
	struct spdk_poller *io_timeout_poller;
	struct spdk_bdev_module_claim *claim;
};

struct spdk_bdev_iostat_ctx {
	struct spdk_bdev_io_stat *stat;
	enum spdk_bdev_reset_stat_mode reset_mode;
	spdk_bdev_get_device_stat_cb cb;
	void *cb_arg;
};

struct set_qos_limit_ctx {
	void (*cb_fn)(void *cb_arg, int status);
	void *cb_arg;
	struct spdk_bdev *bdev;
};

struct spdk_bdev_channel_iter {
	spdk_bdev_for_each_channel_msg fn;
	spdk_bdev_for_each_channel_done cpl;
	struct spdk_io_channel_iter *i;
	void *ctx;
};

struct spdk_bdev_io_error_stat {
	uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS];
};

enum bdev_io_retry_state {
	BDEV_IO_RETRY_STATE_INVALID,
	BDEV_IO_RETRY_STATE_PULL,
	BDEV_IO_RETRY_STATE_PULL_MD,
	BDEV_IO_RETRY_STATE_SUBMIT,
	BDEV_IO_RETRY_STATE_PUSH,
	BDEV_IO_RETRY_STATE_PUSH_MD,
};

#define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))
#define __io_ch_to_bdev_ch(io_ch)	((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch))
#define __io_ch_to_bdev_mgmt_ch(io_ch)	((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch))

static inline void bdev_io_complete(void *ctx);
static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io);
static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io);
static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io);

static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io);

static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				struct spdk_io_channel *ch, void *_ctx);
static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status);

static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				     struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
				     uint64_t num_blocks,
				     struct spdk_memory_domain *domain, void *domain_ctx,
				     struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
				     spdk_bdev_io_completion_cb cb, void *cb_arg);
static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				      struct iovec *iov, int iovcnt, void *md_buf,
				      uint64_t offset_blocks, uint64_t num_blocks,
				      struct spdk_memory_domain *domain, void *domain_ctx,
				      struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
				      uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw,
				      spdk_bdev_io_completion_cb cb, void *cb_arg);

static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
			       uint64_t offset, uint64_t length,
			       lock_range_cb cb_fn, void *cb_arg);

static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
				 uint64_t offset, uint64_t length,
				 lock_range_cb cb_fn, void *cb_arg);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);

static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
static void claim_reset(struct spdk_bdev *bdev);

static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch);

static bool bdev_io_should_split(struct spdk_bdev_io *bdev_io);

#define bdev_get_ext_io_opt(opts, field, defval) \
	((opts) != NULL ? SPDK_GET_FIELD(opts, field, defval) : (defval))

static inline void
bdev_ch_add_to_io_submitted(struct spdk_bdev_io *bdev_io)
{
	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
	bdev_io->internal.ch->queue_depth++;
}

static inline void
bdev_ch_remove_from_io_submitted(struct spdk_bdev_io *bdev_io)
{
	TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
	bdev_io->internal.ch->queue_depth--;
}

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(iobuf_small_cache_size);
	SET_FIELD(iobuf_large_cache_size);

	/* Do not remove this statement; always update it when adding a new field,
	 * and do not forget to add the SET_FIELD statement for your added field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}
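
/*
 * Illustrative use of the opts API (a sketch; the pool size below is only an
 * example value). Options must be applied before the bdev layer is initialized
 * for the pool sizing to take effect.
 *
 *	struct spdk_bdev_opts opts;
 *
 *	spdk_bdev_get_opts(&opts, sizeof(opts));
 *	opts.bdev_io_pool_size = 128 * 1024 - 1;
 *	spdk_bdev_set_opts(&opts);
 */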

int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	uint32_t min_pool_size;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
	 */
	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
	if (opts->bdev_io_pool_size < min_pool_size) {
		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
			    spdk_thread_get_count());
		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
		return -1;
	}

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
		g_bdev_opts.field = opts->field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(iobuf_small_cache_size);
	SET_FIELD(iobuf_large_cache_size);

	g_bdev_opts.opts_size = opts->opts_size;

#undef SET_FIELD

	return 0;
}

static struct spdk_bdev *
bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev_name find;
	struct spdk_bdev_name *res;

	find.name = (char *)bdev_name;
	res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find);
	if (res != NULL) {
		return res->bdev;
	}

	return NULL;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev *bdev;

	spdk_spin_lock(&g_bdev_mgr.spinlock);
	bdev = bdev_get_by_name(bdev_name);
	spdk_spin_unlock(&g_bdev_mgr.spinlock);

	return bdev;
}

struct bdev_io_status_string {
	enum spdk_bdev_io_status status;
	const char *str;
};

static const struct bdev_io_status_string bdev_io_status_strings[] = {
	{ SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" },
	{ SPDK_BDEV_IO_STATUS_ABORTED, "aborted" },
	{ SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" },
	{ SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" },
	{ SPDK_BDEV_IO_STATUS_NOMEM, "nomem" },
	{ SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" },
	{ SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" },
	{ SPDK_BDEV_IO_STATUS_FAILED, "failed" },
	{ SPDK_BDEV_IO_STATUS_PENDING, "pending" },
	{ SPDK_BDEV_IO_STATUS_SUCCESS, "success" },
};

static const char *
bdev_io_status_get_string(enum spdk_bdev_io_status status)
{
	uint32_t i;

	for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) {
		if (bdev_io_status_strings[i].status == status) {
			return bdev_io_status_strings[i].str;
		}
	}

	return "reserved";
}

struct spdk_bdev_wait_for_examine_ctx {
	struct spdk_poller *poller;
	spdk_bdev_wait_for_examine_cb cb_fn;
	void *cb_arg;
};

static bool bdev_module_all_actions_completed(void);

static int
bdev_wait_for_examine_cb(void *arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx = arg;

	if (!bdev_module_all_actions_completed()) {
		return SPDK_POLLER_IDLE;
	}

	spdk_poller_unregister(&ctx->poller);
	ctx->cb_fn(ctx->cb_arg);
	free(ctx);

	return SPDK_POLLER_BUSY;
}

int
spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0);

	return 0;
}

struct spdk_bdev_examine_item {
	char *name;
	TAILQ_ENTRY(spdk_bdev_examine_item) link;
};

TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);

struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
			g_bdev_examine_allowlist);

static inline bool
bdev_examine_allowlist_check(const char *name)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		if (strcmp(name, item->name) == 0) {
			return true;
		}
	}
	return false;
}

static inline void
bdev_examine_allowlist_remove(const char *name)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		if (strcmp(name, item->name) == 0) {
			TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
			free(item->name);
			free(item);
			break;
		}
	}
}

static inline void
bdev_examine_allowlist_free(void)
{
	struct spdk_bdev_examine_item *item;
	while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
		item = TAILQ_FIRST(&g_bdev_examine_allowlist);
		TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
		free(item->name);
		free(item);
	}
}

static inline bool
bdev_in_examine_allowlist(struct spdk_bdev *bdev)
{
	struct spdk_bdev_alias *tmp;
	if (bdev_examine_allowlist_check(bdev->name)) {
		return true;
	}
	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
		if (bdev_examine_allowlist_check(tmp->alias.name)) {
			return true;
		}
	}
	return false;
}

static inline bool
bdev_ok_to_examine(struct spdk_bdev *bdev)
{
	/* Some bdevs may not support the READ command.
	 * Do not try to examine them.
	 */
	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_READ)) {
		return false;
	}

	if (g_bdev_opts.bdev_auto_examine) {
		return true;
	} else {
		return bdev_in_examine_allowlist(bdev);
	}
}
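
/*
 * Drive module examine callbacks for a single bdev: every registered module
 * first gets examine_config() synchronously, then examine_disk() is invoked
 * according to the claim state (all modules when unclaimed, only the claiming
 * module for a v1 claim, or each module holding a v2 claim). The bdev spinlock
 * is dropped around each examine_disk() call.
 */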

static void
bdev_examine(struct spdk_bdev *bdev)
{
	struct spdk_bdev_module *module;
	struct spdk_bdev_module_claim *claim, *tmpclaim;
	uint32_t action;

	if (!bdev_ok_to_examine(bdev)) {
		return;
	}

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (module->examine_config) {
			spdk_spin_lock(&module->internal.spinlock);
			action = module->internal.action_in_progress;
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			module->examine_config(bdev);
			if (action != module->internal.action_in_progress) {
				SPDK_ERRLOG("examine_config for module %s did not call "
					    "spdk_bdev_module_examine_done()\n", module->name);
			}
		}
	}

	spdk_spin_lock(&bdev->internal.spinlock);

	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		/* Examine by all bdev modules */
		TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (module->examine_disk) {
				spdk_spin_lock(&module->internal.spinlock);
				module->internal.action_in_progress++;
				spdk_spin_unlock(&module->internal.spinlock);
				spdk_spin_unlock(&bdev->internal.spinlock);
				module->examine_disk(bdev);
				spdk_spin_lock(&bdev->internal.spinlock);
			}
		}
		break;
	case SPDK_BDEV_CLAIM_EXCL_WRITE:
		/* Examine by the one bdev module with a v1 claim */
		module = bdev->internal.claim.v1.module;
		if (module->examine_disk) {
			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			return;
		}
		break;
	default:
		/* Examine by all bdev modules with a v2 claim */
		assert(claim_type_is_v2(bdev->internal.claim_type));
		/*
		 * Removal of tailq nodes while iterating can cause the iteration to jump out of the
		 * list, perhaps accessing freed memory.  Without protection, this could happen
		 * while the lock is dropped during the examine callback.
		 */
		bdev->internal.examine_in_progress++;

		TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
			module = claim->module;

			if (module == NULL) {
				/* This is a vestigial claim, held by examine_count */
				continue;
			}

			if (module->examine_disk == NULL) {
				continue;
			}

			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);

			/* Call examine_disk without holding internal.spinlock. */
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			spdk_spin_lock(&bdev->internal.spinlock);
		}

		assert(bdev->internal.examine_in_progress > 0);
		bdev->internal.examine_in_progress--;
		if (bdev->internal.examine_in_progress == 0) {
			/* Remove any claims that were released during examine_disk */
			TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) {
				if (claim->desc != NULL) {
					continue;
				}

				TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link);
				free(claim);
			}
			if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
				claim_reset(bdev);
			}
		}
	}

	spdk_spin_unlock(&bdev->internal.spinlock);
}

int
spdk_bdev_examine(const char *name)
{
	struct spdk_bdev *bdev;
	struct spdk_bdev_examine_item *item;
	struct spdk_thread *thread = spdk_get_thread();

	if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) {
		SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread,
			    thread ? spdk_thread_get_name(thread) : "null");
		return -EINVAL;
	}

	if (g_bdev_opts.bdev_auto_examine) {
		SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n");
		return -EINVAL;
	}

	if (bdev_examine_allowlist_check(name)) {
		SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
		return -EEXIST;
	}

	item = calloc(1, sizeof(*item));
	if (!item) {
		return -ENOMEM;
	}
	item->name = strdup(name);
	if (!item->name) {
		free(item);
		return -ENOMEM;
	}
	TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);

	bdev = spdk_bdev_get_by_name(name);
	if (bdev) {
		bdev_examine(bdev);
	}
	return 0;
}

static inline void
bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "method", "bdev_examine");
		spdk_json_write_named_object_begin(w, "params");
		spdk_json_write_named_string(w, "name", item->name);
		spdk_json_write_object_end(w);
		spdk_json_write_object_end(w);
	}
}

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, internal.link);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, internal.link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static inline bool
bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.f.has_memory_domain;
}

static inline bool
bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.f.has_accel_sequence;
}

static inline void
bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource,
			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	/* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io.
	 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth
	 * channels we will instead wait for half to complete.
	 */
	shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
					   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);

	assert(state != BDEV_IO_RETRY_STATE_INVALID);
	bdev_io->internal.retry_state = state;
	TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
}

static inline void
bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource,
			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	/* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while
	 * the queue isn't empty, so we don't need to update the nomem_threshold here */
	assert(!TAILQ_EMPTY(&shared_resource->nomem_io));

	assert(state != BDEV_IO_RETRY_STATE_INVALID);
	bdev_io->internal.retry_state = state;
	TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
}

void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	struct iovec *iovs;

	if (bdev_io->u.bdev.iovs == NULL) {
		bdev_io->u.bdev.iovs = &bdev_io->iov;
		bdev_io->u.bdev.iovcnt = 1;
	}

	iovs = bdev_io->u.bdev.iovs;

	assert(iovs != NULL);
	assert(bdev_io->u.bdev.iovcnt >= 1);

	iovs[0].iov_base = buf;
	iovs[0].iov_len = len;
}

void
spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
	bdev_io->u.bdev.md_buf = md_buf;
}

static bool
_is_buf_allocated(const struct iovec *iovs)
{
	if (iovs == NULL) {
		return false;
	}

	return iovs[0].iov_base != NULL;
}

static bool
_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
{
	int i;
	uintptr_t iov_base;

	if (spdk_likely(alignment == 1)) {
		return true;
	}

	for (i = 0; i < iovcnt; i++) {
		iov_base = (uintptr_t)iovs[i].iov_base;
		if ((iov_base & (alignment - 1)) != 0) {
			return false;
		}
	}

	return true;
}

static inline bool
bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
{
	if (!bdev_io_use_accel_sequence(bdev_io)) {
		return false;
	}

	/* For now, we don't allow splitting IOs with an accel sequence and will treat them as if
	 * bdev module didn't support accel sequences */
	return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.f.split;
}
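
/*
 * Outstanding I/O is tracked at two levels: per bdev channel and per shared
 * resource. The shared resource counter is what the NOMEM handling compares
 * against nomem_threshold (see bdev_queue_nomem_io_head() and
 * bdev_shared_ch_retry_io()) to decide when queued I/O may be retried.
 */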

static inline void
bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	bdev_ch->io_outstanding++;
	shared_resource->io_outstanding++;
}

static inline void
bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	assert(bdev_ch->io_outstanding > 0);
	assert(shared_resource->io_outstanding > 0);
	bdev_ch->io_outstanding--;
	shared_resource->io_outstanding--;
}

static void
bdev_io_submit_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;

	assert(bdev_io_use_accel_sequence(bdev_io));

	bdev_io->u.bdev.accel_sequence = NULL;
	bdev_io->internal.f.has_accel_sequence = false;

	if (spdk_unlikely(status != 0)) {
		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io_complete_unsubmitted(bdev_io);
		return;
	}

	bdev_io_submit(bdev_io);
}

static void
bdev_io_exec_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io->internal.data_transfer_cpl(bdev_io, status);
}

static void
bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status))
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
	assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
	assert(bdev_io_use_accel_sequence(bdev_io));

	/* Since the operations are appended during submission, they're in the opposite order than
	 * how we want to execute them for reads (i.e. we need to execute the most recently added
	 * operation first), so reverse the sequence before executing it.
	 */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence);
	}

	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	bdev_io_increment_outstanding(ch, ch->shared_resource);
	bdev_io->internal.data_transfer_cpl = cb_fn;

	spdk_accel_sequence_finish(bdev_io->internal.accel_sequence,
				   bdev_io_exec_sequence_cb, bdev_io);
}

static void
bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status)
{
	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
	void *buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		buf = bdev_io->internal.buf.ptr;
		bdev_io->internal.buf.ptr = NULL;
		bdev_io->internal.f.has_buf = false;
		bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
		bdev_io->internal.get_aux_buf_cb = NULL;
	} else {
		assert(bdev_io->internal.get_buf_cb != NULL);
		bdev_io->internal.get_buf_cb(ch, bdev_io, status);
		bdev_io->internal.get_buf_cb = NULL;
	}
}

static void
_bdev_io_pull_buffer_cpl(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;

	if (rc) {
		SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	bdev_io_get_buf_complete(bdev_io, !rc);
}

static void
bdev_io_pull_md_buf_done(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	assert(bdev_io->internal.data_transfer_cpl);
	bdev_io->internal.data_transfer_cpl(bdev_io, status);
}

static void
bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		assert(bdev_io->internal.f.has_bounce_buf);
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  &bdev_io->internal.bounce_buf.orig_md_iov, 1,
							  &bdev_io->internal.bounce_buf.md_iov, 1,
							  bdev_io_pull_md_buf_done, bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain), rc);
			}
		} else {
			memcpy(bdev_io->internal.bounce_buf.md_iov.iov_base,
			       bdev_io->internal.bounce_buf.orig_md_iov.iov_base,
			       bdev_io->internal.bounce_buf.orig_md_iov.iov_len);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD);
	} else {
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
	}
}

static void
_bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	assert(bdev_io->internal.f.has_bounce_buf);

	/* save original md_buf */
	bdev_io->internal.bounce_buf.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf;
	bdev_io->internal.bounce_buf.orig_md_iov.iov_len = len;
	bdev_io->internal.bounce_buf.md_iov.iov_base = md_buf;
	bdev_io->internal.bounce_buf.md_iov.iov_len = len;
	/* set bounce md_buf */
	bdev_io->u.bdev.md_buf = md_buf;

	bdev_io_pull_md_buf(bdev_io);
}

static void
_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len;
	void *buf;

	if (spdk_bdev_is_md_separate(bdev)) {
		assert(!bdev_io_use_accel_sequence(bdev_io));

		buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len;
		md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;

		assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0);

		if (bdev_io->u.bdev.md_buf != NULL) {
			_bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len);
			return;
		} else {
			spdk_bdev_io_set_md_buf(bdev_io, buf, md_len);
		}
	}

	bdev_io_get_buf_complete(bdev_io, true);
}

static inline void
bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc)
{
	if (rc) {
		SPDK_ERRLOG("Failed to get data buffer\n");
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
		return;
	}

	_bdev_io_set_md_buf(bdev_io);
}

static void
bdev_io_pull_data_done_and_track(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io_pull_data_done(bdev_io, status);
}

static void
bdev_io_pull_data(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	/* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a
	 * sequence, append a copy operation making accel change the src/dst buffers of the previous
	 * operation */
	if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io) ||
	    (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) {
		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
			assert(bdev_io_use_accel_sequence(bdev_io));
			assert(bdev_io->internal.f.has_bounce_buf);
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL,
						    bdev_io->internal.bounce_buf.orig_iovs,
						    bdev_io->internal.bounce_buf.orig_iovcnt,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
						    NULL, NULL);
		} else {
			/* We need to reverse the src/dst for reads */
			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
			assert(bdev_io_use_accel_sequence(bdev_io));
			assert(bdev_io->internal.f.has_bounce_buf);
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->internal.bounce_buf.orig_iovs,
						    bdev_io->internal.bounce_buf.orig_iovcnt,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL, NULL, NULL);
		}

		if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
			SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n",
				    bdev_io->internal.accel_sequence);
		}
	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		/* if this is write path, copy data from original buffer to bounce buffer */
		if (bdev_io_use_memory_domain(bdev_io)) {
			assert(bdev_io->internal.f.has_bounce_buf);
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  bdev_io->internal.bounce_buf.orig_iovs,
							  (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt,
							  bdev_io->u.bdev.iovs, 1,
							  bdev_io_pull_data_done_and_track,
							  bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to pull data from memory domain %s\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain));
			}
		} else {
			assert(bdev_io->u.bdev.iovcnt == 1);
			assert(bdev_io->internal.f.has_bounce_buf);
			spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base,
					      bdev_io->u.bdev.iovs[0].iov_len,
					      bdev_io->internal.bounce_buf.orig_iovs,
					      bdev_io->internal.bounce_buf.orig_iovcnt);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
	} else {
		bdev_io_pull_data_done(bdev_io, rc);
	}
}
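
/*
 * Replace the caller's iovec with a single bounce iovec backed by buf, saving
 * the original iovec so the data can be copied back (for reads) and the
 * caller's buffers restored when the bounce buffer is pushed on completion.
 */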
static void
_bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len,
			      bdev_copy_bounce_buffer_cpl cpl_cb)
{
	struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource;

	assert(bdev_io->internal.f.has_bounce_buf == false);

	bdev_io->internal.data_transfer_cpl = cpl_cb;
	bdev_io->internal.f.has_bounce_buf = true;
	/* save original iovec */
	bdev_io->internal.bounce_buf.orig_iovs = bdev_io->u.bdev.iovs;
	bdev_io->internal.bounce_buf.orig_iovcnt = bdev_io->u.bdev.iovcnt;
	/* zero the other data members */
	bdev_io->internal.bounce_buf.iov.iov_base = NULL;
	bdev_io->internal.bounce_buf.md_iov.iov_base = NULL;
	bdev_io->internal.bounce_buf.orig_md_iov.iov_base = NULL;
	/* set bounce iov */
	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_buf.iov;
	bdev_io->u.bdev.iovcnt = 1;
	/* set bounce buffer for this operation */
	bdev_io->u.bdev.iovs[0].iov_base = buf;
	bdev_io->u.bdev.iovs[0].iov_len = len;
	/* Now we use 1 iov, the split condition could have been changed */
	bdev_io->internal.f.split = bdev_io_should_split(bdev_io);

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
	} else {
		bdev_io_pull_data(bdev_io);
	}
}

static void
_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	bool buf_allocated;
	uint64_t alignment;
	void *aligned_buf;

	bdev_io->internal.buf.ptr = buf;
	bdev_io->internal.f.has_buf = true;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		bdev_io_get_buf_complete(bdev_io, true);
		return;
	}

	alignment = spdk_bdev_get_buf_align(bdev);
	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
	aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));

	if (buf_allocated) {
		_bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl);
		/* Continue in completion callback */
		return;
	} else {
		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
	}

	_bdev_io_set_md_buf(bdev_io);
}

static inline uint64_t
bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len, alignment;

	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;

	/* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */
	alignment = spdk_bdev_get_buf_align(bdev) - 1;

	return len + alignment + md_len;
}

static void
_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
{
	struct spdk_bdev_mgmt_channel *ch;

	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
	spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len));
}

static void
bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	assert(bdev_io->internal.f.has_buf);
	_bdev_io_put_buf(bdev_io, bdev_io->internal.buf.ptr, bdev_io->internal.buf.len);
	bdev_io->internal.buf.ptr = NULL;
	bdev_io->internal.f.has_buf = false;
}

SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_put_aux_buf,
			      "spdk_bdev_io_put_aux_buf is deprecated", "v25.01", 0);

void
spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	SPDK_LOG_DEPRECATED(spdk_bdev_io_put_aux_buf);

	assert(buf != NULL);
	_bdev_io_put_buf(bdev_io, buf, len);
}

static inline void
bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch,
		    struct spdk_bdev_io *bdev_io)
{
	/* After a request is submitted to a bdev module, the ownership of an accel sequence
	 * associated with that bdev_io is transferred to the bdev module. So, clear the internal
	 * sequence pointer to make sure we won't touch it anymore. */
	if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE ||
	     bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) {
		assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
		bdev_io->internal.f.has_accel_sequence = false;
	}

	bdev->fn_table->submit_request(ioch, bdev_io);
}

static inline void
bdev_ch_resubmit_io(struct spdk_bdev_shared_resource *shared_resource, struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;

	bdev_io_increment_outstanding(bdev_io->internal.ch, shared_resource);
	bdev_io->internal.error.nvme.cdw0 = 0;
	bdev_io->num_retries++;
	bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
}

static void
bdev_shared_ch_retry_io(struct spdk_bdev_shared_resource *shared_resource)
{
	struct spdk_bdev_io *bdev_io;

	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 * Some drivers (such as nvme) cannot immediately take a new I/O in
		 * the context of a completion, because the resources for the I/O are
		 * not released until control returns to the bdev poller. Also, we
		 * may require several small I/O to complete before a larger I/O
		 * (that requires splitting) can be submitted.
		 */
		return;
	}

	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);

		switch (bdev_io->internal.retry_state) {
		case BDEV_IO_RETRY_STATE_SUBMIT:
			bdev_ch_resubmit_io(shared_resource, bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL:
			bdev_io_pull_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL_MD:
			bdev_io_pull_md_buf(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH:
			bdev_io_push_bounce_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH_MD:
			bdev_io_push_bounce_md_buf(bdev_io);
			break;
		default:
			assert(0 && "invalid retry state");
			break;
		}

		if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) {
			/* This IO completed again with NOMEM status, so break the loop and
			 * don't try anymore.  Note that a bdev_io that fails with NOMEM
			 * always gets requeued at the front of the list, to maintain
			 * ordering.
			 */
			break;
		}
	}
}

static void
bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	bdev_shared_ch_retry_io(bdev_ch->shared_resource);
}
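
/*
 * One-shot poller armed when NOMEM I/O is queued but there are no outstanding
 * completions left to trigger a retry; it retries the queued I/O itself and
 * re-arms if nothing could be submitted.
 */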
1561 */ 1562 break; 1563 } 1564 } 1565 } 1566 1567 static void 1568 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 1569 { 1570 bdev_shared_ch_retry_io(bdev_ch->shared_resource); 1571 } 1572 1573 static int 1574 bdev_no_mem_poller(void *ctx) 1575 { 1576 struct spdk_bdev_shared_resource *shared_resource = ctx; 1577 1578 spdk_poller_unregister(&shared_resource->nomem_poller); 1579 1580 if (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1581 bdev_shared_ch_retry_io(shared_resource); 1582 } 1583 /* the retry cb may re-register the poller so double check */ 1584 if (!TAILQ_EMPTY(&shared_resource->nomem_io) && 1585 shared_resource->io_outstanding == 0 && shared_resource->nomem_poller == NULL) { 1586 /* No IOs were submitted, try again */ 1587 shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource, 1588 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10); 1589 } 1590 1591 return SPDK_POLLER_BUSY; 1592 } 1593 1594 static inline bool 1595 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 1596 { 1597 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1598 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1599 1600 if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) { 1601 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1602 bdev_queue_nomem_io_head(shared_resource, bdev_io, state); 1603 1604 if (shared_resource->io_outstanding == 0 && !shared_resource->nomem_poller) { 1605 /* Special case when we have nomem IOs and no outstanding IOs which completions 1606 * could trigger retry of queued IOs 1607 * Any IOs submitted may trigger retry of queued IOs. This poller handles a case when no 1608 * new IOs submitted, e.g. qd==1 */ 1609 shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource, 1610 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10); 1611 } 1612 /* If bdev module completed an I/O that has an accel sequence with NOMEM status, the 1613 * ownership of that sequence is transferred back to the bdev layer, so we need to 1614 * restore internal.accel_sequence to make sure that the sequence is handled 1615 * correctly in case the I/O is later aborted. */ 1616 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 1617 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) { 1618 assert(!bdev_io_use_accel_sequence(bdev_io)); 1619 bdev_io->internal.f.has_accel_sequence = true; 1620 bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence; 1621 } 1622 1623 return true; 1624 } 1625 1626 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1627 bdev_ch_retry_io(bdev_ch); 1628 } 1629 1630 return false; 1631 } 1632 1633 static void 1634 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1635 { 1636 struct spdk_bdev_io *bdev_io = ctx; 1637 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1638 1639 if (rc) { 1640 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1641 } 1642 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1643 * to waiting for the conditional free of internal.buf.ptr in spdk_bdev_free_io()). 
1644 */ 1645 bdev_io_put_buf(bdev_io); 1646 1647 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1648 bdev_ch_retry_io(ch); 1649 } 1650 1651 /* Continue with IO completion flow */ 1652 bdev_io_complete(bdev_io); 1653 } 1654 1655 static void 1656 bdev_io_push_bounce_md_buf_done(void *ctx, int rc) 1657 { 1658 struct spdk_bdev_io *bdev_io = ctx; 1659 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1660 1661 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1662 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1663 bdev_io->internal.f.has_bounce_buf = false; 1664 1665 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1666 bdev_ch_retry_io(ch); 1667 } 1668 1669 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1670 } 1671 1672 static inline void 1673 bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io) 1674 { 1675 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1676 int rc = 0; 1677 1678 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1679 assert(bdev_io->internal.f.has_bounce_buf); 1680 1681 /* do the same for metadata buffer */ 1682 if (spdk_unlikely(bdev_io->internal.bounce_buf.orig_md_iov.iov_base != NULL)) { 1683 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1684 1685 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1686 if (bdev_io_use_memory_domain(bdev_io)) { 1687 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1688 bdev_io_increment_outstanding(ch, ch->shared_resource); 1689 /* If memory domain is used then we need to call async push function */ 1690 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1691 bdev_io->internal.memory_domain_ctx, 1692 &bdev_io->internal.bounce_buf.orig_md_iov, 1693 (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt, 1694 &bdev_io->internal.bounce_buf.md_iov, 1, 1695 bdev_io_push_bounce_md_buf_done, 1696 bdev_io); 1697 if (rc == 0) { 1698 /* Continue IO completion in async callback */ 1699 return; 1700 } 1701 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1702 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1703 if (rc != -ENOMEM) { 1704 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1705 spdk_memory_domain_get_dma_device_id( 1706 bdev_io->internal.memory_domain)); 1707 } 1708 } else { 1709 memcpy(bdev_io->internal.bounce_buf.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1710 bdev_io->internal.bounce_buf.orig_md_iov.iov_len); 1711 } 1712 } 1713 } 1714 1715 if (spdk_unlikely(rc == -ENOMEM)) { 1716 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD); 1717 } else { 1718 assert(bdev_io->internal.data_transfer_cpl); 1719 bdev_io->internal.f.has_bounce_buf = false; 1720 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1721 } 1722 } 1723 1724 static inline void 1725 bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc) 1726 { 1727 assert(bdev_io->internal.data_transfer_cpl); 1728 if (rc) { 1729 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1730 return; 1731 } 1732 1733 /* set original buffer for this io */ 1734 bdev_io->u.bdev.iovcnt = bdev_io->internal.bounce_buf.orig_iovcnt; 1735 bdev_io->u.bdev.iovs = bdev_io->internal.bounce_buf.orig_iovs; 1736 1737 /* We don't set bdev_io->internal.f.has_bounce_buf to false here because 1738 * we still need to clear the md buf */ 1739 1740 bdev_io_push_bounce_md_buf(bdev_io); 1741 } 1742 1743 static void 1744 bdev_io_push_bounce_data_done_and_track(void *ctx, int status) 1745 { 1746 struct spdk_bdev_io *bdev_io = ctx; 1747 struct 
spdk_bdev_channel *ch = bdev_io->internal.ch; 1748 1749 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1750 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1751 1752 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1753 bdev_ch_retry_io(ch); 1754 } 1755 1756 bdev_io_push_bounce_data_done(bdev_io, status); 1757 } 1758 1759 static inline void 1760 bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io) 1761 { 1762 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1763 int rc = 0; 1764 1765 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1766 assert(!bdev_io_use_accel_sequence(bdev_io)); 1767 assert(bdev_io->internal.f.has_bounce_buf); 1768 1769 /* if this is read path, copy data from bounce buffer to original buffer */ 1770 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1771 if (bdev_io_use_memory_domain(bdev_io)) { 1772 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1773 bdev_io_increment_outstanding(ch, ch->shared_resource); 1774 /* If memory domain is used then we need to call async push function */ 1775 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1776 bdev_io->internal.memory_domain_ctx, 1777 bdev_io->internal.bounce_buf.orig_iovs, 1778 (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt, 1779 &bdev_io->internal.bounce_buf.iov, 1, 1780 bdev_io_push_bounce_data_done_and_track, 1781 bdev_io); 1782 if (rc == 0) { 1783 /* Continue IO completion in async callback */ 1784 return; 1785 } 1786 1787 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1788 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1789 if (rc != -ENOMEM) { 1790 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1791 spdk_memory_domain_get_dma_device_id( 1792 bdev_io->internal.memory_domain)); 1793 } 1794 } else { 1795 spdk_copy_buf_to_iovs(bdev_io->internal.bounce_buf.orig_iovs, 1796 bdev_io->internal.bounce_buf.orig_iovcnt, 1797 bdev_io->internal.bounce_buf.iov.iov_base, 1798 bdev_io->internal.bounce_buf.iov.iov_len); 1799 } 1800 } 1801 1802 if (spdk_unlikely(rc == -ENOMEM)) { 1803 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH); 1804 } else { 1805 bdev_io_push_bounce_data_done(bdev_io, rc); 1806 } 1807 } 1808 1809 static inline void 1810 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1811 { 1812 bdev_io->internal.data_transfer_cpl = cpl_cb; 1813 bdev_io_push_bounce_data(bdev_io); 1814 } 1815 1816 static void 1817 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1818 { 1819 struct spdk_bdev_io *bdev_io; 1820 1821 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1822 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf.len); 1823 } 1824 1825 static void 1826 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1827 { 1828 struct spdk_bdev_mgmt_channel *mgmt_ch; 1829 uint64_t max_len; 1830 void *buf; 1831 1832 assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1833 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1834 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1835 1836 if (spdk_unlikely(max_len > mgmt_ch->iobuf.cache[0].large.bufsize)) { 1837 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1838 bdev_io_get_buf_complete(bdev_io, false); 1839 return; 1840 } 1841 1842 bdev_io->internal.buf.len = len; 1843 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1844 bdev_io_get_iobuf_cb); 1845 if (buf != NULL) { 1846 
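/* spdk_iobuf_get() satisfied the request synchronously, so bind the buffer
 * to this I/O right away.  Otherwise bdev_io_get_iobuf_cb() is invoked later,
 * once a buffer of at least max_len becomes available.
 */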
_bdev_io_set_buf(bdev_io, buf, len); 1847 } 1848 } 1849 1850 void 1851 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1852 { 1853 struct spdk_bdev *bdev = bdev_io->bdev; 1854 uint64_t alignment; 1855 1856 assert(cb != NULL); 1857 bdev_io->internal.get_buf_cb = cb; 1858 1859 alignment = spdk_bdev_get_buf_align(bdev); 1860 1861 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1862 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1863 /* Buffer already present and aligned */ 1864 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1865 return; 1866 } 1867 1868 bdev_io_get_buf(bdev_io, len); 1869 } 1870 1871 static void 1872 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1873 bool success) 1874 { 1875 if (!success) { 1876 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1877 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1878 bdev_io_complete_unsubmitted(bdev_io); 1879 return; 1880 } 1881 1882 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 1883 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1884 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 1885 return; 1886 } 1887 /* For reads we'll execute the sequence after the data is read, so, for now, only 1888 * clear out accel_sequence pointer and submit the IO */ 1889 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1890 bdev_io->u.bdev.accel_sequence = NULL; 1891 } 1892 1893 bdev_io_submit(bdev_io); 1894 } 1895 1896 static void 1897 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1898 uint64_t len) 1899 { 1900 assert(cb != NULL); 1901 bdev_io->internal.get_buf_cb = cb; 1902 1903 bdev_io_get_buf(bdev_io, len); 1904 } 1905 1906 1907 SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_get_aux_buf, 1908 "spdk_bdev_io_get_aux_buf is deprecated", "v25.01", 0); 1909 1910 void 1911 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1912 { 1913 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1914 1915 SPDK_LOG_DEPRECATED(spdk_bdev_io_get_aux_buf); 1916 1917 assert(cb != NULL); 1918 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1919 bdev_io->internal.get_aux_buf_cb = cb; 1920 bdev_io_get_buf(bdev_io, len); 1921 } 1922 1923 static int 1924 bdev_module_get_max_ctx_size(void) 1925 { 1926 struct spdk_bdev_module *bdev_module; 1927 int max_bdev_module_size = 0; 1928 1929 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1930 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1931 max_bdev_module_size = bdev_module->get_ctx_size(); 1932 } 1933 } 1934 1935 return max_bdev_module_size; 1936 } 1937 1938 static void 1939 bdev_enable_histogram_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1940 { 1941 if (!bdev->internal.histogram_enabled) { 1942 return; 1943 } 1944 1945 spdk_json_write_object_begin(w); 1946 spdk_json_write_named_string(w, "method", "bdev_enable_histogram"); 1947 1948 spdk_json_write_named_object_begin(w, "params"); 1949 spdk_json_write_named_string(w, "name", bdev->name); 1950 1951 spdk_json_write_named_bool(w, "enable", bdev->internal.histogram_enabled); 1952 1953 if (bdev->internal.histogram_io_type) { 1954 spdk_json_write_named_string(w, "opc", 1955 spdk_bdev_get_io_type_name(bdev->internal.histogram_io_type)); 1956 } 1957 1958 spdk_json_write_object_end(w); 1959 1960 spdk_json_write_object_end(w); 1961 } 1962 
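/*
 * Illustrative example (not emitted verbatim by this file): for a hypothetical
 * bdev named "Nvme0n1" with a read-only histogram enabled and a 10000
 * rw_ios_per_sec QoS limit, the helper above and bdev_qos_config_json() below
 * would contribute JSON-RPC objects of roughly this shape to the subsystem
 * configuration array:
 *
 *   {"method": "bdev_enable_histogram",
 *    "params": {"name": "Nvme0n1", "enable": true, "opc": "read"}}
 *   {"method": "bdev_set_qos_limit",
 *    "params": {"name": "Nvme0n1", "rw_ios_per_sec": 10000}}
 */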
1963 static void 1964 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1965 { 1966 int i; 1967 struct spdk_bdev_qos *qos = bdev->internal.qos; 1968 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1969 1970 if (!qos) { 1971 return; 1972 } 1973 1974 spdk_bdev_get_qos_rate_limits(bdev, limits); 1975 1976 spdk_json_write_object_begin(w); 1977 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1978 1979 spdk_json_write_named_object_begin(w, "params"); 1980 spdk_json_write_named_string(w, "name", bdev->name); 1981 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1982 if (limits[i] > 0) { 1983 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1984 } 1985 } 1986 spdk_json_write_object_end(w); 1987 1988 spdk_json_write_object_end(w); 1989 } 1990 1991 void 1992 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1993 { 1994 struct spdk_bdev_module *bdev_module; 1995 struct spdk_bdev *bdev; 1996 1997 assert(w != NULL); 1998 1999 spdk_json_write_array_begin(w); 2000 2001 spdk_json_write_object_begin(w); 2002 spdk_json_write_named_string(w, "method", "bdev_set_options"); 2003 spdk_json_write_named_object_begin(w, "params"); 2004 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 2005 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 2006 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 2007 spdk_json_write_named_uint32(w, "iobuf_small_cache_size", g_bdev_opts.iobuf_small_cache_size); 2008 spdk_json_write_named_uint32(w, "iobuf_large_cache_size", g_bdev_opts.iobuf_large_cache_size); 2009 spdk_json_write_object_end(w); 2010 spdk_json_write_object_end(w); 2011 2012 bdev_examine_allowlist_config_json(w); 2013 2014 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2015 if (bdev_module->config_json) { 2016 bdev_module->config_json(w); 2017 } 2018 } 2019 2020 spdk_spin_lock(&g_bdev_mgr.spinlock); 2021 2022 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 2023 if (bdev->fn_table->write_config_json) { 2024 bdev->fn_table->write_config_json(bdev, w); 2025 } 2026 2027 bdev_qos_config_json(bdev, w); 2028 bdev_enable_histogram_config_json(bdev, w); 2029 } 2030 2031 spdk_spin_unlock(&g_bdev_mgr.spinlock); 2032 2033 /* This has to be last RPC in array to make sure all bdevs finished examine */ 2034 spdk_json_write_object_begin(w); 2035 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 2036 spdk_json_write_object_end(w); 2037 2038 spdk_json_write_array_end(w); 2039 } 2040 2041 static void 2042 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 2043 { 2044 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 2045 struct spdk_bdev_io *bdev_io; 2046 2047 spdk_iobuf_channel_fini(&ch->iobuf); 2048 2049 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 2050 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2051 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2052 ch->per_thread_cache_count--; 2053 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2054 } 2055 2056 assert(ch->per_thread_cache_count == 0); 2057 } 2058 2059 static int 2060 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 2061 { 2062 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 2063 struct spdk_bdev_io *bdev_io; 2064 uint32_t i; 2065 int rc; 2066 2067 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", 2068 g_bdev_opts.iobuf_small_cache_size, 2069 g_bdev_opts.iobuf_large_cache_size); 2070 if (rc != 0) { 2071 
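/* Without a per-thread iobuf channel this management channel cannot hand out
 * data buffers, so fail channel creation.  spdk_iobuf_channel_init() returns
 * a negative errno, hence the -rc below.
 */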
SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 2072 return -1; 2073 } 2074 2075 STAILQ_INIT(&ch->per_thread_cache); 2076 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 2077 2078 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 2079 ch->per_thread_cache_count = 0; 2080 for (i = 0; i < ch->bdev_io_cache_size; i++) { 2081 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2082 if (bdev_io == NULL) { 2083 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 2084 assert(false); 2085 bdev_mgmt_channel_destroy(io_device, ctx_buf); 2086 return -1; 2087 } 2088 ch->per_thread_cache_count++; 2089 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2090 } 2091 2092 TAILQ_INIT(&ch->shared_resources); 2093 TAILQ_INIT(&ch->io_wait_queue); 2094 2095 return 0; 2096 } 2097 2098 static void 2099 bdev_init_complete(int rc) 2100 { 2101 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 2102 void *cb_arg = g_init_cb_arg; 2103 struct spdk_bdev_module *m; 2104 2105 g_bdev_mgr.init_complete = true; 2106 g_init_cb_fn = NULL; 2107 g_init_cb_arg = NULL; 2108 2109 /* 2110 * For modules that need to know when subsystem init is complete, 2111 * inform them now. 2112 */ 2113 if (rc == 0) { 2114 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2115 if (m->init_complete) { 2116 m->init_complete(); 2117 } 2118 } 2119 } 2120 2121 cb_fn(cb_arg, rc); 2122 } 2123 2124 static bool 2125 bdev_module_all_actions_completed(void) 2126 { 2127 struct spdk_bdev_module *m; 2128 2129 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2130 if (m->internal.action_in_progress > 0) { 2131 return false; 2132 } 2133 } 2134 return true; 2135 } 2136 2137 static void 2138 bdev_module_action_complete(void) 2139 { 2140 /* 2141 * Don't finish bdev subsystem initialization if 2142 * module pre-initialization is still in progress, or 2143 * the subsystem been already initialized. 2144 */ 2145 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 2146 return; 2147 } 2148 2149 /* 2150 * Check all bdev modules for inits/examinations in progress. If any 2151 * exist, return immediately since we cannot finish bdev subsystem 2152 * initialization until all are completed. 2153 */ 2154 if (!bdev_module_all_actions_completed()) { 2155 return; 2156 } 2157 2158 /* 2159 * Modules already finished initialization - now that all 2160 * the bdev modules have finished their asynchronous I/O 2161 * processing, the entire bdev layer can be marked as complete. 
2162 */ 2163 bdev_init_complete(0); 2164 } 2165 2166 static void 2167 bdev_module_action_done(struct spdk_bdev_module *module) 2168 { 2169 spdk_spin_lock(&module->internal.spinlock); 2170 assert(module->internal.action_in_progress > 0); 2171 module->internal.action_in_progress--; 2172 spdk_spin_unlock(&module->internal.spinlock); 2173 bdev_module_action_complete(); 2174 } 2175 2176 void 2177 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 2178 { 2179 assert(module->async_init); 2180 bdev_module_action_done(module); 2181 } 2182 2183 void 2184 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 2185 { 2186 bdev_module_action_done(module); 2187 } 2188 2189 /** The last initialized bdev module */ 2190 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 2191 2192 static void 2193 bdev_init_failed(void *cb_arg) 2194 { 2195 struct spdk_bdev_module *module = cb_arg; 2196 2197 spdk_spin_lock(&module->internal.spinlock); 2198 assert(module->internal.action_in_progress > 0); 2199 module->internal.action_in_progress--; 2200 spdk_spin_unlock(&module->internal.spinlock); 2201 bdev_init_complete(-1); 2202 } 2203 2204 static int 2205 bdev_modules_init(void) 2206 { 2207 struct spdk_bdev_module *module; 2208 int rc = 0; 2209 2210 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2211 g_resume_bdev_module = module; 2212 if (module->async_init) { 2213 spdk_spin_lock(&module->internal.spinlock); 2214 module->internal.action_in_progress = 1; 2215 spdk_spin_unlock(&module->internal.spinlock); 2216 } 2217 rc = module->module_init(); 2218 if (rc != 0) { 2219 /* Bump action_in_progress to prevent other modules from completion of modules_init 2220 * Send message to defer application shutdown until resources are cleaned up */ 2221 spdk_spin_lock(&module->internal.spinlock); 2222 module->internal.action_in_progress = 1; 2223 spdk_spin_unlock(&module->internal.spinlock); 2224 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 2225 return rc; 2226 } 2227 } 2228 2229 g_resume_bdev_module = NULL; 2230 return 0; 2231 } 2232 2233 void 2234 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 2235 { 2236 int rc = 0; 2237 char mempool_name[32]; 2238 2239 assert(cb_fn != NULL); 2240 2241 g_init_cb_fn = cb_fn; 2242 g_init_cb_arg = cb_arg; 2243 2244 spdk_notify_type_register("bdev_register"); 2245 spdk_notify_type_register("bdev_unregister"); 2246 2247 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2248 2249 rc = spdk_iobuf_register_module("bdev"); 2250 if (rc != 0) { 2251 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2252 bdev_init_complete(-1); 2253 return; 2254 } 2255 2256 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2257 g_bdev_opts.bdev_io_pool_size, 2258 sizeof(struct spdk_bdev_io) + 2259 bdev_module_get_max_ctx_size(), 2260 0, 2261 SPDK_ENV_NUMA_ID_ANY); 2262 2263 if (g_bdev_mgr.bdev_io_pool == NULL) { 2264 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2265 bdev_init_complete(-1); 2266 return; 2267 } 2268 2269 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2270 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2271 if (!g_bdev_mgr.zero_buffer) { 2272 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2273 bdev_init_complete(-1); 2274 return; 2275 } 2276 2277 #ifdef SPDK_CONFIG_VTUNE 2278 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2279 #endif 2280 2281 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 2282 
bdev_mgmt_channel_destroy, 2283 sizeof(struct spdk_bdev_mgmt_channel), 2284 "bdev_mgr"); 2285 2286 rc = bdev_modules_init(); 2287 g_bdev_mgr.module_init_complete = true; 2288 if (rc != 0) { 2289 SPDK_ERRLOG("bdev modules init failed\n"); 2290 return; 2291 } 2292 2293 bdev_module_action_complete(); 2294 } 2295 2296 static void 2297 bdev_mgr_unregister_cb(void *io_device) 2298 { 2299 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2300 2301 if (g_bdev_mgr.bdev_io_pool) { 2302 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2303 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2304 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2305 g_bdev_opts.bdev_io_pool_size); 2306 } 2307 2308 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2309 } 2310 2311 spdk_free(g_bdev_mgr.zero_buffer); 2312 2313 bdev_examine_allowlist_free(); 2314 2315 cb_fn(g_fini_cb_arg); 2316 g_fini_cb_fn = NULL; 2317 g_fini_cb_arg = NULL; 2318 g_bdev_mgr.init_complete = false; 2319 g_bdev_mgr.module_init_complete = false; 2320 } 2321 2322 static void 2323 bdev_module_fini_iter(void *arg) 2324 { 2325 struct spdk_bdev_module *bdev_module; 2326 2327 /* FIXME: Handling initialization failures is broken now, 2328 * so we won't even try cleaning up after successfully 2329 * initialized modules. if module_init_complete is false, 2330 * just call spdk_bdev_mgr_unregister_cb 2331 */ 2332 if (!g_bdev_mgr.module_init_complete) { 2333 bdev_mgr_unregister_cb(NULL); 2334 return; 2335 } 2336 2337 /* Start iterating from the last touched module */ 2338 if (!g_resume_bdev_module) { 2339 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2340 } else { 2341 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2342 internal.tailq); 2343 } 2344 2345 while (bdev_module) { 2346 if (bdev_module->async_fini) { 2347 /* Save our place so we can resume later. We must 2348 * save the variable here, before calling module_fini() 2349 * below, because in some cases the module may immediately 2350 * call spdk_bdev_module_fini_done() and re-enter 2351 * this function to continue iterating. */ 2352 g_resume_bdev_module = bdev_module; 2353 } 2354 2355 if (bdev_module->module_fini) { 2356 bdev_module->module_fini(); 2357 } 2358 2359 if (bdev_module->async_fini) { 2360 return; 2361 } 2362 2363 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2364 internal.tailq); 2365 } 2366 2367 g_resume_bdev_module = NULL; 2368 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2369 } 2370 2371 void 2372 spdk_bdev_module_fini_done(void) 2373 { 2374 if (spdk_get_thread() != g_fini_thread) { 2375 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2376 } else { 2377 bdev_module_fini_iter(NULL); 2378 } 2379 } 2380 2381 static void 2382 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2383 { 2384 struct spdk_bdev *bdev = cb_arg; 2385 2386 if (bdeverrno && bdev) { 2387 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2388 bdev->name); 2389 2390 /* 2391 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2392 * bdev; try to continue by manually removing this bdev from the list and continue 2393 * with the next bdev in the list. 
2394 */ 2395 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2396 } 2397 2398 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2399 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2400 /* 2401 * Bdev module finish need to be deferred as we might be in the middle of some context 2402 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2403 * after returning. 2404 */ 2405 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2406 return; 2407 } 2408 2409 /* 2410 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 2411 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2412 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2413 * base bdevs. 2414 * 2415 * Also, walk the list in the reverse order. 2416 */ 2417 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2418 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2419 spdk_spin_lock(&bdev->internal.spinlock); 2420 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2421 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2422 spdk_spin_unlock(&bdev->internal.spinlock); 2423 continue; 2424 } 2425 spdk_spin_unlock(&bdev->internal.spinlock); 2426 2427 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2428 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2429 return; 2430 } 2431 2432 /* 2433 * If any bdev fails to unclaim underlying bdev properly, we may face the 2434 * case of bdev list consisting of claimed bdevs only (if claims are managed 2435 * correctly, this would mean there's a loop in the claims graph which is 2436 * clearly impossible). Warn and unregister last bdev on the list then. 2437 */ 2438 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2439 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2440 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2441 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2442 return; 2443 } 2444 } 2445 2446 static void 2447 bdev_module_fini_start_iter(void *arg) 2448 { 2449 struct spdk_bdev_module *bdev_module; 2450 2451 if (!g_resume_bdev_module) { 2452 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2453 } else { 2454 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2455 } 2456 2457 while (bdev_module) { 2458 if (bdev_module->async_fini_start) { 2459 /* Save our place so we can resume later. We must 2460 * save the variable here, before calling fini_start() 2461 * below, because in some cases the module may immediately 2462 * call spdk_bdev_module_fini_start_done() and re-enter 2463 * this function to continue iterating. 
*/ 2464 g_resume_bdev_module = bdev_module; 2465 } 2466 2467 if (bdev_module->fini_start) { 2468 bdev_module->fini_start(); 2469 } 2470 2471 if (bdev_module->async_fini_start) { 2472 return; 2473 } 2474 2475 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2476 } 2477 2478 g_resume_bdev_module = NULL; 2479 2480 bdev_finish_unregister_bdevs_iter(NULL, 0); 2481 } 2482 2483 void 2484 spdk_bdev_module_fini_start_done(void) 2485 { 2486 if (spdk_get_thread() != g_fini_thread) { 2487 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2488 } else { 2489 bdev_module_fini_start_iter(NULL); 2490 } 2491 } 2492 2493 static void 2494 bdev_finish_wait_for_examine_done(void *cb_arg) 2495 { 2496 bdev_module_fini_start_iter(NULL); 2497 } 2498 2499 static void bdev_open_async_fini(void); 2500 2501 void 2502 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2503 { 2504 int rc; 2505 2506 assert(cb_fn != NULL); 2507 2508 g_fini_thread = spdk_get_thread(); 2509 2510 g_fini_cb_fn = cb_fn; 2511 g_fini_cb_arg = cb_arg; 2512 2513 bdev_open_async_fini(); 2514 2515 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2516 if (rc != 0) { 2517 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2518 bdev_finish_wait_for_examine_done(NULL); 2519 } 2520 } 2521 2522 struct spdk_bdev_io * 2523 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2524 { 2525 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2526 struct spdk_bdev_io *bdev_io; 2527 2528 if (ch->per_thread_cache_count > 0) { 2529 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2530 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2531 ch->per_thread_cache_count--; 2532 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2533 /* 2534 * Don't try to look for bdev_ios in the global pool if there are 2535 * waiters on bdev_ios - we don't want this caller to jump the line. 2536 */ 2537 bdev_io = NULL; 2538 } else { 2539 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2540 } 2541 2542 return bdev_io; 2543 } 2544 2545 void 2546 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2547 { 2548 struct spdk_bdev_mgmt_channel *ch; 2549 2550 assert(bdev_io != NULL); 2551 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2552 2553 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2554 2555 if (bdev_io->internal.f.has_buf) { 2556 bdev_io_put_buf(bdev_io); 2557 } 2558 2559 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2560 ch->per_thread_cache_count++; 2561 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2562 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2563 struct spdk_bdev_io_wait_entry *entry; 2564 2565 entry = TAILQ_FIRST(&ch->io_wait_queue); 2566 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2567 entry->cb_fn(entry->cb_arg); 2568 } 2569 } else { 2570 /* We should never have a full cache with entries on the io wait queue. 
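 * Waiters are serviced from the per-thread cache each time an IO is freed
 * into a non-full cache (see the loop above), so by the time the cache fills
 * up the wait queue should already have been drained.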
*/ 2571 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2572 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2573 } 2574 } 2575 2576 static bool 2577 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2578 { 2579 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2580 2581 switch (limit) { 2582 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2583 return true; 2584 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2585 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2586 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2587 return false; 2588 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2589 default: 2590 return false; 2591 } 2592 } 2593 2594 static bool 2595 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2596 { 2597 switch (bdev_io->type) { 2598 case SPDK_BDEV_IO_TYPE_NVME_IO: 2599 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2600 case SPDK_BDEV_IO_TYPE_READ: 2601 case SPDK_BDEV_IO_TYPE_WRITE: 2602 return true; 2603 case SPDK_BDEV_IO_TYPE_ZCOPY: 2604 if (bdev_io->u.bdev.zcopy.start) { 2605 return true; 2606 } else { 2607 return false; 2608 } 2609 default: 2610 return false; 2611 } 2612 } 2613 2614 static bool 2615 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2616 { 2617 switch (bdev_io->type) { 2618 case SPDK_BDEV_IO_TYPE_NVME_IO: 2619 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2620 /* Bit 1 (0x2) set for read operation */ 2621 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2622 return true; 2623 } else { 2624 return false; 2625 } 2626 case SPDK_BDEV_IO_TYPE_READ: 2627 return true; 2628 case SPDK_BDEV_IO_TYPE_ZCOPY: 2629 /* Populate to read from disk */ 2630 if (bdev_io->u.bdev.zcopy.populate) { 2631 return true; 2632 } else { 2633 return false; 2634 } 2635 default: 2636 return false; 2637 } 2638 } 2639 2640 static uint64_t 2641 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2642 { 2643 struct spdk_bdev *bdev = bdev_io->bdev; 2644 2645 switch (bdev_io->type) { 2646 case SPDK_BDEV_IO_TYPE_NVME_IO: 2647 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2648 return bdev_io->u.nvme_passthru.nbytes; 2649 case SPDK_BDEV_IO_TYPE_READ: 2650 case SPDK_BDEV_IO_TYPE_WRITE: 2651 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2652 case SPDK_BDEV_IO_TYPE_ZCOPY: 2653 /* Track the data in the start phase only */ 2654 if (bdev_io->u.bdev.zcopy.start) { 2655 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2656 } else { 2657 return 0; 2658 } 2659 default: 2660 return 0; 2661 } 2662 } 2663 2664 static inline bool 2665 bdev_qos_rw_queue_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2666 { 2667 int64_t remaining_this_timeslice; 2668 2669 if (!limit->max_per_timeslice) { 2670 /* The QoS is disabled */ 2671 return false; 2672 } 2673 2674 remaining_this_timeslice = __atomic_sub_fetch(&limit->remaining_this_timeslice, delta, 2675 __ATOMIC_RELAXED); 2676 if (remaining_this_timeslice + (int64_t)delta > 0) { 2677 /* There was still a quota for this delta -> the IO shouldn't be queued 2678 * 2679 * We allow a slight quota overrun here so an IO bigger than the per-timeslice 2680 * quota can be allowed once a while. Such overrun then taken into account in 2681 * the QoS poller, where the next timeslice quota is calculated. 2682 */ 2683 return false; 2684 } 2685 2686 /* There was no quota for this delta -> the IO should be queued 2687 * The remaining_this_timeslice must be rewinded so it reflects the real 2688 * amount of IOs or bytes allowed. 
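 * For example (hypothetical numbers): with 100 bytes of quota left, a
 * 4096-byte I/O is still allowed and drives remaining_this_timeslice down to
 * -3996; the overrun is settled when the next timeslice quota is credited.
 * A second 4096-byte I/O then finds no quota left, its subtraction is rewound
 * and the I/O is queued.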
2689 */ 2690 __atomic_add_fetch( 2691 &limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2692 return true; 2693 } 2694 2695 static inline void 2696 bdev_qos_rw_rewind_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2697 { 2698 __atomic_add_fetch(&limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2699 } 2700 2701 static bool 2702 bdev_qos_rw_iops_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2703 { 2704 return bdev_qos_rw_queue_io(limit, io, 1); 2705 } 2706 2707 static void 2708 bdev_qos_rw_iops_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2709 { 2710 bdev_qos_rw_rewind_io(limit, io, 1); 2711 } 2712 2713 static bool 2714 bdev_qos_rw_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2715 { 2716 return bdev_qos_rw_queue_io(limit, io, bdev_get_io_size_in_byte(io)); 2717 } 2718 2719 static void 2720 bdev_qos_rw_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2721 { 2722 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2723 } 2724 2725 static bool 2726 bdev_qos_r_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2727 { 2728 if (bdev_is_read_io(io) == false) { 2729 return false; 2730 } 2731 2732 return bdev_qos_rw_bps_queue(limit, io); 2733 } 2734 2735 static void 2736 bdev_qos_r_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2737 { 2738 if (bdev_is_read_io(io) != false) { 2739 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2740 } 2741 } 2742 2743 static bool 2744 bdev_qos_w_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2745 { 2746 if (bdev_is_read_io(io) == true) { 2747 return false; 2748 } 2749 2750 return bdev_qos_rw_bps_queue(limit, io); 2751 } 2752 2753 static void 2754 bdev_qos_w_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2755 { 2756 if (bdev_is_read_io(io) != true) { 2757 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2758 } 2759 } 2760 2761 static void 2762 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2763 { 2764 int i; 2765 2766 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2767 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2768 qos->rate_limits[i].queue_io = NULL; 2769 continue; 2770 } 2771 2772 switch (i) { 2773 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2774 qos->rate_limits[i].queue_io = bdev_qos_rw_iops_queue; 2775 qos->rate_limits[i].rewind_quota = bdev_qos_rw_iops_rewind_quota; 2776 break; 2777 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2778 qos->rate_limits[i].queue_io = bdev_qos_rw_bps_queue; 2779 qos->rate_limits[i].rewind_quota = bdev_qos_rw_bps_rewind_quota; 2780 break; 2781 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2782 qos->rate_limits[i].queue_io = bdev_qos_r_bps_queue; 2783 qos->rate_limits[i].rewind_quota = bdev_qos_r_bps_rewind_quota; 2784 break; 2785 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2786 qos->rate_limits[i].queue_io = bdev_qos_w_bps_queue; 2787 qos->rate_limits[i].rewind_quota = bdev_qos_w_bps_rewind_quota; 2788 break; 2789 default: 2790 break; 2791 } 2792 } 2793 } 2794 2795 static void 2796 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2797 struct spdk_bdev_io *bdev_io, 2798 enum spdk_bdev_io_status status) 2799 { 2800 bdev_io->internal.f.in_submit_request = true; 2801 bdev_io_increment_outstanding(bdev_ch, bdev_ch->shared_resource); 2802 spdk_bdev_io_complete(bdev_io, status); 2803 bdev_io->internal.f.in_submit_request = false; 
2804 } 2805 2806 static inline void 2807 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2808 { 2809 struct spdk_bdev *bdev = bdev_io->bdev; 2810 struct spdk_io_channel *ch = bdev_ch->channel; 2811 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2812 2813 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2814 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2815 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2816 2817 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2818 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2819 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2820 SPDK_BDEV_IO_STATUS_SUCCESS); 2821 return; 2822 } 2823 } 2824 2825 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2826 bdev_io->bdev->split_on_write_unit && 2827 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2828 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2829 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2830 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2831 return; 2832 } 2833 2834 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2835 bdev_io_increment_outstanding(bdev_ch, shared_resource); 2836 bdev_io->internal.f.in_submit_request = true; 2837 bdev_submit_request(bdev, ch, bdev_io); 2838 bdev_io->internal.f.in_submit_request = false; 2839 } else { 2840 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT); 2841 if (shared_resource->nomem_threshold == 0 && shared_resource->io_outstanding == 0) { 2842 /* Special case when we have nomem IOs and no outstanding IOs which completions 2843 * could trigger retry of queued IOs */ 2844 bdev_shared_ch_retry_io(shared_resource); 2845 } 2846 } 2847 } 2848 2849 static bool 2850 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2851 { 2852 int i; 2853 2854 if (bdev_qos_io_to_limit(bdev_io) == true) { 2855 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2856 if (!qos->rate_limits[i].queue_io) { 2857 continue; 2858 } 2859 2860 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2861 bdev_io) == true) { 2862 for (i -= 1; i >= 0 ; i--) { 2863 if (!qos->rate_limits[i].queue_io) { 2864 continue; 2865 } 2866 2867 qos->rate_limits[i].rewind_quota(&qos->rate_limits[i], bdev_io); 2868 } 2869 return true; 2870 } 2871 } 2872 } 2873 2874 return false; 2875 } 2876 2877 static int 2878 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2879 { 2880 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2881 int submitted_ios = 0; 2882 2883 TAILQ_FOREACH_SAFE(bdev_io, &ch->qos_queued_io, internal.link, tmp) { 2884 if (!bdev_qos_queue_io(qos, bdev_io)) { 2885 TAILQ_REMOVE(&ch->qos_queued_io, bdev_io, internal.link); 2886 bdev_io_do_submit(ch, bdev_io); 2887 2888 submitted_ios++; 2889 } 2890 } 2891 2892 return submitted_ios; 2893 } 2894 2895 static void 2896 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2897 { 2898 int rc; 2899 2900 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2901 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2902 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2903 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2904 &bdev_io->internal.waitq_entry); 2905 if (rc != 0) { 2906 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2907 bdev_io->internal.status = 
SPDK_BDEV_IO_STATUS_FAILED; 2908 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2909 } 2910 } 2911 2912 static bool 2913 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2914 { 2915 uint32_t io_boundary; 2916 struct spdk_bdev *bdev = bdev_io->bdev; 2917 uint32_t max_segment_size = bdev->max_segment_size; 2918 uint32_t max_size = bdev->max_rw_size; 2919 int max_segs = bdev->max_num_segments; 2920 2921 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2922 io_boundary = bdev->write_unit_size; 2923 } else if (bdev->split_on_optimal_io_boundary) { 2924 io_boundary = bdev->optimal_io_boundary; 2925 } else { 2926 io_boundary = 0; 2927 } 2928 2929 if (spdk_likely(!io_boundary && !max_segs && !max_segment_size && !max_size)) { 2930 return false; 2931 } 2932 2933 if (io_boundary) { 2934 uint64_t start_stripe, end_stripe; 2935 2936 start_stripe = bdev_io->u.bdev.offset_blocks; 2937 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2938 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 2939 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2940 start_stripe >>= spdk_u32log2(io_boundary); 2941 end_stripe >>= spdk_u32log2(io_boundary); 2942 } else { 2943 start_stripe /= io_boundary; 2944 end_stripe /= io_boundary; 2945 } 2946 2947 if (start_stripe != end_stripe) { 2948 return true; 2949 } 2950 } 2951 2952 if (max_segs) { 2953 if (bdev_io->u.bdev.iovcnt > max_segs) { 2954 return true; 2955 } 2956 } 2957 2958 if (max_segment_size) { 2959 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2960 if (bdev_io->u.bdev.iovs[i].iov_len > max_segment_size) { 2961 return true; 2962 } 2963 } 2964 } 2965 2966 if (max_size) { 2967 if (bdev_io->u.bdev.num_blocks > max_size) { 2968 return true; 2969 } 2970 } 2971 2972 return false; 2973 } 2974 2975 static bool 2976 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2977 { 2978 uint32_t num_unmap_segments; 2979 2980 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2981 return false; 2982 } 2983 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2984 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2985 return true; 2986 } 2987 2988 return false; 2989 } 2990 2991 static bool 2992 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2993 { 2994 if (!bdev_io->bdev->max_write_zeroes) { 2995 return false; 2996 } 2997 2998 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2999 return true; 3000 } 3001 3002 return false; 3003 } 3004 3005 static bool 3006 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 3007 { 3008 if (bdev_io->bdev->max_copy != 0 && 3009 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 3010 return true; 3011 } 3012 3013 return false; 3014 } 3015 3016 static bool 3017 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 3018 { 3019 switch (bdev_io->type) { 3020 case SPDK_BDEV_IO_TYPE_READ: 3021 case SPDK_BDEV_IO_TYPE_WRITE: 3022 return bdev_rw_should_split(bdev_io); 3023 case SPDK_BDEV_IO_TYPE_UNMAP: 3024 return bdev_unmap_should_split(bdev_io); 3025 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3026 return bdev_write_zeroes_should_split(bdev_io); 3027 case SPDK_BDEV_IO_TYPE_COPY: 3028 return bdev_copy_should_split(bdev_io); 3029 default: 3030 return false; 3031 } 3032 } 3033 3034 static uint32_t 3035 _to_next_boundary(uint64_t offset, uint32_t boundary) 3036 { 3037 return (boundary - (offset % boundary)); 3038 } 3039 3040 static void 
bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 3041 3042 static void _bdev_rw_split(void *_bdev_io); 3043 3044 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 3045 3046 static void 3047 _bdev_unmap_split(void *_bdev_io) 3048 { 3049 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 3050 } 3051 3052 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 3053 3054 static void 3055 _bdev_write_zeroes_split(void *_bdev_io) 3056 { 3057 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 3058 } 3059 3060 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 3061 3062 static void 3063 _bdev_copy_split(void *_bdev_io) 3064 { 3065 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 3066 } 3067 3068 static int 3069 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 3070 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 3071 { 3072 int rc; 3073 uint64_t current_offset, current_remaining, current_src_offset; 3074 spdk_bdev_io_wait_cb io_wait_fn; 3075 3076 current_offset = *offset; 3077 current_remaining = *remaining; 3078 3079 assert(bdev_io->internal.f.split); 3080 3081 bdev_io->internal.split.outstanding++; 3082 3083 io_wait_fn = _bdev_rw_split; 3084 switch (bdev_io->type) { 3085 case SPDK_BDEV_IO_TYPE_READ: 3086 assert(bdev_io->u.bdev.accel_sequence == NULL); 3087 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 3088 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3089 iov, iovcnt, md_buf, current_offset, 3090 num_blocks, 3091 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 3092 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL, 3093 NULL, 3094 bdev_io->u.bdev.dif_check_flags, 3095 bdev_io_split_done, bdev_io); 3096 break; 3097 case SPDK_BDEV_IO_TYPE_WRITE: 3098 assert(bdev_io->u.bdev.accel_sequence == NULL); 3099 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 3100 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3101 iov, iovcnt, md_buf, current_offset, 3102 num_blocks, 3103 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 3104 bdev_io_use_memory_domain(bdev_io) ? 
bdev_io->internal.memory_domain_ctx : NULL, 3105 NULL, 3106 bdev_io->u.bdev.dif_check_flags, 3107 bdev_io->u.bdev.nvme_cdw12.raw, 3108 bdev_io->u.bdev.nvme_cdw13.raw, 3109 bdev_io_split_done, bdev_io); 3110 break; 3111 case SPDK_BDEV_IO_TYPE_UNMAP: 3112 io_wait_fn = _bdev_unmap_split; 3113 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 3114 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3115 current_offset, num_blocks, 3116 bdev_io_split_done, bdev_io); 3117 break; 3118 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3119 io_wait_fn = _bdev_write_zeroes_split; 3120 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 3121 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3122 current_offset, num_blocks, 3123 bdev_io_split_done, bdev_io); 3124 break; 3125 case SPDK_BDEV_IO_TYPE_COPY: 3126 io_wait_fn = _bdev_copy_split; 3127 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 3128 (current_offset - bdev_io->u.bdev.offset_blocks); 3129 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 3130 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3131 current_offset, current_src_offset, num_blocks, 3132 bdev_io_split_done, bdev_io); 3133 break; 3134 default: 3135 assert(false); 3136 rc = -EINVAL; 3137 break; 3138 } 3139 3140 if (rc == 0) { 3141 current_offset += num_blocks; 3142 current_remaining -= num_blocks; 3143 bdev_io->internal.split.current_offset_blocks = current_offset; 3144 bdev_io->internal.split.remaining_num_blocks = current_remaining; 3145 *offset = current_offset; 3146 *remaining = current_remaining; 3147 } else { 3148 bdev_io->internal.split.outstanding--; 3149 if (rc == -ENOMEM) { 3150 if (bdev_io->internal.split.outstanding == 0) { 3151 /* No I/O is outstanding. Hence we should wait here. */ 3152 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 3153 } 3154 } else { 3155 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3156 if (bdev_io->internal.split.outstanding == 0) { 3157 bdev_ch_remove_from_io_submitted(bdev_io); 3158 spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id, 3159 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx, 3160 bdev_io->internal.ch->queue_depth); 3161 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3162 } 3163 } 3164 } 3165 3166 return rc; 3167 } 3168 3169 static void 3170 _bdev_rw_split(void *_bdev_io) 3171 { 3172 struct iovec *parent_iov, *iov; 3173 struct spdk_bdev_io *bdev_io = _bdev_io; 3174 struct spdk_bdev *bdev = bdev_io->bdev; 3175 uint64_t parent_offset, current_offset, remaining; 3176 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 3177 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 3178 uint32_t iovcnt, iov_len, child_iovsize; 3179 uint32_t blocklen = bdev->blocklen; 3180 uint32_t io_boundary; 3181 uint32_t max_segment_size = bdev->max_segment_size; 3182 uint32_t max_child_iovcnt = bdev->max_num_segments; 3183 uint32_t max_size = bdev->max_rw_size; 3184 void *md_buf = NULL; 3185 int rc; 3186 3187 max_size = max_size ? max_size : UINT32_MAX; 3188 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 3189 max_child_iovcnt = max_child_iovcnt ? 
spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 3190 SPDK_BDEV_IO_NUM_CHILD_IOV; 3191 3192 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 3193 io_boundary = bdev->write_unit_size; 3194 } else if (bdev->split_on_optimal_io_boundary) { 3195 io_boundary = bdev->optimal_io_boundary; 3196 } else { 3197 io_boundary = UINT32_MAX; 3198 } 3199 3200 assert(bdev_io->internal.f.split); 3201 3202 remaining = bdev_io->internal.split.remaining_num_blocks; 3203 current_offset = bdev_io->internal.split.current_offset_blocks; 3204 parent_offset = bdev_io->u.bdev.offset_blocks; 3205 parent_iov_offset = (current_offset - parent_offset) * blocklen; 3206 parent_iovcnt = bdev_io->u.bdev.iovcnt; 3207 3208 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 3209 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3210 if (parent_iov_offset < parent_iov->iov_len) { 3211 break; 3212 } 3213 parent_iov_offset -= parent_iov->iov_len; 3214 } 3215 3216 child_iovcnt = 0; 3217 while (remaining > 0 && parent_iovpos < parent_iovcnt && 3218 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 3219 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3220 to_next_boundary = spdk_min(remaining, to_next_boundary); 3221 to_next_boundary = spdk_min(max_size, to_next_boundary); 3222 to_next_boundary_bytes = to_next_boundary * blocklen; 3223 3224 iov = &bdev_io->child_iov[child_iovcnt]; 3225 iovcnt = 0; 3226 3227 if (bdev_io->u.bdev.md_buf) { 3228 md_buf = (char *)bdev_io->u.bdev.md_buf + 3229 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3230 } 3231 3232 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3233 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3234 iovcnt < child_iovsize) { 3235 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3236 iov_len = parent_iov->iov_len - parent_iov_offset; 3237 3238 iov_len = spdk_min(iov_len, max_segment_size); 3239 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 3240 to_next_boundary_bytes -= iov_len; 3241 3242 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 3243 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 3244 3245 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 3246 parent_iov_offset += iov_len; 3247 } else { 3248 parent_iovpos++; 3249 parent_iov_offset = 0; 3250 } 3251 child_iovcnt++; 3252 iovcnt++; 3253 } 3254 3255 if (to_next_boundary_bytes > 0) { 3256 /* We had to stop this child I/O early because we ran out of 3257 * child_iov space or were limited by max_num_segments. 3258 * Ensure the iovs to be aligned with block size and 3259 * then adjust to_next_boundary before starting the 3260 * child I/O. 
			 */
			assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV ||
			       iovcnt == child_iovsize);
			to_last_block_bytes = to_next_boundary_bytes % blocklen;
			if (to_last_block_bytes != 0) {
				uint32_t child_iovpos = child_iovcnt - 1;
				/* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV,
				 * so the loop will naturally end
				 */

				to_last_block_bytes = blocklen - to_last_block_bytes;
				to_next_boundary_bytes += to_last_block_bytes;
				while (to_last_block_bytes > 0 && iovcnt > 0) {
					iov_len = spdk_min(to_last_block_bytes,
							   bdev_io->child_iov[child_iovpos].iov_len);
					bdev_io->child_iov[child_iovpos].iov_len -= iov_len;
					if (bdev_io->child_iov[child_iovpos].iov_len == 0) {
						child_iovpos--;
						if (--iovcnt == 0) {
							/* If the child IO is less than a block size, just return.
							 * If the first child IO of any split round is less than
							 * a block size, exit with an error.
							 */
							if (bdev_io->internal.split.outstanding == 0) {
								SPDK_ERRLOG("The first child io was less than a block size\n");
								bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
								bdev_ch_remove_from_io_submitted(bdev_io);
								spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id,
										  0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx,
										  bdev_io->internal.ch->queue_depth);
								bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
							}

							return;
						}
					}

					to_last_block_bytes -= iov_len;

					if (parent_iov_offset == 0) {
						parent_iovpos--;
						parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len;
					}
					parent_iov_offset -= iov_len;
				}

				assert(to_last_block_bytes == 0);
			}
			to_next_boundary -= to_next_boundary_bytes / blocklen;
		}

		rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary,
					  &current_offset, &remaining);
		if (spdk_unlikely(rc)) {
			return;
		}
	}
}

static void
bdev_unmap_split(struct spdk_bdev_io *bdev_io)
{
	uint64_t offset, unmap_blocks, remaining, max_unmap_blocks;
	uint32_t num_children_reqs = 0;
	int rc;

	assert(bdev_io->internal.f.split);

	offset = bdev_io->internal.split.current_offset_blocks;
	remaining = bdev_io->internal.split.remaining_num_blocks;
	max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments;

	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
		unmap_blocks = spdk_min(remaining, max_unmap_blocks);

		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks,
					  &offset, &remaining);
		if (spdk_likely(rc == 0)) {
			num_children_reqs++;
		} else {
			return;
		}
	}
}

static void
bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io)
{
	uint64_t offset, write_zeroes_blocks, remaining;
	uint32_t num_children_reqs = 0;
	int rc;

	assert(bdev_io->internal.f.split);

	offset = bdev_io->internal.split.current_offset_blocks;
	remaining = bdev_io->internal.split.remaining_num_blocks;

	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
		write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes);

		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks,
					  &offset, &remaining);
		if (spdk_likely(rc == 0)) {
			num_children_reqs++;
		} else {
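			/* Child submission failed.  bdev_io_split_submit() has already arranged
			 * for a retry (or completed the parent I/O with an error), so stop
			 * issuing more children from this round.
			 */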
3366 return; 3367 } 3368 } 3369 } 3370 3371 static void 3372 bdev_copy_split(struct spdk_bdev_io *bdev_io) 3373 { 3374 uint64_t offset, copy_blocks, remaining; 3375 uint32_t num_children_reqs = 0; 3376 int rc; 3377 3378 assert(bdev_io->internal.f.split); 3379 3380 offset = bdev_io->internal.split.current_offset_blocks; 3381 remaining = bdev_io->internal.split.remaining_num_blocks; 3382 3383 assert(bdev_io->bdev->max_copy != 0); 3384 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3385 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3386 3387 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3388 &offset, &remaining); 3389 if (spdk_likely(rc == 0)) { 3390 num_children_reqs++; 3391 } else { 3392 return; 3393 } 3394 } 3395 } 3396 3397 static void 3398 parent_bdev_io_complete(void *ctx, int rc) 3399 { 3400 struct spdk_bdev_io *parent_io = ctx; 3401 3402 if (rc) { 3403 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3404 } 3405 3406 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3407 parent_io->internal.caller_ctx); 3408 } 3409 3410 static void 3411 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3412 { 3413 struct spdk_bdev_io *bdev_io = ctx; 3414 3415 /* u.bdev.accel_sequence should have already been cleared at this point */ 3416 assert(bdev_io->u.bdev.accel_sequence == NULL); 3417 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3418 bdev_io->internal.f.has_accel_sequence = false; 3419 3420 if (spdk_unlikely(status != 0)) { 3421 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3422 } 3423 3424 parent_bdev_io_complete(bdev_io, status); 3425 } 3426 3427 static void 3428 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3429 { 3430 struct spdk_bdev_io *parent_io = cb_arg; 3431 3432 spdk_bdev_free_io(bdev_io); 3433 3434 assert(parent_io->internal.f.split); 3435 3436 if (!success) { 3437 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3438 /* If any child I/O failed, stop further splitting process. */ 3439 parent_io->internal.split.current_offset_blocks += parent_io->internal.split.remaining_num_blocks; 3440 parent_io->internal.split.remaining_num_blocks = 0; 3441 } 3442 parent_io->internal.split.outstanding--; 3443 if (parent_io->internal.split.outstanding != 0) { 3444 return; 3445 } 3446 3447 /* 3448 * Parent I/O finishes when all blocks are consumed. 3449 */ 3450 if (parent_io->internal.split.remaining_num_blocks == 0) { 3451 assert(parent_io->internal.cb != bdev_io_split_done); 3452 bdev_ch_remove_from_io_submitted(parent_io); 3453 spdk_trace_record(TRACE_BDEV_IO_DONE, parent_io->internal.ch->trace_id, 3454 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx, 3455 parent_io->internal.ch->queue_depth); 3456 3457 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3458 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3459 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3460 return; 3461 } else if (parent_io->internal.f.has_bounce_buf && 3462 !bdev_io_use_accel_sequence(bdev_io)) { 3463 /* bdev IO will be completed in the callback */ 3464 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3465 return; 3466 } 3467 } 3468 3469 parent_bdev_io_complete(parent_io, 0); 3470 return; 3471 } 3472 3473 /* 3474 * Continue with the splitting process. This function will complete the parent I/O if the 3475 * splitting is done. 
3476 */ 3477 switch (parent_io->type) { 3478 case SPDK_BDEV_IO_TYPE_READ: 3479 case SPDK_BDEV_IO_TYPE_WRITE: 3480 _bdev_rw_split(parent_io); 3481 break; 3482 case SPDK_BDEV_IO_TYPE_UNMAP: 3483 bdev_unmap_split(parent_io); 3484 break; 3485 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3486 bdev_write_zeroes_split(parent_io); 3487 break; 3488 case SPDK_BDEV_IO_TYPE_COPY: 3489 bdev_copy_split(parent_io); 3490 break; 3491 default: 3492 assert(false); 3493 break; 3494 } 3495 } 3496 3497 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3498 bool success); 3499 3500 static void 3501 bdev_io_split(struct spdk_bdev_io *bdev_io) 3502 { 3503 assert(bdev_io_should_split(bdev_io)); 3504 assert(bdev_io->internal.f.split); 3505 3506 bdev_io->internal.split.current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3507 bdev_io->internal.split.remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3508 bdev_io->internal.split.outstanding = 0; 3509 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3510 3511 switch (bdev_io->type) { 3512 case SPDK_BDEV_IO_TYPE_READ: 3513 case SPDK_BDEV_IO_TYPE_WRITE: 3514 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3515 _bdev_rw_split(bdev_io); 3516 } else { 3517 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3518 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3519 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3520 } 3521 break; 3522 case SPDK_BDEV_IO_TYPE_UNMAP: 3523 bdev_unmap_split(bdev_io); 3524 break; 3525 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3526 bdev_write_zeroes_split(bdev_io); 3527 break; 3528 case SPDK_BDEV_IO_TYPE_COPY: 3529 bdev_copy_split(bdev_io); 3530 break; 3531 default: 3532 assert(false); 3533 break; 3534 } 3535 } 3536 3537 static void 3538 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3539 { 3540 if (!success) { 3541 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3542 return; 3543 } 3544 3545 _bdev_rw_split(bdev_io); 3546 } 3547 3548 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 3549 * be inlined, at least on some compilers. 
3550 */ 3551 static inline void 3552 _bdev_io_submit(void *ctx) 3553 { 3554 struct spdk_bdev_io *bdev_io = ctx; 3555 struct spdk_bdev *bdev = bdev_io->bdev; 3556 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3557 3558 if (spdk_likely(bdev_ch->flags == 0)) { 3559 bdev_io_do_submit(bdev_ch, bdev_io); 3560 return; 3561 } 3562 3563 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3564 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3565 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3566 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3567 bdev_abort_queued_io(&bdev_ch->qos_queued_io, bdev_io->u.abort.bio_to_abort)) { 3568 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3569 } else { 3570 TAILQ_INSERT_TAIL(&bdev_ch->qos_queued_io, bdev_io, internal.link); 3571 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3572 } 3573 } else { 3574 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3575 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3576 } 3577 } 3578 3579 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3580 3581 bool 3582 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3583 { 3584 if (range1->length == 0 || range2->length == 0) { 3585 return false; 3586 } 3587 3588 if (range1->offset + range1->length <= range2->offset) { 3589 return false; 3590 } 3591 3592 if (range2->offset + range2->length <= range1->offset) { 3593 return false; 3594 } 3595 3596 return true; 3597 } 3598 3599 static bool 3600 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3601 { 3602 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3603 struct lba_range r; 3604 3605 switch (bdev_io->type) { 3606 case SPDK_BDEV_IO_TYPE_NVME_IO: 3607 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3608 /* Don't try to decode the NVMe command - just assume worst-case and that 3609 * it overlaps a locked range. 3610 */ 3611 return true; 3612 case SPDK_BDEV_IO_TYPE_READ: 3613 if (!range->quiesce) { 3614 return false; 3615 } 3616 /* fallthrough */ 3617 case SPDK_BDEV_IO_TYPE_WRITE: 3618 case SPDK_BDEV_IO_TYPE_UNMAP: 3619 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3620 case SPDK_BDEV_IO_TYPE_ZCOPY: 3621 case SPDK_BDEV_IO_TYPE_COPY: 3622 r.offset = bdev_io->u.bdev.offset_blocks; 3623 r.length = bdev_io->u.bdev.num_blocks; 3624 if (!bdev_lba_range_overlapped(range, &r)) { 3625 /* This I/O doesn't overlap the specified LBA range. */ 3626 return false; 3627 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3628 /* This I/O overlaps, but the I/O is on the same channel that locked this 3629 * range, and the caller_ctx is the same as the locked_ctx. This means 3630 * that this I/O is associated with the lock, and is allowed to execute. 
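 *
 * Example (illustrative): a caller that locked blocks [offset, offset + length) from this
 * channel may keep issuing I/O to that range with the same caller_ctx and it is submitted
 * normally, while overlapping I/O from other channels or contexts is parked on ch->io_locked
 * in bdev_io_submit() until the range is unlocked.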
3631 */ 3632 return false; 3633 } else { 3634 return true; 3635 } 3636 default: 3637 return false; 3638 } 3639 } 3640 3641 void 3642 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3643 { 3644 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3645 3646 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3647 3648 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3649 struct lba_range *range; 3650 3651 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3652 if (bdev_io_range_is_locked(bdev_io, range)) { 3653 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3654 return; 3655 } 3656 } 3657 } 3658 3659 bdev_ch_add_to_io_submitted(bdev_io); 3660 3661 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3662 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 3663 ch->trace_id, bdev_io->u.bdev.num_blocks, 3664 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3665 bdev_io->u.bdev.offset_blocks, ch->queue_depth); 3666 3667 if (bdev_io->internal.f.split) { 3668 bdev_io_split(bdev_io); 3669 return; 3670 } 3671 3672 _bdev_io_submit(bdev_io); 3673 } 3674 3675 static inline void 3676 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3677 { 3678 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 3679 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3680 * For write operation we need to pull buffers from memory domain before submitting IO. 3681 * Once read operation completes, we need to use memory_domain push functionality to 3682 * update data in original memory domain IO buffer 3683 * This IO request will go through a regular IO flow, so clear memory domains pointers */ 3684 assert(bdev_io->internal.f.has_memory_domain); 3685 bdev_io->u.bdev.memory_domain = NULL; 3686 bdev_io->u.bdev.memory_domain_ctx = NULL; 3687 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3688 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3689 } 3690 3691 static inline void 3692 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3693 { 3694 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3695 bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io); 3696 3697 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3698 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3699 bdev_io_complete_unsubmitted(bdev_io); 3700 return; 3701 } 3702 3703 /* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does 3704 * support them, but we need to execute an accel sequence and the data buffer is from accel 3705 * memory domain (to avoid doing a push/pull from that domain). 
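 *
 * In short: memory-domain I/O is bounced through a locally allocated buffer either when the
 * bdev module cannot handle memory domains itself, or when an accel sequence must be executed
 * and the data already lives in the accel memory domain; otherwise the request is submitted
 * as-is.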
3706 */ 3707 if (bdev_io_use_memory_domain(bdev_io)) { 3708 if (!desc->memory_domains_supported || 3709 (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3710 _bdev_io_ext_use_bounce_buffer(bdev_io); 3711 return; 3712 } 3713 } 3714 3715 if (needs_exec) { 3716 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3717 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3718 return; 3719 } 3720 /* For reads we'll execute the sequence after the data is read, so, for now, only 3721 * clear out accel_sequence pointer and submit the IO */ 3722 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3723 bdev_io->u.bdev.accel_sequence = NULL; 3724 } 3725 3726 bdev_io_submit(bdev_io); 3727 } 3728 3729 static void 3730 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3731 { 3732 struct spdk_bdev *bdev = bdev_io->bdev; 3733 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3734 struct spdk_io_channel *ch = bdev_ch->channel; 3735 3736 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3737 3738 bdev_io->internal.f.in_submit_request = true; 3739 bdev_submit_request(bdev, ch, bdev_io); 3740 bdev_io->internal.f.in_submit_request = false; 3741 } 3742 3743 void 3744 bdev_io_init(struct spdk_bdev_io *bdev_io, 3745 struct spdk_bdev *bdev, void *cb_arg, 3746 spdk_bdev_io_completion_cb cb) 3747 { 3748 bdev_io->bdev = bdev; 3749 bdev_io->internal.f.raw = 0; 3750 bdev_io->internal.caller_ctx = cb_arg; 3751 bdev_io->internal.cb = cb; 3752 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3753 bdev_io->internal.f.in_submit_request = false; 3754 bdev_io->internal.error.nvme.cdw0 = 0; 3755 bdev_io->num_retries = 0; 3756 bdev_io->internal.get_buf_cb = NULL; 3757 bdev_io->internal.get_aux_buf_cb = NULL; 3758 bdev_io->internal.data_transfer_cpl = NULL; 3759 bdev_io->internal.f.split = bdev_io_should_split(bdev_io); 3760 } 3761 3762 static bool 3763 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3764 { 3765 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3766 } 3767 3768 bool 3769 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3770 { 3771 bool supported; 3772 3773 supported = bdev_io_type_supported(bdev, io_type); 3774 3775 if (!supported) { 3776 switch (io_type) { 3777 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3778 /* The bdev layer will emulate write zeroes as long as write is supported. 
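 * (The emulation path splits a WRITE_ZEROES request into regular WRITEs of an internally
 * allocated zero buffer, so callers may treat WRITE_ZEROES as available whenever WRITE is
 * reported as supported.)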
*/ 3779 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3780 break; 3781 default: 3782 break; 3783 } 3784 } 3785 3786 return supported; 3787 } 3788 3789 static const char *g_io_type_strings[] = { 3790 [SPDK_BDEV_IO_TYPE_READ] = "read", 3791 [SPDK_BDEV_IO_TYPE_WRITE] = "write", 3792 [SPDK_BDEV_IO_TYPE_UNMAP] = "unmap", 3793 [SPDK_BDEV_IO_TYPE_FLUSH] = "flush", 3794 [SPDK_BDEV_IO_TYPE_RESET] = "reset", 3795 [SPDK_BDEV_IO_TYPE_NVME_ADMIN] = "nvme_admin", 3796 [SPDK_BDEV_IO_TYPE_NVME_IO] = "nvme_io", 3797 [SPDK_BDEV_IO_TYPE_NVME_IO_MD] = "nvme_io_md", 3798 [SPDK_BDEV_IO_TYPE_WRITE_ZEROES] = "write_zeroes", 3799 [SPDK_BDEV_IO_TYPE_ZCOPY] = "zcopy", 3800 [SPDK_BDEV_IO_TYPE_GET_ZONE_INFO] = "get_zone_info", 3801 [SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT] = "zone_management", 3802 [SPDK_BDEV_IO_TYPE_ZONE_APPEND] = "zone_append", 3803 [SPDK_BDEV_IO_TYPE_COMPARE] = "compare", 3804 [SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE] = "compare_and_write", 3805 [SPDK_BDEV_IO_TYPE_ABORT] = "abort", 3806 [SPDK_BDEV_IO_TYPE_SEEK_HOLE] = "seek_hole", 3807 [SPDK_BDEV_IO_TYPE_SEEK_DATA] = "seek_data", 3808 [SPDK_BDEV_IO_TYPE_COPY] = "copy", 3809 [SPDK_BDEV_IO_TYPE_NVME_IOV_MD] = "nvme_iov_md", 3810 }; 3811 3812 const char * 3813 spdk_bdev_get_io_type_name(enum spdk_bdev_io_type io_type) 3814 { 3815 if (io_type <= SPDK_BDEV_IO_TYPE_INVALID || io_type >= SPDK_BDEV_NUM_IO_TYPES) { 3816 return NULL; 3817 } 3818 3819 return g_io_type_strings[io_type]; 3820 } 3821 3822 int 3823 spdk_bdev_get_io_type(const char *io_type_string) 3824 { 3825 int i; 3826 3827 for (i = SPDK_BDEV_IO_TYPE_READ; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 3828 if (!strcmp(io_type_string, g_io_type_strings[i])) { 3829 return i; 3830 } 3831 } 3832 3833 return -1; 3834 } 3835 3836 uint64_t 3837 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3838 { 3839 return bdev_io->internal.submit_tsc; 3840 } 3841 3842 int 3843 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3844 { 3845 if (bdev->fn_table->dump_info_json) { 3846 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3847 } 3848 3849 return 0; 3850 } 3851 3852 static void 3853 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3854 { 3855 uint32_t max_per_timeslice = 0; 3856 int i; 3857 3858 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3859 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3860 qos->rate_limits[i].max_per_timeslice = 0; 3861 continue; 3862 } 3863 3864 max_per_timeslice = qos->rate_limits[i].limit * 3865 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3866 3867 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3868 qos->rate_limits[i].min_per_timeslice); 3869 3870 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 3871 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELEASE); 3872 } 3873 3874 bdev_qos_set_ops(qos); 3875 } 3876 3877 static void 3878 bdev_channel_submit_qos_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3879 struct spdk_io_channel *io_ch, void *ctx) 3880 { 3881 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3882 int status; 3883 3884 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3885 3886 /* if all IOs were sent then continue the iteration, otherwise - stop it */ 3887 /* TODO: channels round robing */ 3888 status = TAILQ_EMPTY(&bdev_ch->qos_queued_io) ? 
0 : 1;
3889
3890 	spdk_bdev_for_each_channel_continue(i, status);
3891 }
3892
3893
3894 static void
3895 bdev_channel_submit_qos_io_done(struct spdk_bdev *bdev, void *ctx, int status)
3896 {
3897
3898 }
3899
3900 static int
3901 bdev_channel_poll_qos(void *arg)
3902 {
3903 	struct spdk_bdev *bdev = arg;
3904 	struct spdk_bdev_qos *qos = bdev->internal.qos;
3905 	uint64_t now = spdk_get_ticks();
3906 	int i;
3907 	int64_t remaining_last_timeslice;
3908
3909 	if (spdk_unlikely(qos->thread == NULL)) {
3910 		/* The old QoS was unbound so that it can be removed, and the new QoS is not enabled yet. */
3911 		return SPDK_POLLER_IDLE;
3912 	}
3913
3914 	if (now < (qos->last_timeslice + qos->timeslice_size)) {
3915 		/* We received our callback earlier than expected - return
3916 		 * immediately and wait to do accounting until at least one
3917 		 * timeslice has actually expired. This should never happen
3918 		 * with a well-behaved timer implementation.
3919 		 */
3920 		return SPDK_POLLER_IDLE;
3921 	}
3922
3923 	/* Reset for the next round of rate limiting */
3924 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3925 		/* We may have allowed the IOs or bytes to slightly overrun in the last
3926 		 * timeslice. remaining_this_timeslice is signed, so if it's negative
3927 		 * here, we'll account for the overrun so that the next timeslice will
3928 		 * be appropriately reduced.
3929 		 */
3930 		remaining_last_timeslice = __atomic_exchange_n(&qos->rate_limits[i].remaining_this_timeslice,
3931 				0, __ATOMIC_RELAXED);
3932 		if (remaining_last_timeslice < 0) {
3933 			/* There could be a race here, since both bdev_qos_rw_queue_io() and bdev_channel_poll_qos()
3934 			 * may use two atomic ops each, so their updates can interleave.
3935 			 * This race can make the limits a little fuzzy but won't cause any real damage.
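			 *
			 * Illustrative interleaving: the submit path and this poller each update
			 * remaining_this_timeslice with separate atomic operations rather than one
			 * read-modify-write of the whole state, so a submit-side update that lands
			 * between the exchange above and the store below can be overwritten. At worst a
			 * single timeslice gains or loses a little budget; the configured limit still
			 * holds on average across timeslices.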
3936 */ 3937 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 3938 remaining_last_timeslice, __ATOMIC_RELAXED); 3939 } 3940 } 3941 3942 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3943 qos->last_timeslice += qos->timeslice_size; 3944 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3945 __atomic_add_fetch(&qos->rate_limits[i].remaining_this_timeslice, 3946 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELAXED); 3947 } 3948 } 3949 3950 spdk_bdev_for_each_channel(bdev, bdev_channel_submit_qos_io, qos, 3951 bdev_channel_submit_qos_io_done); 3952 3953 return SPDK_POLLER_BUSY; 3954 } 3955 3956 static void 3957 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3958 { 3959 struct spdk_bdev_shared_resource *shared_resource; 3960 struct lba_range *range; 3961 3962 bdev_free_io_stat(ch->stat); 3963 #ifdef SPDK_CONFIG_VTUNE 3964 bdev_free_io_stat(ch->prev_stat); 3965 #endif 3966 3967 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3968 range = TAILQ_FIRST(&ch->locked_ranges); 3969 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3970 free(range); 3971 } 3972 3973 spdk_put_io_channel(ch->channel); 3974 spdk_put_io_channel(ch->accel_channel); 3975 3976 shared_resource = ch->shared_resource; 3977 3978 assert(TAILQ_EMPTY(&ch->io_locked)); 3979 assert(TAILQ_EMPTY(&ch->io_submitted)); 3980 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 3981 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 3982 assert(ch->io_outstanding == 0); 3983 assert(shared_resource->ref > 0); 3984 shared_resource->ref--; 3985 if (shared_resource->ref == 0) { 3986 assert(shared_resource->io_outstanding == 0); 3987 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3988 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3989 spdk_poller_unregister(&shared_resource->nomem_poller); 3990 free(shared_resource); 3991 } 3992 } 3993 3994 static void 3995 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3996 { 3997 struct spdk_bdev_qos *qos = bdev->internal.qos; 3998 int i; 3999 4000 assert(spdk_spin_held(&bdev->internal.spinlock)); 4001 4002 /* Rate limiting on this bdev enabled */ 4003 if (qos) { 4004 if (qos->ch == NULL) { 4005 struct spdk_io_channel *io_ch; 4006 4007 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 4008 bdev->name, spdk_get_thread()); 4009 4010 /* No qos channel has been selected, so set one up */ 4011 4012 /* Take another reference to ch */ 4013 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 4014 assert(io_ch != NULL); 4015 qos->ch = ch; 4016 4017 qos->thread = spdk_io_channel_get_thread(io_ch); 4018 4019 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4020 if (bdev_qos_is_iops_rate_limit(i) == true) { 4021 qos->rate_limits[i].min_per_timeslice = 4022 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 4023 } else { 4024 qos->rate_limits[i].min_per_timeslice = 4025 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 4026 } 4027 4028 if (qos->rate_limits[i].limit == 0) { 4029 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 4030 } 4031 } 4032 bdev_qos_update_max_quota_per_timeslice(qos); 4033 qos->timeslice_size = 4034 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 4035 qos->last_timeslice = spdk_get_ticks(); 4036 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 4037 bdev, 4038 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 4039 } 4040 4041 ch->flags |= BDEV_CH_QOS_ENABLED; 4042 } 4043 } 4044 4045 struct poll_timeout_ctx { 4046 struct spdk_bdev_desc 
*desc; 4047 uint64_t timeout_in_sec; 4048 spdk_bdev_io_timeout_cb cb_fn; 4049 void *cb_arg; 4050 }; 4051 4052 static void 4053 bdev_desc_free(struct spdk_bdev_desc *desc) 4054 { 4055 spdk_spin_destroy(&desc->spinlock); 4056 free(desc->media_events_buffer); 4057 free(desc); 4058 } 4059 4060 static void 4061 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 4062 { 4063 struct poll_timeout_ctx *ctx = _ctx; 4064 struct spdk_bdev_desc *desc = ctx->desc; 4065 4066 free(ctx); 4067 4068 spdk_spin_lock(&desc->spinlock); 4069 desc->refs--; 4070 if (desc->closed == true && desc->refs == 0) { 4071 spdk_spin_unlock(&desc->spinlock); 4072 bdev_desc_free(desc); 4073 return; 4074 } 4075 spdk_spin_unlock(&desc->spinlock); 4076 } 4077 4078 static void 4079 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4080 struct spdk_io_channel *io_ch, void *_ctx) 4081 { 4082 struct poll_timeout_ctx *ctx = _ctx; 4083 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4084 struct spdk_bdev_desc *desc = ctx->desc; 4085 struct spdk_bdev_io *bdev_io; 4086 uint64_t now; 4087 4088 spdk_spin_lock(&desc->spinlock); 4089 if (desc->closed == true) { 4090 spdk_spin_unlock(&desc->spinlock); 4091 spdk_bdev_for_each_channel_continue(i, -1); 4092 return; 4093 } 4094 spdk_spin_unlock(&desc->spinlock); 4095 4096 now = spdk_get_ticks(); 4097 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 4098 /* Exclude any I/O that are generated via splitting. */ 4099 if (bdev_io->internal.cb == bdev_io_split_done) { 4100 continue; 4101 } 4102 4103 /* Once we find an I/O that has not timed out, we can immediately 4104 * exit the loop. 4105 */ 4106 if (now < (bdev_io->internal.submit_tsc + 4107 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 4108 goto end; 4109 } 4110 4111 if (bdev_io->internal.desc == desc) { 4112 ctx->cb_fn(ctx->cb_arg, bdev_io); 4113 } 4114 } 4115 4116 end: 4117 spdk_bdev_for_each_channel_continue(i, 0); 4118 } 4119 4120 static int 4121 bdev_poll_timeout_io(void *arg) 4122 { 4123 struct spdk_bdev_desc *desc = arg; 4124 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4125 struct poll_timeout_ctx *ctx; 4126 4127 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 4128 if (!ctx) { 4129 SPDK_ERRLOG("failed to allocate memory\n"); 4130 return SPDK_POLLER_BUSY; 4131 } 4132 ctx->desc = desc; 4133 ctx->cb_arg = desc->cb_arg; 4134 ctx->cb_fn = desc->cb_fn; 4135 ctx->timeout_in_sec = desc->timeout_in_sec; 4136 4137 /* Take a ref on the descriptor in case it gets closed while we are checking 4138 * all of the channels. 
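 * The matching deref happens in bdev_channel_poll_timeout_io_done(): if the descriptor was
 * closed while the per-channel walk was in flight and this was the last reference, that
 * completion callback frees it via bdev_desc_free().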
4139 */ 4140 spdk_spin_lock(&desc->spinlock); 4141 desc->refs++; 4142 spdk_spin_unlock(&desc->spinlock); 4143 4144 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 4145 bdev_channel_poll_timeout_io_done); 4146 4147 return SPDK_POLLER_BUSY; 4148 } 4149 4150 int 4151 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 4152 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 4153 { 4154 assert(desc->thread == spdk_get_thread()); 4155 4156 spdk_poller_unregister(&desc->io_timeout_poller); 4157 4158 if (timeout_in_sec) { 4159 assert(cb_fn != NULL); 4160 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 4161 desc, 4162 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 4163 1000); 4164 if (desc->io_timeout_poller == NULL) { 4165 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 4166 return -1; 4167 } 4168 } 4169 4170 desc->cb_fn = cb_fn; 4171 desc->cb_arg = cb_arg; 4172 desc->timeout_in_sec = timeout_in_sec; 4173 4174 return 0; 4175 } 4176 4177 static int 4178 bdev_channel_create(void *io_device, void *ctx_buf) 4179 { 4180 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 4181 struct spdk_bdev_channel *ch = ctx_buf; 4182 struct spdk_io_channel *mgmt_io_ch; 4183 struct spdk_bdev_mgmt_channel *mgmt_ch; 4184 struct spdk_bdev_shared_resource *shared_resource; 4185 struct lba_range *range; 4186 4187 ch->bdev = bdev; 4188 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 4189 if (!ch->channel) { 4190 return -1; 4191 } 4192 4193 ch->accel_channel = spdk_accel_get_io_channel(); 4194 if (!ch->accel_channel) { 4195 spdk_put_io_channel(ch->channel); 4196 return -1; 4197 } 4198 4199 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, bdev->internal.trace_id, 0, 0, 4200 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4201 4202 assert(ch->histogram == NULL); 4203 if (bdev->internal.histogram_enabled) { 4204 ch->histogram = spdk_histogram_data_alloc(); 4205 if (ch->histogram == NULL) { 4206 SPDK_ERRLOG("Could not allocate histogram\n"); 4207 } 4208 } 4209 4210 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 4211 if (!mgmt_io_ch) { 4212 spdk_put_io_channel(ch->channel); 4213 spdk_put_io_channel(ch->accel_channel); 4214 return -1; 4215 } 4216 4217 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 4218 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 4219 if (shared_resource->shared_ch == ch->channel) { 4220 spdk_put_io_channel(mgmt_io_ch); 4221 shared_resource->ref++; 4222 break; 4223 } 4224 } 4225 4226 if (shared_resource == NULL) { 4227 shared_resource = calloc(1, sizeof(*shared_resource)); 4228 if (shared_resource == NULL) { 4229 spdk_put_io_channel(ch->channel); 4230 spdk_put_io_channel(ch->accel_channel); 4231 spdk_put_io_channel(mgmt_io_ch); 4232 return -1; 4233 } 4234 4235 shared_resource->mgmt_ch = mgmt_ch; 4236 shared_resource->io_outstanding = 0; 4237 TAILQ_INIT(&shared_resource->nomem_io); 4238 shared_resource->nomem_threshold = 0; 4239 shared_resource->shared_ch = ch->channel; 4240 shared_resource->ref = 1; 4241 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 4242 } 4243 4244 ch->io_outstanding = 0; 4245 TAILQ_INIT(&ch->locked_ranges); 4246 TAILQ_INIT(&ch->qos_queued_io); 4247 ch->flags = 0; 4248 ch->trace_id = bdev->internal.trace_id; 4249 ch->shared_resource = shared_resource; 4250 4251 TAILQ_INIT(&ch->io_submitted); 4252 TAILQ_INIT(&ch->io_locked); 4253 TAILQ_INIT(&ch->io_accel_exec); 4254 TAILQ_INIT(&ch->io_memory_domain); 4255 4256 ch->stat = bdev_alloc_io_stat(false); 
4257 if (ch->stat == NULL) { 4258 bdev_channel_destroy_resource(ch); 4259 return -1; 4260 } 4261 4262 ch->stat->ticks_rate = spdk_get_ticks_hz(); 4263 4264 #ifdef SPDK_CONFIG_VTUNE 4265 { 4266 char *name; 4267 __itt_init_ittlib(NULL, 0); 4268 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 4269 if (!name) { 4270 bdev_channel_destroy_resource(ch); 4271 return -1; 4272 } 4273 ch->handle = __itt_string_handle_create(name); 4274 free(name); 4275 ch->start_tsc = spdk_get_ticks(); 4276 ch->interval_tsc = spdk_get_ticks_hz() / 100; 4277 ch->prev_stat = bdev_alloc_io_stat(false); 4278 if (ch->prev_stat == NULL) { 4279 bdev_channel_destroy_resource(ch); 4280 return -1; 4281 } 4282 } 4283 #endif 4284 4285 spdk_spin_lock(&bdev->internal.spinlock); 4286 bdev_enable_qos(bdev, ch); 4287 4288 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 4289 struct lba_range *new_range; 4290 4291 new_range = calloc(1, sizeof(*new_range)); 4292 if (new_range == NULL) { 4293 spdk_spin_unlock(&bdev->internal.spinlock); 4294 bdev_channel_destroy_resource(ch); 4295 return -1; 4296 } 4297 new_range->length = range->length; 4298 new_range->offset = range->offset; 4299 new_range->locked_ctx = range->locked_ctx; 4300 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 4301 } 4302 4303 spdk_spin_unlock(&bdev->internal.spinlock); 4304 4305 return 0; 4306 } 4307 4308 static int 4309 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 4310 void *cb_ctx) 4311 { 4312 struct spdk_bdev_channel *bdev_ch = cb_ctx; 4313 struct spdk_bdev_io *bdev_io; 4314 uint64_t buf_len; 4315 4316 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4317 if (bdev_io->internal.ch == bdev_ch) { 4318 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len); 4319 spdk_iobuf_entry_abort(ch, entry, buf_len); 4320 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4321 } 4322 4323 return 0; 4324 } 4325 4326 /* 4327 * Abort I/O that are waiting on a data buffer. 4328 */ 4329 static void 4330 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4331 { 4332 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, bdev_abort_all_buf_io_cb, ch); 4333 } 4334 4335 /* 4336 * Abort I/O that are queued waiting for submission. These types of I/O are 4337 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4338 */ 4339 static void 4340 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4341 { 4342 struct spdk_bdev_io *bdev_io, *tmp; 4343 4344 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4345 if (bdev_io->internal.ch == ch) { 4346 TAILQ_REMOVE(queue, bdev_io, internal.link); 4347 /* 4348 * spdk_bdev_io_complete() assumes that the completed I/O had 4349 * been submitted to the bdev module. Since in this case it 4350 * hadn't, bump io_outstanding to account for the decrement 4351 * that spdk_bdev_io_complete() will do. 
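 * (Reset I/O are excluded from this adjustment below, since they are not tracked through the
 * regular io_outstanding counters.)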
4352 */ 4353 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4354 bdev_io_increment_outstanding(ch, ch->shared_resource); 4355 } 4356 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4357 } 4358 } 4359 } 4360 4361 static bool 4362 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4363 { 4364 struct spdk_bdev_io *bdev_io; 4365 4366 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4367 if (bdev_io == bio_to_abort) { 4368 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4369 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4370 return true; 4371 } 4372 } 4373 4374 return false; 4375 } 4376 4377 static int 4378 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4379 { 4380 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4381 uint64_t buf_len; 4382 4383 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4384 if (bdev_io == bio_to_abort) { 4385 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len); 4386 spdk_iobuf_entry_abort(ch, entry, buf_len); 4387 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4388 return 1; 4389 } 4390 4391 return 0; 4392 } 4393 4394 static bool 4395 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4396 { 4397 int rc; 4398 4399 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, bdev_abort_buf_io_cb, bio_to_abort); 4400 return rc == 1; 4401 } 4402 4403 static void 4404 bdev_qos_channel_destroy(void *cb_arg) 4405 { 4406 struct spdk_bdev_qos *qos = cb_arg; 4407 4408 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4409 spdk_poller_unregister(&qos->poller); 4410 4411 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4412 4413 free(qos); 4414 } 4415 4416 static int 4417 bdev_qos_destroy(struct spdk_bdev *bdev) 4418 { 4419 int i; 4420 4421 /* 4422 * Cleanly shutting down the QoS poller is tricky, because 4423 * during the asynchronous operation the user could open 4424 * a new descriptor and create a new channel, spawning 4425 * a new QoS poller. 4426 * 4427 * The strategy is to create a new QoS structure here and swap it 4428 * in. The shutdown path then continues to refer to the old one 4429 * until it completes and then releases it. 4430 */ 4431 struct spdk_bdev_qos *new_qos, *old_qos; 4432 4433 old_qos = bdev->internal.qos; 4434 4435 new_qos = calloc(1, sizeof(*new_qos)); 4436 if (!new_qos) { 4437 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4438 return -ENOMEM; 4439 } 4440 4441 /* Copy the old QoS data into the newly allocated structure */ 4442 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4443 4444 /* Zero out the key parts of the QoS structure */ 4445 new_qos->ch = NULL; 4446 new_qos->thread = NULL; 4447 new_qos->poller = NULL; 4448 /* 4449 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4450 * It will be used later for the new QoS structure. 4451 */ 4452 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4453 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4454 new_qos->rate_limits[i].min_per_timeslice = 0; 4455 new_qos->rate_limits[i].max_per_timeslice = 0; 4456 } 4457 4458 bdev->internal.qos = new_qos; 4459 4460 if (old_qos->thread == NULL) { 4461 free(old_qos); 4462 } else { 4463 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4464 } 4465 4466 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4467 * been destroyed yet. 
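 * (The old qos structure itself is released on its owning thread by bdev_qos_channel_destroy(),
 * which drops its channel reference and unregisters its poller before freeing it.)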
The destruction path will end up waiting for the final 4468 * channel to be put before it releases resources. */ 4469 4470 return 0; 4471 } 4472 4473 void 4474 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4475 { 4476 total->bytes_read += add->bytes_read; 4477 total->num_read_ops += add->num_read_ops; 4478 total->bytes_written += add->bytes_written; 4479 total->num_write_ops += add->num_write_ops; 4480 total->bytes_unmapped += add->bytes_unmapped; 4481 total->num_unmap_ops += add->num_unmap_ops; 4482 total->bytes_copied += add->bytes_copied; 4483 total->num_copy_ops += add->num_copy_ops; 4484 total->read_latency_ticks += add->read_latency_ticks; 4485 total->write_latency_ticks += add->write_latency_ticks; 4486 total->unmap_latency_ticks += add->unmap_latency_ticks; 4487 total->copy_latency_ticks += add->copy_latency_ticks; 4488 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4489 total->max_read_latency_ticks = add->max_read_latency_ticks; 4490 } 4491 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4492 total->min_read_latency_ticks = add->min_read_latency_ticks; 4493 } 4494 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4495 total->max_write_latency_ticks = add->max_write_latency_ticks; 4496 } 4497 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4498 total->min_write_latency_ticks = add->min_write_latency_ticks; 4499 } 4500 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4501 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4502 } 4503 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4504 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4505 } 4506 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4507 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4508 } 4509 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4510 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4511 } 4512 } 4513 4514 static void 4515 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4516 { 4517 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4518 4519 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4520 memcpy(to_stat->io_error, from_stat->io_error, 4521 sizeof(struct spdk_bdev_io_error_stat)); 4522 } 4523 } 4524 4525 void 4526 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4527 { 4528 if (mode == SPDK_BDEV_RESET_STAT_NONE) { 4529 return; 4530 } 4531 4532 stat->max_read_latency_ticks = 0; 4533 stat->min_read_latency_ticks = UINT64_MAX; 4534 stat->max_write_latency_ticks = 0; 4535 stat->min_write_latency_ticks = UINT64_MAX; 4536 stat->max_unmap_latency_ticks = 0; 4537 stat->min_unmap_latency_ticks = UINT64_MAX; 4538 stat->max_copy_latency_ticks = 0; 4539 stat->min_copy_latency_ticks = UINT64_MAX; 4540 4541 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4542 return; 4543 } 4544 4545 stat->bytes_read = 0; 4546 stat->num_read_ops = 0; 4547 stat->bytes_written = 0; 4548 stat->num_write_ops = 0; 4549 stat->bytes_unmapped = 0; 4550 stat->num_unmap_ops = 0; 4551 stat->bytes_copied = 0; 4552 stat->num_copy_ops = 0; 4553 stat->read_latency_ticks = 0; 4554 stat->write_latency_ticks = 0; 4555 stat->unmap_latency_ticks = 0; 4556 stat->copy_latency_ticks = 0; 4557 4558 if (stat->io_error != NULL) { 4559 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 
4560 } 4561 } 4562 4563 struct spdk_bdev_io_stat * 4564 bdev_alloc_io_stat(bool io_error_stat) 4565 { 4566 struct spdk_bdev_io_stat *stat; 4567 4568 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4569 if (stat == NULL) { 4570 return NULL; 4571 } 4572 4573 if (io_error_stat) { 4574 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4575 if (stat->io_error == NULL) { 4576 free(stat); 4577 return NULL; 4578 } 4579 } else { 4580 stat->io_error = NULL; 4581 } 4582 4583 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4584 4585 return stat; 4586 } 4587 4588 void 4589 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4590 { 4591 if (stat != NULL) { 4592 free(stat->io_error); 4593 free(stat); 4594 } 4595 } 4596 4597 void 4598 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4599 { 4600 int i; 4601 4602 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4603 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4604 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4605 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4606 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4607 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4608 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4609 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4610 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4611 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4612 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4613 stat->min_read_latency_ticks != UINT64_MAX ? 4614 stat->min_read_latency_ticks : 0); 4615 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4616 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4617 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4618 stat->min_write_latency_ticks != UINT64_MAX ? 4619 stat->min_write_latency_ticks : 0); 4620 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4621 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4622 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4623 stat->min_unmap_latency_ticks != UINT64_MAX ? 4624 stat->min_unmap_latency_ticks : 0); 4625 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4626 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4627 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4628 stat->min_copy_latency_ticks != UINT64_MAX ? 
4629 stat->min_copy_latency_ticks : 0); 4630 4631 if (stat->io_error != NULL) { 4632 spdk_json_write_named_object_begin(w, "io_error"); 4633 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4634 if (stat->io_error->error_status[i] != 0) { 4635 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4636 stat->io_error->error_status[i]); 4637 } 4638 } 4639 spdk_json_write_object_end(w); 4640 } 4641 } 4642 4643 static void 4644 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4645 { 4646 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4647 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4648 4649 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4650 bdev_abort_all_buf_io(mgmt_ch, ch); 4651 } 4652 4653 static void 4654 bdev_channel_destroy(void *io_device, void *ctx_buf) 4655 { 4656 struct spdk_bdev_channel *ch = ctx_buf; 4657 4658 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4659 spdk_get_thread()); 4660 4661 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, ch->bdev->internal.trace_id, 0, 0, 4662 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4663 4664 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4665 spdk_spin_lock(&ch->bdev->internal.spinlock); 4666 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4667 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4668 4669 bdev_channel_abort_queued_ios(ch); 4670 4671 if (ch->histogram) { 4672 spdk_histogram_data_free(ch->histogram); 4673 } 4674 4675 bdev_channel_destroy_resource(ch); 4676 } 4677 4678 /* 4679 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4680 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
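 *
 * Usage sketch (illustrative; the exact field names used by callers may differ):
 *
 *     if (bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name) == -EEXIST) {
 *             // duplicate name - reject the registration
 *     }
 *
 * Insertion and duplicate detection happen atomically under g_bdev_mgr.spinlock, so there is
 * no separate lookup/insert race to worry about.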
4681 */ 4682 static int 4683 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4684 { 4685 struct spdk_bdev_name *tmp; 4686 4687 bdev_name->name = strdup(name); 4688 if (bdev_name->name == NULL) { 4689 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4690 return -ENOMEM; 4691 } 4692 4693 bdev_name->bdev = bdev; 4694 4695 spdk_spin_lock(&g_bdev_mgr.spinlock); 4696 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4697 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4698 4699 if (tmp != NULL) { 4700 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4701 free(bdev_name->name); 4702 return -EEXIST; 4703 } 4704 4705 return 0; 4706 } 4707 4708 static void 4709 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4710 { 4711 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4712 free(bdev_name->name); 4713 } 4714 4715 static void 4716 bdev_name_del(struct spdk_bdev_name *bdev_name) 4717 { 4718 spdk_spin_lock(&g_bdev_mgr.spinlock); 4719 bdev_name_del_unsafe(bdev_name); 4720 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4721 } 4722 4723 int 4724 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4725 { 4726 struct spdk_bdev_alias *tmp; 4727 int ret; 4728 4729 if (alias == NULL) { 4730 SPDK_ERRLOG("Empty alias passed\n"); 4731 return -EINVAL; 4732 } 4733 4734 tmp = calloc(1, sizeof(*tmp)); 4735 if (tmp == NULL) { 4736 SPDK_ERRLOG("Unable to allocate alias\n"); 4737 return -ENOMEM; 4738 } 4739 4740 ret = bdev_name_add(&tmp->alias, bdev, alias); 4741 if (ret != 0) { 4742 free(tmp); 4743 return ret; 4744 } 4745 4746 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4747 4748 return 0; 4749 } 4750 4751 static int 4752 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4753 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4754 { 4755 struct spdk_bdev_alias *tmp; 4756 4757 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4758 if (strcmp(alias, tmp->alias.name) == 0) { 4759 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4760 alias_del_fn(&tmp->alias); 4761 free(tmp); 4762 return 0; 4763 } 4764 } 4765 4766 return -ENOENT; 4767 } 4768 4769 int 4770 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4771 { 4772 int rc; 4773 4774 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4775 if (rc == -ENOENT) { 4776 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4777 } 4778 4779 return rc; 4780 } 4781 4782 void 4783 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4784 { 4785 struct spdk_bdev_alias *p, *tmp; 4786 4787 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4788 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4789 bdev_name_del(&p->alias); 4790 free(p); 4791 } 4792 } 4793 4794 struct spdk_io_channel * 4795 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4796 { 4797 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4798 } 4799 4800 void * 4801 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4802 { 4803 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4804 void *ctx = NULL; 4805 4806 if (bdev->fn_table->get_module_ctx) { 4807 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4808 } 4809 4810 return ctx; 4811 } 4812 4813 const char * 4814 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4815 { 4816 return bdev->module->name; 4817 } 4818 4819 const char * 4820 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4821 { 4822 return bdev->name; 4823 } 4824 4825 const char * 4826 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4827 { 4828 return bdev->product_name; 4829 } 4830 4831 
const struct spdk_bdev_aliases_list * 4832 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4833 { 4834 return &bdev->aliases; 4835 } 4836 4837 uint32_t 4838 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4839 { 4840 return bdev->blocklen; 4841 } 4842 4843 uint32_t 4844 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4845 { 4846 return bdev->write_unit_size; 4847 } 4848 4849 uint64_t 4850 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4851 { 4852 return bdev->blockcnt; 4853 } 4854 4855 const char * 4856 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4857 { 4858 return qos_rpc_type[type]; 4859 } 4860 4861 void 4862 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4863 { 4864 int i; 4865 4866 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4867 4868 spdk_spin_lock(&bdev->internal.spinlock); 4869 if (bdev->internal.qos) { 4870 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4871 if (bdev->internal.qos->rate_limits[i].limit != 4872 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4873 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4874 if (bdev_qos_is_iops_rate_limit(i) == false) { 4875 /* Change from Byte to Megabyte which is user visible. */ 4876 limits[i] = limits[i] / 1024 / 1024; 4877 } 4878 } 4879 } 4880 } 4881 spdk_spin_unlock(&bdev->internal.spinlock); 4882 } 4883 4884 size_t 4885 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4886 { 4887 return 1 << bdev->required_alignment; 4888 } 4889 4890 uint32_t 4891 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4892 { 4893 return bdev->optimal_io_boundary; 4894 } 4895 4896 bool 4897 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4898 { 4899 return bdev->write_cache; 4900 } 4901 4902 const struct spdk_uuid * 4903 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4904 { 4905 return &bdev->uuid; 4906 } 4907 4908 uint16_t 4909 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4910 { 4911 return bdev->acwu; 4912 } 4913 4914 uint32_t 4915 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4916 { 4917 return bdev->md_len; 4918 } 4919 4920 bool 4921 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4922 { 4923 return (bdev->md_len != 0) && bdev->md_interleave; 4924 } 4925 4926 bool 4927 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4928 { 4929 return (bdev->md_len != 0) && !bdev->md_interleave; 4930 } 4931 4932 bool 4933 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4934 { 4935 return bdev->zoned; 4936 } 4937 4938 uint32_t 4939 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4940 { 4941 if (spdk_bdev_is_md_interleaved(bdev)) { 4942 return bdev->blocklen - bdev->md_len; 4943 } else { 4944 return bdev->blocklen; 4945 } 4946 } 4947 4948 uint32_t 4949 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4950 { 4951 return bdev->phys_blocklen; 4952 } 4953 4954 static uint32_t 4955 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4956 { 4957 if (!spdk_bdev_is_md_interleaved(bdev)) { 4958 return bdev->blocklen + bdev->md_len; 4959 } else { 4960 return bdev->blocklen; 4961 } 4962 } 4963 4964 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4965 typedef enum spdk_dif_type spdk_dif_type_t; 4966 typedef enum spdk_dif_pi_format spdk_dif_pi_format_t; 4967 4968 spdk_dif_type_t 4969 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4970 { 4971 if (bdev->md_len != 0) { 4972 return bdev->dif_type; 4973 } else { 4974 return SPDK_DIF_DISABLE; 4975 } 4976 } 4977 4978 spdk_dif_pi_format_t 4979 spdk_bdev_get_dif_pi_format(const struct spdk_bdev *bdev) 4980 { 4981 return bdev->dif_pi_format; 4982 } 4983 4984 bool 4985 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4986 { 4987 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4988 return bdev->dif_is_head_of_md; 4989 } else { 4990 return false; 4991 } 4992 } 4993 4994 bool 4995 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4996 enum spdk_dif_check_type check_type) 4997 { 4998 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4999 return false; 5000 } 5001 5002 switch (check_type) { 5003 case SPDK_DIF_CHECK_TYPE_REFTAG: 5004 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 5005 case SPDK_DIF_CHECK_TYPE_APPTAG: 5006 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 5007 case SPDK_DIF_CHECK_TYPE_GUARD: 5008 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 5009 default: 5010 return false; 5011 } 5012 } 5013 5014 static uint32_t 5015 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 5016 { 5017 uint64_t aligned_length, max_write_blocks; 5018 5019 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 5020 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 5021 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 5022 5023 return max_write_blocks; 5024 } 5025 5026 uint32_t 5027 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 5028 { 5029 return bdev->max_copy; 5030 } 5031 5032 uint64_t 5033 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 5034 { 5035 return bdev->internal.measured_queue_depth; 5036 } 5037 5038 uint64_t 5039 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 5040 { 5041 return bdev->internal.period; 5042 } 5043 5044 uint64_t 5045 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 5046 { 5047 return bdev->internal.weighted_io_time; 5048 } 5049 5050 uint64_t 5051 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 5052 { 5053 return bdev->internal.io_time; 5054 } 5055 5056 union spdk_bdev_nvme_ctratt spdk_bdev_get_nvme_ctratt(struct spdk_bdev *bdev) 5057 { 5058 return bdev->ctratt; 5059 } 5060 5061 uint32_t 5062 spdk_bdev_get_nvme_nsid(struct spdk_bdev *bdev) 5063 { 5064 return bdev->nsid; 5065 } 5066 5067 static void bdev_update_qd_sampling_period(void *ctx); 5068 5069 static void 5070 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 5071 { 5072 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 5073 5074 if (bdev->internal.measured_queue_depth) { 5075 bdev->internal.io_time += bdev->internal.period; 5076 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 5077 } 5078 5079 bdev->internal.qd_poll_in_progress = false; 5080 5081 bdev_update_qd_sampling_period(bdev); 5082 } 5083 5084 static void 5085 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5086 struct spdk_io_channel *io_ch, void *_ctx) 5087 { 5088 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 5089 5090 bdev->internal.temporary_queue_depth += ch->io_outstanding; 5091 spdk_bdev_for_each_channel_continue(i, 0); 5092 
} 5093 5094 static int 5095 bdev_calculate_measured_queue_depth(void *ctx) 5096 { 5097 struct spdk_bdev *bdev = ctx; 5098 5099 bdev->internal.qd_poll_in_progress = true; 5100 bdev->internal.temporary_queue_depth = 0; 5101 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 5102 return SPDK_POLLER_BUSY; 5103 } 5104 5105 static void 5106 bdev_update_qd_sampling_period(void *ctx) 5107 { 5108 struct spdk_bdev *bdev = ctx; 5109 5110 if (bdev->internal.period == bdev->internal.new_period) { 5111 return; 5112 } 5113 5114 if (bdev->internal.qd_poll_in_progress) { 5115 return; 5116 } 5117 5118 bdev->internal.period = bdev->internal.new_period; 5119 5120 spdk_poller_unregister(&bdev->internal.qd_poller); 5121 if (bdev->internal.period != 0) { 5122 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5123 bdev, bdev->internal.period); 5124 } else { 5125 spdk_bdev_close(bdev->internal.qd_desc); 5126 bdev->internal.qd_desc = NULL; 5127 } 5128 } 5129 5130 static void 5131 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 5132 { 5133 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 5134 } 5135 5136 void 5137 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 5138 { 5139 int rc; 5140 5141 if (bdev->internal.new_period == period) { 5142 return; 5143 } 5144 5145 bdev->internal.new_period = period; 5146 5147 if (bdev->internal.qd_desc != NULL) { 5148 assert(bdev->internal.period != 0); 5149 5150 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 5151 bdev_update_qd_sampling_period, bdev); 5152 return; 5153 } 5154 5155 assert(bdev->internal.period == 0); 5156 5157 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 5158 NULL, &bdev->internal.qd_desc); 5159 if (rc != 0) { 5160 return; 5161 } 5162 5163 bdev->internal.period = period; 5164 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5165 bdev, period); 5166 } 5167 5168 struct bdev_get_current_qd_ctx { 5169 uint64_t current_qd; 5170 spdk_bdev_get_current_qd_cb cb_fn; 5171 void *cb_arg; 5172 }; 5173 5174 static void 5175 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 5176 { 5177 struct bdev_get_current_qd_ctx *ctx = _ctx; 5178 5179 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 5180 5181 free(ctx); 5182 } 5183 5184 static void 5185 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5186 struct spdk_io_channel *io_ch, void *_ctx) 5187 { 5188 struct bdev_get_current_qd_ctx *ctx = _ctx; 5189 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 5190 5191 ctx->current_qd += bdev_ch->io_outstanding; 5192 5193 spdk_bdev_for_each_channel_continue(i, 0); 5194 } 5195 5196 void 5197 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 5198 void *cb_arg) 5199 { 5200 struct bdev_get_current_qd_ctx *ctx; 5201 5202 assert(cb_fn != NULL); 5203 5204 ctx = calloc(1, sizeof(*ctx)); 5205 if (ctx == NULL) { 5206 cb_fn(bdev, 0, cb_arg, -ENOMEM); 5207 return; 5208 } 5209 5210 ctx->cb_fn = cb_fn; 5211 ctx->cb_arg = cb_arg; 5212 5213 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 5214 } 5215 5216 static void 5217 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 5218 { 5219 assert(desc->thread == spdk_get_thread()); 5220 5221 spdk_spin_lock(&desc->spinlock); 5222 desc->refs--; 5223 if (!desc->closed) { 5224 
spdk_spin_unlock(&desc->spinlock); 5225 desc->callback.event_fn(type, 5226 desc->bdev, 5227 desc->callback.ctx); 5228 return; 5229 } else if (desc->refs == 0) { 5230 /* This descriptor was closed after this event_notify message was sent. 5231 * spdk_bdev_close() could not free the descriptor since this message was 5232 * in flight, so we free it now using bdev_desc_free(). 5233 */ 5234 spdk_spin_unlock(&desc->spinlock); 5235 bdev_desc_free(desc); 5236 return; 5237 } 5238 spdk_spin_unlock(&desc->spinlock); 5239 } 5240 5241 static void 5242 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 5243 { 5244 spdk_spin_lock(&desc->spinlock); 5245 desc->refs++; 5246 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 5247 spdk_spin_unlock(&desc->spinlock); 5248 } 5249 5250 static void 5251 _resize_notify(void *ctx) 5252 { 5253 struct spdk_bdev_desc *desc = ctx; 5254 5255 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 5256 } 5257 5258 int 5259 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 5260 { 5261 struct spdk_bdev_desc *desc; 5262 int ret; 5263 5264 if (size == bdev->blockcnt) { 5265 return 0; 5266 } 5267 5268 spdk_spin_lock(&bdev->internal.spinlock); 5269 5270 /* bdev has open descriptors */ 5271 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 5272 bdev->blockcnt > size) { 5273 ret = -EBUSY; 5274 } else { 5275 bdev->blockcnt = size; 5276 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 5277 event_notify(desc, _resize_notify); 5278 } 5279 ret = 0; 5280 } 5281 5282 spdk_spin_unlock(&bdev->internal.spinlock); 5283 5284 return ret; 5285 } 5286 5287 /* 5288 * Convert I/O offset and length from bytes to blocks. 5289 * 5290 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 5291 */ 5292 static uint64_t 5293 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 5294 uint64_t num_bytes, uint64_t *num_blocks) 5295 { 5296 uint32_t block_size = bdev->blocklen; 5297 uint8_t shift_cnt; 5298 5299 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
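 * Worked example (illustrative): with a 4096-byte block size the shift is 12, so offset 8192
 * and length 4096 map to offset_blocks 2 and num_blocks 1, and the OR of the byte remainders
 * is 0 (success). Any offset or length that is not a multiple of the block size leaves a
 * non-zero remainder, so callers see a non-zero return and reject the request with -EINVAL.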
*/ 5300 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 5301 shift_cnt = spdk_u32log2(block_size); 5302 *offset_blocks = offset_bytes >> shift_cnt; 5303 *num_blocks = num_bytes >> shift_cnt; 5304 return (offset_bytes - (*offset_blocks << shift_cnt)) | 5305 (num_bytes - (*num_blocks << shift_cnt)); 5306 } else { 5307 *offset_blocks = offset_bytes / block_size; 5308 *num_blocks = num_bytes / block_size; 5309 return (offset_bytes % block_size) | (num_bytes % block_size); 5310 } 5311 } 5312 5313 static bool 5314 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 5315 { 5316 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 5317 * has been an overflow and hence the offset has been wrapped around */ 5318 if (offset_blocks + num_blocks < offset_blocks) { 5319 return false; 5320 } 5321 5322 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5323 if (offset_blocks + num_blocks > bdev->blockcnt) { 5324 return false; 5325 } 5326 5327 return true; 5328 } 5329 5330 static void 5331 bdev_seek_complete_cb(void *ctx) 5332 { 5333 struct spdk_bdev_io *bdev_io = ctx; 5334 5335 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5336 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5337 } 5338 5339 static int 5340 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5341 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5342 spdk_bdev_io_completion_cb cb, void *cb_arg) 5343 { 5344 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5345 struct spdk_bdev_io *bdev_io; 5346 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5347 5348 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5349 5350 /* Check if offset_blocks is valid looking at the validity of one block */ 5351 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5352 return -EINVAL; 5353 } 5354 5355 bdev_io = bdev_channel_get_io(channel); 5356 if (!bdev_io) { 5357 return -ENOMEM; 5358 } 5359 5360 bdev_io->internal.ch = channel; 5361 bdev_io->internal.desc = desc; 5362 bdev_io->type = io_type; 5363 bdev_io->u.bdev.offset_blocks = offset_blocks; 5364 bdev_io->u.bdev.memory_domain = NULL; 5365 bdev_io->u.bdev.memory_domain_ctx = NULL; 5366 bdev_io->u.bdev.accel_sequence = NULL; 5367 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5368 5369 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5370 /* In case bdev doesn't support seek to next data/hole offset, 5371 * it is assumed that only data and no holes are present */ 5372 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5373 bdev_io->u.bdev.seek.offset = offset_blocks; 5374 } else { 5375 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5376 } 5377 5378 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5379 return 0; 5380 } 5381 5382 bdev_io_submit(bdev_io); 5383 return 0; 5384 } 5385 5386 int 5387 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5388 uint64_t offset_blocks, 5389 spdk_bdev_io_completion_cb cb, void *cb_arg) 5390 { 5391 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5392 } 5393 5394 int 5395 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5396 uint64_t offset_blocks, 5397 spdk_bdev_io_completion_cb cb, void *cb_arg) 5398 { 5399 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5400 } 5401 5402 uint64_t 5403 spdk_bdev_io_get_seek_offset(const struct 
spdk_bdev_io *bdev_io) 5404 { 5405 return bdev_io->u.bdev.seek.offset; 5406 } 5407 5408 static int 5409 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5410 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5411 spdk_bdev_io_completion_cb cb, void *cb_arg) 5412 { 5413 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5414 struct spdk_bdev_io *bdev_io; 5415 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5416 5417 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5418 return -EINVAL; 5419 } 5420 5421 bdev_io = bdev_channel_get_io(channel); 5422 if (!bdev_io) { 5423 return -ENOMEM; 5424 } 5425 5426 bdev_io->internal.ch = channel; 5427 bdev_io->internal.desc = desc; 5428 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5429 bdev_io->u.bdev.iovs = &bdev_io->iov; 5430 bdev_io->u.bdev.iovs[0].iov_base = buf; 5431 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5432 bdev_io->u.bdev.iovcnt = 1; 5433 bdev_io->u.bdev.md_buf = md_buf; 5434 bdev_io->u.bdev.num_blocks = num_blocks; 5435 bdev_io->u.bdev.offset_blocks = offset_blocks; 5436 bdev_io->u.bdev.memory_domain = NULL; 5437 bdev_io->u.bdev.memory_domain_ctx = NULL; 5438 bdev_io->u.bdev.accel_sequence = NULL; 5439 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5440 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5441 5442 bdev_io_submit(bdev_io); 5443 return 0; 5444 } 5445 5446 int 5447 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5448 void *buf, uint64_t offset, uint64_t nbytes, 5449 spdk_bdev_io_completion_cb cb, void *cb_arg) 5450 { 5451 uint64_t offset_blocks, num_blocks; 5452 5453 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5454 nbytes, &num_blocks) != 0) { 5455 return -EINVAL; 5456 } 5457 5458 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5459 } 5460 5461 int 5462 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5463 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5464 spdk_bdev_io_completion_cb cb, void *cb_arg) 5465 { 5466 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5467 } 5468 5469 int 5470 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5471 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5472 spdk_bdev_io_completion_cb cb, void *cb_arg) 5473 { 5474 struct iovec iov = { 5475 .iov_base = buf, 5476 }; 5477 5478 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5479 return -EINVAL; 5480 } 5481 5482 if (md_buf && !_is_buf_allocated(&iov)) { 5483 return -EINVAL; 5484 } 5485 5486 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5487 cb, cb_arg); 5488 } 5489 5490 int 5491 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5492 struct iovec *iov, int iovcnt, 5493 uint64_t offset, uint64_t nbytes, 5494 spdk_bdev_io_completion_cb cb, void *cb_arg) 5495 { 5496 uint64_t offset_blocks, num_blocks; 5497 5498 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5499 nbytes, &num_blocks) != 0) { 5500 return -EINVAL; 5501 } 5502 5503 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5504 } 5505 5506 static int 5507 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5508 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 
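/*
 * Illustrative usage sketch (not part of the upstream code): a minimal single-buffer
 * read with spdk_bdev_read_blocks(). The buffer is sized from the block count and
 * block size; read_done, read_eight_blocks and the 8-block length are hypothetical.
 *
 * static void
 * read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 * {
 *     void *buf = cb_arg;
 *
 *     if (!success) {
 *         SPDK_ERRLOG("read failed\n");
 *     }
 *     spdk_bdev_free_io(bdev_io);
 *     spdk_dma_free(buf);
 * }
 *
 * static int
 * read_eight_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch)
 * {
 *     struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
 *     uint64_t num_blocks = 8;
 *     void *buf = spdk_dma_zmalloc(num_blocks * spdk_bdev_get_block_size(bdev), 0x1000, NULL);
 *
 *     if (buf == NULL) {
 *         return -ENOMEM;
 *     }
 *     return spdk_bdev_read_blocks(desc, ch, buf, 0, num_blocks, read_done, buf);
 * }
 */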
5509 uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx, 5510 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 5511 spdk_bdev_io_completion_cb cb, void *cb_arg) 5512 { 5513 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5514 struct spdk_bdev_io *bdev_io; 5515 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5516 5517 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 5518 return -EINVAL; 5519 } 5520 5521 bdev_io = bdev_channel_get_io(channel); 5522 if (spdk_unlikely(!bdev_io)) { 5523 return -ENOMEM; 5524 } 5525 5526 bdev_io->internal.ch = channel; 5527 bdev_io->internal.desc = desc; 5528 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5529 bdev_io->u.bdev.iovs = iov; 5530 bdev_io->u.bdev.iovcnt = iovcnt; 5531 bdev_io->u.bdev.md_buf = md_buf; 5532 bdev_io->u.bdev.num_blocks = num_blocks; 5533 bdev_io->u.bdev.offset_blocks = offset_blocks; 5534 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5535 5536 if (seq != NULL) { 5537 bdev_io->internal.f.has_accel_sequence = true; 5538 bdev_io->internal.accel_sequence = seq; 5539 } 5540 5541 if (domain != NULL) { 5542 bdev_io->internal.f.has_memory_domain = true; 5543 bdev_io->internal.memory_domain = domain; 5544 bdev_io->internal.memory_domain_ctx = domain_ctx; 5545 } 5546 5547 bdev_io->u.bdev.memory_domain = domain; 5548 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5549 bdev_io->u.bdev.accel_sequence = seq; 5550 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 5551 5552 _bdev_io_submit_ext(desc, bdev_io); 5553 5554 return 0; 5555 } 5556 5557 int 5558 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5559 struct iovec *iov, int iovcnt, 5560 uint64_t offset_blocks, uint64_t num_blocks, 5561 spdk_bdev_io_completion_cb cb, void *cb_arg) 5562 { 5563 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5564 5565 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5566 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5567 } 5568 5569 int 5570 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5571 struct iovec *iov, int iovcnt, void *md_buf, 5572 uint64_t offset_blocks, uint64_t num_blocks, 5573 spdk_bdev_io_completion_cb cb, void *cb_arg) 5574 { 5575 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5576 5577 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 5578 return -EINVAL; 5579 } 5580 5581 if (md_buf && !_is_buf_allocated(iov)) { 5582 return -EINVAL; 5583 } 5584 5585 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5586 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5587 } 5588 5589 static inline bool 5590 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 5591 { 5592 /* 5593 * We check if opts size is at least of size when we first introduced 5594 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 5595 * are not checked internal. 
5596 */ 5597 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5598 sizeof(opts->metadata) && 5599 opts->size <= sizeof(*opts) && 5600 /* When memory domain is used, the user must provide data buffers */ 5601 (!opts->memory_domain || (iov && iov[0].iov_base)); 5602 } 5603 5604 int 5605 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5606 struct iovec *iov, int iovcnt, 5607 uint64_t offset_blocks, uint64_t num_blocks, 5608 spdk_bdev_io_completion_cb cb, void *cb_arg, 5609 struct spdk_bdev_ext_io_opts *opts) 5610 { 5611 struct spdk_memory_domain *domain = NULL; 5612 struct spdk_accel_sequence *seq = NULL; 5613 void *domain_ctx = NULL, *md = NULL; 5614 uint32_t dif_check_flags = 0; 5615 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5616 5617 if (opts) { 5618 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5619 return -EINVAL; 5620 } 5621 5622 md = opts->metadata; 5623 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5624 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5625 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5626 if (md) { 5627 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5628 return -EINVAL; 5629 } 5630 5631 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5632 return -EINVAL; 5633 } 5634 5635 if (spdk_unlikely(seq != NULL)) { 5636 return -EINVAL; 5637 } 5638 } 5639 } 5640 5641 dif_check_flags = bdev->dif_check_flags & 5642 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5643 5644 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5645 num_blocks, domain, domain_ctx, seq, dif_check_flags, cb, cb_arg); 5646 } 5647 5648 static int 5649 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5650 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5651 spdk_bdev_io_completion_cb cb, void *cb_arg) 5652 { 5653 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5654 struct spdk_bdev_io *bdev_io; 5655 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5656 5657 if (!desc->write) { 5658 return -EBADF; 5659 } 5660 5661 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5662 return -EINVAL; 5663 } 5664 5665 bdev_io = bdev_channel_get_io(channel); 5666 if (!bdev_io) { 5667 return -ENOMEM; 5668 } 5669 5670 bdev_io->internal.ch = channel; 5671 bdev_io->internal.desc = desc; 5672 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5673 bdev_io->u.bdev.iovs = &bdev_io->iov; 5674 bdev_io->u.bdev.iovs[0].iov_base = buf; 5675 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5676 bdev_io->u.bdev.iovcnt = 1; 5677 bdev_io->u.bdev.md_buf = md_buf; 5678 bdev_io->u.bdev.num_blocks = num_blocks; 5679 bdev_io->u.bdev.offset_blocks = offset_blocks; 5680 bdev_io->u.bdev.memory_domain = NULL; 5681 bdev_io->u.bdev.memory_domain_ctx = NULL; 5682 bdev_io->u.bdev.accel_sequence = NULL; 5683 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5684 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5685 5686 bdev_io_submit(bdev_io); 5687 return 0; 5688 } 5689 5690 int 5691 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5692 void *buf, uint64_t offset, uint64_t nbytes, 5693 spdk_bdev_io_completion_cb cb, void *cb_arg) 5694 { 5695 uint64_t offset_blocks, num_blocks; 5696 5697 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5698 nbytes, &num_blocks) != 0) { 5699 return -EINVAL; 5700 } 5701 5702 return spdk_bdev_write_blocks(desc, ch, buf, 
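/*
 * Illustrative usage sketch (not part of the upstream code): issuing an extended read
 * with spdk_bdev_readv_blocks_ext(). As _bdev_io_check_opts() above requires, the
 * caller must fill in opts.size; members left zeroed behave as unset. The names
 * ext_read, ext_read_done and md_buf are hypothetical.
 *
 * static int
 * ext_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
 *          struct iovec *iov, int iovcnt, void *md_buf,
 *          uint64_t offset_blocks, uint64_t num_blocks,
 *          spdk_bdev_io_completion_cb ext_read_done, void *cb_arg)
 * {
 *     struct spdk_bdev_ext_io_opts opts = {};
 *
 *     opts.size = sizeof(opts);   // mandatory, see _bdev_io_check_opts()
 *     opts.metadata = md_buf;     // only valid for separate-metadata bdevs
 *
 *     return spdk_bdev_readv_blocks_ext(desc, ch, iov, iovcnt, offset_blocks,
 *                                       num_blocks, ext_read_done, cb_arg, &opts);
 * }
 */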
offset_blocks, num_blocks, cb, cb_arg); 5703 } 5704 5705 int 5706 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5707 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5708 spdk_bdev_io_completion_cb cb, void *cb_arg) 5709 { 5710 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5711 cb, cb_arg); 5712 } 5713 5714 int 5715 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5716 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5717 spdk_bdev_io_completion_cb cb, void *cb_arg) 5718 { 5719 struct iovec iov = { 5720 .iov_base = buf, 5721 }; 5722 5723 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5724 return -EINVAL; 5725 } 5726 5727 if (md_buf && !_is_buf_allocated(&iov)) { 5728 return -EINVAL; 5729 } 5730 5731 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5732 cb, cb_arg); 5733 } 5734 5735 static int 5736 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5737 struct iovec *iov, int iovcnt, void *md_buf, 5738 uint64_t offset_blocks, uint64_t num_blocks, 5739 struct spdk_memory_domain *domain, void *domain_ctx, 5740 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 5741 uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw, 5742 spdk_bdev_io_completion_cb cb, void *cb_arg) 5743 { 5744 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5745 struct spdk_bdev_io *bdev_io; 5746 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5747 5748 if (spdk_unlikely(!desc->write)) { 5749 return -EBADF; 5750 } 5751 5752 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 5753 return -EINVAL; 5754 } 5755 5756 bdev_io = bdev_channel_get_io(channel); 5757 if (spdk_unlikely(!bdev_io)) { 5758 return -ENOMEM; 5759 } 5760 5761 bdev_io->internal.ch = channel; 5762 bdev_io->internal.desc = desc; 5763 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5764 bdev_io->u.bdev.iovs = iov; 5765 bdev_io->u.bdev.iovcnt = iovcnt; 5766 bdev_io->u.bdev.md_buf = md_buf; 5767 bdev_io->u.bdev.num_blocks = num_blocks; 5768 bdev_io->u.bdev.offset_blocks = offset_blocks; 5769 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5770 if (seq != NULL) { 5771 bdev_io->internal.f.has_accel_sequence = true; 5772 bdev_io->internal.accel_sequence = seq; 5773 } 5774 5775 if (domain != NULL) { 5776 bdev_io->internal.f.has_memory_domain = true; 5777 bdev_io->internal.memory_domain = domain; 5778 bdev_io->internal.memory_domain_ctx = domain_ctx; 5779 } 5780 5781 bdev_io->u.bdev.memory_domain = domain; 5782 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5783 bdev_io->u.bdev.accel_sequence = seq; 5784 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 5785 bdev_io->u.bdev.nvme_cdw12.raw = nvme_cdw12_raw; 5786 bdev_io->u.bdev.nvme_cdw13.raw = nvme_cdw13_raw; 5787 5788 _bdev_io_submit_ext(desc, bdev_io); 5789 5790 return 0; 5791 } 5792 5793 int 5794 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5795 struct iovec *iov, int iovcnt, 5796 uint64_t offset, uint64_t len, 5797 spdk_bdev_io_completion_cb cb, void *cb_arg) 5798 { 5799 uint64_t offset_blocks, num_blocks; 5800 5801 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5802 len, &num_blocks) != 0) { 5803 return -EINVAL; 5804 } 5805 5806 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5807 } 5808 5809 int 5810 spdk_bdev_writev_blocks(struct spdk_bdev_desc 
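/*
 * Illustrative usage sketch (not part of the upstream code): writing one block together
 * with a separate metadata buffer. As the checks in spdk_bdev_write_blocks_with_md()
 * above show, this is only valid when the bdev formats metadata separately.
 * write_block_with_md and write_md_done are hypothetical; both buffers are assumed
 * to be DMA-capable.
 *
 * static int
 * write_block_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
 *                     void *data_buf, void *md_buf, uint64_t lba,
 *                     spdk_bdev_io_completion_cb write_md_done, void *cb_arg)
 * {
 *     struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
 *
 *     if (!spdk_bdev_is_md_separate(bdev) || spdk_bdev_get_md_size(bdev) == 0) {
 *         return -ENOTSUP;
 *     }
 *
 *     return spdk_bdev_write_blocks_with_md(desc, ch, data_buf, md_buf,
 *                                           lba, 1, write_md_done, cb_arg);
 * }
 */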
*desc, struct spdk_io_channel *ch, 5811 struct iovec *iov, int iovcnt, 5812 uint64_t offset_blocks, uint64_t num_blocks, 5813 spdk_bdev_io_completion_cb cb, void *cb_arg) 5814 { 5815 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5816 5817 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5818 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 5819 cb, cb_arg); 5820 } 5821 5822 int 5823 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5824 struct iovec *iov, int iovcnt, void *md_buf, 5825 uint64_t offset_blocks, uint64_t num_blocks, 5826 spdk_bdev_io_completion_cb cb, void *cb_arg) 5827 { 5828 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5829 5830 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 5831 return -EINVAL; 5832 } 5833 5834 if (md_buf && !_is_buf_allocated(iov)) { 5835 return -EINVAL; 5836 } 5837 5838 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5839 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 5840 cb, cb_arg); 5841 } 5842 5843 int 5844 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5845 struct iovec *iov, int iovcnt, 5846 uint64_t offset_blocks, uint64_t num_blocks, 5847 spdk_bdev_io_completion_cb cb, void *cb_arg, 5848 struct spdk_bdev_ext_io_opts *opts) 5849 { 5850 struct spdk_memory_domain *domain = NULL; 5851 struct spdk_accel_sequence *seq = NULL; 5852 void *domain_ctx = NULL, *md = NULL; 5853 uint32_t dif_check_flags = 0; 5854 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5855 uint32_t nvme_cdw12_raw = 0; 5856 uint32_t nvme_cdw13_raw = 0; 5857 5858 if (opts) { 5859 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5860 return -EINVAL; 5861 } 5862 md = opts->metadata; 5863 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5864 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5865 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5866 nvme_cdw12_raw = bdev_get_ext_io_opt(opts, nvme_cdw12.raw, 0); 5867 nvme_cdw13_raw = bdev_get_ext_io_opt(opts, nvme_cdw13.raw, 0); 5868 if (md) { 5869 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5870 return -EINVAL; 5871 } 5872 5873 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5874 return -EINVAL; 5875 } 5876 5877 if (spdk_unlikely(seq != NULL)) { 5878 return -EINVAL; 5879 } 5880 } 5881 } 5882 5883 dif_check_flags = bdev->dif_check_flags & 5884 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5885 5886 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5887 domain, domain_ctx, seq, dif_check_flags, 5888 nvme_cdw12_raw, nvme_cdw13_raw, cb, cb_arg); 5889 } 5890 5891 static void 5892 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5893 { 5894 struct spdk_bdev_io *parent_io = cb_arg; 5895 struct spdk_bdev *bdev = parent_io->bdev; 5896 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5897 int i, rc = 0; 5898 5899 if (!success) { 5900 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5901 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5902 spdk_bdev_free_io(bdev_io); 5903 return; 5904 } 5905 5906 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5907 rc = memcmp(read_buf, 5908 parent_io->u.bdev.iovs[i].iov_base, 5909 parent_io->u.bdev.iovs[i].iov_len); 5910 if (rc) { 5911 break; 5912 } 5913 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5914 } 5915 5916 if (rc == 0 && parent_io->u.bdev.md_buf && 
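/*
 * Illustrative usage sketch (not part of the upstream code): an extended vectored write
 * that passes NVMe CDW12/CDW13 through spdk_bdev_ext_io_opts, as consumed by
 * spdk_bdev_writev_blocks_ext() above. Whether the underlying module honors these
 * words depends on the bdev; ext_write and ext_write_done are hypothetical.
 *
 * static int
 * ext_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
 *           struct iovec *iov, int iovcnt, uint64_t offset_blocks,
 *           uint64_t num_blocks, uint32_t cdw12, uint32_t cdw13,
 *           spdk_bdev_io_completion_cb ext_write_done, void *cb_arg)
 * {
 *     struct spdk_bdev_ext_io_opts opts = {};
 *
 *     opts.size = sizeof(opts);
 *     opts.nvme_cdw12.raw = cdw12;
 *     opts.nvme_cdw13.raw = cdw13;
 *
 *     return spdk_bdev_writev_blocks_ext(desc, ch, iov, iovcnt, offset_blocks,
 *                                        num_blocks, ext_write_done, cb_arg, &opts);
 * }
 */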
spdk_bdev_is_md_separate(bdev)) { 5917 rc = memcmp(bdev_io->u.bdev.md_buf, 5918 parent_io->u.bdev.md_buf, 5919 spdk_bdev_get_md_size(bdev)); 5920 } 5921 5922 spdk_bdev_free_io(bdev_io); 5923 5924 if (rc == 0) { 5925 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5926 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5927 } else { 5928 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5929 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5930 } 5931 } 5932 5933 static void 5934 bdev_compare_do_read(void *_bdev_io) 5935 { 5936 struct spdk_bdev_io *bdev_io = _bdev_io; 5937 int rc; 5938 5939 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5940 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5941 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5942 bdev_compare_do_read_done, bdev_io); 5943 5944 if (rc == -ENOMEM) { 5945 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5946 } else if (rc != 0) { 5947 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5948 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5949 } 5950 } 5951 5952 static int 5953 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5954 struct iovec *iov, int iovcnt, void *md_buf, 5955 uint64_t offset_blocks, uint64_t num_blocks, 5956 spdk_bdev_io_completion_cb cb, void *cb_arg) 5957 { 5958 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5959 struct spdk_bdev_io *bdev_io; 5960 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5961 5962 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5963 return -EINVAL; 5964 } 5965 5966 bdev_io = bdev_channel_get_io(channel); 5967 if (!bdev_io) { 5968 return -ENOMEM; 5969 } 5970 5971 bdev_io->internal.ch = channel; 5972 bdev_io->internal.desc = desc; 5973 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5974 bdev_io->u.bdev.iovs = iov; 5975 bdev_io->u.bdev.iovcnt = iovcnt; 5976 bdev_io->u.bdev.md_buf = md_buf; 5977 bdev_io->u.bdev.num_blocks = num_blocks; 5978 bdev_io->u.bdev.offset_blocks = offset_blocks; 5979 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5980 bdev_io->u.bdev.memory_domain = NULL; 5981 bdev_io->u.bdev.memory_domain_ctx = NULL; 5982 bdev_io->u.bdev.accel_sequence = NULL; 5983 5984 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5985 bdev_io_submit(bdev_io); 5986 return 0; 5987 } 5988 5989 bdev_compare_do_read(bdev_io); 5990 5991 return 0; 5992 } 5993 5994 int 5995 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5996 struct iovec *iov, int iovcnt, 5997 uint64_t offset_blocks, uint64_t num_blocks, 5998 spdk_bdev_io_completion_cb cb, void *cb_arg) 5999 { 6000 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 6001 num_blocks, cb, cb_arg); 6002 } 6003 6004 int 6005 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6006 struct iovec *iov, int iovcnt, void *md_buf, 6007 uint64_t offset_blocks, uint64_t num_blocks, 6008 spdk_bdev_io_completion_cb cb, void *cb_arg) 6009 { 6010 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 6011 return -EINVAL; 6012 } 6013 6014 if (md_buf && !_is_buf_allocated(iov)) { 6015 return -EINVAL; 6016 } 6017 6018 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 6019 num_blocks, cb, cb_arg); 6020 } 6021 6022 static int 6023 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6024 void 
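/*
 * Illustrative usage sketch (not part of the upstream code): submitting a compare and
 * handling a mismatch in the completion callback. When the bdev lacks native COMPARE
 * support, the code above emulates it with a read followed by memcmp(). compare_done,
 * compare_range and expected_iov are hypothetical.
 *
 * static void
 * compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 * {
 *     if (!success) {
 *         // A miscompare surfaces as a failed I/O; the precise status can be
 *         // inspected, e.g. via spdk_bdev_io_get_nvme_status().
 *         SPDK_ERRLOG("compare failed or data mismatched\n");
 *     }
 *     spdk_bdev_free_io(bdev_io);
 * }
 *
 * static int
 * compare_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
 *               struct iovec *expected_iov, int iovcnt,
 *               uint64_t offset_blocks, uint64_t num_blocks)
 * {
 *     return spdk_bdev_comparev_blocks(desc, ch, expected_iov, iovcnt,
 *                                      offset_blocks, num_blocks, compare_done, NULL);
 * }
 */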
*buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 6025 spdk_bdev_io_completion_cb cb, void *cb_arg) 6026 { 6027 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6028 struct spdk_bdev_io *bdev_io; 6029 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6030 6031 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6032 return -EINVAL; 6033 } 6034 6035 bdev_io = bdev_channel_get_io(channel); 6036 if (!bdev_io) { 6037 return -ENOMEM; 6038 } 6039 6040 bdev_io->internal.ch = channel; 6041 bdev_io->internal.desc = desc; 6042 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 6043 bdev_io->u.bdev.iovs = &bdev_io->iov; 6044 bdev_io->u.bdev.iovs[0].iov_base = buf; 6045 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 6046 bdev_io->u.bdev.iovcnt = 1; 6047 bdev_io->u.bdev.md_buf = md_buf; 6048 bdev_io->u.bdev.num_blocks = num_blocks; 6049 bdev_io->u.bdev.offset_blocks = offset_blocks; 6050 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6051 bdev_io->u.bdev.memory_domain = NULL; 6052 bdev_io->u.bdev.memory_domain_ctx = NULL; 6053 bdev_io->u.bdev.accel_sequence = NULL; 6054 6055 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 6056 bdev_io_submit(bdev_io); 6057 return 0; 6058 } 6059 6060 bdev_compare_do_read(bdev_io); 6061 6062 return 0; 6063 } 6064 6065 int 6066 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6067 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 6068 spdk_bdev_io_completion_cb cb, void *cb_arg) 6069 { 6070 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 6071 cb, cb_arg); 6072 } 6073 6074 int 6075 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6076 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 6077 spdk_bdev_io_completion_cb cb, void *cb_arg) 6078 { 6079 struct iovec iov = { 6080 .iov_base = buf, 6081 }; 6082 6083 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 6084 return -EINVAL; 6085 } 6086 6087 if (md_buf && !_is_buf_allocated(&iov)) { 6088 return -EINVAL; 6089 } 6090 6091 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 6092 cb, cb_arg); 6093 } 6094 6095 static void 6096 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status) 6097 { 6098 struct spdk_bdev_io *bdev_io = ctx; 6099 6100 if (unlock_status) { 6101 SPDK_ERRLOG("LBA range unlock failed\n"); 6102 } 6103 6104 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 6105 false, bdev_io->internal.caller_ctx); 6106 } 6107 6108 static void 6109 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 6110 { 6111 bdev_io->internal.status = status; 6112 6113 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 6114 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6115 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 6116 } 6117 6118 static void 6119 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6120 { 6121 struct spdk_bdev_io *parent_io = cb_arg; 6122 6123 if (!success) { 6124 SPDK_ERRLOG("Compare and write operation failed\n"); 6125 } 6126 6127 spdk_bdev_free_io(bdev_io); 6128 6129 bdev_comparev_and_writev_blocks_unlock(parent_io, 6130 success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 6131 } 6132 6133 static void 6134 bdev_compare_and_write_do_write(void *_bdev_io) 6135 { 6136 struct spdk_bdev_io *bdev_io = _bdev_io; 6137 int rc; 6138 6139 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 6140 spdk_io_channel_from_ctx(bdev_io->internal.ch), 6141 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 6142 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6143 bdev_compare_and_write_do_write_done, bdev_io); 6144 6145 6146 if (rc == -ENOMEM) { 6147 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 6148 } else if (rc != 0) { 6149 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6150 } 6151 } 6152 6153 static void 6154 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6155 { 6156 struct spdk_bdev_io *parent_io = cb_arg; 6157 6158 spdk_bdev_free_io(bdev_io); 6159 6160 if (!success) { 6161 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 6162 return; 6163 } 6164 6165 bdev_compare_and_write_do_write(parent_io); 6166 } 6167 6168 static void 6169 bdev_compare_and_write_do_compare(void *_bdev_io) 6170 { 6171 struct spdk_bdev_io *bdev_io = _bdev_io; 6172 int rc; 6173 6174 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 6175 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 6176 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6177 bdev_compare_and_write_do_compare_done, bdev_io); 6178 6179 if (rc == -ENOMEM) { 6180 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 6181 } else if (rc != 0) { 6182 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 6183 } 6184 } 6185 6186 static void 6187 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status) 6188 { 6189 struct spdk_bdev_io *bdev_io = ctx; 6190 6191 if (status) { 6192 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 6193 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6194 return; 6195 } 6196 6197 bdev_compare_and_write_do_compare(bdev_io); 6198 } 6199 6200 int 6201 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6202 struct iovec *compare_iov, int compare_iovcnt, 6203 struct iovec *write_iov, int write_iovcnt, 6204 uint64_t offset_blocks, uint64_t num_blocks, 6205 spdk_bdev_io_completion_cb cb, void *cb_arg) 6206 { 6207 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6208 struct spdk_bdev_io *bdev_io; 6209 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6210 6211 if (!desc->write) { 6212 return -EBADF; 6213 } 6214 6215 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6216 return -EINVAL; 6217 } 6218 6219 if (num_blocks > bdev->acwu) { 6220 return -EINVAL; 6221 } 6222 6223 bdev_io = bdev_channel_get_io(channel); 6224 if (!bdev_io) { 6225 return -ENOMEM; 6226 } 6227 6228 bdev_io->internal.ch = channel; 6229 bdev_io->internal.desc = desc; 6230 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 6231 bdev_io->u.bdev.iovs = compare_iov; 6232 bdev_io->u.bdev.iovcnt = compare_iovcnt; 6233 bdev_io->u.bdev.fused_iovs = write_iov; 6234 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 6235 bdev_io->u.bdev.md_buf = NULL; 6236 bdev_io->u.bdev.num_blocks = num_blocks; 6237 bdev_io->u.bdev.offset_blocks = offset_blocks; 6238 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6239 
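/* Descriptive note (not in the upstream code): for compare-and-write the compare payload
 * travels in the regular iovs while the write payload is carried in fused_iovs/fused_iovcnt.
 * If the module does not support COMPARE_AND_WRITE natively, the LBA range is locked and the
 * operation is emulated as a compare (bdev_compare_and_write_do_compare) followed by a write
 * (bdev_compare_and_write_do_write), with the range unlocked on completion.
 */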
bdev_io->u.bdev.memory_domain = NULL; 6240 bdev_io->u.bdev.memory_domain_ctx = NULL; 6241 bdev_io->u.bdev.accel_sequence = NULL; 6242 6243 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 6244 bdev_io_submit(bdev_io); 6245 return 0; 6246 } 6247 6248 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 6249 bdev_comparev_and_writev_blocks_locked, bdev_io); 6250 } 6251 6252 int 6253 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6254 struct iovec *iov, int iovcnt, 6255 uint64_t offset_blocks, uint64_t num_blocks, 6256 bool populate, 6257 spdk_bdev_io_completion_cb cb, void *cb_arg) 6258 { 6259 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6260 struct spdk_bdev_io *bdev_io; 6261 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6262 6263 if (!desc->write) { 6264 return -EBADF; 6265 } 6266 6267 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6268 return -EINVAL; 6269 } 6270 6271 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 6272 return -ENOTSUP; 6273 } 6274 6275 bdev_io = bdev_channel_get_io(channel); 6276 if (!bdev_io) { 6277 return -ENOMEM; 6278 } 6279 6280 bdev_io->internal.ch = channel; 6281 bdev_io->internal.desc = desc; 6282 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 6283 bdev_io->u.bdev.num_blocks = num_blocks; 6284 bdev_io->u.bdev.offset_blocks = offset_blocks; 6285 bdev_io->u.bdev.iovs = iov; 6286 bdev_io->u.bdev.iovcnt = iovcnt; 6287 bdev_io->u.bdev.md_buf = NULL; 6288 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 6289 bdev_io->u.bdev.zcopy.commit = 0; 6290 bdev_io->u.bdev.zcopy.start = 1; 6291 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6292 bdev_io->u.bdev.memory_domain = NULL; 6293 bdev_io->u.bdev.memory_domain_ctx = NULL; 6294 bdev_io->u.bdev.accel_sequence = NULL; 6295 6296 bdev_io_submit(bdev_io); 6297 6298 return 0; 6299 } 6300 6301 int 6302 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 6303 spdk_bdev_io_completion_cb cb, void *cb_arg) 6304 { 6305 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 6306 return -EINVAL; 6307 } 6308 6309 bdev_io->u.bdev.zcopy.commit = commit ? 
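/*
 * Illustrative usage sketch (not part of the upstream code): the two-phase zero-copy flow.
 * spdk_bdev_zcopy_start() asks the module for buffers describing the region (populate=true
 * fills them with the current data); once the caller has produced or consumed the data,
 * spdk_bdev_zcopy_end() commits or abandons it. zcopy_write, zcopy_start_done and
 * zcopy_end_done are hypothetical.
 *
 * static void
 * zcopy_end_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 * {
 *     spdk_bdev_free_io(bdev_io);
 * }
 *
 * static void
 * zcopy_start_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 * {
 *     if (!success) {
 *         spdk_bdev_free_io(bdev_io);
 *         return;
 *     }
 *     // bdev_io->u.bdev.iovs now describes the module-owned buffers; fill them
 *     // here, then commit the data with the second phase.
 *     spdk_bdev_zcopy_end(bdev_io, true, zcopy_end_done, NULL);
 * }
 *
 * static int
 * zcopy_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
 *             uint64_t offset_blocks, uint64_t num_blocks)
 * {
 *     return spdk_bdev_zcopy_start(desc, ch, NULL, 0, offset_blocks, num_blocks,
 *                                  false, zcopy_start_done, NULL);
 * }
 */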
1 : 0; 6310 bdev_io->u.bdev.zcopy.start = 0; 6311 bdev_io->internal.caller_ctx = cb_arg; 6312 bdev_io->internal.cb = cb; 6313 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 6314 6315 bdev_io_submit(bdev_io); 6316 6317 return 0; 6318 } 6319 6320 int 6321 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6322 uint64_t offset, uint64_t len, 6323 spdk_bdev_io_completion_cb cb, void *cb_arg) 6324 { 6325 uint64_t offset_blocks, num_blocks; 6326 6327 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6328 len, &num_blocks) != 0) { 6329 return -EINVAL; 6330 } 6331 6332 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6333 } 6334 6335 int 6336 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6337 uint64_t offset_blocks, uint64_t num_blocks, 6338 spdk_bdev_io_completion_cb cb, void *cb_arg) 6339 { 6340 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6341 struct spdk_bdev_io *bdev_io; 6342 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6343 6344 if (!desc->write) { 6345 return -EBADF; 6346 } 6347 6348 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6349 return -EINVAL; 6350 } 6351 6352 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 6353 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 6354 return -ENOTSUP; 6355 } 6356 6357 bdev_io = bdev_channel_get_io(channel); 6358 6359 if (!bdev_io) { 6360 return -ENOMEM; 6361 } 6362 6363 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 6364 bdev_io->internal.ch = channel; 6365 bdev_io->internal.desc = desc; 6366 bdev_io->u.bdev.offset_blocks = offset_blocks; 6367 bdev_io->u.bdev.num_blocks = num_blocks; 6368 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6369 bdev_io->u.bdev.memory_domain = NULL; 6370 bdev_io->u.bdev.memory_domain_ctx = NULL; 6371 bdev_io->u.bdev.accel_sequence = NULL; 6372 6373 /* If the write_zeroes size is large and should be split, use the generic split 6374 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEREOS is supported or not. 6375 * 6376 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported 6377 * or emulate it using regular write request otherwise. 
6378 */ 6379 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 6380 bdev_io->internal.f.split) { 6381 bdev_io_submit(bdev_io); 6382 return 0; 6383 } 6384 6385 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6386 6387 return bdev_write_zero_buffer(bdev_io); 6388 } 6389 6390 int 6391 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6392 uint64_t offset, uint64_t nbytes, 6393 spdk_bdev_io_completion_cb cb, void *cb_arg) 6394 { 6395 uint64_t offset_blocks, num_blocks; 6396 6397 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6398 nbytes, &num_blocks) != 0) { 6399 return -EINVAL; 6400 } 6401 6402 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6403 } 6404 6405 static void 6406 bdev_io_complete_cb(void *ctx) 6407 { 6408 struct spdk_bdev_io *bdev_io = ctx; 6409 6410 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6411 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 6412 } 6413 6414 int 6415 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6416 uint64_t offset_blocks, uint64_t num_blocks, 6417 spdk_bdev_io_completion_cb cb, void *cb_arg) 6418 { 6419 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6420 struct spdk_bdev_io *bdev_io; 6421 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6422 6423 if (!desc->write) { 6424 return -EBADF; 6425 } 6426 6427 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6428 return -EINVAL; 6429 } 6430 6431 bdev_io = bdev_channel_get_io(channel); 6432 if (!bdev_io) { 6433 return -ENOMEM; 6434 } 6435 6436 bdev_io->internal.ch = channel; 6437 bdev_io->internal.desc = desc; 6438 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6439 6440 bdev_io->u.bdev.iovs = &bdev_io->iov; 6441 bdev_io->u.bdev.iovs[0].iov_base = NULL; 6442 bdev_io->u.bdev.iovs[0].iov_len = 0; 6443 bdev_io->u.bdev.iovcnt = 1; 6444 6445 bdev_io->u.bdev.offset_blocks = offset_blocks; 6446 bdev_io->u.bdev.num_blocks = num_blocks; 6447 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6448 bdev_io->u.bdev.memory_domain = NULL; 6449 bdev_io->u.bdev.memory_domain_ctx = NULL; 6450 bdev_io->u.bdev.accel_sequence = NULL; 6451 6452 if (num_blocks == 0) { 6453 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 6454 return 0; 6455 } 6456 6457 bdev_io_submit(bdev_io); 6458 return 0; 6459 } 6460 6461 int 6462 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6463 uint64_t offset, uint64_t length, 6464 spdk_bdev_io_completion_cb cb, void *cb_arg) 6465 { 6466 uint64_t offset_blocks, num_blocks; 6467 6468 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6469 length, &num_blocks) != 0) { 6470 return -EINVAL; 6471 } 6472 6473 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6474 } 6475 6476 int 6477 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6478 uint64_t offset_blocks, uint64_t num_blocks, 6479 spdk_bdev_io_completion_cb cb, void *cb_arg) 6480 { 6481 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6482 struct spdk_bdev_io *bdev_io; 6483 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6484 6485 if (!desc->write) { 6486 return -EBADF; 6487 } 6488 6489 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH))) { 6490 return -ENOTSUP; 6491 } 6492 6493 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6494 return -EINVAL; 6495 } 6496 6497 bdev_io = 
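/*
 * Illustrative usage sketch (not part of the upstream code): trimming a block range with
 * spdk_bdev_unmap_blocks(). Note that an unmap of zero blocks completes successfully
 * without reaching the module, and that the byte-based wrapper spdk_bdev_unmap() requires
 * offset/length to be multiples of the block size. unmap_done and trim_range are hypothetical.
 *
 * static void
 * unmap_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 * {
 *     if (!success) {
 *         SPDK_ERRLOG("unmap failed\n");
 *     }
 *     spdk_bdev_free_io(bdev_io);
 * }
 *
 * static int
 * trim_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
 *            uint64_t offset_blocks, uint64_t num_blocks)
 * {
 *     return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks,
 *                                   unmap_done, NULL);
 * }
 */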
bdev_channel_get_io(channel); 6498 if (!bdev_io) { 6499 return -ENOMEM; 6500 } 6501 6502 bdev_io->internal.ch = channel; 6503 bdev_io->internal.desc = desc; 6504 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 6505 bdev_io->u.bdev.iovs = NULL; 6506 bdev_io->u.bdev.iovcnt = 0; 6507 bdev_io->u.bdev.offset_blocks = offset_blocks; 6508 bdev_io->u.bdev.num_blocks = num_blocks; 6509 bdev_io->u.bdev.memory_domain = NULL; 6510 bdev_io->u.bdev.memory_domain_ctx = NULL; 6511 bdev_io->u.bdev.accel_sequence = NULL; 6512 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6513 6514 bdev_io_submit(bdev_io); 6515 return 0; 6516 } 6517 6518 static int bdev_reset_poll_for_outstanding_io(void *ctx); 6519 6520 static void 6521 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 6522 { 6523 struct spdk_bdev_io *bdev_io = _ctx; 6524 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 6525 6526 if (status == -EBUSY) { 6527 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 6528 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 6529 bdev_io, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 6530 } else { 6531 if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) { 6532 /* If outstanding IOs are still present and reset_io_drain_timeout 6533 * seconds passed, start the reset. */ 6534 bdev_io_submit_reset(bdev_io); 6535 } else { 6536 /* We still have in progress memory domain pull/push or we're 6537 * executing accel sequence. Since we cannot abort either of those 6538 * operations, fail the reset request. */ 6539 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6540 } 6541 } 6542 } else { 6543 SPDK_DEBUGLOG(bdev, 6544 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 6545 ch->bdev->name); 6546 /* Mark the completion status as a SUCCESS and complete the reset. */ 6547 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 6548 } 6549 } 6550 6551 static void 6552 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6553 struct spdk_io_channel *io_ch, void *_ctx) 6554 { 6555 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 6556 int status = 0; 6557 6558 if (cur_ch->io_outstanding > 0 || 6559 !TAILQ_EMPTY(&cur_ch->io_memory_domain) || 6560 !TAILQ_EMPTY(&cur_ch->io_accel_exec)) { 6561 /* If a channel has outstanding IO, set status to -EBUSY code. This will stop 6562 * further iteration over the rest of the channels and pass non-zero status 6563 * to the callback function. 
*/ 6564 status = -EBUSY; 6565 } 6566 spdk_bdev_for_each_channel_continue(i, status); 6567 } 6568 6569 static int 6570 bdev_reset_poll_for_outstanding_io(void *ctx) 6571 { 6572 struct spdk_bdev_io *bdev_io = ctx; 6573 6574 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6575 spdk_bdev_for_each_channel(bdev_io->bdev, bdev_reset_check_outstanding_io, bdev_io, 6576 bdev_reset_check_outstanding_io_done); 6577 6578 return SPDK_POLLER_BUSY; 6579 } 6580 6581 static void 6582 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6583 { 6584 struct spdk_bdev_io *bdev_io = _ctx; 6585 6586 if (bdev->reset_io_drain_timeout == 0) { 6587 bdev_io_submit_reset(bdev_io); 6588 return; 6589 } 6590 6591 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6592 (bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6593 6594 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6595 * submit the reset to the underlying module only if outstanding I/O 6596 * remain after reset_io_drain_timeout seconds have passed. */ 6597 spdk_bdev_for_each_channel(bdev, bdev_reset_check_outstanding_io, bdev_io, 6598 bdev_reset_check_outstanding_io_done); 6599 } 6600 6601 static void 6602 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6603 struct spdk_io_channel *ch, void *_ctx) 6604 { 6605 struct spdk_bdev_channel *channel; 6606 struct spdk_bdev_mgmt_channel *mgmt_channel; 6607 struct spdk_bdev_shared_resource *shared_resource; 6608 bdev_io_tailq_t tmp_queued; 6609 6610 TAILQ_INIT(&tmp_queued); 6611 6612 channel = __io_ch_to_bdev_ch(ch); 6613 shared_resource = channel->shared_resource; 6614 mgmt_channel = shared_resource->mgmt_ch; 6615 6616 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6617 6618 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6619 TAILQ_SWAP(&channel->qos_queued_io, &tmp_queued, spdk_bdev_io, internal.link); 6620 } 6621 6622 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6623 bdev_abort_all_buf_io(mgmt_channel, channel); 6624 bdev_abort_all_queued_io(&tmp_queued, channel); 6625 6626 spdk_bdev_for_each_channel_continue(i, 0); 6627 } 6628 6629 static void 6630 bdev_start_reset(struct spdk_bdev_io *bdev_io) 6631 { 6632 struct spdk_bdev *bdev = bdev_io->bdev; 6633 bool freeze_channel = false; 6634 6635 bdev_ch_add_to_io_submitted(bdev_io); 6636 6637 /** 6638 * Take a channel reference for the target bdev for the life of this 6639 * reset. This guards against the channel getting destroyed before 6640 * the reset is completed. We will release the reference when this 6641 * reset is completed. 
6642 */ 6643 bdev_io->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6644 6645 spdk_spin_lock(&bdev->internal.spinlock); 6646 if (bdev->internal.reset_in_progress == NULL) { 6647 bdev->internal.reset_in_progress = bdev_io; 6648 freeze_channel = true; 6649 } else { 6650 TAILQ_INSERT_TAIL(&bdev->internal.queued_resets, bdev_io, internal.link); 6651 } 6652 spdk_spin_unlock(&bdev->internal.spinlock); 6653 6654 if (freeze_channel) { 6655 spdk_bdev_for_each_channel(bdev, bdev_reset_freeze_channel, bdev_io, 6656 bdev_reset_freeze_channel_done); 6657 } 6658 } 6659 6660 int 6661 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6662 spdk_bdev_io_completion_cb cb, void *cb_arg) 6663 { 6664 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6665 struct spdk_bdev_io *bdev_io; 6666 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6667 6668 bdev_io = bdev_channel_get_io(channel); 6669 if (!bdev_io) { 6670 return -ENOMEM; 6671 } 6672 6673 bdev_io->internal.ch = channel; 6674 bdev_io->internal.desc = desc; 6675 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6676 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6677 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6678 6679 bdev_start_reset(bdev_io); 6680 return 0; 6681 } 6682 6683 void 6684 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6685 struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode reset_mode) 6686 { 6687 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6688 6689 bdev_get_io_stat(stat, channel->stat); 6690 spdk_bdev_reset_io_stat(channel->stat, reset_mode); 6691 } 6692 6693 static void 6694 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6695 { 6696 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6697 6698 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6699 bdev_iostat_ctx->cb_arg, 0); 6700 free(bdev_iostat_ctx); 6701 } 6702 6703 static void 6704 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6705 struct spdk_io_channel *ch, void *_ctx) 6706 { 6707 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6708 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6709 6710 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6711 spdk_bdev_reset_io_stat(channel->stat, bdev_iostat_ctx->reset_mode); 6712 spdk_bdev_for_each_channel_continue(i, 0); 6713 } 6714 6715 void 6716 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6717 enum spdk_bdev_reset_stat_mode reset_mode, spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6718 { 6719 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6720 6721 assert(bdev != NULL); 6722 assert(stat != NULL); 6723 assert(cb != NULL); 6724 6725 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6726 if (bdev_iostat_ctx == NULL) { 6727 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6728 cb(bdev, stat, cb_arg, -ENOMEM); 6729 return; 6730 } 6731 6732 bdev_iostat_ctx->stat = stat; 6733 bdev_iostat_ctx->cb = cb; 6734 bdev_iostat_ctx->cb_arg = cb_arg; 6735 bdev_iostat_ctx->reset_mode = reset_mode; 6736 6737 /* Start with the statistics from previously deleted channels. */ 6738 spdk_spin_lock(&bdev->internal.spinlock); 6739 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6740 spdk_bdev_reset_io_stat(bdev->internal.stat, reset_mode); 6741 spdk_spin_unlock(&bdev->internal.spinlock); 6742 6743 /* Then iterate and add the statistics from each existing channel. 
*/ 6744 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6745 bdev_get_device_stat_done); 6746 } 6747 6748 struct bdev_iostat_reset_ctx { 6749 enum spdk_bdev_reset_stat_mode mode; 6750 bdev_reset_device_stat_cb cb; 6751 void *cb_arg; 6752 }; 6753 6754 static void 6755 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6756 { 6757 struct bdev_iostat_reset_ctx *ctx = _ctx; 6758 6759 ctx->cb(bdev, ctx->cb_arg, 0); 6760 6761 free(ctx); 6762 } 6763 6764 static void 6765 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6766 struct spdk_io_channel *ch, void *_ctx) 6767 { 6768 struct bdev_iostat_reset_ctx *ctx = _ctx; 6769 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6770 6771 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6772 6773 spdk_bdev_for_each_channel_continue(i, 0); 6774 } 6775 6776 void 6777 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6778 bdev_reset_device_stat_cb cb, void *cb_arg) 6779 { 6780 struct bdev_iostat_reset_ctx *ctx; 6781 6782 assert(bdev != NULL); 6783 assert(cb != NULL); 6784 6785 ctx = calloc(1, sizeof(*ctx)); 6786 if (ctx == NULL) { 6787 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6788 cb(bdev, cb_arg, -ENOMEM); 6789 return; 6790 } 6791 6792 ctx->mode = mode; 6793 ctx->cb = cb; 6794 ctx->cb_arg = cb_arg; 6795 6796 spdk_spin_lock(&bdev->internal.spinlock); 6797 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6798 spdk_spin_unlock(&bdev->internal.spinlock); 6799 6800 spdk_bdev_for_each_channel(bdev, 6801 bdev_reset_each_channel_stat, 6802 ctx, 6803 bdev_reset_device_stat_done); 6804 } 6805 6806 int 6807 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6808 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6809 spdk_bdev_io_completion_cb cb, void *cb_arg) 6810 { 6811 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6812 struct spdk_bdev_io *bdev_io; 6813 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6814 6815 if (!desc->write) { 6816 return -EBADF; 6817 } 6818 6819 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6820 return -ENOTSUP; 6821 } 6822 6823 bdev_io = bdev_channel_get_io(channel); 6824 if (!bdev_io) { 6825 return -ENOMEM; 6826 } 6827 6828 bdev_io->internal.ch = channel; 6829 bdev_io->internal.desc = desc; 6830 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6831 bdev_io->u.nvme_passthru.cmd = *cmd; 6832 bdev_io->u.nvme_passthru.buf = buf; 6833 bdev_io->u.nvme_passthru.nbytes = nbytes; 6834 bdev_io->u.nvme_passthru.md_buf = NULL; 6835 bdev_io->u.nvme_passthru.md_len = 0; 6836 6837 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6838 6839 bdev_io_submit(bdev_io); 6840 return 0; 6841 } 6842 6843 int 6844 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6845 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6846 spdk_bdev_io_completion_cb cb, void *cb_arg) 6847 { 6848 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6849 struct spdk_bdev_io *bdev_io; 6850 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6851 6852 if (!desc->write) { 6853 /* 6854 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6855 * to easily determine if the command is a read or write, but for now just 6856 * do not allow io_passthru with a read-only descriptor. 
6857 */ 6858 return -EBADF; 6859 } 6860 6861 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6862 return -ENOTSUP; 6863 } 6864 6865 bdev_io = bdev_channel_get_io(channel); 6866 if (!bdev_io) { 6867 return -ENOMEM; 6868 } 6869 6870 bdev_io->internal.ch = channel; 6871 bdev_io->internal.desc = desc; 6872 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6873 bdev_io->u.nvme_passthru.cmd = *cmd; 6874 bdev_io->u.nvme_passthru.buf = buf; 6875 bdev_io->u.nvme_passthru.nbytes = nbytes; 6876 bdev_io->u.nvme_passthru.md_buf = NULL; 6877 bdev_io->u.nvme_passthru.md_len = 0; 6878 6879 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6880 6881 bdev_io_submit(bdev_io); 6882 return 0; 6883 } 6884 6885 int 6886 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6887 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6888 spdk_bdev_io_completion_cb cb, void *cb_arg) 6889 { 6890 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6891 struct spdk_bdev_io *bdev_io; 6892 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6893 6894 if (!desc->write) { 6895 /* 6896 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6897 * to easily determine if the command is a read or write, but for now just 6898 * do not allow io_passthru with a read-only descriptor. 6899 */ 6900 return -EBADF; 6901 } 6902 6903 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6904 return -ENOTSUP; 6905 } 6906 6907 bdev_io = bdev_channel_get_io(channel); 6908 if (!bdev_io) { 6909 return -ENOMEM; 6910 } 6911 6912 bdev_io->internal.ch = channel; 6913 bdev_io->internal.desc = desc; 6914 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6915 bdev_io->u.nvme_passthru.cmd = *cmd; 6916 bdev_io->u.nvme_passthru.buf = buf; 6917 bdev_io->u.nvme_passthru.nbytes = nbytes; 6918 bdev_io->u.nvme_passthru.md_buf = md_buf; 6919 bdev_io->u.nvme_passthru.md_len = md_len; 6920 6921 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6922 6923 bdev_io_submit(bdev_io); 6924 return 0; 6925 } 6926 6927 int 6928 spdk_bdev_nvme_iov_passthru_md(struct spdk_bdev_desc *desc, 6929 struct spdk_io_channel *ch, 6930 const struct spdk_nvme_cmd *cmd, 6931 struct iovec *iov, int iovcnt, size_t nbytes, 6932 void *md_buf, size_t md_len, 6933 spdk_bdev_io_completion_cb cb, void *cb_arg) 6934 { 6935 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6936 struct spdk_bdev_io *bdev_io; 6937 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6938 6939 if (!desc->write) { 6940 /* 6941 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6942 * to easily determine if the command is a read or write, but for now just 6943 * do not allow io_passthru with a read-only descriptor. 
6944 */ 6945 return -EBADF; 6946 } 6947 6948 if (md_buf && spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6949 return -ENOTSUP; 6950 } else if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6951 return -ENOTSUP; 6952 } 6953 6954 bdev_io = bdev_channel_get_io(channel); 6955 if (!bdev_io) { 6956 return -ENOMEM; 6957 } 6958 6959 bdev_io->internal.ch = channel; 6960 bdev_io->internal.desc = desc; 6961 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IOV_MD; 6962 bdev_io->u.nvme_passthru.cmd = *cmd; 6963 bdev_io->u.nvme_passthru.iovs = iov; 6964 bdev_io->u.nvme_passthru.iovcnt = iovcnt; 6965 bdev_io->u.nvme_passthru.nbytes = nbytes; 6966 bdev_io->u.nvme_passthru.md_buf = md_buf; 6967 bdev_io->u.nvme_passthru.md_len = md_len; 6968 6969 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6970 6971 bdev_io_submit(bdev_io); 6972 return 0; 6973 } 6974 6975 static void bdev_abort_retry(void *ctx); 6976 static void bdev_abort(struct spdk_bdev_io *parent_io); 6977 6978 static void 6979 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6980 { 6981 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6982 struct spdk_bdev_io *parent_io = cb_arg; 6983 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6984 6985 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6986 6987 spdk_bdev_free_io(bdev_io); 6988 6989 if (!success) { 6990 /* Check if the target I/O completed in the meantime. */ 6991 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6992 if (tmp_io == bio_to_abort) { 6993 break; 6994 } 6995 } 6996 6997 /* If the target I/O still exists, set the parent to failed. */ 6998 if (tmp_io != NULL) { 6999 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7000 } 7001 } 7002 7003 assert(parent_io->internal.f.split); 7004 7005 parent_io->internal.split.outstanding--; 7006 if (parent_io->internal.split.outstanding == 0) { 7007 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7008 bdev_abort_retry(parent_io); 7009 } else { 7010 bdev_io_complete(parent_io); 7011 } 7012 } 7013 } 7014 7015 static int 7016 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 7017 struct spdk_bdev_io *bio_to_abort, 7018 spdk_bdev_io_completion_cb cb, void *cb_arg) 7019 { 7020 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7021 struct spdk_bdev_io *bdev_io; 7022 7023 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 7024 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 7025 /* TODO: Abort reset or abort request. */ 7026 return -ENOTSUP; 7027 } 7028 7029 bdev_io = bdev_channel_get_io(channel); 7030 if (bdev_io == NULL) { 7031 return -ENOMEM; 7032 } 7033 7034 bdev_io->internal.ch = channel; 7035 bdev_io->internal.desc = desc; 7036 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7037 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7038 7039 if (bio_to_abort->internal.f.split) { 7040 assert(bdev_io_should_split(bio_to_abort)); 7041 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 7042 7043 /* Parent abort request is not submitted directly, but to manage its 7044 * execution add it to the submitted list here. 7045 */ 7046 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7047 bdev_ch_add_to_io_submitted(bdev_io); 7048 7049 bdev_abort(bdev_io); 7050 7051 return 0; 7052 } 7053 7054 bdev_io->u.abort.bio_to_abort = bio_to_abort; 7055 7056 /* Submit the abort request to the underlying bdev module. 
*/ 7057 bdev_io_submit(bdev_io); 7058 7059 return 0; 7060 } 7061 7062 static bool 7063 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq) 7064 { 7065 struct spdk_bdev_io *iter; 7066 7067 TAILQ_FOREACH(iter, tailq, internal.link) { 7068 if (iter == bdev_io) { 7069 return true; 7070 } 7071 } 7072 7073 return false; 7074 } 7075 7076 static uint32_t 7077 _bdev_abort(struct spdk_bdev_io *parent_io) 7078 { 7079 struct spdk_bdev_desc *desc = parent_io->internal.desc; 7080 struct spdk_bdev_channel *channel = parent_io->internal.ch; 7081 void *bio_cb_arg; 7082 struct spdk_bdev_io *bio_to_abort; 7083 uint32_t matched_ios; 7084 int rc; 7085 7086 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 7087 7088 /* matched_ios is returned and will be kept by the caller. 7089 * 7090 * This function will be used for two cases, 1) the same cb_arg is used for 7091 * multiple I/Os, 2) a single large I/O is split into smaller ones. 7092 * Incrementing split_outstanding directly here may confuse readers especially 7093 * for the 1st case. 7094 * 7095 * Completion of I/O abort is processed after stack unwinding. Hence this trick 7096 * works as expected. 7097 */ 7098 matched_ios = 0; 7099 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 7100 7101 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 7102 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 7103 continue; 7104 } 7105 7106 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 7107 /* Any I/O which was submitted after this abort command should be excluded. */ 7108 continue; 7109 } 7110 7111 /* We can't abort a request that's being pushed/pulled or executed by accel */ 7112 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) || 7113 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) { 7114 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7115 break; 7116 } 7117 7118 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 7119 if (rc != 0) { 7120 if (rc == -ENOMEM) { 7121 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 7122 } else { 7123 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7124 } 7125 break; 7126 } 7127 matched_ios++; 7128 } 7129 7130 return matched_ios; 7131 } 7132 7133 static void 7134 bdev_abort_retry(void *ctx) 7135 { 7136 struct spdk_bdev_io *parent_io = ctx; 7137 uint32_t matched_ios; 7138 7139 matched_ios = _bdev_abort(parent_io); 7140 7141 if (matched_ios == 0) { 7142 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7143 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 7144 } else { 7145 /* For retry, the case that no target I/O was found is success 7146 * because it means target I/Os completed in the meantime. 7147 */ 7148 bdev_io_complete(parent_io); 7149 } 7150 return; 7151 } 7152 7153 /* Use split_outstanding to manage the progress of aborting I/Os. */ 7154 parent_io->internal.f.split = true; 7155 parent_io->internal.split.outstanding = matched_ios; 7156 } 7157 7158 static void 7159 bdev_abort(struct spdk_bdev_io *parent_io) 7160 { 7161 uint32_t matched_ios; 7162 7163 matched_ios = _bdev_abort(parent_io); 7164 7165 if (matched_ios == 0) { 7166 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7167 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 7168 } else { 7169 /* The case the no target I/O was found is failure. 
*/ 7170 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7171 bdev_io_complete(parent_io); 7172 } 7173 return; 7174 } 7175 7176 /* Use split_outstanding to manage the progress of aborting I/Os. */ 7177 parent_io->internal.f.split = true; 7178 parent_io->internal.split.outstanding = matched_ios; 7179 } 7180 7181 int 7182 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 7183 void *bio_cb_arg, 7184 spdk_bdev_io_completion_cb cb, void *cb_arg) 7185 { 7186 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7187 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7188 struct spdk_bdev_io *bdev_io; 7189 7190 if (bio_cb_arg == NULL) { 7191 return -EINVAL; 7192 } 7193 7194 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 7195 return -ENOTSUP; 7196 } 7197 7198 bdev_io = bdev_channel_get_io(channel); 7199 if (bdev_io == NULL) { 7200 return -ENOMEM; 7201 } 7202 7203 bdev_io->internal.ch = channel; 7204 bdev_io->internal.desc = desc; 7205 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7206 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7207 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7208 7209 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 7210 7211 /* Parent abort request is not submitted directly, but to manage its execution, 7212 * add it to the submitted list here. 7213 */ 7214 bdev_ch_add_to_io_submitted(bdev_io); 7215 7216 bdev_abort(bdev_io); 7217 7218 return 0; 7219 } 7220 7221 int 7222 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 7223 struct spdk_bdev_io_wait_entry *entry) 7224 { 7225 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7226 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 7227 7228 if (bdev != entry->bdev) { 7229 SPDK_ERRLOG("bdevs do not match\n"); 7230 return -EINVAL; 7231 } 7232 7233 if (mgmt_ch->per_thread_cache_count > 0) { 7234 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 7235 return -EINVAL; 7236 } 7237 7238 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 7239 return 0; 7240 } 7241 7242 static inline void 7243 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 7244 { 7245 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 7246 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 7247 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 7248 uint32_t blocklen = bdev_io->bdev->blocklen; 7249 7250 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7251 switch (bdev_io->type) { 7252 case SPDK_BDEV_IO_TYPE_READ: 7253 io_stat->bytes_read += num_blocks * blocklen; 7254 io_stat->num_read_ops++; 7255 io_stat->read_latency_ticks += tsc_diff; 7256 if (io_stat->max_read_latency_ticks < tsc_diff) { 7257 io_stat->max_read_latency_ticks = tsc_diff; 7258 } 7259 if (io_stat->min_read_latency_ticks > tsc_diff) { 7260 io_stat->min_read_latency_ticks = tsc_diff; 7261 } 7262 break; 7263 case SPDK_BDEV_IO_TYPE_WRITE: 7264 io_stat->bytes_written += num_blocks * blocklen; 7265 io_stat->num_write_ops++; 7266 io_stat->write_latency_ticks += tsc_diff; 7267 if (io_stat->max_write_latency_ticks < tsc_diff) { 7268 io_stat->max_write_latency_ticks = tsc_diff; 7269 } 7270 if (io_stat->min_write_latency_ticks > tsc_diff) { 7271 io_stat->min_write_latency_ticks = tsc_diff; 7272 } 7273 break; 7274 case SPDK_BDEV_IO_TYPE_UNMAP: 7275 io_stat->bytes_unmapped += num_blocks * blocklen; 7276 io_stat->num_unmap_ops++; 7277 io_stat->unmap_latency_ticks += tsc_diff; 7278 if 
(io_stat->max_unmap_latency_ticks < tsc_diff) { 7279 io_stat->max_unmap_latency_ticks = tsc_diff; 7280 } 7281 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 7282 io_stat->min_unmap_latency_ticks = tsc_diff; 7283 } 7284 break; 7285 case SPDK_BDEV_IO_TYPE_ZCOPY: 7286 /* Track the data in the start phase only */ 7287 if (bdev_io->u.bdev.zcopy.start) { 7288 if (bdev_io->u.bdev.zcopy.populate) { 7289 io_stat->bytes_read += num_blocks * blocklen; 7290 io_stat->num_read_ops++; 7291 io_stat->read_latency_ticks += tsc_diff; 7292 if (io_stat->max_read_latency_ticks < tsc_diff) { 7293 io_stat->max_read_latency_ticks = tsc_diff; 7294 } 7295 if (io_stat->min_read_latency_ticks > tsc_diff) { 7296 io_stat->min_read_latency_ticks = tsc_diff; 7297 } 7298 } else { 7299 io_stat->bytes_written += num_blocks * blocklen; 7300 io_stat->num_write_ops++; 7301 io_stat->write_latency_ticks += tsc_diff; 7302 if (io_stat->max_write_latency_ticks < tsc_diff) { 7303 io_stat->max_write_latency_ticks = tsc_diff; 7304 } 7305 if (io_stat->min_write_latency_ticks > tsc_diff) { 7306 io_stat->min_write_latency_ticks = tsc_diff; 7307 } 7308 } 7309 } 7310 break; 7311 case SPDK_BDEV_IO_TYPE_COPY: 7312 io_stat->bytes_copied += num_blocks * blocklen; 7313 io_stat->num_copy_ops++; 7314 bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; 7315 if (io_stat->max_copy_latency_ticks < tsc_diff) { 7316 io_stat->max_copy_latency_ticks = tsc_diff; 7317 } 7318 if (io_stat->min_copy_latency_ticks > tsc_diff) { 7319 io_stat->min_copy_latency_ticks = tsc_diff; 7320 } 7321 break; 7322 default: 7323 break; 7324 } 7325 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 7326 io_stat = bdev_io->bdev->internal.stat; 7327 assert(io_stat->io_error != NULL); 7328 7329 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 7330 io_stat->io_error->error_status[-io_status - 1]++; 7331 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 7332 } 7333 7334 #ifdef SPDK_CONFIG_VTUNE 7335 uint64_t now_tsc = spdk_get_ticks(); 7336 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 7337 uint64_t data[5]; 7338 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 7339 7340 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 7341 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 7342 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 7343 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 7344 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 
7345 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 7346 7347 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 7348 __itt_metadata_u64, 5, data); 7349 7350 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 7351 bdev_io->internal.ch->start_tsc = now_tsc; 7352 } 7353 #endif 7354 } 7355 7356 static inline void 7357 _bdev_io_complete(void *ctx) 7358 { 7359 struct spdk_bdev_io *bdev_io = ctx; 7360 7361 if (spdk_unlikely(bdev_io_use_accel_sequence(bdev_io))) { 7362 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7363 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 7364 } 7365 7366 assert(bdev_io->internal.cb != NULL); 7367 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 7368 7369 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 7370 bdev_io->internal.caller_ctx); 7371 } 7372 7373 static inline void 7374 bdev_io_complete(void *ctx) 7375 { 7376 struct spdk_bdev_io *bdev_io = ctx; 7377 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7378 uint64_t tsc, tsc_diff; 7379 7380 if (spdk_unlikely(bdev_io->internal.f.in_submit_request)) { 7381 /* 7382 * Defer completion to avoid potential infinite recursion if the 7383 * user's completion callback issues a new I/O. 7384 */ 7385 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7386 bdev_io_complete, bdev_io); 7387 return; 7388 } 7389 7390 tsc = spdk_get_ticks(); 7391 tsc_diff = tsc - bdev_io->internal.submit_tsc; 7392 7393 bdev_ch_remove_from_io_submitted(bdev_io); 7394 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, bdev_ch->trace_id, 0, (uintptr_t)bdev_io, 7395 bdev_io->internal.caller_ctx, bdev_ch->queue_depth); 7396 7397 if (bdev_ch->histogram) { 7398 if (bdev_io->bdev->internal.histogram_io_type == 0 || 7399 bdev_io->bdev->internal.histogram_io_type == bdev_io->type) { 7400 /* 7401 * Tally all I/O types if the histogram_io_type is set to 0. 7402 */ 7403 spdk_histogram_data_tally(bdev_ch->histogram, tsc_diff); 7404 } 7405 } 7406 7407 bdev_io_update_io_stat(bdev_io, tsc_diff); 7408 _bdev_io_complete(bdev_io); 7409 } 7410 7411 /* The difference between this function and bdev_io_complete() is that this should be called to 7412 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 7413 * io_submitted list and don't have submit_tsc updated. 7414 */ 7415 static inline void 7416 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 7417 { 7418 /* Since the IO hasn't been submitted it's bound to be failed */ 7419 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7420 7421 /* At this point we don't know if the IO is completed from submission context or not, but, 7422 * since this is an error path, we can always do an spdk_thread_send_msg(). */ 7423 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7424 _bdev_io_complete, bdev_io); 7425 } 7426 7427 static void bdev_destroy_cb(void *io_device); 7428 7429 static inline void 7430 _bdev_reset_complete(void *ctx) 7431 { 7432 struct spdk_bdev_io *bdev_io = ctx; 7433 7434 /* Put the channel reference we got in submission. 
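 * Releasing it here allows the channel, and ultimately the io_device, to be
 * torn down once no resets remain outstanding.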
*/ 7435 assert(bdev_io->u.reset.ch_ref != NULL); 7436 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7437 bdev_io->u.reset.ch_ref = NULL; 7438 7439 bdev_io_complete(bdev_io); 7440 } 7441 7442 static void 7443 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7444 { 7445 struct spdk_bdev_io *bdev_io = _ctx; 7446 bdev_io_tailq_t queued_resets; 7447 struct spdk_bdev_io *queued_reset; 7448 7449 assert(bdev_io == bdev->internal.reset_in_progress); 7450 7451 TAILQ_INIT(&queued_resets); 7452 7453 spdk_spin_lock(&bdev->internal.spinlock); 7454 TAILQ_SWAP(&bdev->internal.queued_resets, &queued_resets, 7455 spdk_bdev_io, internal.link); 7456 bdev->internal.reset_in_progress = NULL; 7457 spdk_spin_unlock(&bdev->internal.spinlock); 7458 7459 while (!TAILQ_EMPTY(&queued_resets)) { 7460 queued_reset = TAILQ_FIRST(&queued_resets); 7461 TAILQ_REMOVE(&queued_resets, queued_reset, internal.link); 7462 queued_reset->internal.status = bdev_io->internal.status; 7463 spdk_thread_send_msg(spdk_bdev_io_get_thread(queued_reset), 7464 _bdev_reset_complete, queued_reset); 7465 } 7466 7467 _bdev_reset_complete(bdev_io); 7468 7469 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7470 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7471 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7472 } 7473 } 7474 7475 static void 7476 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7477 struct spdk_io_channel *_ch, void *_ctx) 7478 { 7479 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7480 7481 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7482 7483 spdk_bdev_for_each_channel_continue(i, 0); 7484 } 7485 7486 static void 7487 bdev_io_complete_sequence_cb(void *ctx, int status) 7488 { 7489 struct spdk_bdev_io *bdev_io = ctx; 7490 7491 /* u.bdev.accel_sequence should have already been cleared at this point */ 7492 assert(bdev_io->u.bdev.accel_sequence == NULL); 7493 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7494 bdev_io->internal.f.has_accel_sequence = false; 7495 7496 if (spdk_unlikely(status != 0)) { 7497 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7498 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7499 } 7500 7501 bdev_io_complete(bdev_io); 7502 } 7503 7504 void 7505 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7506 { 7507 struct spdk_bdev *bdev = bdev_io->bdev; 7508 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7509 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7510 7511 if (spdk_unlikely(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING)) { 7512 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7513 spdk_bdev_get_module_name(bdev), 7514 bdev_io_status_get_string(bdev_io->internal.status)); 7515 assert(false); 7516 } 7517 bdev_io->internal.status = status; 7518 7519 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7520 assert(bdev_io == bdev->internal.reset_in_progress); 7521 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 7522 bdev_reset_complete); 7523 return; 7524 } else { 7525 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7526 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7527 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7528 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7529 return; 7530 } else if (spdk_unlikely(bdev_io->internal.f.has_bounce_buf && 7531 
!bdev_io_use_accel_sequence(bdev_io))) { 7532 _bdev_io_push_bounce_data_buffer(bdev_io, 7533 _bdev_io_complete_push_bounce_done); 7534 /* bdev IO will be completed in the callback */ 7535 return; 7536 } 7537 } 7538 7539 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7540 return; 7541 } 7542 } 7543 7544 bdev_io_complete(bdev_io); 7545 } 7546 7547 void 7548 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 7549 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7550 { 7551 enum spdk_bdev_io_status status; 7552 7553 if (sc == SPDK_SCSI_STATUS_GOOD) { 7554 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7555 } else { 7556 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7557 bdev_io->internal.error.scsi.sc = sc; 7558 bdev_io->internal.error.scsi.sk = sk; 7559 bdev_io->internal.error.scsi.asc = asc; 7560 bdev_io->internal.error.scsi.ascq = ascq; 7561 } 7562 7563 spdk_bdev_io_complete(bdev_io, status); 7564 } 7565 7566 void 7567 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7568 int *sc, int *sk, int *asc, int *ascq) 7569 { 7570 assert(sc != NULL); 7571 assert(sk != NULL); 7572 assert(asc != NULL); 7573 assert(ascq != NULL); 7574 7575 switch (bdev_io->internal.status) { 7576 case SPDK_BDEV_IO_STATUS_SUCCESS: 7577 *sc = SPDK_SCSI_STATUS_GOOD; 7578 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7579 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7580 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7581 break; 7582 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7583 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7584 break; 7585 case SPDK_BDEV_IO_STATUS_MISCOMPARE: 7586 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7587 *sk = SPDK_SCSI_SENSE_MISCOMPARE; 7588 *asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION; 7589 *ascq = bdev_io->internal.error.scsi.ascq; 7590 break; 7591 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7592 *sc = bdev_io->internal.error.scsi.sc; 7593 *sk = bdev_io->internal.error.scsi.sk; 7594 *asc = bdev_io->internal.error.scsi.asc; 7595 *ascq = bdev_io->internal.error.scsi.ascq; 7596 break; 7597 default: 7598 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7599 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7600 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7601 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7602 break; 7603 } 7604 } 7605 7606 void 7607 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7608 { 7609 enum spdk_bdev_io_status status; 7610 7611 if (aio_result == 0) { 7612 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7613 } else { 7614 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7615 } 7616 7617 bdev_io->internal.error.aio_result = aio_result; 7618 7619 spdk_bdev_io_complete(bdev_io, status); 7620 } 7621 7622 void 7623 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7624 { 7625 assert(aio_result != NULL); 7626 7627 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7628 *aio_result = bdev_io->internal.error.aio_result; 7629 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7630 *aio_result = 0; 7631 } else { 7632 *aio_result = -EIO; 7633 } 7634 } 7635 7636 void 7637 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7638 { 7639 enum spdk_bdev_io_status status; 7640 7641 if (spdk_likely(sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS)) { 7642 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7643 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7644 status = SPDK_BDEV_IO_STATUS_ABORTED; 7645 
} else { 7646 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7647 } 7648 7649 bdev_io->internal.error.nvme.cdw0 = cdw0; 7650 bdev_io->internal.error.nvme.sct = sct; 7651 bdev_io->internal.error.nvme.sc = sc; 7652 7653 spdk_bdev_io_complete(bdev_io, status); 7654 } 7655 7656 void 7657 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7658 { 7659 assert(sct != NULL); 7660 assert(sc != NULL); 7661 assert(cdw0 != NULL); 7662 7663 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7664 *sct = SPDK_NVME_SCT_GENERIC; 7665 *sc = SPDK_NVME_SC_SUCCESS; 7666 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7667 *cdw0 = 0; 7668 } else { 7669 *cdw0 = 1U; 7670 } 7671 return; 7672 } 7673 7674 if (spdk_likely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7675 *sct = SPDK_NVME_SCT_GENERIC; 7676 *sc = SPDK_NVME_SC_SUCCESS; 7677 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7678 *sct = bdev_io->internal.error.nvme.sct; 7679 *sc = bdev_io->internal.error.nvme.sc; 7680 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7681 *sct = SPDK_NVME_SCT_GENERIC; 7682 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7683 } else { 7684 *sct = SPDK_NVME_SCT_GENERIC; 7685 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7686 } 7687 7688 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7689 } 7690 7691 void 7692 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7693 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7694 { 7695 assert(first_sct != NULL); 7696 assert(first_sc != NULL); 7697 assert(second_sct != NULL); 7698 assert(second_sc != NULL); 7699 assert(cdw0 != NULL); 7700 7701 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7702 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7703 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7704 *first_sct = bdev_io->internal.error.nvme.sct; 7705 *first_sc = bdev_io->internal.error.nvme.sc; 7706 *second_sct = SPDK_NVME_SCT_GENERIC; 7707 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7708 } else { 7709 *first_sct = SPDK_NVME_SCT_GENERIC; 7710 *first_sc = SPDK_NVME_SC_SUCCESS; 7711 *second_sct = bdev_io->internal.error.nvme.sct; 7712 *second_sc = bdev_io->internal.error.nvme.sc; 7713 } 7714 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7715 *first_sct = SPDK_NVME_SCT_GENERIC; 7716 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7717 *second_sct = SPDK_NVME_SCT_GENERIC; 7718 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7719 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7720 *first_sct = SPDK_NVME_SCT_GENERIC; 7721 *first_sc = SPDK_NVME_SC_SUCCESS; 7722 *second_sct = SPDK_NVME_SCT_GENERIC; 7723 *second_sc = SPDK_NVME_SC_SUCCESS; 7724 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7725 *first_sct = SPDK_NVME_SCT_GENERIC; 7726 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7727 *second_sct = SPDK_NVME_SCT_GENERIC; 7728 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7729 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7730 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7731 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 7732 *second_sct = SPDK_NVME_SCT_GENERIC; 7733 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7734 } else { 7735 *first_sct = SPDK_NVME_SCT_GENERIC; 7736 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7737 *second_sct = SPDK_NVME_SCT_GENERIC; 7738 *second_sc = 
SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7739 } 7740 7741 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7742 } 7743 7744 void 7745 spdk_bdev_io_complete_base_io_status(struct spdk_bdev_io *bdev_io, 7746 const struct spdk_bdev_io *base_io) 7747 { 7748 switch (base_io->internal.status) { 7749 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7750 spdk_bdev_io_complete_nvme_status(bdev_io, 7751 base_io->internal.error.nvme.cdw0, 7752 base_io->internal.error.nvme.sct, 7753 base_io->internal.error.nvme.sc); 7754 break; 7755 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7756 spdk_bdev_io_complete_scsi_status(bdev_io, 7757 base_io->internal.error.scsi.sc, 7758 base_io->internal.error.scsi.sk, 7759 base_io->internal.error.scsi.asc, 7760 base_io->internal.error.scsi.ascq); 7761 break; 7762 case SPDK_BDEV_IO_STATUS_AIO_ERROR: 7763 spdk_bdev_io_complete_aio_status(bdev_io, base_io->internal.error.aio_result); 7764 break; 7765 default: 7766 spdk_bdev_io_complete(bdev_io, base_io->internal.status); 7767 break; 7768 } 7769 } 7770 7771 struct spdk_thread * 7772 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7773 { 7774 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7775 } 7776 7777 struct spdk_io_channel * 7778 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7779 { 7780 return bdev_io->internal.ch->channel; 7781 } 7782 7783 static int 7784 bdev_register(struct spdk_bdev *bdev) 7785 { 7786 char *bdev_name; 7787 char uuid[SPDK_UUID_STRING_LEN]; 7788 struct spdk_iobuf_opts iobuf_opts; 7789 int ret; 7790 7791 assert(bdev->module != NULL); 7792 7793 if (!bdev->name) { 7794 SPDK_ERRLOG("Bdev name is NULL\n"); 7795 return -EINVAL; 7796 } 7797 7798 if (!strlen(bdev->name)) { 7799 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7800 return -EINVAL; 7801 } 7802 7803 /* Users often register their own I/O devices using the bdev name. In 7804 * order to avoid conflicts, prepend bdev_. */ 7805 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7806 if (!bdev_name) { 7807 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7808 return -ENOMEM; 7809 } 7810 7811 bdev->internal.stat = bdev_alloc_io_stat(true); 7812 if (!bdev->internal.stat) { 7813 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7814 free(bdev_name); 7815 return -ENOMEM; 7816 } 7817 7818 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7819 bdev->internal.measured_queue_depth = UINT64_MAX; 7820 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7821 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7822 bdev->internal.qd_poller = NULL; 7823 bdev->internal.qos = NULL; 7824 7825 TAILQ_INIT(&bdev->internal.open_descs); 7826 TAILQ_INIT(&bdev->internal.locked_ranges); 7827 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7828 TAILQ_INIT(&bdev->internal.queued_resets); 7829 TAILQ_INIT(&bdev->aliases); 7830 7831 /* UUID may be specified by the user or defined by bdev itself. 7832 * Otherwise it will be generated here, so this field will never be empty. 
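 *
 * A minimal sketch of how a bdev module might pre-set the UUID before calling
 * spdk_bdev_register() (illustrative only; "my_disk" is a hypothetical module
 * context, and the UUID string is an arbitrary example):
 *
 *     struct spdk_bdev *b = &my_disk->bdev;
 *
 *     spdk_uuid_parse(&b->uuid, "f81d4fae-7dec-11d0-a765-00a0c91e6bf6");
 *     ...fill in name, blocklen, blockcnt and fn_table, then register...
 *     spdk_bdev_register(b);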
*/ 7833 if (spdk_uuid_is_null(&bdev->uuid)) { 7834 spdk_uuid_generate(&bdev->uuid); 7835 } 7836 7837 /* Add the UUID alias only if it's different than the name */ 7838 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7839 if (strcmp(bdev->name, uuid) != 0) { 7840 ret = spdk_bdev_alias_add(bdev, uuid); 7841 if (ret != 0) { 7842 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7843 bdev_free_io_stat(bdev->internal.stat); 7844 free(bdev_name); 7845 return ret; 7846 } 7847 } 7848 7849 spdk_iobuf_get_opts(&iobuf_opts, sizeof(iobuf_opts)); 7850 if (spdk_bdev_get_buf_align(bdev) > 1) { 7851 bdev->max_rw_size = spdk_min(bdev->max_rw_size ? bdev->max_rw_size : UINT32_MAX, 7852 iobuf_opts.large_bufsize / bdev->blocklen); 7853 } 7854 7855 /* If the user didn't specify a write unit size, set it to one. */ 7856 if (bdev->write_unit_size == 0) { 7857 bdev->write_unit_size = 1; 7858 } 7859 7860 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7861 if (bdev->acwu == 0) { 7862 bdev->acwu = bdev->write_unit_size; 7863 } 7864 7865 if (bdev->phys_blocklen == 0) { 7866 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7867 } 7868 7869 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 7870 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 7871 } 7872 7873 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 7874 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 7875 } 7876 7877 bdev->internal.reset_in_progress = NULL; 7878 bdev->internal.qd_poll_in_progress = false; 7879 bdev->internal.period = 0; 7880 bdev->internal.new_period = 0; 7881 bdev->internal.trace_id = spdk_trace_register_owner(OWNER_TYPE_BDEV, bdev_name); 7882 7883 /* 7884 * Initialize spinlock before registering IO device because spinlock is used in 7885 * bdev_channel_create 7886 */ 7887 spdk_spin_init(&bdev->internal.spinlock); 7888 7889 spdk_io_device_register(__bdev_to_io_dev(bdev), 7890 bdev_channel_create, bdev_channel_destroy, 7891 sizeof(struct spdk_bdev_channel), 7892 bdev_name); 7893 7894 /* 7895 * Register bdev name only after the bdev object is ready. 7896 * After bdev_name_add returns, it is possible for other threads to start using the bdev, 7897 * create IO channels... 
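 * or even unregister the bdev. Every field those paths consume must therefore
 * already be initialized by this point.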
7898 */ 7899 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 7900 if (ret != 0) { 7901 spdk_io_device_unregister(__bdev_to_io_dev(bdev), NULL); 7902 bdev_free_io_stat(bdev->internal.stat); 7903 spdk_spin_destroy(&bdev->internal.spinlock); 7904 free(bdev_name); 7905 return ret; 7906 } 7907 7908 free(bdev_name); 7909 7910 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 7911 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 7912 7913 return 0; 7914 } 7915 7916 static void 7917 bdev_destroy_cb(void *io_device) 7918 { 7919 int rc; 7920 struct spdk_bdev *bdev; 7921 spdk_bdev_unregister_cb cb_fn; 7922 void *cb_arg; 7923 7924 bdev = __bdev_from_io_dev(io_device); 7925 7926 if (bdev->internal.unregister_td != spdk_get_thread()) { 7927 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 7928 return; 7929 } 7930 7931 cb_fn = bdev->internal.unregister_cb; 7932 cb_arg = bdev->internal.unregister_ctx; 7933 7934 spdk_spin_destroy(&bdev->internal.spinlock); 7935 free(bdev->internal.qos); 7936 bdev_free_io_stat(bdev->internal.stat); 7937 spdk_trace_unregister_owner(bdev->internal.trace_id); 7938 7939 rc = bdev->fn_table->destruct(bdev->ctxt); 7940 if (rc < 0) { 7941 SPDK_ERRLOG("destruct failed\n"); 7942 } 7943 if (rc <= 0 && cb_fn != NULL) { 7944 cb_fn(cb_arg, rc); 7945 } 7946 } 7947 7948 void 7949 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 7950 { 7951 if (bdev->internal.unregister_cb != NULL) { 7952 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 7953 } 7954 } 7955 7956 static void 7957 _remove_notify(void *arg) 7958 { 7959 struct spdk_bdev_desc *desc = arg; 7960 7961 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 7962 } 7963 7964 /* returns: 0 - bdev removed and ready to be destructed. 7965 * -EBUSY - bdev can't be destructed yet. */ 7966 static int 7967 bdev_unregister_unsafe(struct spdk_bdev *bdev) 7968 { 7969 struct spdk_bdev_desc *desc, *tmp; 7970 struct spdk_bdev_alias *alias; 7971 int rc = 0; 7972 char uuid[SPDK_UUID_STRING_LEN]; 7973 7974 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 7975 assert(spdk_spin_held(&bdev->internal.spinlock)); 7976 7977 /* Notify each descriptor about hotremoval */ 7978 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 7979 rc = -EBUSY; 7980 /* 7981 * Defer invocation of the event_cb to a separate message that will 7982 * run later on its thread. This ensures this context unwinds and 7983 * we don't recursively unregister this bdev again if the event_cb 7984 * immediately closes its descriptor. 7985 */ 7986 event_notify(desc, _remove_notify); 7987 } 7988 7989 /* If there are no descriptors, proceed removing the bdev */ 7990 if (rc == 0) { 7991 bdev_examine_allowlist_remove(bdev->name); 7992 TAILQ_FOREACH(alias, &bdev->aliases, tailq) { 7993 bdev_examine_allowlist_remove(alias->alias.name); 7994 } 7995 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 7996 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 7997 7998 /* Delete the name and the UUID alias */ 7999 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 8000 bdev_name_del_unsafe(&bdev->internal.bdev_name); 8001 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 8002 8003 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 8004 8005 if (bdev->internal.reset_in_progress != NULL) { 8006 /* If reset is in progress, let the completion callback for reset 8007 * unregister the bdev. 
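 * bdev_reset_complete() performs the deferred spdk_io_device_unregister()
 * once the reset finishes and no open descriptors remain.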
8008 */ 8009 rc = -EBUSY; 8010 } 8011 } 8012 8013 return rc; 8014 } 8015 8016 static void 8017 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8018 struct spdk_io_channel *io_ch, void *_ctx) 8019 { 8020 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 8021 8022 bdev_channel_abort_queued_ios(bdev_ch); 8023 spdk_bdev_for_each_channel_continue(i, 0); 8024 } 8025 8026 static void 8027 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 8028 { 8029 int rc; 8030 8031 spdk_spin_lock(&g_bdev_mgr.spinlock); 8032 spdk_spin_lock(&bdev->internal.spinlock); 8033 /* 8034 * Set the status to REMOVING after completing to abort channels. Otherwise, 8035 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 8036 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 8037 * may fail. 8038 */ 8039 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 8040 rc = bdev_unregister_unsafe(bdev); 8041 spdk_spin_unlock(&bdev->internal.spinlock); 8042 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8043 8044 if (rc == 0) { 8045 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8046 } 8047 } 8048 8049 void 8050 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 8051 { 8052 struct spdk_thread *thread; 8053 8054 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 8055 8056 thread = spdk_get_thread(); 8057 if (!thread) { 8058 /* The user called this from a non-SPDK thread. */ 8059 if (cb_fn != NULL) { 8060 cb_fn(cb_arg, -ENOTSUP); 8061 } 8062 return; 8063 } 8064 8065 spdk_spin_lock(&g_bdev_mgr.spinlock); 8066 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 8067 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8068 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8069 if (cb_fn) { 8070 cb_fn(cb_arg, -EBUSY); 8071 } 8072 return; 8073 } 8074 8075 spdk_spin_lock(&bdev->internal.spinlock); 8076 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 8077 bdev->internal.unregister_cb = cb_fn; 8078 bdev->internal.unregister_ctx = cb_arg; 8079 bdev->internal.unregister_td = thread; 8080 spdk_spin_unlock(&bdev->internal.spinlock); 8081 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8082 8083 spdk_bdev_set_qd_sampling_period(bdev, 0); 8084 8085 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 8086 bdev_unregister); 8087 } 8088 8089 int 8090 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 8091 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 8092 { 8093 struct spdk_bdev_desc *desc; 8094 struct spdk_bdev *bdev; 8095 int rc; 8096 8097 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 8098 if (rc != 0) { 8099 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 8100 return rc; 8101 } 8102 8103 bdev = spdk_bdev_desc_get_bdev(desc); 8104 8105 if (bdev->module != module) { 8106 spdk_bdev_close(desc); 8107 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 8108 bdev_name); 8109 return -ENODEV; 8110 } 8111 8112 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 8113 8114 spdk_bdev_close(desc); 8115 8116 return 0; 8117 } 8118 8119 static int 8120 bdev_start_qos(struct spdk_bdev *bdev) 8121 { 8122 struct set_qos_limit_ctx *ctx; 8123 8124 /* Enable QoS */ 8125 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 8126 ctx = calloc(1, sizeof(*ctx)); 8127 if (ctx == NULL) { 8128 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 8129 return -ENOMEM; 8130 } 8131 
ctx->bdev = bdev; 8132 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 8133 } 8134 8135 return 0; 8136 } 8137 8138 static void 8139 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 8140 struct spdk_bdev *bdev) 8141 { 8142 enum spdk_bdev_claim_type type; 8143 const char *typename, *modname; 8144 extern struct spdk_log_flag SPDK_LOG_bdev; 8145 8146 assert(spdk_spin_held(&bdev->internal.spinlock)); 8147 8148 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 8149 return; 8150 } 8151 8152 type = bdev->internal.claim_type; 8153 typename = spdk_bdev_claim_get_name(type); 8154 8155 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 8156 modname = bdev->internal.claim.v1.module->name; 8157 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 8158 bdev->name, detail, typename, modname); 8159 return; 8160 } 8161 8162 if (claim_type_is_v2(type)) { 8163 struct spdk_bdev_module_claim *claim; 8164 8165 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 8166 modname = claim->module->name; 8167 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 8168 bdev->name, detail, typename, modname); 8169 } 8170 return; 8171 } 8172 8173 assert(false); 8174 } 8175 8176 static int 8177 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 8178 { 8179 struct spdk_thread *thread; 8180 int rc = 0; 8181 8182 thread = spdk_get_thread(); 8183 if (!thread) { 8184 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 8185 return -ENOTSUP; 8186 } 8187 8188 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8189 spdk_get_thread()); 8190 8191 desc->bdev = bdev; 8192 desc->thread = thread; 8193 desc->write = write; 8194 8195 spdk_spin_lock(&bdev->internal.spinlock); 8196 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 8197 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8198 spdk_spin_unlock(&bdev->internal.spinlock); 8199 return -ENODEV; 8200 } 8201 8202 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8203 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8204 spdk_spin_unlock(&bdev->internal.spinlock); 8205 return -EPERM; 8206 } 8207 8208 rc = bdev_start_qos(bdev); 8209 if (rc != 0) { 8210 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 8211 spdk_spin_unlock(&bdev->internal.spinlock); 8212 return rc; 8213 } 8214 8215 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 8216 8217 spdk_spin_unlock(&bdev->internal.spinlock); 8218 8219 return 0; 8220 } 8221 8222 static int 8223 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 8224 struct spdk_bdev_desc **_desc) 8225 { 8226 struct spdk_bdev_desc *desc; 8227 unsigned int i; 8228 8229 desc = calloc(1, sizeof(*desc)); 8230 if (desc == NULL) { 8231 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 8232 return -ENOMEM; 8233 } 8234 8235 TAILQ_INIT(&desc->pending_media_events); 8236 TAILQ_INIT(&desc->free_media_events); 8237 8238 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 8239 desc->callback.event_fn = event_cb; 8240 desc->callback.ctx = event_ctx; 8241 spdk_spin_init(&desc->spinlock); 8242 8243 if (bdev->media_events) { 8244 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 8245 sizeof(*desc->media_events_buffer)); 8246 if (desc->media_events_buffer == NULL) { 8247 SPDK_ERRLOG("Failed to initialize media event pool\n"); 8248 
bdev_desc_free(desc); 8249 return -ENOMEM; 8250 } 8251 8252 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 8253 TAILQ_INSERT_TAIL(&desc->free_media_events, 8254 &desc->media_events_buffer[i], tailq); 8255 } 8256 } 8257 8258 if (bdev->fn_table->accel_sequence_supported != NULL) { 8259 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 8260 desc->accel_sequence_supported[i] = 8261 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 8262 (enum spdk_bdev_io_type)i); 8263 } 8264 } 8265 8266 *_desc = desc; 8267 8268 return 0; 8269 } 8270 8271 static int 8272 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8273 void *event_ctx, struct spdk_bdev_desc **_desc) 8274 { 8275 struct spdk_bdev_desc *desc; 8276 struct spdk_bdev *bdev; 8277 int rc; 8278 8279 bdev = bdev_get_by_name(bdev_name); 8280 8281 if (bdev == NULL) { 8282 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 8283 return -ENODEV; 8284 } 8285 8286 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 8287 if (rc != 0) { 8288 return rc; 8289 } 8290 8291 rc = bdev_open(bdev, write, desc); 8292 if (rc != 0) { 8293 bdev_desc_free(desc); 8294 desc = NULL; 8295 } 8296 8297 *_desc = desc; 8298 8299 return rc; 8300 } 8301 8302 int 8303 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8304 void *event_ctx, struct spdk_bdev_desc **_desc) 8305 { 8306 int rc; 8307 8308 if (event_cb == NULL) { 8309 SPDK_ERRLOG("Missing event callback function\n"); 8310 return -EINVAL; 8311 } 8312 8313 spdk_spin_lock(&g_bdev_mgr.spinlock); 8314 rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, _desc); 8315 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8316 8317 return rc; 8318 } 8319 8320 struct spdk_bdev_open_async_ctx { 8321 char *bdev_name; 8322 spdk_bdev_event_cb_t event_cb; 8323 void *event_ctx; 8324 bool write; 8325 int rc; 8326 spdk_bdev_open_async_cb_t cb_fn; 8327 void *cb_arg; 8328 struct spdk_bdev_desc *desc; 8329 struct spdk_bdev_open_async_opts opts; 8330 uint64_t start_ticks; 8331 struct spdk_thread *orig_thread; 8332 struct spdk_poller *poller; 8333 TAILQ_ENTRY(spdk_bdev_open_async_ctx) tailq; 8334 }; 8335 8336 static void 8337 bdev_open_async_done(void *arg) 8338 { 8339 struct spdk_bdev_open_async_ctx *ctx = arg; 8340 8341 ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg); 8342 8343 free(ctx->bdev_name); 8344 free(ctx); 8345 } 8346 8347 static void 8348 bdev_open_async_cancel(void *arg) 8349 { 8350 struct spdk_bdev_open_async_ctx *ctx = arg; 8351 8352 assert(ctx->rc == -ESHUTDOWN); 8353 8354 spdk_poller_unregister(&ctx->poller); 8355 8356 bdev_open_async_done(ctx); 8357 } 8358 8359 /* This is called when the bdev library finishes at shutdown. */ 8360 static void 8361 bdev_open_async_fini(void) 8362 { 8363 struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx; 8364 8365 spdk_spin_lock(&g_bdev_mgr.spinlock); 8366 TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) { 8367 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8368 /* 8369 * We have to move to ctx->orig_thread to unregister ctx->poller. 8370 * However, there is a chance that ctx->poller is executed before 8371 * message is executed, which could result in bdev_open_async_done() 8372 * being called twice. To avoid such race condition, set ctx->rc to 8373 * -ESHUTDOWN. 
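 * _bdev_open_async() checks ctx->rc for -ESHUTDOWN and returns early, so
 * only bdev_open_async_cancel() completes the context.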
8374 */ 8375 ctx->rc = -ESHUTDOWN; 8376 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx); 8377 } 8378 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8379 } 8380 8381 static int bdev_open_async(void *arg); 8382 8383 static void 8384 _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx) 8385 { 8386 uint64_t timeout_ticks; 8387 8388 if (ctx->rc == -ESHUTDOWN) { 8389 /* This context is being canceled. Do nothing. */ 8390 return; 8391 } 8392 8393 ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx, 8394 &ctx->desc); 8395 if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) { 8396 goto exit; 8397 } 8398 8399 timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull; 8400 if (spdk_get_ticks() >= timeout_ticks) { 8401 SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name); 8402 ctx->rc = -ETIMEDOUT; 8403 goto exit; 8404 } 8405 8406 return; 8407 8408 exit: 8409 spdk_poller_unregister(&ctx->poller); 8410 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8411 8412 /* Completion callback is processed after stack unwinding. */ 8413 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx); 8414 } 8415 8416 static int 8417 bdev_open_async(void *arg) 8418 { 8419 struct spdk_bdev_open_async_ctx *ctx = arg; 8420 8421 spdk_spin_lock(&g_bdev_mgr.spinlock); 8422 8423 _bdev_open_async(ctx); 8424 8425 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8426 8427 return SPDK_POLLER_BUSY; 8428 } 8429 8430 static void 8431 bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts, 8432 struct spdk_bdev_open_async_opts *opts_src, 8433 size_t size) 8434 { 8435 assert(opts); 8436 assert(opts_src); 8437 8438 opts->size = size; 8439 8440 #define SET_FIELD(field) \ 8441 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8442 opts->field = opts_src->field; \ 8443 } \ 8444 8445 SET_FIELD(timeout_ms); 8446 8447 /* Do not remove this statement, you should always update this statement when you adding a new field, 8448 * and do not forget to add the SET_FIELD statement for your added field. 
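 *
 * For example, if a hypothetical "uint32_t flags" field were appended to
 * struct spdk_bdev_open_async_opts, a SET_FIELD(flags) line would be added
 * above and the size checked below would be updated to match.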
*/ 8449 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size"); 8450 8451 #undef SET_FIELD 8452 } 8453 8454 static void 8455 bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size) 8456 { 8457 assert(opts); 8458 8459 opts->size = size; 8460 8461 #define SET_FIELD(field, value) \ 8462 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8463 opts->field = value; \ 8464 } \ 8465 8466 SET_FIELD(timeout_ms, 0); 8467 8468 #undef SET_FIELD 8469 } 8470 8471 int 8472 spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8473 void *event_ctx, struct spdk_bdev_open_async_opts *opts, 8474 spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg) 8475 { 8476 struct spdk_bdev_open_async_ctx *ctx; 8477 8478 if (event_cb == NULL) { 8479 SPDK_ERRLOG("Missing event callback function\n"); 8480 return -EINVAL; 8481 } 8482 8483 if (open_cb == NULL) { 8484 SPDK_ERRLOG("Missing open callback function\n"); 8485 return -EINVAL; 8486 } 8487 8488 if (opts != NULL && opts->size == 0) { 8489 SPDK_ERRLOG("size in the options structure should not be zero\n"); 8490 return -EINVAL; 8491 } 8492 8493 ctx = calloc(1, sizeof(*ctx)); 8494 if (ctx == NULL) { 8495 SPDK_ERRLOG("Failed to allocate open context\n"); 8496 return -ENOMEM; 8497 } 8498 8499 ctx->bdev_name = strdup(bdev_name); 8500 if (ctx->bdev_name == NULL) { 8501 SPDK_ERRLOG("Failed to duplicate bdev_name\n"); 8502 free(ctx); 8503 return -ENOMEM; 8504 } 8505 8506 ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000); 8507 if (ctx->poller == NULL) { 8508 SPDK_ERRLOG("Failed to register bdev_open_async poller\n"); 8509 free(ctx->bdev_name); 8510 free(ctx); 8511 return -ENOMEM; 8512 } 8513 8514 ctx->cb_fn = open_cb; 8515 ctx->cb_arg = open_cb_arg; 8516 ctx->write = write; 8517 ctx->event_cb = event_cb; 8518 ctx->event_ctx = event_ctx; 8519 ctx->orig_thread = spdk_get_thread(); 8520 ctx->start_ticks = spdk_get_ticks(); 8521 8522 bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts)); 8523 if (opts != NULL) { 8524 bdev_open_async_opts_copy(&ctx->opts, opts, opts->size); 8525 } 8526 8527 spdk_spin_lock(&g_bdev_mgr.spinlock); 8528 8529 TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8530 _bdev_open_async(ctx); 8531 8532 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8533 8534 return 0; 8535 } 8536 8537 static void 8538 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 8539 { 8540 int rc; 8541 8542 spdk_spin_lock(&bdev->internal.spinlock); 8543 spdk_spin_lock(&desc->spinlock); 8544 8545 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 8546 8547 desc->closed = true; 8548 8549 if (desc->claim != NULL) { 8550 bdev_desc_release_claims(desc); 8551 } 8552 8553 if (0 == desc->refs) { 8554 spdk_spin_unlock(&desc->spinlock); 8555 bdev_desc_free(desc); 8556 } else { 8557 spdk_spin_unlock(&desc->spinlock); 8558 } 8559 8560 /* If no more descriptors, kill QoS channel */ 8561 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8562 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 8563 bdev->name, spdk_get_thread()); 8564 8565 if (bdev_qos_destroy(bdev)) { 8566 /* There isn't anything we can do to recover here. Just let the 8567 * old QoS poller keep running. The QoS handling won't change 8568 * cores when the user allocates a new channel, but it won't break. */ 8569 SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); 8570 } 8571 } 8572 8573 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8574 rc = bdev_unregister_unsafe(bdev); 8575 spdk_spin_unlock(&bdev->internal.spinlock); 8576 8577 if (rc == 0) { 8578 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8579 } 8580 } else { 8581 spdk_spin_unlock(&bdev->internal.spinlock); 8582 } 8583 } 8584 8585 void 8586 spdk_bdev_close(struct spdk_bdev_desc *desc) 8587 { 8588 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8589 8590 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8591 spdk_get_thread()); 8592 8593 assert(desc->thread == spdk_get_thread()); 8594 8595 spdk_poller_unregister(&desc->io_timeout_poller); 8596 8597 spdk_spin_lock(&g_bdev_mgr.spinlock); 8598 8599 bdev_close(bdev, desc); 8600 8601 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8602 } 8603 8604 int32_t 8605 spdk_bdev_get_numa_id(struct spdk_bdev *bdev) 8606 { 8607 if (bdev->numa.id_valid) { 8608 return bdev->numa.id; 8609 } else { 8610 return SPDK_ENV_NUMA_ID_ANY; 8611 } 8612 } 8613 8614 static void 8615 bdev_register_finished(void *arg) 8616 { 8617 struct spdk_bdev_desc *desc = arg; 8618 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8619 8620 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 8621 8622 spdk_spin_lock(&g_bdev_mgr.spinlock); 8623 8624 bdev_close(bdev, desc); 8625 8626 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8627 } 8628 8629 int 8630 spdk_bdev_register(struct spdk_bdev *bdev) 8631 { 8632 struct spdk_bdev_desc *desc; 8633 struct spdk_thread *thread = spdk_get_thread(); 8634 int rc; 8635 8636 if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) { 8637 SPDK_ERRLOG("Cannot register bdev %s on thread %p (%s)\n", bdev->name, thread, 8638 thread ? 
spdk_thread_get_name(thread) : "null"); 8639 return -EINVAL; 8640 } 8641 8642 rc = bdev_register(bdev); 8643 if (rc != 0) { 8644 return rc; 8645 } 8646 8647 /* A descriptor is opened to prevent bdev deletion during examination */ 8648 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8649 if (rc != 0) { 8650 spdk_bdev_unregister(bdev, NULL, NULL); 8651 return rc; 8652 } 8653 8654 rc = bdev_open(bdev, false, desc); 8655 if (rc != 0) { 8656 bdev_desc_free(desc); 8657 spdk_bdev_unregister(bdev, NULL, NULL); 8658 return rc; 8659 } 8660 8661 /* Examine configuration before initializing I/O */ 8662 bdev_examine(bdev); 8663 8664 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 8665 if (rc != 0) { 8666 bdev_close(bdev, desc); 8667 spdk_bdev_unregister(bdev, NULL, NULL); 8668 } 8669 8670 return rc; 8671 } 8672 8673 int 8674 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 8675 struct spdk_bdev_module *module) 8676 { 8677 spdk_spin_lock(&bdev->internal.spinlock); 8678 8679 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8680 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8681 spdk_spin_unlock(&bdev->internal.spinlock); 8682 return -EPERM; 8683 } 8684 8685 if (desc && !desc->write) { 8686 desc->write = true; 8687 } 8688 8689 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 8690 bdev->internal.claim.v1.module = module; 8691 8692 spdk_spin_unlock(&bdev->internal.spinlock); 8693 return 0; 8694 } 8695 8696 void 8697 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 8698 { 8699 spdk_spin_lock(&bdev->internal.spinlock); 8700 8701 assert(bdev->internal.claim.v1.module != NULL); 8702 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 8703 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8704 bdev->internal.claim.v1.module = NULL; 8705 8706 spdk_spin_unlock(&bdev->internal.spinlock); 8707 } 8708 8709 /* 8710 * Start claims v2 8711 */ 8712 8713 const char * 8714 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 8715 { 8716 switch (type) { 8717 case SPDK_BDEV_CLAIM_NONE: 8718 return "not_claimed"; 8719 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8720 return "exclusive_write"; 8721 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8722 return "read_many_write_one"; 8723 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8724 return "read_many_write_none"; 8725 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8726 return "read_many_write_many"; 8727 default: 8728 break; 8729 } 8730 return "invalid_claim"; 8731 } 8732 8733 static bool 8734 claim_type_is_v2(enum spdk_bdev_claim_type type) 8735 { 8736 switch (type) { 8737 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8738 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8739 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8740 return true; 8741 default: 8742 break; 8743 } 8744 return false; 8745 } 8746 8747 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
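 * This is the case for claim types that grant the claiming descriptor write
 * access: read_many_write_one and read_many_write_many (shared).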
*/ 8748 static bool 8749 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 8750 { 8751 switch (type) { 8752 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8753 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8754 return true; 8755 default: 8756 break; 8757 } 8758 return false; 8759 } 8760 8761 void 8762 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 8763 { 8764 if (opts == NULL) { 8765 SPDK_ERRLOG("opts should not be NULL\n"); 8766 assert(opts != NULL); 8767 return; 8768 } 8769 if (size == 0) { 8770 SPDK_ERRLOG("size should not be zero\n"); 8771 assert(size != 0); 8772 return; 8773 } 8774 8775 memset(opts, 0, size); 8776 opts->opts_size = size; 8777 8778 #define FIELD_OK(field) \ 8779 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 8780 8781 #define SET_FIELD(field, value) \ 8782 if (FIELD_OK(field)) { \ 8783 opts->field = value; \ 8784 } \ 8785 8786 SET_FIELD(shared_claim_key, 0); 8787 8788 #undef FIELD_OK 8789 #undef SET_FIELD 8790 } 8791 8792 static int 8793 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 8794 { 8795 if (src->opts_size == 0) { 8796 SPDK_ERRLOG("size should not be zero\n"); 8797 return -1; 8798 } 8799 8800 memset(dst, 0, sizeof(*dst)); 8801 dst->opts_size = src->opts_size; 8802 8803 #define FIELD_OK(field) \ 8804 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 8805 8806 #define SET_FIELD(field) \ 8807 if (FIELD_OK(field)) { \ 8808 dst->field = src->field; \ 8809 } \ 8810 8811 if (FIELD_OK(name)) { 8812 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 8813 } 8814 8815 SET_FIELD(shared_claim_key); 8816 8817 /* You should not remove this statement, but need to update the assert statement 8818 * if you add a new field, and also add a corresponding SET_FIELD statement */ 8819 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 8820 8821 #undef FIELD_OK 8822 #undef SET_FIELD 8823 return 0; 8824 } 8825 8826 /* Returns 0 if a read-write-once claim can be taken. */ 8827 static int 8828 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8829 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8830 { 8831 struct spdk_bdev *bdev = desc->bdev; 8832 struct spdk_bdev_desc *open_desc; 8833 8834 assert(spdk_spin_held(&bdev->internal.spinlock)); 8835 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 8836 8837 if (opts->shared_claim_key != 0) { 8838 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 8839 bdev->name); 8840 return -EINVAL; 8841 } 8842 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8843 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8844 return -EPERM; 8845 } 8846 if (desc->claim != NULL) { 8847 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 8848 bdev->name, desc->claim->module->name); 8849 return -EPERM; 8850 } 8851 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8852 if (desc != open_desc && open_desc->write) { 8853 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 8854 "another descriptor is open for writing\n", 8855 bdev->name); 8856 return -EPERM; 8857 } 8858 } 8859 8860 return 0; 8861 } 8862 8863 /* Returns 0 if a read-only-many claim can be taken. 
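 * The descriptor must be read-only and, unless the bdev already holds a claim
 * of this type, no other open descriptor may be writable.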
*/ 8864 static int 8865 claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8866 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8867 { 8868 struct spdk_bdev *bdev = desc->bdev; 8869 struct spdk_bdev_desc *open_desc; 8870 8871 assert(spdk_spin_held(&bdev->internal.spinlock)); 8872 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE); 8873 assert(desc->claim == NULL); 8874 8875 if (desc->write) { 8876 SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n", 8877 bdev->name); 8878 return -EINVAL; 8879 } 8880 if (opts->shared_claim_key != 0) { 8881 SPDK_ERRLOG("%s: key option not supported with read-only-may claims\n", bdev->name); 8882 return -EINVAL; 8883 } 8884 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8885 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8886 if (open_desc->write) { 8887 SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while " 8888 "another descriptor is open for writing\n", 8889 bdev->name); 8890 return -EPERM; 8891 } 8892 } 8893 } 8894 8895 return 0; 8896 } 8897 8898 /* Returns 0 if a read-write-many claim can be taken. */ 8899 static int 8900 claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8901 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8902 { 8903 struct spdk_bdev *bdev = desc->bdev; 8904 struct spdk_bdev_desc *open_desc; 8905 8906 assert(spdk_spin_held(&bdev->internal.spinlock)); 8907 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED); 8908 assert(desc->claim == NULL); 8909 8910 if (opts->shared_claim_key == 0) { 8911 SPDK_ERRLOG("%s: shared_claim_key option required with read-write-may claims\n", 8912 bdev->name); 8913 return -EINVAL; 8914 } 8915 switch (bdev->internal.claim_type) { 8916 case SPDK_BDEV_CLAIM_NONE: 8917 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8918 if (open_desc == desc) { 8919 continue; 8920 } 8921 if (open_desc->write) { 8922 SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while " 8923 "another descriptor is open for writing without a " 8924 "claim\n", bdev->name); 8925 return -EPERM; 8926 } 8927 } 8928 break; 8929 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8930 if (opts->shared_claim_key != bdev->internal.claim.v2.key) { 8931 LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev); 8932 return -EPERM; 8933 } 8934 break; 8935 default: 8936 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8937 return -EBUSY; 8938 } 8939 8940 return 0; 8941 } 8942 8943 /* Updates desc and its bdev with a v2 claim. 
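 *
 * A minimal sketch of how a module might request such a claim through the
 * public API (illustrative only; error handling is omitted and "my_module"
 * and the claim name are hypothetical):
 *
 *     struct spdk_bdev_claim_opts opts;
 *     int rc;
 *
 *     spdk_bdev_claim_opts_init(&opts, sizeof(opts));
 *     snprintf(opts.name, sizeof(opts.name), "my_claim");
 *     rc = spdk_bdev_module_claim_bdev_desc(desc, SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE,
 *                                           &opts, &my_module);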
*/ 8944 static int 8945 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8946 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8947 { 8948 struct spdk_bdev *bdev = desc->bdev; 8949 struct spdk_bdev_module_claim *claim; 8950 8951 assert(spdk_spin_held(&bdev->internal.spinlock)); 8952 assert(claim_type_is_v2(type)); 8953 assert(desc->claim == NULL); 8954 8955 claim = calloc(1, sizeof(*desc->claim)); 8956 if (claim == NULL) { 8957 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 8958 return -ENOMEM; 8959 } 8960 claim->module = module; 8961 claim->desc = desc; 8962 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 8963 memcpy(claim->name, opts->name, sizeof(claim->name)); 8964 desc->claim = claim; 8965 8966 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8967 bdev->internal.claim_type = type; 8968 TAILQ_INIT(&bdev->internal.claim.v2.claims); 8969 bdev->internal.claim.v2.key = opts->shared_claim_key; 8970 } 8971 assert(type == bdev->internal.claim_type); 8972 8973 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 8974 8975 if (!desc->write && claim_type_promotes_to_write(type)) { 8976 desc->write = true; 8977 } 8978 8979 return 0; 8980 } 8981 8982 int 8983 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8984 struct spdk_bdev_claim_opts *_opts, 8985 struct spdk_bdev_module *module) 8986 { 8987 struct spdk_bdev *bdev; 8988 struct spdk_bdev_claim_opts opts; 8989 int rc = 0; 8990 8991 if (desc == NULL) { 8992 SPDK_ERRLOG("descriptor must not be NULL\n"); 8993 return -EINVAL; 8994 } 8995 8996 bdev = desc->bdev; 8997 8998 if (_opts == NULL) { 8999 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 9000 } else if (claim_opts_copy(_opts, &opts) != 0) { 9001 return -EINVAL; 9002 } 9003 9004 spdk_spin_lock(&bdev->internal.spinlock); 9005 9006 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 9007 bdev->internal.claim_type != type) { 9008 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 9009 spdk_spin_unlock(&bdev->internal.spinlock); 9010 return -EPERM; 9011 } 9012 9013 if (claim_type_is_v2(type) && desc->claim != NULL) { 9014 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 9015 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 9016 spdk_spin_unlock(&bdev->internal.spinlock); 9017 return -EPERM; 9018 } 9019 9020 switch (type) { 9021 case SPDK_BDEV_CLAIM_EXCL_WRITE: 9022 spdk_spin_unlock(&bdev->internal.spinlock); 9023 return spdk_bdev_module_claim_bdev(bdev, desc, module); 9024 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 9025 rc = claim_verify_rwo(desc, type, &opts, module); 9026 break; 9027 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 9028 rc = claim_verify_rom(desc, type, &opts, module); 9029 break; 9030 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 9031 rc = claim_verify_rwm(desc, type, &opts, module); 9032 break; 9033 default: 9034 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 9035 rc = -ENOTSUP; 9036 } 9037 9038 if (rc == 0) { 9039 rc = claim_bdev(desc, type, &opts, module); 9040 } 9041 9042 spdk_spin_unlock(&bdev->internal.spinlock); 9043 return rc; 9044 } 9045 9046 static void 9047 claim_reset(struct spdk_bdev *bdev) 9048 { 9049 assert(spdk_spin_held(&bdev->internal.spinlock)); 9050 assert(claim_type_is_v2(bdev->internal.claim_type)); 9051 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 9052 9053 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 9054 
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 9055 } 9056 9057 static void 9058 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 9059 { 9060 struct spdk_bdev *bdev = desc->bdev; 9061 9062 assert(spdk_spin_held(&bdev->internal.spinlock)); 9063 assert(claim_type_is_v2(bdev->internal.claim_type)); 9064 9065 if (bdev->internal.examine_in_progress == 0) { 9066 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 9067 free(desc->claim); 9068 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 9069 claim_reset(bdev); 9070 } 9071 } else { 9072 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 9073 desc->claim->module = NULL; 9074 desc->claim->desc = NULL; 9075 } 9076 desc->claim = NULL; 9077 } 9078 9079 /* 9080 * End claims v2 9081 */ 9082 9083 struct spdk_bdev * 9084 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 9085 { 9086 assert(desc != NULL); 9087 return desc->bdev; 9088 } 9089 9090 int 9091 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 9092 { 9093 struct spdk_bdev *bdev, *tmp; 9094 struct spdk_bdev_desc *desc; 9095 int rc = 0; 9096 9097 assert(fn != NULL); 9098 9099 spdk_spin_lock(&g_bdev_mgr.spinlock); 9100 bdev = spdk_bdev_first(); 9101 while (bdev != NULL) { 9102 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 9103 if (rc != 0) { 9104 break; 9105 } 9106 rc = bdev_open(bdev, false, desc); 9107 if (rc != 0) { 9108 bdev_desc_free(desc); 9109 if (rc == -ENODEV) { 9110 /* Ignore the error and move to the next bdev. */ 9111 rc = 0; 9112 bdev = spdk_bdev_next(bdev); 9113 continue; 9114 } 9115 break; 9116 } 9117 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9118 9119 rc = fn(ctx, bdev); 9120 9121 spdk_spin_lock(&g_bdev_mgr.spinlock); 9122 tmp = spdk_bdev_next(bdev); 9123 bdev_close(bdev, desc); 9124 if (rc != 0) { 9125 break; 9126 } 9127 bdev = tmp; 9128 } 9129 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9130 9131 return rc; 9132 } 9133 9134 int 9135 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 9136 { 9137 struct spdk_bdev *bdev, *tmp; 9138 struct spdk_bdev_desc *desc; 9139 int rc = 0; 9140 9141 assert(fn != NULL); 9142 9143 spdk_spin_lock(&g_bdev_mgr.spinlock); 9144 bdev = spdk_bdev_first_leaf(); 9145 while (bdev != NULL) { 9146 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 9147 if (rc != 0) { 9148 break; 9149 } 9150 rc = bdev_open(bdev, false, desc); 9151 if (rc != 0) { 9152 bdev_desc_free(desc); 9153 if (rc == -ENODEV) { 9154 /* Ignore the error and move to the next bdev. 
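				 * (Editorial note: -ENODEV from bdev_open() here generally means
				 * the bdev is already being unregistered, so it is skipped rather
				 * than treated as a failure.)
				 *
				 * Illustrative callback sketch for spdk_for_each_bdev()/_leaf()
				 * (editorial addition; names are hypothetical):
				 *
				 *     static int
				 *     count_bdevs(void *ctx, struct spdk_bdev *bdev)
				 *     {
				 *             (*(int *)ctx)++;
				 *             return 0;
				 *     }
				 *
				 *     int count = 0;
				 *     spdk_for_each_bdev_leaf(&count, count_bdevs);
				 *
				 * A non-zero return from the callback stops the iteration and is
				 * returned to the caller.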
*/ 9155 rc = 0; 9156 bdev = spdk_bdev_next_leaf(bdev); 9157 continue; 9158 } 9159 break; 9160 } 9161 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9162 9163 rc = fn(ctx, bdev); 9164 9165 spdk_spin_lock(&g_bdev_mgr.spinlock); 9166 tmp = spdk_bdev_next_leaf(bdev); 9167 bdev_close(bdev, desc); 9168 if (rc != 0) { 9169 break; 9170 } 9171 bdev = tmp; 9172 } 9173 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9174 9175 return rc; 9176 } 9177 9178 void 9179 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 9180 { 9181 struct iovec *iovs; 9182 int iovcnt; 9183 9184 if (bdev_io == NULL) { 9185 return; 9186 } 9187 9188 switch (bdev_io->type) { 9189 case SPDK_BDEV_IO_TYPE_READ: 9190 case SPDK_BDEV_IO_TYPE_WRITE: 9191 case SPDK_BDEV_IO_TYPE_ZCOPY: 9192 iovs = bdev_io->u.bdev.iovs; 9193 iovcnt = bdev_io->u.bdev.iovcnt; 9194 break; 9195 default: 9196 iovs = NULL; 9197 iovcnt = 0; 9198 break; 9199 } 9200 9201 if (iovp) { 9202 *iovp = iovs; 9203 } 9204 if (iovcntp) { 9205 *iovcntp = iovcnt; 9206 } 9207 } 9208 9209 void * 9210 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 9211 { 9212 if (bdev_io == NULL) { 9213 return NULL; 9214 } 9215 9216 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 9217 return NULL; 9218 } 9219 9220 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 9221 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 9222 return bdev_io->u.bdev.md_buf; 9223 } 9224 9225 return NULL; 9226 } 9227 9228 void * 9229 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 9230 { 9231 if (bdev_io == NULL) { 9232 assert(false); 9233 return NULL; 9234 } 9235 9236 return bdev_io->internal.caller_ctx; 9237 } 9238 9239 void 9240 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 9241 { 9242 9243 if (spdk_bdev_module_list_find(bdev_module->name)) { 9244 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 9245 assert(false); 9246 } 9247 9248 spdk_spin_init(&bdev_module->internal.spinlock); 9249 TAILQ_INIT(&bdev_module->internal.quiesced_ranges); 9250 9251 /* 9252 * Modules with examine callbacks must be initialized first, so they are 9253 * ready to handle examine callbacks from later modules that will 9254 * register physical bdevs. 
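	 *
	 * Illustrative sketch (editorial addition; the struct fields and the
	 * SPDK_BDEV_MODULE_REGISTER() macro are assumed from spdk/bdev_module.h, and
	 * the "example" names are hypothetical): a module declares its examine
	 * callbacks in its spdk_bdev_module and registers it, e.g.
	 *
	 *     static struct spdk_bdev_module example_if = {
	 *             .name = "example",
	 *             .module_init = example_init,
	 *             .examine_config = example_examine_config,
	 *             .examine_disk = example_examine_disk,
	 *     };
	 *     SPDK_BDEV_MODULE_REGISTER(example, &example_if)
	 *
	 * Such a module is inserted at the head of g_bdev_mgr.bdev_modules by the
	 * code below.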
9255 */ 9256 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 9257 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9258 } else { 9259 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9260 } 9261 } 9262 9263 struct spdk_bdev_module * 9264 spdk_bdev_module_list_find(const char *name) 9265 { 9266 struct spdk_bdev_module *bdev_module; 9267 9268 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 9269 if (strcmp(name, bdev_module->name) == 0) { 9270 break; 9271 } 9272 } 9273 9274 return bdev_module; 9275 } 9276 9277 static int 9278 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 9279 { 9280 uint64_t num_blocks; 9281 void *md_buf = NULL; 9282 9283 num_blocks = bdev_io->u.bdev.num_blocks; 9284 9285 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 9286 md_buf = (char *)g_bdev_mgr.zero_buffer + 9287 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 9288 } 9289 9290 return bdev_write_blocks_with_md(bdev_io->internal.desc, 9291 spdk_io_channel_from_ctx(bdev_io->internal.ch), 9292 g_bdev_mgr.zero_buffer, md_buf, 9293 bdev_io->u.bdev.offset_blocks, num_blocks, 9294 bdev_write_zero_buffer_done, bdev_io); 9295 } 9296 9297 static void 9298 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9299 { 9300 struct spdk_bdev_io *parent_io = cb_arg; 9301 9302 spdk_bdev_free_io(bdev_io); 9303 9304 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 9305 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 9306 } 9307 9308 static void 9309 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 9310 { 9311 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9312 ctx->bdev->internal.qos_mod_in_progress = false; 9313 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9314 9315 if (ctx->cb_fn) { 9316 ctx->cb_fn(ctx->cb_arg, status); 9317 } 9318 free(ctx); 9319 } 9320 9321 static void 9322 bdev_disable_qos_done(void *cb_arg) 9323 { 9324 struct set_qos_limit_ctx *ctx = cb_arg; 9325 struct spdk_bdev *bdev = ctx->bdev; 9326 struct spdk_bdev_qos *qos; 9327 9328 spdk_spin_lock(&bdev->internal.spinlock); 9329 qos = bdev->internal.qos; 9330 bdev->internal.qos = NULL; 9331 spdk_spin_unlock(&bdev->internal.spinlock); 9332 9333 if (qos->thread != NULL) { 9334 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 9335 spdk_poller_unregister(&qos->poller); 9336 } 9337 9338 free(qos); 9339 9340 bdev_set_qos_limit_done(ctx, 0); 9341 } 9342 9343 static void 9344 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 9345 { 9346 struct set_qos_limit_ctx *ctx = _ctx; 9347 struct spdk_thread *thread; 9348 9349 spdk_spin_lock(&bdev->internal.spinlock); 9350 thread = bdev->internal.qos->thread; 9351 spdk_spin_unlock(&bdev->internal.spinlock); 9352 9353 if (thread != NULL) { 9354 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 9355 } else { 9356 bdev_disable_qos_done(ctx); 9357 } 9358 } 9359 9360 static void 9361 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9362 struct spdk_io_channel *ch, void *_ctx) 9363 { 9364 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9365 struct spdk_bdev_io *bdev_io; 9366 9367 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 9368 9369 while (!TAILQ_EMPTY(&bdev_ch->qos_queued_io)) { 9370 /* Re-submit the queued I/O. 
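		 * (Editorial note: BDEV_CH_QOS_ENABLED was cleared above, so the
		 * resubmitted I/O now bypasses the QoS queue. From the public API,
		 * QoS is disabled by passing zeroes for the limits, for example:
		 *
		 *     uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES] = {0};
		 *
		 *     spdk_bdev_set_qos_rate_limits(bdev, limits, qos_done_cb, NULL);
		 *
		 * where qos_done_cb is a hypothetical completion callback taking
		 * (void *cb_arg, int status).)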
*/ 9371 bdev_io = TAILQ_FIRST(&bdev_ch->qos_queued_io); 9372 TAILQ_REMOVE(&bdev_ch->qos_queued_io, bdev_io, internal.link); 9373 _bdev_io_submit(bdev_io); 9374 } 9375 9376 spdk_bdev_for_each_channel_continue(i, 0); 9377 } 9378 9379 static void 9380 bdev_update_qos_rate_limit_msg(void *cb_arg) 9381 { 9382 struct set_qos_limit_ctx *ctx = cb_arg; 9383 struct spdk_bdev *bdev = ctx->bdev; 9384 9385 spdk_spin_lock(&bdev->internal.spinlock); 9386 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 9387 spdk_spin_unlock(&bdev->internal.spinlock); 9388 9389 bdev_set_qos_limit_done(ctx, 0); 9390 } 9391 9392 static void 9393 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9394 struct spdk_io_channel *ch, void *_ctx) 9395 { 9396 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9397 9398 spdk_spin_lock(&bdev->internal.spinlock); 9399 bdev_enable_qos(bdev, bdev_ch); 9400 spdk_spin_unlock(&bdev->internal.spinlock); 9401 spdk_bdev_for_each_channel_continue(i, 0); 9402 } 9403 9404 static void 9405 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 9406 { 9407 struct set_qos_limit_ctx *ctx = _ctx; 9408 9409 bdev_set_qos_limit_done(ctx, status); 9410 } 9411 9412 static void 9413 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 9414 { 9415 int i; 9416 9417 assert(bdev->internal.qos != NULL); 9418 9419 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9420 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9421 bdev->internal.qos->rate_limits[i].limit = limits[i]; 9422 9423 if (limits[i] == 0) { 9424 bdev->internal.qos->rate_limits[i].limit = 9425 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 9426 } 9427 } 9428 } 9429 } 9430 9431 void 9432 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 9433 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 9434 { 9435 struct set_qos_limit_ctx *ctx; 9436 uint32_t limit_set_complement; 9437 uint64_t min_limit_per_sec; 9438 int i; 9439 bool disable_rate_limit = true; 9440 9441 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9442 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9443 continue; 9444 } 9445 9446 if (limits[i] > 0) { 9447 disable_rate_limit = false; 9448 } 9449 9450 if (bdev_qos_is_iops_rate_limit(i) == true) { 9451 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 9452 } else { 9453 if (limits[i] > SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC) { 9454 SPDK_WARNLOG("Requested rate limit %" PRIu64 " will result in uint64_t overflow, " 9455 "reset to %" PRIu64 "\n", limits[i], SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC); 9456 limits[i] = SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC; 9457 } 9458 /* Change from megabyte to byte rate limit */ 9459 limits[i] = limits[i] * 1024 * 1024; 9460 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 9461 } 9462 9463 limit_set_complement = limits[i] % min_limit_per_sec; 9464 if (limit_set_complement) { 9465 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 9466 limits[i], min_limit_per_sec); 9467 limits[i] += min_limit_per_sec - limit_set_complement; 9468 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 9469 } 9470 } 9471 9472 ctx = calloc(1, sizeof(*ctx)); 9473 if (ctx == NULL) { 9474 cb_fn(cb_arg, -ENOMEM); 9475 return; 9476 } 9477 9478 ctx->cb_fn = cb_fn; 9479 ctx->cb_arg = cb_arg; 9480 ctx->bdev = bdev; 9481 9482 spdk_spin_lock(&bdev->internal.spinlock); 9483 if (bdev->internal.qos_mod_in_progress) { 9484 spdk_spin_unlock(&bdev->internal.spinlock); 9485 free(ctx); 9486 cb_fn(cb_arg, 
-EAGAIN); 9487 return; 9488 } 9489 bdev->internal.qos_mod_in_progress = true; 9490 9491 if (disable_rate_limit == true && bdev->internal.qos) { 9492 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9493 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 9494 (bdev->internal.qos->rate_limits[i].limit > 0 && 9495 bdev->internal.qos->rate_limits[i].limit != 9496 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 9497 disable_rate_limit = false; 9498 break; 9499 } 9500 } 9501 } 9502 9503 if (disable_rate_limit == false) { 9504 if (bdev->internal.qos == NULL) { 9505 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 9506 if (!bdev->internal.qos) { 9507 spdk_spin_unlock(&bdev->internal.spinlock); 9508 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 9509 bdev_set_qos_limit_done(ctx, -ENOMEM); 9510 return; 9511 } 9512 } 9513 9514 if (bdev->internal.qos->thread == NULL) { 9515 /* Enabling */ 9516 bdev_set_qos_rate_limits(bdev, limits); 9517 9518 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 9519 bdev_enable_qos_done); 9520 } else { 9521 /* Updating */ 9522 bdev_set_qos_rate_limits(bdev, limits); 9523 9524 spdk_thread_send_msg(bdev->internal.qos->thread, 9525 bdev_update_qos_rate_limit_msg, ctx); 9526 } 9527 } else { 9528 if (bdev->internal.qos != NULL) { 9529 bdev_set_qos_rate_limits(bdev, limits); 9530 9531 /* Disabling */ 9532 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 9533 bdev_disable_qos_msg_done); 9534 } else { 9535 spdk_spin_unlock(&bdev->internal.spinlock); 9536 bdev_set_qos_limit_done(ctx, 0); 9537 return; 9538 } 9539 } 9540 9541 spdk_spin_unlock(&bdev->internal.spinlock); 9542 } 9543 9544 struct spdk_bdev_histogram_ctx { 9545 spdk_bdev_histogram_status_cb cb_fn; 9546 void *cb_arg; 9547 struct spdk_bdev *bdev; 9548 int status; 9549 }; 9550 9551 static void 9552 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9553 { 9554 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9555 9556 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9557 ctx->bdev->internal.histogram_in_progress = false; 9558 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9559 ctx->cb_fn(ctx->cb_arg, ctx->status); 9560 free(ctx); 9561 } 9562 9563 static void 9564 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9565 struct spdk_io_channel *_ch, void *_ctx) 9566 { 9567 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9568 9569 if (ch->histogram != NULL) { 9570 spdk_histogram_data_free(ch->histogram); 9571 ch->histogram = NULL; 9572 } 9573 spdk_bdev_for_each_channel_continue(i, 0); 9574 } 9575 9576 static void 9577 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9578 { 9579 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9580 9581 if (status != 0) { 9582 ctx->status = status; 9583 ctx->bdev->internal.histogram_enabled = false; 9584 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 9585 bdev_histogram_disable_channel_cb); 9586 } else { 9587 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9588 ctx->bdev->internal.histogram_in_progress = false; 9589 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9590 ctx->cb_fn(ctx->cb_arg, ctx->status); 9591 free(ctx); 9592 } 9593 } 9594 9595 static void 9596 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9597 struct spdk_io_channel *_ch, void *_ctx) 9598 { 9599 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9600 int status = 0; 9601 9602 if (ch->histogram == NULL) { 9603 
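		/* Allocate the per-channel histogram lazily; if this fails, the -ENOMEM
		 * status propagates through the channel iterator and the completion
		 * callback tears the already-allocated histograms back down.
		 */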
ch->histogram = spdk_histogram_data_alloc(); 9604 if (ch->histogram == NULL) { 9605 status = -ENOMEM; 9606 } 9607 } 9608 9609 spdk_bdev_for_each_channel_continue(i, status); 9610 } 9611 9612 void 9613 spdk_bdev_histogram_enable_ext(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9614 void *cb_arg, bool enable, struct spdk_bdev_enable_histogram_opts *opts) 9615 { 9616 struct spdk_bdev_histogram_ctx *ctx; 9617 9618 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 9619 if (ctx == NULL) { 9620 cb_fn(cb_arg, -ENOMEM); 9621 return; 9622 } 9623 9624 ctx->bdev = bdev; 9625 ctx->status = 0; 9626 ctx->cb_fn = cb_fn; 9627 ctx->cb_arg = cb_arg; 9628 9629 spdk_spin_lock(&bdev->internal.spinlock); 9630 if (bdev->internal.histogram_in_progress) { 9631 spdk_spin_unlock(&bdev->internal.spinlock); 9632 free(ctx); 9633 cb_fn(cb_arg, -EAGAIN); 9634 return; 9635 } 9636 9637 bdev->internal.histogram_in_progress = true; 9638 spdk_spin_unlock(&bdev->internal.spinlock); 9639 9640 bdev->internal.histogram_enabled = enable; 9641 bdev->internal.histogram_io_type = opts->io_type; 9642 9643 if (enable) { 9644 /* Allocate histogram for each channel */ 9645 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 9646 bdev_histogram_enable_channel_cb); 9647 } else { 9648 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 9649 bdev_histogram_disable_channel_cb); 9650 } 9651 } 9652 9653 void 9654 spdk_bdev_enable_histogram_opts_init(struct spdk_bdev_enable_histogram_opts *opts, size_t size) 9655 { 9656 if (opts == NULL) { 9657 SPDK_ERRLOG("opts should not be NULL\n"); 9658 assert(opts != NULL); 9659 return; 9660 } 9661 if (size == 0) { 9662 SPDK_ERRLOG("size should not be zero\n"); 9663 assert(size != 0); 9664 return; 9665 } 9666 9667 memset(opts, 0, size); 9668 opts->size = size; 9669 9670 #define FIELD_OK(field) \ 9671 offsetof(struct spdk_bdev_enable_histogram_opts, field) + sizeof(opts->field) <= size 9672 9673 #define SET_FIELD(field, value) \ 9674 if (FIELD_OK(field)) { \ 9675 opts->field = value; \ 9676 } \ 9677 9678 SET_FIELD(io_type, 0); 9679 9680 /* You should not remove this statement, but need to update the assert statement 9681 * if you add a new field, and also add a corresponding SET_FIELD statement */ 9682 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_enable_histogram_opts) == 9, "Incorrect size"); 9683 9684 #undef FIELD_OK 9685 #undef SET_FIELD 9686 } 9687 9688 void 9689 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9690 void *cb_arg, bool enable) 9691 { 9692 struct spdk_bdev_enable_histogram_opts opts; 9693 9694 spdk_bdev_enable_histogram_opts_init(&opts, sizeof(opts)); 9695 spdk_bdev_histogram_enable_ext(bdev, cb_fn, cb_arg, enable, &opts); 9696 } 9697 9698 struct spdk_bdev_histogram_data_ctx { 9699 spdk_bdev_histogram_data_cb cb_fn; 9700 void *cb_arg; 9701 struct spdk_bdev *bdev; 9702 /** merged histogram data from all channels */ 9703 struct spdk_histogram_data *histogram; 9704 }; 9705 9706 static void 9707 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9708 { 9709 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9710 9711 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 9712 free(ctx); 9713 } 9714 9715 static void 9716 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9717 struct spdk_io_channel *_ch, void *_ctx) 9718 { 9719 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9720 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9721 int 
status = 0; 9722 9723 if (ch->histogram == NULL) { 9724 status = -EFAULT; 9725 } else { 9726 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 9727 } 9728 9729 spdk_bdev_for_each_channel_continue(i, status); 9730 } 9731 9732 void 9733 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 9734 spdk_bdev_histogram_data_cb cb_fn, 9735 void *cb_arg) 9736 { 9737 struct spdk_bdev_histogram_data_ctx *ctx; 9738 9739 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 9740 if (ctx == NULL) { 9741 cb_fn(cb_arg, -ENOMEM, NULL); 9742 return; 9743 } 9744 9745 ctx->bdev = bdev; 9746 ctx->cb_fn = cb_fn; 9747 ctx->cb_arg = cb_arg; 9748 9749 ctx->histogram = histogram; 9750 9751 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 9752 bdev_histogram_get_channel_cb); 9753 } 9754 9755 void 9756 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 9757 void *cb_arg) 9758 { 9759 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9760 int status = 0; 9761 9762 assert(cb_fn != NULL); 9763 9764 if (bdev_ch->histogram == NULL) { 9765 status = -EFAULT; 9766 } 9767 cb_fn(cb_arg, status, bdev_ch->histogram); 9768 } 9769 9770 size_t 9771 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 9772 size_t max_events) 9773 { 9774 struct media_event_entry *entry; 9775 size_t num_events = 0; 9776 9777 for (; num_events < max_events; ++num_events) { 9778 entry = TAILQ_FIRST(&desc->pending_media_events); 9779 if (entry == NULL) { 9780 break; 9781 } 9782 9783 events[num_events] = entry->event; 9784 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 9785 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 9786 } 9787 9788 return num_events; 9789 } 9790 9791 int 9792 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 9793 size_t num_events) 9794 { 9795 struct spdk_bdev_desc *desc; 9796 struct media_event_entry *entry; 9797 size_t event_id; 9798 int rc = 0; 9799 9800 assert(bdev->media_events); 9801 9802 spdk_spin_lock(&bdev->internal.spinlock); 9803 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9804 if (desc->write) { 9805 break; 9806 } 9807 } 9808 9809 if (desc == NULL || desc->media_events_buffer == NULL) { 9810 rc = -ENODEV; 9811 goto out; 9812 } 9813 9814 for (event_id = 0; event_id < num_events; ++event_id) { 9815 entry = TAILQ_FIRST(&desc->free_media_events); 9816 if (entry == NULL) { 9817 break; 9818 } 9819 9820 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 9821 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 9822 entry->event = events[event_id]; 9823 } 9824 9825 rc = event_id; 9826 out: 9827 spdk_spin_unlock(&bdev->internal.spinlock); 9828 return rc; 9829 } 9830 9831 static void 9832 _media_management_notify(void *arg) 9833 { 9834 struct spdk_bdev_desc *desc = arg; 9835 9836 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 9837 } 9838 9839 void 9840 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 9841 { 9842 struct spdk_bdev_desc *desc; 9843 9844 spdk_spin_lock(&bdev->internal.spinlock); 9845 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9846 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 9847 event_notify(desc, _media_management_notify); 9848 } 9849 } 9850 spdk_spin_unlock(&bdev->internal.spinlock); 9851 } 9852 9853 struct locked_lba_range_ctx { 9854 struct lba_range range; 9855 struct lba_range *current_range; 9856 struct lba_range *owner_range; 9857 
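	/* Poller used by bdev_lock_lba_range_check_io() to periodically re-check a
	 * channel for outstanding I/O that overlaps the range being locked.
	 */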
	struct spdk_poller *poller;
	lock_range_cb cb_fn;
	void *cb_arg;
};

static void
bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct locked_lba_range_ctx *ctx = _ctx;

	ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM);
	free(ctx);
}

static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i,
		struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx);

static void
bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct locked_lba_range_ctx *ctx = _ctx;

	if (status == -ENOMEM) {
		/* One of the channels could not allocate a range object.
		 * So we have to go back and clean up any ranges that were
		 * allocated successfully before we return error status to
		 * the caller. We can reuse the unlock function to do that
		 * clean up.
		 */
		spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
					   bdev_lock_error_cleanup_cb);
		return;
	}

	/* All channels have locked this range and no I/O overlapping the range
	 * is outstanding! Set the owner_ch for the range object for the
	 * locking channel, so that this channel will know that it is allowed
	 * to write to this range.
	 */
	if (ctx->owner_range != NULL) {
		ctx->owner_range->owner_ch = ctx->range.owner_ch;
	}

	ctx->cb_fn(&ctx->range, ctx->cb_arg, status);

	/* Don't free the ctx here. Its range is in the bdev's global list of
	 * locked ranges still, and will be removed and freed when this range
	 * is later unlocked.
	 */
}

static int
bdev_lock_lba_range_check_io(void *_i)
{
	struct spdk_bdev_channel_iter *i = _i;
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i);
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
	struct locked_lba_range_ctx *ctx = i->ctx;
	struct lba_range *range = ctx->current_range;
	struct spdk_bdev_io *bdev_io;

	spdk_poller_unregister(&ctx->poller);

	/* The range is now in the locked_ranges, so no new I/O can be submitted to this
	 * range. But we need to wait until any outstanding I/O overlapping with this range
	 * has completed.
	 */
	TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) {
		if (bdev_io_range_is_locked(bdev_io, range)) {
			ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100);
			return SPDK_POLLER_BUSY;
		}
	}

	spdk_bdev_for_each_channel_continue(i, 0);
	return SPDK_POLLER_BUSY;
}

static void
bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				struct spdk_io_channel *_ch, void *_ctx)
{
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
	struct locked_lba_range_ctx *ctx = _ctx;
	struct lba_range *range;

	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (range->length == ctx->range.length &&
		    range->offset == ctx->range.offset &&
		    range->locked_ctx == ctx->range.locked_ctx) {
			/* This range already exists on this channel, so don't add
			 * it again. This can happen when a new channel is created
			 * while the for_each_channel operation is in progress.
			 * Do not check for outstanding I/O in that case, since the
			 * range was locked before any I/O could be submitted to the
			 * new channel.
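			 * (Editorial note, based on the channel-creation path elsewhere in
			 * this file: a newly created channel copies the ranges already in
			 * bdev->internal.locked_ranges, which is how the duplicate can
			 * appear here.)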
9953 */ 9954 spdk_bdev_for_each_channel_continue(i, 0); 9955 return; 9956 } 9957 } 9958 9959 range = calloc(1, sizeof(*range)); 9960 if (range == NULL) { 9961 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 9962 return; 9963 } 9964 9965 range->length = ctx->range.length; 9966 range->offset = ctx->range.offset; 9967 range->locked_ctx = ctx->range.locked_ctx; 9968 range->quiesce = ctx->range.quiesce; 9969 ctx->current_range = range; 9970 if (ctx->range.owner_ch == ch) { 9971 /* This is the range object for the channel that will hold 9972 * the lock. Store it in the ctx object so that we can easily 9973 * set its owner_ch after the lock is finally acquired. 9974 */ 9975 ctx->owner_range = range; 9976 } 9977 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 9978 bdev_lock_lba_range_check_io(i); 9979 } 9980 9981 static void 9982 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 9983 { 9984 assert(spdk_get_thread() == ctx->range.owner_thread); 9985 assert(ctx->range.owner_ch == NULL || 9986 spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread); 9987 9988 /* We will add a copy of this range to each channel now. */ 9989 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 9990 bdev_lock_lba_range_cb); 9991 } 9992 9993 static bool 9994 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 9995 { 9996 struct lba_range *r; 9997 9998 TAILQ_FOREACH(r, tailq, tailq) { 9999 if (bdev_lba_range_overlapped(range, r)) { 10000 return true; 10001 } 10002 } 10003 return false; 10004 } 10005 10006 static void bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status); 10007 10008 static int 10009 _bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch, 10010 uint64_t offset, uint64_t length, 10011 lock_range_cb cb_fn, void *cb_arg) 10012 { 10013 struct locked_lba_range_ctx *ctx; 10014 10015 ctx = calloc(1, sizeof(*ctx)); 10016 if (ctx == NULL) { 10017 return -ENOMEM; 10018 } 10019 10020 ctx->range.offset = offset; 10021 ctx->range.length = length; 10022 ctx->range.owner_thread = spdk_get_thread(); 10023 ctx->range.owner_ch = ch; 10024 ctx->range.locked_ctx = cb_arg; 10025 ctx->range.bdev = bdev; 10026 ctx->range.quiesce = (cb_fn == bdev_quiesce_range_locked); 10027 ctx->cb_fn = cb_fn; 10028 ctx->cb_arg = cb_arg; 10029 10030 spdk_spin_lock(&bdev->internal.spinlock); 10031 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 10032 /* There is an active lock overlapping with this range. 10033 * Put it on the pending list until this range no 10034 * longer overlaps with another. 
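		 * (Editorial note: pending ranges are revisited in
		 * bdev_unlock_lba_range_cb() once an overlapping range is unlocked,
		 * at which point the lock sequence is started for them.)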
10035 */ 10036 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 10037 } else { 10038 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 10039 bdev_lock_lba_range_ctx(bdev, ctx); 10040 } 10041 spdk_spin_unlock(&bdev->internal.spinlock); 10042 return 0; 10043 } 10044 10045 static int 10046 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 10047 uint64_t offset, uint64_t length, 10048 lock_range_cb cb_fn, void *cb_arg) 10049 { 10050 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10051 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10052 10053 if (cb_arg == NULL) { 10054 SPDK_ERRLOG("cb_arg must not be NULL\n"); 10055 return -EINVAL; 10056 } 10057 10058 return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg); 10059 } 10060 10061 static void 10062 bdev_lock_lba_range_ctx_msg(void *_ctx) 10063 { 10064 struct locked_lba_range_ctx *ctx = _ctx; 10065 10066 bdev_lock_lba_range_ctx(ctx->range.bdev, ctx); 10067 } 10068 10069 static void 10070 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 10071 { 10072 struct locked_lba_range_ctx *ctx = _ctx; 10073 struct locked_lba_range_ctx *pending_ctx; 10074 struct lba_range *range, *tmp; 10075 10076 spdk_spin_lock(&bdev->internal.spinlock); 10077 /* Check if there are any pending locked ranges that overlap with this range 10078 * that was just unlocked. If there are, check that it doesn't overlap with any 10079 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 10080 * the lock process. 10081 */ 10082 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 10083 if (bdev_lba_range_overlapped(range, &ctx->range) && 10084 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 10085 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 10086 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 10087 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 10088 spdk_thread_send_msg(pending_ctx->range.owner_thread, 10089 bdev_lock_lba_range_ctx_msg, pending_ctx); 10090 } 10091 } 10092 spdk_spin_unlock(&bdev->internal.spinlock); 10093 10094 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 10095 free(ctx); 10096 } 10097 10098 static void 10099 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10100 struct spdk_io_channel *_ch, void *_ctx) 10101 { 10102 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10103 struct locked_lba_range_ctx *ctx = _ctx; 10104 TAILQ_HEAD(, spdk_bdev_io) io_locked; 10105 struct spdk_bdev_io *bdev_io; 10106 struct lba_range *range; 10107 10108 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10109 if (ctx->range.offset == range->offset && 10110 ctx->range.length == range->length && 10111 ctx->range.locked_ctx == range->locked_ctx) { 10112 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 10113 free(range); 10114 break; 10115 } 10116 } 10117 10118 /* Note: we should almost always be able to assert that the range specified 10119 * was found. But there are some very rare corner cases where a new channel 10120 * gets created simultaneously with a range unlock, where this function 10121 * would execute on that new channel and wouldn't have the range. 10122 * We also use this to clean up range allocations when a later allocation 10123 * fails in the locking path. 10124 * So we can't actually assert() here. 
	 */

	/* Swap the locked I/O into a temporary list, and then try to submit them again.
	 * We could hyper-optimize this to only resubmit locked I/O that overlap
	 * with the range that was just unlocked, but this isn't a performance path so
	 * we go for simplicity here.
	 */
	TAILQ_INIT(&io_locked);
	TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link);
	while (!TAILQ_EMPTY(&io_locked)) {
		bdev_io = TAILQ_FIRST(&io_locked);
		TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link);
		bdev_io_submit(bdev_io);
	}

	spdk_bdev_for_each_channel_continue(i, 0);
}

static int
_bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length,
		       lock_range_cb cb_fn, void *cb_arg)
{
	struct locked_lba_range_ctx *ctx;
	struct lba_range *range;

	spdk_spin_lock(&bdev->internal.spinlock);
	/* To start the unlock process, we find the range in the bdev's locked_ranges
	 * and remove it. This ensures new channels don't inherit the locked range.
	 * Then we will send a message to each channel to remove the range from its
	 * per-channel list.
	 */
	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    (range->owner_ch == NULL || range->locked_ctx == cb_arg)) {
			break;
		}
	}
	if (range == NULL) {
		assert(false);
		spdk_spin_unlock(&bdev->internal.spinlock);
		return -EINVAL;
	}
	TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq);
	ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
	spdk_spin_unlock(&bdev->internal.spinlock);

	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
				   bdev_unlock_lba_range_cb);
	return 0;
}

static int
bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		      uint64_t offset, uint64_t length,
		      lock_range_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
	struct lba_range *range;
	bool range_found = false;

	/* Let's make sure the specified channel actually has a lock on
	 * the specified range. Note that the range must match exactly.
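	 * (Editorial note: only the channel that took the lock, i.e. the one whose
	 * owner_ch matches, may release it through this helper. Quiesce ranges have
	 * no owner channel and are released via _bdev_unlock_lba_range() directly.)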
10191 */ 10192 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10193 if (range->offset == offset && range->length == length && 10194 range->owner_ch == ch && range->locked_ctx == cb_arg) { 10195 range_found = true; 10196 break; 10197 } 10198 } 10199 10200 if (!range_found) { 10201 return -EINVAL; 10202 } 10203 10204 return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg); 10205 } 10206 10207 struct bdev_quiesce_ctx { 10208 spdk_bdev_quiesce_cb cb_fn; 10209 void *cb_arg; 10210 }; 10211 10212 static void 10213 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status) 10214 { 10215 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 10216 10217 if (quiesce_ctx->cb_fn != NULL) { 10218 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 10219 } 10220 10221 free(quiesce_ctx); 10222 } 10223 10224 static void 10225 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status) 10226 { 10227 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 10228 struct spdk_bdev_module *module = range->bdev->module; 10229 10230 if (status != 0) { 10231 if (quiesce_ctx->cb_fn != NULL) { 10232 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 10233 } 10234 free(quiesce_ctx); 10235 return; 10236 } 10237 10238 spdk_spin_lock(&module->internal.spinlock); 10239 TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module); 10240 spdk_spin_unlock(&module->internal.spinlock); 10241 10242 if (quiesce_ctx->cb_fn != NULL) { 10243 /* copy the context in case the range is unlocked by the callback */ 10244 struct bdev_quiesce_ctx tmp = *quiesce_ctx; 10245 10246 quiesce_ctx->cb_fn = NULL; 10247 quiesce_ctx->cb_arg = NULL; 10248 10249 tmp.cb_fn(tmp.cb_arg, status); 10250 } 10251 /* quiesce_ctx will be freed on unquiesce */ 10252 } 10253 10254 static int 10255 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10256 uint64_t offset, uint64_t length, 10257 spdk_bdev_quiesce_cb cb_fn, void *cb_arg, 10258 bool unquiesce) 10259 { 10260 struct bdev_quiesce_ctx *quiesce_ctx; 10261 int rc; 10262 10263 if (module != bdev->module) { 10264 SPDK_ERRLOG("Bdev does not belong to specified module.\n"); 10265 return -EINVAL; 10266 } 10267 10268 if (!bdev_io_valid_blocks(bdev, offset, length)) { 10269 return -EINVAL; 10270 } 10271 10272 if (unquiesce) { 10273 struct lba_range *range; 10274 10275 /* Make sure the specified range is actually quiesced in the specified module and 10276 * then remove it from the list. Note that the range must match exactly. 
10277 */ 10278 spdk_spin_lock(&module->internal.spinlock); 10279 TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) { 10280 if (range->bdev == bdev && range->offset == offset && range->length == length) { 10281 TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module); 10282 break; 10283 } 10284 } 10285 spdk_spin_unlock(&module->internal.spinlock); 10286 10287 if (range == NULL) { 10288 SPDK_ERRLOG("The range to unquiesce was not found.\n"); 10289 return -EINVAL; 10290 } 10291 10292 quiesce_ctx = range->locked_ctx; 10293 quiesce_ctx->cb_fn = cb_fn; 10294 quiesce_ctx->cb_arg = cb_arg; 10295 10296 rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx); 10297 } else { 10298 quiesce_ctx = malloc(sizeof(*quiesce_ctx)); 10299 if (quiesce_ctx == NULL) { 10300 return -ENOMEM; 10301 } 10302 10303 quiesce_ctx->cb_fn = cb_fn; 10304 quiesce_ctx->cb_arg = cb_arg; 10305 10306 rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx); 10307 if (rc != 0) { 10308 free(quiesce_ctx); 10309 } 10310 } 10311 10312 return rc; 10313 } 10314 10315 int 10316 spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10317 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10318 { 10319 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false); 10320 } 10321 10322 int 10323 spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10324 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10325 { 10326 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true); 10327 } 10328 10329 int 10330 spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10331 uint64_t offset, uint64_t length, 10332 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10333 { 10334 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false); 10335 } 10336 10337 int 10338 spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10339 uint64_t offset, uint64_t length, 10340 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10341 { 10342 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true); 10343 } 10344 10345 int 10346 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 10347 int array_size) 10348 { 10349 if (!bdev) { 10350 return -EINVAL; 10351 } 10352 10353 if (bdev->fn_table->get_memory_domains) { 10354 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 10355 } 10356 10357 return 0; 10358 } 10359 10360 struct spdk_bdev_for_each_io_ctx { 10361 void *ctx; 10362 spdk_bdev_io_fn fn; 10363 spdk_bdev_for_each_io_cb cb; 10364 }; 10365 10366 static void 10367 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10368 struct spdk_io_channel *io_ch, void *_ctx) 10369 { 10370 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10371 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 10372 struct spdk_bdev_io *bdev_io; 10373 int rc = 0; 10374 10375 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 10376 rc = ctx->fn(ctx->ctx, bdev_io); 10377 if (rc != 0) { 10378 break; 10379 } 10380 } 10381 10382 spdk_bdev_for_each_channel_continue(i, rc); 10383 } 10384 10385 static void 10386 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 10387 { 10388 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10389 10390 ctx->cb(ctx->ctx, status); 10391 10392 free(ctx); 10393 } 10394 10395 void 10396 
spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 10397 spdk_bdev_for_each_io_cb cb) 10398 { 10399 struct spdk_bdev_for_each_io_ctx *ctx; 10400 10401 assert(fn != NULL && cb != NULL); 10402 10403 ctx = calloc(1, sizeof(*ctx)); 10404 if (ctx == NULL) { 10405 SPDK_ERRLOG("Failed to allocate context.\n"); 10406 cb(_ctx, -ENOMEM); 10407 return; 10408 } 10409 10410 ctx->ctx = _ctx; 10411 ctx->fn = fn; 10412 ctx->cb = cb; 10413 10414 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 10415 bdev_for_each_io_done); 10416 } 10417 10418 void 10419 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 10420 { 10421 spdk_for_each_channel_continue(iter->i, status); 10422 } 10423 10424 static struct spdk_bdev * 10425 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 10426 { 10427 void *io_device = spdk_io_channel_iter_get_io_device(i); 10428 10429 return __bdev_from_io_dev(io_device); 10430 } 10431 10432 static void 10433 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 10434 { 10435 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10436 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10437 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 10438 10439 iter->i = i; 10440 iter->fn(iter, bdev, ch, iter->ctx); 10441 } 10442 10443 static void 10444 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 10445 { 10446 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10447 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10448 10449 iter->i = i; 10450 iter->cpl(bdev, iter->ctx, status); 10451 10452 free(iter); 10453 } 10454 10455 void 10456 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 10457 void *ctx, spdk_bdev_for_each_channel_done cpl) 10458 { 10459 struct spdk_bdev_channel_iter *iter; 10460 10461 assert(bdev != NULL && fn != NULL && ctx != NULL); 10462 10463 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 10464 if (iter == NULL) { 10465 SPDK_ERRLOG("Unable to allocate iterator\n"); 10466 assert(false); 10467 return; 10468 } 10469 10470 iter->fn = fn; 10471 iter->cpl = cpl; 10472 iter->ctx = ctx; 10473 10474 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 10475 iter, bdev_each_channel_cpl); 10476 } 10477 10478 static void 10479 bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10480 { 10481 struct spdk_bdev_io *parent_io = cb_arg; 10482 10483 spdk_bdev_free_io(bdev_io); 10484 10485 /* Check return status of write */ 10486 parent_io->internal.status = success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 10487 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 10488 } 10489 10490 static void 10491 bdev_copy_do_write(void *_bdev_io) 10492 { 10493 struct spdk_bdev_io *bdev_io = _bdev_io; 10494 int rc; 10495 10496 /* Write blocks */ 10497 rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc, 10498 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10499 bdev_io->u.bdev.iovs[0].iov_base, 10500 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks, 10501 bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io); 10502 10503 if (rc == -ENOMEM) { 10504 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write); 10505 } else if (rc != 0) { 10506 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10507 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10508 } 10509 } 10510 10511 static void 10512 bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10513 { 10514 struct spdk_bdev_io *parent_io = cb_arg; 10515 10516 spdk_bdev_free_io(bdev_io); 10517 10518 /* Check return status of read */ 10519 if (!success) { 10520 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10521 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 10522 return; 10523 } 10524 10525 /* Do write */ 10526 bdev_copy_do_write(parent_io); 10527 } 10528 10529 static void 10530 bdev_copy_do_read(void *_bdev_io) 10531 { 10532 struct spdk_bdev_io *bdev_io = _bdev_io; 10533 int rc; 10534 10535 /* Read blocks */ 10536 rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc, 10537 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10538 bdev_io->u.bdev.iovs[0].iov_base, 10539 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks, 10540 bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io); 10541 10542 if (rc == -ENOMEM) { 10543 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read); 10544 } else if (rc != 0) { 10545 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10546 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10547 } 10548 } 10549 10550 static void 10551 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 10552 { 10553 if (!success) { 10554 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10555 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10556 return; 10557 } 10558 10559 bdev_copy_do_read(bdev_io); 10560 } 10561 10562 int 10563 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 10564 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 10565 spdk_bdev_io_completion_cb cb, void *cb_arg) 10566 { 10567 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10568 struct spdk_bdev_io *bdev_io; 10569 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 10570 10571 if (!desc->write) { 10572 return -EBADF; 10573 } 10574 10575 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 10576 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 10577 SPDK_DEBUGLOG(bdev, 10578 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 10579 dst_offset_blocks, src_offset_blocks, num_blocks); 10580 return -EINVAL; 10581 } 10582 10583 bdev_io = bdev_channel_get_io(channel); 10584 if (!bdev_io) { 10585 return -ENOMEM; 10586 } 10587 10588 bdev_io->internal.ch = channel; 10589 bdev_io->internal.desc = desc; 10590 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 10591 10592 
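	/* For a copy, offset_blocks carries the destination LBA; the source LBA is
	 * tracked separately in u.bdev.copy.src_offset_blocks below.
	 */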
bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 10593 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 10594 bdev_io->u.bdev.num_blocks = num_blocks; 10595 bdev_io->u.bdev.memory_domain = NULL; 10596 bdev_io->u.bdev.memory_domain_ctx = NULL; 10597 bdev_io->u.bdev.iovs = NULL; 10598 bdev_io->u.bdev.iovcnt = 0; 10599 bdev_io->u.bdev.md_buf = NULL; 10600 bdev_io->u.bdev.accel_sequence = NULL; 10601 bdev_io_init(bdev_io, bdev, cb_arg, cb); 10602 10603 if (dst_offset_blocks == src_offset_blocks || num_blocks == 0) { 10604 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 10605 return 0; 10606 } 10607 10608 10609 /* If the copy size is large and should be split, use the generic split logic 10610 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not. 10611 * 10612 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or 10613 * emulate it using regular read and write requests otherwise. 10614 */ 10615 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) || 10616 bdev_io->internal.f.split) { 10617 bdev_io_submit(bdev_io); 10618 return 0; 10619 } 10620 10621 spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev)); 10622 10623 return 0; 10624 } 10625 10626 SPDK_LOG_REGISTER_COMPONENT(bdev) 10627 10628 static void 10629 bdev_trace(void) 10630 { 10631 struct spdk_trace_tpoint_opts opts[] = { 10632 { 10633 "BDEV_IO_START", TRACE_BDEV_IO_START, 10634 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 1, 10635 { 10636 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10637 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 10638 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10639 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 10640 } 10641 }, 10642 { 10643 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 10644 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 0, 10645 { 10646 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 10647 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 10648 } 10649 }, 10650 { 10651 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 10652 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 10653 { 10654 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 10655 } 10656 }, 10657 { 10658 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 10659 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 10660 { 10661 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 10662 } 10663 }, 10664 }; 10665 10666 10667 spdk_trace_register_owner_type(OWNER_TYPE_BDEV, 'b'); 10668 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 10669 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 10670 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 10671 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 10672 spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_START, OBJECT_BDEV_IO, 0); 10673 spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_COMPLETE, OBJECT_BDEV_IO, 0); 10674 spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_START, OBJECT_BDEV_IO, 0); 10675 spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_DONE, OBJECT_BDEV_IO, 0); 10676 } 10677 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 10678
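
/*
 * Illustrative usage sketch (editorial addition; the descriptor, channel, offsets
 * and callback shown are hypothetical): copying 16 blocks within one bdev with the
 * copy API defined above. When the backing module does not support
 * SPDK_BDEV_IO_TYPE_COPY and the request does not need splitting, the copy is
 * emulated with a read followed by a write, as implemented earlier in this file.
 *
 *     static void
 *     copy_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *     {
 *             spdk_bdev_free_io(bdev_io);
 *     }
 *
 *     rc = spdk_bdev_copy_blocks(desc, io_ch, 1024, 0, 16, copy_done, NULL);
 *     if (rc == -ENOMEM) {
 *             ... retry later, e.g. via spdk_bdev_queue_io_wait() ...
 *     }
 */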