/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"
#include "spdk/dma.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"
#include "spdk_internal/assert.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define SPDK_BDEV_AUTO_EXAMINE			true
#define BUF_SMALL_CACHE_SIZE			128
#define BUF_LARGE_CACHE_SIZE			16
#define NOMEM_THRESHOLD_COUNT			8

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC	(UINT64_MAX / (1024 * 1024))
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000

/* The maximum number of children requests for a UNMAP or WRITE ZEROES command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000

/* The maximum number of children requests for a COPY command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8)

#define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \
	log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev)
#ifdef DEBUG
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \
	log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev)
#else
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0)
#endif

static void log_already_claimed(enum spdk_log_level level, const int line, const char *func,
				const char *detail, struct spdk_bdev *bdev);

static const char *qos_rpc_type[] = {"rw_ios_per_sec",
				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
				    };

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

RB_HEAD(bdev_name_tree, spdk_bdev_name);

static int
bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
{
	return strcmp(name1->name, name2->name);
}

RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	void *zero_buffer;

	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

	struct spdk_bdev_list bdevs;
	struct bdev_name_tree bdev_names;

	bool init_complete;
	bool module_init_complete;

	struct spdk_spinlock spinlock;

	TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain *domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
	.init_complete = false,
	.module_init_complete = false,
	.async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens),
};

static void
__attribute__((constructor))
_bdev_init(void)
{
	spdk_spin_init(&g_bdev_mgr.spinlock);
}

typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status);

typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);

struct lba_range {
	struct spdk_bdev		*bdev;
	uint64_t			offset;
	uint64_t			length;
	bool				quiesce;
	void				*locked_ctx;
	struct spdk_thread		*owner_thread;
	struct spdk_bdev_channel	*owner_ch;
	TAILQ_ENTRY(lba_range)		tailq;
	TAILQ_ENTRY(lba_range)		tailq_module;
};

static struct spdk_bdev_opts g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
	.iobuf_small_cache_size = BUF_SMALL_CACHE_SIZE,
	.iobuf_large_cache_size = BUF_LARGE_CACHE_SIZE,
};

static spdk_bdev_init_cb	g_init_cb_fn = NULL;
static void			*g_init_cb_arg = NULL;

static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
static void			*g_fini_cb_arg = NULL;
static struct spdk_thread	*g_fini_thread = NULL;
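/*
 * The QoS limits below are configured in units per second, while the QoS
 * poller accounts for them per timeslice of SPDK_BDEV_QOS_TIMESLICE_IN_USEC
 * (1 ms). As a rough worked example with a hypothetical limit of 10,000 IO/s,
 * about 10 IOs are allowed per 1 ms timeslice, subject to the
 * SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE floor; byte limits behave the same way
 * with SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE.
 */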
struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
	 * For remaining bytes, allowed to run negative if an I/O is submitted when
	 * some bytes are remaining, but the I/O is bigger than that amount. The
	 * excess will be deducted from the next timeslice.
	 */
	int64_t remaining_this_timeslice;

	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO.
	 * If the IO is allowed to pass, the quota will be reduced correspondingly.
	 */
	bool (*queue_io)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to rewind the quota when the IO was allowed to be sent by this
	 * limit but was then queued due to one of the other limits.
	 */
	void (*rewind_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

struct spdk_bdev_qos {
	/** Rate limits, one per limit type. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache.  Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t per_thread_cache_count;
	uint32_t bdev_io_cache_size;

	struct spdk_iobuf_channel iobuf;

	TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * queue their IO awaiting retry here. This makes it possible to retry sending
 * IO to one bdev after IO from another bdev on the same io_device completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel *shared_ch;

	struct spdk_poller *nomem_poller;

	/* Refcount of bdev channels using this resource */
	uint32_t ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev *bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel *channel;

	/* Accel channel */
	struct spdk_io_channel *accel_channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat *stat;

	/*
	 * Count of I/O submitted to the underlying dev module through this channel
	 * and waiting for completion.
285 */ 286 uint64_t io_outstanding; 287 288 /* 289 * List of all submitted I/Os including I/O that are generated via splitting. 290 */ 291 bdev_io_tailq_t io_submitted; 292 293 /* 294 * List of spdk_bdev_io that are currently queued because they write to a locked 295 * LBA range. 296 */ 297 bdev_io_tailq_t io_locked; 298 299 /* List of I/Os with accel sequence being currently executed */ 300 bdev_io_tailq_t io_accel_exec; 301 302 /* List of I/Os doing memory domain pull/push */ 303 bdev_io_tailq_t io_memory_domain; 304 305 uint32_t flags; 306 307 /* Counts number of bdev_io in the io_submitted TAILQ */ 308 uint16_t queue_depth; 309 310 uint16_t trace_id; 311 312 struct spdk_histogram_data *histogram; 313 314 #ifdef SPDK_CONFIG_VTUNE 315 uint64_t start_tsc; 316 uint64_t interval_tsc; 317 __itt_string_handle *handle; 318 struct spdk_bdev_io_stat *prev_stat; 319 #endif 320 321 bdev_io_tailq_t queued_resets; 322 323 lba_range_tailq_t locked_ranges; 324 325 /** List of I/Os queued by QoS. */ 326 bdev_io_tailq_t qos_queued_io; 327 }; 328 329 struct media_event_entry { 330 struct spdk_bdev_media_event event; 331 TAILQ_ENTRY(media_event_entry) tailq; 332 }; 333 334 #define MEDIA_EVENT_POOL_SIZE 64 335 336 struct spdk_bdev_desc { 337 struct spdk_bdev *bdev; 338 struct spdk_thread *thread; 339 struct { 340 spdk_bdev_event_cb_t event_fn; 341 void *ctx; 342 } callback; 343 bool closed; 344 bool write; 345 bool memory_domains_supported; 346 bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES]; 347 struct spdk_spinlock spinlock; 348 uint32_t refs; 349 TAILQ_HEAD(, media_event_entry) pending_media_events; 350 TAILQ_HEAD(, media_event_entry) free_media_events; 351 struct media_event_entry *media_events_buffer; 352 TAILQ_ENTRY(spdk_bdev_desc) link; 353 354 uint64_t timeout_in_sec; 355 spdk_bdev_io_timeout_cb cb_fn; 356 void *cb_arg; 357 struct spdk_poller *io_timeout_poller; 358 struct spdk_bdev_module_claim *claim; 359 }; 360 361 struct spdk_bdev_iostat_ctx { 362 struct spdk_bdev_io_stat *stat; 363 spdk_bdev_get_device_stat_cb cb; 364 void *cb_arg; 365 }; 366 367 struct set_qos_limit_ctx { 368 void (*cb_fn)(void *cb_arg, int status); 369 void *cb_arg; 370 struct spdk_bdev *bdev; 371 }; 372 373 struct spdk_bdev_channel_iter { 374 spdk_bdev_for_each_channel_msg fn; 375 spdk_bdev_for_each_channel_done cpl; 376 struct spdk_io_channel_iter *i; 377 void *ctx; 378 }; 379 380 struct spdk_bdev_io_error_stat { 381 uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS]; 382 }; 383 384 enum bdev_io_retry_state { 385 BDEV_IO_RETRY_STATE_INVALID, 386 BDEV_IO_RETRY_STATE_PULL, 387 BDEV_IO_RETRY_STATE_PULL_MD, 388 BDEV_IO_RETRY_STATE_SUBMIT, 389 BDEV_IO_RETRY_STATE_PUSH, 390 BDEV_IO_RETRY_STATE_PUSH_MD, 391 }; 392 393 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 394 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 395 #define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch)) 396 #define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch)) 397 398 static inline void bdev_io_complete(void *ctx); 399 static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io); 400 static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io); 401 static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io); 402 403 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 404 static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io); 405 406 static void 
bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
		    struct spdk_io_channel *ch, void *_ctx);
static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status);

static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				     struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
				     uint64_t num_blocks,
				     struct spdk_memory_domain *domain, void *domain_ctx,
				     struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
				     spdk_bdev_io_completion_cb cb, void *cb_arg);
static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				      struct iovec *iov, int iovcnt, void *md_buf,
				      uint64_t offset_blocks, uint64_t num_blocks,
				      struct spdk_memory_domain *domain, void *domain_ctx,
				      struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
				      uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw,
				      spdk_bdev_io_completion_cb cb, void *cb_arg);

static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
			       uint64_t offset, uint64_t length,
			       lock_range_cb cb_fn, void *cb_arg);

static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
				 uint64_t offset, uint64_t length,
				 lock_range_cb cb_fn, void *cb_arg);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);

static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
static void claim_reset(struct spdk_bdev *bdev);

static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch);

static bool bdev_io_should_split(struct spdk_bdev_io *bdev_io);

#define bdev_get_ext_io_opt(opts, field, defval) \
	((opts) != NULL ? SPDK_GET_FIELD(opts, field, defval) : (defval))

static inline void
bdev_ch_add_to_io_submitted(struct spdk_bdev_io *bdev_io)
{
	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
	bdev_io->internal.ch->queue_depth++;
}

static inline void
bdev_ch_remove_from_io_submitted(struct spdk_bdev_io *bdev_io)
{
	TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
	bdev_io->internal.ch->queue_depth--;
}

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(iobuf_small_cache_size);
	SET_FIELD(iobuf_large_cache_size);

	/* Do not remove this statement. You should always update this statement when adding a new field,
	 * and do not forget to add the SET_FIELD statement for your added field.
*/ 488 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size"); 489 490 #undef SET_FIELD 491 } 492 493 int 494 spdk_bdev_set_opts(struct spdk_bdev_opts *opts) 495 { 496 uint32_t min_pool_size; 497 498 if (!opts) { 499 SPDK_ERRLOG("opts cannot be NULL\n"); 500 return -1; 501 } 502 503 if (!opts->opts_size) { 504 SPDK_ERRLOG("opts_size inside opts cannot be zero value\n"); 505 return -1; 506 } 507 508 /* 509 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem 510 * initialization. A second mgmt_ch will be created on the same thread when the application starts 511 * but before the deferred put_io_channel event is executed for the first mgmt_ch. 512 */ 513 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 514 if (opts->bdev_io_pool_size < min_pool_size) { 515 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 516 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 517 spdk_thread_get_count()); 518 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 519 return -1; 520 } 521 522 #define SET_FIELD(field) \ 523 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 524 g_bdev_opts.field = opts->field; \ 525 } \ 526 527 SET_FIELD(bdev_io_pool_size); 528 SET_FIELD(bdev_io_cache_size); 529 SET_FIELD(bdev_auto_examine); 530 SET_FIELD(iobuf_small_cache_size); 531 SET_FIELD(iobuf_large_cache_size); 532 533 g_bdev_opts.opts_size = opts->opts_size; 534 535 #undef SET_FIELD 536 537 return 0; 538 } 539 540 static struct spdk_bdev * 541 bdev_get_by_name(const char *bdev_name) 542 { 543 struct spdk_bdev_name find; 544 struct spdk_bdev_name *res; 545 546 find.name = (char *)bdev_name; 547 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 548 if (res != NULL) { 549 return res->bdev; 550 } 551 552 return NULL; 553 } 554 555 struct spdk_bdev * 556 spdk_bdev_get_by_name(const char *bdev_name) 557 { 558 struct spdk_bdev *bdev; 559 560 spdk_spin_lock(&g_bdev_mgr.spinlock); 561 bdev = bdev_get_by_name(bdev_name); 562 spdk_spin_unlock(&g_bdev_mgr.spinlock); 563 564 return bdev; 565 } 566 567 struct bdev_io_status_string { 568 enum spdk_bdev_io_status status; 569 const char *str; 570 }; 571 572 static const struct bdev_io_status_string bdev_io_status_strings[] = { 573 { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" }, 574 { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" }, 575 { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" }, 576 { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" }, 577 { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" }, 578 { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" }, 579 { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" }, 580 { SPDK_BDEV_IO_STATUS_FAILED, "failed" }, 581 { SPDK_BDEV_IO_STATUS_PENDING, "pending" }, 582 { SPDK_BDEV_IO_STATUS_SUCCESS, "success" }, 583 }; 584 585 static const char * 586 bdev_io_status_get_string(enum spdk_bdev_io_status status) 587 { 588 uint32_t i; 589 590 for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) { 591 if (bdev_io_status_strings[i].status == status) { 592 return bdev_io_status_strings[i].str; 593 } 594 } 595 596 return "reserved"; 597 } 598 599 struct spdk_bdev_wait_for_examine_ctx { 600 struct spdk_poller *poller; 601 spdk_bdev_wait_for_examine_cb cb_fn; 602 void *cb_arg; 603 }; 604 605 static bool bdev_module_all_actions_completed(void); 606 607 static int 608 bdev_wait_for_examine_cb(void *arg) 609 { 610 struct 
spdk_bdev_wait_for_examine_ctx *ctx = arg; 611 612 if (!bdev_module_all_actions_completed()) { 613 return SPDK_POLLER_IDLE; 614 } 615 616 spdk_poller_unregister(&ctx->poller); 617 ctx->cb_fn(ctx->cb_arg); 618 free(ctx); 619 620 return SPDK_POLLER_BUSY; 621 } 622 623 int 624 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 625 { 626 struct spdk_bdev_wait_for_examine_ctx *ctx; 627 628 ctx = calloc(1, sizeof(*ctx)); 629 if (ctx == NULL) { 630 return -ENOMEM; 631 } 632 ctx->cb_fn = cb_fn; 633 ctx->cb_arg = cb_arg; 634 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 635 636 return 0; 637 } 638 639 struct spdk_bdev_examine_item { 640 char *name; 641 TAILQ_ENTRY(spdk_bdev_examine_item) link; 642 }; 643 644 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); 645 646 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 647 g_bdev_examine_allowlist); 648 649 static inline bool 650 bdev_examine_allowlist_check(const char *name) 651 { 652 struct spdk_bdev_examine_item *item; 653 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 654 if (strcmp(name, item->name) == 0) { 655 return true; 656 } 657 } 658 return false; 659 } 660 661 static inline void 662 bdev_examine_allowlist_free(void) 663 { 664 struct spdk_bdev_examine_item *item; 665 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 666 item = TAILQ_FIRST(&g_bdev_examine_allowlist); 667 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 668 free(item->name); 669 free(item); 670 } 671 } 672 673 static inline bool 674 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 675 { 676 struct spdk_bdev_alias *tmp; 677 if (bdev_examine_allowlist_check(bdev->name)) { 678 return true; 679 } 680 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 681 if (bdev_examine_allowlist_check(tmp->alias.name)) { 682 return true; 683 } 684 } 685 return false; 686 } 687 688 static inline bool 689 bdev_ok_to_examine(struct spdk_bdev *bdev) 690 { 691 /* Some bdevs may not support the READ command. 692 * Do not try to examine them. 
693 */ 694 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_READ)) { 695 return false; 696 } 697 698 if (g_bdev_opts.bdev_auto_examine) { 699 return true; 700 } else { 701 return bdev_in_examine_allowlist(bdev); 702 } 703 } 704 705 static void 706 bdev_examine(struct spdk_bdev *bdev) 707 { 708 struct spdk_bdev_module *module; 709 struct spdk_bdev_module_claim *claim, *tmpclaim; 710 uint32_t action; 711 712 if (!bdev_ok_to_examine(bdev)) { 713 return; 714 } 715 716 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 717 if (module->examine_config) { 718 spdk_spin_lock(&module->internal.spinlock); 719 action = module->internal.action_in_progress; 720 module->internal.action_in_progress++; 721 spdk_spin_unlock(&module->internal.spinlock); 722 module->examine_config(bdev); 723 if (action != module->internal.action_in_progress) { 724 SPDK_ERRLOG("examine_config for module %s did not call " 725 "spdk_bdev_module_examine_done()\n", module->name); 726 } 727 } 728 } 729 730 spdk_spin_lock(&bdev->internal.spinlock); 731 732 switch (bdev->internal.claim_type) { 733 case SPDK_BDEV_CLAIM_NONE: 734 /* Examine by all bdev modules */ 735 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 736 if (module->examine_disk) { 737 spdk_spin_lock(&module->internal.spinlock); 738 module->internal.action_in_progress++; 739 spdk_spin_unlock(&module->internal.spinlock); 740 spdk_spin_unlock(&bdev->internal.spinlock); 741 module->examine_disk(bdev); 742 spdk_spin_lock(&bdev->internal.spinlock); 743 } 744 } 745 break; 746 case SPDK_BDEV_CLAIM_EXCL_WRITE: 747 /* Examine by the one bdev module with a v1 claim */ 748 module = bdev->internal.claim.v1.module; 749 if (module->examine_disk) { 750 spdk_spin_lock(&module->internal.spinlock); 751 module->internal.action_in_progress++; 752 spdk_spin_unlock(&module->internal.spinlock); 753 spdk_spin_unlock(&bdev->internal.spinlock); 754 module->examine_disk(bdev); 755 return; 756 } 757 break; 758 default: 759 /* Examine by all bdev modules with a v2 claim */ 760 assert(claim_type_is_v2(bdev->internal.claim_type)); 761 /* 762 * Removal of tailq nodes while iterating can cause the iteration to jump out of the 763 * list, perhaps accessing freed memory. Without protection, this could happen 764 * while the lock is dropped during the examine callback. 765 */ 766 bdev->internal.examine_in_progress++; 767 768 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 769 module = claim->module; 770 771 if (module == NULL) { 772 /* This is a vestigial claim, held by examine_count */ 773 continue; 774 } 775 776 if (module->examine_disk == NULL) { 777 continue; 778 } 779 780 spdk_spin_lock(&module->internal.spinlock); 781 module->internal.action_in_progress++; 782 spdk_spin_unlock(&module->internal.spinlock); 783 784 /* Call examine_disk without holding internal.spinlock. 
*/ 785 spdk_spin_unlock(&bdev->internal.spinlock); 786 module->examine_disk(bdev); 787 spdk_spin_lock(&bdev->internal.spinlock); 788 } 789 790 assert(bdev->internal.examine_in_progress > 0); 791 bdev->internal.examine_in_progress--; 792 if (bdev->internal.examine_in_progress == 0) { 793 /* Remove any claims that were released during examine_disk */ 794 TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) { 795 if (claim->desc != NULL) { 796 continue; 797 } 798 799 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link); 800 free(claim); 801 } 802 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 803 claim_reset(bdev); 804 } 805 } 806 } 807 808 spdk_spin_unlock(&bdev->internal.spinlock); 809 } 810 811 int 812 spdk_bdev_examine(const char *name) 813 { 814 struct spdk_bdev *bdev; 815 struct spdk_bdev_examine_item *item; 816 struct spdk_thread *thread = spdk_get_thread(); 817 818 if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) { 819 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread, 820 thread ? spdk_thread_get_name(thread) : "null"); 821 return -EINVAL; 822 } 823 824 if (g_bdev_opts.bdev_auto_examine) { 825 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n"); 826 return -EINVAL; 827 } 828 829 if (bdev_examine_allowlist_check(name)) { 830 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 831 return -EEXIST; 832 } 833 834 item = calloc(1, sizeof(*item)); 835 if (!item) { 836 return -ENOMEM; 837 } 838 item->name = strdup(name); 839 if (!item->name) { 840 free(item); 841 return -ENOMEM; 842 } 843 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 844 845 bdev = spdk_bdev_get_by_name(name); 846 if (bdev) { 847 bdev_examine(bdev); 848 } 849 return 0; 850 } 851 852 static inline void 853 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 854 { 855 struct spdk_bdev_examine_item *item; 856 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 857 spdk_json_write_object_begin(w); 858 spdk_json_write_named_string(w, "method", "bdev_examine"); 859 spdk_json_write_named_object_begin(w, "params"); 860 spdk_json_write_named_string(w, "name", item->name); 861 spdk_json_write_object_end(w); 862 spdk_json_write_object_end(w); 863 } 864 } 865 866 struct spdk_bdev * 867 spdk_bdev_first(void) 868 { 869 struct spdk_bdev *bdev; 870 871 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 872 if (bdev) { 873 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 874 } 875 876 return bdev; 877 } 878 879 struct spdk_bdev * 880 spdk_bdev_next(struct spdk_bdev *prev) 881 { 882 struct spdk_bdev *bdev; 883 884 bdev = TAILQ_NEXT(prev, internal.link); 885 if (bdev) { 886 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 887 } 888 889 return bdev; 890 } 891 892 static struct spdk_bdev * 893 _bdev_next_leaf(struct spdk_bdev *bdev) 894 { 895 while (bdev != NULL) { 896 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 897 return bdev; 898 } else { 899 bdev = TAILQ_NEXT(bdev, internal.link); 900 } 901 } 902 903 return bdev; 904 } 905 906 struct spdk_bdev * 907 spdk_bdev_first_leaf(void) 908 { 909 struct spdk_bdev *bdev; 910 911 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 912 913 if (bdev) { 914 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 915 } 916 917 return bdev; 918 } 919 920 struct spdk_bdev * 921 spdk_bdev_next_leaf(struct spdk_bdev *prev) 922 { 923 struct spdk_bdev *bdev; 924 925 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 926 927 if 
(bdev) { 928 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 929 } 930 931 return bdev; 932 } 933 934 static inline bool 935 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 936 { 937 return bdev_io->internal.f.has_memory_domain; 938 } 939 940 static inline bool 941 bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io) 942 { 943 return bdev_io->internal.f.has_accel_sequence; 944 } 945 946 static inline void 947 bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource, 948 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 949 { 950 /* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io. 951 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth 952 * channels we will instead wait for half to complete. 953 */ 954 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 955 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 956 957 assert(state != BDEV_IO_RETRY_STATE_INVALID); 958 bdev_io->internal.retry_state = state; 959 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 960 } 961 962 static inline void 963 bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource, 964 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 965 { 966 /* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while 967 * the queue isn't empty, so we don't need to update the nomem_threshold here */ 968 assert(!TAILQ_EMPTY(&shared_resource->nomem_io)); 969 970 assert(state != BDEV_IO_RETRY_STATE_INVALID); 971 bdev_io->internal.retry_state = state; 972 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 973 } 974 975 void 976 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 977 { 978 struct iovec *iovs; 979 980 if (bdev_io->u.bdev.iovs == NULL) { 981 bdev_io->u.bdev.iovs = &bdev_io->iov; 982 bdev_io->u.bdev.iovcnt = 1; 983 } 984 985 iovs = bdev_io->u.bdev.iovs; 986 987 assert(iovs != NULL); 988 assert(bdev_io->u.bdev.iovcnt >= 1); 989 990 iovs[0].iov_base = buf; 991 iovs[0].iov_len = len; 992 } 993 994 void 995 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 996 { 997 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 998 bdev_io->u.bdev.md_buf = md_buf; 999 } 1000 1001 static bool 1002 _is_buf_allocated(const struct iovec *iovs) 1003 { 1004 if (iovs == NULL) { 1005 return false; 1006 } 1007 1008 return iovs[0].iov_base != NULL; 1009 } 1010 1011 static bool 1012 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 1013 { 1014 int i; 1015 uintptr_t iov_base; 1016 1017 if (spdk_likely(alignment == 1)) { 1018 return true; 1019 } 1020 1021 for (i = 0; i < iovcnt; i++) { 1022 iov_base = (uintptr_t)iovs[i].iov_base; 1023 if ((iov_base & (alignment - 1)) != 0) { 1024 return false; 1025 } 1026 } 1027 1028 return true; 1029 } 1030 1031 static inline bool 1032 bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 1033 { 1034 if (!bdev_io_use_accel_sequence(bdev_io)) { 1035 return false; 1036 } 1037 1038 /* For now, we don't allow splitting IOs with an accel sequence and will treat them as if 1039 * bdev module didn't support accel sequences */ 1040 return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.f.split; 1041 } 1042 1043 static inline void 1044 bdev_io_increment_outstanding(struct 
spdk_bdev_channel *bdev_ch, 1045 struct spdk_bdev_shared_resource *shared_resource) 1046 { 1047 bdev_ch->io_outstanding++; 1048 shared_resource->io_outstanding++; 1049 } 1050 1051 static inline void 1052 bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1053 struct spdk_bdev_shared_resource *shared_resource) 1054 { 1055 assert(bdev_ch->io_outstanding > 0); 1056 assert(shared_resource->io_outstanding > 0); 1057 bdev_ch->io_outstanding--; 1058 shared_resource->io_outstanding--; 1059 } 1060 1061 static void 1062 bdev_io_submit_sequence_cb(void *ctx, int status) 1063 { 1064 struct spdk_bdev_io *bdev_io = ctx; 1065 1066 assert(bdev_io_use_accel_sequence(bdev_io)); 1067 1068 bdev_io->u.bdev.accel_sequence = NULL; 1069 bdev_io->internal.f.has_accel_sequence = false; 1070 1071 if (spdk_unlikely(status != 0)) { 1072 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 1073 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1074 bdev_io_complete_unsubmitted(bdev_io); 1075 return; 1076 } 1077 1078 bdev_io_submit(bdev_io); 1079 } 1080 1081 static void 1082 bdev_io_exec_sequence_cb(void *ctx, int status) 1083 { 1084 struct spdk_bdev_io *bdev_io = ctx; 1085 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1086 1087 TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1088 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1089 1090 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1091 bdev_ch_retry_io(ch); 1092 } 1093 1094 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1095 } 1096 1097 static void 1098 bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status)) 1099 { 1100 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1101 1102 assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1103 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1104 assert(bdev_io_use_accel_sequence(bdev_io)); 1105 1106 /* Since the operations are appended during submission, they're in the opposite order than 1107 * how we want to execute them for reads (i.e. we need to execute the most recently added 1108 * operation first), so reverse the sequence before executing it. 
1109 */ 1110 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1111 spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence); 1112 } 1113 1114 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1115 bdev_io_increment_outstanding(ch, ch->shared_resource); 1116 bdev_io->internal.data_transfer_cpl = cb_fn; 1117 1118 spdk_accel_sequence_finish(bdev_io->internal.accel_sequence, 1119 bdev_io_exec_sequence_cb, bdev_io); 1120 } 1121 1122 static void 1123 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 1124 { 1125 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 1126 void *buf; 1127 1128 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1129 buf = bdev_io->internal.buf.ptr; 1130 bdev_io->internal.buf.ptr = NULL; 1131 bdev_io->internal.f.has_buf = false; 1132 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 1133 bdev_io->internal.get_aux_buf_cb = NULL; 1134 } else { 1135 assert(bdev_io->internal.get_buf_cb != NULL); 1136 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 1137 bdev_io->internal.get_buf_cb = NULL; 1138 } 1139 } 1140 1141 static void 1142 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 1143 { 1144 struct spdk_bdev_io *bdev_io = ctx; 1145 1146 if (rc) { 1147 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 1148 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1149 } 1150 bdev_io_get_buf_complete(bdev_io, !rc); 1151 } 1152 1153 static void 1154 bdev_io_pull_md_buf_done(void *ctx, int status) 1155 { 1156 struct spdk_bdev_io *bdev_io = ctx; 1157 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1158 1159 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1160 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1161 1162 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1163 bdev_ch_retry_io(ch); 1164 } 1165 1166 assert(bdev_io->internal.data_transfer_cpl); 1167 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1168 } 1169 1170 static void 1171 bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io) 1172 { 1173 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1174 int rc = 0; 1175 1176 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1177 assert(bdev_io->internal.f.has_bounce_buf); 1178 if (bdev_io_use_memory_domain(bdev_io)) { 1179 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1180 bdev_io_increment_outstanding(ch, ch->shared_resource); 1181 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1182 bdev_io->internal.memory_domain_ctx, 1183 &bdev_io->internal.bounce_buf.orig_md_iov, 1, 1184 &bdev_io->internal.bounce_buf.md_iov, 1, 1185 bdev_io_pull_md_buf_done, bdev_io); 1186 if (rc == 0) { 1187 /* Continue to submit IO in completion callback */ 1188 return; 1189 } 1190 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1191 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1192 if (rc != -ENOMEM) { 1193 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 1194 spdk_memory_domain_get_dma_device_id( 1195 bdev_io->internal.memory_domain), rc); 1196 } 1197 } else { 1198 memcpy(bdev_io->internal.bounce_buf.md_iov.iov_base, 1199 bdev_io->internal.bounce_buf.orig_md_iov.iov_base, 1200 bdev_io->internal.bounce_buf.orig_md_iov.iov_len); 1201 } 1202 } 1203 1204 if (spdk_unlikely(rc == -ENOMEM)) { 1205 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD); 1206 } else { 1207 assert(bdev_io->internal.data_transfer_cpl); 1208 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1209 } 
1210 } 1211 1212 static void 1213 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 1214 { 1215 assert(bdev_io->internal.f.has_bounce_buf); 1216 1217 /* save original md_buf */ 1218 bdev_io->internal.bounce_buf.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 1219 bdev_io->internal.bounce_buf.orig_md_iov.iov_len = len; 1220 bdev_io->internal.bounce_buf.md_iov.iov_base = md_buf; 1221 bdev_io->internal.bounce_buf.md_iov.iov_len = len; 1222 /* set bounce md_buf */ 1223 bdev_io->u.bdev.md_buf = md_buf; 1224 1225 bdev_io_pull_md_buf(bdev_io); 1226 } 1227 1228 static void 1229 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 1230 { 1231 struct spdk_bdev *bdev = bdev_io->bdev; 1232 uint64_t md_len; 1233 void *buf; 1234 1235 if (spdk_bdev_is_md_separate(bdev)) { 1236 assert(!bdev_io_use_accel_sequence(bdev_io)); 1237 1238 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 1239 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 1240 1241 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 1242 1243 if (bdev_io->u.bdev.md_buf != NULL) { 1244 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 1245 return; 1246 } else { 1247 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 1248 } 1249 } 1250 1251 bdev_io_get_buf_complete(bdev_io, true); 1252 } 1253 1254 static inline void 1255 bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc) 1256 { 1257 if (rc) { 1258 SPDK_ERRLOG("Failed to get data buffer\n"); 1259 assert(bdev_io->internal.data_transfer_cpl); 1260 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1261 return; 1262 } 1263 1264 _bdev_io_set_md_buf(bdev_io); 1265 } 1266 1267 static void 1268 bdev_io_pull_data_done_and_track(void *ctx, int status) 1269 { 1270 struct spdk_bdev_io *bdev_io = ctx; 1271 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1272 1273 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1274 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1275 1276 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1277 bdev_ch_retry_io(ch); 1278 } 1279 1280 bdev_io_pull_data_done(bdev_io, status); 1281 } 1282 1283 static void 1284 bdev_io_pull_data(struct spdk_bdev_io *bdev_io) 1285 { 1286 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1287 int rc = 0; 1288 1289 /* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a 1290 * sequence, append a copy operation making accel change the src/dst buffers of the previous 1291 * operation */ 1292 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io) || 1293 (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) { 1294 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1295 assert(bdev_io_use_accel_sequence(bdev_io)); 1296 assert(bdev_io->internal.f.has_bounce_buf); 1297 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1298 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1299 NULL, NULL, 1300 bdev_io->internal.bounce_buf.orig_iovs, 1301 bdev_io->internal.bounce_buf.orig_iovcnt, 1302 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 1303 bdev_io_use_memory_domain(bdev_io) ? 
bdev_io->internal.memory_domain_ctx : NULL, 1304 NULL, NULL); 1305 } else { 1306 /* We need to reverse the src/dst for reads */ 1307 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1308 assert(bdev_io_use_accel_sequence(bdev_io)); 1309 assert(bdev_io->internal.f.has_bounce_buf); 1310 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1311 bdev_io->internal.bounce_buf.orig_iovs, 1312 bdev_io->internal.bounce_buf.orig_iovcnt, 1313 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 1314 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL, 1315 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1316 NULL, NULL, NULL, NULL); 1317 } 1318 1319 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 1320 SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n", 1321 bdev_io->internal.accel_sequence); 1322 } 1323 } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1324 /* if this is write path, copy data from original buffer to bounce buffer */ 1325 if (bdev_io_use_memory_domain(bdev_io)) { 1326 assert(bdev_io->internal.f.has_bounce_buf); 1327 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1328 bdev_io_increment_outstanding(ch, ch->shared_resource); 1329 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1330 bdev_io->internal.memory_domain_ctx, 1331 bdev_io->internal.bounce_buf.orig_iovs, 1332 (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt, 1333 bdev_io->u.bdev.iovs, 1, 1334 bdev_io_pull_data_done_and_track, 1335 bdev_io); 1336 if (rc == 0) { 1337 /* Continue to submit IO in completion callback */ 1338 return; 1339 } 1340 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1341 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1342 if (rc != -ENOMEM) { 1343 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 1344 spdk_memory_domain_get_dma_device_id( 1345 bdev_io->internal.memory_domain)); 1346 } 1347 } else { 1348 assert(bdev_io->u.bdev.iovcnt == 1); 1349 assert(bdev_io->internal.f.has_bounce_buf); 1350 spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base, 1351 bdev_io->u.bdev.iovs[0].iov_len, 1352 bdev_io->internal.bounce_buf.orig_iovs, 1353 bdev_io->internal.bounce_buf.orig_iovcnt); 1354 } 1355 } 1356 1357 if (spdk_unlikely(rc == -ENOMEM)) { 1358 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1359 } else { 1360 bdev_io_pull_data_done(bdev_io, rc); 1361 } 1362 } 1363 1364 static void 1365 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 1366 bdev_copy_bounce_buffer_cpl cpl_cb) 1367 { 1368 struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource; 1369 1370 assert(bdev_io->internal.f.has_bounce_buf == false); 1371 1372 bdev_io->internal.data_transfer_cpl = cpl_cb; 1373 bdev_io->internal.f.has_bounce_buf = true; 1374 /* save original iovec */ 1375 bdev_io->internal.bounce_buf.orig_iovs = bdev_io->u.bdev.iovs; 1376 bdev_io->internal.bounce_buf.orig_iovcnt = bdev_io->u.bdev.iovcnt; 1377 /* zero the other data members */ 1378 bdev_io->internal.bounce_buf.iov.iov_base = NULL; 1379 bdev_io->internal.bounce_buf.md_iov.iov_base = NULL; 1380 bdev_io->internal.bounce_buf.orig_md_iov.iov_base = NULL; 1381 /* set bounce iov */ 1382 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_buf.iov; 1383 bdev_io->u.bdev.iovcnt = 1; 1384 /* set bounce buffer for this operation */ 1385 bdev_io->u.bdev.iovs[0].iov_base = buf; 1386 bdev_io->u.bdev.iovs[0].iov_len = len; 1387 /* Now we 
use 1 iov, the split condition could have been changed */ 1388 bdev_io->internal.f.split = bdev_io_should_split(bdev_io); 1389 1390 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1391 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1392 } else { 1393 bdev_io_pull_data(bdev_io); 1394 } 1395 } 1396 1397 static void 1398 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) 1399 { 1400 struct spdk_bdev *bdev = bdev_io->bdev; 1401 bool buf_allocated; 1402 uint64_t alignment; 1403 void *aligned_buf; 1404 1405 bdev_io->internal.buf.ptr = buf; 1406 bdev_io->internal.f.has_buf = true; 1407 1408 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1409 bdev_io_get_buf_complete(bdev_io, true); 1410 return; 1411 } 1412 1413 alignment = spdk_bdev_get_buf_align(bdev); 1414 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 1415 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 1416 1417 if (buf_allocated) { 1418 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 1419 /* Continue in completion callback */ 1420 return; 1421 } else { 1422 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 1423 } 1424 1425 _bdev_io_set_md_buf(bdev_io); 1426 } 1427 1428 static inline uint64_t 1429 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len) 1430 { 1431 struct spdk_bdev *bdev = bdev_io->bdev; 1432 uint64_t md_len, alignment; 1433 1434 md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1435 1436 /* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */ 1437 alignment = spdk_bdev_get_buf_align(bdev) - 1; 1438 1439 return len + alignment + md_len; 1440 } 1441 1442 static void 1443 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 1444 { 1445 struct spdk_bdev_mgmt_channel *ch; 1446 1447 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1448 spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len)); 1449 } 1450 1451 static void 1452 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 1453 { 1454 assert(bdev_io->internal.f.has_buf); 1455 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf.ptr, bdev_io->internal.buf.len); 1456 bdev_io->internal.buf.ptr = NULL; 1457 bdev_io->internal.f.has_buf = false; 1458 } 1459 1460 SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_put_aux_buf, 1461 "spdk_bdev_io_put_aux_buf is deprecated", "v25.01", 0); 1462 1463 void 1464 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1465 { 1466 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1467 1468 SPDK_LOG_DEPRECATED(spdk_bdev_io_put_aux_buf); 1469 1470 assert(buf != NULL); 1471 _bdev_io_put_buf(bdev_io, buf, len); 1472 } 1473 1474 static inline void 1475 bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch, 1476 struct spdk_bdev_io *bdev_io) 1477 { 1478 /* After a request is submitted to a bdev module, the ownership of an accel sequence 1479 * associated with that bdev_io is transferred to the bdev module. So, clear the internal 1480 * sequence pointer to make sure we won't touch it anymore. 
*/ 1481 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || 1482 bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) { 1483 assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1484 bdev_io->internal.f.has_accel_sequence = false; 1485 } 1486 1487 bdev->fn_table->submit_request(ioch, bdev_io); 1488 } 1489 1490 static inline void 1491 bdev_ch_resubmit_io(struct spdk_bdev_shared_resource *shared_resource, struct spdk_bdev_io *bdev_io) 1492 { 1493 struct spdk_bdev *bdev = bdev_io->bdev; 1494 1495 bdev_io_increment_outstanding(bdev_io->internal.ch, shared_resource); 1496 bdev_io->internal.error.nvme.cdw0 = 0; 1497 bdev_io->num_retries++; 1498 bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 1499 } 1500 1501 static void 1502 bdev_shared_ch_retry_io(struct spdk_bdev_shared_resource *shared_resource) 1503 { 1504 struct spdk_bdev_io *bdev_io; 1505 1506 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 1507 /* 1508 * Allow some more I/O to complete before retrying the nomem_io queue. 1509 * Some drivers (such as nvme) cannot immediately take a new I/O in 1510 * the context of a completion, because the resources for the I/O are 1511 * not released until control returns to the bdev poller. Also, we 1512 * may require several small I/O to complete before a larger I/O 1513 * (that requires splitting) can be submitted. 1514 */ 1515 return; 1516 } 1517 1518 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1519 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 1520 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 1521 1522 switch (bdev_io->internal.retry_state) { 1523 case BDEV_IO_RETRY_STATE_SUBMIT: 1524 bdev_ch_resubmit_io(shared_resource, bdev_io); 1525 break; 1526 case BDEV_IO_RETRY_STATE_PULL: 1527 bdev_io_pull_data(bdev_io); 1528 break; 1529 case BDEV_IO_RETRY_STATE_PULL_MD: 1530 bdev_io_pull_md_buf(bdev_io); 1531 break; 1532 case BDEV_IO_RETRY_STATE_PUSH: 1533 bdev_io_push_bounce_data(bdev_io); 1534 break; 1535 case BDEV_IO_RETRY_STATE_PUSH_MD: 1536 bdev_io_push_bounce_md_buf(bdev_io); 1537 break; 1538 default: 1539 assert(0 && "invalid retry state"); 1540 break; 1541 } 1542 1543 if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) { 1544 /* This IO completed again with NOMEM status, so break the loop and 1545 * don't try anymore. Note that a bdev_io that fails with NOMEM 1546 * always gets requeued at the front of the list, to maintain 1547 * ordering. 
			 */
			break;
		}
	}
}

static void
bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	bdev_shared_ch_retry_io(bdev_ch->shared_resource);
}

static int
bdev_no_mem_poller(void *ctx)
{
	struct spdk_bdev_shared_resource *shared_resource = ctx;

	spdk_poller_unregister(&shared_resource->nomem_poller);

	if (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_shared_ch_retry_io(shared_resource);
	}
	/* the retry cb may re-register the poller so double check */
	if (!TAILQ_EMPTY(&shared_resource->nomem_io) &&
	    shared_resource->io_outstanding == 0 && shared_resource->nomem_poller == NULL) {
		/* No IOs were submitted, try again */
		shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
						SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
	}

	return SPDK_POLLER_BUSY;
}

static inline bool
_bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
		bdev_queue_nomem_io_head(shared_resource, bdev_io, state);

		if (shared_resource->io_outstanding == 0 && !shared_resource->nomem_poller) {
			/* Special case: there are IOs on the nomem queue but no outstanding IOs
			 * whose completions could trigger a retry of the queued IOs. Normally any
			 * submitted IO may trigger such a retry on completion; this poller handles
			 * the case where no new IOs are being submitted, e.g. qd == 1.
			 */
			shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
							SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
		}
		/* If bdev module completed an I/O that has an accel sequence with NOMEM status, the
		 * ownership of that sequence is transferred back to the bdev layer, so we need to
		 * restore internal.accel_sequence to make sure that the sequence is handled
		 * correctly in case the I/O is later aborted. */
		if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
		     bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) {
			assert(!bdev_io_use_accel_sequence(bdev_io));
			bdev_io->internal.f.has_accel_sequence = true;
			bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence;
		}

		return true;
	}

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_ch_retry_io(bdev_ch);
	}

	return false;
}

static void
_bdev_io_complete_push_bounce_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	if (rc) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	/* We want to free the bounce buffer here since we know we're done with it (as opposed
	 * to waiting for the conditional free of internal.buf.ptr in spdk_bdev_free_io()).
1631 */ 1632 bdev_io_put_buf(bdev_io); 1633 1634 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1635 bdev_ch_retry_io(ch); 1636 } 1637 1638 /* Continue with IO completion flow */ 1639 bdev_io_complete(bdev_io); 1640 } 1641 1642 static void 1643 bdev_io_push_bounce_md_buf_done(void *ctx, int rc) 1644 { 1645 struct spdk_bdev_io *bdev_io = ctx; 1646 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1647 1648 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1649 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1650 bdev_io->internal.f.has_bounce_buf = false; 1651 1652 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1653 bdev_ch_retry_io(ch); 1654 } 1655 1656 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1657 } 1658 1659 static inline void 1660 bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io) 1661 { 1662 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1663 int rc = 0; 1664 1665 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1666 assert(bdev_io->internal.f.has_bounce_buf); 1667 1668 /* do the same for metadata buffer */ 1669 if (spdk_unlikely(bdev_io->internal.bounce_buf.orig_md_iov.iov_base != NULL)) { 1670 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1671 1672 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1673 if (bdev_io_use_memory_domain(bdev_io)) { 1674 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1675 bdev_io_increment_outstanding(ch, ch->shared_resource); 1676 /* If memory domain is used then we need to call async push function */ 1677 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1678 bdev_io->internal.memory_domain_ctx, 1679 &bdev_io->internal.bounce_buf.orig_md_iov, 1680 (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt, 1681 &bdev_io->internal.bounce_buf.md_iov, 1, 1682 bdev_io_push_bounce_md_buf_done, 1683 bdev_io); 1684 if (rc == 0) { 1685 /* Continue IO completion in async callback */ 1686 return; 1687 } 1688 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1689 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1690 if (rc != -ENOMEM) { 1691 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1692 spdk_memory_domain_get_dma_device_id( 1693 bdev_io->internal.memory_domain)); 1694 } 1695 } else { 1696 memcpy(bdev_io->internal.bounce_buf.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1697 bdev_io->internal.bounce_buf.orig_md_iov.iov_len); 1698 } 1699 } 1700 } 1701 1702 if (spdk_unlikely(rc == -ENOMEM)) { 1703 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD); 1704 } else { 1705 assert(bdev_io->internal.data_transfer_cpl); 1706 bdev_io->internal.f.has_bounce_buf = false; 1707 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1708 } 1709 } 1710 1711 static inline void 1712 bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc) 1713 { 1714 assert(bdev_io->internal.data_transfer_cpl); 1715 if (rc) { 1716 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1717 return; 1718 } 1719 1720 /* set original buffer for this io */ 1721 bdev_io->u.bdev.iovcnt = bdev_io->internal.bounce_buf.orig_iovcnt; 1722 bdev_io->u.bdev.iovs = bdev_io->internal.bounce_buf.orig_iovs; 1723 1724 /* We don't set bdev_io->internal.f.has_bounce_buf to false here because 1725 * we still need to clear the md buf */ 1726 1727 bdev_io_push_bounce_md_buf(bdev_io); 1728 } 1729 1730 static void 1731 bdev_io_push_bounce_data_done_and_track(void *ctx, int status) 1732 { 1733 struct spdk_bdev_io *bdev_io = ctx; 1734 struct 
spdk_bdev_channel *ch = bdev_io->internal.ch; 1735 1736 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1737 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1738 1739 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1740 bdev_ch_retry_io(ch); 1741 } 1742 1743 bdev_io_push_bounce_data_done(bdev_io, status); 1744 } 1745 1746 static inline void 1747 bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io) 1748 { 1749 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1750 int rc = 0; 1751 1752 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1753 assert(!bdev_io_use_accel_sequence(bdev_io)); 1754 assert(bdev_io->internal.f.has_bounce_buf); 1755 1756 /* if this is read path, copy data from bounce buffer to original buffer */ 1757 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1758 if (bdev_io_use_memory_domain(bdev_io)) { 1759 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1760 bdev_io_increment_outstanding(ch, ch->shared_resource); 1761 /* If memory domain is used then we need to call async push function */ 1762 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1763 bdev_io->internal.memory_domain_ctx, 1764 bdev_io->internal.bounce_buf.orig_iovs, 1765 (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt, 1766 &bdev_io->internal.bounce_buf.iov, 1, 1767 bdev_io_push_bounce_data_done_and_track, 1768 bdev_io); 1769 if (rc == 0) { 1770 /* Continue IO completion in async callback */ 1771 return; 1772 } 1773 1774 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1775 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1776 if (rc != -ENOMEM) { 1777 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1778 spdk_memory_domain_get_dma_device_id( 1779 bdev_io->internal.memory_domain)); 1780 } 1781 } else { 1782 spdk_copy_buf_to_iovs(bdev_io->internal.bounce_buf.orig_iovs, 1783 bdev_io->internal.bounce_buf.orig_iovcnt, 1784 bdev_io->internal.bounce_buf.iov.iov_base, 1785 bdev_io->internal.bounce_buf.iov.iov_len); 1786 } 1787 } 1788 1789 if (spdk_unlikely(rc == -ENOMEM)) { 1790 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH); 1791 } else { 1792 bdev_io_push_bounce_data_done(bdev_io, rc); 1793 } 1794 } 1795 1796 static inline void 1797 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1798 { 1799 bdev_io->internal.data_transfer_cpl = cpl_cb; 1800 bdev_io_push_bounce_data(bdev_io); 1801 } 1802 1803 static void 1804 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1805 { 1806 struct spdk_bdev_io *bdev_io; 1807 1808 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1809 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf.len); 1810 } 1811 1812 static void 1813 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1814 { 1815 struct spdk_bdev_mgmt_channel *mgmt_ch; 1816 uint64_t max_len; 1817 void *buf; 1818 1819 assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1820 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1821 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1822 1823 if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) { 1824 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1825 bdev_io_get_buf_complete(bdev_io, false); 1826 return; 1827 } 1828 1829 bdev_io->internal.buf.len = len; 1830 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1831 bdev_io_get_iobuf_cb); 1832 if (buf != NULL) { 1833 
_bdev_io_set_buf(bdev_io, buf, len); 1834 } 1835 } 1836 1837 void 1838 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1839 { 1840 struct spdk_bdev *bdev = bdev_io->bdev; 1841 uint64_t alignment; 1842 1843 assert(cb != NULL); 1844 bdev_io->internal.get_buf_cb = cb; 1845 1846 alignment = spdk_bdev_get_buf_align(bdev); 1847 1848 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1849 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1850 /* Buffer already present and aligned */ 1851 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1852 return; 1853 } 1854 1855 bdev_io_get_buf(bdev_io, len); 1856 } 1857 1858 static void 1859 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1860 bool success) 1861 { 1862 if (!success) { 1863 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1864 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1865 bdev_io_complete_unsubmitted(bdev_io); 1866 return; 1867 } 1868 1869 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 1870 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1871 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 1872 return; 1873 } 1874 /* For reads we'll execute the sequence after the data is read, so, for now, only 1875 * clear out accel_sequence pointer and submit the IO */ 1876 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1877 bdev_io->u.bdev.accel_sequence = NULL; 1878 } 1879 1880 bdev_io_submit(bdev_io); 1881 } 1882 1883 static void 1884 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1885 uint64_t len) 1886 { 1887 assert(cb != NULL); 1888 bdev_io->internal.get_buf_cb = cb; 1889 1890 bdev_io_get_buf(bdev_io, len); 1891 } 1892 1893 1894 SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_get_aux_buf, 1895 "spdk_bdev_io_get_aux_buf is deprecated", "v25.01", 0); 1896 1897 void 1898 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1899 { 1900 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1901 1902 SPDK_LOG_DEPRECATED(spdk_bdev_io_get_aux_buf); 1903 1904 assert(cb != NULL); 1905 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1906 bdev_io->internal.get_aux_buf_cb = cb; 1907 bdev_io_get_buf(bdev_io, len); 1908 } 1909 1910 static int 1911 bdev_module_get_max_ctx_size(void) 1912 { 1913 struct spdk_bdev_module *bdev_module; 1914 int max_bdev_module_size = 0; 1915 1916 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1917 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1918 max_bdev_module_size = bdev_module->get_ctx_size(); 1919 } 1920 } 1921 1922 return max_bdev_module_size; 1923 } 1924 1925 static void 1926 bdev_enable_histogram_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1927 { 1928 if (!bdev->internal.histogram_enabled) { 1929 return; 1930 } 1931 1932 spdk_json_write_object_begin(w); 1933 spdk_json_write_named_string(w, "method", "bdev_enable_histogram"); 1934 1935 spdk_json_write_named_object_begin(w, "params"); 1936 spdk_json_write_named_string(w, "name", bdev->name); 1937 1938 spdk_json_write_named_bool(w, "enable", bdev->internal.histogram_enabled); 1939 1940 if (bdev->internal.histogram_io_type) { 1941 spdk_json_write_named_string(w, "opc", 1942 spdk_bdev_get_io_type_name(bdev->internal.histogram_io_type)); 1943 } 1944 1945 spdk_json_write_object_end(w); 1946 1947 spdk_json_write_object_end(w); 1948 } 1949 
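/* Emit a bdev_set_qos_limit RPC entry for this bdev if any QoS rate limits are configured.
 * Example of the generated object (hypothetical bdev name and limit values):
 *   { "method": "bdev_set_qos_limit",
 *     "params": { "name": "Nvme0n1", "rw_ios_per_sec": 20000, "rw_mbytes_per_sec": 100 } }
 */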
1950 static void 1951 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1952 { 1953 int i; 1954 struct spdk_bdev_qos *qos = bdev->internal.qos; 1955 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1956 1957 if (!qos) { 1958 return; 1959 } 1960 1961 spdk_bdev_get_qos_rate_limits(bdev, limits); 1962 1963 spdk_json_write_object_begin(w); 1964 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1965 1966 spdk_json_write_named_object_begin(w, "params"); 1967 spdk_json_write_named_string(w, "name", bdev->name); 1968 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1969 if (limits[i] > 0) { 1970 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1971 } 1972 } 1973 spdk_json_write_object_end(w); 1974 1975 spdk_json_write_object_end(w); 1976 } 1977 1978 void 1979 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1980 { 1981 struct spdk_bdev_module *bdev_module; 1982 struct spdk_bdev *bdev; 1983 1984 assert(w != NULL); 1985 1986 spdk_json_write_array_begin(w); 1987 1988 spdk_json_write_object_begin(w); 1989 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1990 spdk_json_write_named_object_begin(w, "params"); 1991 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1992 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1993 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1994 spdk_json_write_named_uint32(w, "iobuf_small_cache_size", g_bdev_opts.iobuf_small_cache_size); 1995 spdk_json_write_named_uint32(w, "iobuf_large_cache_size", g_bdev_opts.iobuf_large_cache_size); 1996 spdk_json_write_object_end(w); 1997 spdk_json_write_object_end(w); 1998 1999 bdev_examine_allowlist_config_json(w); 2000 2001 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2002 if (bdev_module->config_json) { 2003 bdev_module->config_json(w); 2004 } 2005 } 2006 2007 spdk_spin_lock(&g_bdev_mgr.spinlock); 2008 2009 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 2010 if (bdev->fn_table->write_config_json) { 2011 bdev->fn_table->write_config_json(bdev, w); 2012 } 2013 2014 bdev_qos_config_json(bdev, w); 2015 bdev_enable_histogram_config_json(bdev, w); 2016 } 2017 2018 spdk_spin_unlock(&g_bdev_mgr.spinlock); 2019 2020 /* This has to be last RPC in array to make sure all bdevs finished examine */ 2021 spdk_json_write_object_begin(w); 2022 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 2023 spdk_json_write_object_end(w); 2024 2025 spdk_json_write_array_end(w); 2026 } 2027 2028 static void 2029 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 2030 { 2031 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 2032 struct spdk_bdev_io *bdev_io; 2033 2034 spdk_iobuf_channel_fini(&ch->iobuf); 2035 2036 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 2037 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2038 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2039 ch->per_thread_cache_count--; 2040 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2041 } 2042 2043 assert(ch->per_thread_cache_count == 0); 2044 } 2045 2046 static int 2047 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 2048 { 2049 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 2050 struct spdk_bdev_io *bdev_io; 2051 uint32_t i; 2052 int rc; 2053 2054 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", 2055 g_bdev_opts.iobuf_small_cache_size, 2056 g_bdev_opts.iobuf_large_cache_size); 2057 if (rc != 0) { 2058 
SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 2059 return -1; 2060 } 2061 2062 STAILQ_INIT(&ch->per_thread_cache); 2063 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 2064 2065 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 2066 ch->per_thread_cache_count = 0; 2067 for (i = 0; i < ch->bdev_io_cache_size; i++) { 2068 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2069 if (bdev_io == NULL) { 2070 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 2071 assert(false); 2072 bdev_mgmt_channel_destroy(io_device, ctx_buf); 2073 return -1; 2074 } 2075 ch->per_thread_cache_count++; 2076 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2077 } 2078 2079 TAILQ_INIT(&ch->shared_resources); 2080 TAILQ_INIT(&ch->io_wait_queue); 2081 2082 return 0; 2083 } 2084 2085 static void 2086 bdev_init_complete(int rc) 2087 { 2088 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 2089 void *cb_arg = g_init_cb_arg; 2090 struct spdk_bdev_module *m; 2091 2092 g_bdev_mgr.init_complete = true; 2093 g_init_cb_fn = NULL; 2094 g_init_cb_arg = NULL; 2095 2096 /* 2097 * For modules that need to know when subsystem init is complete, 2098 * inform them now. 2099 */ 2100 if (rc == 0) { 2101 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2102 if (m->init_complete) { 2103 m->init_complete(); 2104 } 2105 } 2106 } 2107 2108 cb_fn(cb_arg, rc); 2109 } 2110 2111 static bool 2112 bdev_module_all_actions_completed(void) 2113 { 2114 struct spdk_bdev_module *m; 2115 2116 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2117 if (m->internal.action_in_progress > 0) { 2118 return false; 2119 } 2120 } 2121 return true; 2122 } 2123 2124 static void 2125 bdev_module_action_complete(void) 2126 { 2127 /* 2128 * Don't finish bdev subsystem initialization if 2129 * module pre-initialization is still in progress, or 2130 * the subsystem been already initialized. 2131 */ 2132 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 2133 return; 2134 } 2135 2136 /* 2137 * Check all bdev modules for inits/examinations in progress. If any 2138 * exist, return immediately since we cannot finish bdev subsystem 2139 * initialization until all are completed. 2140 */ 2141 if (!bdev_module_all_actions_completed()) { 2142 return; 2143 } 2144 2145 /* 2146 * Modules already finished initialization - now that all 2147 * the bdev modules have finished their asynchronous I/O 2148 * processing, the entire bdev layer can be marked as complete. 
2149 */ 2150 bdev_init_complete(0); 2151 } 2152 2153 static void 2154 bdev_module_action_done(struct spdk_bdev_module *module) 2155 { 2156 spdk_spin_lock(&module->internal.spinlock); 2157 assert(module->internal.action_in_progress > 0); 2158 module->internal.action_in_progress--; 2159 spdk_spin_unlock(&module->internal.spinlock); 2160 bdev_module_action_complete(); 2161 } 2162 2163 void 2164 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 2165 { 2166 assert(module->async_init); 2167 bdev_module_action_done(module); 2168 } 2169 2170 void 2171 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 2172 { 2173 bdev_module_action_done(module); 2174 } 2175 2176 /** The last initialized bdev module */ 2177 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 2178 2179 static void 2180 bdev_init_failed(void *cb_arg) 2181 { 2182 struct spdk_bdev_module *module = cb_arg; 2183 2184 spdk_spin_lock(&module->internal.spinlock); 2185 assert(module->internal.action_in_progress > 0); 2186 module->internal.action_in_progress--; 2187 spdk_spin_unlock(&module->internal.spinlock); 2188 bdev_init_complete(-1); 2189 } 2190 2191 static int 2192 bdev_modules_init(void) 2193 { 2194 struct spdk_bdev_module *module; 2195 int rc = 0; 2196 2197 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2198 g_resume_bdev_module = module; 2199 if (module->async_init) { 2200 spdk_spin_lock(&module->internal.spinlock); 2201 module->internal.action_in_progress = 1; 2202 spdk_spin_unlock(&module->internal.spinlock); 2203 } 2204 rc = module->module_init(); 2205 if (rc != 0) { 2206 /* Bump action_in_progress to prevent other modules from completion of modules_init 2207 * Send message to defer application shutdown until resources are cleaned up */ 2208 spdk_spin_lock(&module->internal.spinlock); 2209 module->internal.action_in_progress = 1; 2210 spdk_spin_unlock(&module->internal.spinlock); 2211 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 2212 return rc; 2213 } 2214 } 2215 2216 g_resume_bdev_module = NULL; 2217 return 0; 2218 } 2219 2220 void 2221 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 2222 { 2223 int rc = 0; 2224 char mempool_name[32]; 2225 2226 assert(cb_fn != NULL); 2227 2228 g_init_cb_fn = cb_fn; 2229 g_init_cb_arg = cb_arg; 2230 2231 spdk_notify_type_register("bdev_register"); 2232 spdk_notify_type_register("bdev_unregister"); 2233 2234 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2235 2236 rc = spdk_iobuf_register_module("bdev"); 2237 if (rc != 0) { 2238 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2239 bdev_init_complete(-1); 2240 return; 2241 } 2242 2243 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2244 g_bdev_opts.bdev_io_pool_size, 2245 sizeof(struct spdk_bdev_io) + 2246 bdev_module_get_max_ctx_size(), 2247 0, 2248 SPDK_ENV_NUMA_ID_ANY); 2249 2250 if (g_bdev_mgr.bdev_io_pool == NULL) { 2251 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2252 bdev_init_complete(-1); 2253 return; 2254 } 2255 2256 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2257 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2258 if (!g_bdev_mgr.zero_buffer) { 2259 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2260 bdev_init_complete(-1); 2261 return; 2262 } 2263 2264 #ifdef SPDK_CONFIG_VTUNE 2265 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2266 #endif 2267 2268 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 2269 
bdev_mgmt_channel_destroy, 2270 sizeof(struct spdk_bdev_mgmt_channel), 2271 "bdev_mgr"); 2272 2273 rc = bdev_modules_init(); 2274 g_bdev_mgr.module_init_complete = true; 2275 if (rc != 0) { 2276 SPDK_ERRLOG("bdev modules init failed\n"); 2277 return; 2278 } 2279 2280 bdev_module_action_complete(); 2281 } 2282 2283 static void 2284 bdev_mgr_unregister_cb(void *io_device) 2285 { 2286 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2287 2288 if (g_bdev_mgr.bdev_io_pool) { 2289 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2290 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2291 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2292 g_bdev_opts.bdev_io_pool_size); 2293 } 2294 2295 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2296 } 2297 2298 spdk_free(g_bdev_mgr.zero_buffer); 2299 2300 bdev_examine_allowlist_free(); 2301 2302 cb_fn(g_fini_cb_arg); 2303 g_fini_cb_fn = NULL; 2304 g_fini_cb_arg = NULL; 2305 g_bdev_mgr.init_complete = false; 2306 g_bdev_mgr.module_init_complete = false; 2307 } 2308 2309 static void 2310 bdev_module_fini_iter(void *arg) 2311 { 2312 struct spdk_bdev_module *bdev_module; 2313 2314 /* FIXME: Handling initialization failures is broken now, 2315 * so we won't even try cleaning up after successfully 2316 * initialized modules. if module_init_complete is false, 2317 * just call spdk_bdev_mgr_unregister_cb 2318 */ 2319 if (!g_bdev_mgr.module_init_complete) { 2320 bdev_mgr_unregister_cb(NULL); 2321 return; 2322 } 2323 2324 /* Start iterating from the last touched module */ 2325 if (!g_resume_bdev_module) { 2326 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2327 } else { 2328 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2329 internal.tailq); 2330 } 2331 2332 while (bdev_module) { 2333 if (bdev_module->async_fini) { 2334 /* Save our place so we can resume later. We must 2335 * save the variable here, before calling module_fini() 2336 * below, because in some cases the module may immediately 2337 * call spdk_bdev_module_fini_done() and re-enter 2338 * this function to continue iterating. */ 2339 g_resume_bdev_module = bdev_module; 2340 } 2341 2342 if (bdev_module->module_fini) { 2343 bdev_module->module_fini(); 2344 } 2345 2346 if (bdev_module->async_fini) { 2347 return; 2348 } 2349 2350 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2351 internal.tailq); 2352 } 2353 2354 g_resume_bdev_module = NULL; 2355 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2356 } 2357 2358 void 2359 spdk_bdev_module_fini_done(void) 2360 { 2361 if (spdk_get_thread() != g_fini_thread) { 2362 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2363 } else { 2364 bdev_module_fini_iter(NULL); 2365 } 2366 } 2367 2368 static void 2369 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2370 { 2371 struct spdk_bdev *bdev = cb_arg; 2372 2373 if (bdeverrno && bdev) { 2374 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2375 bdev->name); 2376 2377 /* 2378 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2379 * bdev; try to continue by manually removing this bdev from the list and continue 2380 * with the next bdev in the list. 
2381 */ 2382 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2383 } 2384 2385 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2386 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2387 /* 2388 * Bdev module finish need to be deferred as we might be in the middle of some context 2389 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2390 * after returning. 2391 */ 2392 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2393 return; 2394 } 2395 2396 /* 2397 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 2398 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2399 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2400 * base bdevs. 2401 * 2402 * Also, walk the list in the reverse order. 2403 */ 2404 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2405 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2406 spdk_spin_lock(&bdev->internal.spinlock); 2407 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2408 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2409 spdk_spin_unlock(&bdev->internal.spinlock); 2410 continue; 2411 } 2412 spdk_spin_unlock(&bdev->internal.spinlock); 2413 2414 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2415 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2416 return; 2417 } 2418 2419 /* 2420 * If any bdev fails to unclaim underlying bdev properly, we may face the 2421 * case of bdev list consisting of claimed bdevs only (if claims are managed 2422 * correctly, this would mean there's a loop in the claims graph which is 2423 * clearly impossible). Warn and unregister last bdev on the list then. 2424 */ 2425 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2426 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2427 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2428 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2429 return; 2430 } 2431 } 2432 2433 static void 2434 bdev_module_fini_start_iter(void *arg) 2435 { 2436 struct spdk_bdev_module *bdev_module; 2437 2438 if (!g_resume_bdev_module) { 2439 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2440 } else { 2441 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2442 } 2443 2444 while (bdev_module) { 2445 if (bdev_module->async_fini_start) { 2446 /* Save our place so we can resume later. We must 2447 * save the variable here, before calling fini_start() 2448 * below, because in some cases the module may immediately 2449 * call spdk_bdev_module_fini_start_done() and re-enter 2450 * this function to continue iterating. 
*/ 2451 g_resume_bdev_module = bdev_module; 2452 } 2453 2454 if (bdev_module->fini_start) { 2455 bdev_module->fini_start(); 2456 } 2457 2458 if (bdev_module->async_fini_start) { 2459 return; 2460 } 2461 2462 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2463 } 2464 2465 g_resume_bdev_module = NULL; 2466 2467 bdev_finish_unregister_bdevs_iter(NULL, 0); 2468 } 2469 2470 void 2471 spdk_bdev_module_fini_start_done(void) 2472 { 2473 if (spdk_get_thread() != g_fini_thread) { 2474 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2475 } else { 2476 bdev_module_fini_start_iter(NULL); 2477 } 2478 } 2479 2480 static void 2481 bdev_finish_wait_for_examine_done(void *cb_arg) 2482 { 2483 bdev_module_fini_start_iter(NULL); 2484 } 2485 2486 static void bdev_open_async_fini(void); 2487 2488 void 2489 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2490 { 2491 int rc; 2492 2493 assert(cb_fn != NULL); 2494 2495 g_fini_thread = spdk_get_thread(); 2496 2497 g_fini_cb_fn = cb_fn; 2498 g_fini_cb_arg = cb_arg; 2499 2500 bdev_open_async_fini(); 2501 2502 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2503 if (rc != 0) { 2504 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2505 bdev_finish_wait_for_examine_done(NULL); 2506 } 2507 } 2508 2509 struct spdk_bdev_io * 2510 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2511 { 2512 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2513 struct spdk_bdev_io *bdev_io; 2514 2515 if (ch->per_thread_cache_count > 0) { 2516 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2517 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2518 ch->per_thread_cache_count--; 2519 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2520 /* 2521 * Don't try to look for bdev_ios in the global pool if there are 2522 * waiters on bdev_ios - we don't want this caller to jump the line. 2523 */ 2524 bdev_io = NULL; 2525 } else { 2526 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2527 } 2528 2529 return bdev_io; 2530 } 2531 2532 void 2533 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2534 { 2535 struct spdk_bdev_mgmt_channel *ch; 2536 2537 assert(bdev_io != NULL); 2538 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2539 2540 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2541 2542 if (bdev_io->internal.f.has_buf) { 2543 bdev_io_put_buf(bdev_io); 2544 } 2545 2546 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2547 ch->per_thread_cache_count++; 2548 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2549 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2550 struct spdk_bdev_io_wait_entry *entry; 2551 2552 entry = TAILQ_FIRST(&ch->io_wait_queue); 2553 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2554 entry->cb_fn(entry->cb_arg); 2555 } 2556 } else { 2557 /* We should never have a full cache with entries on the io wait queue. 
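* Entries are only queued on io_wait_queue while bdev_ios are exhausted, and the loop above drains that queue as soon as freed bdev_ios return to the cache, so hitting this branch with waiters would indicate a logic error.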
*/ 2558 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2559 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2560 } 2561 } 2562 2563 static bool 2564 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2565 { 2566 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2567 2568 switch (limit) { 2569 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2570 return true; 2571 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2572 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2573 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2574 return false; 2575 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2576 default: 2577 return false; 2578 } 2579 } 2580 2581 static bool 2582 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2583 { 2584 switch (bdev_io->type) { 2585 case SPDK_BDEV_IO_TYPE_NVME_IO: 2586 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2587 case SPDK_BDEV_IO_TYPE_READ: 2588 case SPDK_BDEV_IO_TYPE_WRITE: 2589 return true; 2590 case SPDK_BDEV_IO_TYPE_ZCOPY: 2591 if (bdev_io->u.bdev.zcopy.start) { 2592 return true; 2593 } else { 2594 return false; 2595 } 2596 default: 2597 return false; 2598 } 2599 } 2600 2601 static bool 2602 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2603 { 2604 switch (bdev_io->type) { 2605 case SPDK_BDEV_IO_TYPE_NVME_IO: 2606 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2607 /* Bit 1 (0x2) set for read operation */ 2608 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2609 return true; 2610 } else { 2611 return false; 2612 } 2613 case SPDK_BDEV_IO_TYPE_READ: 2614 return true; 2615 case SPDK_BDEV_IO_TYPE_ZCOPY: 2616 /* Populate to read from disk */ 2617 if (bdev_io->u.bdev.zcopy.populate) { 2618 return true; 2619 } else { 2620 return false; 2621 } 2622 default: 2623 return false; 2624 } 2625 } 2626 2627 static uint64_t 2628 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2629 { 2630 struct spdk_bdev *bdev = bdev_io->bdev; 2631 2632 switch (bdev_io->type) { 2633 case SPDK_BDEV_IO_TYPE_NVME_IO: 2634 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2635 return bdev_io->u.nvme_passthru.nbytes; 2636 case SPDK_BDEV_IO_TYPE_READ: 2637 case SPDK_BDEV_IO_TYPE_WRITE: 2638 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2639 case SPDK_BDEV_IO_TYPE_ZCOPY: 2640 /* Track the data in the start phase only */ 2641 if (bdev_io->u.bdev.zcopy.start) { 2642 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2643 } else { 2644 return 0; 2645 } 2646 default: 2647 return 0; 2648 } 2649 } 2650 2651 static inline bool 2652 bdev_qos_rw_queue_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2653 { 2654 int64_t remaining_this_timeslice; 2655 2656 if (!limit->max_per_timeslice) { 2657 /* The QoS is disabled */ 2658 return false; 2659 } 2660 2661 remaining_this_timeslice = __atomic_sub_fetch(&limit->remaining_this_timeslice, delta, 2662 __ATOMIC_RELAXED); 2663 if (remaining_this_timeslice + (int64_t)delta > 0) { 2664 /* There was still a quota for this delta -> the IO shouldn't be queued 2665 * 2666 * We allow a slight quota overrun here so an IO bigger than the per-timeslice 2667 * quota can be allowed once a while. Such overrun then taken into account in 2668 * the QoS poller, where the next timeslice quota is calculated. 2669 */ 2670 return false; 2671 } 2672 2673 /* There was no quota for this delta -> the IO should be queued 2674 * The remaining_this_timeslice must be rewinded so it reflects the real 2675 * amount of IOs or bytes allowed. 
2676 */ 2677 __atomic_add_fetch( 2678 &limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2679 return true; 2680 } 2681 2682 static inline void 2683 bdev_qos_rw_rewind_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2684 { 2685 __atomic_add_fetch(&limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2686 } 2687 2688 static bool 2689 bdev_qos_rw_iops_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2690 { 2691 return bdev_qos_rw_queue_io(limit, io, 1); 2692 } 2693 2694 static void 2695 bdev_qos_rw_iops_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2696 { 2697 bdev_qos_rw_rewind_io(limit, io, 1); 2698 } 2699 2700 static bool 2701 bdev_qos_rw_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2702 { 2703 return bdev_qos_rw_queue_io(limit, io, bdev_get_io_size_in_byte(io)); 2704 } 2705 2706 static void 2707 bdev_qos_rw_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2708 { 2709 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2710 } 2711 2712 static bool 2713 bdev_qos_r_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2714 { 2715 if (bdev_is_read_io(io) == false) { 2716 return false; 2717 } 2718 2719 return bdev_qos_rw_bps_queue(limit, io); 2720 } 2721 2722 static void 2723 bdev_qos_r_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2724 { 2725 if (bdev_is_read_io(io) != false) { 2726 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2727 } 2728 } 2729 2730 static bool 2731 bdev_qos_w_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2732 { 2733 if (bdev_is_read_io(io) == true) { 2734 return false; 2735 } 2736 2737 return bdev_qos_rw_bps_queue(limit, io); 2738 } 2739 2740 static void 2741 bdev_qos_w_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2742 { 2743 if (bdev_is_read_io(io) != true) { 2744 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2745 } 2746 } 2747 2748 static void 2749 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2750 { 2751 int i; 2752 2753 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2754 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2755 qos->rate_limits[i].queue_io = NULL; 2756 continue; 2757 } 2758 2759 switch (i) { 2760 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2761 qos->rate_limits[i].queue_io = bdev_qos_rw_iops_queue; 2762 qos->rate_limits[i].rewind_quota = bdev_qos_rw_iops_rewind_quota; 2763 break; 2764 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2765 qos->rate_limits[i].queue_io = bdev_qos_rw_bps_queue; 2766 qos->rate_limits[i].rewind_quota = bdev_qos_rw_bps_rewind_quota; 2767 break; 2768 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2769 qos->rate_limits[i].queue_io = bdev_qos_r_bps_queue; 2770 qos->rate_limits[i].rewind_quota = bdev_qos_r_bps_rewind_quota; 2771 break; 2772 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2773 qos->rate_limits[i].queue_io = bdev_qos_w_bps_queue; 2774 qos->rate_limits[i].rewind_quota = bdev_qos_w_bps_rewind_quota; 2775 break; 2776 default: 2777 break; 2778 } 2779 } 2780 } 2781 2782 static void 2783 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2784 struct spdk_bdev_io *bdev_io, 2785 enum spdk_bdev_io_status status) 2786 { 2787 bdev_io->internal.f.in_submit_request = true; 2788 bdev_io_increment_outstanding(bdev_ch, bdev_ch->shared_resource); 2789 spdk_bdev_io_complete(bdev_io, status); 2790 bdev_io->internal.f.in_submit_request = false; 
2791 } 2792 2793 static inline void 2794 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2795 { 2796 struct spdk_bdev *bdev = bdev_io->bdev; 2797 struct spdk_io_channel *ch = bdev_ch->channel; 2798 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2799 2800 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2801 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2802 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2803 2804 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2805 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2806 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2807 SPDK_BDEV_IO_STATUS_SUCCESS); 2808 return; 2809 } 2810 } 2811 2812 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2813 bdev_io->bdev->split_on_write_unit && 2814 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2815 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2816 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2817 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2818 return; 2819 } 2820 2821 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2822 bdev_io_increment_outstanding(bdev_ch, shared_resource); 2823 bdev_io->internal.f.in_submit_request = true; 2824 bdev_submit_request(bdev, ch, bdev_io); 2825 bdev_io->internal.f.in_submit_request = false; 2826 } else { 2827 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT); 2828 if (shared_resource->nomem_threshold == 0 && shared_resource->io_outstanding == 0) { 2829 /* Special case when we have nomem IOs and no outstanding IOs which completions 2830 * could trigger retry of queued IOs */ 2831 bdev_shared_ch_retry_io(shared_resource); 2832 } 2833 } 2834 } 2835 2836 static bool 2837 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2838 { 2839 int i; 2840 2841 if (bdev_qos_io_to_limit(bdev_io) == true) { 2842 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2843 if (!qos->rate_limits[i].queue_io) { 2844 continue; 2845 } 2846 2847 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2848 bdev_io) == true) { 2849 for (i -= 1; i >= 0 ; i--) { 2850 if (!qos->rate_limits[i].queue_io) { 2851 continue; 2852 } 2853 2854 qos->rate_limits[i].rewind_quota(&qos->rate_limits[i], bdev_io); 2855 } 2856 return true; 2857 } 2858 } 2859 } 2860 2861 return false; 2862 } 2863 2864 static int 2865 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2866 { 2867 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2868 int submitted_ios = 0; 2869 2870 TAILQ_FOREACH_SAFE(bdev_io, &ch->qos_queued_io, internal.link, tmp) { 2871 if (!bdev_qos_queue_io(qos, bdev_io)) { 2872 TAILQ_REMOVE(&ch->qos_queued_io, bdev_io, internal.link); 2873 bdev_io_do_submit(ch, bdev_io); 2874 2875 submitted_ios++; 2876 } 2877 } 2878 2879 return submitted_ios; 2880 } 2881 2882 static void 2883 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2884 { 2885 int rc; 2886 2887 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2888 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2889 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2890 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2891 &bdev_io->internal.waitq_entry); 2892 if (rc != 0) { 2893 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2894 bdev_io->internal.status = 
SPDK_BDEV_IO_STATUS_FAILED; 2895 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2896 } 2897 } 2898 2899 static bool 2900 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2901 { 2902 uint32_t io_boundary; 2903 struct spdk_bdev *bdev = bdev_io->bdev; 2904 uint32_t max_segment_size = bdev->max_segment_size; 2905 uint32_t max_size = bdev->max_rw_size; 2906 int max_segs = bdev->max_num_segments; 2907 2908 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2909 io_boundary = bdev->write_unit_size; 2910 } else if (bdev->split_on_optimal_io_boundary) { 2911 io_boundary = bdev->optimal_io_boundary; 2912 } else { 2913 io_boundary = 0; 2914 } 2915 2916 if (spdk_likely(!io_boundary && !max_segs && !max_segment_size && !max_size)) { 2917 return false; 2918 } 2919 2920 if (io_boundary) { 2921 uint64_t start_stripe, end_stripe; 2922 2923 start_stripe = bdev_io->u.bdev.offset_blocks; 2924 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2925 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 2926 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2927 start_stripe >>= spdk_u32log2(io_boundary); 2928 end_stripe >>= spdk_u32log2(io_boundary); 2929 } else { 2930 start_stripe /= io_boundary; 2931 end_stripe /= io_boundary; 2932 } 2933 2934 if (start_stripe != end_stripe) { 2935 return true; 2936 } 2937 } 2938 2939 if (max_segs) { 2940 if (bdev_io->u.bdev.iovcnt > max_segs) { 2941 return true; 2942 } 2943 } 2944 2945 if (max_segment_size) { 2946 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2947 if (bdev_io->u.bdev.iovs[i].iov_len > max_segment_size) { 2948 return true; 2949 } 2950 } 2951 } 2952 2953 if (max_size) { 2954 if (bdev_io->u.bdev.num_blocks > max_size) { 2955 return true; 2956 } 2957 } 2958 2959 return false; 2960 } 2961 2962 static bool 2963 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2964 { 2965 uint32_t num_unmap_segments; 2966 2967 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2968 return false; 2969 } 2970 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2971 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2972 return true; 2973 } 2974 2975 return false; 2976 } 2977 2978 static bool 2979 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2980 { 2981 if (!bdev_io->bdev->max_write_zeroes) { 2982 return false; 2983 } 2984 2985 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2986 return true; 2987 } 2988 2989 return false; 2990 } 2991 2992 static bool 2993 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2994 { 2995 if (bdev_io->bdev->max_copy != 0 && 2996 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2997 return true; 2998 } 2999 3000 return false; 3001 } 3002 3003 static bool 3004 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 3005 { 3006 switch (bdev_io->type) { 3007 case SPDK_BDEV_IO_TYPE_READ: 3008 case SPDK_BDEV_IO_TYPE_WRITE: 3009 return bdev_rw_should_split(bdev_io); 3010 case SPDK_BDEV_IO_TYPE_UNMAP: 3011 return bdev_unmap_should_split(bdev_io); 3012 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3013 return bdev_write_zeroes_should_split(bdev_io); 3014 case SPDK_BDEV_IO_TYPE_COPY: 3015 return bdev_copy_should_split(bdev_io); 3016 default: 3017 return false; 3018 } 3019 } 3020 3021 static uint32_t 3022 _to_next_boundary(uint64_t offset, uint32_t boundary) 3023 { 3024 return (boundary - (offset % boundary)); 3025 } 3026 3027 static void 
bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 3028 3029 static void _bdev_rw_split(void *_bdev_io); 3030 3031 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 3032 3033 static void 3034 _bdev_unmap_split(void *_bdev_io) 3035 { 3036 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 3037 } 3038 3039 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 3040 3041 static void 3042 _bdev_write_zeroes_split(void *_bdev_io) 3043 { 3044 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 3045 } 3046 3047 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 3048 3049 static void 3050 _bdev_copy_split(void *_bdev_io) 3051 { 3052 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 3053 } 3054 3055 static int 3056 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 3057 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 3058 { 3059 int rc; 3060 uint64_t current_offset, current_remaining, current_src_offset; 3061 spdk_bdev_io_wait_cb io_wait_fn; 3062 3063 current_offset = *offset; 3064 current_remaining = *remaining; 3065 3066 assert(bdev_io->internal.f.split); 3067 3068 bdev_io->internal.split.outstanding++; 3069 3070 io_wait_fn = _bdev_rw_split; 3071 switch (bdev_io->type) { 3072 case SPDK_BDEV_IO_TYPE_READ: 3073 assert(bdev_io->u.bdev.accel_sequence == NULL); 3074 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 3075 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3076 iov, iovcnt, md_buf, current_offset, 3077 num_blocks, 3078 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 3079 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL, 3080 NULL, 3081 bdev_io->u.bdev.dif_check_flags, 3082 bdev_io_split_done, bdev_io); 3083 break; 3084 case SPDK_BDEV_IO_TYPE_WRITE: 3085 assert(bdev_io->u.bdev.accel_sequence == NULL); 3086 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 3087 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3088 iov, iovcnt, md_buf, current_offset, 3089 num_blocks, 3090 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 3091 bdev_io_use_memory_domain(bdev_io) ? 
bdev_io->internal.memory_domain_ctx : NULL, 3092 NULL, 3093 bdev_io->u.bdev.dif_check_flags, 3094 bdev_io->u.bdev.nvme_cdw12.raw, 3095 bdev_io->u.bdev.nvme_cdw13.raw, 3096 bdev_io_split_done, bdev_io); 3097 break; 3098 case SPDK_BDEV_IO_TYPE_UNMAP: 3099 io_wait_fn = _bdev_unmap_split; 3100 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 3101 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3102 current_offset, num_blocks, 3103 bdev_io_split_done, bdev_io); 3104 break; 3105 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3106 io_wait_fn = _bdev_write_zeroes_split; 3107 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 3108 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3109 current_offset, num_blocks, 3110 bdev_io_split_done, bdev_io); 3111 break; 3112 case SPDK_BDEV_IO_TYPE_COPY: 3113 io_wait_fn = _bdev_copy_split; 3114 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 3115 (current_offset - bdev_io->u.bdev.offset_blocks); 3116 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 3117 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3118 current_offset, current_src_offset, num_blocks, 3119 bdev_io_split_done, bdev_io); 3120 break; 3121 default: 3122 assert(false); 3123 rc = -EINVAL; 3124 break; 3125 } 3126 3127 if (rc == 0) { 3128 current_offset += num_blocks; 3129 current_remaining -= num_blocks; 3130 bdev_io->internal.split.current_offset_blocks = current_offset; 3131 bdev_io->internal.split.remaining_num_blocks = current_remaining; 3132 *offset = current_offset; 3133 *remaining = current_remaining; 3134 } else { 3135 bdev_io->internal.split.outstanding--; 3136 if (rc == -ENOMEM) { 3137 if (bdev_io->internal.split.outstanding == 0) { 3138 /* No I/O is outstanding. Hence we should wait here. */ 3139 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 3140 } 3141 } else { 3142 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3143 if (bdev_io->internal.split.outstanding == 0) { 3144 bdev_ch_remove_from_io_submitted(bdev_io); 3145 spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id, 3146 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx, 3147 bdev_io->internal.ch->queue_depth); 3148 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3149 } 3150 } 3151 } 3152 3153 return rc; 3154 } 3155 3156 static void 3157 _bdev_rw_split(void *_bdev_io) 3158 { 3159 struct iovec *parent_iov, *iov; 3160 struct spdk_bdev_io *bdev_io = _bdev_io; 3161 struct spdk_bdev *bdev = bdev_io->bdev; 3162 uint64_t parent_offset, current_offset, remaining; 3163 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 3164 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 3165 uint32_t iovcnt, iov_len, child_iovsize; 3166 uint32_t blocklen = bdev->blocklen; 3167 uint32_t io_boundary; 3168 uint32_t max_segment_size = bdev->max_segment_size; 3169 uint32_t max_child_iovcnt = bdev->max_num_segments; 3170 uint32_t max_size = bdev->max_rw_size; 3171 void *md_buf = NULL; 3172 int rc; 3173 3174 max_size = max_size ? max_size : UINT32_MAX; 3175 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 3176 max_child_iovcnt = max_child_iovcnt ? 
spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 3177 SPDK_BDEV_IO_NUM_CHILD_IOV; 3178 3179 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 3180 io_boundary = bdev->write_unit_size; 3181 } else if (bdev->split_on_optimal_io_boundary) { 3182 io_boundary = bdev->optimal_io_boundary; 3183 } else { 3184 io_boundary = UINT32_MAX; 3185 } 3186 3187 assert(bdev_io->internal.f.split); 3188 3189 remaining = bdev_io->internal.split.remaining_num_blocks; 3190 current_offset = bdev_io->internal.split.current_offset_blocks; 3191 parent_offset = bdev_io->u.bdev.offset_blocks; 3192 parent_iov_offset = (current_offset - parent_offset) * blocklen; 3193 parent_iovcnt = bdev_io->u.bdev.iovcnt; 3194 3195 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 3196 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3197 if (parent_iov_offset < parent_iov->iov_len) { 3198 break; 3199 } 3200 parent_iov_offset -= parent_iov->iov_len; 3201 } 3202 3203 child_iovcnt = 0; 3204 while (remaining > 0 && parent_iovpos < parent_iovcnt && 3205 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 3206 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3207 to_next_boundary = spdk_min(remaining, to_next_boundary); 3208 to_next_boundary = spdk_min(max_size, to_next_boundary); 3209 to_next_boundary_bytes = to_next_boundary * blocklen; 3210 3211 iov = &bdev_io->child_iov[child_iovcnt]; 3212 iovcnt = 0; 3213 3214 if (bdev_io->u.bdev.md_buf) { 3215 md_buf = (char *)bdev_io->u.bdev.md_buf + 3216 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3217 } 3218 3219 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3220 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3221 iovcnt < child_iovsize) { 3222 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3223 iov_len = parent_iov->iov_len - parent_iov_offset; 3224 3225 iov_len = spdk_min(iov_len, max_segment_size); 3226 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 3227 to_next_boundary_bytes -= iov_len; 3228 3229 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 3230 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 3231 3232 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 3233 parent_iov_offset += iov_len; 3234 } else { 3235 parent_iovpos++; 3236 parent_iov_offset = 0; 3237 } 3238 child_iovcnt++; 3239 iovcnt++; 3240 } 3241 3242 if (to_next_boundary_bytes > 0) { 3243 /* We had to stop this child I/O early because we ran out of 3244 * child_iov space or were limited by max_num_segments. 3245 * Ensure the iovs to be aligned with block size and 3246 * then adjust to_next_boundary before starting the 3247 * child I/O. 
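* The loop below walks the child iovs backwards, trimming the partial tail block and returning those bytes to the parent iov position so the child I/O stays block-aligned.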
3248 */ 3249 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || 3250 iovcnt == child_iovsize); 3251 to_last_block_bytes = to_next_boundary_bytes % blocklen; 3252 if (to_last_block_bytes != 0) { 3253 uint32_t child_iovpos = child_iovcnt - 1; 3254 /* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV 3255 * so the loop will naturally end 3256 */ 3257 3258 to_last_block_bytes = blocklen - to_last_block_bytes; 3259 to_next_boundary_bytes += to_last_block_bytes; 3260 while (to_last_block_bytes > 0 && iovcnt > 0) { 3261 iov_len = spdk_min(to_last_block_bytes, 3262 bdev_io->child_iov[child_iovpos].iov_len); 3263 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 3264 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 3265 child_iovpos--; 3266 if (--iovcnt == 0) { 3267 /* If the child IO is less than a block size, just return. 3268 * If the first child IO of any split round is less than 3269 * a block size, exit with an error. 3270 */ 3271 if (bdev_io->internal.split.outstanding == 0) { 3272 SPDK_ERRLOG("The first child io was less than a block size\n"); 3273 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3274 bdev_ch_remove_from_io_submitted(bdev_io); 3275 spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id, 3276 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx, 3277 bdev_io->internal.ch->queue_depth); 3278 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3279 } 3280 3281 return; 3282 } 3283 } 3284 3285 to_last_block_bytes -= iov_len; 3286 3287 if (parent_iov_offset == 0) { 3288 parent_iovpos--; 3289 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 3290 } 3291 parent_iov_offset -= iov_len; 3292 } 3293 3294 assert(to_last_block_bytes == 0); 3295 } 3296 to_next_boundary -= to_next_boundary_bytes / blocklen; 3297 } 3298 3299 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 3300 &current_offset, &remaining); 3301 if (spdk_unlikely(rc)) { 3302 return; 3303 } 3304 } 3305 } 3306 3307 static void 3308 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 3309 { 3310 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 3311 uint32_t num_children_reqs = 0; 3312 int rc; 3313 3314 assert(bdev_io->internal.f.split); 3315 3316 offset = bdev_io->internal.split.current_offset_blocks; 3317 remaining = bdev_io->internal.split.remaining_num_blocks; 3318 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 3319 3320 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3321 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 3322 3323 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 3324 &offset, &remaining); 3325 if (spdk_likely(rc == 0)) { 3326 num_children_reqs++; 3327 } else { 3328 return; 3329 } 3330 } 3331 } 3332 3333 static void 3334 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 3335 { 3336 uint64_t offset, write_zeroes_blocks, remaining; 3337 uint32_t num_children_reqs = 0; 3338 int rc; 3339 3340 assert(bdev_io->internal.f.split); 3341 3342 offset = bdev_io->internal.split.current_offset_blocks; 3343 remaining = bdev_io->internal.split.remaining_num_blocks; 3344 3345 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3346 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 3347 3348 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 3349 &offset, &remaining); 3350 if (spdk_likely(rc == 0)) { 3351 num_children_reqs++; 3352 } else {
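/* Stop issuing children: on -ENOMEM the split resumes once outstanding children complete (or via the io_wait queue); on any other error the parent has been failed. */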
3353 return; 3354 } 3355 } 3356 } 3357 3358 static void 3359 bdev_copy_split(struct spdk_bdev_io *bdev_io) 3360 { 3361 uint64_t offset, copy_blocks, remaining; 3362 uint32_t num_children_reqs = 0; 3363 int rc; 3364 3365 assert(bdev_io->internal.f.split); 3366 3367 offset = bdev_io->internal.split.current_offset_blocks; 3368 remaining = bdev_io->internal.split.remaining_num_blocks; 3369 3370 assert(bdev_io->bdev->max_copy != 0); 3371 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3372 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3373 3374 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3375 &offset, &remaining); 3376 if (spdk_likely(rc == 0)) { 3377 num_children_reqs++; 3378 } else { 3379 return; 3380 } 3381 } 3382 } 3383 3384 static void 3385 parent_bdev_io_complete(void *ctx, int rc) 3386 { 3387 struct spdk_bdev_io *parent_io = ctx; 3388 3389 if (rc) { 3390 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3391 } 3392 3393 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3394 parent_io->internal.caller_ctx); 3395 } 3396 3397 static void 3398 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3399 { 3400 struct spdk_bdev_io *bdev_io = ctx; 3401 3402 /* u.bdev.accel_sequence should have already been cleared at this point */ 3403 assert(bdev_io->u.bdev.accel_sequence == NULL); 3404 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3405 bdev_io->internal.f.has_accel_sequence = false; 3406 3407 if (spdk_unlikely(status != 0)) { 3408 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3409 } 3410 3411 parent_bdev_io_complete(bdev_io, status); 3412 } 3413 3414 static void 3415 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3416 { 3417 struct spdk_bdev_io *parent_io = cb_arg; 3418 3419 spdk_bdev_free_io(bdev_io); 3420 3421 assert(parent_io->internal.f.split); 3422 3423 if (!success) { 3424 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3425 /* If any child I/O failed, stop further splitting process. */ 3426 parent_io->internal.split.current_offset_blocks += parent_io->internal.split.remaining_num_blocks; 3427 parent_io->internal.split.remaining_num_blocks = 0; 3428 } 3429 parent_io->internal.split.outstanding--; 3430 if (parent_io->internal.split.outstanding != 0) { 3431 return; 3432 } 3433 3434 /* 3435 * Parent I/O finishes when all blocks are consumed. 3436 */ 3437 if (parent_io->internal.split.remaining_num_blocks == 0) { 3438 assert(parent_io->internal.cb != bdev_io_split_done); 3439 bdev_ch_remove_from_io_submitted(parent_io); 3440 spdk_trace_record(TRACE_BDEV_IO_DONE, parent_io->internal.ch->trace_id, 3441 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx, 3442 parent_io->internal.ch->queue_depth); 3443 3444 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3445 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3446 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3447 return; 3448 } else if (parent_io->internal.f.has_bounce_buf && 3449 !bdev_io_use_accel_sequence(bdev_io)) { 3450 /* bdev IO will be completed in the callback */ 3451 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3452 return; 3453 } 3454 } 3455 3456 parent_bdev_io_complete(parent_io, 0); 3457 return; 3458 } 3459 3460 /* 3461 * Continue with the splitting process. This function will complete the parent I/O if the 3462 * splitting is done. 
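* Each *_split helper issues at most a bounded batch of children per call, so splitting of large requests resumes here as children complete.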
3463 */ 3464 switch (parent_io->type) { 3465 case SPDK_BDEV_IO_TYPE_READ: 3466 case SPDK_BDEV_IO_TYPE_WRITE: 3467 _bdev_rw_split(parent_io); 3468 break; 3469 case SPDK_BDEV_IO_TYPE_UNMAP: 3470 bdev_unmap_split(parent_io); 3471 break; 3472 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3473 bdev_write_zeroes_split(parent_io); 3474 break; 3475 case SPDK_BDEV_IO_TYPE_COPY: 3476 bdev_copy_split(parent_io); 3477 break; 3478 default: 3479 assert(false); 3480 break; 3481 } 3482 } 3483 3484 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3485 bool success); 3486 3487 static void 3488 bdev_io_split(struct spdk_bdev_io *bdev_io) 3489 { 3490 assert(bdev_io_should_split(bdev_io)); 3491 assert(bdev_io->internal.f.split); 3492 3493 bdev_io->internal.split.current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3494 bdev_io->internal.split.remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3495 bdev_io->internal.split.outstanding = 0; 3496 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3497 3498 switch (bdev_io->type) { 3499 case SPDK_BDEV_IO_TYPE_READ: 3500 case SPDK_BDEV_IO_TYPE_WRITE: 3501 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3502 _bdev_rw_split(bdev_io); 3503 } else { 3504 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3505 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3506 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3507 } 3508 break; 3509 case SPDK_BDEV_IO_TYPE_UNMAP: 3510 bdev_unmap_split(bdev_io); 3511 break; 3512 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3513 bdev_write_zeroes_split(bdev_io); 3514 break; 3515 case SPDK_BDEV_IO_TYPE_COPY: 3516 bdev_copy_split(bdev_io); 3517 break; 3518 default: 3519 assert(false); 3520 break; 3521 } 3522 } 3523 3524 static void 3525 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3526 { 3527 if (!success) { 3528 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3529 return; 3530 } 3531 3532 _bdev_rw_split(bdev_io); 3533 } 3534 3535 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 3536 * be inlined, at least on some compilers. 
3537 */ 3538 static inline void 3539 _bdev_io_submit(void *ctx) 3540 { 3541 struct spdk_bdev_io *bdev_io = ctx; 3542 struct spdk_bdev *bdev = bdev_io->bdev; 3543 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3544 3545 if (spdk_likely(bdev_ch->flags == 0)) { 3546 bdev_io_do_submit(bdev_ch, bdev_io); 3547 return; 3548 } 3549 3550 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3551 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3552 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3553 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3554 bdev_abort_queued_io(&bdev_ch->qos_queued_io, bdev_io->u.abort.bio_to_abort)) { 3555 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3556 } else { 3557 TAILQ_INSERT_TAIL(&bdev_ch->qos_queued_io, bdev_io, internal.link); 3558 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3559 } 3560 } else { 3561 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3562 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3563 } 3564 } 3565 3566 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3567 3568 bool 3569 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3570 { 3571 if (range1->length == 0 || range2->length == 0) { 3572 return false; 3573 } 3574 3575 if (range1->offset + range1->length <= range2->offset) { 3576 return false; 3577 } 3578 3579 if (range2->offset + range2->length <= range1->offset) { 3580 return false; 3581 } 3582 3583 return true; 3584 } 3585 3586 static bool 3587 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3588 { 3589 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3590 struct lba_range r; 3591 3592 switch (bdev_io->type) { 3593 case SPDK_BDEV_IO_TYPE_NVME_IO: 3594 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3595 /* Don't try to decode the NVMe command - just assume worst-case and that 3596 * it overlaps a locked range. 3597 */ 3598 return true; 3599 case SPDK_BDEV_IO_TYPE_READ: 3600 if (!range->quiesce) { 3601 return false; 3602 } 3603 /* fallthrough */ 3604 case SPDK_BDEV_IO_TYPE_WRITE: 3605 case SPDK_BDEV_IO_TYPE_UNMAP: 3606 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3607 case SPDK_BDEV_IO_TYPE_ZCOPY: 3608 case SPDK_BDEV_IO_TYPE_COPY: 3609 r.offset = bdev_io->u.bdev.offset_blocks; 3610 r.length = bdev_io->u.bdev.num_blocks; 3611 if (!bdev_lba_range_overlapped(range, &r)) { 3612 /* This I/O doesn't overlap the specified LBA range. */ 3613 return false; 3614 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3615 /* This I/O overlaps, but the I/O is on the same channel that locked this 3616 * range, and the caller_ctx is the same as the locked_ctx. This means 3617 * that this I/O is associated with the lock, and is allowed to execute. 
3618 */ 3619 return false; 3620 } else { 3621 return true; 3622 } 3623 default: 3624 return false; 3625 } 3626 } 3627 3628 void 3629 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3630 { 3631 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3632 3633 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3634 3635 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3636 struct lba_range *range; 3637 3638 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3639 if (bdev_io_range_is_locked(bdev_io, range)) { 3640 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3641 return; 3642 } 3643 } 3644 } 3645 3646 bdev_ch_add_to_io_submitted(bdev_io); 3647 3648 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3649 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 3650 ch->trace_id, bdev_io->u.bdev.num_blocks, 3651 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3652 bdev_io->u.bdev.offset_blocks, ch->queue_depth); 3653 3654 if (bdev_io->internal.f.split) { 3655 bdev_io_split(bdev_io); 3656 return; 3657 } 3658 3659 _bdev_io_submit(bdev_io); 3660 } 3661 3662 static inline void 3663 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3664 { 3665 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 3666 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3667 * For write operation we need to pull buffers from memory domain before submitting IO. 3668 * Once read operation completes, we need to use memory_domain push functionality to 3669 * update data in original memory domain IO buffer 3670 * This IO request will go through a regular IO flow, so clear memory domains pointers */ 3671 assert(bdev_io->internal.f.has_memory_domain); 3672 bdev_io->u.bdev.memory_domain = NULL; 3673 bdev_io->u.bdev.memory_domain_ctx = NULL; 3674 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3675 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3676 } 3677 3678 static inline void 3679 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3680 { 3681 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3682 bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io); 3683 3684 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3685 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3686 bdev_io_complete_unsubmitted(bdev_io); 3687 return; 3688 } 3689 3690 /* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does 3691 * support them, but we need to execute an accel sequence and the data buffer is from accel 3692 * memory domain (to avoid doing a push/pull from that domain). 
3693 */ 3694 if (bdev_io_use_memory_domain(bdev_io)) { 3695 if (!desc->memory_domains_supported || 3696 (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3697 _bdev_io_ext_use_bounce_buffer(bdev_io); 3698 return; 3699 } 3700 } 3701 3702 if (needs_exec) { 3703 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3704 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3705 return; 3706 } 3707 /* For reads we'll execute the sequence after the data is read, so, for now, only 3708 * clear out accel_sequence pointer and submit the IO */ 3709 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3710 bdev_io->u.bdev.accel_sequence = NULL; 3711 } 3712 3713 bdev_io_submit(bdev_io); 3714 } 3715 3716 static void 3717 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3718 { 3719 struct spdk_bdev *bdev = bdev_io->bdev; 3720 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3721 struct spdk_io_channel *ch = bdev_ch->channel; 3722 3723 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3724 3725 bdev_io->internal.f.in_submit_request = true; 3726 bdev_submit_request(bdev, ch, bdev_io); 3727 bdev_io->internal.f.in_submit_request = false; 3728 } 3729 3730 void 3731 bdev_io_init(struct spdk_bdev_io *bdev_io, 3732 struct spdk_bdev *bdev, void *cb_arg, 3733 spdk_bdev_io_completion_cb cb) 3734 { 3735 bdev_io->bdev = bdev; 3736 bdev_io->internal.f.raw = 0; 3737 bdev_io->internal.caller_ctx = cb_arg; 3738 bdev_io->internal.cb = cb; 3739 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3740 bdev_io->internal.f.in_submit_request = false; 3741 bdev_io->internal.error.nvme.cdw0 = 0; 3742 bdev_io->num_retries = 0; 3743 bdev_io->internal.get_buf_cb = NULL; 3744 bdev_io->internal.get_aux_buf_cb = NULL; 3745 bdev_io->internal.data_transfer_cpl = NULL; 3746 bdev_io->internal.f.split = bdev_io_should_split(bdev_io); 3747 } 3748 3749 static bool 3750 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3751 { 3752 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3753 } 3754 3755 bool 3756 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3757 { 3758 bool supported; 3759 3760 supported = bdev_io_type_supported(bdev, io_type); 3761 3762 if (!supported) { 3763 switch (io_type) { 3764 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3765 /* The bdev layer will emulate write zeroes as long as write is supported. 
*/ 3766 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3767 break; 3768 default: 3769 break; 3770 } 3771 } 3772 3773 return supported; 3774 } 3775 3776 static const char *g_io_type_strings[] = { 3777 [SPDK_BDEV_IO_TYPE_READ] = "read", 3778 [SPDK_BDEV_IO_TYPE_WRITE] = "write", 3779 [SPDK_BDEV_IO_TYPE_UNMAP] = "unmap", 3780 [SPDK_BDEV_IO_TYPE_FLUSH] = "flush", 3781 [SPDK_BDEV_IO_TYPE_RESET] = "reset", 3782 [SPDK_BDEV_IO_TYPE_NVME_ADMIN] = "nvme_admin", 3783 [SPDK_BDEV_IO_TYPE_NVME_IO] = "nvme_io", 3784 [SPDK_BDEV_IO_TYPE_NVME_IO_MD] = "nvme_io_md", 3785 [SPDK_BDEV_IO_TYPE_WRITE_ZEROES] = "write_zeroes", 3786 [SPDK_BDEV_IO_TYPE_ZCOPY] = "zcopy", 3787 [SPDK_BDEV_IO_TYPE_GET_ZONE_INFO] = "get_zone_info", 3788 [SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT] = "zone_management", 3789 [SPDK_BDEV_IO_TYPE_ZONE_APPEND] = "zone_append", 3790 [SPDK_BDEV_IO_TYPE_COMPARE] = "compare", 3791 [SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE] = "compare_and_write", 3792 [SPDK_BDEV_IO_TYPE_ABORT] = "abort", 3793 [SPDK_BDEV_IO_TYPE_SEEK_HOLE] = "seek_hole", 3794 [SPDK_BDEV_IO_TYPE_SEEK_DATA] = "seek_data", 3795 [SPDK_BDEV_IO_TYPE_COPY] = "copy", 3796 [SPDK_BDEV_IO_TYPE_NVME_IOV_MD] = "nvme_iov_md", 3797 }; 3798 3799 const char * 3800 spdk_bdev_get_io_type_name(enum spdk_bdev_io_type io_type) 3801 { 3802 if (io_type <= SPDK_BDEV_IO_TYPE_INVALID || io_type >= SPDK_BDEV_NUM_IO_TYPES) { 3803 return NULL; 3804 } 3805 3806 return g_io_type_strings[io_type]; 3807 } 3808 3809 int 3810 spdk_bdev_get_io_type(const char *io_type_string) 3811 { 3812 int i; 3813 3814 for (i = SPDK_BDEV_IO_TYPE_READ; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 3815 if (!strcmp(io_type_string, g_io_type_strings[i])) { 3816 return i; 3817 } 3818 } 3819 3820 return -1; 3821 } 3822 3823 uint64_t 3824 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3825 { 3826 return bdev_io->internal.submit_tsc; 3827 } 3828 3829 int 3830 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3831 { 3832 if (bdev->fn_table->dump_info_json) { 3833 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3834 } 3835 3836 return 0; 3837 } 3838 3839 static void 3840 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3841 { 3842 uint32_t max_per_timeslice = 0; 3843 int i; 3844 3845 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3846 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3847 qos->rate_limits[i].max_per_timeslice = 0; 3848 continue; 3849 } 3850 3851 max_per_timeslice = qos->rate_limits[i].limit * 3852 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3853 3854 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3855 qos->rate_limits[i].min_per_timeslice); 3856 3857 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 3858 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELEASE); 3859 } 3860 3861 bdev_qos_set_ops(qos); 3862 } 3863 3864 static void 3865 bdev_channel_submit_qos_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3866 struct spdk_io_channel *io_ch, void *ctx) 3867 { 3868 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3869 int status; 3870 3871 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3872 3873 /* if all IOs were sent then continue the iteration, otherwise - stop it */ 3874 /* TODO: channels round robing */ 3875 status = TAILQ_EMPTY(&bdev_ch->qos_queued_io) ? 
0 : 1; 3876 3877 spdk_bdev_for_each_channel_continue(i, status); 3878 } 3879 3880 3881 static void 3882 bdev_channel_submit_qos_io_done(struct spdk_bdev *bdev, void *ctx, int status) 3883 { 3884 3885 } 3886 3887 static int 3888 bdev_channel_poll_qos(void *arg) 3889 { 3890 struct spdk_bdev *bdev = arg; 3891 struct spdk_bdev_qos *qos = bdev->internal.qos; 3892 uint64_t now = spdk_get_ticks(); 3893 int i; 3894 int64_t remaining_last_timeslice; 3895 3896 if (spdk_unlikely(qos->thread == NULL)) { 3897 /* The old QoS structure was unbound from its thread pending removal and a new QoS has not been enabled yet. */ 3898 return SPDK_POLLER_IDLE; 3899 } 3900 3901 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3902 /* We received our callback earlier than expected - return 3903 * immediately and wait to do accounting until at least one 3904 * timeslice has actually expired. This should never happen 3905 * with a well-behaved timer implementation. 3906 */ 3907 return SPDK_POLLER_IDLE; 3908 } 3909 3910 /* Reset for the next round of rate limiting */ 3911 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3912 /* We may have allowed the IOs or bytes to slightly overrun in the last 3913 * timeslice. remaining_this_timeslice is signed, so if it's negative 3914 * here, we'll account for the overrun so that the next timeslice will 3915 * be appropriately reduced. 3916 */ 3917 remaining_last_timeslice = __atomic_exchange_n(&qos->rate_limits[i].remaining_this_timeslice, 3918 0, __ATOMIC_RELAXED); 3919 if (remaining_last_timeslice < 0) { 3920 /* There could be a race here because both bdev_qos_rw_queue_io() and bdev_channel_poll_qos() 3921 * potentially use two atomic ops each, so their operations can interleave. 3922 * This race can make the limits slightly fuzzy, but it won't cause any real damage.
3923 */ 3924 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 3925 remaining_last_timeslice, __ATOMIC_RELAXED); 3926 } 3927 } 3928 3929 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3930 qos->last_timeslice += qos->timeslice_size; 3931 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3932 __atomic_add_fetch(&qos->rate_limits[i].remaining_this_timeslice, 3933 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELAXED); 3934 } 3935 } 3936 3937 spdk_bdev_for_each_channel(bdev, bdev_channel_submit_qos_io, qos, 3938 bdev_channel_submit_qos_io_done); 3939 3940 return SPDK_POLLER_BUSY; 3941 } 3942 3943 static void 3944 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3945 { 3946 struct spdk_bdev_shared_resource *shared_resource; 3947 struct lba_range *range; 3948 3949 bdev_free_io_stat(ch->stat); 3950 #ifdef SPDK_CONFIG_VTUNE 3951 bdev_free_io_stat(ch->prev_stat); 3952 #endif 3953 3954 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3955 range = TAILQ_FIRST(&ch->locked_ranges); 3956 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3957 free(range); 3958 } 3959 3960 spdk_put_io_channel(ch->channel); 3961 spdk_put_io_channel(ch->accel_channel); 3962 3963 shared_resource = ch->shared_resource; 3964 3965 assert(TAILQ_EMPTY(&ch->io_locked)); 3966 assert(TAILQ_EMPTY(&ch->io_submitted)); 3967 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 3968 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 3969 assert(ch->io_outstanding == 0); 3970 assert(shared_resource->ref > 0); 3971 shared_resource->ref--; 3972 if (shared_resource->ref == 0) { 3973 assert(shared_resource->io_outstanding == 0); 3974 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3975 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3976 spdk_poller_unregister(&shared_resource->nomem_poller); 3977 free(shared_resource); 3978 } 3979 } 3980 3981 static void 3982 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3983 { 3984 struct spdk_bdev_qos *qos = bdev->internal.qos; 3985 int i; 3986 3987 assert(spdk_spin_held(&bdev->internal.spinlock)); 3988 3989 /* Rate limiting on this bdev enabled */ 3990 if (qos) { 3991 if (qos->ch == NULL) { 3992 struct spdk_io_channel *io_ch; 3993 3994 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3995 bdev->name, spdk_get_thread()); 3996 3997 /* No qos channel has been selected, so set one up */ 3998 3999 /* Take another reference to ch */ 4000 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 4001 assert(io_ch != NULL); 4002 qos->ch = ch; 4003 4004 qos->thread = spdk_io_channel_get_thread(io_ch); 4005 4006 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4007 if (bdev_qos_is_iops_rate_limit(i) == true) { 4008 qos->rate_limits[i].min_per_timeslice = 4009 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 4010 } else { 4011 qos->rate_limits[i].min_per_timeslice = 4012 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 4013 } 4014 4015 if (qos->rate_limits[i].limit == 0) { 4016 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 4017 } 4018 } 4019 bdev_qos_update_max_quota_per_timeslice(qos); 4020 qos->timeslice_size = 4021 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 4022 qos->last_timeslice = spdk_get_ticks(); 4023 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 4024 bdev, 4025 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 4026 } 4027 4028 ch->flags |= BDEV_CH_QOS_ENABLED; 4029 } 4030 } 4031 4032 struct poll_timeout_ctx { 4033 struct spdk_bdev_desc 
*desc; 4034 uint64_t timeout_in_sec; 4035 spdk_bdev_io_timeout_cb cb_fn; 4036 void *cb_arg; 4037 }; 4038 4039 static void 4040 bdev_desc_free(struct spdk_bdev_desc *desc) 4041 { 4042 spdk_spin_destroy(&desc->spinlock); 4043 free(desc->media_events_buffer); 4044 free(desc); 4045 } 4046 4047 static void 4048 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 4049 { 4050 struct poll_timeout_ctx *ctx = _ctx; 4051 struct spdk_bdev_desc *desc = ctx->desc; 4052 4053 free(ctx); 4054 4055 spdk_spin_lock(&desc->spinlock); 4056 desc->refs--; 4057 if (desc->closed == true && desc->refs == 0) { 4058 spdk_spin_unlock(&desc->spinlock); 4059 bdev_desc_free(desc); 4060 return; 4061 } 4062 spdk_spin_unlock(&desc->spinlock); 4063 } 4064 4065 static void 4066 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4067 struct spdk_io_channel *io_ch, void *_ctx) 4068 { 4069 struct poll_timeout_ctx *ctx = _ctx; 4070 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4071 struct spdk_bdev_desc *desc = ctx->desc; 4072 struct spdk_bdev_io *bdev_io; 4073 uint64_t now; 4074 4075 spdk_spin_lock(&desc->spinlock); 4076 if (desc->closed == true) { 4077 spdk_spin_unlock(&desc->spinlock); 4078 spdk_bdev_for_each_channel_continue(i, -1); 4079 return; 4080 } 4081 spdk_spin_unlock(&desc->spinlock); 4082 4083 now = spdk_get_ticks(); 4084 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 4085 /* Exclude any I/O that are generated via splitting. */ 4086 if (bdev_io->internal.cb == bdev_io_split_done) { 4087 continue; 4088 } 4089 4090 /* Once we find an I/O that has not timed out, we can immediately 4091 * exit the loop. 4092 */ 4093 if (now < (bdev_io->internal.submit_tsc + 4094 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 4095 goto end; 4096 } 4097 4098 if (bdev_io->internal.desc == desc) { 4099 ctx->cb_fn(ctx->cb_arg, bdev_io); 4100 } 4101 } 4102 4103 end: 4104 spdk_bdev_for_each_channel_continue(i, 0); 4105 } 4106 4107 static int 4108 bdev_poll_timeout_io(void *arg) 4109 { 4110 struct spdk_bdev_desc *desc = arg; 4111 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4112 struct poll_timeout_ctx *ctx; 4113 4114 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 4115 if (!ctx) { 4116 SPDK_ERRLOG("failed to allocate memory\n"); 4117 return SPDK_POLLER_BUSY; 4118 } 4119 ctx->desc = desc; 4120 ctx->cb_arg = desc->cb_arg; 4121 ctx->cb_fn = desc->cb_fn; 4122 ctx->timeout_in_sec = desc->timeout_in_sec; 4123 4124 /* Take a ref on the descriptor in case it gets closed while we are checking 4125 * all of the channels. 
4126 */ 4127 spdk_spin_lock(&desc->spinlock); 4128 desc->refs++; 4129 spdk_spin_unlock(&desc->spinlock); 4130 4131 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 4132 bdev_channel_poll_timeout_io_done); 4133 4134 return SPDK_POLLER_BUSY; 4135 } 4136 4137 int 4138 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 4139 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 4140 { 4141 assert(desc->thread == spdk_get_thread()); 4142 4143 spdk_poller_unregister(&desc->io_timeout_poller); 4144 4145 if (timeout_in_sec) { 4146 assert(cb_fn != NULL); 4147 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 4148 desc, 4149 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 4150 1000); 4151 if (desc->io_timeout_poller == NULL) { 4152 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 4153 return -1; 4154 } 4155 } 4156 4157 desc->cb_fn = cb_fn; 4158 desc->cb_arg = cb_arg; 4159 desc->timeout_in_sec = timeout_in_sec; 4160 4161 return 0; 4162 } 4163 4164 static int 4165 bdev_channel_create(void *io_device, void *ctx_buf) 4166 { 4167 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 4168 struct spdk_bdev_channel *ch = ctx_buf; 4169 struct spdk_io_channel *mgmt_io_ch; 4170 struct spdk_bdev_mgmt_channel *mgmt_ch; 4171 struct spdk_bdev_shared_resource *shared_resource; 4172 struct lba_range *range; 4173 4174 ch->bdev = bdev; 4175 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 4176 if (!ch->channel) { 4177 return -1; 4178 } 4179 4180 ch->accel_channel = spdk_accel_get_io_channel(); 4181 if (!ch->accel_channel) { 4182 spdk_put_io_channel(ch->channel); 4183 return -1; 4184 } 4185 4186 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, bdev->internal.trace_id, 0, 0, 4187 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4188 4189 assert(ch->histogram == NULL); 4190 if (bdev->internal.histogram_enabled) { 4191 ch->histogram = spdk_histogram_data_alloc(); 4192 if (ch->histogram == NULL) { 4193 SPDK_ERRLOG("Could not allocate histogram\n"); 4194 } 4195 } 4196 4197 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 4198 if (!mgmt_io_ch) { 4199 spdk_put_io_channel(ch->channel); 4200 spdk_put_io_channel(ch->accel_channel); 4201 return -1; 4202 } 4203 4204 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 4205 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 4206 if (shared_resource->shared_ch == ch->channel) { 4207 spdk_put_io_channel(mgmt_io_ch); 4208 shared_resource->ref++; 4209 break; 4210 } 4211 } 4212 4213 if (shared_resource == NULL) { 4214 shared_resource = calloc(1, sizeof(*shared_resource)); 4215 if (shared_resource == NULL) { 4216 spdk_put_io_channel(ch->channel); 4217 spdk_put_io_channel(ch->accel_channel); 4218 spdk_put_io_channel(mgmt_io_ch); 4219 return -1; 4220 } 4221 4222 shared_resource->mgmt_ch = mgmt_ch; 4223 shared_resource->io_outstanding = 0; 4224 TAILQ_INIT(&shared_resource->nomem_io); 4225 shared_resource->nomem_threshold = 0; 4226 shared_resource->shared_ch = ch->channel; 4227 shared_resource->ref = 1; 4228 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 4229 } 4230 4231 ch->io_outstanding = 0; 4232 TAILQ_INIT(&ch->queued_resets); 4233 TAILQ_INIT(&ch->locked_ranges); 4234 TAILQ_INIT(&ch->qos_queued_io); 4235 ch->flags = 0; 4236 ch->trace_id = bdev->internal.trace_id; 4237 ch->shared_resource = shared_resource; 4238 4239 TAILQ_INIT(&ch->io_submitted); 4240 TAILQ_INIT(&ch->io_locked); 4241 TAILQ_INIT(&ch->io_accel_exec); 4242 TAILQ_INIT(&ch->io_memory_domain); 4243 4244 
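	/* The remainder of channel setup follows: allocate the per-channel I/O statistics
	 * (they are merged back into the bdev-wide totals in bdev_channel_destroy(), so the
	 * counts survive channel teardown), optionally set up VTune collection, enable QoS
	 * if it is configured on the bdev, and mirror any LBA ranges already locked on the
	 * bdev into this channel's locked_ranges list so that I/O submitted on this channel
	 * also honors those locks.
	 */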
ch->stat = bdev_alloc_io_stat(false); 4245 if (ch->stat == NULL) { 4246 bdev_channel_destroy_resource(ch); 4247 return -1; 4248 } 4249 4250 ch->stat->ticks_rate = spdk_get_ticks_hz(); 4251 4252 #ifdef SPDK_CONFIG_VTUNE 4253 { 4254 char *name; 4255 __itt_init_ittlib(NULL, 0); 4256 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 4257 if (!name) { 4258 bdev_channel_destroy_resource(ch); 4259 return -1; 4260 } 4261 ch->handle = __itt_string_handle_create(name); 4262 free(name); 4263 ch->start_tsc = spdk_get_ticks(); 4264 ch->interval_tsc = spdk_get_ticks_hz() / 100; 4265 ch->prev_stat = bdev_alloc_io_stat(false); 4266 if (ch->prev_stat == NULL) { 4267 bdev_channel_destroy_resource(ch); 4268 return -1; 4269 } 4270 } 4271 #endif 4272 4273 spdk_spin_lock(&bdev->internal.spinlock); 4274 bdev_enable_qos(bdev, ch); 4275 4276 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 4277 struct lba_range *new_range; 4278 4279 new_range = calloc(1, sizeof(*new_range)); 4280 if (new_range == NULL) { 4281 spdk_spin_unlock(&bdev->internal.spinlock); 4282 bdev_channel_destroy_resource(ch); 4283 return -1; 4284 } 4285 new_range->length = range->length; 4286 new_range->offset = range->offset; 4287 new_range->locked_ctx = range->locked_ctx; 4288 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 4289 } 4290 4291 spdk_spin_unlock(&bdev->internal.spinlock); 4292 4293 return 0; 4294 } 4295 4296 static int 4297 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 4298 void *cb_ctx) 4299 { 4300 struct spdk_bdev_channel *bdev_ch = cb_ctx; 4301 struct spdk_bdev_io *bdev_io; 4302 uint64_t buf_len; 4303 4304 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4305 if (bdev_io->internal.ch == bdev_ch) { 4306 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len); 4307 spdk_iobuf_entry_abort(ch, entry, buf_len); 4308 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4309 } 4310 4311 return 0; 4312 } 4313 4314 /* 4315 * Abort I/O that are waiting on a data buffer. 4316 */ 4317 static void 4318 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4319 { 4320 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4321 bdev_abort_all_buf_io_cb, ch); 4322 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4323 bdev_abort_all_buf_io_cb, ch); 4324 } 4325 4326 /* 4327 * Abort I/O that are queued waiting for submission. These types of I/O are 4328 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4329 */ 4330 static void 4331 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4332 { 4333 struct spdk_bdev_io *bdev_io, *tmp; 4334 4335 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4336 if (bdev_io->internal.ch == ch) { 4337 TAILQ_REMOVE(queue, bdev_io, internal.link); 4338 /* 4339 * spdk_bdev_io_complete() assumes that the completed I/O had 4340 * been submitted to the bdev module. Since in this case it 4341 * hadn't, bump io_outstanding to account for the decrement 4342 * that spdk_bdev_io_complete() will do. 
4343 */ 4344 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4345 bdev_io_increment_outstanding(ch, ch->shared_resource); 4346 } 4347 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4348 } 4349 } 4350 } 4351 4352 static bool 4353 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4354 { 4355 struct spdk_bdev_io *bdev_io; 4356 4357 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4358 if (bdev_io == bio_to_abort) { 4359 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4360 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4361 return true; 4362 } 4363 } 4364 4365 return false; 4366 } 4367 4368 static int 4369 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4370 { 4371 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4372 uint64_t buf_len; 4373 4374 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4375 if (bdev_io == bio_to_abort) { 4376 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len); 4377 spdk_iobuf_entry_abort(ch, entry, buf_len); 4378 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4379 return 1; 4380 } 4381 4382 return 0; 4383 } 4384 4385 static bool 4386 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4387 { 4388 int rc; 4389 4390 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4391 bdev_abort_buf_io_cb, bio_to_abort); 4392 if (rc == 1) { 4393 return true; 4394 } 4395 4396 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4397 bdev_abort_buf_io_cb, bio_to_abort); 4398 return rc == 1; 4399 } 4400 4401 static void 4402 bdev_qos_channel_destroy(void *cb_arg) 4403 { 4404 struct spdk_bdev_qos *qos = cb_arg; 4405 4406 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4407 spdk_poller_unregister(&qos->poller); 4408 4409 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4410 4411 free(qos); 4412 } 4413 4414 static int 4415 bdev_qos_destroy(struct spdk_bdev *bdev) 4416 { 4417 int i; 4418 4419 /* 4420 * Cleanly shutting down the QoS poller is tricky, because 4421 * during the asynchronous operation the user could open 4422 * a new descriptor and create a new channel, spawning 4423 * a new QoS poller. 4424 * 4425 * The strategy is to create a new QoS structure here and swap it 4426 * in. The shutdown path then continues to refer to the old one 4427 * until it completes and then releases it. 4428 */ 4429 struct spdk_bdev_qos *new_qos, *old_qos; 4430 4431 old_qos = bdev->internal.qos; 4432 4433 new_qos = calloc(1, sizeof(*new_qos)); 4434 if (!new_qos) { 4435 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4436 return -ENOMEM; 4437 } 4438 4439 /* Copy the old QoS data into the newly allocated structure */ 4440 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4441 4442 /* Zero out the key parts of the QoS structure */ 4443 new_qos->ch = NULL; 4444 new_qos->thread = NULL; 4445 new_qos->poller = NULL; 4446 /* 4447 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4448 * It will be used later for the new QoS structure. 
4449 */ 4450 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4451 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4452 new_qos->rate_limits[i].min_per_timeslice = 0; 4453 new_qos->rate_limits[i].max_per_timeslice = 0; 4454 } 4455 4456 bdev->internal.qos = new_qos; 4457 4458 if (old_qos->thread == NULL) { 4459 free(old_qos); 4460 } else { 4461 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4462 } 4463 4464 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4465 * been destroyed yet. The destruction path will end up waiting for the final 4466 * channel to be put before it releases resources. */ 4467 4468 return 0; 4469 } 4470 4471 void 4472 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4473 { 4474 total->bytes_read += add->bytes_read; 4475 total->num_read_ops += add->num_read_ops; 4476 total->bytes_written += add->bytes_written; 4477 total->num_write_ops += add->num_write_ops; 4478 total->bytes_unmapped += add->bytes_unmapped; 4479 total->num_unmap_ops += add->num_unmap_ops; 4480 total->bytes_copied += add->bytes_copied; 4481 total->num_copy_ops += add->num_copy_ops; 4482 total->read_latency_ticks += add->read_latency_ticks; 4483 total->write_latency_ticks += add->write_latency_ticks; 4484 total->unmap_latency_ticks += add->unmap_latency_ticks; 4485 total->copy_latency_ticks += add->copy_latency_ticks; 4486 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4487 total->max_read_latency_ticks = add->max_read_latency_ticks; 4488 } 4489 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4490 total->min_read_latency_ticks = add->min_read_latency_ticks; 4491 } 4492 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4493 total->max_write_latency_ticks = add->max_write_latency_ticks; 4494 } 4495 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4496 total->min_write_latency_ticks = add->min_write_latency_ticks; 4497 } 4498 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4499 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4500 } 4501 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4502 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4503 } 4504 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4505 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4506 } 4507 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4508 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4509 } 4510 } 4511 4512 static void 4513 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4514 { 4515 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4516 4517 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4518 memcpy(to_stat->io_error, from_stat->io_error, 4519 sizeof(struct spdk_bdev_io_error_stat)); 4520 } 4521 } 4522 4523 void 4524 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4525 { 4526 stat->max_read_latency_ticks = 0; 4527 stat->min_read_latency_ticks = UINT64_MAX; 4528 stat->max_write_latency_ticks = 0; 4529 stat->min_write_latency_ticks = UINT64_MAX; 4530 stat->max_unmap_latency_ticks = 0; 4531 stat->min_unmap_latency_ticks = UINT64_MAX; 4532 stat->max_copy_latency_ticks = 0; 4533 stat->min_copy_latency_ticks = UINT64_MAX; 4534 4535 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4536 return; 4537 } 
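	/* Reset modes other than SPDK_BDEV_RESET_STAT_ALL only clear the min/max latency
	 * fields above. A full reset additionally zeroes the byte and operation counters,
	 * the cumulative latency totals, and (when allocated) the per-status I/O error
	 * counts below.
	 */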
4538 4539 stat->bytes_read = 0; 4540 stat->num_read_ops = 0; 4541 stat->bytes_written = 0; 4542 stat->num_write_ops = 0; 4543 stat->bytes_unmapped = 0; 4544 stat->num_unmap_ops = 0; 4545 stat->bytes_copied = 0; 4546 stat->num_copy_ops = 0; 4547 stat->read_latency_ticks = 0; 4548 stat->write_latency_ticks = 0; 4549 stat->unmap_latency_ticks = 0; 4550 stat->copy_latency_ticks = 0; 4551 4552 if (stat->io_error != NULL) { 4553 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 4554 } 4555 } 4556 4557 struct spdk_bdev_io_stat * 4558 bdev_alloc_io_stat(bool io_error_stat) 4559 { 4560 struct spdk_bdev_io_stat *stat; 4561 4562 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4563 if (stat == NULL) { 4564 return NULL; 4565 } 4566 4567 if (io_error_stat) { 4568 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4569 if (stat->io_error == NULL) { 4570 free(stat); 4571 return NULL; 4572 } 4573 } else { 4574 stat->io_error = NULL; 4575 } 4576 4577 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4578 4579 return stat; 4580 } 4581 4582 void 4583 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4584 { 4585 if (stat != NULL) { 4586 free(stat->io_error); 4587 free(stat); 4588 } 4589 } 4590 4591 void 4592 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4593 { 4594 int i; 4595 4596 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4597 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4598 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4599 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4600 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4601 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4602 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4603 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4604 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4605 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4606 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4607 stat->min_read_latency_ticks != UINT64_MAX ? 4608 stat->min_read_latency_ticks : 0); 4609 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4610 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4611 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4612 stat->min_write_latency_ticks != UINT64_MAX ? 4613 stat->min_write_latency_ticks : 0); 4614 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4615 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4616 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4617 stat->min_unmap_latency_ticks != UINT64_MAX ? 4618 stat->min_unmap_latency_ticks : 0); 4619 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4620 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4621 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4622 stat->min_copy_latency_ticks != UINT64_MAX ? 
4623 stat->min_copy_latency_ticks : 0); 4624 4625 if (stat->io_error != NULL) { 4626 spdk_json_write_named_object_begin(w, "io_error"); 4627 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4628 if (stat->io_error->error_status[i] != 0) { 4629 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4630 stat->io_error->error_status[i]); 4631 } 4632 } 4633 spdk_json_write_object_end(w); 4634 } 4635 } 4636 4637 static void 4638 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4639 { 4640 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4641 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4642 4643 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4644 bdev_abort_all_buf_io(mgmt_ch, ch); 4645 } 4646 4647 static void 4648 bdev_channel_destroy(void *io_device, void *ctx_buf) 4649 { 4650 struct spdk_bdev_channel *ch = ctx_buf; 4651 4652 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4653 spdk_get_thread()); 4654 4655 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, ch->bdev->internal.trace_id, 0, 0, 4656 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4657 4658 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4659 spdk_spin_lock(&ch->bdev->internal.spinlock); 4660 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4661 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4662 4663 bdev_abort_all_queued_io(&ch->queued_resets, ch); 4664 4665 bdev_channel_abort_queued_ios(ch); 4666 4667 if (ch->histogram) { 4668 spdk_histogram_data_free(ch->histogram); 4669 } 4670 4671 bdev_channel_destroy_resource(ch); 4672 } 4673 4674 /* 4675 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4676 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
4677 */ 4678 static int 4679 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4680 { 4681 struct spdk_bdev_name *tmp; 4682 4683 bdev_name->name = strdup(name); 4684 if (bdev_name->name == NULL) { 4685 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4686 return -ENOMEM; 4687 } 4688 4689 bdev_name->bdev = bdev; 4690 4691 spdk_spin_lock(&g_bdev_mgr.spinlock); 4692 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4693 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4694 4695 if (tmp != NULL) { 4696 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4697 free(bdev_name->name); 4698 return -EEXIST; 4699 } 4700 4701 return 0; 4702 } 4703 4704 static void 4705 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4706 { 4707 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4708 free(bdev_name->name); 4709 } 4710 4711 static void 4712 bdev_name_del(struct spdk_bdev_name *bdev_name) 4713 { 4714 spdk_spin_lock(&g_bdev_mgr.spinlock); 4715 bdev_name_del_unsafe(bdev_name); 4716 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4717 } 4718 4719 int 4720 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4721 { 4722 struct spdk_bdev_alias *tmp; 4723 int ret; 4724 4725 if (alias == NULL) { 4726 SPDK_ERRLOG("Empty alias passed\n"); 4727 return -EINVAL; 4728 } 4729 4730 tmp = calloc(1, sizeof(*tmp)); 4731 if (tmp == NULL) { 4732 SPDK_ERRLOG("Unable to allocate alias\n"); 4733 return -ENOMEM; 4734 } 4735 4736 ret = bdev_name_add(&tmp->alias, bdev, alias); 4737 if (ret != 0) { 4738 free(tmp); 4739 return ret; 4740 } 4741 4742 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4743 4744 return 0; 4745 } 4746 4747 static int 4748 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4749 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4750 { 4751 struct spdk_bdev_alias *tmp; 4752 4753 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4754 if (strcmp(alias, tmp->alias.name) == 0) { 4755 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4756 alias_del_fn(&tmp->alias); 4757 free(tmp); 4758 return 0; 4759 } 4760 } 4761 4762 return -ENOENT; 4763 } 4764 4765 int 4766 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4767 { 4768 int rc; 4769 4770 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4771 if (rc == -ENOENT) { 4772 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4773 } 4774 4775 return rc; 4776 } 4777 4778 void 4779 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4780 { 4781 struct spdk_bdev_alias *p, *tmp; 4782 4783 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4784 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4785 bdev_name_del(&p->alias); 4786 free(p); 4787 } 4788 } 4789 4790 struct spdk_io_channel * 4791 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4792 { 4793 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4794 } 4795 4796 void * 4797 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4798 { 4799 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4800 void *ctx = NULL; 4801 4802 if (bdev->fn_table->get_module_ctx) { 4803 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4804 } 4805 4806 return ctx; 4807 } 4808 4809 const char * 4810 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4811 { 4812 return bdev->module->name; 4813 } 4814 4815 const char * 4816 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4817 { 4818 return bdev->name; 4819 } 4820 4821 const char * 4822 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4823 { 4824 return bdev->product_name; 4825 } 4826 4827 
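/*
 * A minimal usage sketch (illustrative only, not part of this file): a consumer that
 * wants to enumerate a bdev's registered aliases typically walks the list returned by
 * spdk_bdev_get_aliases() below, e.g.
 *
 *     struct spdk_bdev_alias *alias;
 *
 *     TAILQ_FOREACH(alias, spdk_bdev_get_aliases(bdev), tailq) {
 *         printf("alias: %s\n", alias->alias.name);
 *     }
 *
 * where "bdev" is assumed to be a valid struct spdk_bdev pointer obtained by the
 * caller (for example via spdk_bdev_desc_get_bdev()).
 */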
const struct spdk_bdev_aliases_list * 4828 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4829 { 4830 return &bdev->aliases; 4831 } 4832 4833 uint32_t 4834 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4835 { 4836 return bdev->blocklen; 4837 } 4838 4839 uint32_t 4840 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4841 { 4842 return bdev->write_unit_size; 4843 } 4844 4845 uint64_t 4846 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4847 { 4848 return bdev->blockcnt; 4849 } 4850 4851 const char * 4852 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4853 { 4854 return qos_rpc_type[type]; 4855 } 4856 4857 void 4858 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4859 { 4860 int i; 4861 4862 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4863 4864 spdk_spin_lock(&bdev->internal.spinlock); 4865 if (bdev->internal.qos) { 4866 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4867 if (bdev->internal.qos->rate_limits[i].limit != 4868 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4869 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4870 if (bdev_qos_is_iops_rate_limit(i) == false) { 4871 /* Change from Byte to Megabyte which is user visible. */ 4872 limits[i] = limits[i] / 1024 / 1024; 4873 } 4874 } 4875 } 4876 } 4877 spdk_spin_unlock(&bdev->internal.spinlock); 4878 } 4879 4880 size_t 4881 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4882 { 4883 return 1 << bdev->required_alignment; 4884 } 4885 4886 uint32_t 4887 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4888 { 4889 return bdev->optimal_io_boundary; 4890 } 4891 4892 bool 4893 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4894 { 4895 return bdev->write_cache; 4896 } 4897 4898 const struct spdk_uuid * 4899 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4900 { 4901 return &bdev->uuid; 4902 } 4903 4904 uint16_t 4905 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4906 { 4907 return bdev->acwu; 4908 } 4909 4910 uint32_t 4911 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4912 { 4913 return bdev->md_len; 4914 } 4915 4916 bool 4917 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4918 { 4919 return (bdev->md_len != 0) && bdev->md_interleave; 4920 } 4921 4922 bool 4923 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4924 { 4925 return (bdev->md_len != 0) && !bdev->md_interleave; 4926 } 4927 4928 bool 4929 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4930 { 4931 return bdev->zoned; 4932 } 4933 4934 uint32_t 4935 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4936 { 4937 if (spdk_bdev_is_md_interleaved(bdev)) { 4938 return bdev->blocklen - bdev->md_len; 4939 } else { 4940 return bdev->blocklen; 4941 } 4942 } 4943 4944 uint32_t 4945 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4946 { 4947 return bdev->phys_blocklen; 4948 } 4949 4950 static uint32_t 4951 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4952 { 4953 if (!spdk_bdev_is_md_interleaved(bdev)) { 4954 return bdev->blocklen + bdev->md_len; 4955 } else { 4956 return bdev->blocklen; 4957 } 4958 } 4959 4960 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4961 typedef enum spdk_dif_type spdk_dif_type_t; 4962 typedef enum spdk_dif_pi_format spdk_dif_pi_format_t; 4963 4964 spdk_dif_type_t 4965 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4966 { 4967 if (bdev->md_len != 0) { 4968 return bdev->dif_type; 4969 } else { 4970 return SPDK_DIF_DISABLE; 4971 } 4972 } 4973 4974 spdk_dif_pi_format_t 4975 spdk_bdev_get_dif_pi_format(const struct spdk_bdev *bdev) 4976 { 4977 return bdev->dif_pi_format; 4978 } 4979 4980 bool 4981 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4982 { 4983 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4984 return bdev->dif_is_head_of_md; 4985 } else { 4986 return false; 4987 } 4988 } 4989 4990 bool 4991 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4992 enum spdk_dif_check_type check_type) 4993 { 4994 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4995 return false; 4996 } 4997 4998 switch (check_type) { 4999 case SPDK_DIF_CHECK_TYPE_REFTAG: 5000 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 5001 case SPDK_DIF_CHECK_TYPE_APPTAG: 5002 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 5003 case SPDK_DIF_CHECK_TYPE_GUARD: 5004 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 5005 default: 5006 return false; 5007 } 5008 } 5009 5010 static uint32_t 5011 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 5012 { 5013 uint64_t aligned_length, max_write_blocks; 5014 5015 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 5016 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 5017 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 5018 5019 return max_write_blocks; 5020 } 5021 5022 uint32_t 5023 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 5024 { 5025 return bdev->max_copy; 5026 } 5027 5028 uint64_t 5029 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 5030 { 5031 return bdev->internal.measured_queue_depth; 5032 } 5033 5034 uint64_t 5035 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 5036 { 5037 return bdev->internal.period; 5038 } 5039 5040 uint64_t 5041 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 5042 { 5043 return bdev->internal.weighted_io_time; 5044 } 5045 5046 uint64_t 5047 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 5048 { 5049 return bdev->internal.io_time; 5050 } 5051 5052 union spdk_bdev_nvme_ctratt spdk_bdev_get_nvme_ctratt(struct spdk_bdev *bdev) 5053 { 5054 return bdev->ctratt; 5055 } 5056 5057 static void bdev_update_qd_sampling_period(void *ctx); 5058 5059 static void 5060 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 5061 { 5062 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 5063 5064 if (bdev->internal.measured_queue_depth) { 5065 bdev->internal.io_time += bdev->internal.period; 5066 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 5067 } 5068 5069 bdev->internal.qd_poll_in_progress = false; 5070 5071 bdev_update_qd_sampling_period(bdev); 5072 } 5073 5074 static void 5075 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5076 struct spdk_io_channel *io_ch, void *_ctx) 5077 { 5078 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 5079 5080 bdev->internal.temporary_queue_depth += ch->io_outstanding; 5081 spdk_bdev_for_each_channel_continue(i, 0); 5082 } 5083 5084 static int 5085 bdev_calculate_measured_queue_depth(void *ctx) 5086 { 5087 struct spdk_bdev *bdev 
= ctx; 5088 5089 bdev->internal.qd_poll_in_progress = true; 5090 bdev->internal.temporary_queue_depth = 0; 5091 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 5092 return SPDK_POLLER_BUSY; 5093 } 5094 5095 static void 5096 bdev_update_qd_sampling_period(void *ctx) 5097 { 5098 struct spdk_bdev *bdev = ctx; 5099 5100 if (bdev->internal.period == bdev->internal.new_period) { 5101 return; 5102 } 5103 5104 if (bdev->internal.qd_poll_in_progress) { 5105 return; 5106 } 5107 5108 bdev->internal.period = bdev->internal.new_period; 5109 5110 spdk_poller_unregister(&bdev->internal.qd_poller); 5111 if (bdev->internal.period != 0) { 5112 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5113 bdev, bdev->internal.period); 5114 } else { 5115 spdk_bdev_close(bdev->internal.qd_desc); 5116 bdev->internal.qd_desc = NULL; 5117 } 5118 } 5119 5120 static void 5121 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 5122 { 5123 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 5124 } 5125 5126 void 5127 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 5128 { 5129 int rc; 5130 5131 if (bdev->internal.new_period == period) { 5132 return; 5133 } 5134 5135 bdev->internal.new_period = period; 5136 5137 if (bdev->internal.qd_desc != NULL) { 5138 assert(bdev->internal.period != 0); 5139 5140 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 5141 bdev_update_qd_sampling_period, bdev); 5142 return; 5143 } 5144 5145 assert(bdev->internal.period == 0); 5146 5147 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 5148 NULL, &bdev->internal.qd_desc); 5149 if (rc != 0) { 5150 return; 5151 } 5152 5153 bdev->internal.period = period; 5154 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5155 bdev, period); 5156 } 5157 5158 struct bdev_get_current_qd_ctx { 5159 uint64_t current_qd; 5160 spdk_bdev_get_current_qd_cb cb_fn; 5161 void *cb_arg; 5162 }; 5163 5164 static void 5165 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 5166 { 5167 struct bdev_get_current_qd_ctx *ctx = _ctx; 5168 5169 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 5170 5171 free(ctx); 5172 } 5173 5174 static void 5175 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5176 struct spdk_io_channel *io_ch, void *_ctx) 5177 { 5178 struct bdev_get_current_qd_ctx *ctx = _ctx; 5179 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 5180 5181 ctx->current_qd += bdev_ch->io_outstanding; 5182 5183 spdk_bdev_for_each_channel_continue(i, 0); 5184 } 5185 5186 void 5187 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 5188 void *cb_arg) 5189 { 5190 struct bdev_get_current_qd_ctx *ctx; 5191 5192 assert(cb_fn != NULL); 5193 5194 ctx = calloc(1, sizeof(*ctx)); 5195 if (ctx == NULL) { 5196 cb_fn(bdev, 0, cb_arg, -ENOMEM); 5197 return; 5198 } 5199 5200 ctx->cb_fn = cb_fn; 5201 ctx->cb_arg = cb_arg; 5202 5203 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 5204 } 5205 5206 static void 5207 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 5208 { 5209 assert(desc->thread == spdk_get_thread()); 5210 5211 spdk_spin_lock(&desc->spinlock); 5212 desc->refs--; 5213 if (!desc->closed) { 5214 spdk_spin_unlock(&desc->spinlock); 5215 desc->callback.event_fn(type, 5216 desc->bdev, 5217 desc->callback.ctx); 5218 return; 5219 } 
else if (desc->refs == 0) { 5220 /* This descriptor was closed after this event_notify message was sent. 5221 * spdk_bdev_close() could not free the descriptor since this message was 5222 * in flight, so we free it now using bdev_desc_free(). 5223 */ 5224 spdk_spin_unlock(&desc->spinlock); 5225 bdev_desc_free(desc); 5226 return; 5227 } 5228 spdk_spin_unlock(&desc->spinlock); 5229 } 5230 5231 static void 5232 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 5233 { 5234 spdk_spin_lock(&desc->spinlock); 5235 desc->refs++; 5236 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 5237 spdk_spin_unlock(&desc->spinlock); 5238 } 5239 5240 static void 5241 _resize_notify(void *ctx) 5242 { 5243 struct spdk_bdev_desc *desc = ctx; 5244 5245 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 5246 } 5247 5248 int 5249 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 5250 { 5251 struct spdk_bdev_desc *desc; 5252 int ret; 5253 5254 if (size == bdev->blockcnt) { 5255 return 0; 5256 } 5257 5258 spdk_spin_lock(&bdev->internal.spinlock); 5259 5260 /* bdev has open descriptors */ 5261 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 5262 bdev->blockcnt > size) { 5263 ret = -EBUSY; 5264 } else { 5265 bdev->blockcnt = size; 5266 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 5267 event_notify(desc, _resize_notify); 5268 } 5269 ret = 0; 5270 } 5271 5272 spdk_spin_unlock(&bdev->internal.spinlock); 5273 5274 return ret; 5275 } 5276 5277 /* 5278 * Convert I/O offset and length from bytes to blocks. 5279 * 5280 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 5281 */ 5282 static uint64_t 5283 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 5284 uint64_t num_bytes, uint64_t *num_blocks) 5285 { 5286 uint32_t block_size = bdev->blocklen; 5287 uint8_t shift_cnt; 5288 5289 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
*/ 5290 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 5291 shift_cnt = spdk_u32log2(block_size); 5292 *offset_blocks = offset_bytes >> shift_cnt; 5293 *num_blocks = num_bytes >> shift_cnt; 5294 return (offset_bytes - (*offset_blocks << shift_cnt)) | 5295 (num_bytes - (*num_blocks << shift_cnt)); 5296 } else { 5297 *offset_blocks = offset_bytes / block_size; 5298 *num_blocks = num_bytes / block_size; 5299 return (offset_bytes % block_size) | (num_bytes % block_size); 5300 } 5301 } 5302 5303 static bool 5304 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 5305 { 5306 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 5307 * has been an overflow and hence the offset has been wrapped around */ 5308 if (offset_blocks + num_blocks < offset_blocks) { 5309 return false; 5310 } 5311 5312 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5313 if (offset_blocks + num_blocks > bdev->blockcnt) { 5314 return false; 5315 } 5316 5317 return true; 5318 } 5319 5320 static void 5321 bdev_seek_complete_cb(void *ctx) 5322 { 5323 struct spdk_bdev_io *bdev_io = ctx; 5324 5325 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5326 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5327 } 5328 5329 static int 5330 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5331 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5332 spdk_bdev_io_completion_cb cb, void *cb_arg) 5333 { 5334 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5335 struct spdk_bdev_io *bdev_io; 5336 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5337 5338 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5339 5340 /* Check if offset_blocks is valid looking at the validity of one block */ 5341 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5342 return -EINVAL; 5343 } 5344 5345 bdev_io = bdev_channel_get_io(channel); 5346 if (!bdev_io) { 5347 return -ENOMEM; 5348 } 5349 5350 bdev_io->internal.ch = channel; 5351 bdev_io->internal.desc = desc; 5352 bdev_io->type = io_type; 5353 bdev_io->u.bdev.offset_blocks = offset_blocks; 5354 bdev_io->u.bdev.memory_domain = NULL; 5355 bdev_io->u.bdev.memory_domain_ctx = NULL; 5356 bdev_io->u.bdev.accel_sequence = NULL; 5357 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5358 5359 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5360 /* In case bdev doesn't support seek to next data/hole offset, 5361 * it is assumed that only data and no holes are present */ 5362 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5363 bdev_io->u.bdev.seek.offset = offset_blocks; 5364 } else { 5365 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5366 } 5367 5368 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5369 return 0; 5370 } 5371 5372 bdev_io_submit(bdev_io); 5373 return 0; 5374 } 5375 5376 int 5377 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5378 uint64_t offset_blocks, 5379 spdk_bdev_io_completion_cb cb, void *cb_arg) 5380 { 5381 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5382 } 5383 5384 int 5385 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5386 uint64_t offset_blocks, 5387 spdk_bdev_io_completion_cb cb, void *cb_arg) 5388 { 5389 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5390 } 5391 5392 uint64_t 5393 spdk_bdev_io_get_seek_offset(const struct 
spdk_bdev_io *bdev_io) 5394 { 5395 return bdev_io->u.bdev.seek.offset; 5396 } 5397 5398 static int 5399 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5400 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5401 spdk_bdev_io_completion_cb cb, void *cb_arg) 5402 { 5403 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5404 struct spdk_bdev_io *bdev_io; 5405 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5406 5407 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5408 return -EINVAL; 5409 } 5410 5411 bdev_io = bdev_channel_get_io(channel); 5412 if (!bdev_io) { 5413 return -ENOMEM; 5414 } 5415 5416 bdev_io->internal.ch = channel; 5417 bdev_io->internal.desc = desc; 5418 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5419 bdev_io->u.bdev.iovs = &bdev_io->iov; 5420 bdev_io->u.bdev.iovs[0].iov_base = buf; 5421 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5422 bdev_io->u.bdev.iovcnt = 1; 5423 bdev_io->u.bdev.md_buf = md_buf; 5424 bdev_io->u.bdev.num_blocks = num_blocks; 5425 bdev_io->u.bdev.offset_blocks = offset_blocks; 5426 bdev_io->u.bdev.memory_domain = NULL; 5427 bdev_io->u.bdev.memory_domain_ctx = NULL; 5428 bdev_io->u.bdev.accel_sequence = NULL; 5429 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5430 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5431 5432 bdev_io_submit(bdev_io); 5433 return 0; 5434 } 5435 5436 int 5437 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5438 void *buf, uint64_t offset, uint64_t nbytes, 5439 spdk_bdev_io_completion_cb cb, void *cb_arg) 5440 { 5441 uint64_t offset_blocks, num_blocks; 5442 5443 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5444 nbytes, &num_blocks) != 0) { 5445 return -EINVAL; 5446 } 5447 5448 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5449 } 5450 5451 int 5452 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5453 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5454 spdk_bdev_io_completion_cb cb, void *cb_arg) 5455 { 5456 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5457 } 5458 5459 int 5460 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5461 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5462 spdk_bdev_io_completion_cb cb, void *cb_arg) 5463 { 5464 struct iovec iov = { 5465 .iov_base = buf, 5466 }; 5467 5468 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5469 return -EINVAL; 5470 } 5471 5472 if (md_buf && !_is_buf_allocated(&iov)) { 5473 return -EINVAL; 5474 } 5475 5476 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5477 cb, cb_arg); 5478 } 5479 5480 int 5481 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5482 struct iovec *iov, int iovcnt, 5483 uint64_t offset, uint64_t nbytes, 5484 spdk_bdev_io_completion_cb cb, void *cb_arg) 5485 { 5486 uint64_t offset_blocks, num_blocks; 5487 5488 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5489 nbytes, &num_blocks) != 0) { 5490 return -EINVAL; 5491 } 5492 5493 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5494 } 5495 5496 static int 5497 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5498 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 
5499 uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx, 5500 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 5501 spdk_bdev_io_completion_cb cb, void *cb_arg) 5502 { 5503 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5504 struct spdk_bdev_io *bdev_io; 5505 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5506 5507 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 5508 return -EINVAL; 5509 } 5510 5511 bdev_io = bdev_channel_get_io(channel); 5512 if (spdk_unlikely(!bdev_io)) { 5513 return -ENOMEM; 5514 } 5515 5516 bdev_io->internal.ch = channel; 5517 bdev_io->internal.desc = desc; 5518 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5519 bdev_io->u.bdev.iovs = iov; 5520 bdev_io->u.bdev.iovcnt = iovcnt; 5521 bdev_io->u.bdev.md_buf = md_buf; 5522 bdev_io->u.bdev.num_blocks = num_blocks; 5523 bdev_io->u.bdev.offset_blocks = offset_blocks; 5524 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5525 5526 if (seq != NULL) { 5527 bdev_io->internal.f.has_accel_sequence = true; 5528 bdev_io->internal.accel_sequence = seq; 5529 } 5530 5531 if (domain != NULL) { 5532 bdev_io->internal.f.has_memory_domain = true; 5533 bdev_io->internal.memory_domain = domain; 5534 bdev_io->internal.memory_domain_ctx = domain_ctx; 5535 } 5536 5537 bdev_io->u.bdev.memory_domain = domain; 5538 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5539 bdev_io->u.bdev.accel_sequence = seq; 5540 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 5541 5542 _bdev_io_submit_ext(desc, bdev_io); 5543 5544 return 0; 5545 } 5546 5547 int 5548 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5549 struct iovec *iov, int iovcnt, 5550 uint64_t offset_blocks, uint64_t num_blocks, 5551 spdk_bdev_io_completion_cb cb, void *cb_arg) 5552 { 5553 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5554 5555 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5556 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5557 } 5558 5559 int 5560 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5561 struct iovec *iov, int iovcnt, void *md_buf, 5562 uint64_t offset_blocks, uint64_t num_blocks, 5563 spdk_bdev_io_completion_cb cb, void *cb_arg) 5564 { 5565 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5566 5567 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 5568 return -EINVAL; 5569 } 5570 5571 if (md_buf && !_is_buf_allocated(iov)) { 5572 return -EINVAL; 5573 } 5574 5575 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5576 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5577 } 5578 5579 static inline bool 5580 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 5581 { 5582 /* 5583 * We check if opts size is at least of size when we first introduced 5584 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 5585 * are not checked internal. 
5586 */ 5587 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5588 sizeof(opts->metadata) && 5589 opts->size <= sizeof(*opts) && 5590 /* When memory domain is used, the user must provide data buffers */ 5591 (!opts->memory_domain || (iov && iov[0].iov_base)); 5592 } 5593 5594 int 5595 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5596 struct iovec *iov, int iovcnt, 5597 uint64_t offset_blocks, uint64_t num_blocks, 5598 spdk_bdev_io_completion_cb cb, void *cb_arg, 5599 struct spdk_bdev_ext_io_opts *opts) 5600 { 5601 struct spdk_memory_domain *domain = NULL; 5602 struct spdk_accel_sequence *seq = NULL; 5603 void *domain_ctx = NULL, *md = NULL; 5604 uint32_t dif_check_flags = 0; 5605 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5606 5607 if (opts) { 5608 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5609 return -EINVAL; 5610 } 5611 5612 md = opts->metadata; 5613 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5614 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5615 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5616 if (md) { 5617 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5618 return -EINVAL; 5619 } 5620 5621 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5622 return -EINVAL; 5623 } 5624 5625 if (spdk_unlikely(seq != NULL)) { 5626 return -EINVAL; 5627 } 5628 } 5629 } 5630 5631 dif_check_flags = bdev->dif_check_flags & 5632 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5633 5634 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5635 num_blocks, domain, domain_ctx, seq, dif_check_flags, cb, cb_arg); 5636 } 5637 5638 static int 5639 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5640 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5641 spdk_bdev_io_completion_cb cb, void *cb_arg) 5642 { 5643 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5644 struct spdk_bdev_io *bdev_io; 5645 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5646 5647 if (!desc->write) { 5648 return -EBADF; 5649 } 5650 5651 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5652 return -EINVAL; 5653 } 5654 5655 bdev_io = bdev_channel_get_io(channel); 5656 if (!bdev_io) { 5657 return -ENOMEM; 5658 } 5659 5660 bdev_io->internal.ch = channel; 5661 bdev_io->internal.desc = desc; 5662 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5663 bdev_io->u.bdev.iovs = &bdev_io->iov; 5664 bdev_io->u.bdev.iovs[0].iov_base = buf; 5665 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5666 bdev_io->u.bdev.iovcnt = 1; 5667 bdev_io->u.bdev.md_buf = md_buf; 5668 bdev_io->u.bdev.num_blocks = num_blocks; 5669 bdev_io->u.bdev.offset_blocks = offset_blocks; 5670 bdev_io->u.bdev.memory_domain = NULL; 5671 bdev_io->u.bdev.memory_domain_ctx = NULL; 5672 bdev_io->u.bdev.accel_sequence = NULL; 5673 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5674 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5675 5676 bdev_io_submit(bdev_io); 5677 return 0; 5678 } 5679 5680 int 5681 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5682 void *buf, uint64_t offset, uint64_t nbytes, 5683 spdk_bdev_io_completion_cb cb, void *cb_arg) 5684 { 5685 uint64_t offset_blocks, num_blocks; 5686 5687 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5688 nbytes, &num_blocks) != 0) { 5689 return -EINVAL; 5690 } 5691 5692 return spdk_bdev_write_blocks(desc, ch, buf, 
offset_blocks, num_blocks, cb, cb_arg); 5693 } 5694 5695 int 5696 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5697 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5698 spdk_bdev_io_completion_cb cb, void *cb_arg) 5699 { 5700 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5701 cb, cb_arg); 5702 } 5703 5704 int 5705 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5706 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5707 spdk_bdev_io_completion_cb cb, void *cb_arg) 5708 { 5709 struct iovec iov = { 5710 .iov_base = buf, 5711 }; 5712 5713 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5714 return -EINVAL; 5715 } 5716 5717 if (md_buf && !_is_buf_allocated(&iov)) { 5718 return -EINVAL; 5719 } 5720 5721 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5722 cb, cb_arg); 5723 } 5724 5725 static int 5726 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5727 struct iovec *iov, int iovcnt, void *md_buf, 5728 uint64_t offset_blocks, uint64_t num_blocks, 5729 struct spdk_memory_domain *domain, void *domain_ctx, 5730 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 5731 uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw, 5732 spdk_bdev_io_completion_cb cb, void *cb_arg) 5733 { 5734 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5735 struct spdk_bdev_io *bdev_io; 5736 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5737 5738 if (spdk_unlikely(!desc->write)) { 5739 return -EBADF; 5740 } 5741 5742 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 5743 return -EINVAL; 5744 } 5745 5746 bdev_io = bdev_channel_get_io(channel); 5747 if (spdk_unlikely(!bdev_io)) { 5748 return -ENOMEM; 5749 } 5750 5751 bdev_io->internal.ch = channel; 5752 bdev_io->internal.desc = desc; 5753 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5754 bdev_io->u.bdev.iovs = iov; 5755 bdev_io->u.bdev.iovcnt = iovcnt; 5756 bdev_io->u.bdev.md_buf = md_buf; 5757 bdev_io->u.bdev.num_blocks = num_blocks; 5758 bdev_io->u.bdev.offset_blocks = offset_blocks; 5759 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5760 if (seq != NULL) { 5761 bdev_io->internal.f.has_accel_sequence = true; 5762 bdev_io->internal.accel_sequence = seq; 5763 } 5764 5765 if (domain != NULL) { 5766 bdev_io->internal.f.has_memory_domain = true; 5767 bdev_io->internal.memory_domain = domain; 5768 bdev_io->internal.memory_domain_ctx = domain_ctx; 5769 } 5770 5771 bdev_io->u.bdev.memory_domain = domain; 5772 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5773 bdev_io->u.bdev.accel_sequence = seq; 5774 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 5775 bdev_io->u.bdev.nvme_cdw12.raw = nvme_cdw12_raw; 5776 bdev_io->u.bdev.nvme_cdw13.raw = nvme_cdw13_raw; 5777 5778 _bdev_io_submit_ext(desc, bdev_io); 5779 5780 return 0; 5781 } 5782 5783 int 5784 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5785 struct iovec *iov, int iovcnt, 5786 uint64_t offset, uint64_t len, 5787 spdk_bdev_io_completion_cb cb, void *cb_arg) 5788 { 5789 uint64_t offset_blocks, num_blocks; 5790 5791 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5792 len, &num_blocks) != 0) { 5793 return -EINVAL; 5794 } 5795 5796 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5797 } 5798 5799 int 5800 spdk_bdev_writev_blocks(struct spdk_bdev_desc 
*desc, struct spdk_io_channel *ch, 5801 struct iovec *iov, int iovcnt, 5802 uint64_t offset_blocks, uint64_t num_blocks, 5803 spdk_bdev_io_completion_cb cb, void *cb_arg) 5804 { 5805 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5806 5807 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5808 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 5809 cb, cb_arg); 5810 } 5811 5812 int 5813 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5814 struct iovec *iov, int iovcnt, void *md_buf, 5815 uint64_t offset_blocks, uint64_t num_blocks, 5816 spdk_bdev_io_completion_cb cb, void *cb_arg) 5817 { 5818 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5819 5820 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 5821 return -EINVAL; 5822 } 5823 5824 if (md_buf && !_is_buf_allocated(iov)) { 5825 return -EINVAL; 5826 } 5827 5828 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5829 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 5830 cb, cb_arg); 5831 } 5832 5833 int 5834 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5835 struct iovec *iov, int iovcnt, 5836 uint64_t offset_blocks, uint64_t num_blocks, 5837 spdk_bdev_io_completion_cb cb, void *cb_arg, 5838 struct spdk_bdev_ext_io_opts *opts) 5839 { 5840 struct spdk_memory_domain *domain = NULL; 5841 struct spdk_accel_sequence *seq = NULL; 5842 void *domain_ctx = NULL, *md = NULL; 5843 uint32_t dif_check_flags = 0; 5844 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5845 uint32_t nvme_cdw12_raw = 0; 5846 uint32_t nvme_cdw13_raw = 0; 5847 5848 if (opts) { 5849 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5850 return -EINVAL; 5851 } 5852 md = opts->metadata; 5853 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5854 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5855 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5856 nvme_cdw12_raw = bdev_get_ext_io_opt(opts, nvme_cdw12.raw, 0); 5857 nvme_cdw13_raw = bdev_get_ext_io_opt(opts, nvme_cdw13.raw, 0); 5858 if (md) { 5859 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5860 return -EINVAL; 5861 } 5862 5863 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5864 return -EINVAL; 5865 } 5866 5867 if (spdk_unlikely(seq != NULL)) { 5868 return -EINVAL; 5869 } 5870 } 5871 } 5872 5873 dif_check_flags = bdev->dif_check_flags & 5874 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5875 5876 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5877 domain, domain_ctx, seq, dif_check_flags, 5878 nvme_cdw12_raw, nvme_cdw13_raw, cb, cb_arg); 5879 } 5880 5881 static void 5882 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5883 { 5884 struct spdk_bdev_io *parent_io = cb_arg; 5885 struct spdk_bdev *bdev = parent_io->bdev; 5886 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5887 int i, rc = 0; 5888 5889 if (!success) { 5890 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5891 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5892 spdk_bdev_free_io(bdev_io); 5893 return; 5894 } 5895 5896 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5897 rc = memcmp(read_buf, 5898 parent_io->u.bdev.iovs[i].iov_base, 5899 parent_io->u.bdev.iovs[i].iov_len); 5900 if (rc) { 5901 break; 5902 } 5903 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5904 } 5905 5906 if (rc == 0 && parent_io->u.bdev.md_buf && 
spdk_bdev_is_md_separate(bdev)) { 5907 rc = memcmp(bdev_io->u.bdev.md_buf, 5908 parent_io->u.bdev.md_buf, 5909 spdk_bdev_get_md_size(bdev)); 5910 } 5911 5912 spdk_bdev_free_io(bdev_io); 5913 5914 if (rc == 0) { 5915 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5916 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5917 } else { 5918 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5919 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5920 } 5921 } 5922 5923 static void 5924 bdev_compare_do_read(void *_bdev_io) 5925 { 5926 struct spdk_bdev_io *bdev_io = _bdev_io; 5927 int rc; 5928 5929 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5930 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5931 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5932 bdev_compare_do_read_done, bdev_io); 5933 5934 if (rc == -ENOMEM) { 5935 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5936 } else if (rc != 0) { 5937 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5938 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5939 } 5940 } 5941 5942 static int 5943 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5944 struct iovec *iov, int iovcnt, void *md_buf, 5945 uint64_t offset_blocks, uint64_t num_blocks, 5946 spdk_bdev_io_completion_cb cb, void *cb_arg) 5947 { 5948 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5949 struct spdk_bdev_io *bdev_io; 5950 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5951 5952 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5953 return -EINVAL; 5954 } 5955 5956 bdev_io = bdev_channel_get_io(channel); 5957 if (!bdev_io) { 5958 return -ENOMEM; 5959 } 5960 5961 bdev_io->internal.ch = channel; 5962 bdev_io->internal.desc = desc; 5963 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5964 bdev_io->u.bdev.iovs = iov; 5965 bdev_io->u.bdev.iovcnt = iovcnt; 5966 bdev_io->u.bdev.md_buf = md_buf; 5967 bdev_io->u.bdev.num_blocks = num_blocks; 5968 bdev_io->u.bdev.offset_blocks = offset_blocks; 5969 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5970 bdev_io->u.bdev.memory_domain = NULL; 5971 bdev_io->u.bdev.memory_domain_ctx = NULL; 5972 bdev_io->u.bdev.accel_sequence = NULL; 5973 5974 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5975 bdev_io_submit(bdev_io); 5976 return 0; 5977 } 5978 5979 bdev_compare_do_read(bdev_io); 5980 5981 return 0; 5982 } 5983 5984 int 5985 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5986 struct iovec *iov, int iovcnt, 5987 uint64_t offset_blocks, uint64_t num_blocks, 5988 spdk_bdev_io_completion_cb cb, void *cb_arg) 5989 { 5990 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5991 num_blocks, cb, cb_arg); 5992 } 5993 5994 int 5995 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5996 struct iovec *iov, int iovcnt, void *md_buf, 5997 uint64_t offset_blocks, uint64_t num_blocks, 5998 spdk_bdev_io_completion_cb cb, void *cb_arg) 5999 { 6000 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 6001 return -EINVAL; 6002 } 6003 6004 if (md_buf && !_is_buf_allocated(iov)) { 6005 return -EINVAL; 6006 } 6007 6008 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 6009 num_blocks, cb, cb_arg); 6010 } 6011 6012 static int 6013 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6014 void 
*buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 6015 spdk_bdev_io_completion_cb cb, void *cb_arg) 6016 { 6017 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6018 struct spdk_bdev_io *bdev_io; 6019 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6020 6021 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6022 return -EINVAL; 6023 } 6024 6025 bdev_io = bdev_channel_get_io(channel); 6026 if (!bdev_io) { 6027 return -ENOMEM; 6028 } 6029 6030 bdev_io->internal.ch = channel; 6031 bdev_io->internal.desc = desc; 6032 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 6033 bdev_io->u.bdev.iovs = &bdev_io->iov; 6034 bdev_io->u.bdev.iovs[0].iov_base = buf; 6035 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 6036 bdev_io->u.bdev.iovcnt = 1; 6037 bdev_io->u.bdev.md_buf = md_buf; 6038 bdev_io->u.bdev.num_blocks = num_blocks; 6039 bdev_io->u.bdev.offset_blocks = offset_blocks; 6040 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6041 bdev_io->u.bdev.memory_domain = NULL; 6042 bdev_io->u.bdev.memory_domain_ctx = NULL; 6043 bdev_io->u.bdev.accel_sequence = NULL; 6044 6045 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 6046 bdev_io_submit(bdev_io); 6047 return 0; 6048 } 6049 6050 bdev_compare_do_read(bdev_io); 6051 6052 return 0; 6053 } 6054 6055 int 6056 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6057 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 6058 spdk_bdev_io_completion_cb cb, void *cb_arg) 6059 { 6060 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 6061 cb, cb_arg); 6062 } 6063 6064 int 6065 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6066 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 6067 spdk_bdev_io_completion_cb cb, void *cb_arg) 6068 { 6069 struct iovec iov = { 6070 .iov_base = buf, 6071 }; 6072 6073 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 6074 return -EINVAL; 6075 } 6076 6077 if (md_buf && !_is_buf_allocated(&iov)) { 6078 return -EINVAL; 6079 } 6080 6081 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 6082 cb, cb_arg); 6083 } 6084 6085 static void 6086 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status) 6087 { 6088 struct spdk_bdev_io *bdev_io = ctx; 6089 6090 if (unlock_status) { 6091 SPDK_ERRLOG("LBA range unlock failed\n"); 6092 } 6093 6094 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 6095 false, bdev_io->internal.caller_ctx); 6096 } 6097 6098 static void 6099 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 6100 { 6101 bdev_io->internal.status = status; 6102 6103 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 6104 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6105 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 6106 } 6107 6108 static void 6109 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6110 { 6111 struct spdk_bdev_io *parent_io = cb_arg; 6112 6113 if (!success) { 6114 SPDK_ERRLOG("Compare and write operation failed\n"); 6115 } 6116 6117 spdk_bdev_free_io(bdev_io); 6118 6119 bdev_comparev_and_writev_blocks_unlock(parent_io, 6120 success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 6121 } 6122 6123 static void 6124 bdev_compare_and_write_do_write(void *_bdev_io) 6125 { 6126 struct spdk_bdev_io *bdev_io = _bdev_io; 6127 int rc; 6128 6129 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 6130 spdk_io_channel_from_ctx(bdev_io->internal.ch), 6131 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 6132 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6133 bdev_compare_and_write_do_write_done, bdev_io); 6134 6135 6136 if (rc == -ENOMEM) { 6137 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 6138 } else if (rc != 0) { 6139 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6140 } 6141 } 6142 6143 static void 6144 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6145 { 6146 struct spdk_bdev_io *parent_io = cb_arg; 6147 6148 spdk_bdev_free_io(bdev_io); 6149 6150 if (!success) { 6151 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 6152 return; 6153 } 6154 6155 bdev_compare_and_write_do_write(parent_io); 6156 } 6157 6158 static void 6159 bdev_compare_and_write_do_compare(void *_bdev_io) 6160 { 6161 struct spdk_bdev_io *bdev_io = _bdev_io; 6162 int rc; 6163 6164 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 6165 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 6166 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6167 bdev_compare_and_write_do_compare_done, bdev_io); 6168 6169 if (rc == -ENOMEM) { 6170 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 6171 } else if (rc != 0) { 6172 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 6173 } 6174 } 6175 6176 static void 6177 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status) 6178 { 6179 struct spdk_bdev_io *bdev_io = ctx; 6180 6181 if (status) { 6182 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 6183 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6184 return; 6185 } 6186 6187 bdev_compare_and_write_do_compare(bdev_io); 6188 } 6189 6190 int 6191 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6192 struct iovec *compare_iov, int compare_iovcnt, 6193 struct iovec *write_iov, int write_iovcnt, 6194 uint64_t offset_blocks, uint64_t num_blocks, 6195 spdk_bdev_io_completion_cb cb, void *cb_arg) 6196 { 6197 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6198 struct spdk_bdev_io *bdev_io; 6199 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6200 6201 if (!desc->write) { 6202 return -EBADF; 6203 } 6204 6205 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6206 return -EINVAL; 6207 } 6208 6209 if (num_blocks > bdev->acwu) { 6210 return -EINVAL; 6211 } 6212 6213 bdev_io = bdev_channel_get_io(channel); 6214 if (!bdev_io) { 6215 return -ENOMEM; 6216 } 6217 6218 bdev_io->internal.ch = channel; 6219 bdev_io->internal.desc = desc; 6220 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 6221 bdev_io->u.bdev.iovs = compare_iov; 6222 bdev_io->u.bdev.iovcnt = compare_iovcnt; 6223 bdev_io->u.bdev.fused_iovs = write_iov; 6224 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 6225 bdev_io->u.bdev.md_buf = NULL; 6226 bdev_io->u.bdev.num_blocks = num_blocks; 6227 bdev_io->u.bdev.offset_blocks = offset_blocks; 6228 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6229 
bdev_io->u.bdev.memory_domain = NULL; 6230 bdev_io->u.bdev.memory_domain_ctx = NULL; 6231 bdev_io->u.bdev.accel_sequence = NULL; 6232 6233 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 6234 bdev_io_submit(bdev_io); 6235 return 0; 6236 } 6237 6238 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 6239 bdev_comparev_and_writev_blocks_locked, bdev_io); 6240 } 6241 6242 int 6243 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6244 struct iovec *iov, int iovcnt, 6245 uint64_t offset_blocks, uint64_t num_blocks, 6246 bool populate, 6247 spdk_bdev_io_completion_cb cb, void *cb_arg) 6248 { 6249 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6250 struct spdk_bdev_io *bdev_io; 6251 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6252 6253 if (!desc->write) { 6254 return -EBADF; 6255 } 6256 6257 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6258 return -EINVAL; 6259 } 6260 6261 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 6262 return -ENOTSUP; 6263 } 6264 6265 bdev_io = bdev_channel_get_io(channel); 6266 if (!bdev_io) { 6267 return -ENOMEM; 6268 } 6269 6270 bdev_io->internal.ch = channel; 6271 bdev_io->internal.desc = desc; 6272 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 6273 bdev_io->u.bdev.num_blocks = num_blocks; 6274 bdev_io->u.bdev.offset_blocks = offset_blocks; 6275 bdev_io->u.bdev.iovs = iov; 6276 bdev_io->u.bdev.iovcnt = iovcnt; 6277 bdev_io->u.bdev.md_buf = NULL; 6278 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 6279 bdev_io->u.bdev.zcopy.commit = 0; 6280 bdev_io->u.bdev.zcopy.start = 1; 6281 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6282 bdev_io->u.bdev.memory_domain = NULL; 6283 bdev_io->u.bdev.memory_domain_ctx = NULL; 6284 bdev_io->u.bdev.accel_sequence = NULL; 6285 6286 bdev_io_submit(bdev_io); 6287 6288 return 0; 6289 } 6290 6291 int 6292 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 6293 spdk_bdev_io_completion_cb cb, void *cb_arg) 6294 { 6295 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 6296 return -EINVAL; 6297 } 6298 6299 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0;
6300 bdev_io->u.bdev.zcopy.start = 0;
6301 bdev_io->internal.caller_ctx = cb_arg;
6302 bdev_io->internal.cb = cb;
6303 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
6304
6305 bdev_io_submit(bdev_io);
6306
6307 return 0;
6308 }
6309
6310 int
6311 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6312 uint64_t offset, uint64_t len,
6313 spdk_bdev_io_completion_cb cb, void *cb_arg)
6314 {
6315 uint64_t offset_blocks, num_blocks;
6316
6317 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
6318 len, &num_blocks) != 0) {
6319 return -EINVAL;
6320 }
6321
6322 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
6323 }
6324
6325 int
6326 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6327 uint64_t offset_blocks, uint64_t num_blocks,
6328 spdk_bdev_io_completion_cb cb, void *cb_arg)
6329 {
6330 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6331 struct spdk_bdev_io *bdev_io;
6332 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6333
6334 if (!desc->write) {
6335 return -EBADF;
6336 }
6337
6338 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6339 return -EINVAL;
6340 }
6341
6342 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) &&
6343 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) {
6344 return -ENOTSUP;
6345 }
6346
6347 bdev_io = bdev_channel_get_io(channel);
6348
6349 if (!bdev_io) {
6350 return -ENOMEM;
6351 }
6352
6353 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
6354 bdev_io->internal.ch = channel;
6355 bdev_io->internal.desc = desc;
6356 bdev_io->u.bdev.offset_blocks = offset_blocks;
6357 bdev_io->u.bdev.num_blocks = num_blocks;
6358 bdev_io_init(bdev_io, bdev, cb_arg, cb);
6359 bdev_io->u.bdev.memory_domain = NULL;
6360 bdev_io->u.bdev.memory_domain_ctx = NULL;
6361 bdev_io->u.bdev.accel_sequence = NULL;
6362
6363 /* If the write_zeroes size is large and should be split, use the generic split
6364 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported or not.
6365 *
6366 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported
6367 * or emulate it using regular write requests otherwise.
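*
* From the caller's point of view this fallback is transparent. A minimal,
* hypothetical sketch (names and error handling are illustrative only):
*
*   rc = spdk_bdev_write_zeroes_blocks(desc, ch, 0, num_blocks,
*                                      write_zeroes_done, ctx);
*   if (rc == -ENOMEM) {
*       // retry later, e.g. via spdk_bdev_queue_io_wait()
*   }
*
* Note that -ENOTSUP is returned above only when the module supports
* neither WRITE_ZEROES nor plain WRITE.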
6368 */ 6369 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 6370 bdev_io->internal.f.split) { 6371 bdev_io_submit(bdev_io); 6372 return 0; 6373 } 6374 6375 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6376 6377 return bdev_write_zero_buffer(bdev_io); 6378 } 6379 6380 int 6381 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6382 uint64_t offset, uint64_t nbytes, 6383 spdk_bdev_io_completion_cb cb, void *cb_arg) 6384 { 6385 uint64_t offset_blocks, num_blocks; 6386 6387 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6388 nbytes, &num_blocks) != 0) { 6389 return -EINVAL; 6390 } 6391 6392 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6393 } 6394 6395 static void 6396 bdev_io_complete_cb(void *ctx) 6397 { 6398 struct spdk_bdev_io *bdev_io = ctx; 6399 6400 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6401 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 6402 } 6403 6404 int 6405 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6406 uint64_t offset_blocks, uint64_t num_blocks, 6407 spdk_bdev_io_completion_cb cb, void *cb_arg) 6408 { 6409 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6410 struct spdk_bdev_io *bdev_io; 6411 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6412 6413 if (!desc->write) { 6414 return -EBADF; 6415 } 6416 6417 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6418 return -EINVAL; 6419 } 6420 6421 bdev_io = bdev_channel_get_io(channel); 6422 if (!bdev_io) { 6423 return -ENOMEM; 6424 } 6425 6426 bdev_io->internal.ch = channel; 6427 bdev_io->internal.desc = desc; 6428 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6429 6430 bdev_io->u.bdev.iovs = &bdev_io->iov; 6431 bdev_io->u.bdev.iovs[0].iov_base = NULL; 6432 bdev_io->u.bdev.iovs[0].iov_len = 0; 6433 bdev_io->u.bdev.iovcnt = 1; 6434 6435 bdev_io->u.bdev.offset_blocks = offset_blocks; 6436 bdev_io->u.bdev.num_blocks = num_blocks; 6437 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6438 bdev_io->u.bdev.memory_domain = NULL; 6439 bdev_io->u.bdev.memory_domain_ctx = NULL; 6440 bdev_io->u.bdev.accel_sequence = NULL; 6441 6442 if (num_blocks == 0) { 6443 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 6444 return 0; 6445 } 6446 6447 bdev_io_submit(bdev_io); 6448 return 0; 6449 } 6450 6451 int 6452 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6453 uint64_t offset, uint64_t length, 6454 spdk_bdev_io_completion_cb cb, void *cb_arg) 6455 { 6456 uint64_t offset_blocks, num_blocks; 6457 6458 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6459 length, &num_blocks) != 0) { 6460 return -EINVAL; 6461 } 6462 6463 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6464 } 6465 6466 int 6467 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6468 uint64_t offset_blocks, uint64_t num_blocks, 6469 spdk_bdev_io_completion_cb cb, void *cb_arg) 6470 { 6471 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6472 struct spdk_bdev_io *bdev_io; 6473 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6474 6475 if (!desc->write) { 6476 return -EBADF; 6477 } 6478 6479 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6480 return -EINVAL; 6481 } 6482 6483 bdev_io = bdev_channel_get_io(channel); 6484 if (!bdev_io) { 6485 return -ENOMEM; 6486 } 6487 6488 bdev_io->internal.ch = 
channel; 6489 bdev_io->internal.desc = desc; 6490 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 6491 bdev_io->u.bdev.iovs = NULL; 6492 bdev_io->u.bdev.iovcnt = 0; 6493 bdev_io->u.bdev.offset_blocks = offset_blocks; 6494 bdev_io->u.bdev.num_blocks = num_blocks; 6495 bdev_io->u.bdev.memory_domain = NULL; 6496 bdev_io->u.bdev.memory_domain_ctx = NULL; 6497 bdev_io->u.bdev.accel_sequence = NULL; 6498 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6499 6500 bdev_io_submit(bdev_io); 6501 return 0; 6502 } 6503 6504 static int bdev_reset_poll_for_outstanding_io(void *ctx); 6505 6506 static void 6507 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 6508 { 6509 struct spdk_bdev_channel *ch = _ctx; 6510 struct spdk_bdev_io *bdev_io; 6511 6512 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6513 6514 if (status == -EBUSY) { 6515 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 6516 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 6517 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 6518 } else { 6519 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6520 6521 if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) { 6522 /* If outstanding IOs are still present and reset_io_drain_timeout 6523 * seconds passed, start the reset. */ 6524 bdev_io_submit_reset(bdev_io); 6525 } else { 6526 /* We still have in progress memory domain pull/push or we're 6527 * executing accel sequence. Since we cannot abort either of those 6528 * operations, fail the reset request. */ 6529 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6530 } 6531 } 6532 } else { 6533 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6534 SPDK_DEBUGLOG(bdev, 6535 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 6536 ch->bdev->name); 6537 /* Mark the completion status as a SUCCESS and complete the reset. */ 6538 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 6539 } 6540 } 6541 6542 static void 6543 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6544 struct spdk_io_channel *io_ch, void *_ctx) 6545 { 6546 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 6547 int status = 0; 6548 6549 if (cur_ch->io_outstanding > 0 || 6550 !TAILQ_EMPTY(&cur_ch->io_memory_domain) || 6551 !TAILQ_EMPTY(&cur_ch->io_accel_exec)) { 6552 /* If a channel has outstanding IO, set status to -EBUSY code. This will stop 6553 * further iteration over the rest of the channels and pass non-zero status 6554 * to the callback function. 
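* The non-zero status short-circuits the iteration over the remaining
* channels and is then handled in bdev_reset_check_outstanding_io_done()
* above, which either re-arms the poller or submits/fails the reset.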
*/ 6555 status = -EBUSY; 6556 } 6557 spdk_bdev_for_each_channel_continue(i, status); 6558 } 6559 6560 static int 6561 bdev_reset_poll_for_outstanding_io(void *ctx) 6562 { 6563 struct spdk_bdev_channel *ch = ctx; 6564 struct spdk_bdev_io *bdev_io; 6565 6566 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6567 6568 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6569 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6570 bdev_reset_check_outstanding_io_done); 6571 6572 return SPDK_POLLER_BUSY; 6573 } 6574 6575 static void 6576 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6577 { 6578 struct spdk_bdev_channel *ch = _ctx; 6579 struct spdk_bdev_io *bdev_io; 6580 6581 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6582 6583 if (bdev->reset_io_drain_timeout == 0) { 6584 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6585 6586 bdev_io_submit_reset(bdev_io); 6587 return; 6588 } 6589 6590 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6591 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6592 6593 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6594 * submit the reset to the underlying module only if outstanding I/O 6595 * remain after reset_io_drain_timeout seconds have passed. */ 6596 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6597 bdev_reset_check_outstanding_io_done); 6598 } 6599 6600 static void 6601 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6602 struct spdk_io_channel *ch, void *_ctx) 6603 { 6604 struct spdk_bdev_channel *channel; 6605 struct spdk_bdev_mgmt_channel *mgmt_channel; 6606 struct spdk_bdev_shared_resource *shared_resource; 6607 bdev_io_tailq_t tmp_queued; 6608 6609 TAILQ_INIT(&tmp_queued); 6610 6611 channel = __io_ch_to_bdev_ch(ch); 6612 shared_resource = channel->shared_resource; 6613 mgmt_channel = shared_resource->mgmt_ch; 6614 6615 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6616 6617 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6618 TAILQ_SWAP(&channel->qos_queued_io, &tmp_queued, spdk_bdev_io, internal.link); 6619 } 6620 6621 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6622 bdev_abort_all_buf_io(mgmt_channel, channel); 6623 bdev_abort_all_queued_io(&tmp_queued, channel); 6624 6625 spdk_bdev_for_each_channel_continue(i, 0); 6626 } 6627 6628 static void 6629 bdev_start_reset(void *ctx) 6630 { 6631 struct spdk_bdev_channel *ch = ctx; 6632 6633 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, 6634 bdev_reset_freeze_channel_done); 6635 } 6636 6637 static void 6638 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 6639 { 6640 struct spdk_bdev *bdev = ch->bdev; 6641 6642 assert(!TAILQ_EMPTY(&ch->queued_resets)); 6643 6644 spdk_spin_lock(&bdev->internal.spinlock); 6645 if (bdev->internal.reset_in_progress == NULL) { 6646 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 6647 /* 6648 * Take a channel reference for the target bdev for the life of this 6649 * reset. This guards against the channel getting destroyed while 6650 * spdk_bdev_for_each_channel() calls related to this reset IO are in 6651 * progress. We will release the reference when this reset is 6652 * completed. 
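* The matching spdk_put_io_channel() is done in bdev_reset_complete(), once
* every channel has been unfrozen.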
6653 */ 6654 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6655 bdev_start_reset(ch); 6656 } 6657 spdk_spin_unlock(&bdev->internal.spinlock); 6658 } 6659 6660 int 6661 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6662 spdk_bdev_io_completion_cb cb, void *cb_arg) 6663 { 6664 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6665 struct spdk_bdev_io *bdev_io; 6666 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6667 6668 bdev_io = bdev_channel_get_io(channel); 6669 if (!bdev_io) { 6670 return -ENOMEM; 6671 } 6672 6673 bdev_io->internal.ch = channel; 6674 bdev_io->internal.desc = desc; 6675 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6676 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6677 bdev_io->u.reset.ch_ref = NULL; 6678 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6679 6680 spdk_spin_lock(&bdev->internal.spinlock); 6681 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 6682 spdk_spin_unlock(&bdev->internal.spinlock); 6683 6684 bdev_ch_add_to_io_submitted(bdev_io); 6685 6686 bdev_channel_start_reset(channel); 6687 6688 return 0; 6689 } 6690 6691 void 6692 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6693 struct spdk_bdev_io_stat *stat) 6694 { 6695 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6696 6697 bdev_get_io_stat(stat, channel->stat); 6698 } 6699 6700 static void 6701 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6702 { 6703 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6704 6705 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6706 bdev_iostat_ctx->cb_arg, 0); 6707 free(bdev_iostat_ctx); 6708 } 6709 6710 static void 6711 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6712 struct spdk_io_channel *ch, void *_ctx) 6713 { 6714 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6715 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6716 6717 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6718 spdk_bdev_for_each_channel_continue(i, 0); 6719 } 6720 6721 void 6722 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6723 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6724 { 6725 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6726 6727 assert(bdev != NULL); 6728 assert(stat != NULL); 6729 assert(cb != NULL); 6730 6731 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6732 if (bdev_iostat_ctx == NULL) { 6733 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6734 cb(bdev, stat, cb_arg, -ENOMEM); 6735 return; 6736 } 6737 6738 bdev_iostat_ctx->stat = stat; 6739 bdev_iostat_ctx->cb = cb; 6740 bdev_iostat_ctx->cb_arg = cb_arg; 6741 6742 /* Start with the statistics from previously deleted channels. */ 6743 spdk_spin_lock(&bdev->internal.spinlock); 6744 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6745 spdk_spin_unlock(&bdev->internal.spinlock); 6746 6747 /* Then iterate and add the statistics from each existing channel. 
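* The caller-provided 'stat' must stay valid until 'cb' runs; the final
* value is bdev->internal.stat (accumulated from already-deleted channels)
* plus the per-channel statistics gathered below.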
*/ 6748 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6749 bdev_get_device_stat_done); 6750 } 6751 6752 struct bdev_iostat_reset_ctx { 6753 enum spdk_bdev_reset_stat_mode mode; 6754 bdev_reset_device_stat_cb cb; 6755 void *cb_arg; 6756 }; 6757 6758 static void 6759 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6760 { 6761 struct bdev_iostat_reset_ctx *ctx = _ctx; 6762 6763 ctx->cb(bdev, ctx->cb_arg, 0); 6764 6765 free(ctx); 6766 } 6767 6768 static void 6769 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6770 struct spdk_io_channel *ch, void *_ctx) 6771 { 6772 struct bdev_iostat_reset_ctx *ctx = _ctx; 6773 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6774 6775 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6776 6777 spdk_bdev_for_each_channel_continue(i, 0); 6778 } 6779 6780 void 6781 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6782 bdev_reset_device_stat_cb cb, void *cb_arg) 6783 { 6784 struct bdev_iostat_reset_ctx *ctx; 6785 6786 assert(bdev != NULL); 6787 assert(cb != NULL); 6788 6789 ctx = calloc(1, sizeof(*ctx)); 6790 if (ctx == NULL) { 6791 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6792 cb(bdev, cb_arg, -ENOMEM); 6793 return; 6794 } 6795 6796 ctx->mode = mode; 6797 ctx->cb = cb; 6798 ctx->cb_arg = cb_arg; 6799 6800 spdk_spin_lock(&bdev->internal.spinlock); 6801 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6802 spdk_spin_unlock(&bdev->internal.spinlock); 6803 6804 spdk_bdev_for_each_channel(bdev, 6805 bdev_reset_each_channel_stat, 6806 ctx, 6807 bdev_reset_device_stat_done); 6808 } 6809 6810 int 6811 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6812 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6813 spdk_bdev_io_completion_cb cb, void *cb_arg) 6814 { 6815 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6816 struct spdk_bdev_io *bdev_io; 6817 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6818 6819 if (!desc->write) { 6820 return -EBADF; 6821 } 6822 6823 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6824 return -ENOTSUP; 6825 } 6826 6827 bdev_io = bdev_channel_get_io(channel); 6828 if (!bdev_io) { 6829 return -ENOMEM; 6830 } 6831 6832 bdev_io->internal.ch = channel; 6833 bdev_io->internal.desc = desc; 6834 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6835 bdev_io->u.nvme_passthru.cmd = *cmd; 6836 bdev_io->u.nvme_passthru.buf = buf; 6837 bdev_io->u.nvme_passthru.nbytes = nbytes; 6838 bdev_io->u.nvme_passthru.md_buf = NULL; 6839 bdev_io->u.nvme_passthru.md_len = 0; 6840 6841 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6842 6843 bdev_io_submit(bdev_io); 6844 return 0; 6845 } 6846 6847 int 6848 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6849 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6850 spdk_bdev_io_completion_cb cb, void *cb_arg) 6851 { 6852 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6853 struct spdk_bdev_io *bdev_io; 6854 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6855 6856 if (!desc->write) { 6857 /* 6858 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6859 * to easily determine if the command is a read or write, but for now just 6860 * do not allow io_passthru with a read-only descriptor. 
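* (A descriptor has write access only if the bdev was opened with
* write == true, e.g. via spdk_bdev_open_ext().)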
6861 */ 6862 return -EBADF; 6863 } 6864 6865 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6866 return -ENOTSUP; 6867 } 6868 6869 bdev_io = bdev_channel_get_io(channel); 6870 if (!bdev_io) { 6871 return -ENOMEM; 6872 } 6873 6874 bdev_io->internal.ch = channel; 6875 bdev_io->internal.desc = desc; 6876 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6877 bdev_io->u.nvme_passthru.cmd = *cmd; 6878 bdev_io->u.nvme_passthru.buf = buf; 6879 bdev_io->u.nvme_passthru.nbytes = nbytes; 6880 bdev_io->u.nvme_passthru.md_buf = NULL; 6881 bdev_io->u.nvme_passthru.md_len = 0; 6882 6883 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6884 6885 bdev_io_submit(bdev_io); 6886 return 0; 6887 } 6888 6889 int 6890 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6891 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6892 spdk_bdev_io_completion_cb cb, void *cb_arg) 6893 { 6894 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6895 struct spdk_bdev_io *bdev_io; 6896 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6897 6898 if (!desc->write) { 6899 /* 6900 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6901 * to easily determine if the command is a read or write, but for now just 6902 * do not allow io_passthru with a read-only descriptor. 6903 */ 6904 return -EBADF; 6905 } 6906 6907 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6908 return -ENOTSUP; 6909 } 6910 6911 bdev_io = bdev_channel_get_io(channel); 6912 if (!bdev_io) { 6913 return -ENOMEM; 6914 } 6915 6916 bdev_io->internal.ch = channel; 6917 bdev_io->internal.desc = desc; 6918 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6919 bdev_io->u.nvme_passthru.cmd = *cmd; 6920 bdev_io->u.nvme_passthru.buf = buf; 6921 bdev_io->u.nvme_passthru.nbytes = nbytes; 6922 bdev_io->u.nvme_passthru.md_buf = md_buf; 6923 bdev_io->u.nvme_passthru.md_len = md_len; 6924 6925 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6926 6927 bdev_io_submit(bdev_io); 6928 return 0; 6929 } 6930 6931 int 6932 spdk_bdev_nvme_iov_passthru_md(struct spdk_bdev_desc *desc, 6933 struct spdk_io_channel *ch, 6934 const struct spdk_nvme_cmd *cmd, 6935 struct iovec *iov, int iovcnt, size_t nbytes, 6936 void *md_buf, size_t md_len, 6937 spdk_bdev_io_completion_cb cb, void *cb_arg) 6938 { 6939 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6940 struct spdk_bdev_io *bdev_io; 6941 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6942 6943 if (!desc->write) { 6944 /* 6945 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6946 * to easily determine if the command is a read or write, but for now just 6947 * do not allow io_passthru with a read-only descriptor. 
6948 */ 6949 return -EBADF; 6950 } 6951 6952 if (md_buf && spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6953 return -ENOTSUP; 6954 } else if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6955 return -ENOTSUP; 6956 } 6957 6958 bdev_io = bdev_channel_get_io(channel); 6959 if (!bdev_io) { 6960 return -ENOMEM; 6961 } 6962 6963 bdev_io->internal.ch = channel; 6964 bdev_io->internal.desc = desc; 6965 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IOV_MD; 6966 bdev_io->u.nvme_passthru.cmd = *cmd; 6967 bdev_io->u.nvme_passthru.iovs = iov; 6968 bdev_io->u.nvme_passthru.iovcnt = iovcnt; 6969 bdev_io->u.nvme_passthru.nbytes = nbytes; 6970 bdev_io->u.nvme_passthru.md_buf = md_buf; 6971 bdev_io->u.nvme_passthru.md_len = md_len; 6972 6973 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6974 6975 bdev_io_submit(bdev_io); 6976 return 0; 6977 } 6978 6979 static void bdev_abort_retry(void *ctx); 6980 static void bdev_abort(struct spdk_bdev_io *parent_io); 6981 6982 static void 6983 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6984 { 6985 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6986 struct spdk_bdev_io *parent_io = cb_arg; 6987 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6988 6989 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6990 6991 spdk_bdev_free_io(bdev_io); 6992 6993 if (!success) { 6994 /* Check if the target I/O completed in the meantime. */ 6995 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6996 if (tmp_io == bio_to_abort) { 6997 break; 6998 } 6999 } 7000 7001 /* If the target I/O still exists, set the parent to failed. */ 7002 if (tmp_io != NULL) { 7003 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7004 } 7005 } 7006 7007 assert(parent_io->internal.f.split); 7008 7009 parent_io->internal.split.outstanding--; 7010 if (parent_io->internal.split.outstanding == 0) { 7011 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7012 bdev_abort_retry(parent_io); 7013 } else { 7014 bdev_io_complete(parent_io); 7015 } 7016 } 7017 } 7018 7019 static int 7020 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 7021 struct spdk_bdev_io *bio_to_abort, 7022 spdk_bdev_io_completion_cb cb, void *cb_arg) 7023 { 7024 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7025 struct spdk_bdev_io *bdev_io; 7026 7027 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 7028 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 7029 /* TODO: Abort reset or abort request. */ 7030 return -ENOTSUP; 7031 } 7032 7033 bdev_io = bdev_channel_get_io(channel); 7034 if (bdev_io == NULL) { 7035 return -ENOMEM; 7036 } 7037 7038 bdev_io->internal.ch = channel; 7039 bdev_io->internal.desc = desc; 7040 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7041 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7042 7043 if (bio_to_abort->internal.f.split) { 7044 assert(bdev_io_should_split(bio_to_abort)); 7045 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 7046 7047 /* Parent abort request is not submitted directly, but to manage its 7048 * execution add it to the submitted list here. 7049 */ 7050 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7051 bdev_ch_add_to_io_submitted(bdev_io); 7052 7053 bdev_abort(bdev_io); 7054 7055 return 0; 7056 } 7057 7058 bdev_io->u.abort.bio_to_abort = bio_to_abort; 7059 7060 /* Submit the abort request to the underlying bdev module. 
*/
7061 bdev_io_submit(bdev_io);
7062
7063 return 0;
7064 }
7065
7066 static bool
7067 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq)
7068 {
7069 struct spdk_bdev_io *iter;
7070
7071 TAILQ_FOREACH(iter, tailq, internal.link) {
7072 if (iter == bdev_io) {
7073 return true;
7074 }
7075 }
7076
7077 return false;
7078 }
7079
7080 static uint32_t
7081 _bdev_abort(struct spdk_bdev_io *parent_io)
7082 {
7083 struct spdk_bdev_desc *desc = parent_io->internal.desc;
7084 struct spdk_bdev_channel *channel = parent_io->internal.ch;
7085 void *bio_cb_arg;
7086 struct spdk_bdev_io *bio_to_abort;
7087 uint32_t matched_ios;
7088 int rc;
7089
7090 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg;
7091
7092 /* matched_ios is returned and will be kept by the caller.
7093 *
7094 * This function will be used for two cases, 1) the same cb_arg is used for
7095 * multiple I/Os, 2) a single large I/O is split into smaller ones.
7096 * Incrementing split_outstanding directly here may confuse readers, especially
7097 * for the 1st case.
7098 *
7099 * Completion of I/O abort is processed after stack unwinding. Hence this trick
7100 * works as expected.
7101 */
7102 matched_ios = 0;
7103 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
7104
7105 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) {
7106 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) {
7107 continue;
7108 }
7109
7110 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) {
7111 /* Any I/O which was submitted after this abort command should be excluded. */
7112 continue;
7113 }
7114
7115 /* We can't abort a request that's being pushed/pulled or executed by accel */
7116 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) ||
7117 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) {
7118 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7119 break;
7120 }
7121
7122 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io);
7123 if (rc != 0) {
7124 if (rc == -ENOMEM) {
7125 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM;
7126 } else {
7127 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7128 }
7129 break;
7130 }
7131 matched_ios++;
7132 }
7133
7134 return matched_ios;
7135 }
7136
7137 static void
7138 bdev_abort_retry(void *ctx)
7139 {
7140 struct spdk_bdev_io *parent_io = ctx;
7141 uint32_t matched_ios;
7142
7143 matched_ios = _bdev_abort(parent_io);
7144
7145 if (matched_ios == 0) {
7146 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
7147 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
7148 } else {
7149 /* For retry, the case that no target I/O was found is success
7150 * because it means target I/Os completed in the meantime.
7151 */
7152 bdev_io_complete(parent_io);
7153 }
7154 return;
7155 }
7156
7157 /* Use split_outstanding to manage the progress of aborting I/Os. */
7158 parent_io->internal.f.split = true;
7159 parent_io->internal.split.outstanding = matched_ios;
7160 }
7161
7162 static void
7163 bdev_abort(struct spdk_bdev_io *parent_io)
7164 {
7165 uint32_t matched_ios;
7166
7167 matched_ios = _bdev_abort(parent_io);
7168
7169 if (matched_ios == 0) {
7170 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
7171 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
7172 } else {
7173 /* The case where no target I/O was found is a failure.
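* (Contrast with bdev_abort_retry() above, where finding no matching I/O
* means the targets completed in the meantime and the abort succeeds.)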
*/ 7174 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7175 bdev_io_complete(parent_io); 7176 } 7177 return; 7178 } 7179 7180 /* Use split_outstanding to manage the progress of aborting I/Os. */ 7181 parent_io->internal.f.split = true; 7182 parent_io->internal.split.outstanding = matched_ios; 7183 } 7184 7185 int 7186 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 7187 void *bio_cb_arg, 7188 spdk_bdev_io_completion_cb cb, void *cb_arg) 7189 { 7190 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7191 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7192 struct spdk_bdev_io *bdev_io; 7193 7194 if (bio_cb_arg == NULL) { 7195 return -EINVAL; 7196 } 7197 7198 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 7199 return -ENOTSUP; 7200 } 7201 7202 bdev_io = bdev_channel_get_io(channel); 7203 if (bdev_io == NULL) { 7204 return -ENOMEM; 7205 } 7206 7207 bdev_io->internal.ch = channel; 7208 bdev_io->internal.desc = desc; 7209 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7210 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7211 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7212 7213 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 7214 7215 /* Parent abort request is not submitted directly, but to manage its execution, 7216 * add it to the submitted list here. 7217 */ 7218 bdev_ch_add_to_io_submitted(bdev_io); 7219 7220 bdev_abort(bdev_io); 7221 7222 return 0; 7223 } 7224 7225 int 7226 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 7227 struct spdk_bdev_io_wait_entry *entry) 7228 { 7229 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7230 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 7231 7232 if (bdev != entry->bdev) { 7233 SPDK_ERRLOG("bdevs do not match\n"); 7234 return -EINVAL; 7235 } 7236 7237 if (mgmt_ch->per_thread_cache_count > 0) { 7238 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 7239 return -EINVAL; 7240 } 7241 7242 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 7243 return 0; 7244 } 7245 7246 static inline void 7247 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 7248 { 7249 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 7250 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 7251 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 7252 uint32_t blocklen = bdev_io->bdev->blocklen; 7253 7254 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7255 switch (bdev_io->type) { 7256 case SPDK_BDEV_IO_TYPE_READ: 7257 io_stat->bytes_read += num_blocks * blocklen; 7258 io_stat->num_read_ops++; 7259 io_stat->read_latency_ticks += tsc_diff; 7260 if (io_stat->max_read_latency_ticks < tsc_diff) { 7261 io_stat->max_read_latency_ticks = tsc_diff; 7262 } 7263 if (io_stat->min_read_latency_ticks > tsc_diff) { 7264 io_stat->min_read_latency_ticks = tsc_diff; 7265 } 7266 break; 7267 case SPDK_BDEV_IO_TYPE_WRITE: 7268 io_stat->bytes_written += num_blocks * blocklen; 7269 io_stat->num_write_ops++; 7270 io_stat->write_latency_ticks += tsc_diff; 7271 if (io_stat->max_write_latency_ticks < tsc_diff) { 7272 io_stat->max_write_latency_ticks = tsc_diff; 7273 } 7274 if (io_stat->min_write_latency_ticks > tsc_diff) { 7275 io_stat->min_write_latency_ticks = tsc_diff; 7276 } 7277 break; 7278 case SPDK_BDEV_IO_TYPE_UNMAP: 7279 io_stat->bytes_unmapped += num_blocks * blocklen; 7280 io_stat->num_unmap_ops++; 7281 io_stat->unmap_latency_ticks += tsc_diff; 7282 if 
(io_stat->max_unmap_latency_ticks < tsc_diff) { 7283 io_stat->max_unmap_latency_ticks = tsc_diff; 7284 } 7285 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 7286 io_stat->min_unmap_latency_ticks = tsc_diff; 7287 } 7288 break; 7289 case SPDK_BDEV_IO_TYPE_ZCOPY: 7290 /* Track the data in the start phase only */ 7291 if (bdev_io->u.bdev.zcopy.start) { 7292 if (bdev_io->u.bdev.zcopy.populate) { 7293 io_stat->bytes_read += num_blocks * blocklen; 7294 io_stat->num_read_ops++; 7295 io_stat->read_latency_ticks += tsc_diff; 7296 if (io_stat->max_read_latency_ticks < tsc_diff) { 7297 io_stat->max_read_latency_ticks = tsc_diff; 7298 } 7299 if (io_stat->min_read_latency_ticks > tsc_diff) { 7300 io_stat->min_read_latency_ticks = tsc_diff; 7301 } 7302 } else { 7303 io_stat->bytes_written += num_blocks * blocklen; 7304 io_stat->num_write_ops++; 7305 io_stat->write_latency_ticks += tsc_diff; 7306 if (io_stat->max_write_latency_ticks < tsc_diff) { 7307 io_stat->max_write_latency_ticks = tsc_diff; 7308 } 7309 if (io_stat->min_write_latency_ticks > tsc_diff) { 7310 io_stat->min_write_latency_ticks = tsc_diff; 7311 } 7312 } 7313 } 7314 break; 7315 case SPDK_BDEV_IO_TYPE_COPY: 7316 io_stat->bytes_copied += num_blocks * blocklen; 7317 io_stat->num_copy_ops++; 7318 bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; 7319 if (io_stat->max_copy_latency_ticks < tsc_diff) { 7320 io_stat->max_copy_latency_ticks = tsc_diff; 7321 } 7322 if (io_stat->min_copy_latency_ticks > tsc_diff) { 7323 io_stat->min_copy_latency_ticks = tsc_diff; 7324 } 7325 break; 7326 default: 7327 break; 7328 } 7329 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 7330 io_stat = bdev_io->bdev->internal.stat; 7331 assert(io_stat->io_error != NULL); 7332 7333 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 7334 io_stat->io_error->error_status[-io_status - 1]++; 7335 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 7336 } 7337 7338 #ifdef SPDK_CONFIG_VTUNE 7339 uint64_t now_tsc = spdk_get_ticks(); 7340 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 7341 uint64_t data[5]; 7342 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 7343 7344 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 7345 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 7346 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 7347 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 7348 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 
7349 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 7350 7351 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 7352 __itt_metadata_u64, 5, data); 7353 7354 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 7355 bdev_io->internal.ch->start_tsc = now_tsc; 7356 } 7357 #endif 7358 } 7359 7360 static inline void 7361 _bdev_io_complete(void *ctx) 7362 { 7363 struct spdk_bdev_io *bdev_io = ctx; 7364 7365 if (spdk_unlikely(bdev_io_use_accel_sequence(bdev_io))) { 7366 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7367 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 7368 } 7369 7370 assert(bdev_io->internal.cb != NULL); 7371 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 7372 7373 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 7374 bdev_io->internal.caller_ctx); 7375 } 7376 7377 static inline void 7378 bdev_io_complete(void *ctx) 7379 { 7380 struct spdk_bdev_io *bdev_io = ctx; 7381 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7382 uint64_t tsc, tsc_diff; 7383 7384 if (spdk_unlikely(bdev_io->internal.f.in_submit_request)) { 7385 /* 7386 * Defer completion to avoid potential infinite recursion if the 7387 * user's completion callback issues a new I/O. 7388 */ 7389 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7390 bdev_io_complete, bdev_io); 7391 return; 7392 } 7393 7394 tsc = spdk_get_ticks(); 7395 tsc_diff = tsc - bdev_io->internal.submit_tsc; 7396 7397 bdev_ch_remove_from_io_submitted(bdev_io); 7398 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, bdev_ch->trace_id, 0, (uintptr_t)bdev_io, 7399 bdev_io->internal.caller_ctx, bdev_ch->queue_depth); 7400 7401 if (bdev_ch->histogram) { 7402 if (bdev_io->bdev->internal.histogram_io_type == 0 || 7403 bdev_io->bdev->internal.histogram_io_type == bdev_io->type) { 7404 /* 7405 * Tally all I/O types if the histogram_io_type is set to 0. 7406 */ 7407 spdk_histogram_data_tally(bdev_ch->histogram, tsc_diff); 7408 } 7409 } 7410 7411 bdev_io_update_io_stat(bdev_io, tsc_diff); 7412 _bdev_io_complete(bdev_io); 7413 } 7414 7415 /* The difference between this function and bdev_io_complete() is that this should be called to 7416 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 7417 * io_submitted list and don't have submit_tsc updated. 7418 */ 7419 static inline void 7420 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 7421 { 7422 /* Since the IO hasn't been submitted it's bound to be failed */ 7423 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7424 7425 /* At this point we don't know if the IO is completed from submission context or not, but, 7426 * since this is an error path, we can always do an spdk_thread_send_msg(). 
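 * Deferring through a message also guarantees that the caller's completion callback
 * does not run until the submission path has fully unwound.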
*/ 7427 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7428 _bdev_io_complete, bdev_io); 7429 } 7430 7431 static void bdev_destroy_cb(void *io_device); 7432 7433 static void 7434 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7435 { 7436 struct spdk_bdev_io *bdev_io = _ctx; 7437 7438 if (bdev_io->u.reset.ch_ref != NULL) { 7439 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7440 bdev_io->u.reset.ch_ref = NULL; 7441 } 7442 7443 bdev_io_complete(bdev_io); 7444 7445 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7446 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7447 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7448 } 7449 } 7450 7451 static void 7452 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7453 struct spdk_io_channel *_ch, void *_ctx) 7454 { 7455 struct spdk_bdev_io *bdev_io = _ctx; 7456 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7457 struct spdk_bdev_io *queued_reset; 7458 7459 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7460 while (!TAILQ_EMPTY(&ch->queued_resets)) { 7461 queued_reset = TAILQ_FIRST(&ch->queued_resets); 7462 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 7463 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 7464 } 7465 7466 spdk_bdev_for_each_channel_continue(i, 0); 7467 } 7468 7469 static void 7470 bdev_io_complete_sequence_cb(void *ctx, int status) 7471 { 7472 struct spdk_bdev_io *bdev_io = ctx; 7473 7474 /* u.bdev.accel_sequence should have already been cleared at this point */ 7475 assert(bdev_io->u.bdev.accel_sequence == NULL); 7476 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7477 bdev_io->internal.f.has_accel_sequence = false; 7478 7479 if (spdk_unlikely(status != 0)) { 7480 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7481 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7482 } 7483 7484 bdev_io_complete(bdev_io); 7485 } 7486 7487 void 7488 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7489 { 7490 struct spdk_bdev *bdev = bdev_io->bdev; 7491 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7492 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7493 7494 if (spdk_unlikely(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING)) { 7495 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7496 spdk_bdev_get_module_name(bdev), 7497 bdev_io_status_get_string(bdev_io->internal.status)); 7498 assert(false); 7499 } 7500 bdev_io->internal.status = status; 7501 7502 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7503 bool unlock_channels = false; 7504 7505 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 7506 SPDK_ERRLOG("NOMEM returned for reset\n"); 7507 } 7508 spdk_spin_lock(&bdev->internal.spinlock); 7509 if (bdev_io == bdev->internal.reset_in_progress) { 7510 bdev->internal.reset_in_progress = NULL; 7511 unlock_channels = true; 7512 } 7513 spdk_spin_unlock(&bdev->internal.spinlock); 7514 7515 if (unlock_channels) { 7516 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 7517 bdev_reset_complete); 7518 return; 7519 } 7520 } else { 7521 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7522 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7523 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7524 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7525 return; 7526 } else if 
(spdk_unlikely(bdev_io->internal.f.has_bounce_buf && 7527 !bdev_io_use_accel_sequence(bdev_io))) { 7528 _bdev_io_push_bounce_data_buffer(bdev_io, 7529 _bdev_io_complete_push_bounce_done); 7530 /* bdev IO will be completed in the callback */ 7531 return; 7532 } 7533 } 7534 7535 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7536 return; 7537 } 7538 } 7539 7540 bdev_io_complete(bdev_io); 7541 } 7542 7543 void 7544 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 7545 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7546 { 7547 enum spdk_bdev_io_status status; 7548 7549 if (sc == SPDK_SCSI_STATUS_GOOD) { 7550 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7551 } else { 7552 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7553 bdev_io->internal.error.scsi.sc = sc; 7554 bdev_io->internal.error.scsi.sk = sk; 7555 bdev_io->internal.error.scsi.asc = asc; 7556 bdev_io->internal.error.scsi.ascq = ascq; 7557 } 7558 7559 spdk_bdev_io_complete(bdev_io, status); 7560 } 7561 7562 void 7563 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7564 int *sc, int *sk, int *asc, int *ascq) 7565 { 7566 assert(sc != NULL); 7567 assert(sk != NULL); 7568 assert(asc != NULL); 7569 assert(ascq != NULL); 7570 7571 switch (bdev_io->internal.status) { 7572 case SPDK_BDEV_IO_STATUS_SUCCESS: 7573 *sc = SPDK_SCSI_STATUS_GOOD; 7574 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7575 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7576 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7577 break; 7578 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7579 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7580 break; 7581 case SPDK_BDEV_IO_STATUS_MISCOMPARE: 7582 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7583 *sk = SPDK_SCSI_SENSE_MISCOMPARE; 7584 *asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION; 7585 *ascq = bdev_io->internal.error.scsi.ascq; 7586 break; 7587 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7588 *sc = bdev_io->internal.error.scsi.sc; 7589 *sk = bdev_io->internal.error.scsi.sk; 7590 *asc = bdev_io->internal.error.scsi.asc; 7591 *ascq = bdev_io->internal.error.scsi.ascq; 7592 break; 7593 default: 7594 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7595 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7596 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7597 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7598 break; 7599 } 7600 } 7601 7602 void 7603 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7604 { 7605 enum spdk_bdev_io_status status; 7606 7607 if (aio_result == 0) { 7608 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7609 } else { 7610 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7611 } 7612 7613 bdev_io->internal.error.aio_result = aio_result; 7614 7615 spdk_bdev_io_complete(bdev_io, status); 7616 } 7617 7618 void 7619 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7620 { 7621 assert(aio_result != NULL); 7622 7623 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7624 *aio_result = bdev_io->internal.error.aio_result; 7625 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7626 *aio_result = 0; 7627 } else { 7628 *aio_result = -EIO; 7629 } 7630 } 7631 7632 void 7633 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7634 { 7635 enum spdk_bdev_io_status status; 7636 7637 if (spdk_likely(sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS)) { 7638 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7639 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == 
SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7640 status = SPDK_BDEV_IO_STATUS_ABORTED; 7641 } else { 7642 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7643 } 7644 7645 bdev_io->internal.error.nvme.cdw0 = cdw0; 7646 bdev_io->internal.error.nvme.sct = sct; 7647 bdev_io->internal.error.nvme.sc = sc; 7648 7649 spdk_bdev_io_complete(bdev_io, status); 7650 } 7651 7652 void 7653 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7654 { 7655 assert(sct != NULL); 7656 assert(sc != NULL); 7657 assert(cdw0 != NULL); 7658 7659 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7660 *sct = SPDK_NVME_SCT_GENERIC; 7661 *sc = SPDK_NVME_SC_SUCCESS; 7662 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7663 *cdw0 = 0; 7664 } else { 7665 *cdw0 = 1U; 7666 } 7667 return; 7668 } 7669 7670 if (spdk_likely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7671 *sct = SPDK_NVME_SCT_GENERIC; 7672 *sc = SPDK_NVME_SC_SUCCESS; 7673 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7674 *sct = bdev_io->internal.error.nvme.sct; 7675 *sc = bdev_io->internal.error.nvme.sc; 7676 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7677 *sct = SPDK_NVME_SCT_GENERIC; 7678 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7679 } else { 7680 *sct = SPDK_NVME_SCT_GENERIC; 7681 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7682 } 7683 7684 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7685 } 7686 7687 void 7688 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7689 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7690 { 7691 assert(first_sct != NULL); 7692 assert(first_sc != NULL); 7693 assert(second_sct != NULL); 7694 assert(second_sc != NULL); 7695 assert(cdw0 != NULL); 7696 7697 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7698 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7699 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7700 *first_sct = bdev_io->internal.error.nvme.sct; 7701 *first_sc = bdev_io->internal.error.nvme.sc; 7702 *second_sct = SPDK_NVME_SCT_GENERIC; 7703 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7704 } else { 7705 *first_sct = SPDK_NVME_SCT_GENERIC; 7706 *first_sc = SPDK_NVME_SC_SUCCESS; 7707 *second_sct = bdev_io->internal.error.nvme.sct; 7708 *second_sc = bdev_io->internal.error.nvme.sc; 7709 } 7710 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7711 *first_sct = SPDK_NVME_SCT_GENERIC; 7712 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7713 *second_sct = SPDK_NVME_SCT_GENERIC; 7714 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7715 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7716 *first_sct = SPDK_NVME_SCT_GENERIC; 7717 *first_sc = SPDK_NVME_SC_SUCCESS; 7718 *second_sct = SPDK_NVME_SCT_GENERIC; 7719 *second_sc = SPDK_NVME_SC_SUCCESS; 7720 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7721 *first_sct = SPDK_NVME_SCT_GENERIC; 7722 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7723 *second_sct = SPDK_NVME_SCT_GENERIC; 7724 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7725 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7726 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7727 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 7728 *second_sct = SPDK_NVME_SCT_GENERIC; 7729 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7730 } else { 7731 *first_sct = SPDK_NVME_SCT_GENERIC; 7732 *first_sc = 
SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7733 *second_sct = SPDK_NVME_SCT_GENERIC; 7734 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7735 } 7736 7737 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7738 } 7739 7740 void 7741 spdk_bdev_io_complete_base_io_status(struct spdk_bdev_io *bdev_io, 7742 const struct spdk_bdev_io *base_io) 7743 { 7744 switch (base_io->internal.status) { 7745 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7746 spdk_bdev_io_complete_nvme_status(bdev_io, 7747 base_io->internal.error.nvme.cdw0, 7748 base_io->internal.error.nvme.sct, 7749 base_io->internal.error.nvme.sc); 7750 break; 7751 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7752 spdk_bdev_io_complete_scsi_status(bdev_io, 7753 base_io->internal.error.scsi.sc, 7754 base_io->internal.error.scsi.sk, 7755 base_io->internal.error.scsi.asc, 7756 base_io->internal.error.scsi.ascq); 7757 break; 7758 case SPDK_BDEV_IO_STATUS_AIO_ERROR: 7759 spdk_bdev_io_complete_aio_status(bdev_io, base_io->internal.error.aio_result); 7760 break; 7761 default: 7762 spdk_bdev_io_complete(bdev_io, base_io->internal.status); 7763 break; 7764 } 7765 } 7766 7767 struct spdk_thread * 7768 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7769 { 7770 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7771 } 7772 7773 struct spdk_io_channel * 7774 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7775 { 7776 return bdev_io->internal.ch->channel; 7777 } 7778 7779 static int 7780 bdev_register(struct spdk_bdev *bdev) 7781 { 7782 char *bdev_name; 7783 char uuid[SPDK_UUID_STRING_LEN]; 7784 struct spdk_iobuf_opts iobuf_opts; 7785 int ret; 7786 7787 assert(bdev->module != NULL); 7788 7789 if (!bdev->name) { 7790 SPDK_ERRLOG("Bdev name is NULL\n"); 7791 return -EINVAL; 7792 } 7793 7794 if (!strlen(bdev->name)) { 7795 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7796 return -EINVAL; 7797 } 7798 7799 /* Users often register their own I/O devices using the bdev name. In 7800 * order to avoid conflicts, prepend bdev_. */ 7801 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7802 if (!bdev_name) { 7803 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7804 return -ENOMEM; 7805 } 7806 7807 bdev->internal.stat = bdev_alloc_io_stat(true); 7808 if (!bdev->internal.stat) { 7809 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7810 free(bdev_name); 7811 return -ENOMEM; 7812 } 7813 7814 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7815 bdev->internal.measured_queue_depth = UINT64_MAX; 7816 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7817 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7818 bdev->internal.qd_poller = NULL; 7819 bdev->internal.qos = NULL; 7820 7821 TAILQ_INIT(&bdev->internal.open_descs); 7822 TAILQ_INIT(&bdev->internal.locked_ranges); 7823 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7824 TAILQ_INIT(&bdev->aliases); 7825 7826 /* UUID may be specified by the user or defined by bdev itself. 7827 * Otherwise it will be generated here, so this field will never be empty. 
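 * Modules that need the UUID to stay stable across restarts should set bdev->uuid
 * themselves before calling spdk_bdev_register().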
*/ 7828 if (spdk_uuid_is_null(&bdev->uuid)) { 7829 spdk_uuid_generate(&bdev->uuid); 7830 } 7831 7832 /* Add the UUID alias only if it's different than the name */ 7833 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7834 if (strcmp(bdev->name, uuid) != 0) { 7835 ret = spdk_bdev_alias_add(bdev, uuid); 7836 if (ret != 0) { 7837 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7838 bdev_free_io_stat(bdev->internal.stat); 7839 free(bdev_name); 7840 return ret; 7841 } 7842 } 7843 7844 spdk_iobuf_get_opts(&iobuf_opts, sizeof(iobuf_opts)); 7845 if (spdk_bdev_get_buf_align(bdev) > 1) { 7846 bdev->max_rw_size = spdk_min(bdev->max_rw_size ? bdev->max_rw_size : UINT32_MAX, 7847 iobuf_opts.large_bufsize / bdev->blocklen); 7848 } 7849 7850 /* If the user didn't specify a write unit size, set it to one. */ 7851 if (bdev->write_unit_size == 0) { 7852 bdev->write_unit_size = 1; 7853 } 7854 7855 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7856 if (bdev->acwu == 0) { 7857 bdev->acwu = bdev->write_unit_size; 7858 } 7859 7860 if (bdev->phys_blocklen == 0) { 7861 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7862 } 7863 7864 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 7865 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 7866 } 7867 7868 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 7869 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 7870 } 7871 7872 bdev->internal.reset_in_progress = NULL; 7873 bdev->internal.qd_poll_in_progress = false; 7874 bdev->internal.period = 0; 7875 bdev->internal.new_period = 0; 7876 bdev->internal.trace_id = spdk_trace_register_owner(OWNER_TYPE_BDEV, bdev_name); 7877 7878 /* 7879 * Initialize spinlock before registering IO device because spinlock is used in 7880 * bdev_channel_create 7881 */ 7882 spdk_spin_init(&bdev->internal.spinlock); 7883 7884 spdk_io_device_register(__bdev_to_io_dev(bdev), 7885 bdev_channel_create, bdev_channel_destroy, 7886 sizeof(struct spdk_bdev_channel), 7887 bdev_name); 7888 7889 /* 7890 * Register bdev name only after the bdev object is ready. 7891 * After bdev_name_add returns, it is possible for other threads to start using the bdev, 7892 * create IO channels... 
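 * which means every field consumed by bdev_channel_create() must already be
 * initialized at this point.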
7893 */ 7894 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 7895 if (ret != 0) { 7896 spdk_io_device_unregister(__bdev_to_io_dev(bdev), NULL); 7897 bdev_free_io_stat(bdev->internal.stat); 7898 spdk_spin_destroy(&bdev->internal.spinlock); 7899 free(bdev_name); 7900 return ret; 7901 } 7902 7903 free(bdev_name); 7904 7905 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 7906 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 7907 7908 return 0; 7909 } 7910 7911 static void 7912 bdev_destroy_cb(void *io_device) 7913 { 7914 int rc; 7915 struct spdk_bdev *bdev; 7916 spdk_bdev_unregister_cb cb_fn; 7917 void *cb_arg; 7918 7919 bdev = __bdev_from_io_dev(io_device); 7920 7921 if (bdev->internal.unregister_td != spdk_get_thread()) { 7922 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 7923 return; 7924 } 7925 7926 cb_fn = bdev->internal.unregister_cb; 7927 cb_arg = bdev->internal.unregister_ctx; 7928 7929 spdk_spin_destroy(&bdev->internal.spinlock); 7930 free(bdev->internal.qos); 7931 bdev_free_io_stat(bdev->internal.stat); 7932 spdk_trace_unregister_owner(bdev->internal.trace_id); 7933 7934 rc = bdev->fn_table->destruct(bdev->ctxt); 7935 if (rc < 0) { 7936 SPDK_ERRLOG("destruct failed\n"); 7937 } 7938 if (rc <= 0 && cb_fn != NULL) { 7939 cb_fn(cb_arg, rc); 7940 } 7941 } 7942 7943 void 7944 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 7945 { 7946 if (bdev->internal.unregister_cb != NULL) { 7947 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 7948 } 7949 } 7950 7951 static void 7952 _remove_notify(void *arg) 7953 { 7954 struct spdk_bdev_desc *desc = arg; 7955 7956 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 7957 } 7958 7959 /* returns: 0 - bdev removed and ready to be destructed. 7960 * -EBUSY - bdev can't be destructed yet. */ 7961 static int 7962 bdev_unregister_unsafe(struct spdk_bdev *bdev) 7963 { 7964 struct spdk_bdev_desc *desc, *tmp; 7965 int rc = 0; 7966 char uuid[SPDK_UUID_STRING_LEN]; 7967 7968 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 7969 assert(spdk_spin_held(&bdev->internal.spinlock)); 7970 7971 /* Notify each descriptor about hotremoval */ 7972 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 7973 rc = -EBUSY; 7974 /* 7975 * Defer invocation of the event_cb to a separate message that will 7976 * run later on its thread. This ensures this context unwinds and 7977 * we don't recursively unregister this bdev again if the event_cb 7978 * immediately closes its descriptor. 7979 */ 7980 event_notify(desc, _remove_notify); 7981 } 7982 7983 /* If there are no descriptors, proceed removing the bdev */ 7984 if (rc == 0) { 7985 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 7986 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 7987 7988 /* Delete the name and the UUID alias */ 7989 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7990 bdev_name_del_unsafe(&bdev->internal.bdev_name); 7991 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 7992 7993 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 7994 7995 if (bdev->internal.reset_in_progress != NULL) { 7996 /* If reset is in progress, let the completion callback for reset 7997 * unregister the bdev. 
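 * bdev_reset_complete() re-checks for SPDK_BDEV_STATUS_REMOVING with no open
 * descriptors and then calls spdk_io_device_unregister() itself.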
7998 */ 7999 rc = -EBUSY; 8000 } 8001 } 8002 8003 return rc; 8004 } 8005 8006 static void 8007 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8008 struct spdk_io_channel *io_ch, void *_ctx) 8009 { 8010 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 8011 8012 bdev_channel_abort_queued_ios(bdev_ch); 8013 spdk_bdev_for_each_channel_continue(i, 0); 8014 } 8015 8016 static void 8017 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 8018 { 8019 int rc; 8020 8021 spdk_spin_lock(&g_bdev_mgr.spinlock); 8022 spdk_spin_lock(&bdev->internal.spinlock); 8023 /* 8024 * Set the status to REMOVING after completing to abort channels. Otherwise, 8025 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 8026 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 8027 * may fail. 8028 */ 8029 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 8030 rc = bdev_unregister_unsafe(bdev); 8031 spdk_spin_unlock(&bdev->internal.spinlock); 8032 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8033 8034 if (rc == 0) { 8035 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8036 } 8037 } 8038 8039 void 8040 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 8041 { 8042 struct spdk_thread *thread; 8043 8044 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 8045 8046 thread = spdk_get_thread(); 8047 if (!thread) { 8048 /* The user called this from a non-SPDK thread. */ 8049 if (cb_fn != NULL) { 8050 cb_fn(cb_arg, -ENOTSUP); 8051 } 8052 return; 8053 } 8054 8055 spdk_spin_lock(&g_bdev_mgr.spinlock); 8056 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 8057 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8058 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8059 if (cb_fn) { 8060 cb_fn(cb_arg, -EBUSY); 8061 } 8062 return; 8063 } 8064 8065 spdk_spin_lock(&bdev->internal.spinlock); 8066 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 8067 bdev->internal.unregister_cb = cb_fn; 8068 bdev->internal.unregister_ctx = cb_arg; 8069 bdev->internal.unregister_td = thread; 8070 spdk_spin_unlock(&bdev->internal.spinlock); 8071 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8072 8073 spdk_bdev_set_qd_sampling_period(bdev, 0); 8074 8075 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 8076 bdev_unregister); 8077 } 8078 8079 int 8080 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 8081 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 8082 { 8083 struct spdk_bdev_desc *desc; 8084 struct spdk_bdev *bdev; 8085 int rc; 8086 8087 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 8088 if (rc != 0) { 8089 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 8090 return rc; 8091 } 8092 8093 bdev = spdk_bdev_desc_get_bdev(desc); 8094 8095 if (bdev->module != module) { 8096 spdk_bdev_close(desc); 8097 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 8098 bdev_name); 8099 return -ENODEV; 8100 } 8101 8102 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 8103 8104 spdk_bdev_close(desc); 8105 8106 return 0; 8107 } 8108 8109 static int 8110 bdev_start_qos(struct spdk_bdev *bdev) 8111 { 8112 struct set_qos_limit_ctx *ctx; 8113 8114 /* Enable QoS */ 8115 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 8116 ctx = calloc(1, sizeof(*ctx)); 8117 if (ctx == NULL) { 8118 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 8119 return -ENOMEM; 8120 } 8121 
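		/* Enable QoS on every existing channel; the context is freed in bdev_enable_qos_done(). */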
ctx->bdev = bdev; 8122 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 8123 } 8124 8125 return 0; 8126 } 8127 8128 static void 8129 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 8130 struct spdk_bdev *bdev) 8131 { 8132 enum spdk_bdev_claim_type type; 8133 const char *typename, *modname; 8134 extern struct spdk_log_flag SPDK_LOG_bdev; 8135 8136 assert(spdk_spin_held(&bdev->internal.spinlock)); 8137 8138 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 8139 return; 8140 } 8141 8142 type = bdev->internal.claim_type; 8143 typename = spdk_bdev_claim_get_name(type); 8144 8145 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 8146 modname = bdev->internal.claim.v1.module->name; 8147 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 8148 bdev->name, detail, typename, modname); 8149 return; 8150 } 8151 8152 if (claim_type_is_v2(type)) { 8153 struct spdk_bdev_module_claim *claim; 8154 8155 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 8156 modname = claim->module->name; 8157 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 8158 bdev->name, detail, typename, modname); 8159 } 8160 return; 8161 } 8162 8163 assert(false); 8164 } 8165 8166 static int 8167 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 8168 { 8169 struct spdk_thread *thread; 8170 int rc = 0; 8171 8172 thread = spdk_get_thread(); 8173 if (!thread) { 8174 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 8175 return -ENOTSUP; 8176 } 8177 8178 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8179 spdk_get_thread()); 8180 8181 desc->bdev = bdev; 8182 desc->thread = thread; 8183 desc->write = write; 8184 8185 spdk_spin_lock(&bdev->internal.spinlock); 8186 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 8187 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8188 spdk_spin_unlock(&bdev->internal.spinlock); 8189 return -ENODEV; 8190 } 8191 8192 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8193 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8194 spdk_spin_unlock(&bdev->internal.spinlock); 8195 return -EPERM; 8196 } 8197 8198 rc = bdev_start_qos(bdev); 8199 if (rc != 0) { 8200 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 8201 spdk_spin_unlock(&bdev->internal.spinlock); 8202 return rc; 8203 } 8204 8205 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 8206 8207 spdk_spin_unlock(&bdev->internal.spinlock); 8208 8209 return 0; 8210 } 8211 8212 static int 8213 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 8214 struct spdk_bdev_desc **_desc) 8215 { 8216 struct spdk_bdev_desc *desc; 8217 unsigned int i; 8218 8219 desc = calloc(1, sizeof(*desc)); 8220 if (desc == NULL) { 8221 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 8222 return -ENOMEM; 8223 } 8224 8225 TAILQ_INIT(&desc->pending_media_events); 8226 TAILQ_INIT(&desc->free_media_events); 8227 8228 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 8229 desc->callback.event_fn = event_cb; 8230 desc->callback.ctx = event_ctx; 8231 spdk_spin_init(&desc->spinlock); 8232 8233 if (bdev->media_events) { 8234 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 8235 sizeof(*desc->media_events_buffer)); 8236 if (desc->media_events_buffer == NULL) { 8237 SPDK_ERRLOG("Failed to initialize media event pool\n"); 8238 
bdev_desc_free(desc); 8239 return -ENOMEM; 8240 } 8241 8242 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 8243 TAILQ_INSERT_TAIL(&desc->free_media_events, 8244 &desc->media_events_buffer[i], tailq); 8245 } 8246 } 8247 8248 if (bdev->fn_table->accel_sequence_supported != NULL) { 8249 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 8250 desc->accel_sequence_supported[i] = 8251 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 8252 (enum spdk_bdev_io_type)i); 8253 } 8254 } 8255 8256 *_desc = desc; 8257 8258 return 0; 8259 } 8260 8261 static int 8262 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8263 void *event_ctx, struct spdk_bdev_desc **_desc) 8264 { 8265 struct spdk_bdev_desc *desc; 8266 struct spdk_bdev *bdev; 8267 int rc; 8268 8269 bdev = bdev_get_by_name(bdev_name); 8270 8271 if (bdev == NULL) { 8272 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 8273 return -ENODEV; 8274 } 8275 8276 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 8277 if (rc != 0) { 8278 return rc; 8279 } 8280 8281 rc = bdev_open(bdev, write, desc); 8282 if (rc != 0) { 8283 bdev_desc_free(desc); 8284 desc = NULL; 8285 } 8286 8287 *_desc = desc; 8288 8289 return rc; 8290 } 8291 8292 int 8293 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8294 void *event_ctx, struct spdk_bdev_desc **_desc) 8295 { 8296 int rc; 8297 8298 if (event_cb == NULL) { 8299 SPDK_ERRLOG("Missing event callback function\n"); 8300 return -EINVAL; 8301 } 8302 8303 spdk_spin_lock(&g_bdev_mgr.spinlock); 8304 rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, _desc); 8305 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8306 8307 return rc; 8308 } 8309 8310 struct spdk_bdev_open_async_ctx { 8311 char *bdev_name; 8312 spdk_bdev_event_cb_t event_cb; 8313 void *event_ctx; 8314 bool write; 8315 int rc; 8316 spdk_bdev_open_async_cb_t cb_fn; 8317 void *cb_arg; 8318 struct spdk_bdev_desc *desc; 8319 struct spdk_bdev_open_async_opts opts; 8320 uint64_t start_ticks; 8321 struct spdk_thread *orig_thread; 8322 struct spdk_poller *poller; 8323 TAILQ_ENTRY(spdk_bdev_open_async_ctx) tailq; 8324 }; 8325 8326 static void 8327 bdev_open_async_done(void *arg) 8328 { 8329 struct spdk_bdev_open_async_ctx *ctx = arg; 8330 8331 ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg); 8332 8333 free(ctx->bdev_name); 8334 free(ctx); 8335 } 8336 8337 static void 8338 bdev_open_async_cancel(void *arg) 8339 { 8340 struct spdk_bdev_open_async_ctx *ctx = arg; 8341 8342 assert(ctx->rc == -ESHUTDOWN); 8343 8344 spdk_poller_unregister(&ctx->poller); 8345 8346 bdev_open_async_done(ctx); 8347 } 8348 8349 /* This is called when the bdev library finishes at shutdown. */ 8350 static void 8351 bdev_open_async_fini(void) 8352 { 8353 struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx; 8354 8355 spdk_spin_lock(&g_bdev_mgr.spinlock); 8356 TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) { 8357 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8358 /* 8359 * We have to move to ctx->orig_thread to unregister ctx->poller. 8360 * However, there is a chance that ctx->poller is executed before 8361 * message is executed, which could result in bdev_open_async_done() 8362 * being called twice. To avoid such race condition, set ctx->rc to 8363 * -ESHUTDOWN. 
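 * _bdev_open_async() returns early when it sees -ESHUTDOWN, so only
 * bdev_open_async_cancel() completes the context.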
8364 */ 8365 ctx->rc = -ESHUTDOWN; 8366 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx); 8367 } 8368 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8369 } 8370 8371 static int bdev_open_async(void *arg); 8372 8373 static void 8374 _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx) 8375 { 8376 uint64_t timeout_ticks; 8377 8378 if (ctx->rc == -ESHUTDOWN) { 8379 /* This context is being canceled. Do nothing. */ 8380 return; 8381 } 8382 8383 ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx, 8384 &ctx->desc); 8385 if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) { 8386 goto exit; 8387 } 8388 8389 timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull; 8390 if (spdk_get_ticks() >= timeout_ticks) { 8391 SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name); 8392 ctx->rc = -ETIMEDOUT; 8393 goto exit; 8394 } 8395 8396 return; 8397 8398 exit: 8399 spdk_poller_unregister(&ctx->poller); 8400 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8401 8402 /* Completion callback is processed after stack unwinding. */ 8403 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx); 8404 } 8405 8406 static int 8407 bdev_open_async(void *arg) 8408 { 8409 struct spdk_bdev_open_async_ctx *ctx = arg; 8410 8411 spdk_spin_lock(&g_bdev_mgr.spinlock); 8412 8413 _bdev_open_async(ctx); 8414 8415 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8416 8417 return SPDK_POLLER_BUSY; 8418 } 8419 8420 static void 8421 bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts, 8422 struct spdk_bdev_open_async_opts *opts_src, 8423 size_t size) 8424 { 8425 assert(opts); 8426 assert(opts_src); 8427 8428 opts->size = size; 8429 8430 #define SET_FIELD(field) \ 8431 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8432 opts->field = opts_src->field; \ 8433 } \ 8434 8435 SET_FIELD(timeout_ms); 8436 8437 /* Do not remove this statement, you should always update this statement when you adding a new field, 8438 * and do not forget to add the SET_FIELD statement for your added field. 
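 * Adding a field also means bumping the expected size in the SPDK_STATIC_ASSERT below.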
*/ 8439 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size"); 8440 8441 #undef SET_FIELD 8442 } 8443 8444 static void 8445 bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size) 8446 { 8447 assert(opts); 8448 8449 opts->size = size; 8450 8451 #define SET_FIELD(field, value) \ 8452 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8453 opts->field = value; \ 8454 } \ 8455 8456 SET_FIELD(timeout_ms, 0); 8457 8458 #undef SET_FIELD 8459 } 8460 8461 int 8462 spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8463 void *event_ctx, struct spdk_bdev_open_async_opts *opts, 8464 spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg) 8465 { 8466 struct spdk_bdev_open_async_ctx *ctx; 8467 8468 if (event_cb == NULL) { 8469 SPDK_ERRLOG("Missing event callback function\n"); 8470 return -EINVAL; 8471 } 8472 8473 if (open_cb == NULL) { 8474 SPDK_ERRLOG("Missing open callback function\n"); 8475 return -EINVAL; 8476 } 8477 8478 if (opts != NULL && opts->size == 0) { 8479 SPDK_ERRLOG("size in the options structure should not be zero\n"); 8480 return -EINVAL; 8481 } 8482 8483 ctx = calloc(1, sizeof(*ctx)); 8484 if (ctx == NULL) { 8485 SPDK_ERRLOG("Failed to allocate open context\n"); 8486 return -ENOMEM; 8487 } 8488 8489 ctx->bdev_name = strdup(bdev_name); 8490 if (ctx->bdev_name == NULL) { 8491 SPDK_ERRLOG("Failed to duplicate bdev_name\n"); 8492 free(ctx); 8493 return -ENOMEM; 8494 } 8495 8496 ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000); 8497 if (ctx->poller == NULL) { 8498 SPDK_ERRLOG("Failed to register bdev_open_async poller\n"); 8499 free(ctx->bdev_name); 8500 free(ctx); 8501 return -ENOMEM; 8502 } 8503 8504 ctx->cb_fn = open_cb; 8505 ctx->cb_arg = open_cb_arg; 8506 ctx->write = write; 8507 ctx->event_cb = event_cb; 8508 ctx->event_ctx = event_ctx; 8509 ctx->orig_thread = spdk_get_thread(); 8510 ctx->start_ticks = spdk_get_ticks(); 8511 8512 bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts)); 8513 if (opts != NULL) { 8514 bdev_open_async_opts_copy(&ctx->opts, opts, opts->size); 8515 } 8516 8517 spdk_spin_lock(&g_bdev_mgr.spinlock); 8518 8519 TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8520 _bdev_open_async(ctx); 8521 8522 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8523 8524 return 0; 8525 } 8526 8527 static void 8528 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 8529 { 8530 int rc; 8531 8532 spdk_spin_lock(&bdev->internal.spinlock); 8533 spdk_spin_lock(&desc->spinlock); 8534 8535 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 8536 8537 desc->closed = true; 8538 8539 if (desc->claim != NULL) { 8540 bdev_desc_release_claims(desc); 8541 } 8542 8543 if (0 == desc->refs) { 8544 spdk_spin_unlock(&desc->spinlock); 8545 bdev_desc_free(desc); 8546 } else { 8547 spdk_spin_unlock(&desc->spinlock); 8548 } 8549 8550 /* If no more descriptors, kill QoS channel */ 8551 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8552 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 8553 bdev->name, spdk_get_thread()); 8554 8555 if (bdev_qos_destroy(bdev)) { 8556 /* There isn't anything we can do to recover here. Just let the 8557 * old QoS poller keep running. The QoS handling won't change 8558 * cores when the user allocates a new channel, but it won't break. */ 8559 SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); 8560 } 8561 } 8562 8563 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8564 rc = bdev_unregister_unsafe(bdev); 8565 spdk_spin_unlock(&bdev->internal.spinlock); 8566 8567 if (rc == 0) { 8568 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8569 } 8570 } else { 8571 spdk_spin_unlock(&bdev->internal.spinlock); 8572 } 8573 } 8574 8575 void 8576 spdk_bdev_close(struct spdk_bdev_desc *desc) 8577 { 8578 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8579 8580 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8581 spdk_get_thread()); 8582 8583 assert(desc->thread == spdk_get_thread()); 8584 8585 spdk_poller_unregister(&desc->io_timeout_poller); 8586 8587 spdk_spin_lock(&g_bdev_mgr.spinlock); 8588 8589 bdev_close(bdev, desc); 8590 8591 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8592 } 8593 8594 int32_t 8595 spdk_bdev_get_numa_id(struct spdk_bdev *bdev) 8596 { 8597 if (bdev->numa.id_valid) { 8598 return bdev->numa.id; 8599 } else { 8600 return SPDK_ENV_NUMA_ID_ANY; 8601 } 8602 } 8603 8604 static void 8605 bdev_register_finished(void *arg) 8606 { 8607 struct spdk_bdev_desc *desc = arg; 8608 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8609 8610 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 8611 8612 spdk_spin_lock(&g_bdev_mgr.spinlock); 8613 8614 bdev_close(bdev, desc); 8615 8616 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8617 } 8618 8619 int 8620 spdk_bdev_register(struct spdk_bdev *bdev) 8621 { 8622 struct spdk_bdev_desc *desc; 8623 struct spdk_thread *thread = spdk_get_thread(); 8624 int rc; 8625 8626 if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) { 8627 SPDK_ERRLOG("Cannot register bdev %s on thread %p (%s)\n", bdev->name, thread, 8628 thread ? 
spdk_thread_get_name(thread) : "null"); 8629 return -EINVAL; 8630 } 8631 8632 rc = bdev_register(bdev); 8633 if (rc != 0) { 8634 return rc; 8635 } 8636 8637 /* A descriptor is opened to prevent bdev deletion during examination */ 8638 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8639 if (rc != 0) { 8640 spdk_bdev_unregister(bdev, NULL, NULL); 8641 return rc; 8642 } 8643 8644 rc = bdev_open(bdev, false, desc); 8645 if (rc != 0) { 8646 bdev_desc_free(desc); 8647 spdk_bdev_unregister(bdev, NULL, NULL); 8648 return rc; 8649 } 8650 8651 /* Examine configuration before initializing I/O */ 8652 bdev_examine(bdev); 8653 8654 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 8655 if (rc != 0) { 8656 bdev_close(bdev, desc); 8657 spdk_bdev_unregister(bdev, NULL, NULL); 8658 } 8659 8660 return rc; 8661 } 8662 8663 int 8664 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 8665 struct spdk_bdev_module *module) 8666 { 8667 spdk_spin_lock(&bdev->internal.spinlock); 8668 8669 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8670 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8671 spdk_spin_unlock(&bdev->internal.spinlock); 8672 return -EPERM; 8673 } 8674 8675 if (desc && !desc->write) { 8676 desc->write = true; 8677 } 8678 8679 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 8680 bdev->internal.claim.v1.module = module; 8681 8682 spdk_spin_unlock(&bdev->internal.spinlock); 8683 return 0; 8684 } 8685 8686 void 8687 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 8688 { 8689 spdk_spin_lock(&bdev->internal.spinlock); 8690 8691 assert(bdev->internal.claim.v1.module != NULL); 8692 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 8693 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8694 bdev->internal.claim.v1.module = NULL; 8695 8696 spdk_spin_unlock(&bdev->internal.spinlock); 8697 } 8698 8699 /* 8700 * Start claims v2 8701 */ 8702 8703 const char * 8704 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 8705 { 8706 switch (type) { 8707 case SPDK_BDEV_CLAIM_NONE: 8708 return "not_claimed"; 8709 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8710 return "exclusive_write"; 8711 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8712 return "read_many_write_one"; 8713 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8714 return "read_many_write_none"; 8715 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8716 return "read_many_write_many"; 8717 default: 8718 break; 8719 } 8720 return "invalid_claim"; 8721 } 8722 8723 static bool 8724 claim_type_is_v2(enum spdk_bdev_claim_type type) 8725 { 8726 switch (type) { 8727 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8728 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8729 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8730 return true; 8731 default: 8732 break; 8733 } 8734 return false; 8735 } 8736 8737 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
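 * That is the case for the claim types that grant write access to the claiming
 * descriptor: read_many_write_one and read_many_write_shared.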
*/ 8738 static bool 8739 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 8740 { 8741 switch (type) { 8742 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8743 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8744 return true; 8745 default: 8746 break; 8747 } 8748 return false; 8749 } 8750 8751 void 8752 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 8753 { 8754 if (opts == NULL) { 8755 SPDK_ERRLOG("opts should not be NULL\n"); 8756 assert(opts != NULL); 8757 return; 8758 } 8759 if (size == 0) { 8760 SPDK_ERRLOG("size should not be zero\n"); 8761 assert(size != 0); 8762 return; 8763 } 8764 8765 memset(opts, 0, size); 8766 opts->opts_size = size; 8767 8768 #define FIELD_OK(field) \ 8769 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 8770 8771 #define SET_FIELD(field, value) \ 8772 if (FIELD_OK(field)) { \ 8773 opts->field = value; \ 8774 } \ 8775 8776 SET_FIELD(shared_claim_key, 0); 8777 8778 #undef FIELD_OK 8779 #undef SET_FIELD 8780 } 8781 8782 static int 8783 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 8784 { 8785 if (src->opts_size == 0) { 8786 SPDK_ERRLOG("size should not be zero\n"); 8787 return -1; 8788 } 8789 8790 memset(dst, 0, sizeof(*dst)); 8791 dst->opts_size = src->opts_size; 8792 8793 #define FIELD_OK(field) \ 8794 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 8795 8796 #define SET_FIELD(field) \ 8797 if (FIELD_OK(field)) { \ 8798 dst->field = src->field; \ 8799 } \ 8800 8801 if (FIELD_OK(name)) { 8802 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 8803 } 8804 8805 SET_FIELD(shared_claim_key); 8806 8807 /* You should not remove this statement, but need to update the assert statement 8808 * if you add a new field, and also add a corresponding SET_FIELD statement */ 8809 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 8810 8811 #undef FIELD_OK 8812 #undef SET_FIELD 8813 return 0; 8814 } 8815 8816 /* Returns 0 if a read-write-once claim can be taken. */ 8817 static int 8818 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8819 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8820 { 8821 struct spdk_bdev *bdev = desc->bdev; 8822 struct spdk_bdev_desc *open_desc; 8823 8824 assert(spdk_spin_held(&bdev->internal.spinlock)); 8825 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 8826 8827 if (opts->shared_claim_key != 0) { 8828 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 8829 bdev->name); 8830 return -EINVAL; 8831 } 8832 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8833 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8834 return -EPERM; 8835 } 8836 if (desc->claim != NULL) { 8837 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 8838 bdev->name, desc->claim->module->name); 8839 return -EPERM; 8840 } 8841 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8842 if (desc != open_desc && open_desc->write) { 8843 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 8844 "another descriptor is open for writing\n", 8845 bdev->name); 8846 return -EPERM; 8847 } 8848 } 8849 8850 return 0; 8851 } 8852 8853 /* Returns 0 if a read-only-many claim can be taken. 
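 *
 * For illustration only, a module would normally request this claim through the
 * public spdk_bdev_module_claim_bdev_desc() API on a descriptor that was opened
 * without write access. A minimal sketch, where my_module is a placeholder for the
 * caller's struct spdk_bdev_module:
 *
 *	struct spdk_bdev_claim_opts opts;
 *	int rc;
 *
 *	spdk_bdev_claim_opts_init(&opts, sizeof(opts));
 *	snprintf(opts.name, sizeof(opts.name), "my_module");
 *	rc = spdk_bdev_module_claim_bdev_desc(desc, SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE,
 *					      &opts, &my_module);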
*/ 8854 static int 8855 claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8856 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8857 { 8858 struct spdk_bdev *bdev = desc->bdev; 8859 struct spdk_bdev_desc *open_desc; 8860 8861 assert(spdk_spin_held(&bdev->internal.spinlock)); 8862 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE); 8863 assert(desc->claim == NULL); 8864 8865 if (desc->write) { 8866 SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n", 8867 bdev->name); 8868 return -EINVAL; 8869 } 8870 if (opts->shared_claim_key != 0) { 8871 SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name); 8872 return -EINVAL; 8873 } 8874 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8875 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8876 if (open_desc->write) { 8877 SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while " 8878 "another descriptor is open for writing\n", 8879 bdev->name); 8880 return -EPERM; 8881 } 8882 } 8883 } 8884 8885 return 0; 8886 } 8887 8888 /* Returns 0 if a read-write-many claim can be taken. */ 8889 static int 8890 claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8891 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8892 { 8893 struct spdk_bdev *bdev = desc->bdev; 8894 struct spdk_bdev_desc *open_desc; 8895 8896 assert(spdk_spin_held(&bdev->internal.spinlock)); 8897 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED); 8898 assert(desc->claim == NULL); 8899 8900 if (opts->shared_claim_key == 0) { 8901 SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n", 8902 bdev->name); 8903 return -EINVAL; 8904 } 8905 switch (bdev->internal.claim_type) { 8906 case SPDK_BDEV_CLAIM_NONE: 8907 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8908 if (open_desc == desc) { 8909 continue; 8910 } 8911 if (open_desc->write) { 8912 SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while " 8913 "another descriptor is open for writing without a " 8914 "claim\n", bdev->name); 8915 return -EPERM; 8916 } 8917 } 8918 break; 8919 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8920 if (opts->shared_claim_key != bdev->internal.claim.v2.key) { 8921 LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev); 8922 return -EPERM; 8923 } 8924 break; 8925 default: 8926 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8927 return -EBUSY; 8928 } 8929 8930 return 0; 8931 } 8932 8933 /* Updates desc and its bdev with a v2 claim.
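 * The new claim is linked into bdev->internal.claim.v2.claims and is released via
 * bdev_desc_release_claims() when the owning descriptor is closed.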
*/ 8934 static int 8935 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8936 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8937 { 8938 struct spdk_bdev *bdev = desc->bdev; 8939 struct spdk_bdev_module_claim *claim; 8940 8941 assert(spdk_spin_held(&bdev->internal.spinlock)); 8942 assert(claim_type_is_v2(type)); 8943 assert(desc->claim == NULL); 8944 8945 claim = calloc(1, sizeof(*desc->claim)); 8946 if (claim == NULL) { 8947 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 8948 return -ENOMEM; 8949 } 8950 claim->module = module; 8951 claim->desc = desc; 8952 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 8953 memcpy(claim->name, opts->name, sizeof(claim->name)); 8954 desc->claim = claim; 8955 8956 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8957 bdev->internal.claim_type = type; 8958 TAILQ_INIT(&bdev->internal.claim.v2.claims); 8959 bdev->internal.claim.v2.key = opts->shared_claim_key; 8960 } 8961 assert(type == bdev->internal.claim_type); 8962 8963 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 8964 8965 if (!desc->write && claim_type_promotes_to_write(type)) { 8966 desc->write = true; 8967 } 8968 8969 return 0; 8970 } 8971 8972 int 8973 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8974 struct spdk_bdev_claim_opts *_opts, 8975 struct spdk_bdev_module *module) 8976 { 8977 struct spdk_bdev *bdev; 8978 struct spdk_bdev_claim_opts opts; 8979 int rc = 0; 8980 8981 if (desc == NULL) { 8982 SPDK_ERRLOG("descriptor must not be NULL\n"); 8983 return -EINVAL; 8984 } 8985 8986 bdev = desc->bdev; 8987 8988 if (_opts == NULL) { 8989 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 8990 } else if (claim_opts_copy(_opts, &opts) != 0) { 8991 return -EINVAL; 8992 } 8993 8994 spdk_spin_lock(&bdev->internal.spinlock); 8995 8996 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 8997 bdev->internal.claim_type != type) { 8998 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8999 spdk_spin_unlock(&bdev->internal.spinlock); 9000 return -EPERM; 9001 } 9002 9003 if (claim_type_is_v2(type) && desc->claim != NULL) { 9004 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 9005 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 9006 spdk_spin_unlock(&bdev->internal.spinlock); 9007 return -EPERM; 9008 } 9009 9010 switch (type) { 9011 case SPDK_BDEV_CLAIM_EXCL_WRITE: 9012 spdk_spin_unlock(&bdev->internal.spinlock); 9013 return spdk_bdev_module_claim_bdev(bdev, desc, module); 9014 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 9015 rc = claim_verify_rwo(desc, type, &opts, module); 9016 break; 9017 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 9018 rc = claim_verify_rom(desc, type, &opts, module); 9019 break; 9020 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 9021 rc = claim_verify_rwm(desc, type, &opts, module); 9022 break; 9023 default: 9024 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 9025 rc = -ENOTSUP; 9026 } 9027 9028 if (rc == 0) { 9029 rc = claim_bdev(desc, type, &opts, module); 9030 } 9031 9032 spdk_spin_unlock(&bdev->internal.spinlock); 9033 return rc; 9034 } 9035 9036 static void 9037 claim_reset(struct spdk_bdev *bdev) 9038 { 9039 assert(spdk_spin_held(&bdev->internal.spinlock)); 9040 assert(claim_type_is_v2(bdev->internal.claim_type)); 9041 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 9042 9043 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 9044 
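	/* With the claim union zeroed, return the bdev to the unclaimed state. */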
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 9045 } 9046 9047 static void 9048 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 9049 { 9050 struct spdk_bdev *bdev = desc->bdev; 9051 9052 assert(spdk_spin_held(&bdev->internal.spinlock)); 9053 assert(claim_type_is_v2(bdev->internal.claim_type)); 9054 9055 if (bdev->internal.examine_in_progress == 0) { 9056 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 9057 free(desc->claim); 9058 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 9059 claim_reset(bdev); 9060 } 9061 } else { 9062 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 9063 desc->claim->module = NULL; 9064 desc->claim->desc = NULL; 9065 } 9066 desc->claim = NULL; 9067 } 9068 9069 /* 9070 * End claims v2 9071 */ 9072 9073 struct spdk_bdev * 9074 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 9075 { 9076 assert(desc != NULL); 9077 return desc->bdev; 9078 } 9079 9080 int 9081 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 9082 { 9083 struct spdk_bdev *bdev, *tmp; 9084 struct spdk_bdev_desc *desc; 9085 int rc = 0; 9086 9087 assert(fn != NULL); 9088 9089 spdk_spin_lock(&g_bdev_mgr.spinlock); 9090 bdev = spdk_bdev_first(); 9091 while (bdev != NULL) { 9092 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 9093 if (rc != 0) { 9094 break; 9095 } 9096 rc = bdev_open(bdev, false, desc); 9097 if (rc != 0) { 9098 bdev_desc_free(desc); 9099 if (rc == -ENODEV) { 9100 /* Ignore the error and move to the next bdev. */ 9101 rc = 0; 9102 bdev = spdk_bdev_next(bdev); 9103 continue; 9104 } 9105 break; 9106 } 9107 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9108 9109 rc = fn(ctx, bdev); 9110 9111 spdk_spin_lock(&g_bdev_mgr.spinlock); 9112 tmp = spdk_bdev_next(bdev); 9113 bdev_close(bdev, desc); 9114 if (rc != 0) { 9115 break; 9116 } 9117 bdev = tmp; 9118 } 9119 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9120 9121 return rc; 9122 } 9123 9124 int 9125 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 9126 { 9127 struct spdk_bdev *bdev, *tmp; 9128 struct spdk_bdev_desc *desc; 9129 int rc = 0; 9130 9131 assert(fn != NULL); 9132 9133 spdk_spin_lock(&g_bdev_mgr.spinlock); 9134 bdev = spdk_bdev_first_leaf(); 9135 while (bdev != NULL) { 9136 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 9137 if (rc != 0) { 9138 break; 9139 } 9140 rc = bdev_open(bdev, false, desc); 9141 if (rc != 0) { 9142 bdev_desc_free(desc); 9143 if (rc == -ENODEV) { 9144 /* Ignore the error and move to the next bdev. 
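 * A return of -ENODEV simply means the bdev is already being unregistered.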
*/ 9145 rc = 0; 9146 bdev = spdk_bdev_next_leaf(bdev); 9147 continue; 9148 } 9149 break; 9150 } 9151 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9152 9153 rc = fn(ctx, bdev); 9154 9155 spdk_spin_lock(&g_bdev_mgr.spinlock); 9156 tmp = spdk_bdev_next_leaf(bdev); 9157 bdev_close(bdev, desc); 9158 if (rc != 0) { 9159 break; 9160 } 9161 bdev = tmp; 9162 } 9163 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9164 9165 return rc; 9166 } 9167 9168 void 9169 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 9170 { 9171 struct iovec *iovs; 9172 int iovcnt; 9173 9174 if (bdev_io == NULL) { 9175 return; 9176 } 9177 9178 switch (bdev_io->type) { 9179 case SPDK_BDEV_IO_TYPE_READ: 9180 case SPDK_BDEV_IO_TYPE_WRITE: 9181 case SPDK_BDEV_IO_TYPE_ZCOPY: 9182 iovs = bdev_io->u.bdev.iovs; 9183 iovcnt = bdev_io->u.bdev.iovcnt; 9184 break; 9185 default: 9186 iovs = NULL; 9187 iovcnt = 0; 9188 break; 9189 } 9190 9191 if (iovp) { 9192 *iovp = iovs; 9193 } 9194 if (iovcntp) { 9195 *iovcntp = iovcnt; 9196 } 9197 } 9198 9199 void * 9200 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 9201 { 9202 if (bdev_io == NULL) { 9203 return NULL; 9204 } 9205 9206 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 9207 return NULL; 9208 } 9209 9210 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 9211 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 9212 return bdev_io->u.bdev.md_buf; 9213 } 9214 9215 return NULL; 9216 } 9217 9218 void * 9219 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 9220 { 9221 if (bdev_io == NULL) { 9222 assert(false); 9223 return NULL; 9224 } 9225 9226 return bdev_io->internal.caller_ctx; 9227 } 9228 9229 void 9230 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 9231 { 9232 9233 if (spdk_bdev_module_list_find(bdev_module->name)) { 9234 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 9235 assert(false); 9236 } 9237 9238 spdk_spin_init(&bdev_module->internal.spinlock); 9239 TAILQ_INIT(&bdev_module->internal.quiesced_ranges); 9240 9241 /* 9242 * Modules with examine callbacks must be initialized first, so they are 9243 * ready to handle examine callbacks from later modules that will 9244 * register physical bdevs. 
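 *
 * For reference, modules normally reach this function through the
 * SPDK_BDEV_MODULE_REGISTER() constructor macro. An illustrative sketch, in which
 * my_module_init and my_examine_config are placeholder callbacks:
 *
 *	static struct spdk_bdev_module my_bdev_module = {
 *		.name = "my_module",
 *		.module_init = my_module_init,
 *		.examine_config = my_examine_config,
 *	};
 *	SPDK_BDEV_MODULE_REGISTER(my_module, &my_bdev_module)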
9245 */ 9246 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 9247 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9248 } else { 9249 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9250 } 9251 } 9252 9253 struct spdk_bdev_module * 9254 spdk_bdev_module_list_find(const char *name) 9255 { 9256 struct spdk_bdev_module *bdev_module; 9257 9258 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 9259 if (strcmp(name, bdev_module->name) == 0) { 9260 break; 9261 } 9262 } 9263 9264 return bdev_module; 9265 } 9266 9267 static int 9268 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 9269 { 9270 uint64_t num_blocks; 9271 void *md_buf = NULL; 9272 9273 num_blocks = bdev_io->u.bdev.num_blocks; 9274 9275 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 9276 md_buf = (char *)g_bdev_mgr.zero_buffer + 9277 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 9278 } 9279 9280 return bdev_write_blocks_with_md(bdev_io->internal.desc, 9281 spdk_io_channel_from_ctx(bdev_io->internal.ch), 9282 g_bdev_mgr.zero_buffer, md_buf, 9283 bdev_io->u.bdev.offset_blocks, num_blocks, 9284 bdev_write_zero_buffer_done, bdev_io); 9285 } 9286 9287 static void 9288 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9289 { 9290 struct spdk_bdev_io *parent_io = cb_arg; 9291 9292 spdk_bdev_free_io(bdev_io); 9293 9294 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 9295 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 9296 } 9297 9298 static void 9299 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 9300 { 9301 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9302 ctx->bdev->internal.qos_mod_in_progress = false; 9303 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9304 9305 if (ctx->cb_fn) { 9306 ctx->cb_fn(ctx->cb_arg, status); 9307 } 9308 free(ctx); 9309 } 9310 9311 static void 9312 bdev_disable_qos_done(void *cb_arg) 9313 { 9314 struct set_qos_limit_ctx *ctx = cb_arg; 9315 struct spdk_bdev *bdev = ctx->bdev; 9316 struct spdk_bdev_qos *qos; 9317 9318 spdk_spin_lock(&bdev->internal.spinlock); 9319 qos = bdev->internal.qos; 9320 bdev->internal.qos = NULL; 9321 spdk_spin_unlock(&bdev->internal.spinlock); 9322 9323 if (qos->thread != NULL) { 9324 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 9325 spdk_poller_unregister(&qos->poller); 9326 } 9327 9328 free(qos); 9329 9330 bdev_set_qos_limit_done(ctx, 0); 9331 } 9332 9333 static void 9334 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 9335 { 9336 struct set_qos_limit_ctx *ctx = _ctx; 9337 struct spdk_thread *thread; 9338 9339 spdk_spin_lock(&bdev->internal.spinlock); 9340 thread = bdev->internal.qos->thread; 9341 spdk_spin_unlock(&bdev->internal.spinlock); 9342 9343 if (thread != NULL) { 9344 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 9345 } else { 9346 bdev_disable_qos_done(ctx); 9347 } 9348 } 9349 9350 static void 9351 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9352 struct spdk_io_channel *ch, void *_ctx) 9353 { 9354 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9355 struct spdk_bdev_io *bdev_io; 9356 9357 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 9358 9359 while (!TAILQ_EMPTY(&bdev_ch->qos_queued_io)) { 9360 /* Re-submit the queued I/O. 
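* QoS was just disabled on this channel (BDEV_CH_QOS_ENABLED cleared above), so anything still waiting in qos_queued_io is sent straight to _bdev_io_submit() without rate limiting.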
*/ 9361 bdev_io = TAILQ_FIRST(&bdev_ch->qos_queued_io); 9362 TAILQ_REMOVE(&bdev_ch->qos_queued_io, bdev_io, internal.link); 9363 _bdev_io_submit(bdev_io); 9364 } 9365 9366 spdk_bdev_for_each_channel_continue(i, 0); 9367 } 9368 9369 static void 9370 bdev_update_qos_rate_limit_msg(void *cb_arg) 9371 { 9372 struct set_qos_limit_ctx *ctx = cb_arg; 9373 struct spdk_bdev *bdev = ctx->bdev; 9374 9375 spdk_spin_lock(&bdev->internal.spinlock); 9376 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 9377 spdk_spin_unlock(&bdev->internal.spinlock); 9378 9379 bdev_set_qos_limit_done(ctx, 0); 9380 } 9381 9382 static void 9383 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9384 struct spdk_io_channel *ch, void *_ctx) 9385 { 9386 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9387 9388 spdk_spin_lock(&bdev->internal.spinlock); 9389 bdev_enable_qos(bdev, bdev_ch); 9390 spdk_spin_unlock(&bdev->internal.spinlock); 9391 spdk_bdev_for_each_channel_continue(i, 0); 9392 } 9393 9394 static void 9395 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 9396 { 9397 struct set_qos_limit_ctx *ctx = _ctx; 9398 9399 bdev_set_qos_limit_done(ctx, status); 9400 } 9401 9402 static void 9403 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 9404 { 9405 int i; 9406 9407 assert(bdev->internal.qos != NULL); 9408 9409 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9410 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9411 bdev->internal.qos->rate_limits[i].limit = limits[i]; 9412 9413 if (limits[i] == 0) { 9414 bdev->internal.qos->rate_limits[i].limit = 9415 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 9416 } 9417 } 9418 } 9419 } 9420 9421 void 9422 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 9423 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 9424 { 9425 struct set_qos_limit_ctx *ctx; 9426 uint32_t limit_set_complement; 9427 uint64_t min_limit_per_sec; 9428 int i; 9429 bool disable_rate_limit = true; 9430 9431 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9432 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9433 continue; 9434 } 9435 9436 if (limits[i] > 0) { 9437 disable_rate_limit = false; 9438 } 9439 9440 if (bdev_qos_is_iops_rate_limit(i) == true) { 9441 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 9442 } else { 9443 if (limits[i] > SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC) { 9444 SPDK_WARNLOG("Requested rate limit %" PRIu64 " will result in uint64_t overflow, " 9445 "reset to %" PRIu64 "\n", limits[i], SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC); 9446 limits[i] = SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC; 9447 } 9448 /* Change from megabyte to byte rate limit */ 9449 limits[i] = limits[i] * 1024 * 1024; 9450 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 9451 } 9452 9453 limit_set_complement = limits[i] % min_limit_per_sec; 9454 if (limit_set_complement) { 9455 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 9456 limits[i], min_limit_per_sec); 9457 limits[i] += min_limit_per_sec - limit_set_complement; 9458 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 9459 } 9460 } 9461 9462 ctx = calloc(1, sizeof(*ctx)); 9463 if (ctx == NULL) { 9464 cb_fn(cb_arg, -ENOMEM); 9465 return; 9466 } 9467 9468 ctx->cb_fn = cb_fn; 9469 ctx->cb_arg = cb_arg; 9470 ctx->bdev = bdev; 9471 9472 spdk_spin_lock(&bdev->internal.spinlock); 9473 if (bdev->internal.qos_mod_in_progress) { 9474 spdk_spin_unlock(&bdev->internal.spinlock); 9475 free(ctx); 9476 cb_fn(cb_arg, 
-EAGAIN); 9477 return; 9478 } 9479 bdev->internal.qos_mod_in_progress = true; 9480 9481 if (disable_rate_limit == true && bdev->internal.qos) { 9482 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9483 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 9484 (bdev->internal.qos->rate_limits[i].limit > 0 && 9485 bdev->internal.qos->rate_limits[i].limit != 9486 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 9487 disable_rate_limit = false; 9488 break; 9489 } 9490 } 9491 } 9492 9493 if (disable_rate_limit == false) { 9494 if (bdev->internal.qos == NULL) { 9495 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 9496 if (!bdev->internal.qos) { 9497 spdk_spin_unlock(&bdev->internal.spinlock); 9498 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 9499 bdev_set_qos_limit_done(ctx, -ENOMEM); 9500 return; 9501 } 9502 } 9503 9504 if (bdev->internal.qos->thread == NULL) { 9505 /* Enabling */ 9506 bdev_set_qos_rate_limits(bdev, limits); 9507 9508 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 9509 bdev_enable_qos_done); 9510 } else { 9511 /* Updating */ 9512 bdev_set_qos_rate_limits(bdev, limits); 9513 9514 spdk_thread_send_msg(bdev->internal.qos->thread, 9515 bdev_update_qos_rate_limit_msg, ctx); 9516 } 9517 } else { 9518 if (bdev->internal.qos != NULL) { 9519 bdev_set_qos_rate_limits(bdev, limits); 9520 9521 /* Disabling */ 9522 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 9523 bdev_disable_qos_msg_done); 9524 } else { 9525 spdk_spin_unlock(&bdev->internal.spinlock); 9526 bdev_set_qos_limit_done(ctx, 0); 9527 return; 9528 } 9529 } 9530 9531 spdk_spin_unlock(&bdev->internal.spinlock); 9532 } 9533 9534 struct spdk_bdev_histogram_ctx { 9535 spdk_bdev_histogram_status_cb cb_fn; 9536 void *cb_arg; 9537 struct spdk_bdev *bdev; 9538 int status; 9539 }; 9540 9541 static void 9542 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9543 { 9544 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9545 9546 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9547 ctx->bdev->internal.histogram_in_progress = false; 9548 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9549 ctx->cb_fn(ctx->cb_arg, ctx->status); 9550 free(ctx); 9551 } 9552 9553 static void 9554 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9555 struct spdk_io_channel *_ch, void *_ctx) 9556 { 9557 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9558 9559 if (ch->histogram != NULL) { 9560 spdk_histogram_data_free(ch->histogram); 9561 ch->histogram = NULL; 9562 } 9563 spdk_bdev_for_each_channel_continue(i, 0); 9564 } 9565 9566 static void 9567 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9568 { 9569 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9570 9571 if (status != 0) { 9572 ctx->status = status; 9573 ctx->bdev->internal.histogram_enabled = false; 9574 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 9575 bdev_histogram_disable_channel_cb); 9576 } else { 9577 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9578 ctx->bdev->internal.histogram_in_progress = false; 9579 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9580 ctx->cb_fn(ctx->cb_arg, ctx->status); 9581 free(ctx); 9582 } 9583 } 9584 9585 static void 9586 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9587 struct spdk_io_channel *_ch, void *_ctx) 9588 { 9589 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9590 int status = 0; 9591 9592 if (ch->histogram == NULL) { 9593 
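/* Lazily allocate the per-channel histogram; if the allocation fails, the error is reported back through spdk_bdev_for_each_channel_continue() as -ENOMEM. */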
ch->histogram = spdk_histogram_data_alloc(); 9594 if (ch->histogram == NULL) { 9595 status = -ENOMEM; 9596 } 9597 } 9598 9599 spdk_bdev_for_each_channel_continue(i, status); 9600 } 9601 9602 void 9603 spdk_bdev_histogram_enable_ext(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9604 void *cb_arg, bool enable, struct spdk_bdev_enable_histogram_opts *opts) 9605 { 9606 struct spdk_bdev_histogram_ctx *ctx; 9607 9608 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 9609 if (ctx == NULL) { 9610 cb_fn(cb_arg, -ENOMEM); 9611 return; 9612 } 9613 9614 ctx->bdev = bdev; 9615 ctx->status = 0; 9616 ctx->cb_fn = cb_fn; 9617 ctx->cb_arg = cb_arg; 9618 9619 spdk_spin_lock(&bdev->internal.spinlock); 9620 if (bdev->internal.histogram_in_progress) { 9621 spdk_spin_unlock(&bdev->internal.spinlock); 9622 free(ctx); 9623 cb_fn(cb_arg, -EAGAIN); 9624 return; 9625 } 9626 9627 bdev->internal.histogram_in_progress = true; 9628 spdk_spin_unlock(&bdev->internal.spinlock); 9629 9630 bdev->internal.histogram_enabled = enable; 9631 bdev->internal.histogram_io_type = opts->io_type; 9632 9633 if (enable) { 9634 /* Allocate histogram for each channel */ 9635 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 9636 bdev_histogram_enable_channel_cb); 9637 } else { 9638 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 9639 bdev_histogram_disable_channel_cb); 9640 } 9641 } 9642 9643 void 9644 spdk_bdev_enable_histogram_opts_init(struct spdk_bdev_enable_histogram_opts *opts, size_t size) 9645 { 9646 if (opts == NULL) { 9647 SPDK_ERRLOG("opts should not be NULL\n"); 9648 assert(opts != NULL); 9649 return; 9650 } 9651 if (size == 0) { 9652 SPDK_ERRLOG("size should not be zero\n"); 9653 assert(size != 0); 9654 return; 9655 } 9656 9657 memset(opts, 0, size); 9658 opts->size = size; 9659 9660 #define FIELD_OK(field) \ 9661 offsetof(struct spdk_bdev_enable_histogram_opts, field) + sizeof(opts->field) <= size 9662 9663 #define SET_FIELD(field, value) \ 9664 if (FIELD_OK(field)) { \ 9665 opts->field = value; \ 9666 } \ 9667 9668 SET_FIELD(io_type, 0); 9669 9670 /* You should not remove this statement, but need to update the assert statement 9671 * if you add a new field, and also add a corresponding SET_FIELD statement */ 9672 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_enable_histogram_opts) == 9, "Incorrect size"); 9673 9674 #undef FIELD_OK 9675 #undef SET_FIELD 9676 } 9677 9678 void 9679 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9680 void *cb_arg, bool enable) 9681 { 9682 struct spdk_bdev_enable_histogram_opts opts; 9683 9684 spdk_bdev_enable_histogram_opts_init(&opts, sizeof(opts)); 9685 spdk_bdev_histogram_enable_ext(bdev, cb_fn, cb_arg, enable, &opts); 9686 } 9687 9688 struct spdk_bdev_histogram_data_ctx { 9689 spdk_bdev_histogram_data_cb cb_fn; 9690 void *cb_arg; 9691 struct spdk_bdev *bdev; 9692 /** merged histogram data from all channels */ 9693 struct spdk_histogram_data *histogram; 9694 }; 9695 9696 static void 9697 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9698 { 9699 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9700 9701 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 9702 free(ctx); 9703 } 9704 9705 static void 9706 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9707 struct spdk_io_channel *_ch, void *_ctx) 9708 { 9709 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9710 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9711 int 
status = 0; 9712 9713 if (ch->histogram == NULL) { 9714 status = -EFAULT; 9715 } else { 9716 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 9717 } 9718 9719 spdk_bdev_for_each_channel_continue(i, status); 9720 } 9721 9722 void 9723 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 9724 spdk_bdev_histogram_data_cb cb_fn, 9725 void *cb_arg) 9726 { 9727 struct spdk_bdev_histogram_data_ctx *ctx; 9728 9729 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 9730 if (ctx == NULL) { 9731 cb_fn(cb_arg, -ENOMEM, NULL); 9732 return; 9733 } 9734 9735 ctx->bdev = bdev; 9736 ctx->cb_fn = cb_fn; 9737 ctx->cb_arg = cb_arg; 9738 9739 ctx->histogram = histogram; 9740 9741 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 9742 bdev_histogram_get_channel_cb); 9743 } 9744 9745 void 9746 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 9747 void *cb_arg) 9748 { 9749 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9750 int status = 0; 9751 9752 assert(cb_fn != NULL); 9753 9754 if (bdev_ch->histogram == NULL) { 9755 status = -EFAULT; 9756 } 9757 cb_fn(cb_arg, status, bdev_ch->histogram); 9758 } 9759 9760 size_t 9761 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 9762 size_t max_events) 9763 { 9764 struct media_event_entry *entry; 9765 size_t num_events = 0; 9766 9767 for (; num_events < max_events; ++num_events) { 9768 entry = TAILQ_FIRST(&desc->pending_media_events); 9769 if (entry == NULL) { 9770 break; 9771 } 9772 9773 events[num_events] = entry->event; 9774 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 9775 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 9776 } 9777 9778 return num_events; 9779 } 9780 9781 int 9782 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 9783 size_t num_events) 9784 { 9785 struct spdk_bdev_desc *desc; 9786 struct media_event_entry *entry; 9787 size_t event_id; 9788 int rc = 0; 9789 9790 assert(bdev->media_events); 9791 9792 spdk_spin_lock(&bdev->internal.spinlock); 9793 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9794 if (desc->write) { 9795 break; 9796 } 9797 } 9798 9799 if (desc == NULL || desc->media_events_buffer == NULL) { 9800 rc = -ENODEV; 9801 goto out; 9802 } 9803 9804 for (event_id = 0; event_id < num_events; ++event_id) { 9805 entry = TAILQ_FIRST(&desc->free_media_events); 9806 if (entry == NULL) { 9807 break; 9808 } 9809 9810 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 9811 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 9812 entry->event = events[event_id]; 9813 } 9814 9815 rc = event_id; 9816 out: 9817 spdk_spin_unlock(&bdev->internal.spinlock); 9818 return rc; 9819 } 9820 9821 static void 9822 _media_management_notify(void *arg) 9823 { 9824 struct spdk_bdev_desc *desc = arg; 9825 9826 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 9827 } 9828 9829 void 9830 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 9831 { 9832 struct spdk_bdev_desc *desc; 9833 9834 spdk_spin_lock(&bdev->internal.spinlock); 9835 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9836 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 9837 event_notify(desc, _media_management_notify); 9838 } 9839 } 9840 spdk_spin_unlock(&bdev->internal.spinlock); 9841 } 9842 9843 struct locked_lba_range_ctx { 9844 struct lba_range range; 9845 struct lba_range *current_range; 9846 struct lba_range *owner_range; 9847 
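/* Poller used by bdev_lock_lba_range_check_io() to wait for in-flight I/O overlapping the range to complete. */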
struct spdk_poller *poller; 9848 lock_range_cb cb_fn; 9849 void *cb_arg; 9850 }; 9851 9852 static void 9853 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9854 { 9855 struct locked_lba_range_ctx *ctx = _ctx; 9856 9857 ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM); 9858 free(ctx); 9859 } 9860 9861 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 9862 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 9863 9864 static void 9865 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9866 { 9867 struct locked_lba_range_ctx *ctx = _ctx; 9868 9869 if (status == -ENOMEM) { 9870 /* One of the channels could not allocate a range object. 9871 * So we have to go back and clean up any ranges that were 9872 * allocated successfully before we return error status to 9873 * the caller. We can reuse the unlock function to do that 9874 * clean up. 9875 */ 9876 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9877 bdev_lock_error_cleanup_cb); 9878 return; 9879 } 9880 9881 /* All channels have locked this range and no I/O overlapping the range 9882 * are outstanding! Set the owner_ch for the range object for the 9883 * locking channel, so that this channel will know that it is allowed 9884 * to write to this range. 9885 */ 9886 if (ctx->owner_range != NULL) { 9887 ctx->owner_range->owner_ch = ctx->range.owner_ch; 9888 } 9889 9890 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 9891 9892 /* Don't free the ctx here. Its range is in the bdev's global list of 9893 * locked ranges still, and will be removed and freed when this range 9894 * is later unlocked. 9895 */ 9896 } 9897 9898 static int 9899 bdev_lock_lba_range_check_io(void *_i) 9900 { 9901 struct spdk_bdev_channel_iter *i = _i; 9902 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 9903 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9904 struct locked_lba_range_ctx *ctx = i->ctx; 9905 struct lba_range *range = ctx->current_range; 9906 struct spdk_bdev_io *bdev_io; 9907 9908 spdk_poller_unregister(&ctx->poller); 9909 9910 /* The range is now in the locked_ranges, so no new IO can be submitted to this 9911 * range. But we need to wait until any outstanding IO overlapping with this range 9912 * are completed. 9913 */ 9914 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 9915 if (bdev_io_range_is_locked(bdev_io, range)) { 9916 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 9917 return SPDK_POLLER_BUSY; 9918 } 9919 } 9920 9921 spdk_bdev_for_each_channel_continue(i, 0); 9922 return SPDK_POLLER_BUSY; 9923 } 9924 9925 static void 9926 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9927 struct spdk_io_channel *_ch, void *_ctx) 9928 { 9929 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9930 struct locked_lba_range_ctx *ctx = _ctx; 9931 struct lba_range *range; 9932 9933 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9934 if (range->length == ctx->range.length && 9935 range->offset == ctx->range.offset && 9936 range->locked_ctx == ctx->range.locked_ctx) { 9937 /* This range already exists on this channel, so don't add 9938 * it again. This can happen when a new channel is created 9939 * while the for_each_channel operation is in progress. 9940 * Do not check for outstanding I/O in that case, since the 9941 * range was locked before any I/O could be submitted to the 9942 * new channel. 
9943 */ 9944 spdk_bdev_for_each_channel_continue(i, 0); 9945 return; 9946 } 9947 } 9948 9949 range = calloc(1, sizeof(*range)); 9950 if (range == NULL) { 9951 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 9952 return; 9953 } 9954 9955 range->length = ctx->range.length; 9956 range->offset = ctx->range.offset; 9957 range->locked_ctx = ctx->range.locked_ctx; 9958 range->quiesce = ctx->range.quiesce; 9959 ctx->current_range = range; 9960 if (ctx->range.owner_ch == ch) { 9961 /* This is the range object for the channel that will hold 9962 * the lock. Store it in the ctx object so that we can easily 9963 * set its owner_ch after the lock is finally acquired. 9964 */ 9965 ctx->owner_range = range; 9966 } 9967 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 9968 bdev_lock_lba_range_check_io(i); 9969 } 9970 9971 static void 9972 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 9973 { 9974 assert(spdk_get_thread() == ctx->range.owner_thread); 9975 assert(ctx->range.owner_ch == NULL || 9976 spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread); 9977 9978 /* We will add a copy of this range to each channel now. */ 9979 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 9980 bdev_lock_lba_range_cb); 9981 } 9982 9983 static bool 9984 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 9985 { 9986 struct lba_range *r; 9987 9988 TAILQ_FOREACH(r, tailq, tailq) { 9989 if (bdev_lba_range_overlapped(range, r)) { 9990 return true; 9991 } 9992 } 9993 return false; 9994 } 9995 9996 static void bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status); 9997 9998 static int 9999 _bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch, 10000 uint64_t offset, uint64_t length, 10001 lock_range_cb cb_fn, void *cb_arg) 10002 { 10003 struct locked_lba_range_ctx *ctx; 10004 10005 ctx = calloc(1, sizeof(*ctx)); 10006 if (ctx == NULL) { 10007 return -ENOMEM; 10008 } 10009 10010 ctx->range.offset = offset; 10011 ctx->range.length = length; 10012 ctx->range.owner_thread = spdk_get_thread(); 10013 ctx->range.owner_ch = ch; 10014 ctx->range.locked_ctx = cb_arg; 10015 ctx->range.bdev = bdev; 10016 ctx->range.quiesce = (cb_fn == bdev_quiesce_range_locked); 10017 ctx->cb_fn = cb_fn; 10018 ctx->cb_arg = cb_arg; 10019 10020 spdk_spin_lock(&bdev->internal.spinlock); 10021 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 10022 /* There is an active lock overlapping with this range. 10023 * Put it on the pending list until this range no 10024 * longer overlaps with another. 
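* bdev_unlock_lba_range_cb() re-checks the pending list and starts this lock once the conflicting range is released.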
10025 */ 10026 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 10027 } else { 10028 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 10029 bdev_lock_lba_range_ctx(bdev, ctx); 10030 } 10031 spdk_spin_unlock(&bdev->internal.spinlock); 10032 return 0; 10033 } 10034 10035 static int 10036 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 10037 uint64_t offset, uint64_t length, 10038 lock_range_cb cb_fn, void *cb_arg) 10039 { 10040 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10041 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10042 10043 if (cb_arg == NULL) { 10044 SPDK_ERRLOG("cb_arg must not be NULL\n"); 10045 return -EINVAL; 10046 } 10047 10048 return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg); 10049 } 10050 10051 static void 10052 bdev_lock_lba_range_ctx_msg(void *_ctx) 10053 { 10054 struct locked_lba_range_ctx *ctx = _ctx; 10055 10056 bdev_lock_lba_range_ctx(ctx->range.bdev, ctx); 10057 } 10058 10059 static void 10060 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 10061 { 10062 struct locked_lba_range_ctx *ctx = _ctx; 10063 struct locked_lba_range_ctx *pending_ctx; 10064 struct lba_range *range, *tmp; 10065 10066 spdk_spin_lock(&bdev->internal.spinlock); 10067 /* Check if there are any pending locked ranges that overlap with this range 10068 * that was just unlocked. If there are, check that it doesn't overlap with any 10069 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 10070 * the lock process. 10071 */ 10072 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 10073 if (bdev_lba_range_overlapped(range, &ctx->range) && 10074 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 10075 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 10076 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 10077 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 10078 spdk_thread_send_msg(pending_ctx->range.owner_thread, 10079 bdev_lock_lba_range_ctx_msg, pending_ctx); 10080 } 10081 } 10082 spdk_spin_unlock(&bdev->internal.spinlock); 10083 10084 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 10085 free(ctx); 10086 } 10087 10088 static void 10089 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10090 struct spdk_io_channel *_ch, void *_ctx) 10091 { 10092 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10093 struct locked_lba_range_ctx *ctx = _ctx; 10094 TAILQ_HEAD(, spdk_bdev_io) io_locked; 10095 struct spdk_bdev_io *bdev_io; 10096 struct lba_range *range; 10097 10098 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10099 if (ctx->range.offset == range->offset && 10100 ctx->range.length == range->length && 10101 ctx->range.locked_ctx == range->locked_ctx) { 10102 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 10103 free(range); 10104 break; 10105 } 10106 } 10107 10108 /* Note: we should almost always be able to assert that the range specified 10109 * was found. But there are some very rare corner cases where a new channel 10110 * gets created simultaneously with a range unlock, where this function 10111 * would execute on that new channel and wouldn't have the range. 10112 * We also use this to clean up range allocations when a later allocation 10113 * fails in the locking path. 10114 * So we can't actually assert() here. 
10115 */ 10116 10117 /* Swap the locked IO into a temporary list, and then try to submit them again. 10118 * We could hyper-optimize this to only resubmit locked I/O that overlap 10119 * with the range that was just unlocked, but this isn't a performance path so 10120 * we go for simplicity here. 10121 */ 10122 TAILQ_INIT(&io_locked); 10123 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 10124 while (!TAILQ_EMPTY(&io_locked)) { 10125 bdev_io = TAILQ_FIRST(&io_locked); 10126 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 10127 bdev_io_submit(bdev_io); 10128 } 10129 10130 spdk_bdev_for_each_channel_continue(i, 0); 10131 } 10132 10133 static int 10134 _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length, 10135 lock_range_cb cb_fn, void *cb_arg) 10136 { 10137 struct locked_lba_range_ctx *ctx; 10138 struct lba_range *range; 10139 10140 spdk_spin_lock(&bdev->internal.spinlock); 10141 /* To start the unlock process, we find the range in the bdev's locked_ranges 10142 * and remove it. This ensures new channels don't inherit the locked range. 10143 * Then we will send a message to each channel to remove the range from its 10144 * per-channel list. 10145 */ 10146 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 10147 if (range->offset == offset && range->length == length && 10148 (range->owner_ch == NULL || range->locked_ctx == cb_arg)) { 10149 break; 10150 } 10151 } 10152 if (range == NULL) { 10153 assert(false); 10154 spdk_spin_unlock(&bdev->internal.spinlock); 10155 return -EINVAL; 10156 } 10157 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 10158 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 10159 spdk_spin_unlock(&bdev->internal.spinlock); 10160 10161 ctx->cb_fn = cb_fn; 10162 ctx->cb_arg = cb_arg; 10163 10164 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 10165 bdev_unlock_lba_range_cb); 10166 return 0; 10167 } 10168 10169 static int 10170 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 10171 uint64_t offset, uint64_t length, 10172 lock_range_cb cb_fn, void *cb_arg) 10173 { 10174 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10175 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10176 struct lba_range *range; 10177 bool range_found = false; 10178 10179 /* Let's make sure the specified channel actually has a lock on 10180 * the specified range. Note that the range must match exactly.
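* (offset, length, owning channel and locked_ctx must all match).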
10181 */ 10182 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10183 if (range->offset == offset && range->length == length && 10184 range->owner_ch == ch && range->locked_ctx == cb_arg) { 10185 range_found = true; 10186 break; 10187 } 10188 } 10189 10190 if (!range_found) { 10191 return -EINVAL; 10192 } 10193 10194 return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg); 10195 } 10196 10197 struct bdev_quiesce_ctx { 10198 spdk_bdev_quiesce_cb cb_fn; 10199 void *cb_arg; 10200 }; 10201 10202 static void 10203 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status) 10204 { 10205 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 10206 10207 if (quiesce_ctx->cb_fn != NULL) { 10208 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 10209 } 10210 10211 free(quiesce_ctx); 10212 } 10213 10214 static void 10215 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status) 10216 { 10217 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 10218 struct spdk_bdev_module *module = range->bdev->module; 10219 10220 if (status != 0) { 10221 if (quiesce_ctx->cb_fn != NULL) { 10222 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 10223 } 10224 free(quiesce_ctx); 10225 return; 10226 } 10227 10228 spdk_spin_lock(&module->internal.spinlock); 10229 TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module); 10230 spdk_spin_unlock(&module->internal.spinlock); 10231 10232 if (quiesce_ctx->cb_fn != NULL) { 10233 /* copy the context in case the range is unlocked by the callback */ 10234 struct bdev_quiesce_ctx tmp = *quiesce_ctx; 10235 10236 quiesce_ctx->cb_fn = NULL; 10237 quiesce_ctx->cb_arg = NULL; 10238 10239 tmp.cb_fn(tmp.cb_arg, status); 10240 } 10241 /* quiesce_ctx will be freed on unquiesce */ 10242 } 10243 10244 static int 10245 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10246 uint64_t offset, uint64_t length, 10247 spdk_bdev_quiesce_cb cb_fn, void *cb_arg, 10248 bool unquiesce) 10249 { 10250 struct bdev_quiesce_ctx *quiesce_ctx; 10251 int rc; 10252 10253 if (module != bdev->module) { 10254 SPDK_ERRLOG("Bdev does not belong to specified module.\n"); 10255 return -EINVAL; 10256 } 10257 10258 if (!bdev_io_valid_blocks(bdev, offset, length)) { 10259 return -EINVAL; 10260 } 10261 10262 if (unquiesce) { 10263 struct lba_range *range; 10264 10265 /* Make sure the specified range is actually quiesced in the specified module and 10266 * then remove it from the list. Note that the range must match exactly. 
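* (i.e., the same bdev, offset and length that were quiesced).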
10267 */ 10268 spdk_spin_lock(&module->internal.spinlock); 10269 TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) { 10270 if (range->bdev == bdev && range->offset == offset && range->length == length) { 10271 TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module); 10272 break; 10273 } 10274 } 10275 spdk_spin_unlock(&module->internal.spinlock); 10276 10277 if (range == NULL) { 10278 SPDK_ERRLOG("The range to unquiesce was not found.\n"); 10279 return -EINVAL; 10280 } 10281 10282 quiesce_ctx = range->locked_ctx; 10283 quiesce_ctx->cb_fn = cb_fn; 10284 quiesce_ctx->cb_arg = cb_arg; 10285 10286 rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx); 10287 } else { 10288 quiesce_ctx = malloc(sizeof(*quiesce_ctx)); 10289 if (quiesce_ctx == NULL) { 10290 return -ENOMEM; 10291 } 10292 10293 quiesce_ctx->cb_fn = cb_fn; 10294 quiesce_ctx->cb_arg = cb_arg; 10295 10296 rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx); 10297 if (rc != 0) { 10298 free(quiesce_ctx); 10299 } 10300 } 10301 10302 return rc; 10303 } 10304 10305 int 10306 spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10307 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10308 { 10309 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false); 10310 } 10311 10312 int 10313 spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10314 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10315 { 10316 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true); 10317 } 10318 10319 int 10320 spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10321 uint64_t offset, uint64_t length, 10322 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10323 { 10324 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false); 10325 } 10326 10327 int 10328 spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10329 uint64_t offset, uint64_t length, 10330 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10331 { 10332 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true); 10333 } 10334 10335 int 10336 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 10337 int array_size) 10338 { 10339 if (!bdev) { 10340 return -EINVAL; 10341 } 10342 10343 if (bdev->fn_table->get_memory_domains) { 10344 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 10345 } 10346 10347 return 0; 10348 } 10349 10350 struct spdk_bdev_for_each_io_ctx { 10351 void *ctx; 10352 spdk_bdev_io_fn fn; 10353 spdk_bdev_for_each_io_cb cb; 10354 }; 10355 10356 static void 10357 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10358 struct spdk_io_channel *io_ch, void *_ctx) 10359 { 10360 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10361 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 10362 struct spdk_bdev_io *bdev_io; 10363 int rc = 0; 10364 10365 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 10366 rc = ctx->fn(ctx->ctx, bdev_io); 10367 if (rc != 0) { 10368 break; 10369 } 10370 } 10371 10372 spdk_bdev_for_each_channel_continue(i, rc); 10373 } 10374 10375 static void 10376 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 10377 { 10378 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10379 10380 ctx->cb(ctx->ctx, status); 10381 10382 free(ctx); 10383 } 10384 10385 void 10386 
spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 10387 spdk_bdev_for_each_io_cb cb) 10388 { 10389 struct spdk_bdev_for_each_io_ctx *ctx; 10390 10391 assert(fn != NULL && cb != NULL); 10392 10393 ctx = calloc(1, sizeof(*ctx)); 10394 if (ctx == NULL) { 10395 SPDK_ERRLOG("Failed to allocate context.\n"); 10396 cb(_ctx, -ENOMEM); 10397 return; 10398 } 10399 10400 ctx->ctx = _ctx; 10401 ctx->fn = fn; 10402 ctx->cb = cb; 10403 10404 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 10405 bdev_for_each_io_done); 10406 } 10407 10408 void 10409 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 10410 { 10411 spdk_for_each_channel_continue(iter->i, status); 10412 } 10413 10414 static struct spdk_bdev * 10415 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 10416 { 10417 void *io_device = spdk_io_channel_iter_get_io_device(i); 10418 10419 return __bdev_from_io_dev(io_device); 10420 } 10421 10422 static void 10423 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 10424 { 10425 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10426 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10427 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 10428 10429 iter->i = i; 10430 iter->fn(iter, bdev, ch, iter->ctx); 10431 } 10432 10433 static void 10434 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 10435 { 10436 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10437 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10438 10439 iter->i = i; 10440 iter->cpl(bdev, iter->ctx, status); 10441 10442 free(iter); 10443 } 10444 10445 void 10446 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 10447 void *ctx, spdk_bdev_for_each_channel_done cpl) 10448 { 10449 struct spdk_bdev_channel_iter *iter; 10450 10451 assert(bdev != NULL && fn != NULL && ctx != NULL); 10452 10453 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 10454 if (iter == NULL) { 10455 SPDK_ERRLOG("Unable to allocate iterator\n"); 10456 assert(false); 10457 return; 10458 } 10459 10460 iter->fn = fn; 10461 iter->cpl = cpl; 10462 iter->ctx = ctx; 10463 10464 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 10465 iter, bdev_each_channel_cpl); 10466 } 10467 10468 static void 10469 bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10470 { 10471 struct spdk_bdev_io *parent_io = cb_arg; 10472 10473 spdk_bdev_free_io(bdev_io); 10474 10475 /* Check return status of write */ 10476 parent_io->internal.status = success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 10477 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 10478 } 10479 10480 static void 10481 bdev_copy_do_write(void *_bdev_io) 10482 { 10483 struct spdk_bdev_io *bdev_io = _bdev_io; 10484 int rc; 10485 10486 /* Write blocks */ 10487 rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc, 10488 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10489 bdev_io->u.bdev.iovs[0].iov_base, 10490 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks, 10491 bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io); 10492 10493 if (rc == -ENOMEM) { 10494 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write); 10495 } else if (rc != 0) { 10496 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10497 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10498 } 10499 } 10500 10501 static void 10502 bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10503 { 10504 struct spdk_bdev_io *parent_io = cb_arg; 10505 10506 spdk_bdev_free_io(bdev_io); 10507 10508 /* Check return status of read */ 10509 if (!success) { 10510 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10511 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 10512 return; 10513 } 10514 10515 /* Do write */ 10516 bdev_copy_do_write(parent_io); 10517 } 10518 10519 static void 10520 bdev_copy_do_read(void *_bdev_io) 10521 { 10522 struct spdk_bdev_io *bdev_io = _bdev_io; 10523 int rc; 10524 10525 /* Read blocks */ 10526 rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc, 10527 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10528 bdev_io->u.bdev.iovs[0].iov_base, 10529 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks, 10530 bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io); 10531 10532 if (rc == -ENOMEM) { 10533 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read); 10534 } else if (rc != 0) { 10535 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10536 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10537 } 10538 } 10539 10540 static void 10541 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 10542 { 10543 if (!success) { 10544 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10545 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10546 return; 10547 } 10548 10549 bdev_copy_do_read(bdev_io); 10550 } 10551 10552 int 10553 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 10554 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 10555 spdk_bdev_io_completion_cb cb, void *cb_arg) 10556 { 10557 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10558 struct spdk_bdev_io *bdev_io; 10559 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 10560 10561 if (!desc->write) { 10562 return -EBADF; 10563 } 10564 10565 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 10566 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 10567 SPDK_DEBUGLOG(bdev, 10568 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 10569 dst_offset_blocks, src_offset_blocks, num_blocks); 10570 return -EINVAL; 10571 } 10572 10573 bdev_io = bdev_channel_get_io(channel); 10574 if (!bdev_io) { 10575 return -ENOMEM; 10576 } 10577 10578 bdev_io->internal.ch = channel; 10579 bdev_io->internal.desc = desc; 10580 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 10581 10582 
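/* For a copy, the generic offset_blocks field carries the destination offset; the source offset is stored separately in u.bdev.copy. */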
bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 10583 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 10584 bdev_io->u.bdev.num_blocks = num_blocks; 10585 bdev_io->u.bdev.memory_domain = NULL; 10586 bdev_io->u.bdev.memory_domain_ctx = NULL; 10587 bdev_io->u.bdev.iovs = NULL; 10588 bdev_io->u.bdev.iovcnt = 0; 10589 bdev_io->u.bdev.md_buf = NULL; 10590 bdev_io->u.bdev.accel_sequence = NULL; 10591 bdev_io_init(bdev_io, bdev, cb_arg, cb); 10592 10593 if (dst_offset_blocks == src_offset_blocks || num_blocks == 0) { 10594 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 10595 return 0; 10596 } 10597 10598 10599 /* If the copy size is large and should be split, use the generic split logic 10600 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not. 10601 * 10602 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or 10603 * emulate it using regular read and write requests otherwise. 10604 */ 10605 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) || 10606 bdev_io->internal.f.split) { 10607 bdev_io_submit(bdev_io); 10608 return 0; 10609 } 10610 10611 spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev)); 10612 10613 return 0; 10614 } 10615 10616 SPDK_LOG_REGISTER_COMPONENT(bdev) 10617 10618 static void 10619 bdev_trace(void) 10620 { 10621 struct spdk_trace_tpoint_opts opts[] = { 10622 { 10623 "BDEV_IO_START", TRACE_BDEV_IO_START, 10624 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 1, 10625 { 10626 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10627 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 10628 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10629 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 10630 } 10631 }, 10632 { 10633 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 10634 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 0, 10635 { 10636 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 10637 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 10638 } 10639 }, 10640 { 10641 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 10642 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 10643 { 10644 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 10645 } 10646 }, 10647 { 10648 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 10649 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 10650 { 10651 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 10652 } 10653 }, 10654 }; 10655 10656 10657 spdk_trace_register_owner_type(OWNER_TYPE_BDEV, 'b'); 10658 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 10659 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 10660 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 10661 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 10662 spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_START, OBJECT_BDEV_IO, 0); 10663 spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_COMPLETE, OBJECT_BDEV_IO, 0); 10664 spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_START, OBJECT_BDEV_IO, 0); 10665 spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_DONE, OBJECT_BDEV_IO, 0); 10666 } 10667 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 10668
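/*
 * Illustrative usage sketch (comment only, not compiled): submitting a copy with
 * spdk_bdev_copy_blocks() as defined above. The callback and variable names here
 * are hypothetical. A return of -ENOMEM means no spdk_bdev_io was available and
 * the request may be retried later (for example via spdk_bdev_queue_io_wait());
 * -EBADF means the descriptor was not opened for writing.
 *
 *	static void
 *	copy_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		spdk_bdev_free_io(bdev_io);
 *		SPDK_NOTICELOG("copy %s\n", success ? "succeeded" : "failed");
 *	}
 *
 *	// desc was opened read/write; io_ch comes from spdk_bdev_get_io_channel(desc).
 *	rc = spdk_bdev_copy_blocks(desc, io_ch, dst_offset_blocks, src_offset_blocks,
 *				   num_blocks, copy_complete, NULL);
 *	if (rc == -ENOMEM) {
 *		// Queue the request and retry once a bdev_io is available.
 *	} else if (rc != 0) {
 *		// Invalid range, read-only descriptor, etc.
 *	}
 */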