/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"
#include "spdk/dma.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"
#include "spdk_internal/assert.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define SPDK_BDEV_AUTO_EXAMINE			true
#define BUF_SMALL_CACHE_SIZE			128
#define BUF_LARGE_CACHE_SIZE			16
#define NOMEM_THRESHOLD_COUNT			8

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC	(UINT64_MAX / (1024 * 1024))
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000

/* The maximum number of children requests for a UNMAP or WRITE ZEROES command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS	(8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD		1000000

/* The maximum number of children requests for a COPY command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_COPY_REQS	(8)

#define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \
	log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev)
#ifdef DEBUG
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \
	log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev)
#else
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0)
#endif

static void log_already_claimed(enum spdk_log_level level, const int line, const char *func,
				const char *detail, struct spdk_bdev *bdev);

static const char *qos_rpc_type[] = {"rw_ios_per_sec",
				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
				    };

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

RB_HEAD(bdev_name_tree, spdk_bdev_name);

static int
bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
{
	return strcmp(name1->name, name2->name);
}

RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	void *zero_buffer;

	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

	struct spdk_bdev_list bdevs;
	struct bdev_name_tree bdev_names;

	bool init_complete;
	bool module_init_complete;

	struct spdk_spinlock spinlock;

	TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain *domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
	.init_complete = false,
	.module_init_complete = false,
	.async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens),
};

static void
__attribute__((constructor))
_bdev_init(void)
{
	spdk_spin_init(&g_bdev_mgr.spinlock);
}

typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status);

typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);

struct lba_range {
	struct spdk_bdev *bdev;
	uint64_t offset;
	uint64_t length;
	bool quiesce;
	void *locked_ctx;
	struct spdk_thread *owner_thread;
	struct spdk_bdev_channel *owner_ch;
	TAILQ_ENTRY(lba_range) tailq;
	TAILQ_ENTRY(lba_range) tailq_module;
};

static struct spdk_bdev_opts g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
	.iobuf_small_cache_size = BUF_SMALL_CACHE_SIZE,
	.iobuf_large_cache_size = BUF_LARGE_CACHE_SIZE,
};

static spdk_bdev_init_cb g_init_cb_fn = NULL;
static void *g_init_cb_arg = NULL;

static spdk_bdev_fini_cb g_fini_cb_fn = NULL;
static void *g_fini_cb_arg = NULL;
static struct spdk_thread *g_fini_thread = NULL;

struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
	 * For remaining bytes, allowed to run negative if an I/O is submitted when
	 * some bytes are remaining, but the I/O is bigger than that amount. The
	 * excess will be deducted from the next timeslice.
	 */
	int64_t remaining_this_timeslice;

	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO.
	 * If the IO is allowed to pass, the quota will be reduced correspondingly.
	 */
	bool (*queue_io)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to rewind the quota if the IO was allowed to be sent by this
	 * limit but was queued due to one of the subsequent limits.
	 */
	void (*rewind_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

struct spdk_bdev_qos {
	/** Rate limits, one entry per rate limit type. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};
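
/*
 * Illustrative arithmetic (a sketch of how the fields above interact; the actual per-timeslice
 * budget calculation lives later in this file): the budget is derived from the per-second limit
 * and the timeslice length, roughly limit * SPDK_BDEV_QOS_TIMESLICE_IN_USEC / 1 second. With
 * rw_ios_per_sec = 1000 and the 1000 usec timeslice above that is 1000 * 1000 / 1000000 = 1 I/O
 * per timeslice; with a 10 MiB/s byte limit it is about 10 KiB (10485760 / 1000 bytes) per
 * timeslice. A single I/O larger than the budget drives remaining_this_timeslice negative and
 * the excess is carried forward and deducted from subsequent timeslices, as described above.
 */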

struct spdk_bdev_mgmt_channel {
	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache. Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t per_thread_cache_count;
	uint32_t bdev_io_cache_size;

	struct spdk_iobuf_channel iobuf;

	TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * queue their I/O awaiting retry here. This makes it possible to retry sending
 * I/O to one bdev after I/O from another bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel *shared_ch;

	struct spdk_poller *nomem_poller;

	/* Refcount of bdev channels using this resource */
	uint32_t ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev *bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel *channel;

	/* Accel channel */
	struct spdk_io_channel *accel_channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat *stat;

	/*
	 * Count of I/O submitted to the underlying dev module through this channel
	 * and waiting for completion.
	 */
	uint64_t io_outstanding;

	/*
	 * List of all submitted I/Os including I/O that are generated via splitting.
	 */
	bdev_io_tailq_t io_submitted;

	/*
	 * List of spdk_bdev_io that are currently queued because they write to a locked
	 * LBA range.
	 */
	bdev_io_tailq_t io_locked;

	/* List of I/Os with accel sequence being currently executed */
	bdev_io_tailq_t io_accel_exec;

	/* List of I/Os doing memory domain pull/push */
	bdev_io_tailq_t io_memory_domain;

	uint32_t flags;

	/* Counts number of bdev_io in the io_submitted TAILQ */
	uint16_t queue_depth;

	uint16_t trace_id;

	struct spdk_histogram_data *histogram;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t start_tsc;
	uint64_t interval_tsc;
	__itt_string_handle *handle;
	struct spdk_bdev_io_stat *prev_stat;
#endif

	lba_range_tailq_t locked_ranges;

	/** List of I/Os queued by QoS. */
	bdev_io_tailq_t qos_queued_io;
};

struct media_event_entry {
	struct spdk_bdev_media_event event;
	TAILQ_ENTRY(media_event_entry) tailq;
};

#define MEDIA_EVENT_POOL_SIZE 64

struct spdk_bdev_desc {
	struct spdk_bdev *bdev;
	bool write;
	bool memory_domains_supported;
	bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES];
	struct spdk_bdev_open_opts opts;
	struct spdk_thread *thread;
	struct {
		spdk_bdev_event_cb_t event_fn;
		void *ctx;
	} callback;
	bool closed;
	struct spdk_spinlock spinlock;
	uint32_t refs;
	TAILQ_HEAD(, media_event_entry) pending_media_events;
	TAILQ_HEAD(, media_event_entry) free_media_events;
	struct media_event_entry *media_events_buffer;
	TAILQ_ENTRY(spdk_bdev_desc) link;

	uint64_t timeout_in_sec;
	spdk_bdev_io_timeout_cb cb_fn;
	void *cb_arg;
	struct spdk_poller *io_timeout_poller;
	struct spdk_bdev_module_claim *claim;
};

struct spdk_bdev_iostat_ctx {
	struct spdk_bdev_io_stat *stat;
	enum spdk_bdev_reset_stat_mode reset_mode;
	spdk_bdev_get_device_stat_cb cb;
	void *cb_arg;
};

struct set_qos_limit_ctx {
	void (*cb_fn)(void *cb_arg, int status);
	void *cb_arg;
	struct spdk_bdev *bdev;
};

struct spdk_bdev_channel_iter {
	spdk_bdev_for_each_channel_msg fn;
	spdk_bdev_for_each_channel_done cpl;
	struct spdk_io_channel_iter *i;
	void *ctx;
};

struct spdk_bdev_io_error_stat {
	uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS];
};

enum bdev_io_retry_state {
	BDEV_IO_RETRY_STATE_INVALID,
	BDEV_IO_RETRY_STATE_PULL,
	BDEV_IO_RETRY_STATE_PULL_MD,
	BDEV_IO_RETRY_STATE_SUBMIT,
	BDEV_IO_RETRY_STATE_PUSH,
	BDEV_IO_RETRY_STATE_PUSH_MD,
	BDEV_IO_RETRY_STATE_GET_ACCEL_BUF,
};

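/* The bdev is mapped to an io_device key one byte past the start of the spdk_bdev structure.
 * This offset appears intended to keep the bdev layer's io_device registration distinct from
 * the bdev pointer itself, which a bdev module may already use for its own io_device.
 */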
#define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))
#define __io_ch_to_bdev_ch(io_ch)	((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch))
#define __io_ch_to_bdev_mgmt_ch(io_ch)	((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch))

static inline void bdev_io_complete(void *ctx);
static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io);
static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io);
static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io);
static void _bdev_io_get_accel_buf(struct spdk_bdev_io *bdev_io);

static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io);

static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				struct spdk_io_channel *ch, void *_ctx);
static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status);

static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				     struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
				     uint64_t num_blocks,
				     struct spdk_memory_domain *domain, void *domain_ctx,
				     struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
				     spdk_bdev_io_completion_cb cb, void *cb_arg);
static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				      struct iovec *iov, int iovcnt, void *md_buf,
				      uint64_t offset_blocks, uint64_t num_blocks,
				      struct spdk_memory_domain *domain, void *domain_ctx,
				      struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
				      uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw,
				      spdk_bdev_io_completion_cb cb, void *cb_arg);

static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
			       uint64_t offset, uint64_t length,
			       lock_range_cb cb_fn, void *cb_arg);

static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
				 uint64_t offset, uint64_t length,
				 lock_range_cb cb_fn, void *cb_arg);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);

static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
static void claim_reset(struct spdk_bdev *bdev);

static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch);

static bool bdev_io_should_split(struct spdk_bdev_io *bdev_io);

#define bdev_get_ext_io_opt(opts, field, defval) \
	((opts) != NULL ? SPDK_GET_FIELD(opts, field, defval) : (defval))

static inline void
bdev_ch_add_to_io_submitted(struct spdk_bdev_io *bdev_io)
{
	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
	bdev_io->internal.ch->queue_depth++;
}

static inline void
bdev_ch_remove_from_io_submitted(struct spdk_bdev_io *bdev_io)
{
	TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
	bdev_io->internal.ch->queue_depth--;
}

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(iobuf_small_cache_size);
	SET_FIELD(iobuf_large_cache_size);

	/* Do not remove this statement; always update it when adding a new field,
	 * and do not forget to add the SET_FIELD statement for the added field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}

int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	uint32_t min_pool_size;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
	 */
	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
	if (opts->bdev_io_pool_size < min_pool_size) {
		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
			    spdk_thread_get_count());
		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
		return -1;
	}

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
		g_bdev_opts.field = opts->field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(iobuf_small_cache_size);
	SET_FIELD(iobuf_large_cache_size);

	g_bdev_opts.opts_size = opts->opts_size;

#undef SET_FIELD

	return 0;
}
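
/*
 * Illustrative usage sketch (assumptions, not part of this file's logic): a typical application
 * reads the current defaults, overrides selected fields, and writes them back before the bdev
 * layer is initialized. Passing opts_size is what keeps this pattern ABI-compatible when new
 * fields are appended to struct spdk_bdev_opts.
 *
 *	struct spdk_bdev_opts opts = {};
 *
 *	spdk_bdev_get_opts(&opts, sizeof(opts));
 *	opts.bdev_io_pool_size = 2 * SPDK_BDEV_IO_POOL_SIZE;	// example value only
 *	if (spdk_bdev_set_opts(&opts) != 0) {
 *		// handle an invalid combination, e.g. a pool smaller than the per-thread caches
 *	}
 */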

static struct spdk_bdev *
bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev_name find;
	struct spdk_bdev_name *res;

	find.name = (char *)bdev_name;
	res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find);
	if (res != NULL) {
		return res->bdev;
	}

	return NULL;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev *bdev;

	spdk_spin_lock(&g_bdev_mgr.spinlock);
	bdev = bdev_get_by_name(bdev_name);
	spdk_spin_unlock(&g_bdev_mgr.spinlock);

	return bdev;
}

struct bdev_io_status_string {
	enum spdk_bdev_io_status status;
	const char *str;
};

static const struct bdev_io_status_string bdev_io_status_strings[] = {
	{ SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" },
	{ SPDK_BDEV_IO_STATUS_ABORTED, "aborted" },
	{ SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" },
	{ SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" },
	{ SPDK_BDEV_IO_STATUS_NOMEM, "nomem" },
	{ SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" },
	{ SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" },
	{ SPDK_BDEV_IO_STATUS_FAILED, "failed" },
	{ SPDK_BDEV_IO_STATUS_PENDING, "pending" },
	{ SPDK_BDEV_IO_STATUS_SUCCESS, "success" },
};

static const char *
bdev_io_status_get_string(enum spdk_bdev_io_status status)
{
	uint32_t i;

	for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) {
		if (bdev_io_status_strings[i].status == status) {
			return bdev_io_status_strings[i].str;
		}
	}

	return "reserved";
}

struct spdk_bdev_wait_for_examine_ctx {
	struct spdk_poller *poller;
	spdk_bdev_wait_for_examine_cb cb_fn;
	void *cb_arg;
};

static bool bdev_module_all_actions_completed(void);

static int
bdev_wait_for_examine_cb(void *arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx = arg;

	if (!bdev_module_all_actions_completed()) {
		return SPDK_POLLER_IDLE;
	}

	spdk_poller_unregister(&ctx->poller);
	ctx->cb_fn(ctx->cb_arg);
	free(ctx);

	return SPDK_POLLER_BUSY;
}

int
spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0);

	return 0;
}
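
/*
 * Illustrative usage sketch (assumptions, not part of this file's logic): callers typically use
 * spdk_bdev_wait_for_examine() to defer work until every module's pending examine action has
 * completed, e.g.
 *
 *	static void
 *	all_examined(void *cb_arg)
 *	{
 *		// no examine callbacks are still pending at this point
 *	}
 *
 *	spdk_bdev_wait_for_examine(all_examined, NULL);
 */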

struct spdk_bdev_examine_item {
	char *name;
	TAILQ_ENTRY(spdk_bdev_examine_item) link;
};

TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);

struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
			g_bdev_examine_allowlist);

static inline bool
bdev_examine_allowlist_check(const char *name)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		if (strcmp(name, item->name) == 0) {
			return true;
		}
	}
	return false;
}

static inline void
bdev_examine_allowlist_remove(const char *name)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		if (strcmp(name, item->name) == 0) {
			TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
			free(item->name);
			free(item);
			break;
		}
	}
}

static inline void
bdev_examine_allowlist_free(void)
{
	struct spdk_bdev_examine_item *item;
	while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
		item = TAILQ_FIRST(&g_bdev_examine_allowlist);
		TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
		free(item->name);
		free(item);
	}
}

static inline bool
bdev_in_examine_allowlist(struct spdk_bdev *bdev)
{
	struct spdk_bdev_alias *tmp;
	if (bdev_examine_allowlist_check(bdev->name)) {
		return true;
	}
	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
		if (bdev_examine_allowlist_check(tmp->alias.name)) {
			return true;
		}
	}
	return false;
}

static inline bool
bdev_ok_to_examine(struct spdk_bdev *bdev)
{
	/* Some bdevs may not support the READ command.
	 * Do not try to examine them.
	 */
	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_READ)) {
		return false;
	}

	if (g_bdev_opts.bdev_auto_examine) {
		return true;
	} else {
		return bdev_in_examine_allowlist(bdev);
	}
}

static void
bdev_examine(struct spdk_bdev *bdev)
{
	struct spdk_bdev_module *module;
	struct spdk_bdev_module_claim *claim, *tmpclaim;
	uint32_t action;

	if (!bdev_ok_to_examine(bdev)) {
		return;
	}

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (module->examine_config) {
			spdk_spin_lock(&module->internal.spinlock);
			action = module->internal.action_in_progress;
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			module->examine_config(bdev);
			if (action != module->internal.action_in_progress) {
				SPDK_ERRLOG("examine_config for module %s did not call "
					    "spdk_bdev_module_examine_done()\n", module->name);
			}
		}
	}

	spdk_spin_lock(&bdev->internal.spinlock);

	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		/* Examine by all bdev modules */
		TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (module->examine_disk) {
				spdk_spin_lock(&module->internal.spinlock);
				module->internal.action_in_progress++;
				spdk_spin_unlock(&module->internal.spinlock);
				spdk_spin_unlock(&bdev->internal.spinlock);
				module->examine_disk(bdev);
				spdk_spin_lock(&bdev->internal.spinlock);
			}
		}
		break;
	case SPDK_BDEV_CLAIM_EXCL_WRITE:
		/* Examine by the one bdev module with a v1 claim */
		module = bdev->internal.claim.v1.module;
		if (module->examine_disk) {
			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			return;
		}
		break;
	default:
		/* Examine by all bdev modules with a v2 claim */
		assert(claim_type_is_v2(bdev->internal.claim_type));
		/*
		 * Removal of tailq nodes while iterating can cause the iteration to jump out of the
		 * list, perhaps accessing freed memory. Without protection, this could happen
		 * while the lock is dropped during the examine callback.
		 */
		bdev->internal.examine_in_progress++;

		TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
			module = claim->module;

			if (module == NULL) {
				/* This is a vestigial claim, held by examine_count */
				continue;
			}

			if (module->examine_disk == NULL) {
				continue;
			}

			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);

			/* Call examine_disk without holding internal.spinlock. */
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			spdk_spin_lock(&bdev->internal.spinlock);
		}

		assert(bdev->internal.examine_in_progress > 0);
		bdev->internal.examine_in_progress--;
		if (bdev->internal.examine_in_progress == 0) {
			/* Remove any claims that were released during examine_disk */
			TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) {
				if (claim->desc != NULL) {
					continue;
				}

				TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link);
				free(claim);
			}
			if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
				claim_reset(bdev);
			}
		}
	}

	spdk_spin_unlock(&bdev->internal.spinlock);
}
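
/*
 * Illustrative sketch (assumptions, not part of this file): per the action_in_progress check in
 * bdev_examine() above, a module's examine_config callback is expected to call
 * spdk_bdev_module_examine_done() before it returns, for example:
 *
 *	static void
 *	my_module_examine_config(struct spdk_bdev *bdev)	// hypothetical module callback
 *	{
 *		// inspect bdev, optionally claim it or kick off examine_disk-style work
 *		spdk_bdev_module_examine_done(&my_module);
 *	}
 */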

int
spdk_bdev_examine(const char *name)
{
	struct spdk_bdev *bdev;
	struct spdk_bdev_examine_item *item;
	struct spdk_thread *thread = spdk_get_thread();

	if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) {
		SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread,
			    thread ? spdk_thread_get_name(thread) : "null");
		return -EINVAL;
	}

	if (g_bdev_opts.bdev_auto_examine) {
		SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n");
		return -EINVAL;
	}

	if (bdev_examine_allowlist_check(name)) {
		SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
		return -EEXIST;
	}

	item = calloc(1, sizeof(*item));
	if (!item) {
		return -ENOMEM;
	}
	item->name = strdup(name);
	if (!item->name) {
		free(item);
		return -ENOMEM;
	}
	TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);

	bdev = spdk_bdev_get_by_name(name);
	if (bdev) {
		bdev_examine(bdev);
	}
	return 0;
}

static inline void
bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "method", "bdev_examine");
		spdk_json_write_named_object_begin(w, "params");
		spdk_json_write_named_string(w, "name", item->name);
		spdk_json_write_object_end(w);
		spdk_json_write_object_end(w);
	}
}

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, internal.link);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, internal.link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static inline bool
bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.f.has_memory_domain;
}

static inline bool
bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.f.has_accel_sequence;
}

static inline uint32_t
bdev_desc_get_block_size(struct spdk_bdev_desc *desc)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);

	if (spdk_unlikely(desc->opts.hide_metadata)) {
		return bdev->blocklen - bdev->md_len;
	} else {
		return bdev->blocklen;
	}
}

static inline uint32_t
bdev_io_get_block_size(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;

	if (bdev_io->u.bdev.dif_check_flags & SPDK_DIF_FLAGS_NVME_PRACT) {
		if (bdev->md_len == spdk_dif_pi_format_get_size(bdev->dif_pi_format)) {
			return bdev->blocklen - bdev->md_len;
		} else {
			return bdev->blocklen;
		}
	}

	return bdev_desc_get_block_size(bdev_io->internal.desc);
}

static inline void
bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource,
			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	/* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io.
	 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth
	 * channels we will instead wait for half to complete.
	 */
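	/* For example (illustrative numbers): with 100 I/O outstanding the threshold below becomes
	 * spdk_max(50, 92) = 92, i.e. wait for NOMEM_THRESHOLD_COUNT (8) completions; with only
	 * 6 outstanding it becomes spdk_max(3, -2) = 3, i.e. wait for about half of them.
	 */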
	shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
					   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);

	assert(state != BDEV_IO_RETRY_STATE_INVALID);
	bdev_io->internal.retry_state = state;
	TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
}

static inline void
bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource,
			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	/* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while
	 * the queue isn't empty, so we don't need to update the nomem_threshold here */
	assert(!TAILQ_EMPTY(&shared_resource->nomem_io));

	assert(state != BDEV_IO_RETRY_STATE_INVALID);
	bdev_io->internal.retry_state = state;
	TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
}

void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	struct iovec *iovs;

	if (bdev_io->u.bdev.iovs == NULL) {
		bdev_io->u.bdev.iovs = &bdev_io->iov;
		bdev_io->u.bdev.iovcnt = 1;
	}

	iovs = bdev_io->u.bdev.iovs;

	assert(iovs != NULL);
	assert(bdev_io->u.bdev.iovcnt >= 1);

	iovs[0].iov_base = buf;
	iovs[0].iov_len = len;
}

void
spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
	bdev_io->u.bdev.md_buf = md_buf;
}

static bool
_is_buf_allocated(const struct iovec *iovs)
{
	if (iovs == NULL) {
		return false;
	}

	return iovs[0].iov_base != NULL;
}

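/* Check that every iovec base address satisfies the requested alignment. The alignment is
 * expected to be a power of two (e.g. the value returned by spdk_bdev_get_buf_align()), which is
 * what makes the mask test below valid; an alignment of 1 accepts any buffer.
 */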
static bool
_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
{
	int i;
	uintptr_t iov_base;

	if (spdk_likely(alignment == 1)) {
		return true;
	}

	for (i = 0; i < iovcnt; i++) {
		iov_base = (uintptr_t)iovs[i].iov_base;
		if ((iov_base & (alignment - 1)) != 0) {
			return false;
		}
	}

	return true;
}

static inline bool
bdev_io_needs_metadata(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
{
	return (bdev_io->bdev->md_len != 0) &&
	       (desc->opts.hide_metadata ||
		(bdev_io->u.bdev.dif_check_flags & SPDK_DIF_FLAGS_NVME_PRACT));
}

static inline bool
bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
{
	if (!bdev_io_use_accel_sequence(bdev_io)) {
		return false;
	}

	/* For now, we don't allow splitting IOs with an accel sequence and will treat them as if
	 * bdev module didn't support accel sequences */
	return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.f.split;
}

static inline void
bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	bdev_ch->io_outstanding++;
	shared_resource->io_outstanding++;
}

static inline void
bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	assert(bdev_ch->io_outstanding > 0);
	assert(shared_resource->io_outstanding > 0);
	bdev_ch->io_outstanding--;
	shared_resource->io_outstanding--;
}

static void
bdev_io_submit_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;

	assert(bdev_io_use_accel_sequence(bdev_io));

	bdev_io->u.bdev.accel_sequence = NULL;
	bdev_io->internal.f.has_accel_sequence = false;

	if (spdk_unlikely(status != 0)) {
		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io_complete_unsubmitted(bdev_io);
		return;
	}

	bdev_io_submit(bdev_io);
}

static void
bdev_io_exec_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io->internal.data_transfer_cpl(bdev_io, status);
}

static void
bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status))
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
	assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
	assert(bdev_io_use_accel_sequence(bdev_io));

	/* Since the operations are appended during submission, they're in the opposite order from
	 * how we want to execute them for reads (i.e. we need to execute the most recently added
	 * operation first), so reverse the sequence before executing it.
	 */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence);
	}

	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	bdev_io_increment_outstanding(ch, ch->shared_resource);
	bdev_io->internal.data_transfer_cpl = cb_fn;

	spdk_accel_sequence_finish(bdev_io->internal.accel_sequence,
				   bdev_io_exec_sequence_cb, bdev_io);
}

static void
bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status)
{
	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
	void *buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		buf = bdev_io->internal.buf.ptr;
		bdev_io->internal.buf.ptr = NULL;
		bdev_io->internal.f.has_buf = false;
		bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
		bdev_io->internal.get_aux_buf_cb = NULL;
	} else {
		assert(bdev_io->internal.get_buf_cb != NULL);
		bdev_io->internal.get_buf_cb(ch, bdev_io, status);
		bdev_io->internal.get_buf_cb = NULL;
	}
}

static void
_bdev_io_pull_buffer_cpl(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;

	if (rc) {
		SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	bdev_io_get_buf_complete(bdev_io, !rc);
}

static void
bdev_io_pull_md_buf_done(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	assert(bdev_io->internal.data_transfer_cpl);
	bdev_io->internal.data_transfer_cpl(bdev_io, status);
}

static void
bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		assert(bdev_io->internal.f.has_bounce_buf);
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  &bdev_io->internal.bounce_buf.orig_md_iov, 1,
							  &bdev_io->internal.bounce_buf.md_iov, 1,
							  bdev_io_pull_md_buf_done, bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain), rc);
			}
		} else {
			memcpy(bdev_io->internal.bounce_buf.md_iov.iov_base,
			       bdev_io->internal.bounce_buf.orig_md_iov.iov_base,
			       bdev_io->internal.bounce_buf.orig_md_iov.iov_len);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD);
	} else {
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
	}
}

static void
_bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	assert(bdev_io->internal.f.has_bounce_buf);

	/* save original md_buf */
	bdev_io->internal.bounce_buf.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf;
	bdev_io->internal.bounce_buf.orig_md_iov.iov_len = len;
	bdev_io->internal.bounce_buf.md_iov.iov_base = md_buf;
	bdev_io->internal.bounce_buf.md_iov.iov_len = len;
	/* set bounce md_buf */
	bdev_io->u.bdev.md_buf = md_buf;

	bdev_io_pull_md_buf(bdev_io);
}

static void
_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len;
	void *buf;

	if (spdk_bdev_is_md_separate(bdev)) {
		assert(!bdev_io_use_accel_sequence(bdev_io));

		buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len;
		md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;

		assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0);

		if (bdev_io->u.bdev.md_buf != NULL) {
			_bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len);
			return;
		} else {
			spdk_bdev_io_set_md_buf(bdev_io, buf, md_len);
		}
	}

	bdev_io_get_buf_complete(bdev_io, true);
}

static inline void
bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc)
{
	if (rc) {
		SPDK_ERRLOG("Failed to get data buffer\n");
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
		return;
	}

	_bdev_io_set_md_buf(bdev_io);
}

static void
bdev_io_pull_data_done_and_track(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io_pull_data_done(bdev_io, status);
}

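/*
 * Prepare the data for submission when a bounce buffer is in use. Depending on the I/O, this
 * either appends a DIF generate/verify-copy or a plain copy to the request's accel sequence, or
 * (on the plain write path) copies the data directly through the memory domain or with memcpy.
 * A -ENOMEM result requeues the I/O in BDEV_IO_RETRY_STATE_PULL so it is retried later.
 */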
static void
bdev_io_pull_data(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	struct spdk_bdev_desc *desc = bdev_io->internal.desc;
	int rc = 0;

	assert(bdev_io->internal.f.has_bounce_buf);

	if (bdev_io_needs_metadata(desc, bdev_io)) {
		assert(bdev_io->bdev->md_interleave);

		bdev_io->u.bdev.dif_check_flags &= ~SPDK_DIF_FLAGS_NVME_PRACT;

		if (!bdev_io_use_accel_sequence(bdev_io)) {
			bdev_io->internal.accel_sequence = NULL;
		}

		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
			rc = spdk_accel_append_dif_generate_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
					bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
					bdev_io->u.bdev.memory_domain,
					bdev_io->u.bdev.memory_domain_ctx,
					bdev_io->internal.bounce_buf.orig_iovs,
					bdev_io->internal.bounce_buf.orig_iovcnt,
					bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
					bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
					bdev_io->u.bdev.num_blocks,
					&bdev_io->u.bdev.dif_ctx,
					NULL, NULL);
		} else {
			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
			rc = spdk_accel_append_dif_verify_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
					bdev_io->internal.bounce_buf.orig_iovs,
					bdev_io->internal.bounce_buf.orig_iovcnt,
					bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
					bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
					bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
					bdev_io->u.bdev.memory_domain,
					bdev_io->u.bdev.memory_domain_ctx,
					bdev_io->u.bdev.num_blocks,
					&bdev_io->u.bdev.dif_ctx,
					&bdev_io->u.bdev.dif_err,
					NULL, NULL);
		}

		if (spdk_likely(rc == 0)) {
			bdev_io->internal.f.has_accel_sequence = true;
			bdev_io->u.bdev.accel_sequence = bdev_io->internal.accel_sequence;
		} else if (rc != -ENOMEM) {
			SPDK_ERRLOG("Failed to append generate/verify_copy to accel sequence: %p\n",
				    bdev_io->internal.accel_sequence);
		}
	} else if (bdev_io_needs_sequence_exec(desc, bdev_io) ||
		   (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) {
		/* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a
		 * sequence, append a copy operation making accel change the src/dst buffers of the previous
		 * operation */
		assert(bdev_io_use_accel_sequence(bdev_io));
		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL,
						    bdev_io->internal.bounce_buf.orig_iovs,
						    bdev_io->internal.bounce_buf.orig_iovcnt,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
						    NULL, NULL);
		} else {
			/* We need to reverse the src/dst for reads */
			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->internal.bounce_buf.orig_iovs,
						    bdev_io->internal.bounce_buf.orig_iovcnt,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL, NULL, NULL);
		}

		if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
			SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n",
				    bdev_io->internal.accel_sequence);
		}
	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		/* if this is write path, copy data from original buffer to bounce buffer */
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  bdev_io->internal.bounce_buf.orig_iovs,
							  (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt,
							  bdev_io->u.bdev.iovs, 1,
							  bdev_io_pull_data_done_and_track,
							  bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to pull data from memory domain %s\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain));
			}
		} else {
			assert(bdev_io->u.bdev.iovcnt == 1);
			spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base,
					      bdev_io->u.bdev.iovs[0].iov_len,
					      bdev_io->internal.bounce_buf.orig_iovs,
					      bdev_io->internal.bounce_buf.orig_iovcnt);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
	} else {
		bdev_io_pull_data_done(bdev_io, rc);
	}
}

static void
_bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len,
			      bdev_copy_bounce_buffer_cpl cpl_cb)
{
	struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource;

	assert(bdev_io->internal.f.has_bounce_buf == false);

	bdev_io->internal.data_transfer_cpl = cpl_cb;
	bdev_io->internal.f.has_bounce_buf = true;
	/* save original iovec */
	bdev_io->internal.bounce_buf.orig_iovs = bdev_io->u.bdev.iovs;
	bdev_io->internal.bounce_buf.orig_iovcnt = bdev_io->u.bdev.iovcnt;
	/* zero the other data members */
	bdev_io->internal.bounce_buf.iov.iov_base = NULL;
	bdev_io->internal.bounce_buf.md_iov.iov_base = NULL;
	bdev_io->internal.bounce_buf.orig_md_iov.iov_base = NULL;
	/* set bounce iov */
	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_buf.iov;
	bdev_io->u.bdev.iovcnt = 1;
	/* set bounce buffer for this operation */
	bdev_io->u.bdev.iovs[0].iov_base = buf;
	bdev_io->u.bdev.iovs[0].iov_len = len;
	/* Now we use 1 iov, the split condition could have been changed */
	bdev_io->internal.f.split = bdev_io_should_split(bdev_io);

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
	} else {
		bdev_io_pull_data(bdev_io);
	}
}

static void
_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	bool buf_allocated;
	uint64_t alignment;
	void *aligned_buf;

	bdev_io->internal.buf.ptr = buf;
	bdev_io->internal.f.has_buf = true;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		bdev_io_get_buf_complete(bdev_io, true);
		return;
	}

	alignment = spdk_bdev_get_buf_align(bdev);
	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
	aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));

	if (buf_allocated) {
		_bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl);
		/* Continue in completion callback */
		return;
	} else {
		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
	}

	_bdev_io_set_md_buf(bdev_io);
}

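/* Upper bound on the buffer size needed for this I/O: the payload plus worst-case padding for
 * alignment plus any separate metadata. For example (illustrative numbers), a 4096-byte payload
 * on a bdev with 512-byte buffer alignment and 8 blocks of 8-byte separate metadata needs at
 * most 4096 + 511 + 64 = 4671 bytes.
 */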
static inline uint64_t
bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len, alignment;

	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;

	/* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */
	alignment = spdk_bdev_get_buf_align(bdev) - 1;

	return len + alignment + md_len;
}

static void
bdev_io_put_accel_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	spdk_accel_put_buf(ch->accel_channel,
			   bdev_io->internal.buf.ptr,
			   bdev_io->u.bdev.memory_domain,
			   bdev_io->u.bdev.memory_domain_ctx);
}

static void
_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
{
	struct spdk_bdev_mgmt_channel *ch;

	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
	spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len));
}

static void
bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	assert(bdev_io->internal.f.has_buf);

	if (bdev_io->u.bdev.memory_domain == spdk_accel_get_memory_domain()) {
		bdev_io_put_accel_buf(bdev_io);
	} else {
		assert(bdev_io->u.bdev.memory_domain == NULL);
		_bdev_io_put_buf(bdev_io, bdev_io->internal.buf.ptr,
				 bdev_io->internal.buf.len);
	}
	bdev_io->internal.buf.ptr = NULL;
	bdev_io->internal.f.has_buf = false;
}

SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_put_aux_buf,
			      "spdk_bdev_io_put_aux_buf is deprecated", "v25.01", 0);

void
spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	SPDK_LOG_DEPRECATED(spdk_bdev_io_put_aux_buf);

	assert(buf != NULL);
	_bdev_io_put_buf(bdev_io, buf, len);
}

static inline void
bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch,
		    struct spdk_bdev_io *bdev_io)
{
	/* After a request is submitted to a bdev module, the ownership of an accel sequence
	 * associated with that bdev_io is transferred to the bdev module. So, clear the internal
	 * sequence pointer to make sure we won't touch it anymore. */
	if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE ||
	     bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) {
		assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
		bdev_io->internal.f.has_accel_sequence = false;
	}

	/* The generic bdev layer should not pass an I/O with dif_check_flags bits set that
	 * the underlying bdev does not support. Add an assert to check this.
	 */
	assert((bdev_io->type != SPDK_BDEV_IO_TYPE_WRITE &&
		bdev_io->type != SPDK_BDEV_IO_TYPE_READ) ||
	       ((bdev_io->u.bdev.dif_check_flags & bdev->dif_check_flags) ==
		bdev_io->u.bdev.dif_check_flags));

	bdev->fn_table->submit_request(ioch, bdev_io);
}

static inline void
bdev_ch_resubmit_io(struct spdk_bdev_shared_resource *shared_resource, struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;

	bdev_io_increment_outstanding(bdev_io->internal.ch, shared_resource);
	bdev_io->internal.error.nvme.cdw0 = 0;
	bdev_io->num_retries++;
	bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
}

static void
bdev_shared_ch_retry_io(struct spdk_bdev_shared_resource *shared_resource)
{
	struct spdk_bdev_io *bdev_io;

	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 * Some drivers (such as nvme) cannot immediately take a new I/O in
		 * the context of a completion, because the resources for the I/O are
		 * not released until control returns to the bdev poller. Also, we
		 * may require several small I/O to complete before a larger I/O
		 * (that requires splitting) can be submitted.
		 */
		return;
	}

	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);

		switch (bdev_io->internal.retry_state) {
		case BDEV_IO_RETRY_STATE_SUBMIT:
			bdev_ch_resubmit_io(shared_resource, bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL:
			bdev_io_pull_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL_MD:
			bdev_io_pull_md_buf(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH:
			bdev_io_push_bounce_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH_MD:
			bdev_io_push_bounce_md_buf(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_GET_ACCEL_BUF:
			_bdev_io_get_accel_buf(bdev_io);
			break;
		default:
			assert(0 && "invalid retry state");
			break;
		}

		if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) {
			/* This IO completed again with NOMEM status, so break the loop and
			 * don't try anymore. Note that a bdev_io that fails with NOMEM
			 * always gets requeued at the front of the list, to maintain
			 * ordering.
			 */
			break;
		}
	}
}

static void
bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	bdev_shared_ch_retry_io(bdev_ch->shared_resource);
}

static int
bdev_no_mem_poller(void *ctx)
{
	struct spdk_bdev_shared_resource *shared_resource = ctx;

	spdk_poller_unregister(&shared_resource->nomem_poller);

	if (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_shared_ch_retry_io(shared_resource);
	}
	/* the retry cb may re-register the poller so double check */
	if (!TAILQ_EMPTY(&shared_resource->nomem_io) &&
	    shared_resource->io_outstanding == 0 && shared_resource->nomem_poller == NULL) {
		/* No IOs were submitted, try again */
		shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
						SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
	}

	return SPDK_POLLER_BUSY;
}

static inline bool
_bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
		bdev_queue_nomem_io_head(shared_resource, bdev_io, state);

		if (shared_resource->io_outstanding == 0 && !shared_resource->nomem_poller) {
			/* Special case when we have nomem IOs and no outstanding IOs whose completions
			 * could trigger a retry of the queued IOs. Any newly submitted IOs may trigger
			 * such a retry, but this poller handles the case where no new IOs are
			 * submitted, e.g. qd==1. */
			shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
							SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
		}
		/* If bdev module completed an I/O that has an accel sequence with NOMEM status, the
		 * ownership of that sequence is transferred back to the bdev layer, so we need to
		 * restore internal.accel_sequence to make sure that the sequence is handled
		 * correctly in case the I/O is later aborted. */
		if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
		     bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) {
			assert(!bdev_io_use_accel_sequence(bdev_io));
			bdev_io->internal.f.has_accel_sequence = true;
			bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence;
		}

		return true;
	}

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_ch_retry_io(bdev_ch);
	}

	return false;
}

static void
_bdev_io_complete_push_bounce_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	if (rc) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	/* We want to free the bounce buffer here since we know we're done with it (as opposed
	 * to waiting for the conditional free of internal.buf.ptr in spdk_bdev_free_io()).
1754 */ 1755 bdev_io_put_buf(bdev_io); 1756 1757 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1758 bdev_ch_retry_io(ch); 1759 } 1760 1761 /* Continue with IO completion flow */ 1762 bdev_io_complete(bdev_io); 1763 } 1764 1765 static void 1766 bdev_io_push_bounce_md_buf_done(void *ctx, int rc) 1767 { 1768 struct spdk_bdev_io *bdev_io = ctx; 1769 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1770 1771 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1772 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1773 bdev_io->internal.f.has_bounce_buf = false; 1774 1775 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1776 bdev_ch_retry_io(ch); 1777 } 1778 1779 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1780 } 1781 1782 static inline void 1783 bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io) 1784 { 1785 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1786 int rc = 0; 1787 1788 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1789 assert(bdev_io->internal.f.has_bounce_buf); 1790 1791 /* do the same for metadata buffer */ 1792 if (spdk_unlikely(bdev_io->internal.bounce_buf.orig_md_iov.iov_base != NULL)) { 1793 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1794 1795 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1796 if (bdev_io_use_memory_domain(bdev_io)) { 1797 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1798 bdev_io_increment_outstanding(ch, ch->shared_resource); 1799 /* If memory domain is used then we need to call async push function */ 1800 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1801 bdev_io->internal.memory_domain_ctx, 1802 &bdev_io->internal.bounce_buf.orig_md_iov, 1803 (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt, 1804 &bdev_io->internal.bounce_buf.md_iov, 1, 1805 bdev_io_push_bounce_md_buf_done, 1806 bdev_io); 1807 if (rc == 0) { 1808 /* Continue IO completion in async callback */ 1809 return; 1810 } 1811 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1812 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1813 if (rc != -ENOMEM) { 1814 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1815 spdk_memory_domain_get_dma_device_id( 1816 bdev_io->internal.memory_domain)); 1817 } 1818 } else { 1819 memcpy(bdev_io->internal.bounce_buf.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1820 bdev_io->internal.bounce_buf.orig_md_iov.iov_len); 1821 } 1822 } 1823 } 1824 1825 if (spdk_unlikely(rc == -ENOMEM)) { 1826 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD); 1827 } else { 1828 assert(bdev_io->internal.data_transfer_cpl); 1829 bdev_io->internal.f.has_bounce_buf = false; 1830 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1831 } 1832 } 1833 1834 static inline void 1835 bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc) 1836 { 1837 assert(bdev_io->internal.data_transfer_cpl); 1838 if (rc) { 1839 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1840 return; 1841 } 1842 1843 /* set original buffer for this io */ 1844 bdev_io->u.bdev.iovcnt = bdev_io->internal.bounce_buf.orig_iovcnt; 1845 bdev_io->u.bdev.iovs = bdev_io->internal.bounce_buf.orig_iovs; 1846 1847 /* We don't set bdev_io->internal.f.has_bounce_buf to false here because 1848 * we still need to clear the md buf */ 1849 1850 bdev_io_push_bounce_md_buf(bdev_io); 1851 } 1852 1853 static void 1854 bdev_io_push_bounce_data_done_and_track(void *ctx, int status) 1855 { 1856 struct spdk_bdev_io *bdev_io = ctx; 1857 struct 
spdk_bdev_channel *ch = bdev_io->internal.ch; 1858 1859 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1860 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1861 1862 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1863 bdev_ch_retry_io(ch); 1864 } 1865 1866 bdev_io_push_bounce_data_done(bdev_io, status); 1867 } 1868 1869 static inline void 1870 bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io) 1871 { 1872 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1873 int rc = 0; 1874 1875 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1876 assert(!bdev_io_use_accel_sequence(bdev_io)); 1877 assert(bdev_io->internal.f.has_bounce_buf); 1878 1879 /* if this is read path, copy data from bounce buffer to original buffer */ 1880 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1881 if (bdev_io_use_memory_domain(bdev_io)) { 1882 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1883 bdev_io_increment_outstanding(ch, ch->shared_resource); 1884 /* If memory domain is used then we need to call async push function */ 1885 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1886 bdev_io->internal.memory_domain_ctx, 1887 bdev_io->internal.bounce_buf.orig_iovs, 1888 (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt, 1889 &bdev_io->internal.bounce_buf.iov, 1, 1890 bdev_io_push_bounce_data_done_and_track, 1891 bdev_io); 1892 if (rc == 0) { 1893 /* Continue IO completion in async callback */ 1894 return; 1895 } 1896 1897 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1898 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1899 if (rc != -ENOMEM) { 1900 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1901 spdk_memory_domain_get_dma_device_id( 1902 bdev_io->internal.memory_domain)); 1903 } 1904 } else { 1905 spdk_copy_buf_to_iovs(bdev_io->internal.bounce_buf.orig_iovs, 1906 bdev_io->internal.bounce_buf.orig_iovcnt, 1907 bdev_io->internal.bounce_buf.iov.iov_base, 1908 bdev_io->internal.bounce_buf.iov.iov_len); 1909 } 1910 } 1911 1912 if (spdk_unlikely(rc == -ENOMEM)) { 1913 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH); 1914 } else { 1915 bdev_io_push_bounce_data_done(bdev_io, rc); 1916 } 1917 } 1918 1919 static inline void 1920 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1921 { 1922 bdev_io->internal.data_transfer_cpl = cpl_cb; 1923 bdev_io_push_bounce_data(bdev_io); 1924 } 1925 1926 static void 1927 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1928 { 1929 struct spdk_bdev_io *bdev_io; 1930 1931 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1932 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf.len); 1933 } 1934 1935 static void 1936 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1937 { 1938 struct spdk_bdev_mgmt_channel *mgmt_ch; 1939 uint64_t max_len; 1940 void *buf; 1941 1942 assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1943 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1944 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1945 1946 if (spdk_unlikely(max_len > mgmt_ch->iobuf.cache[0].large.bufsize)) { 1947 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1948 bdev_io_get_buf_complete(bdev_io, false); 1949 return; 1950 } 1951 1952 bdev_io->internal.buf.len = len; 1953 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1954 bdev_io_get_iobuf_cb); 1955 if (buf != NULL) { 1956 
_bdev_io_set_buf(bdev_io, buf, len); 1957 } 1958 } 1959 1960 void 1961 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1962 { 1963 struct spdk_bdev *bdev = bdev_io->bdev; 1964 uint64_t alignment; 1965 1966 assert(cb != NULL); 1967 bdev_io->internal.get_buf_cb = cb; 1968 1969 alignment = spdk_bdev_get_buf_align(bdev); 1970 1971 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1972 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1973 /* Buffer already present and aligned */ 1974 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1975 return; 1976 } 1977 1978 bdev_io_get_buf(bdev_io, len); 1979 } 1980 1981 static void 1982 _bdev_io_get_bounce_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1983 uint64_t len) 1984 { 1985 assert(cb != NULL); 1986 bdev_io->internal.get_buf_cb = cb; 1987 1988 bdev_io_get_buf(bdev_io, len); 1989 } 1990 1991 static void 1992 _bdev_io_get_accel_buf(struct spdk_bdev_io *bdev_io) 1993 { 1994 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1995 void *buf; 1996 int rc; 1997 1998 rc = spdk_accel_get_buf(ch->accel_channel, 1999 bdev_io->internal.buf.len, 2000 &buf, 2001 &bdev_io->u.bdev.memory_domain, 2002 &bdev_io->u.bdev.memory_domain_ctx); 2003 if (rc != 0) { 2004 bdev_queue_nomem_io_tail(ch->shared_resource, bdev_io, 2005 BDEV_IO_RETRY_STATE_GET_ACCEL_BUF); 2006 return; 2007 } 2008 2009 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf.len); 2010 } 2011 2012 static inline void 2013 bdev_io_get_accel_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 2014 uint64_t len) 2015 { 2016 bdev_io->internal.buf.len = len; 2017 bdev_io->internal.get_buf_cb = cb; 2018 2019 _bdev_io_get_accel_buf(bdev_io); 2020 } 2021 2022 SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_get_aux_buf, 2023 "spdk_bdev_io_get_aux_buf is deprecated", "v25.01", 0); 2024 2025 void 2026 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 2027 { 2028 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 2029 2030 SPDK_LOG_DEPRECATED(spdk_bdev_io_get_aux_buf); 2031 2032 assert(cb != NULL); 2033 assert(bdev_io->internal.get_aux_buf_cb == NULL); 2034 bdev_io->internal.get_aux_buf_cb = cb; 2035 bdev_io_get_buf(bdev_io, len); 2036 } 2037 2038 static int 2039 bdev_module_get_max_ctx_size(void) 2040 { 2041 struct spdk_bdev_module *bdev_module; 2042 int max_bdev_module_size = 0; 2043 2044 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2045 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 2046 max_bdev_module_size = bdev_module->get_ctx_size(); 2047 } 2048 } 2049 2050 return max_bdev_module_size; 2051 } 2052 2053 static void 2054 bdev_enable_histogram_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 2055 { 2056 if (!bdev->internal.histogram_enabled) { 2057 return; 2058 } 2059 2060 spdk_json_write_object_begin(w); 2061 spdk_json_write_named_string(w, "method", "bdev_enable_histogram"); 2062 2063 spdk_json_write_named_object_begin(w, "params"); 2064 spdk_json_write_named_string(w, "name", bdev->name); 2065 2066 spdk_json_write_named_bool(w, "enable", bdev->internal.histogram_enabled); 2067 2068 if (bdev->internal.histogram_io_type) { 2069 spdk_json_write_named_string(w, "opc", 2070 spdk_bdev_get_io_type_name(bdev->internal.histogram_io_type)); 2071 } 2072 2073 spdk_json_write_object_end(w); 2074 2075 spdk_json_write_object_end(w); 2076 } 2077 2078 static void 2079 
bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 2080 { 2081 int i; 2082 struct spdk_bdev_qos *qos = bdev->internal.qos; 2083 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 2084 2085 if (!qos) { 2086 return; 2087 } 2088 2089 spdk_bdev_get_qos_rate_limits(bdev, limits); 2090 2091 spdk_json_write_object_begin(w); 2092 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 2093 2094 spdk_json_write_named_object_begin(w, "params"); 2095 spdk_json_write_named_string(w, "name", bdev->name); 2096 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2097 if (limits[i] > 0) { 2098 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 2099 } 2100 } 2101 spdk_json_write_object_end(w); 2102 2103 spdk_json_write_object_end(w); 2104 } 2105 2106 void 2107 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 2108 { 2109 struct spdk_bdev_module *bdev_module; 2110 struct spdk_bdev *bdev; 2111 2112 assert(w != NULL); 2113 2114 spdk_json_write_array_begin(w); 2115 2116 spdk_json_write_object_begin(w); 2117 spdk_json_write_named_string(w, "method", "bdev_set_options"); 2118 spdk_json_write_named_object_begin(w, "params"); 2119 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 2120 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 2121 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 2122 spdk_json_write_named_uint32(w, "iobuf_small_cache_size", g_bdev_opts.iobuf_small_cache_size); 2123 spdk_json_write_named_uint32(w, "iobuf_large_cache_size", g_bdev_opts.iobuf_large_cache_size); 2124 spdk_json_write_object_end(w); 2125 spdk_json_write_object_end(w); 2126 2127 bdev_examine_allowlist_config_json(w); 2128 2129 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2130 if (bdev_module->config_json) { 2131 bdev_module->config_json(w); 2132 } 2133 } 2134 2135 spdk_spin_lock(&g_bdev_mgr.spinlock); 2136 2137 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 2138 if (bdev->fn_table->write_config_json) { 2139 bdev->fn_table->write_config_json(bdev, w); 2140 } 2141 2142 bdev_qos_config_json(bdev, w); 2143 bdev_enable_histogram_config_json(bdev, w); 2144 } 2145 2146 spdk_spin_unlock(&g_bdev_mgr.spinlock); 2147 2148 /* This has to be last RPC in array to make sure all bdevs finished examine */ 2149 spdk_json_write_object_begin(w); 2150 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 2151 spdk_json_write_object_end(w); 2152 2153 spdk_json_write_array_end(w); 2154 } 2155 2156 static void 2157 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 2158 { 2159 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 2160 struct spdk_bdev_io *bdev_io; 2161 2162 spdk_iobuf_channel_fini(&ch->iobuf); 2163 2164 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 2165 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2166 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2167 ch->per_thread_cache_count--; 2168 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2169 } 2170 2171 assert(ch->per_thread_cache_count == 0); 2172 } 2173 2174 static int 2175 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 2176 { 2177 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 2178 struct spdk_bdev_io *bdev_io; 2179 uint32_t i; 2180 int rc; 2181 2182 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", 2183 g_bdev_opts.iobuf_small_cache_size, 2184 g_bdev_opts.iobuf_large_cache_size); 2185 if (rc != 0) { 2186 SPDK_ERRLOG("Failed to create 
iobuf channel: %s\n", spdk_strerror(-rc)); 2187 return -1; 2188 } 2189 2190 STAILQ_INIT(&ch->per_thread_cache); 2191 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 2192 2193 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 2194 ch->per_thread_cache_count = 0; 2195 for (i = 0; i < ch->bdev_io_cache_size; i++) { 2196 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2197 if (bdev_io == NULL) { 2198 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 2199 assert(false); 2200 bdev_mgmt_channel_destroy(io_device, ctx_buf); 2201 return -1; 2202 } 2203 ch->per_thread_cache_count++; 2204 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2205 } 2206 2207 TAILQ_INIT(&ch->shared_resources); 2208 TAILQ_INIT(&ch->io_wait_queue); 2209 2210 return 0; 2211 } 2212 2213 static void 2214 bdev_init_complete(int rc) 2215 { 2216 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 2217 void *cb_arg = g_init_cb_arg; 2218 struct spdk_bdev_module *m; 2219 2220 g_bdev_mgr.init_complete = true; 2221 g_init_cb_fn = NULL; 2222 g_init_cb_arg = NULL; 2223 2224 /* 2225 * For modules that need to know when subsystem init is complete, 2226 * inform them now. 2227 */ 2228 if (rc == 0) { 2229 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2230 if (m->init_complete) { 2231 m->init_complete(); 2232 } 2233 } 2234 } 2235 2236 cb_fn(cb_arg, rc); 2237 } 2238 2239 static bool 2240 bdev_module_all_actions_completed(void) 2241 { 2242 struct spdk_bdev_module *m; 2243 2244 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2245 if (m->internal.action_in_progress > 0) { 2246 return false; 2247 } 2248 } 2249 return true; 2250 } 2251 2252 static void 2253 bdev_module_action_complete(void) 2254 { 2255 /* 2256 * Don't finish bdev subsystem initialization if 2257 * module pre-initialization is still in progress, or 2258 * the subsystem been already initialized. 2259 */ 2260 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 2261 return; 2262 } 2263 2264 /* 2265 * Check all bdev modules for inits/examinations in progress. If any 2266 * exist, return immediately since we cannot finish bdev subsystem 2267 * initialization until all are completed. 2268 */ 2269 if (!bdev_module_all_actions_completed()) { 2270 return; 2271 } 2272 2273 /* 2274 * Modules already finished initialization - now that all 2275 * the bdev modules have finished their asynchronous I/O 2276 * processing, the entire bdev layer can be marked as complete. 
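* bdev_init_complete() below then invokes each module's init_complete() callback and finally the callback that was passed to spdk_bdev_initialize().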
2277 */ 2278 bdev_init_complete(0); 2279 } 2280 2281 static void 2282 bdev_module_action_done(struct spdk_bdev_module *module) 2283 { 2284 spdk_spin_lock(&module->internal.spinlock); 2285 assert(module->internal.action_in_progress > 0); 2286 module->internal.action_in_progress--; 2287 spdk_spin_unlock(&module->internal.spinlock); 2288 bdev_module_action_complete(); 2289 } 2290 2291 void 2292 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 2293 { 2294 assert(module->async_init); 2295 bdev_module_action_done(module); 2296 } 2297 2298 void 2299 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 2300 { 2301 bdev_module_action_done(module); 2302 } 2303 2304 /** The last initialized bdev module */ 2305 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 2306 2307 static void 2308 bdev_init_failed(void *cb_arg) 2309 { 2310 struct spdk_bdev_module *module = cb_arg; 2311 2312 spdk_spin_lock(&module->internal.spinlock); 2313 assert(module->internal.action_in_progress > 0); 2314 module->internal.action_in_progress--; 2315 spdk_spin_unlock(&module->internal.spinlock); 2316 bdev_init_complete(-1); 2317 } 2318 2319 static int 2320 bdev_modules_init(void) 2321 { 2322 struct spdk_bdev_module *module; 2323 int rc = 0; 2324 2325 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2326 g_resume_bdev_module = module; 2327 if (module->async_init) { 2328 spdk_spin_lock(&module->internal.spinlock); 2329 module->internal.action_in_progress = 1; 2330 spdk_spin_unlock(&module->internal.spinlock); 2331 } 2332 rc = module->module_init(); 2333 if (rc != 0) { 2334 /* Bump action_in_progress to prevent other modules from completion of modules_init 2335 * Send message to defer application shutdown until resources are cleaned up */ 2336 spdk_spin_lock(&module->internal.spinlock); 2337 module->internal.action_in_progress = 1; 2338 spdk_spin_unlock(&module->internal.spinlock); 2339 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 2340 return rc; 2341 } 2342 } 2343 2344 g_resume_bdev_module = NULL; 2345 return 0; 2346 } 2347 2348 void 2349 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 2350 { 2351 int rc = 0; 2352 char mempool_name[32]; 2353 2354 assert(cb_fn != NULL); 2355 2356 g_init_cb_fn = cb_fn; 2357 g_init_cb_arg = cb_arg; 2358 2359 spdk_notify_type_register("bdev_register"); 2360 spdk_notify_type_register("bdev_unregister"); 2361 2362 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2363 2364 rc = spdk_iobuf_register_module("bdev"); 2365 if (rc != 0) { 2366 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2367 bdev_init_complete(-1); 2368 return; 2369 } 2370 2371 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2372 g_bdev_opts.bdev_io_pool_size, 2373 sizeof(struct spdk_bdev_io) + 2374 bdev_module_get_max_ctx_size(), 2375 0, 2376 SPDK_ENV_NUMA_ID_ANY); 2377 2378 if (g_bdev_mgr.bdev_io_pool == NULL) { 2379 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2380 bdev_init_complete(-1); 2381 return; 2382 } 2383 2384 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2385 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2386 if (!g_bdev_mgr.zero_buffer) { 2387 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2388 bdev_init_complete(-1); 2389 return; 2390 } 2391 2392 #ifdef SPDK_CONFIG_VTUNE 2393 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2394 #endif 2395 2396 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 2397 
bdev_mgmt_channel_destroy, 2398 sizeof(struct spdk_bdev_mgmt_channel), 2399 "bdev_mgr"); 2400 2401 rc = bdev_modules_init(); 2402 g_bdev_mgr.module_init_complete = true; 2403 if (rc != 0) { 2404 SPDK_ERRLOG("bdev modules init failed\n"); 2405 return; 2406 } 2407 2408 bdev_module_action_complete(); 2409 } 2410 2411 static void 2412 bdev_mgr_unregister_cb(void *io_device) 2413 { 2414 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2415 2416 if (g_bdev_mgr.bdev_io_pool) { 2417 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2418 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2419 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2420 g_bdev_opts.bdev_io_pool_size); 2421 } 2422 2423 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2424 } 2425 2426 spdk_free(g_bdev_mgr.zero_buffer); 2427 2428 bdev_examine_allowlist_free(); 2429 2430 cb_fn(g_fini_cb_arg); 2431 g_fini_cb_fn = NULL; 2432 g_fini_cb_arg = NULL; 2433 g_bdev_mgr.init_complete = false; 2434 g_bdev_mgr.module_init_complete = false; 2435 } 2436 2437 static void 2438 bdev_module_fini_iter(void *arg) 2439 { 2440 struct spdk_bdev_module *bdev_module; 2441 2442 /* FIXME: Handling initialization failures is broken now, 2443 * so we won't even try cleaning up after successfully 2444 * initialized modules. if module_init_complete is false, 2445 * just call spdk_bdev_mgr_unregister_cb 2446 */ 2447 if (!g_bdev_mgr.module_init_complete) { 2448 bdev_mgr_unregister_cb(NULL); 2449 return; 2450 } 2451 2452 /* Start iterating from the last touched module */ 2453 if (!g_resume_bdev_module) { 2454 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2455 } else { 2456 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2457 internal.tailq); 2458 } 2459 2460 while (bdev_module) { 2461 if (bdev_module->async_fini) { 2462 /* Save our place so we can resume later. We must 2463 * save the variable here, before calling module_fini() 2464 * below, because in some cases the module may immediately 2465 * call spdk_bdev_module_fini_done() and re-enter 2466 * this function to continue iterating. */ 2467 g_resume_bdev_module = bdev_module; 2468 } 2469 2470 if (bdev_module->module_fini) { 2471 bdev_module->module_fini(); 2472 } 2473 2474 if (bdev_module->async_fini) { 2475 return; 2476 } 2477 2478 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2479 internal.tailq); 2480 } 2481 2482 g_resume_bdev_module = NULL; 2483 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2484 } 2485 2486 void 2487 spdk_bdev_module_fini_done(void) 2488 { 2489 if (spdk_get_thread() != g_fini_thread) { 2490 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2491 } else { 2492 bdev_module_fini_iter(NULL); 2493 } 2494 } 2495 2496 static void 2497 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2498 { 2499 struct spdk_bdev *bdev = cb_arg; 2500 2501 if (bdeverrno && bdev) { 2502 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2503 bdev->name); 2504 2505 /* 2506 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2507 * bdev; try to continue by manually removing this bdev from the list and continue 2508 * with the next bdev in the list. 
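* The bdev's resources may be leaked in that case, but shutdown can still make forward progress.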
2509 */ 2510 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2511 } 2512 2513 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2514 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2515 /* 2516 * Bdev module finish need to be deferred as we might be in the middle of some context 2517 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2518 * after returning. 2519 */ 2520 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2521 return; 2522 } 2523 2524 /* 2525 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 2526 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2527 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2528 * base bdevs. 2529 * 2530 * Also, walk the list in the reverse order. 2531 */ 2532 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2533 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2534 spdk_spin_lock(&bdev->internal.spinlock); 2535 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2536 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2537 spdk_spin_unlock(&bdev->internal.spinlock); 2538 continue; 2539 } 2540 spdk_spin_unlock(&bdev->internal.spinlock); 2541 2542 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2543 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2544 return; 2545 } 2546 2547 /* 2548 * If any bdev fails to unclaim underlying bdev properly, we may face the 2549 * case of bdev list consisting of claimed bdevs only (if claims are managed 2550 * correctly, this would mean there's a loop in the claims graph which is 2551 * clearly impossible). Warn and unregister last bdev on the list then. 2552 */ 2553 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2554 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2555 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2556 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2557 return; 2558 } 2559 } 2560 2561 static void 2562 bdev_module_fini_start_iter(void *arg) 2563 { 2564 struct spdk_bdev_module *bdev_module; 2565 2566 if (!g_resume_bdev_module) { 2567 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2568 } else { 2569 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2570 } 2571 2572 while (bdev_module) { 2573 if (bdev_module->async_fini_start) { 2574 /* Save our place so we can resume later. We must 2575 * save the variable here, before calling fini_start() 2576 * below, because in some cases the module may immediately 2577 * call spdk_bdev_module_fini_start_done() and re-enter 2578 * this function to continue iterating. 
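* This mirrors the resume logic used for module_fini() in bdev_module_fini_iter() above.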
*/ 2579 g_resume_bdev_module = bdev_module; 2580 } 2581 2582 if (bdev_module->fini_start) { 2583 bdev_module->fini_start(); 2584 } 2585 2586 if (bdev_module->async_fini_start) { 2587 return; 2588 } 2589 2590 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2591 } 2592 2593 g_resume_bdev_module = NULL; 2594 2595 bdev_finish_unregister_bdevs_iter(NULL, 0); 2596 } 2597 2598 void 2599 spdk_bdev_module_fini_start_done(void) 2600 { 2601 if (spdk_get_thread() != g_fini_thread) { 2602 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2603 } else { 2604 bdev_module_fini_start_iter(NULL); 2605 } 2606 } 2607 2608 static void 2609 bdev_finish_wait_for_examine_done(void *cb_arg) 2610 { 2611 bdev_module_fini_start_iter(NULL); 2612 } 2613 2614 static void bdev_open_async_fini(void); 2615 2616 void 2617 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2618 { 2619 int rc; 2620 2621 assert(cb_fn != NULL); 2622 2623 g_fini_thread = spdk_get_thread(); 2624 2625 g_fini_cb_fn = cb_fn; 2626 g_fini_cb_arg = cb_arg; 2627 2628 bdev_open_async_fini(); 2629 2630 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2631 if (rc != 0) { 2632 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2633 bdev_finish_wait_for_examine_done(NULL); 2634 } 2635 } 2636 2637 struct spdk_bdev_io * 2638 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2639 { 2640 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2641 struct spdk_bdev_io *bdev_io; 2642 2643 if (ch->per_thread_cache_count > 0) { 2644 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2645 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2646 ch->per_thread_cache_count--; 2647 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2648 /* 2649 * Don't try to look for bdev_ios in the global pool if there are 2650 * waiters on bdev_ios - we don't want this caller to jump the line. 2651 */ 2652 bdev_io = NULL; 2653 } else { 2654 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2655 } 2656 2657 return bdev_io; 2658 } 2659 2660 void 2661 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2662 { 2663 struct spdk_bdev_mgmt_channel *ch; 2664 2665 assert(bdev_io != NULL); 2666 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2667 2668 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2669 2670 if (bdev_io->internal.f.has_buf) { 2671 bdev_io_put_buf(bdev_io); 2672 } 2673 2674 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2675 ch->per_thread_cache_count++; 2676 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2677 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2678 struct spdk_bdev_io_wait_entry *entry; 2679 2680 entry = TAILQ_FIRST(&ch->io_wait_queue); 2681 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2682 entry->cb_fn(entry->cb_arg); 2683 } 2684 } else { 2685 /* We should never have a full cache with entries on the io wait queue. 
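* Waiters are drained above each time a bdev_io is returned to a non-full cache, so by the time the cache is full the wait queue must already be empty.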
*/ 2686 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2687 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2688 } 2689 } 2690 2691 static bool 2692 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2693 { 2694 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2695 2696 switch (limit) { 2697 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2698 return true; 2699 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2700 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2701 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2702 return false; 2703 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2704 default: 2705 return false; 2706 } 2707 } 2708 2709 static bool 2710 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2711 { 2712 switch (bdev_io->type) { 2713 case SPDK_BDEV_IO_TYPE_NVME_IO: 2714 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2715 case SPDK_BDEV_IO_TYPE_READ: 2716 case SPDK_BDEV_IO_TYPE_WRITE: 2717 return true; 2718 case SPDK_BDEV_IO_TYPE_ZCOPY: 2719 if (bdev_io->u.bdev.zcopy.start) { 2720 return true; 2721 } else { 2722 return false; 2723 } 2724 default: 2725 return false; 2726 } 2727 } 2728 2729 static bool 2730 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2731 { 2732 switch (bdev_io->type) { 2733 case SPDK_BDEV_IO_TYPE_NVME_IO: 2734 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2735 /* Bit 1 (0x2) set for read operation */ 2736 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2737 return true; 2738 } else { 2739 return false; 2740 } 2741 case SPDK_BDEV_IO_TYPE_READ: 2742 return true; 2743 case SPDK_BDEV_IO_TYPE_ZCOPY: 2744 /* Populate to read from disk */ 2745 if (bdev_io->u.bdev.zcopy.populate) { 2746 return true; 2747 } else { 2748 return false; 2749 } 2750 default: 2751 return false; 2752 } 2753 } 2754 2755 static uint64_t 2756 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2757 { 2758 uint32_t blocklen = bdev_io_get_block_size(bdev_io); 2759 2760 switch (bdev_io->type) { 2761 case SPDK_BDEV_IO_TYPE_NVME_IO: 2762 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2763 return bdev_io->u.nvme_passthru.nbytes; 2764 case SPDK_BDEV_IO_TYPE_READ: 2765 case SPDK_BDEV_IO_TYPE_WRITE: 2766 return bdev_io->u.bdev.num_blocks * blocklen; 2767 case SPDK_BDEV_IO_TYPE_ZCOPY: 2768 /* Track the data in the start phase only */ 2769 if (bdev_io->u.bdev.zcopy.start) { 2770 return bdev_io->u.bdev.num_blocks * blocklen; 2771 } else { 2772 return 0; 2773 } 2774 default: 2775 return 0; 2776 } 2777 } 2778 2779 static inline bool 2780 bdev_qos_rw_queue_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2781 { 2782 int64_t remaining_this_timeslice; 2783 2784 if (!limit->max_per_timeslice) { 2785 /* The QoS is disabled */ 2786 return false; 2787 } 2788 2789 remaining_this_timeslice = __atomic_sub_fetch(&limit->remaining_this_timeslice, delta, 2790 __ATOMIC_RELAXED); 2791 if (remaining_this_timeslice + (int64_t)delta > 0) { 2792 /* There was still a quota for this delta -> the IO shouldn't be queued 2793 * 2794 * We allow a slight quota overrun here so an IO bigger than the per-timeslice 2795 * quota can be allowed once a while. Such overrun then taken into account in 2796 * the QoS poller, where the next timeslice quota is calculated. 2797 */ 2798 return false; 2799 } 2800 2801 /* There was no quota for this delta -> the IO should be queued 2802 * The remaining_this_timeslice must be rewinded so it reflects the real 2803 * amount of IOs or bytes allowed. 
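* For example, with remaining_this_timeslice == 1 and an IOPS delta of 1, the atomic subtract above leaves 0, and 0 + 1 > 0, so the I/O is admitted and the quota is now spent. A second I/O in the same timeslice drives the counter to -1; -1 + 1 is not > 0, so that I/O is queued and the delta is added back below.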
2804 */ 2805 __atomic_add_fetch( 2806 &limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2807 return true; 2808 } 2809 2810 static inline void 2811 bdev_qos_rw_rewind_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2812 { 2813 __atomic_add_fetch(&limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2814 } 2815 2816 static bool 2817 bdev_qos_rw_iops_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2818 { 2819 return bdev_qos_rw_queue_io(limit, io, 1); 2820 } 2821 2822 static void 2823 bdev_qos_rw_iops_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2824 { 2825 bdev_qos_rw_rewind_io(limit, io, 1); 2826 } 2827 2828 static bool 2829 bdev_qos_rw_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2830 { 2831 return bdev_qos_rw_queue_io(limit, io, bdev_get_io_size_in_byte(io)); 2832 } 2833 2834 static void 2835 bdev_qos_rw_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2836 { 2837 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2838 } 2839 2840 static bool 2841 bdev_qos_r_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2842 { 2843 if (bdev_is_read_io(io) == false) { 2844 return false; 2845 } 2846 2847 return bdev_qos_rw_bps_queue(limit, io); 2848 } 2849 2850 static void 2851 bdev_qos_r_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2852 { 2853 if (bdev_is_read_io(io) != false) { 2854 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2855 } 2856 } 2857 2858 static bool 2859 bdev_qos_w_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2860 { 2861 if (bdev_is_read_io(io) == true) { 2862 return false; 2863 } 2864 2865 return bdev_qos_rw_bps_queue(limit, io); 2866 } 2867 2868 static void 2869 bdev_qos_w_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2870 { 2871 if (bdev_is_read_io(io) != true) { 2872 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2873 } 2874 } 2875 2876 static void 2877 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2878 { 2879 int i; 2880 2881 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2882 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2883 qos->rate_limits[i].queue_io = NULL; 2884 continue; 2885 } 2886 2887 switch (i) { 2888 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2889 qos->rate_limits[i].queue_io = bdev_qos_rw_iops_queue; 2890 qos->rate_limits[i].rewind_quota = bdev_qos_rw_iops_rewind_quota; 2891 break; 2892 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2893 qos->rate_limits[i].queue_io = bdev_qos_rw_bps_queue; 2894 qos->rate_limits[i].rewind_quota = bdev_qos_rw_bps_rewind_quota; 2895 break; 2896 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2897 qos->rate_limits[i].queue_io = bdev_qos_r_bps_queue; 2898 qos->rate_limits[i].rewind_quota = bdev_qos_r_bps_rewind_quota; 2899 break; 2900 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2901 qos->rate_limits[i].queue_io = bdev_qos_w_bps_queue; 2902 qos->rate_limits[i].rewind_quota = bdev_qos_w_bps_rewind_quota; 2903 break; 2904 default: 2905 break; 2906 } 2907 } 2908 } 2909 2910 static void 2911 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2912 struct spdk_bdev_io *bdev_io, 2913 enum spdk_bdev_io_status status) 2914 { 2915 bdev_io->internal.f.in_submit_request = true; 2916 bdev_io_increment_outstanding(bdev_ch, bdev_ch->shared_resource); 2917 spdk_bdev_io_complete(bdev_io, status); 2918 bdev_io->internal.f.in_submit_request = false; 
2919 } 2920 2921 static inline void 2922 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2923 { 2924 struct spdk_bdev *bdev = bdev_io->bdev; 2925 struct spdk_io_channel *ch = bdev_ch->channel; 2926 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2927 2928 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2929 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2930 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2931 2932 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2933 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2934 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2935 SPDK_BDEV_IO_STATUS_SUCCESS); 2936 return; 2937 } 2938 } 2939 2940 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2941 bdev_io->bdev->split_on_write_unit && 2942 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2943 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2944 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2945 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2946 return; 2947 } 2948 2949 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2950 bdev_io_increment_outstanding(bdev_ch, shared_resource); 2951 bdev_io->internal.f.in_submit_request = true; 2952 bdev_submit_request(bdev, ch, bdev_io); 2953 bdev_io->internal.f.in_submit_request = false; 2954 } else { 2955 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT); 2956 if (shared_resource->nomem_threshold == 0 && shared_resource->io_outstanding == 0) { 2957 /* Special case when we have nomem IOs and no outstanding IOs which completions 2958 * could trigger retry of queued IOs */ 2959 bdev_shared_ch_retry_io(shared_resource); 2960 } 2961 } 2962 } 2963 2964 static bool 2965 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2966 { 2967 int i; 2968 2969 if (bdev_qos_io_to_limit(bdev_io) == true) { 2970 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2971 if (!qos->rate_limits[i].queue_io) { 2972 continue; 2973 } 2974 2975 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2976 bdev_io) == true) { 2977 for (i -= 1; i >= 0 ; i--) { 2978 if (!qos->rate_limits[i].queue_io) { 2979 continue; 2980 } 2981 2982 qos->rate_limits[i].rewind_quota(&qos->rate_limits[i], bdev_io); 2983 } 2984 return true; 2985 } 2986 } 2987 } 2988 2989 return false; 2990 } 2991 2992 static int 2993 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2994 { 2995 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2996 int submitted_ios = 0; 2997 2998 TAILQ_FOREACH_SAFE(bdev_io, &ch->qos_queued_io, internal.link, tmp) { 2999 if (!bdev_qos_queue_io(qos, bdev_io)) { 3000 TAILQ_REMOVE(&ch->qos_queued_io, bdev_io, internal.link); 3001 bdev_io_do_submit(ch, bdev_io); 3002 3003 submitted_ios++; 3004 } 3005 } 3006 3007 return submitted_ios; 3008 } 3009 3010 static void 3011 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 3012 { 3013 int rc; 3014 3015 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 3016 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 3017 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 3018 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 3019 &bdev_io->internal.waitq_entry); 3020 if (rc != 0) { 3021 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 3022 bdev_io->internal.status = 
SPDK_BDEV_IO_STATUS_FAILED; 3023 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3024 } 3025 } 3026 3027 static bool 3028 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 3029 { 3030 uint32_t io_boundary; 3031 struct spdk_bdev *bdev = bdev_io->bdev; 3032 uint32_t max_segment_size = bdev->max_segment_size; 3033 uint32_t max_size = bdev->max_rw_size; 3034 int max_segs = bdev->max_num_segments; 3035 3036 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 3037 io_boundary = bdev->write_unit_size; 3038 } else if (bdev->split_on_optimal_io_boundary) { 3039 io_boundary = bdev->optimal_io_boundary; 3040 } else { 3041 io_boundary = 0; 3042 } 3043 3044 if (spdk_likely(!io_boundary && !max_segs && !max_segment_size && !max_size)) { 3045 return false; 3046 } 3047 3048 if (io_boundary) { 3049 uint64_t start_stripe, end_stripe; 3050 3051 start_stripe = bdev_io->u.bdev.offset_blocks; 3052 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 3053 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 3054 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 3055 start_stripe >>= spdk_u32log2(io_boundary); 3056 end_stripe >>= spdk_u32log2(io_boundary); 3057 } else { 3058 start_stripe /= io_boundary; 3059 end_stripe /= io_boundary; 3060 } 3061 3062 if (start_stripe != end_stripe) { 3063 return true; 3064 } 3065 } 3066 3067 if (max_segs) { 3068 if (bdev_io->u.bdev.iovcnt > max_segs) { 3069 return true; 3070 } 3071 } 3072 3073 if (max_segment_size) { 3074 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 3075 if (bdev_io->u.bdev.iovs[i].iov_len > max_segment_size) { 3076 return true; 3077 } 3078 } 3079 } 3080 3081 if (max_size) { 3082 if (bdev_io->u.bdev.num_blocks > max_size) { 3083 return true; 3084 } 3085 } 3086 3087 return false; 3088 } 3089 3090 static bool 3091 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 3092 { 3093 uint32_t num_unmap_segments; 3094 3095 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 3096 return false; 3097 } 3098 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 3099 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 3100 return true; 3101 } 3102 3103 return false; 3104 } 3105 3106 static bool 3107 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 3108 { 3109 if (!bdev_io->bdev->max_write_zeroes) { 3110 return false; 3111 } 3112 3113 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 3114 return true; 3115 } 3116 3117 return false; 3118 } 3119 3120 static bool 3121 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 3122 { 3123 if (bdev_io->bdev->max_copy != 0 && 3124 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 3125 return true; 3126 } 3127 3128 return false; 3129 } 3130 3131 static bool 3132 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 3133 { 3134 switch (bdev_io->type) { 3135 case SPDK_BDEV_IO_TYPE_READ: 3136 case SPDK_BDEV_IO_TYPE_WRITE: 3137 return bdev_rw_should_split(bdev_io); 3138 case SPDK_BDEV_IO_TYPE_UNMAP: 3139 return bdev_unmap_should_split(bdev_io); 3140 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3141 return bdev_write_zeroes_should_split(bdev_io); 3142 case SPDK_BDEV_IO_TYPE_COPY: 3143 return bdev_copy_should_split(bdev_io); 3144 default: 3145 return false; 3146 } 3147 } 3148 3149 static uint32_t 3150 _to_next_boundary(uint64_t offset, uint32_t boundary) 3151 { 3152 return (boundary - (offset % boundary)); 3153 } 3154 3155 static void 
bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 3156 3157 static void _bdev_rw_split(void *_bdev_io); 3158 3159 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 3160 3161 static void 3162 _bdev_unmap_split(void *_bdev_io) 3163 { 3164 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 3165 } 3166 3167 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 3168 3169 static void 3170 _bdev_write_zeroes_split(void *_bdev_io) 3171 { 3172 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 3173 } 3174 3175 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 3176 3177 static void 3178 _bdev_copy_split(void *_bdev_io) 3179 { 3180 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 3181 } 3182 3183 static int 3184 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 3185 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 3186 { 3187 int rc; 3188 uint64_t current_offset, current_remaining, current_src_offset; 3189 spdk_bdev_io_wait_cb io_wait_fn; 3190 3191 current_offset = *offset; 3192 current_remaining = *remaining; 3193 3194 assert(bdev_io->internal.f.split); 3195 3196 bdev_io->internal.split.outstanding++; 3197 3198 io_wait_fn = _bdev_rw_split; 3199 switch (bdev_io->type) { 3200 case SPDK_BDEV_IO_TYPE_READ: 3201 assert(bdev_io->u.bdev.accel_sequence == NULL); 3202 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 3203 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3204 iov, iovcnt, md_buf, current_offset, 3205 num_blocks, 3206 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 3207 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL, 3208 NULL, 3209 bdev_io->u.bdev.dif_check_flags, 3210 bdev_io_split_done, bdev_io); 3211 break; 3212 case SPDK_BDEV_IO_TYPE_WRITE: 3213 assert(bdev_io->u.bdev.accel_sequence == NULL); 3214 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 3215 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3216 iov, iovcnt, md_buf, current_offset, 3217 num_blocks, 3218 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 3219 bdev_io_use_memory_domain(bdev_io) ? 
bdev_io->internal.memory_domain_ctx : NULL, 3220 NULL, 3221 bdev_io->u.bdev.dif_check_flags, 3222 bdev_io->u.bdev.nvme_cdw12.raw, 3223 bdev_io->u.bdev.nvme_cdw13.raw, 3224 bdev_io_split_done, bdev_io); 3225 break; 3226 case SPDK_BDEV_IO_TYPE_UNMAP: 3227 io_wait_fn = _bdev_unmap_split; 3228 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 3229 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3230 current_offset, num_blocks, 3231 bdev_io_split_done, bdev_io); 3232 break; 3233 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3234 io_wait_fn = _bdev_write_zeroes_split; 3235 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 3236 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3237 current_offset, num_blocks, 3238 bdev_io_split_done, bdev_io); 3239 break; 3240 case SPDK_BDEV_IO_TYPE_COPY: 3241 io_wait_fn = _bdev_copy_split; 3242 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 3243 (current_offset - bdev_io->u.bdev.offset_blocks); 3244 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 3245 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3246 current_offset, current_src_offset, num_blocks, 3247 bdev_io_split_done, bdev_io); 3248 break; 3249 default: 3250 assert(false); 3251 rc = -EINVAL; 3252 break; 3253 } 3254 3255 if (rc == 0) { 3256 current_offset += num_blocks; 3257 current_remaining -= num_blocks; 3258 bdev_io->internal.split.current_offset_blocks = current_offset; 3259 bdev_io->internal.split.remaining_num_blocks = current_remaining; 3260 *offset = current_offset; 3261 *remaining = current_remaining; 3262 } else { 3263 bdev_io->internal.split.outstanding--; 3264 if (rc == -ENOMEM) { 3265 if (bdev_io->internal.split.outstanding == 0) { 3266 /* No I/O is outstanding. Hence we should wait here. */ 3267 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 3268 } 3269 } else { 3270 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3271 if (bdev_io->internal.split.outstanding == 0) { 3272 bdev_ch_remove_from_io_submitted(bdev_io); 3273 spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id, 3274 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx, 3275 bdev_io->internal.ch->queue_depth); 3276 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3277 } 3278 } 3279 } 3280 3281 return rc; 3282 } 3283 3284 static void 3285 _bdev_rw_split(void *_bdev_io) 3286 { 3287 struct iovec *parent_iov, *iov; 3288 struct spdk_bdev_io *bdev_io = _bdev_io; 3289 struct spdk_bdev *bdev = bdev_io->bdev; 3290 uint64_t parent_offset, current_offset, remaining; 3291 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 3292 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 3293 uint32_t iovcnt, iov_len, child_iovsize; 3294 uint32_t blocklen; 3295 uint32_t io_boundary; 3296 uint32_t max_segment_size = bdev->max_segment_size; 3297 uint32_t max_child_iovcnt = bdev->max_num_segments; 3298 uint32_t max_size = bdev->max_rw_size; 3299 void *md_buf = NULL; 3300 int rc; 3301 3302 blocklen = bdev_io_get_block_size(bdev_io); 3303 3304 max_size = max_size ? max_size : UINT32_MAX; 3305 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 3306 max_child_iovcnt = max_child_iovcnt ? 
spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 3307 SPDK_BDEV_IO_NUM_CHILD_IOV; 3308 3309 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 3310 io_boundary = bdev->write_unit_size; 3311 } else if (bdev->split_on_optimal_io_boundary) { 3312 io_boundary = bdev->optimal_io_boundary; 3313 } else { 3314 io_boundary = UINT32_MAX; 3315 } 3316 3317 assert(bdev_io->internal.f.split); 3318 3319 remaining = bdev_io->internal.split.remaining_num_blocks; 3320 current_offset = bdev_io->internal.split.current_offset_blocks; 3321 parent_offset = bdev_io->u.bdev.offset_blocks; 3322 parent_iov_offset = (current_offset - parent_offset) * blocklen; 3323 parent_iovcnt = bdev_io->u.bdev.iovcnt; 3324 3325 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 3326 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3327 if (parent_iov_offset < parent_iov->iov_len) { 3328 break; 3329 } 3330 parent_iov_offset -= parent_iov->iov_len; 3331 } 3332 3333 child_iovcnt = 0; 3334 while (remaining > 0 && parent_iovpos < parent_iovcnt && 3335 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 3336 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3337 to_next_boundary = spdk_min(remaining, to_next_boundary); 3338 to_next_boundary = spdk_min(max_size, to_next_boundary); 3339 to_next_boundary_bytes = to_next_boundary * blocklen; 3340 3341 iov = &bdev_io->child_iov[child_iovcnt]; 3342 iovcnt = 0; 3343 3344 if (bdev_io->u.bdev.md_buf) { 3345 md_buf = (char *)bdev_io->u.bdev.md_buf + 3346 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3347 } 3348 3349 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3350 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3351 iovcnt < child_iovsize) { 3352 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3353 iov_len = parent_iov->iov_len - parent_iov_offset; 3354 3355 iov_len = spdk_min(iov_len, max_segment_size); 3356 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 3357 to_next_boundary_bytes -= iov_len; 3358 3359 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 3360 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 3361 3362 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 3363 parent_iov_offset += iov_len; 3364 } else { 3365 parent_iovpos++; 3366 parent_iov_offset = 0; 3367 } 3368 child_iovcnt++; 3369 iovcnt++; 3370 } 3371 3372 if (to_next_boundary_bytes > 0) { 3373 /* We had to stop this child I/O early because we ran out of 3374 * child_iov space or were limited by max_num_segments. 3375 * Ensure the iovs to be aligned with block size and 3376 * then adjust to_next_boundary before starting the 3377 * child I/O. 
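* Concretely, the tail of the last child iov is trimmed back to a block boundary below and the trimmed bytes are handed back to the parent iov cursor so that the next child I/O picks them up.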
3378 */ 3379 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || 3380 iovcnt == child_iovsize); 3381 to_last_block_bytes = to_next_boundary_bytes % blocklen; 3382 if (to_last_block_bytes != 0) { 3383 uint32_t child_iovpos = child_iovcnt - 1; 3384 /* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV 3385 * so the loop will naturally end 3386 */ 3387 3388 to_last_block_bytes = blocklen - to_last_block_bytes; 3389 to_next_boundary_bytes += to_last_block_bytes; 3390 while (to_last_block_bytes > 0 && iovcnt > 0) { 3391 iov_len = spdk_min(to_last_block_bytes, 3392 bdev_io->child_iov[child_iovpos].iov_len); 3393 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 3394 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 3395 child_iovpos--; 3396 if (--iovcnt == 0) { 3397 /* If the child IO is less than a block size just return. 3398 * If the first child IO of any split round is less than 3399 * a block size, exit with an error. 3400 */ 3401 if (bdev_io->internal.split.outstanding == 0) { 3402 SPDK_ERRLOG("The first child io was less than a block size\n"); 3403 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3404 bdev_ch_remove_from_io_submitted(bdev_io); 3405 spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id, 3406 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx, 3407 bdev_io->internal.ch->queue_depth); 3408 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3409 } 3410 3411 return; 3412 } 3413 } 3414 3415 to_last_block_bytes -= iov_len; 3416 3417 if (parent_iov_offset == 0) { 3418 parent_iovpos--; 3419 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 3420 } 3421 parent_iov_offset -= iov_len; 3422 } 3423 3424 assert(to_last_block_bytes == 0); 3425 } 3426 to_next_boundary -= to_next_boundary_bytes / blocklen; 3427 } 3428 3429 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 3430 &current_offset, &remaining); 3431 if (spdk_unlikely(rc)) { 3432 return; 3433 } 3434 } 3435 } 3436 3437 static void 3438 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 3439 { 3440 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 3441 uint32_t num_children_reqs = 0; 3442 int rc; 3443 3444 assert(bdev_io->internal.f.split); 3445 3446 offset = bdev_io->internal.split.current_offset_blocks; 3447 remaining = bdev_io->internal.split.remaining_num_blocks; 3448 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 3449 3450 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3451 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 3452 3453 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 3454 &offset, &remaining); 3455 if (spdk_likely(rc == 0)) { 3456 num_children_reqs++; 3457 } else { 3458 return; 3459 } 3460 } 3461 } 3462 3463 static void 3464 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 3465 { 3466 uint64_t offset, write_zeroes_blocks, remaining; 3467 uint32_t num_children_reqs = 0; 3468 int rc; 3469 3470 assert(bdev_io->internal.f.split); 3471 3472 offset = bdev_io->internal.split.current_offset_blocks; 3473 remaining = bdev_io->internal.split.remaining_num_blocks; 3474 3475 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3476 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 3477 3478 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 3479 &offset, &remaining); 3480 if (spdk_likely(rc == 0)) { 3481 num_children_reqs++; 3482 } else {
3483 return; 3484 } 3485 } 3486 } 3487 3488 static void 3489 bdev_copy_split(struct spdk_bdev_io *bdev_io) 3490 { 3491 uint64_t offset, copy_blocks, remaining; 3492 uint32_t num_children_reqs = 0; 3493 int rc; 3494 3495 assert(bdev_io->internal.f.split); 3496 3497 offset = bdev_io->internal.split.current_offset_blocks; 3498 remaining = bdev_io->internal.split.remaining_num_blocks; 3499 3500 assert(bdev_io->bdev->max_copy != 0); 3501 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3502 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3503 3504 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3505 &offset, &remaining); 3506 if (spdk_likely(rc == 0)) { 3507 num_children_reqs++; 3508 } else { 3509 return; 3510 } 3511 } 3512 } 3513 3514 static void 3515 parent_bdev_io_complete(void *ctx, int rc) 3516 { 3517 struct spdk_bdev_io *parent_io = ctx; 3518 3519 if (rc) { 3520 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3521 } 3522 3523 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3524 parent_io->internal.caller_ctx); 3525 } 3526 3527 static void 3528 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3529 { 3530 struct spdk_bdev_io *bdev_io = ctx; 3531 3532 /* u.bdev.accel_sequence should have already been cleared at this point */ 3533 assert(bdev_io->u.bdev.accel_sequence == NULL); 3534 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3535 bdev_io->internal.f.has_accel_sequence = false; 3536 3537 if (spdk_unlikely(status != 0)) { 3538 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3539 } 3540 3541 parent_bdev_io_complete(bdev_io, status); 3542 } 3543 3544 static void 3545 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3546 { 3547 struct spdk_bdev_io *parent_io = cb_arg; 3548 3549 spdk_bdev_free_io(bdev_io); 3550 3551 assert(parent_io->internal.f.split); 3552 3553 if (!success) { 3554 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3555 /* If any child I/O failed, stop further splitting process. */ 3556 parent_io->internal.split.current_offset_blocks += parent_io->internal.split.remaining_num_blocks; 3557 parent_io->internal.split.remaining_num_blocks = 0; 3558 } 3559 parent_io->internal.split.outstanding--; 3560 if (parent_io->internal.split.outstanding != 0) { 3561 return; 3562 } 3563 3564 /* 3565 * Parent I/O finishes when all blocks are consumed. 3566 */ 3567 if (parent_io->internal.split.remaining_num_blocks == 0) { 3568 assert(parent_io->internal.cb != bdev_io_split_done); 3569 bdev_ch_remove_from_io_submitted(parent_io); 3570 spdk_trace_record(TRACE_BDEV_IO_DONE, parent_io->internal.ch->trace_id, 3571 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx, 3572 parent_io->internal.ch->queue_depth); 3573 3574 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3575 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3576 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3577 return; 3578 } else if (parent_io->internal.f.has_bounce_buf && 3579 !bdev_io_use_accel_sequence(bdev_io)) { 3580 /* bdev IO will be completed in the callback */ 3581 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3582 return; 3583 } 3584 } 3585 3586 parent_bdev_io_complete(parent_io, 0); 3587 return; 3588 } 3589 3590 /* 3591 * Continue with the splitting process. This function will complete the parent I/O if the 3592 * splitting is done. 
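* Each per-type helper below submits the next batch of child I/Os starting at internal.split.current_offset_blocks and returns here via bdev_io_split_done().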
3593 */ 3594 switch (parent_io->type) { 3595 case SPDK_BDEV_IO_TYPE_READ: 3596 case SPDK_BDEV_IO_TYPE_WRITE: 3597 _bdev_rw_split(parent_io); 3598 break; 3599 case SPDK_BDEV_IO_TYPE_UNMAP: 3600 bdev_unmap_split(parent_io); 3601 break; 3602 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3603 bdev_write_zeroes_split(parent_io); 3604 break; 3605 case SPDK_BDEV_IO_TYPE_COPY: 3606 bdev_copy_split(parent_io); 3607 break; 3608 default: 3609 assert(false); 3610 break; 3611 } 3612 } 3613 3614 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3615 bool success); 3616 3617 static void 3618 bdev_io_split(struct spdk_bdev_io *bdev_io) 3619 { 3620 assert(bdev_io_should_split(bdev_io)); 3621 assert(bdev_io->internal.f.split); 3622 3623 bdev_io->internal.split.current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3624 bdev_io->internal.split.remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3625 bdev_io->internal.split.outstanding = 0; 3626 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3627 3628 switch (bdev_io->type) { 3629 case SPDK_BDEV_IO_TYPE_READ: 3630 case SPDK_BDEV_IO_TYPE_WRITE: 3631 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3632 _bdev_rw_split(bdev_io); 3633 } else { 3634 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3635 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3636 bdev_io->u.bdev.num_blocks * bdev_io_get_block_size(bdev_io)); 3637 } 3638 break; 3639 case SPDK_BDEV_IO_TYPE_UNMAP: 3640 bdev_unmap_split(bdev_io); 3641 break; 3642 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3643 bdev_write_zeroes_split(bdev_io); 3644 break; 3645 case SPDK_BDEV_IO_TYPE_COPY: 3646 bdev_copy_split(bdev_io); 3647 break; 3648 default: 3649 assert(false); 3650 break; 3651 } 3652 } 3653 3654 static void 3655 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3656 { 3657 if (!success) { 3658 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3659 return; 3660 } 3661 3662 _bdev_rw_split(bdev_io); 3663 } 3664 3665 static inline void 3666 _bdev_io_submit(struct spdk_bdev_io *bdev_io) 3667 { 3668 struct spdk_bdev *bdev = bdev_io->bdev; 3669 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3670 3671 if (spdk_likely(bdev_ch->flags == 0)) { 3672 bdev_io_do_submit(bdev_ch, bdev_io); 3673 return; 3674 } 3675 3676 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3677 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3678 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3679 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3680 bdev_abort_queued_io(&bdev_ch->qos_queued_io, bdev_io->u.abort.bio_to_abort)) { 3681 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3682 } else { 3683 TAILQ_INSERT_TAIL(&bdev_ch->qos_queued_io, bdev_io, internal.link); 3684 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3685 } 3686 } else { 3687 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3688 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3689 } 3690 } 3691 3692 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3693 3694 bool 3695 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3696 { 3697 if (range1->length == 0 || range2->length == 0) { 3698 return false; 3699 } 3700 3701 if (range1->offset + range1->length <= range2->offset) { 3702 return false; 3703 } 3704 3705 if (range2->offset + range2->length <= range1->offset) { 3706 return false; 3707 } 3708 
3709 return true; 3710 } 3711 3712 static bool 3713 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3714 { 3715 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3716 struct lba_range r; 3717 3718 switch (bdev_io->type) { 3719 case SPDK_BDEV_IO_TYPE_NVME_IO: 3720 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3721 /* Don't try to decode the NVMe command - just assume worst-case and that 3722 * it overlaps a locked range. 3723 */ 3724 return true; 3725 case SPDK_BDEV_IO_TYPE_READ: 3726 if (!range->quiesce) { 3727 return false; 3728 } 3729 /* fallthrough */ 3730 case SPDK_BDEV_IO_TYPE_WRITE: 3731 case SPDK_BDEV_IO_TYPE_UNMAP: 3732 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3733 case SPDK_BDEV_IO_TYPE_ZCOPY: 3734 case SPDK_BDEV_IO_TYPE_COPY: 3735 r.offset = bdev_io->u.bdev.offset_blocks; 3736 r.length = bdev_io->u.bdev.num_blocks; 3737 if (!bdev_lba_range_overlapped(range, &r)) { 3738 /* This I/O doesn't overlap the specified LBA range. */ 3739 return false; 3740 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3741 /* This I/O overlaps, but the I/O is on the same channel that locked this 3742 * range, and the caller_ctx is the same as the locked_ctx. This means 3743 * that this I/O is associated with the lock, and is allowed to execute. 3744 */ 3745 return false; 3746 } else { 3747 return true; 3748 } 3749 default: 3750 return false; 3751 } 3752 } 3753 3754 void 3755 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3756 { 3757 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3758 3759 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3760 3761 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3762 struct lba_range *range; 3763 3764 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3765 if (bdev_io_range_is_locked(bdev_io, range)) { 3766 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3767 return; 3768 } 3769 } 3770 } 3771 3772 bdev_ch_add_to_io_submitted(bdev_io); 3773 3774 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3775 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 3776 ch->trace_id, bdev_io->u.bdev.num_blocks, 3777 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3778 bdev_io->u.bdev.offset_blocks, ch->queue_depth); 3779 3780 if (bdev_io->internal.f.split) { 3781 bdev_io_split(bdev_io); 3782 return; 3783 } 3784 3785 _bdev_io_submit(bdev_io); 3786 } 3787 3788 static inline int 3789 bdev_io_init_dif_ctx(struct spdk_bdev_io *bdev_io) 3790 { 3791 struct spdk_bdev *bdev = bdev_io->bdev; 3792 struct spdk_dif_ctx_init_ext_opts dif_opts; 3793 3794 memset(&bdev_io->u.bdev.dif_err, 0, sizeof(struct spdk_dif_error)); 3795 3796 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 3797 dif_opts.dif_pi_format = bdev->dif_pi_format; 3798 3799 return spdk_dif_ctx_init(&bdev_io->u.bdev.dif_ctx, 3800 bdev->blocklen, 3801 bdev->md_len, 3802 bdev->md_interleave, 3803 bdev->dif_is_head_of_md, 3804 bdev->dif_type, 3805 bdev_io->u.bdev.dif_check_flags, 3806 bdev_io->u.bdev.offset_blocks & 0xFFFFFFFF, 3807 0xFFFF, 0, 0, 0, &dif_opts); 3808 } 3809 3810 static void 3811 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3812 bool success) 3813 { 3814 if (!success) { 3815 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 3816 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3817 bdev_io_complete_unsubmitted(bdev_io); 3818 return; 3819 } 3820 3821 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 3822 if 
(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3823 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3824 return; 3825 } 3826 /* For reads we'll execute the sequence after the data is read, so, for now, only 3827 * clear out accel_sequence pointer and submit the IO */ 3828 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3829 bdev_io->u.bdev.accel_sequence = NULL; 3830 } 3831 3832 bdev_io_submit(bdev_io); 3833 } 3834 3835 static inline void 3836 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3837 { 3838 /* The bdev doesn't support memory domains, so the buffers in this IO request can't 3839 * be accessed directly; bounce buffers must be allocated before issuing the IO. 3840 * For a write we need to pull the data out of the memory domain before submitting the IO. 3841 * Once a read completes, we need to use the memory domain push functionality to 3842 * update the data in the original memory domain IO buffer. 3843 * 3844 * Likewise, if this I/O request is not aware of metadata, its buffers can't be 3845 * accessed directly either, and buffers must be allocated before issuing the IO. 3846 * For a write we need to insert metadata before submitting the IO. Once a read 3847 * completes, we need to strip the metadata from the original IO buffer. 3848 * 3849 * This IO request will go through the regular IO flow, so clear the memory domain pointers. */ 3850 assert(bdev_io_use_memory_domain(bdev_io) || 3851 bdev_io_needs_metadata(bdev_io->internal.desc, bdev_io)); 3852 3853 bdev_io->u.bdev.memory_domain = NULL; 3854 bdev_io->u.bdev.memory_domain_ctx = NULL; 3855 _bdev_io_get_bounce_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3856 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3857 } 3858 3859 static inline void 3860 _bdev_io_ext_use_accel_buffer(struct spdk_bdev_io *bdev_io) 3861 { 3862 assert(bdev_io_use_memory_domain(bdev_io)); 3863 assert(bdev_io_needs_metadata(bdev_io->internal.desc, bdev_io)); 3864 3865 bdev_io->u.bdev.memory_domain = NULL; 3866 bdev_io->u.bdev.memory_domain_ctx = NULL; 3867 bdev_io_get_accel_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3868 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3869 } 3870 3871 /* We need to allocate bounce buffer 3872 * - if bdev doesn't support memory domains, 3873 * - if it does support them, but we need to execute an accel sequence and the data buffer is 3874 * from accel memory domain (to avoid doing a push/pull from that domain), or 3875 * - if IO is not aware of metadata. 3876 */ 3877 static inline bool 3878 bdev_io_needs_bounce_buffer(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3879 { 3880 if (bdev_io_use_memory_domain(bdev_io)) { 3881 if (!desc->memory_domains_supported || 3882 (bdev_io_needs_sequence_exec(desc, bdev_io) && 3883 (bdev_io->internal.memory_domain == spdk_accel_get_memory_domain() || 3884 bdev_io_needs_metadata(desc, bdev_io)))) { 3885 return true; 3886 } 3887 3888 return false; 3889 } 3890 3891 if (bdev_io_needs_metadata(desc, bdev_io)) { 3892 return true; 3893 } 3894 3895 return false; 3896 } 3897 3898 /* We need to allocate fake accel buffer if bdev supports memory domains but IO is not 3899 * aware of metadata.
3900 */ 3901 static inline bool 3902 bdev_io_needs_accel_buffer(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3903 { 3904 if (bdev_io_needs_metadata(desc, bdev_io)) { 3905 assert(bdev_io_use_memory_domain(bdev_io)); 3906 return true; 3907 } 3908 3909 return false; 3910 } 3911 3912 static inline void 3913 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3914 { 3915 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3916 int rc; 3917 3918 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3919 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3920 bdev_io_complete_unsubmitted(bdev_io); 3921 return; 3922 } 3923 3924 if (bdev_io_needs_metadata(desc, bdev_io)) { 3925 rc = bdev_io_init_dif_ctx(bdev_io); 3926 if (spdk_unlikely(rc != 0)) { 3927 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3928 bdev_io_complete_unsubmitted(bdev_io); 3929 return; 3930 } 3931 } 3932 3933 if (bdev_io_needs_bounce_buffer(desc, bdev_io)) { 3934 _bdev_io_ext_use_bounce_buffer(bdev_io); 3935 return; 3936 } 3937 3938 if (bdev_io_needs_accel_buffer(desc, bdev_io)) { 3939 _bdev_io_ext_use_accel_buffer(bdev_io); 3940 return; 3941 } 3942 3943 if (bdev_io_needs_sequence_exec(desc, bdev_io)) { 3944 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3945 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3946 return; 3947 } 3948 /* For reads we'll execute the sequence after the data is read, so, for now, only 3949 * clear out accel_sequence pointer and submit the IO */ 3950 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3951 bdev_io->u.bdev.accel_sequence = NULL; 3952 } 3953 3954 bdev_io_submit(bdev_io); 3955 } 3956 3957 static void 3958 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3959 { 3960 struct spdk_bdev *bdev = bdev_io->bdev; 3961 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3962 struct spdk_io_channel *ch = bdev_ch->channel; 3963 3964 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3965 3966 bdev_io->internal.f.in_submit_request = true; 3967 bdev_submit_request(bdev, ch, bdev_io); 3968 bdev_io->internal.f.in_submit_request = false; 3969 } 3970 3971 void 3972 bdev_io_init(struct spdk_bdev_io *bdev_io, 3973 struct spdk_bdev *bdev, void *cb_arg, 3974 spdk_bdev_io_completion_cb cb) 3975 { 3976 bdev_io->bdev = bdev; 3977 bdev_io->internal.f.raw = 0; 3978 bdev_io->internal.caller_ctx = cb_arg; 3979 bdev_io->internal.cb = cb; 3980 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3981 bdev_io->internal.f.in_submit_request = false; 3982 bdev_io->internal.error.nvme.cdw0 = 0; 3983 bdev_io->num_retries = 0; 3984 bdev_io->internal.get_buf_cb = NULL; 3985 bdev_io->internal.get_aux_buf_cb = NULL; 3986 bdev_io->internal.data_transfer_cpl = NULL; 3987 bdev_io->internal.f.split = bdev_io_should_split(bdev_io); 3988 } 3989 3990 static bool 3991 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3992 { 3993 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3994 } 3995 3996 bool 3997 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3998 { 3999 bool supported; 4000 4001 supported = bdev_io_type_supported(bdev, io_type); 4002 4003 if (!supported) { 4004 switch (io_type) { 4005 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 4006 /* The bdev layer will emulate write zeroes as long as write is supported. 
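 * (typically by issuing regular writes of zeroed buffers on the emulation path).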
*/ 4007 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 4008 break; 4009 default: 4010 break; 4011 } 4012 } 4013 4014 return supported; 4015 } 4016 4017 static const char *g_io_type_strings[] = { 4018 [SPDK_BDEV_IO_TYPE_READ] = "read", 4019 [SPDK_BDEV_IO_TYPE_WRITE] = "write", 4020 [SPDK_BDEV_IO_TYPE_UNMAP] = "unmap", 4021 [SPDK_BDEV_IO_TYPE_FLUSH] = "flush", 4022 [SPDK_BDEV_IO_TYPE_RESET] = "reset", 4023 [SPDK_BDEV_IO_TYPE_NVME_ADMIN] = "nvme_admin", 4024 [SPDK_BDEV_IO_TYPE_NVME_IO] = "nvme_io", 4025 [SPDK_BDEV_IO_TYPE_NVME_IO_MD] = "nvme_io_md", 4026 [SPDK_BDEV_IO_TYPE_WRITE_ZEROES] = "write_zeroes", 4027 [SPDK_BDEV_IO_TYPE_ZCOPY] = "zcopy", 4028 [SPDK_BDEV_IO_TYPE_GET_ZONE_INFO] = "get_zone_info", 4029 [SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT] = "zone_management", 4030 [SPDK_BDEV_IO_TYPE_ZONE_APPEND] = "zone_append", 4031 [SPDK_BDEV_IO_TYPE_COMPARE] = "compare", 4032 [SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE] = "compare_and_write", 4033 [SPDK_BDEV_IO_TYPE_ABORT] = "abort", 4034 [SPDK_BDEV_IO_TYPE_SEEK_HOLE] = "seek_hole", 4035 [SPDK_BDEV_IO_TYPE_SEEK_DATA] = "seek_data", 4036 [SPDK_BDEV_IO_TYPE_COPY] = "copy", 4037 [SPDK_BDEV_IO_TYPE_NVME_IOV_MD] = "nvme_iov_md", 4038 }; 4039 4040 const char * 4041 spdk_bdev_get_io_type_name(enum spdk_bdev_io_type io_type) 4042 { 4043 if (io_type <= SPDK_BDEV_IO_TYPE_INVALID || io_type >= SPDK_BDEV_NUM_IO_TYPES) { 4044 return NULL; 4045 } 4046 4047 return g_io_type_strings[io_type]; 4048 } 4049 4050 int 4051 spdk_bdev_get_io_type(const char *io_type_string) 4052 { 4053 int i; 4054 4055 for (i = SPDK_BDEV_IO_TYPE_READ; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 4056 if (!strcmp(io_type_string, g_io_type_strings[i])) { 4057 return i; 4058 } 4059 } 4060 4061 return -1; 4062 } 4063 4064 uint64_t 4065 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 4066 { 4067 return bdev_io->internal.submit_tsc; 4068 } 4069 4070 bool 4071 spdk_bdev_io_hide_metadata(struct spdk_bdev_io *bdev_io) 4072 { 4073 return bdev_io->internal.desc->opts.hide_metadata; 4074 } 4075 4076 int 4077 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 4078 { 4079 if (bdev->fn_table->dump_info_json) { 4080 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 4081 } 4082 4083 return 0; 4084 } 4085 4086 static void 4087 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 4088 { 4089 uint32_t max_per_timeslice = 0; 4090 int i; 4091 4092 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4093 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4094 qos->rate_limits[i].max_per_timeslice = 0; 4095 continue; 4096 } 4097 4098 max_per_timeslice = qos->rate_limits[i].limit * 4099 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 4100 4101 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 4102 qos->rate_limits[i].min_per_timeslice); 4103 4104 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 4105 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELEASE); 4106 } 4107 4108 bdev_qos_set_ops(qos); 4109 } 4110 4111 static void 4112 bdev_channel_submit_qos_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4113 struct spdk_io_channel *io_ch, void *ctx) 4114 { 4115 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4116 int status; 4117 4118 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 4119 4120 /* If all queued IOs were submitted, continue the iteration; otherwise stop it. */ 4121 /* TODO: round-robin across channels */ 4122 status = TAILQ_EMPTY(&bdev_ch->qos_queued_io) ?
0 : 1; 4123 4124 spdk_bdev_for_each_channel_continue(i, status); 4125 } 4126 4127 4128 static void 4129 bdev_channel_submit_qos_io_done(struct spdk_bdev *bdev, void *ctx, int status) 4130 { 4131 4132 } 4133 4134 static int 4135 bdev_channel_poll_qos(void *arg) 4136 { 4137 struct spdk_bdev *bdev = arg; 4138 struct spdk_bdev_qos *qos = bdev->internal.qos; 4139 uint64_t now = spdk_get_ticks(); 4140 int i; 4141 int64_t remaining_last_timeslice; 4142 4143 if (spdk_unlikely(qos->thread == NULL)) { 4144 /* The old QoS was unbound so that it can be removed, and the new QoS is not enabled yet. */ 4145 return SPDK_POLLER_IDLE; 4146 } 4147 4148 if (now < (qos->last_timeslice + qos->timeslice_size)) { 4149 /* We received our callback earlier than expected - return 4150 * immediately and wait to do accounting until at least one 4151 * timeslice has actually expired. This should never happen 4152 * with a well-behaved timer implementation. 4153 */ 4154 return SPDK_POLLER_IDLE; 4155 } 4156 4157 /* Reset for next round of rate limiting */ 4158 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4159 /* We may have allowed the IOs or bytes to slightly overrun in the last 4160 * timeslice. remaining_this_timeslice is signed, so if it's negative 4161 * here, we'll account for the overrun so that the next timeslice will 4162 * be appropriately reduced. 4163 */ 4164 remaining_last_timeslice = __atomic_exchange_n(&qos->rate_limits[i].remaining_this_timeslice, 4165 0, __ATOMIC_RELAXED); 4166 if (remaining_last_timeslice < 0) { 4167 /* There could be a race condition here as both bdev_qos_rw_queue_io() and bdev_channel_poll_qos() 4168 * potentially use 2 atomic ops each, so they can interleave. 4169 * This race can potentially cause the limits to be a little fuzzy but won't cause any real damage.
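 * For example, a deduction made by the submit path between the exchange above and the
 * conditional store below may be overwritten; that I/O's cost is then simply not carried
 * over, making the next timeslice marginally more generous than configured.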
4170 */ 4171 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 4172 remaining_last_timeslice, __ATOMIC_RELAXED); 4173 } 4174 } 4175 4176 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 4177 qos->last_timeslice += qos->timeslice_size; 4178 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4179 __atomic_add_fetch(&qos->rate_limits[i].remaining_this_timeslice, 4180 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELAXED); 4181 } 4182 } 4183 4184 spdk_bdev_for_each_channel(bdev, bdev_channel_submit_qos_io, qos, 4185 bdev_channel_submit_qos_io_done); 4186 4187 return SPDK_POLLER_BUSY; 4188 } 4189 4190 static void 4191 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 4192 { 4193 struct spdk_bdev_shared_resource *shared_resource; 4194 struct lba_range *range; 4195 4196 bdev_free_io_stat(ch->stat); 4197 #ifdef SPDK_CONFIG_VTUNE 4198 bdev_free_io_stat(ch->prev_stat); 4199 #endif 4200 4201 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 4202 range = TAILQ_FIRST(&ch->locked_ranges); 4203 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 4204 free(range); 4205 } 4206 4207 spdk_put_io_channel(ch->channel); 4208 spdk_put_io_channel(ch->accel_channel); 4209 4210 shared_resource = ch->shared_resource; 4211 4212 assert(TAILQ_EMPTY(&ch->io_locked)); 4213 assert(TAILQ_EMPTY(&ch->io_submitted)); 4214 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 4215 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 4216 assert(ch->io_outstanding == 0); 4217 assert(shared_resource->ref > 0); 4218 shared_resource->ref--; 4219 if (shared_resource->ref == 0) { 4220 assert(shared_resource->io_outstanding == 0); 4221 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 4222 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 4223 spdk_poller_unregister(&shared_resource->nomem_poller); 4224 free(shared_resource); 4225 } 4226 } 4227 4228 static void 4229 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 4230 { 4231 struct spdk_bdev_qos *qos = bdev->internal.qos; 4232 int i; 4233 4234 assert(spdk_spin_held(&bdev->internal.spinlock)); 4235 4236 /* Rate limiting on this bdev enabled */ 4237 if (qos) { 4238 if (qos->ch == NULL) { 4239 struct spdk_io_channel *io_ch; 4240 4241 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 4242 bdev->name, spdk_get_thread()); 4243 4244 /* No qos channel has been selected, so set one up */ 4245 4246 /* Take another reference to ch */ 4247 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 4248 assert(io_ch != NULL); 4249 qos->ch = ch; 4250 4251 qos->thread = spdk_io_channel_get_thread(io_ch); 4252 4253 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4254 if (bdev_qos_is_iops_rate_limit(i) == true) { 4255 qos->rate_limits[i].min_per_timeslice = 4256 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 4257 } else { 4258 qos->rate_limits[i].min_per_timeslice = 4259 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 4260 } 4261 4262 if (qos->rate_limits[i].limit == 0) { 4263 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 4264 } 4265 } 4266 bdev_qos_update_max_quota_per_timeslice(qos); 4267 qos->timeslice_size = 4268 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 4269 qos->last_timeslice = spdk_get_ticks(); 4270 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 4271 bdev, 4272 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 4273 } 4274 4275 ch->flags |= BDEV_CH_QOS_ENABLED; 4276 } 4277 } 4278 4279 struct poll_timeout_ctx { 4280 struct spdk_bdev_desc 
*desc; 4281 uint64_t timeout_in_sec; 4282 spdk_bdev_io_timeout_cb cb_fn; 4283 void *cb_arg; 4284 }; 4285 4286 static void 4287 bdev_desc_free(struct spdk_bdev_desc *desc) 4288 { 4289 spdk_spin_destroy(&desc->spinlock); 4290 free(desc->media_events_buffer); 4291 free(desc); 4292 } 4293 4294 static void 4295 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 4296 { 4297 struct poll_timeout_ctx *ctx = _ctx; 4298 struct spdk_bdev_desc *desc = ctx->desc; 4299 4300 free(ctx); 4301 4302 spdk_spin_lock(&desc->spinlock); 4303 desc->refs--; 4304 if (desc->closed == true && desc->refs == 0) { 4305 spdk_spin_unlock(&desc->spinlock); 4306 bdev_desc_free(desc); 4307 return; 4308 } 4309 spdk_spin_unlock(&desc->spinlock); 4310 } 4311 4312 static void 4313 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4314 struct spdk_io_channel *io_ch, void *_ctx) 4315 { 4316 struct poll_timeout_ctx *ctx = _ctx; 4317 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4318 struct spdk_bdev_desc *desc = ctx->desc; 4319 struct spdk_bdev_io *bdev_io; 4320 uint64_t now; 4321 4322 spdk_spin_lock(&desc->spinlock); 4323 if (desc->closed == true) { 4324 spdk_spin_unlock(&desc->spinlock); 4325 spdk_bdev_for_each_channel_continue(i, -1); 4326 return; 4327 } 4328 spdk_spin_unlock(&desc->spinlock); 4329 4330 now = spdk_get_ticks(); 4331 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 4332 /* Exclude any I/O that are generated via splitting. */ 4333 if (bdev_io->internal.cb == bdev_io_split_done) { 4334 continue; 4335 } 4336 4337 /* Once we find an I/O that has not timed out, we can immediately 4338 * exit the loop. 4339 */ 4340 if (now < (bdev_io->internal.submit_tsc + 4341 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 4342 goto end; 4343 } 4344 4345 if (bdev_io->internal.desc == desc) { 4346 ctx->cb_fn(ctx->cb_arg, bdev_io); 4347 } 4348 } 4349 4350 end: 4351 spdk_bdev_for_each_channel_continue(i, 0); 4352 } 4353 4354 static int 4355 bdev_poll_timeout_io(void *arg) 4356 { 4357 struct spdk_bdev_desc *desc = arg; 4358 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4359 struct poll_timeout_ctx *ctx; 4360 4361 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 4362 if (!ctx) { 4363 SPDK_ERRLOG("failed to allocate memory\n"); 4364 return SPDK_POLLER_BUSY; 4365 } 4366 ctx->desc = desc; 4367 ctx->cb_arg = desc->cb_arg; 4368 ctx->cb_fn = desc->cb_fn; 4369 ctx->timeout_in_sec = desc->timeout_in_sec; 4370 4371 /* Take a ref on the descriptor in case it gets closed while we are checking 4372 * all of the channels. 
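 * The matching decrement happens in bdev_channel_poll_timeout_io_done(); if the descriptor
 * was closed in the meantime and this was the last reference, it is freed there via
 * bdev_desc_free().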
4373 */ 4374 spdk_spin_lock(&desc->spinlock); 4375 desc->refs++; 4376 spdk_spin_unlock(&desc->spinlock); 4377 4378 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 4379 bdev_channel_poll_timeout_io_done); 4380 4381 return SPDK_POLLER_BUSY; 4382 } 4383 4384 int 4385 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 4386 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 4387 { 4388 assert(desc->thread == spdk_get_thread()); 4389 4390 spdk_poller_unregister(&desc->io_timeout_poller); 4391 4392 if (timeout_in_sec) { 4393 assert(cb_fn != NULL); 4394 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 4395 desc, 4396 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 4397 1000); 4398 if (desc->io_timeout_poller == NULL) { 4399 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 4400 return -1; 4401 } 4402 } 4403 4404 desc->cb_fn = cb_fn; 4405 desc->cb_arg = cb_arg; 4406 desc->timeout_in_sec = timeout_in_sec; 4407 4408 return 0; 4409 } 4410 4411 static int 4412 bdev_channel_create(void *io_device, void *ctx_buf) 4413 { 4414 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 4415 struct spdk_bdev_channel *ch = ctx_buf; 4416 struct spdk_io_channel *mgmt_io_ch; 4417 struct spdk_bdev_mgmt_channel *mgmt_ch; 4418 struct spdk_bdev_shared_resource *shared_resource; 4419 struct lba_range *range; 4420 4421 ch->bdev = bdev; 4422 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 4423 if (!ch->channel) { 4424 return -1; 4425 } 4426 4427 ch->accel_channel = spdk_accel_get_io_channel(); 4428 if (!ch->accel_channel) { 4429 spdk_put_io_channel(ch->channel); 4430 return -1; 4431 } 4432 4433 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, bdev->internal.trace_id, 0, 0, 4434 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4435 4436 assert(ch->histogram == NULL); 4437 if (bdev->internal.histogram_enabled) { 4438 ch->histogram = spdk_histogram_data_alloc(); 4439 if (ch->histogram == NULL) { 4440 SPDK_ERRLOG("Could not allocate histogram\n"); 4441 } 4442 } 4443 4444 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 4445 if (!mgmt_io_ch) { 4446 spdk_put_io_channel(ch->channel); 4447 spdk_put_io_channel(ch->accel_channel); 4448 return -1; 4449 } 4450 4451 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 4452 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 4453 if (shared_resource->shared_ch == ch->channel) { 4454 spdk_put_io_channel(mgmt_io_ch); 4455 shared_resource->ref++; 4456 break; 4457 } 4458 } 4459 4460 if (shared_resource == NULL) { 4461 shared_resource = calloc(1, sizeof(*shared_resource)); 4462 if (shared_resource == NULL) { 4463 spdk_put_io_channel(ch->channel); 4464 spdk_put_io_channel(ch->accel_channel); 4465 spdk_put_io_channel(mgmt_io_ch); 4466 return -1; 4467 } 4468 4469 shared_resource->mgmt_ch = mgmt_ch; 4470 shared_resource->io_outstanding = 0; 4471 TAILQ_INIT(&shared_resource->nomem_io); 4472 shared_resource->nomem_threshold = 0; 4473 shared_resource->shared_ch = ch->channel; 4474 shared_resource->ref = 1; 4475 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 4476 } 4477 4478 ch->io_outstanding = 0; 4479 TAILQ_INIT(&ch->locked_ranges); 4480 TAILQ_INIT(&ch->qos_queued_io); 4481 ch->flags = 0; 4482 ch->trace_id = bdev->internal.trace_id; 4483 ch->shared_resource = shared_resource; 4484 4485 TAILQ_INIT(&ch->io_submitted); 4486 TAILQ_INIT(&ch->io_locked); 4487 TAILQ_INIT(&ch->io_accel_exec); 4488 TAILQ_INIT(&ch->io_memory_domain); 4489 4490 ch->stat = bdev_alloc_io_stat(false); 
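/* ch->stat accumulates this channel's I/O statistics; when the channel is destroyed they are
 * folded into the bdev-wide totals (see bdev_channel_destroy()). */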
4491 if (ch->stat == NULL) { 4492 bdev_channel_destroy_resource(ch); 4493 return -1; 4494 } 4495 4496 ch->stat->ticks_rate = spdk_get_ticks_hz(); 4497 4498 #ifdef SPDK_CONFIG_VTUNE 4499 { 4500 char *name; 4501 __itt_init_ittlib(NULL, 0); 4502 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 4503 if (!name) { 4504 bdev_channel_destroy_resource(ch); 4505 return -1; 4506 } 4507 ch->handle = __itt_string_handle_create(name); 4508 free(name); 4509 ch->start_tsc = spdk_get_ticks(); 4510 ch->interval_tsc = spdk_get_ticks_hz() / 100; 4511 ch->prev_stat = bdev_alloc_io_stat(false); 4512 if (ch->prev_stat == NULL) { 4513 bdev_channel_destroy_resource(ch); 4514 return -1; 4515 } 4516 } 4517 #endif 4518 4519 spdk_spin_lock(&bdev->internal.spinlock); 4520 bdev_enable_qos(bdev, ch); 4521 4522 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 4523 struct lba_range *new_range; 4524 4525 new_range = calloc(1, sizeof(*new_range)); 4526 if (new_range == NULL) { 4527 spdk_spin_unlock(&bdev->internal.spinlock); 4528 bdev_channel_destroy_resource(ch); 4529 return -1; 4530 } 4531 new_range->length = range->length; 4532 new_range->offset = range->offset; 4533 new_range->locked_ctx = range->locked_ctx; 4534 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 4535 } 4536 4537 spdk_spin_unlock(&bdev->internal.spinlock); 4538 4539 return 0; 4540 } 4541 4542 static int 4543 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 4544 void *cb_ctx) 4545 { 4546 struct spdk_bdev_channel *bdev_ch = cb_ctx; 4547 struct spdk_bdev_io *bdev_io; 4548 uint64_t buf_len; 4549 4550 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4551 if (bdev_io->internal.ch == bdev_ch) { 4552 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len); 4553 spdk_iobuf_entry_abort(ch, entry, buf_len); 4554 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4555 } 4556 4557 return 0; 4558 } 4559 4560 /* 4561 * Abort I/O that are waiting on a data buffer. 4562 */ 4563 static void 4564 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4565 { 4566 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, bdev_abort_all_buf_io_cb, ch); 4567 } 4568 4569 /* 4570 * Abort I/O that are queued waiting for submission. These types of I/O are 4571 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4572 */ 4573 static void 4574 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4575 { 4576 struct spdk_bdev_io *bdev_io, *tmp; 4577 4578 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4579 if (bdev_io->internal.ch == ch) { 4580 TAILQ_REMOVE(queue, bdev_io, internal.link); 4581 /* 4582 * spdk_bdev_io_complete() assumes that the completed I/O had 4583 * been submitted to the bdev module. Since in this case it 4584 * hadn't, bump io_outstanding to account for the decrement 4585 * that spdk_bdev_io_complete() will do. 
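 * Resets are not accounted in io_outstanding, which is why the increment below is skipped
 * for them.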
4586 */ 4587 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4588 bdev_io_increment_outstanding(ch, ch->shared_resource); 4589 } 4590 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4591 } 4592 } 4593 } 4594 4595 static bool 4596 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4597 { 4598 struct spdk_bdev_io *bdev_io; 4599 4600 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4601 if (bdev_io == bio_to_abort) { 4602 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4603 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4604 return true; 4605 } 4606 } 4607 4608 return false; 4609 } 4610 4611 static int 4612 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4613 { 4614 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4615 uint64_t buf_len; 4616 4617 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4618 if (bdev_io == bio_to_abort) { 4619 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len); 4620 spdk_iobuf_entry_abort(ch, entry, buf_len); 4621 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4622 return 1; 4623 } 4624 4625 return 0; 4626 } 4627 4628 static bool 4629 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4630 { 4631 int rc; 4632 4633 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, bdev_abort_buf_io_cb, bio_to_abort); 4634 return rc == 1; 4635 } 4636 4637 static void 4638 bdev_qos_channel_destroy(void *cb_arg) 4639 { 4640 struct spdk_bdev_qos *qos = cb_arg; 4641 4642 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4643 spdk_poller_unregister(&qos->poller); 4644 4645 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4646 4647 free(qos); 4648 } 4649 4650 static int 4651 bdev_qos_destroy(struct spdk_bdev *bdev) 4652 { 4653 int i; 4654 4655 /* 4656 * Cleanly shutting down the QoS poller is tricky, because 4657 * during the asynchronous operation the user could open 4658 * a new descriptor and create a new channel, spawning 4659 * a new QoS poller. 4660 * 4661 * The strategy is to create a new QoS structure here and swap it 4662 * in. The shutdown path then continues to refer to the old one 4663 * until it completes and then releases it. 4664 */ 4665 struct spdk_bdev_qos *new_qos, *old_qos; 4666 4667 old_qos = bdev->internal.qos; 4668 4669 new_qos = calloc(1, sizeof(*new_qos)); 4670 if (!new_qos) { 4671 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4672 return -ENOMEM; 4673 } 4674 4675 /* Copy the old QoS data into the newly allocated structure */ 4676 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4677 4678 /* Zero out the key parts of the QoS structure */ 4679 new_qos->ch = NULL; 4680 new_qos->thread = NULL; 4681 new_qos->poller = NULL; 4682 /* 4683 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4684 * It will be used later for the new QoS structure. 4685 */ 4686 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4687 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4688 new_qos->rate_limits[i].min_per_timeslice = 0; 4689 new_qos->rate_limits[i].max_per_timeslice = 0; 4690 } 4691 4692 bdev->internal.qos = new_qos; 4693 4694 if (old_qos->thread == NULL) { 4695 free(old_qos); 4696 } else { 4697 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4698 } 4699 4700 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4701 * been destroyed yet. 
The destruction path will end up waiting for the final 4702 * channel to be put before it releases resources. */ 4703 4704 return 0; 4705 } 4706 4707 void 4708 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4709 { 4710 total->bytes_read += add->bytes_read; 4711 total->num_read_ops += add->num_read_ops; 4712 total->bytes_written += add->bytes_written; 4713 total->num_write_ops += add->num_write_ops; 4714 total->bytes_unmapped += add->bytes_unmapped; 4715 total->num_unmap_ops += add->num_unmap_ops; 4716 total->bytes_copied += add->bytes_copied; 4717 total->num_copy_ops += add->num_copy_ops; 4718 total->read_latency_ticks += add->read_latency_ticks; 4719 total->write_latency_ticks += add->write_latency_ticks; 4720 total->unmap_latency_ticks += add->unmap_latency_ticks; 4721 total->copy_latency_ticks += add->copy_latency_ticks; 4722 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4723 total->max_read_latency_ticks = add->max_read_latency_ticks; 4724 } 4725 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4726 total->min_read_latency_ticks = add->min_read_latency_ticks; 4727 } 4728 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4729 total->max_write_latency_ticks = add->max_write_latency_ticks; 4730 } 4731 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4732 total->min_write_latency_ticks = add->min_write_latency_ticks; 4733 } 4734 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4735 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4736 } 4737 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4738 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4739 } 4740 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4741 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4742 } 4743 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4744 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4745 } 4746 } 4747 4748 static void 4749 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4750 { 4751 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4752 4753 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4754 memcpy(to_stat->io_error, from_stat->io_error, 4755 sizeof(struct spdk_bdev_io_error_stat)); 4756 } 4757 } 4758 4759 void 4760 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4761 { 4762 if (mode == SPDK_BDEV_RESET_STAT_NONE) { 4763 return; 4764 } 4765 4766 stat->max_read_latency_ticks = 0; 4767 stat->min_read_latency_ticks = UINT64_MAX; 4768 stat->max_write_latency_ticks = 0; 4769 stat->min_write_latency_ticks = UINT64_MAX; 4770 stat->max_unmap_latency_ticks = 0; 4771 stat->min_unmap_latency_ticks = UINT64_MAX; 4772 stat->max_copy_latency_ticks = 0; 4773 stat->min_copy_latency_ticks = UINT64_MAX; 4774 4775 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4776 return; 4777 } 4778 4779 stat->bytes_read = 0; 4780 stat->num_read_ops = 0; 4781 stat->bytes_written = 0; 4782 stat->num_write_ops = 0; 4783 stat->bytes_unmapped = 0; 4784 stat->num_unmap_ops = 0; 4785 stat->bytes_copied = 0; 4786 stat->num_copy_ops = 0; 4787 stat->read_latency_ticks = 0; 4788 stat->write_latency_ticks = 0; 4789 stat->unmap_latency_ticks = 0; 4790 stat->copy_latency_ticks = 0; 4791 4792 if (stat->io_error != NULL) { 4793 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 
4794 } 4795 } 4796 4797 struct spdk_bdev_io_stat * 4798 bdev_alloc_io_stat(bool io_error_stat) 4799 { 4800 struct spdk_bdev_io_stat *stat; 4801 4802 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4803 if (stat == NULL) { 4804 return NULL; 4805 } 4806 4807 if (io_error_stat) { 4808 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4809 if (stat->io_error == NULL) { 4810 free(stat); 4811 return NULL; 4812 } 4813 } else { 4814 stat->io_error = NULL; 4815 } 4816 4817 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4818 4819 return stat; 4820 } 4821 4822 void 4823 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4824 { 4825 if (stat != NULL) { 4826 free(stat->io_error); 4827 free(stat); 4828 } 4829 } 4830 4831 void 4832 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4833 { 4834 int i; 4835 4836 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4837 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4838 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4839 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4840 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4841 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4842 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4843 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4844 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4845 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4846 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4847 stat->min_read_latency_ticks != UINT64_MAX ? 4848 stat->min_read_latency_ticks : 0); 4849 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4850 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4851 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4852 stat->min_write_latency_ticks != UINT64_MAX ? 4853 stat->min_write_latency_ticks : 0); 4854 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4855 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4856 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4857 stat->min_unmap_latency_ticks != UINT64_MAX ? 4858 stat->min_unmap_latency_ticks : 0); 4859 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4860 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4861 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4862 stat->min_copy_latency_ticks != UINT64_MAX ? 
4863 stat->min_copy_latency_ticks : 0); 4864 4865 if (stat->io_error != NULL) { 4866 spdk_json_write_named_object_begin(w, "io_error"); 4867 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4868 if (stat->io_error->error_status[i] != 0) { 4869 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4870 stat->io_error->error_status[i]); 4871 } 4872 } 4873 spdk_json_write_object_end(w); 4874 } 4875 } 4876 4877 static void 4878 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4879 { 4880 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4881 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4882 4883 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4884 bdev_abort_all_buf_io(mgmt_ch, ch); 4885 } 4886 4887 static void 4888 bdev_channel_destroy(void *io_device, void *ctx_buf) 4889 { 4890 struct spdk_bdev_channel *ch = ctx_buf; 4891 4892 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4893 spdk_get_thread()); 4894 4895 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, ch->bdev->internal.trace_id, 0, 0, 4896 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4897 4898 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4899 spdk_spin_lock(&ch->bdev->internal.spinlock); 4900 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4901 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4902 4903 bdev_channel_abort_queued_ios(ch); 4904 4905 if (ch->histogram) { 4906 spdk_histogram_data_free(ch->histogram); 4907 } 4908 4909 bdev_channel_destroy_resource(ch); 4910 } 4911 4912 /* 4913 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4914 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
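 * On a collision the strdup()'d copy of the name is freed again and -EEXIST is returned,
 * leaving the caller's bdev_name unregistered.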
4915 */ 4916 static int 4917 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4918 { 4919 struct spdk_bdev_name *tmp; 4920 4921 bdev_name->name = strdup(name); 4922 if (bdev_name->name == NULL) { 4923 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4924 return -ENOMEM; 4925 } 4926 4927 bdev_name->bdev = bdev; 4928 4929 spdk_spin_lock(&g_bdev_mgr.spinlock); 4930 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4931 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4932 4933 if (tmp != NULL) { 4934 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4935 free(bdev_name->name); 4936 return -EEXIST; 4937 } 4938 4939 return 0; 4940 } 4941 4942 static void 4943 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4944 { 4945 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4946 free(bdev_name->name); 4947 } 4948 4949 static void 4950 bdev_name_del(struct spdk_bdev_name *bdev_name) 4951 { 4952 spdk_spin_lock(&g_bdev_mgr.spinlock); 4953 bdev_name_del_unsafe(bdev_name); 4954 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4955 } 4956 4957 int 4958 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4959 { 4960 struct spdk_bdev_alias *tmp; 4961 int ret; 4962 4963 if (alias == NULL) { 4964 SPDK_ERRLOG("Empty alias passed\n"); 4965 return -EINVAL; 4966 } 4967 4968 tmp = calloc(1, sizeof(*tmp)); 4969 if (tmp == NULL) { 4970 SPDK_ERRLOG("Unable to allocate alias\n"); 4971 return -ENOMEM; 4972 } 4973 4974 ret = bdev_name_add(&tmp->alias, bdev, alias); 4975 if (ret != 0) { 4976 free(tmp); 4977 return ret; 4978 } 4979 4980 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4981 4982 return 0; 4983 } 4984 4985 static int 4986 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4987 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4988 { 4989 struct spdk_bdev_alias *tmp; 4990 4991 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4992 if (strcmp(alias, tmp->alias.name) == 0) { 4993 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4994 alias_del_fn(&tmp->alias); 4995 free(tmp); 4996 return 0; 4997 } 4998 } 4999 5000 return -ENOENT; 5001 } 5002 5003 int 5004 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 5005 { 5006 int rc; 5007 5008 rc = bdev_alias_del(bdev, alias, bdev_name_del); 5009 if (rc == -ENOENT) { 5010 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 5011 } 5012 5013 return rc; 5014 } 5015 5016 void 5017 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 5018 { 5019 struct spdk_bdev_alias *p, *tmp; 5020 5021 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 5022 TAILQ_REMOVE(&bdev->aliases, p, tailq); 5023 bdev_name_del(&p->alias); 5024 free(p); 5025 } 5026 } 5027 5028 struct spdk_io_channel * 5029 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 5030 { 5031 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 5032 } 5033 5034 void * 5035 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 5036 { 5037 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5038 void *ctx = NULL; 5039 5040 if (bdev->fn_table->get_module_ctx) { 5041 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 5042 } 5043 5044 return ctx; 5045 } 5046 5047 const char * 5048 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 5049 { 5050 return bdev->module->name; 5051 } 5052 5053 const char * 5054 spdk_bdev_get_name(const struct spdk_bdev *bdev) 5055 { 5056 return bdev->name; 5057 } 5058 5059 const char * 5060 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 5061 { 5062 return bdev->product_name; 5063 } 5064 5065 
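/*
 * Illustrative sketch (not part of the library): the alias list returned by
 * spdk_bdev_get_aliases() below can be walked with TAILQ_FOREACH, for example:
 *
 *     struct spdk_bdev_alias *alias;
 *
 *     TAILQ_FOREACH(alias, spdk_bdev_get_aliases(bdev), tailq) {
 *         SPDK_NOTICELOG("alias: %s\n", alias->alias.name);
 *     }
 */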
const struct spdk_bdev_aliases_list * 5066 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 5067 { 5068 return &bdev->aliases; 5069 } 5070 5071 uint32_t 5072 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 5073 { 5074 return bdev->blocklen; 5075 } 5076 5077 uint32_t 5078 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 5079 { 5080 return bdev->write_unit_size; 5081 } 5082 5083 uint64_t 5084 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 5085 { 5086 return bdev->blockcnt; 5087 } 5088 5089 const char * 5090 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 5091 { 5092 return qos_rpc_type[type]; 5093 } 5094 5095 void 5096 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 5097 { 5098 int i; 5099 5100 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 5101 5102 spdk_spin_lock(&bdev->internal.spinlock); 5103 if (bdev->internal.qos) { 5104 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 5105 if (bdev->internal.qos->rate_limits[i].limit != 5106 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 5107 limits[i] = bdev->internal.qos->rate_limits[i].limit; 5108 if (bdev_qos_is_iops_rate_limit(i) == false) { 5109 /* Change from Byte to Megabyte which is user visible. */ 5110 limits[i] = limits[i] / 1024 / 1024; 5111 } 5112 } 5113 } 5114 } 5115 spdk_spin_unlock(&bdev->internal.spinlock); 5116 } 5117 5118 size_t 5119 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 5120 { 5121 return 1 << bdev->required_alignment; 5122 } 5123 5124 uint32_t 5125 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 5126 { 5127 return bdev->optimal_io_boundary; 5128 } 5129 5130 bool 5131 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 5132 { 5133 return bdev->write_cache; 5134 } 5135 5136 const struct spdk_uuid * 5137 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 5138 { 5139 return &bdev->uuid; 5140 } 5141 5142 uint16_t 5143 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 5144 { 5145 return bdev->acwu; 5146 } 5147 5148 uint32_t 5149 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 5150 { 5151 return bdev->md_len; 5152 } 5153 5154 bool 5155 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 5156 { 5157 return (bdev->md_len != 0) && bdev->md_interleave; 5158 } 5159 5160 bool 5161 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 5162 { 5163 return (bdev->md_len != 0) && !bdev->md_interleave; 5164 } 5165 5166 bool 5167 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 5168 { 5169 return bdev->zoned; 5170 } 5171 5172 uint32_t 5173 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 5174 { 5175 if (spdk_bdev_is_md_interleaved(bdev)) { 5176 return bdev->blocklen - bdev->md_len; 5177 } else { 5178 return bdev->blocklen; 5179 } 5180 } 5181 5182 uint32_t 5183 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 5184 { 5185 return bdev->phys_blocklen; 5186 } 5187 5188 static uint32_t 5189 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 5190 { 5191 if (!spdk_bdev_is_md_interleaved(bdev)) { 5192 return bdev->blocklen + bdev->md_len; 5193 } else { 5194 return bdev->blocklen; 5195 } 5196 } 5197 5198 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 5199 typedef enum spdk_dif_type spdk_dif_type_t; 5200 typedef enum spdk_dif_pi_format spdk_dif_pi_format_t; 5201 5202 spdk_dif_type_t 5203 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 5204 { 5205 if (bdev->md_len != 0) { 5206 return bdev->dif_type; 5207 } else { 5208 return SPDK_DIF_DISABLE; 5209 } 5210 } 5211 5212 spdk_dif_pi_format_t 5213 spdk_bdev_get_dif_pi_format(const struct spdk_bdev *bdev) 5214 { 5215 return bdev->dif_pi_format; 5216 } 5217 5218 bool 5219 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 5220 { 5221 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 5222 return bdev->dif_is_head_of_md; 5223 } else { 5224 return false; 5225 } 5226 } 5227 5228 bool 5229 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 5230 enum spdk_dif_check_type check_type) 5231 { 5232 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 5233 return false; 5234 } 5235 5236 switch (check_type) { 5237 case SPDK_DIF_CHECK_TYPE_REFTAG: 5238 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 5239 case SPDK_DIF_CHECK_TYPE_APPTAG: 5240 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 5241 case SPDK_DIF_CHECK_TYPE_GUARD: 5242 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 5243 default: 5244 return false; 5245 } 5246 } 5247 5248 static uint32_t 5249 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 5250 { 5251 uint64_t aligned_length, max_write_blocks; 5252 5253 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 5254 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 5255 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 5256 5257 return max_write_blocks; 5258 } 5259 5260 uint32_t 5261 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 5262 { 5263 return bdev->max_copy; 5264 } 5265 5266 uint64_t 5267 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 5268 { 5269 return bdev->internal.measured_queue_depth; 5270 } 5271 5272 uint64_t 5273 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 5274 { 5275 return bdev->internal.period; 5276 } 5277 5278 uint64_t 5279 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 5280 { 5281 return bdev->internal.weighted_io_time; 5282 } 5283 5284 uint64_t 5285 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 5286 { 5287 return bdev->internal.io_time; 5288 } 5289 5290 union spdk_bdev_nvme_ctratt spdk_bdev_get_nvme_ctratt(struct spdk_bdev *bdev) 5291 { 5292 return bdev->ctratt; 5293 } 5294 5295 uint32_t 5296 spdk_bdev_get_nvme_nsid(struct spdk_bdev *bdev) 5297 { 5298 return bdev->nsid; 5299 } 5300 5301 uint32_t 5302 spdk_bdev_desc_get_block_size(struct spdk_bdev_desc *desc) 5303 { 5304 struct spdk_bdev *bdev = desc->bdev; 5305 5306 return desc->opts.hide_metadata ? bdev->blocklen - bdev->md_len : bdev->blocklen; 5307 } 5308 5309 uint32_t 5310 spdk_bdev_desc_get_md_size(struct spdk_bdev_desc *desc) 5311 { 5312 struct spdk_bdev *bdev = desc->bdev; 5313 5314 return desc->opts.hide_metadata ? 0 : bdev->md_len; 5315 } 5316 5317 bool 5318 spdk_bdev_desc_is_md_interleaved(struct spdk_bdev_desc *desc) 5319 { 5320 struct spdk_bdev *bdev = desc->bdev; 5321 5322 return desc->opts.hide_metadata ? false : spdk_bdev_is_md_interleaved(bdev); 5323 } 5324 5325 bool 5326 spdk_bdev_desc_is_md_separate(struct spdk_bdev_desc *desc) 5327 { 5328 struct spdk_bdev *bdev = desc->bdev; 5329 5330 return desc->opts.hide_metadata ? 
false : spdk_bdev_is_md_separate(bdev); 5331 } 5332 5333 spdk_dif_type_t 5334 spdk_bdev_desc_get_dif_type(struct spdk_bdev_desc *desc) 5335 { 5336 struct spdk_bdev *bdev = desc->bdev; 5337 5338 return desc->opts.hide_metadata ? SPDK_DIF_DISABLE : spdk_bdev_get_dif_type(bdev); 5339 } 5340 5341 spdk_dif_pi_format_t 5342 spdk_bdev_desc_get_dif_pi_format(struct spdk_bdev_desc *desc) 5343 { 5344 struct spdk_bdev *bdev = desc->bdev; 5345 5346 return desc->opts.hide_metadata ? SPDK_DIF_PI_FORMAT_16 : spdk_bdev_get_dif_pi_format(bdev); 5347 } 5348 5349 bool 5350 spdk_bdev_desc_is_dif_head_of_md(struct spdk_bdev_desc *desc) 5351 { 5352 struct spdk_bdev *bdev = desc->bdev; 5353 5354 return desc->opts.hide_metadata ? false : spdk_bdev_is_dif_head_of_md(bdev); 5355 } 5356 5357 bool 5358 spdk_bdev_desc_is_dif_check_enabled(struct spdk_bdev_desc *desc, 5359 enum spdk_dif_check_type check_type) 5360 { 5361 struct spdk_bdev *bdev = desc->bdev; 5362 5363 return desc->opts.hide_metadata ? false : spdk_bdev_is_dif_check_enabled(bdev, check_type); 5364 } 5365 5366 static void bdev_update_qd_sampling_period(void *ctx); 5367 5368 static void 5369 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 5370 { 5371 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 5372 5373 if (bdev->internal.measured_queue_depth) { 5374 bdev->internal.io_time += bdev->internal.period; 5375 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 5376 } 5377 5378 bdev->internal.qd_poll_in_progress = false; 5379 5380 bdev_update_qd_sampling_period(bdev); 5381 } 5382 5383 static void 5384 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5385 struct spdk_io_channel *io_ch, void *_ctx) 5386 { 5387 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 5388 5389 bdev->internal.temporary_queue_depth += ch->io_outstanding; 5390 spdk_bdev_for_each_channel_continue(i, 0); 5391 } 5392 5393 static int 5394 bdev_calculate_measured_queue_depth(void *ctx) 5395 { 5396 struct spdk_bdev *bdev = ctx; 5397 5398 bdev->internal.qd_poll_in_progress = true; 5399 bdev->internal.temporary_queue_depth = 0; 5400 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 5401 return SPDK_POLLER_BUSY; 5402 } 5403 5404 static void 5405 bdev_update_qd_sampling_period(void *ctx) 5406 { 5407 struct spdk_bdev *bdev = ctx; 5408 5409 if (bdev->internal.period == bdev->internal.new_period) { 5410 return; 5411 } 5412 5413 if (bdev->internal.qd_poll_in_progress) { 5414 return; 5415 } 5416 5417 bdev->internal.period = bdev->internal.new_period; 5418 5419 spdk_poller_unregister(&bdev->internal.qd_poller); 5420 if (bdev->internal.period != 0) { 5421 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5422 bdev, bdev->internal.period); 5423 } else { 5424 spdk_bdev_close(bdev->internal.qd_desc); 5425 bdev->internal.qd_desc = NULL; 5426 } 5427 } 5428 5429 static void 5430 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 5431 { 5432 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 5433 } 5434 5435 void 5436 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 5437 { 5438 int rc; 5439 5440 if (bdev->internal.new_period == period) { 5441 return; 5442 } 5443 5444 bdev->internal.new_period = period; 5445 5446 if (bdev->internal.qd_desc != NULL) { 5447 assert(bdev->internal.period != 0); 5448 5449 
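/* The qd sampling poller is owned by the thread that opened qd_desc, so forward the period
 * update to that thread instead of touching the poller from here. */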
spdk_thread_send_msg(bdev->internal.qd_desc->thread, 5450 bdev_update_qd_sampling_period, bdev); 5451 return; 5452 } 5453 5454 assert(bdev->internal.period == 0); 5455 5456 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 5457 NULL, &bdev->internal.qd_desc); 5458 if (rc != 0) { 5459 return; 5460 } 5461 5462 bdev->internal.period = period; 5463 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5464 bdev, period); 5465 } 5466 5467 struct bdev_get_current_qd_ctx { 5468 uint64_t current_qd; 5469 spdk_bdev_get_current_qd_cb cb_fn; 5470 void *cb_arg; 5471 }; 5472 5473 static void 5474 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 5475 { 5476 struct bdev_get_current_qd_ctx *ctx = _ctx; 5477 5478 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 5479 5480 free(ctx); 5481 } 5482 5483 static void 5484 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5485 struct spdk_io_channel *io_ch, void *_ctx) 5486 { 5487 struct bdev_get_current_qd_ctx *ctx = _ctx; 5488 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 5489 5490 ctx->current_qd += bdev_ch->io_outstanding; 5491 5492 spdk_bdev_for_each_channel_continue(i, 0); 5493 } 5494 5495 void 5496 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 5497 void *cb_arg) 5498 { 5499 struct bdev_get_current_qd_ctx *ctx; 5500 5501 assert(cb_fn != NULL); 5502 5503 ctx = calloc(1, sizeof(*ctx)); 5504 if (ctx == NULL) { 5505 cb_fn(bdev, 0, cb_arg, -ENOMEM); 5506 return; 5507 } 5508 5509 ctx->cb_fn = cb_fn; 5510 ctx->cb_arg = cb_arg; 5511 5512 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 5513 } 5514 5515 static void 5516 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 5517 { 5518 assert(desc->thread == spdk_get_thread()); 5519 5520 spdk_spin_lock(&desc->spinlock); 5521 desc->refs--; 5522 if (!desc->closed) { 5523 spdk_spin_unlock(&desc->spinlock); 5524 desc->callback.event_fn(type, 5525 desc->bdev, 5526 desc->callback.ctx); 5527 return; 5528 } else if (desc->refs == 0) { 5529 /* This descriptor was closed after this event_notify message was sent. 5530 * spdk_bdev_close() could not free the descriptor since this message was 5531 * in flight, so we free it now using bdev_desc_free(). 
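		 * event_notify() took a reference under desc->spinlock before sending the
		 * message and the decrement above released it, so the last in-flight
		 * notification for a closed descriptor is the one that performs this
		 * deferred free.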
5532 */ 5533 spdk_spin_unlock(&desc->spinlock); 5534 bdev_desc_free(desc); 5535 return; 5536 } 5537 spdk_spin_unlock(&desc->spinlock); 5538 } 5539 5540 static void 5541 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 5542 { 5543 spdk_spin_lock(&desc->spinlock); 5544 desc->refs++; 5545 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 5546 spdk_spin_unlock(&desc->spinlock); 5547 } 5548 5549 static void 5550 _resize_notify(void *ctx) 5551 { 5552 struct spdk_bdev_desc *desc = ctx; 5553 5554 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 5555 } 5556 5557 int 5558 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 5559 { 5560 struct spdk_bdev_desc *desc; 5561 int ret; 5562 5563 if (size == bdev->blockcnt) { 5564 return 0; 5565 } 5566 5567 spdk_spin_lock(&bdev->internal.spinlock); 5568 5569 /* bdev has open descriptors */ 5570 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 5571 bdev->blockcnt > size) { 5572 ret = -EBUSY; 5573 } else { 5574 bdev->blockcnt = size; 5575 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 5576 event_notify(desc, _resize_notify); 5577 } 5578 ret = 0; 5579 } 5580 5581 spdk_spin_unlock(&bdev->internal.spinlock); 5582 5583 return ret; 5584 } 5585 5586 /* 5587 * Convert I/O offset and length from bytes to blocks. 5588 * 5589 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 5590 */ 5591 static uint64_t 5592 bdev_bytes_to_blocks(struct spdk_bdev_desc *desc, uint64_t offset_bytes, 5593 uint64_t *offset_blocks, uint64_t num_bytes, uint64_t *num_blocks) 5594 { 5595 uint32_t block_size = bdev_desc_get_block_size(desc); 5596 uint8_t shift_cnt; 5597 5598 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
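	 *
	 * For example, with a 4096-byte block size the shift count is 12, so
	 * offset_blocks = offset_bytes >> 12 and num_blocks = num_bytes >> 12.
	 * The return value ORs the two byte remainders together, so it is zero
	 * only when both offset_bytes and num_bytes are block aligned.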
*/ 5599 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 5600 shift_cnt = spdk_u32log2(block_size); 5601 *offset_blocks = offset_bytes >> shift_cnt; 5602 *num_blocks = num_bytes >> shift_cnt; 5603 return (offset_bytes - (*offset_blocks << shift_cnt)) | 5604 (num_bytes - (*num_blocks << shift_cnt)); 5605 } else { 5606 *offset_blocks = offset_bytes / block_size; 5607 *num_blocks = num_bytes / block_size; 5608 return (offset_bytes % block_size) | (num_bytes % block_size); 5609 } 5610 } 5611 5612 static bool 5613 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 5614 { 5615 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 5616 * has been an overflow and hence the offset has been wrapped around */ 5617 if (offset_blocks + num_blocks < offset_blocks) { 5618 return false; 5619 } 5620 5621 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5622 if (offset_blocks + num_blocks > bdev->blockcnt) { 5623 return false; 5624 } 5625 5626 return true; 5627 } 5628 5629 static void 5630 bdev_seek_complete_cb(void *ctx) 5631 { 5632 struct spdk_bdev_io *bdev_io = ctx; 5633 5634 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5635 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5636 } 5637 5638 static int 5639 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5640 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5641 spdk_bdev_io_completion_cb cb, void *cb_arg) 5642 { 5643 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5644 struct spdk_bdev_io *bdev_io; 5645 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5646 5647 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5648 5649 /* Check if offset_blocks is valid looking at the validity of one block */ 5650 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5651 return -EINVAL; 5652 } 5653 5654 bdev_io = bdev_channel_get_io(channel); 5655 if (!bdev_io) { 5656 return -ENOMEM; 5657 } 5658 5659 bdev_io->internal.ch = channel; 5660 bdev_io->internal.desc = desc; 5661 bdev_io->type = io_type; 5662 bdev_io->u.bdev.offset_blocks = offset_blocks; 5663 bdev_io->u.bdev.memory_domain = NULL; 5664 bdev_io->u.bdev.memory_domain_ctx = NULL; 5665 bdev_io->u.bdev.accel_sequence = NULL; 5666 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5667 5668 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5669 /* In case bdev doesn't support seek to next data/hole offset, 5670 * it is assumed that only data and no holes are present */ 5671 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5672 bdev_io->u.bdev.seek.offset = offset_blocks; 5673 } else { 5674 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5675 } 5676 5677 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5678 return 0; 5679 } 5680 5681 bdev_io_submit(bdev_io); 5682 return 0; 5683 } 5684 5685 int 5686 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5687 uint64_t offset_blocks, 5688 spdk_bdev_io_completion_cb cb, void *cb_arg) 5689 { 5690 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5691 } 5692 5693 int 5694 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5695 uint64_t offset_blocks, 5696 spdk_bdev_io_completion_cb cb, void *cb_arg) 5697 { 5698 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5699 } 5700 5701 uint64_t 5702 spdk_bdev_io_get_seek_offset(const struct 
spdk_bdev_io *bdev_io) 5703 { 5704 return bdev_io->u.bdev.seek.offset; 5705 } 5706 5707 static int 5708 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5709 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5710 spdk_bdev_io_completion_cb cb, void *cb_arg) 5711 { 5712 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5713 struct spdk_bdev_io *bdev_io; 5714 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5715 5716 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5717 return -EINVAL; 5718 } 5719 5720 bdev_io = bdev_channel_get_io(channel); 5721 if (!bdev_io) { 5722 return -ENOMEM; 5723 } 5724 5725 bdev_io->internal.ch = channel; 5726 bdev_io->internal.desc = desc; 5727 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5728 bdev_io->u.bdev.iovs = &bdev_io->iov; 5729 bdev_io->u.bdev.iovs[0].iov_base = buf; 5730 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev_desc_get_block_size(desc); 5731 bdev_io->u.bdev.iovcnt = 1; 5732 bdev_io->u.bdev.md_buf = md_buf; 5733 bdev_io->u.bdev.num_blocks = num_blocks; 5734 bdev_io->u.bdev.offset_blocks = offset_blocks; 5735 bdev_io->u.bdev.memory_domain = NULL; 5736 bdev_io->u.bdev.memory_domain_ctx = NULL; 5737 bdev_io->u.bdev.accel_sequence = NULL; 5738 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5739 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5740 5741 bdev_io_submit(bdev_io); 5742 return 0; 5743 } 5744 5745 int 5746 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5747 void *buf, uint64_t offset, uint64_t nbytes, 5748 spdk_bdev_io_completion_cb cb, void *cb_arg) 5749 { 5750 uint64_t offset_blocks, num_blocks; 5751 5752 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 5753 return -EINVAL; 5754 } 5755 5756 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5757 } 5758 5759 int 5760 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5761 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5762 spdk_bdev_io_completion_cb cb, void *cb_arg) 5763 { 5764 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5765 } 5766 5767 int 5768 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5769 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5770 spdk_bdev_io_completion_cb cb, void *cb_arg) 5771 { 5772 struct iovec iov = { 5773 .iov_base = buf, 5774 }; 5775 5776 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5777 return -EINVAL; 5778 } 5779 5780 if ((md_buf || desc->opts.hide_metadata) && !_is_buf_allocated(&iov)) { 5781 return -EINVAL; 5782 } 5783 5784 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5785 cb, cb_arg); 5786 } 5787 5788 int 5789 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5790 struct iovec *iov, int iovcnt, 5791 uint64_t offset, uint64_t nbytes, 5792 spdk_bdev_io_completion_cb cb, void *cb_arg) 5793 { 5794 uint64_t offset_blocks, num_blocks; 5795 5796 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 5797 return -EINVAL; 5798 } 5799 5800 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5801 } 5802 5803 static int 5804 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5805 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 5806 uint64_t 
num_blocks, struct spdk_memory_domain *domain, void *domain_ctx, 5807 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 5808 spdk_bdev_io_completion_cb cb, void *cb_arg) 5809 { 5810 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5811 struct spdk_bdev_io *bdev_io; 5812 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5813 5814 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 5815 return -EINVAL; 5816 } 5817 5818 bdev_io = bdev_channel_get_io(channel); 5819 if (spdk_unlikely(!bdev_io)) { 5820 return -ENOMEM; 5821 } 5822 5823 bdev_io->internal.ch = channel; 5824 bdev_io->internal.desc = desc; 5825 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5826 bdev_io->u.bdev.iovs = iov; 5827 bdev_io->u.bdev.iovcnt = iovcnt; 5828 bdev_io->u.bdev.md_buf = md_buf; 5829 bdev_io->u.bdev.num_blocks = num_blocks; 5830 bdev_io->u.bdev.offset_blocks = offset_blocks; 5831 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5832 5833 if (seq != NULL) { 5834 bdev_io->internal.f.has_accel_sequence = true; 5835 bdev_io->internal.accel_sequence = seq; 5836 } 5837 5838 if (domain != NULL) { 5839 bdev_io->internal.f.has_memory_domain = true; 5840 bdev_io->internal.memory_domain = domain; 5841 bdev_io->internal.memory_domain_ctx = domain_ctx; 5842 } 5843 5844 bdev_io->u.bdev.memory_domain = domain; 5845 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5846 bdev_io->u.bdev.accel_sequence = seq; 5847 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 5848 5849 _bdev_io_submit_ext(desc, bdev_io); 5850 5851 return 0; 5852 } 5853 5854 int 5855 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5856 struct iovec *iov, int iovcnt, 5857 uint64_t offset_blocks, uint64_t num_blocks, 5858 spdk_bdev_io_completion_cb cb, void *cb_arg) 5859 { 5860 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5861 5862 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5863 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5864 } 5865 5866 int 5867 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5868 struct iovec *iov, int iovcnt, void *md_buf, 5869 uint64_t offset_blocks, uint64_t num_blocks, 5870 spdk_bdev_io_completion_cb cb, void *cb_arg) 5871 { 5872 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5873 5874 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 5875 return -EINVAL; 5876 } 5877 5878 if (md_buf && !_is_buf_allocated(iov)) { 5879 return -EINVAL; 5880 } 5881 5882 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5883 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5884 } 5885 5886 static inline bool 5887 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 5888 { 5889 /* 5890 * We check if opts size is at least of size when we first introduced 5891 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 5892 * are not checked internal. 
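	 * In effect opts->size acts as an ABI version: it must cover at least the
	 * original layout (up to and including the metadata member) and must not
	 * claim to be larger than the structure this library was built against.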
5893 */ 5894 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5895 sizeof(opts->metadata) && 5896 opts->size <= sizeof(*opts) && 5897 /* When memory domain is used, the user must provide data buffers */ 5898 (!opts->memory_domain || (iov && iov[0].iov_base)); 5899 } 5900 5901 int 5902 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5903 struct iovec *iov, int iovcnt, 5904 uint64_t offset_blocks, uint64_t num_blocks, 5905 spdk_bdev_io_completion_cb cb, void *cb_arg, 5906 struct spdk_bdev_ext_io_opts *opts) 5907 { 5908 struct spdk_memory_domain *domain = NULL; 5909 struct spdk_accel_sequence *seq = NULL; 5910 void *domain_ctx = NULL, *md = NULL; 5911 uint32_t dif_check_flags = 0; 5912 uint32_t nvme_cdw12_raw; 5913 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5914 5915 if (opts) { 5916 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5917 return -EINVAL; 5918 } 5919 5920 md = opts->metadata; 5921 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5922 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5923 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5924 nvme_cdw12_raw = bdev_get_ext_io_opt(opts, nvme_cdw12.raw, 0); 5925 if (md) { 5926 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5927 return -EINVAL; 5928 } 5929 5930 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5931 return -EINVAL; 5932 } 5933 5934 if (spdk_unlikely(seq != NULL)) { 5935 return -EINVAL; 5936 } 5937 5938 if (nvme_cdw12_raw & SPDK_DIF_FLAGS_NVME_PRACT) { 5939 SPDK_ERRLOG("Separate metadata with NVMe PRACT is not supported.\n"); 5940 return -ENOTSUP; 5941 } 5942 } 5943 5944 if (nvme_cdw12_raw & SPDK_DIF_FLAGS_NVME_PRACT) { 5945 dif_check_flags |= SPDK_DIF_FLAGS_NVME_PRACT; 5946 } 5947 } 5948 5949 dif_check_flags |= bdev->dif_check_flags & 5950 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5951 5952 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5953 num_blocks, domain, domain_ctx, seq, dif_check_flags, cb, cb_arg); 5954 } 5955 5956 static int 5957 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5958 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5959 spdk_bdev_io_completion_cb cb, void *cb_arg) 5960 { 5961 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5962 struct spdk_bdev_io *bdev_io; 5963 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5964 5965 if (!desc->write) { 5966 return -EBADF; 5967 } 5968 5969 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5970 return -EINVAL; 5971 } 5972 5973 bdev_io = bdev_channel_get_io(channel); 5974 if (!bdev_io) { 5975 return -ENOMEM; 5976 } 5977 5978 bdev_io->internal.ch = channel; 5979 bdev_io->internal.desc = desc; 5980 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5981 bdev_io->u.bdev.iovs = &bdev_io->iov; 5982 bdev_io->u.bdev.iovs[0].iov_base = buf; 5983 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev_desc_get_block_size(desc); 5984 bdev_io->u.bdev.iovcnt = 1; 5985 bdev_io->u.bdev.md_buf = md_buf; 5986 bdev_io->u.bdev.num_blocks = num_blocks; 5987 bdev_io->u.bdev.offset_blocks = offset_blocks; 5988 bdev_io->u.bdev.memory_domain = NULL; 5989 bdev_io->u.bdev.memory_domain_ctx = NULL; 5990 bdev_io->u.bdev.accel_sequence = NULL; 5991 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5992 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5993 5994 bdev_io_submit(bdev_io); 5995 return 0; 5996 } 5997 5998 int 5999 spdk_bdev_write(struct spdk_bdev_desc 
*desc, struct spdk_io_channel *ch, 6000 void *buf, uint64_t offset, uint64_t nbytes, 6001 spdk_bdev_io_completion_cb cb, void *cb_arg) 6002 { 6003 uint64_t offset_blocks, num_blocks; 6004 6005 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 6006 return -EINVAL; 6007 } 6008 6009 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 6010 } 6011 6012 int 6013 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6014 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 6015 spdk_bdev_io_completion_cb cb, void *cb_arg) 6016 { 6017 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 6018 cb, cb_arg); 6019 } 6020 6021 int 6022 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6023 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 6024 spdk_bdev_io_completion_cb cb, void *cb_arg) 6025 { 6026 struct iovec iov = { 6027 .iov_base = buf, 6028 }; 6029 6030 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 6031 return -EINVAL; 6032 } 6033 6034 if (md_buf && !_is_buf_allocated(&iov)) { 6035 return -EINVAL; 6036 } 6037 6038 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 6039 cb, cb_arg); 6040 } 6041 6042 static int 6043 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6044 struct iovec *iov, int iovcnt, void *md_buf, 6045 uint64_t offset_blocks, uint64_t num_blocks, 6046 struct spdk_memory_domain *domain, void *domain_ctx, 6047 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 6048 uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw, 6049 spdk_bdev_io_completion_cb cb, void *cb_arg) 6050 { 6051 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6052 struct spdk_bdev_io *bdev_io; 6053 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6054 6055 if (spdk_unlikely(!desc->write)) { 6056 return -EBADF; 6057 } 6058 6059 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 6060 return -EINVAL; 6061 } 6062 6063 bdev_io = bdev_channel_get_io(channel); 6064 if (spdk_unlikely(!bdev_io)) { 6065 return -ENOMEM; 6066 } 6067 6068 bdev_io->internal.ch = channel; 6069 bdev_io->internal.desc = desc; 6070 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 6071 bdev_io->u.bdev.iovs = iov; 6072 bdev_io->u.bdev.iovcnt = iovcnt; 6073 bdev_io->u.bdev.md_buf = md_buf; 6074 bdev_io->u.bdev.num_blocks = num_blocks; 6075 bdev_io->u.bdev.offset_blocks = offset_blocks; 6076 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6077 if (seq != NULL) { 6078 bdev_io->internal.f.has_accel_sequence = true; 6079 bdev_io->internal.accel_sequence = seq; 6080 } 6081 6082 if (domain != NULL) { 6083 bdev_io->internal.f.has_memory_domain = true; 6084 bdev_io->internal.memory_domain = domain; 6085 bdev_io->internal.memory_domain_ctx = domain_ctx; 6086 } 6087 6088 bdev_io->u.bdev.memory_domain = domain; 6089 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 6090 bdev_io->u.bdev.accel_sequence = seq; 6091 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 6092 bdev_io->u.bdev.nvme_cdw12.raw = nvme_cdw12_raw; 6093 bdev_io->u.bdev.nvme_cdw13.raw = nvme_cdw13_raw; 6094 6095 _bdev_io_submit_ext(desc, bdev_io); 6096 6097 return 0; 6098 } 6099 6100 int 6101 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6102 struct iovec *iov, int iovcnt, 6103 uint64_t offset, uint64_t len, 6104 spdk_bdev_io_completion_cb cb, void *cb_arg) 6105 { 6106 
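	/* Byte-based convenience wrapper: convert offset/len to blocks and delegate
	 * to spdk_bdev_writev_blocks(). Returns -EINVAL if either value is not a
	 * multiple of the block size reported for this descriptor.
	 *
	 * Illustrative caller-side sketch (hypothetical names; assumes an opened
	 * descriptor, a per-thread I/O channel and 512-byte blocks):
	 *
	 *   struct iovec iov[2] = {
	 *           { .iov_base = hdr_buf,  .iov_len = 512 },
	 *           { .iov_base = data_buf, .iov_len = 3584 },
	 *   };
	 *
	 *   rc = spdk_bdev_writev(desc, io_ch, iov, 2, 0, 4096, write_done_cb, cb_ctx);
	 */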
uint64_t offset_blocks, num_blocks; 6107 6108 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, len, &num_blocks) != 0) { 6109 return -EINVAL; 6110 } 6111 6112 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 6113 } 6114 6115 int 6116 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6117 struct iovec *iov, int iovcnt, 6118 uint64_t offset_blocks, uint64_t num_blocks, 6119 spdk_bdev_io_completion_cb cb, void *cb_arg) 6120 { 6121 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6122 6123 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 6124 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 6125 cb, cb_arg); 6126 } 6127 6128 int 6129 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6130 struct iovec *iov, int iovcnt, void *md_buf, 6131 uint64_t offset_blocks, uint64_t num_blocks, 6132 spdk_bdev_io_completion_cb cb, void *cb_arg) 6133 { 6134 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6135 6136 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 6137 return -EINVAL; 6138 } 6139 6140 if (md_buf && !_is_buf_allocated(iov)) { 6141 return -EINVAL; 6142 } 6143 6144 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 6145 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 6146 cb, cb_arg); 6147 } 6148 6149 int 6150 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6151 struct iovec *iov, int iovcnt, 6152 uint64_t offset_blocks, uint64_t num_blocks, 6153 spdk_bdev_io_completion_cb cb, void *cb_arg, 6154 struct spdk_bdev_ext_io_opts *opts) 6155 { 6156 struct spdk_memory_domain *domain = NULL; 6157 struct spdk_accel_sequence *seq = NULL; 6158 void *domain_ctx = NULL, *md = NULL; 6159 uint32_t dif_check_flags = 0; 6160 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6161 uint32_t nvme_cdw12_raw = 0; 6162 uint32_t nvme_cdw13_raw = 0; 6163 6164 if (opts) { 6165 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 6166 return -EINVAL; 6167 } 6168 md = opts->metadata; 6169 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 6170 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 6171 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 6172 nvme_cdw12_raw = bdev_get_ext_io_opt(opts, nvme_cdw12.raw, 0); 6173 nvme_cdw13_raw = bdev_get_ext_io_opt(opts, nvme_cdw13.raw, 0); 6174 if (md) { 6175 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 6176 return -EINVAL; 6177 } 6178 6179 if (spdk_unlikely(!_is_buf_allocated(iov))) { 6180 return -EINVAL; 6181 } 6182 6183 if (spdk_unlikely(seq != NULL)) { 6184 return -EINVAL; 6185 } 6186 6187 if (nvme_cdw12_raw & SPDK_DIF_FLAGS_NVME_PRACT) { 6188 SPDK_ERRLOG("Separate metadata with NVMe PRACT is not supported.\n"); 6189 return -ENOTSUP; 6190 } 6191 } 6192 6193 if (nvme_cdw12_raw & SPDK_DIF_FLAGS_NVME_PRACT) { 6194 dif_check_flags |= SPDK_DIF_FLAGS_NVME_PRACT; 6195 } 6196 } 6197 6198 dif_check_flags |= bdev->dif_check_flags & 6199 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 6200 6201 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 6202 domain, domain_ctx, seq, dif_check_flags, 6203 nvme_cdw12_raw, nvme_cdw13_raw, cb, cb_arg); 6204 } 6205 6206 static void 6207 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6208 { 6209 struct spdk_bdev_io *parent_io = cb_arg; 6210 struct spdk_bdev *bdev = parent_io->bdev; 
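	/* read_buf walks the data that was just read back from the bdev; it is
	 * compared iovec by iovec against the caller's compare buffers, and separate
	 * metadata is compared as well when present. Any mismatch completes the
	 * parent I/O with SPDK_BDEV_IO_STATUS_MISCOMPARE.
	 */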
6211 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 6212 int i, rc = 0; 6213 6214 if (!success) { 6215 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6216 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 6217 spdk_bdev_free_io(bdev_io); 6218 return; 6219 } 6220 6221 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 6222 rc = memcmp(read_buf, 6223 parent_io->u.bdev.iovs[i].iov_base, 6224 parent_io->u.bdev.iovs[i].iov_len); 6225 if (rc) { 6226 break; 6227 } 6228 read_buf += parent_io->u.bdev.iovs[i].iov_len; 6229 } 6230 6231 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 6232 rc = memcmp(bdev_io->u.bdev.md_buf, 6233 parent_io->u.bdev.md_buf, 6234 spdk_bdev_get_md_size(bdev)); 6235 } 6236 6237 spdk_bdev_free_io(bdev_io); 6238 6239 if (rc == 0) { 6240 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6241 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 6242 } else { 6243 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 6244 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 6245 } 6246 } 6247 6248 static void 6249 bdev_compare_do_read(void *_bdev_io) 6250 { 6251 struct spdk_bdev_io *bdev_io = _bdev_io; 6252 int rc; 6253 6254 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 6255 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 6256 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6257 bdev_compare_do_read_done, bdev_io); 6258 6259 if (rc == -ENOMEM) { 6260 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 6261 } else if (rc != 0) { 6262 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6263 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6264 } 6265 } 6266 6267 static int 6268 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6269 struct iovec *iov, int iovcnt, void *md_buf, 6270 uint64_t offset_blocks, uint64_t num_blocks, 6271 spdk_bdev_io_completion_cb cb, void *cb_arg) 6272 { 6273 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6274 struct spdk_bdev_io *bdev_io; 6275 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6276 6277 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6278 return -EINVAL; 6279 } 6280 6281 bdev_io = bdev_channel_get_io(channel); 6282 if (!bdev_io) { 6283 return -ENOMEM; 6284 } 6285 6286 bdev_io->internal.ch = channel; 6287 bdev_io->internal.desc = desc; 6288 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 6289 bdev_io->u.bdev.iovs = iov; 6290 bdev_io->u.bdev.iovcnt = iovcnt; 6291 bdev_io->u.bdev.md_buf = md_buf; 6292 bdev_io->u.bdev.num_blocks = num_blocks; 6293 bdev_io->u.bdev.offset_blocks = offset_blocks; 6294 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6295 bdev_io->u.bdev.memory_domain = NULL; 6296 bdev_io->u.bdev.memory_domain_ctx = NULL; 6297 bdev_io->u.bdev.accel_sequence = NULL; 6298 6299 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 6300 bdev_io_submit(bdev_io); 6301 return 0; 6302 } 6303 6304 bdev_compare_do_read(bdev_io); 6305 6306 return 0; 6307 } 6308 6309 int 6310 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6311 struct iovec *iov, int iovcnt, 6312 uint64_t offset_blocks, uint64_t num_blocks, 6313 spdk_bdev_io_completion_cb cb, void *cb_arg) 6314 { 6315 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 6316 num_blocks, cb, cb_arg); 6317 } 6318 6319 int 6320 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct 
spdk_io_channel *ch, 6321 struct iovec *iov, int iovcnt, void *md_buf, 6322 uint64_t offset_blocks, uint64_t num_blocks, 6323 spdk_bdev_io_completion_cb cb, void *cb_arg) 6324 { 6325 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 6326 return -EINVAL; 6327 } 6328 6329 if (md_buf && !_is_buf_allocated(iov)) { 6330 return -EINVAL; 6331 } 6332 6333 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 6334 num_blocks, cb, cb_arg); 6335 } 6336 6337 static int 6338 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6339 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 6340 spdk_bdev_io_completion_cb cb, void *cb_arg) 6341 { 6342 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6343 struct spdk_bdev_io *bdev_io; 6344 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6345 6346 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6347 return -EINVAL; 6348 } 6349 6350 bdev_io = bdev_channel_get_io(channel); 6351 if (!bdev_io) { 6352 return -ENOMEM; 6353 } 6354 6355 bdev_io->internal.ch = channel; 6356 bdev_io->internal.desc = desc; 6357 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 6358 bdev_io->u.bdev.iovs = &bdev_io->iov; 6359 bdev_io->u.bdev.iovs[0].iov_base = buf; 6360 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev_desc_get_block_size(desc); 6361 bdev_io->u.bdev.iovcnt = 1; 6362 bdev_io->u.bdev.md_buf = md_buf; 6363 bdev_io->u.bdev.num_blocks = num_blocks; 6364 bdev_io->u.bdev.offset_blocks = offset_blocks; 6365 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6366 bdev_io->u.bdev.memory_domain = NULL; 6367 bdev_io->u.bdev.memory_domain_ctx = NULL; 6368 bdev_io->u.bdev.accel_sequence = NULL; 6369 6370 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 6371 bdev_io_submit(bdev_io); 6372 return 0; 6373 } 6374 6375 bdev_compare_do_read(bdev_io); 6376 6377 return 0; 6378 } 6379 6380 int 6381 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6382 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 6383 spdk_bdev_io_completion_cb cb, void *cb_arg) 6384 { 6385 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 6386 cb, cb_arg); 6387 } 6388 6389 int 6390 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6391 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 6392 spdk_bdev_io_completion_cb cb, void *cb_arg) 6393 { 6394 struct iovec iov = { 6395 .iov_base = buf, 6396 }; 6397 6398 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 6399 return -EINVAL; 6400 } 6401 6402 if (md_buf && !_is_buf_allocated(&iov)) { 6403 return -EINVAL; 6404 } 6405 6406 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 6407 cb, cb_arg); 6408 } 6409 6410 static void 6411 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status) 6412 { 6413 struct spdk_bdev_io *bdev_io = ctx; 6414 6415 if (unlock_status) { 6416 SPDK_ERRLOG("LBA range unlock failed\n"); 6417 } 6418 6419 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? 
true : 6420 false, bdev_io->internal.caller_ctx); 6421 } 6422 6423 static void 6424 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 6425 { 6426 bdev_io->internal.status = status; 6427 6428 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 6429 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6430 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 6431 } 6432 6433 static void 6434 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6435 { 6436 struct spdk_bdev_io *parent_io = cb_arg; 6437 6438 if (!success) { 6439 SPDK_ERRLOG("Compare and write operation failed\n"); 6440 } 6441 6442 spdk_bdev_free_io(bdev_io); 6443 6444 bdev_comparev_and_writev_blocks_unlock(parent_io, 6445 success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 6446 } 6447 6448 static void 6449 bdev_compare_and_write_do_write(void *_bdev_io) 6450 { 6451 struct spdk_bdev_io *bdev_io = _bdev_io; 6452 int rc; 6453 6454 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 6455 spdk_io_channel_from_ctx(bdev_io->internal.ch), 6456 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 6457 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6458 bdev_compare_and_write_do_write_done, bdev_io); 6459 6460 6461 if (rc == -ENOMEM) { 6462 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 6463 } else if (rc != 0) { 6464 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6465 } 6466 } 6467 6468 static void 6469 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6470 { 6471 struct spdk_bdev_io *parent_io = cb_arg; 6472 6473 spdk_bdev_free_io(bdev_io); 6474 6475 if (!success) { 6476 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 6477 return; 6478 } 6479 6480 bdev_compare_and_write_do_write(parent_io); 6481 } 6482 6483 static void 6484 bdev_compare_and_write_do_compare(void *_bdev_io) 6485 { 6486 struct spdk_bdev_io *bdev_io = _bdev_io; 6487 int rc; 6488 6489 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 6490 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 6491 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6492 bdev_compare_and_write_do_compare_done, bdev_io); 6493 6494 if (rc == -ENOMEM) { 6495 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 6496 } else if (rc != 0) { 6497 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 6498 } 6499 } 6500 6501 static void 6502 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status) 6503 { 6504 struct spdk_bdev_io *bdev_io = ctx; 6505 6506 if (status) { 6507 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 6508 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6509 return; 6510 } 6511 6512 bdev_compare_and_write_do_compare(bdev_io); 6513 } 6514 6515 int 6516 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6517 struct iovec *compare_iov, int compare_iovcnt, 6518 struct iovec *write_iov, int write_iovcnt, 6519 uint64_t offset_blocks, uint64_t num_blocks, 6520 spdk_bdev_io_completion_cb cb, void *cb_arg) 6521 { 6522 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6523 struct spdk_bdev_io *bdev_io; 6524 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6525 6526 if 
(!desc->write) { 6527 return -EBADF; 6528 } 6529 6530 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6531 return -EINVAL; 6532 } 6533 6534 if (num_blocks > bdev->acwu) { 6535 return -EINVAL; 6536 } 6537 6538 bdev_io = bdev_channel_get_io(channel); 6539 if (!bdev_io) { 6540 return -ENOMEM; 6541 } 6542 6543 bdev_io->internal.ch = channel; 6544 bdev_io->internal.desc = desc; 6545 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 6546 bdev_io->u.bdev.iovs = compare_iov; 6547 bdev_io->u.bdev.iovcnt = compare_iovcnt; 6548 bdev_io->u.bdev.fused_iovs = write_iov; 6549 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 6550 bdev_io->u.bdev.md_buf = NULL; 6551 bdev_io->u.bdev.num_blocks = num_blocks; 6552 bdev_io->u.bdev.offset_blocks = offset_blocks; 6553 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6554 bdev_io->u.bdev.memory_domain = NULL; 6555 bdev_io->u.bdev.memory_domain_ctx = NULL; 6556 bdev_io->u.bdev.accel_sequence = NULL; 6557 6558 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 6559 bdev_io_submit(bdev_io); 6560 return 0; 6561 } 6562 6563 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 6564 bdev_comparev_and_writev_blocks_locked, bdev_io); 6565 } 6566 6567 int 6568 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6569 struct iovec *iov, int iovcnt, 6570 uint64_t offset_blocks, uint64_t num_blocks, 6571 bool populate, 6572 spdk_bdev_io_completion_cb cb, void *cb_arg) 6573 { 6574 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6575 struct spdk_bdev_io *bdev_io; 6576 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6577 6578 if (!desc->write) { 6579 return -EBADF; 6580 } 6581 6582 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6583 return -EINVAL; 6584 } 6585 6586 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 6587 return -ENOTSUP; 6588 } 6589 6590 bdev_io = bdev_channel_get_io(channel); 6591 if (!bdev_io) { 6592 return -ENOMEM; 6593 } 6594 6595 bdev_io->internal.ch = channel; 6596 bdev_io->internal.desc = desc; 6597 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 6598 bdev_io->u.bdev.num_blocks = num_blocks; 6599 bdev_io->u.bdev.offset_blocks = offset_blocks; 6600 bdev_io->u.bdev.iovs = iov; 6601 bdev_io->u.bdev.iovcnt = iovcnt; 6602 bdev_io->u.bdev.md_buf = NULL; 6603 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 6604 bdev_io->u.bdev.zcopy.commit = 0; 6605 bdev_io->u.bdev.zcopy.start = 1; 6606 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6607 bdev_io->u.bdev.memory_domain = NULL; 6608 bdev_io->u.bdev.memory_domain_ctx = NULL; 6609 bdev_io->u.bdev.accel_sequence = NULL; 6610 6611 bdev_io_submit(bdev_io); 6612 6613 return 0; 6614 } 6615 6616 int 6617 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 6618 spdk_bdev_io_completion_cb cb, void *cb_arg) 6619 { 6620 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 6621 return -EINVAL; 6622 } 6623 6624 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 6625 bdev_io->u.bdev.zcopy.start = 0; 6626 bdev_io->internal.caller_ctx = cb_arg; 6627 bdev_io->internal.cb = cb; 6628 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 6629 6630 bdev_io_submit(bdev_io); 6631 6632 return 0; 6633 } 6634 6635 int 6636 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6637 uint64_t offset, uint64_t len, 6638 spdk_bdev_io_completion_cb cb, void *cb_arg) 6639 { 6640 uint64_t offset_blocks, num_blocks; 6641 6642 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, len, &num_blocks) != 0) { 6643 return -EINVAL; 6644 } 6645 6646 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6647 } 6648 6649 int 6650 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6651 uint64_t offset_blocks, uint64_t num_blocks, 6652 spdk_bdev_io_completion_cb cb, void *cb_arg) 6653 { 6654 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6655 struct spdk_bdev_io *bdev_io; 6656 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6657 6658 if (!desc->write) { 6659 return -EBADF; 6660 } 6661 6662 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6663 return -EINVAL; 6664 } 6665 6666 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 6667 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 6668 return -ENOTSUP; 6669 } 6670 6671 bdev_io = bdev_channel_get_io(channel); 6672 6673 if (!bdev_io) { 6674 return -ENOMEM; 6675 } 6676 6677 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 6678 bdev_io->internal.ch = channel; 6679 bdev_io->internal.desc = desc; 6680 bdev_io->u.bdev.offset_blocks = offset_blocks; 6681 bdev_io->u.bdev.num_blocks = num_blocks; 6682 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6683 bdev_io->u.bdev.memory_domain = NULL; 6684 bdev_io->u.bdev.memory_domain_ctx = NULL; 6685 bdev_io->u.bdev.accel_sequence = NULL; 6686 6687 /* If the write_zeroes size is large and should be split, use the generic split 6688 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEREOS is supported or not. 6689 * 6690 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported 6691 * or emulate it using regular write request otherwise. 
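	 *
	 * The emulated path (bdev_write_zero_buffer()) builds regular writes from
	 * the module's zero buffer, which is why a single block (including any
	 * metadata) is expected to fit within ZERO_BUFFER_SIZE (see the assert
	 * below).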
6692 */ 6693 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 6694 bdev_io->internal.f.split) { 6695 bdev_io_submit(bdev_io); 6696 return 0; 6697 } 6698 6699 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6700 6701 return bdev_write_zero_buffer(bdev_io); 6702 } 6703 6704 int 6705 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6706 uint64_t offset, uint64_t nbytes, 6707 spdk_bdev_io_completion_cb cb, void *cb_arg) 6708 { 6709 uint64_t offset_blocks, num_blocks; 6710 6711 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 6712 return -EINVAL; 6713 } 6714 6715 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6716 } 6717 6718 static void 6719 bdev_io_complete_cb(void *ctx) 6720 { 6721 struct spdk_bdev_io *bdev_io = ctx; 6722 6723 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6724 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 6725 } 6726 6727 int 6728 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6729 uint64_t offset_blocks, uint64_t num_blocks, 6730 spdk_bdev_io_completion_cb cb, void *cb_arg) 6731 { 6732 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6733 struct spdk_bdev_io *bdev_io; 6734 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6735 6736 if (!desc->write) { 6737 return -EBADF; 6738 } 6739 6740 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6741 return -EINVAL; 6742 } 6743 6744 bdev_io = bdev_channel_get_io(channel); 6745 if (!bdev_io) { 6746 return -ENOMEM; 6747 } 6748 6749 bdev_io->internal.ch = channel; 6750 bdev_io->internal.desc = desc; 6751 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6752 6753 bdev_io->u.bdev.iovs = &bdev_io->iov; 6754 bdev_io->u.bdev.iovs[0].iov_base = NULL; 6755 bdev_io->u.bdev.iovs[0].iov_len = 0; 6756 bdev_io->u.bdev.iovcnt = 1; 6757 6758 bdev_io->u.bdev.offset_blocks = offset_blocks; 6759 bdev_io->u.bdev.num_blocks = num_blocks; 6760 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6761 bdev_io->u.bdev.memory_domain = NULL; 6762 bdev_io->u.bdev.memory_domain_ctx = NULL; 6763 bdev_io->u.bdev.accel_sequence = NULL; 6764 6765 if (num_blocks == 0) { 6766 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 6767 return 0; 6768 } 6769 6770 bdev_io_submit(bdev_io); 6771 return 0; 6772 } 6773 6774 int 6775 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6776 uint64_t offset, uint64_t length, 6777 spdk_bdev_io_completion_cb cb, void *cb_arg) 6778 { 6779 uint64_t offset_blocks, num_blocks; 6780 6781 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, length, &num_blocks) != 0) { 6782 return -EINVAL; 6783 } 6784 6785 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6786 } 6787 6788 int 6789 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6790 uint64_t offset_blocks, uint64_t num_blocks, 6791 spdk_bdev_io_completion_cb cb, void *cb_arg) 6792 { 6793 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6794 struct spdk_bdev_io *bdev_io; 6795 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6796 6797 if (!desc->write) { 6798 return -EBADF; 6799 } 6800 6801 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH))) { 6802 return -ENOTSUP; 6803 } 6804 6805 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6806 return -EINVAL; 6807 } 6808 6809 bdev_io = bdev_channel_get_io(channel); 6810 if (!bdev_io) { 6811 return 
-ENOMEM; 6812 } 6813 6814 bdev_io->internal.ch = channel; 6815 bdev_io->internal.desc = desc; 6816 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 6817 bdev_io->u.bdev.iovs = NULL; 6818 bdev_io->u.bdev.iovcnt = 0; 6819 bdev_io->u.bdev.offset_blocks = offset_blocks; 6820 bdev_io->u.bdev.num_blocks = num_blocks; 6821 bdev_io->u.bdev.memory_domain = NULL; 6822 bdev_io->u.bdev.memory_domain_ctx = NULL; 6823 bdev_io->u.bdev.accel_sequence = NULL; 6824 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6825 6826 bdev_io_submit(bdev_io); 6827 return 0; 6828 } 6829 6830 static int bdev_reset_poll_for_outstanding_io(void *ctx); 6831 6832 static void 6833 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 6834 { 6835 struct spdk_bdev_io *bdev_io = _ctx; 6836 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 6837 6838 if (status == -EBUSY) { 6839 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 6840 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 6841 bdev_io, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 6842 } else { 6843 if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) { 6844 /* If outstanding IOs are still present and reset_io_drain_timeout 6845 * seconds passed, start the reset. */ 6846 bdev_io_submit_reset(bdev_io); 6847 } else { 6848 /* We still have in progress memory domain pull/push or we're 6849 * executing accel sequence. Since we cannot abort either of those 6850 * operations, fail the reset request. */ 6851 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6852 } 6853 } 6854 } else { 6855 SPDK_DEBUGLOG(bdev, 6856 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 6857 ch->bdev->name); 6858 /* Mark the completion status as a SUCCESS and complete the reset. */ 6859 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 6860 } 6861 } 6862 6863 static void 6864 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6865 struct spdk_io_channel *io_ch, void *_ctx) 6866 { 6867 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 6868 int status = 0; 6869 6870 if (cur_ch->io_outstanding > 0 || 6871 !TAILQ_EMPTY(&cur_ch->io_memory_domain) || 6872 !TAILQ_EMPTY(&cur_ch->io_accel_exec)) { 6873 /* If a channel has outstanding IO, set status to -EBUSY code. This will stop 6874 * further iteration over the rest of the channels and pass non-zero status 6875 * to the callback function. 
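		 * bdev_reset_check_outstanding_io_done() then either re-arms the poller to
		 * keep waiting or, once reset_io_drain_timeout expires, submits or fails
		 * the reset.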
*/ 6876 status = -EBUSY; 6877 } 6878 spdk_bdev_for_each_channel_continue(i, status); 6879 } 6880 6881 static int 6882 bdev_reset_poll_for_outstanding_io(void *ctx) 6883 { 6884 struct spdk_bdev_io *bdev_io = ctx; 6885 6886 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6887 spdk_bdev_for_each_channel(bdev_io->bdev, bdev_reset_check_outstanding_io, bdev_io, 6888 bdev_reset_check_outstanding_io_done); 6889 6890 return SPDK_POLLER_BUSY; 6891 } 6892 6893 static void 6894 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6895 { 6896 struct spdk_bdev_io *bdev_io = _ctx; 6897 6898 if (bdev->reset_io_drain_timeout == 0) { 6899 bdev_io_submit_reset(bdev_io); 6900 return; 6901 } 6902 6903 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6904 (bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6905 6906 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6907 * submit the reset to the underlying module only if outstanding I/O 6908 * remain after reset_io_drain_timeout seconds have passed. */ 6909 spdk_bdev_for_each_channel(bdev, bdev_reset_check_outstanding_io, bdev_io, 6910 bdev_reset_check_outstanding_io_done); 6911 } 6912 6913 static void 6914 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6915 struct spdk_io_channel *ch, void *_ctx) 6916 { 6917 struct spdk_bdev_channel *channel; 6918 struct spdk_bdev_mgmt_channel *mgmt_channel; 6919 struct spdk_bdev_shared_resource *shared_resource; 6920 6921 channel = __io_ch_to_bdev_ch(ch); 6922 shared_resource = channel->shared_resource; 6923 mgmt_channel = shared_resource->mgmt_ch; 6924 6925 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6926 6927 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6928 bdev_abort_all_buf_io(mgmt_channel, channel); 6929 6930 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6931 bdev_abort_all_queued_io(&channel->qos_queued_io, channel); 6932 } 6933 6934 spdk_bdev_for_each_channel_continue(i, 0); 6935 } 6936 6937 static void 6938 bdev_start_reset(struct spdk_bdev_io *bdev_io) 6939 { 6940 struct spdk_bdev *bdev = bdev_io->bdev; 6941 bool freeze_channel = false; 6942 6943 bdev_ch_add_to_io_submitted(bdev_io); 6944 6945 /** 6946 * Take a channel reference for the target bdev for the life of this 6947 * reset. This guards against the channel getting destroyed before 6948 * the reset is completed. We will release the reference when this 6949 * reset is completed. 
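	 *
	 * Only one reset is processed at a time: if another reset already owns
	 * internal.reset_in_progress, the new request is queued on
	 * internal.queued_resets below instead of freezing the channels again.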
6950 */ 6951 bdev_io->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6952 6953 spdk_spin_lock(&bdev->internal.spinlock); 6954 if (bdev->internal.reset_in_progress == NULL) { 6955 bdev->internal.reset_in_progress = bdev_io; 6956 freeze_channel = true; 6957 } else { 6958 TAILQ_INSERT_TAIL(&bdev->internal.queued_resets, bdev_io, internal.link); 6959 } 6960 spdk_spin_unlock(&bdev->internal.spinlock); 6961 6962 if (freeze_channel) { 6963 spdk_bdev_for_each_channel(bdev, bdev_reset_freeze_channel, bdev_io, 6964 bdev_reset_freeze_channel_done); 6965 } 6966 } 6967 6968 int 6969 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6970 spdk_bdev_io_completion_cb cb, void *cb_arg) 6971 { 6972 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6973 struct spdk_bdev_io *bdev_io; 6974 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6975 6976 bdev_io = bdev_channel_get_io(channel); 6977 if (!bdev_io) { 6978 return -ENOMEM; 6979 } 6980 6981 bdev_io->internal.ch = channel; 6982 bdev_io->internal.desc = desc; 6983 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6984 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6985 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6986 6987 bdev_start_reset(bdev_io); 6988 return 0; 6989 } 6990 6991 void 6992 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6993 struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode reset_mode) 6994 { 6995 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6996 6997 bdev_get_io_stat(stat, channel->stat); 6998 spdk_bdev_reset_io_stat(channel->stat, reset_mode); 6999 } 7000 7001 static void 7002 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 7003 { 7004 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 7005 7006 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 7007 bdev_iostat_ctx->cb_arg, 0); 7008 free(bdev_iostat_ctx); 7009 } 7010 7011 static void 7012 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7013 struct spdk_io_channel *ch, void *_ctx) 7014 { 7015 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 7016 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7017 7018 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 7019 spdk_bdev_reset_io_stat(channel->stat, bdev_iostat_ctx->reset_mode); 7020 spdk_bdev_for_each_channel_continue(i, 0); 7021 } 7022 7023 void 7024 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 7025 enum spdk_bdev_reset_stat_mode reset_mode, spdk_bdev_get_device_stat_cb cb, void *cb_arg) 7026 { 7027 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 7028 7029 assert(bdev != NULL); 7030 assert(stat != NULL); 7031 assert(cb != NULL); 7032 7033 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 7034 if (bdev_iostat_ctx == NULL) { 7035 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 7036 cb(bdev, stat, cb_arg, -ENOMEM); 7037 return; 7038 } 7039 7040 bdev_iostat_ctx->stat = stat; 7041 bdev_iostat_ctx->cb = cb; 7042 bdev_iostat_ctx->cb_arg = cb_arg; 7043 bdev_iostat_ctx->reset_mode = reset_mode; 7044 7045 /* Start with the statistics from previously deleted channels. */ 7046 spdk_spin_lock(&bdev->internal.spinlock); 7047 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 7048 spdk_bdev_reset_io_stat(bdev->internal.stat, reset_mode); 7049 spdk_spin_unlock(&bdev->internal.spinlock); 7050 7051 /* Then iterate and add the statistics from each existing channel. 
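	 * bdev_get_device_stat_done() runs once every channel has been visited; it
	 * hands the aggregated totals to the user callback and frees the iteration
	 * context.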
*/ 7052 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 7053 bdev_get_device_stat_done); 7054 } 7055 7056 struct bdev_iostat_reset_ctx { 7057 enum spdk_bdev_reset_stat_mode mode; 7058 bdev_reset_device_stat_cb cb; 7059 void *cb_arg; 7060 }; 7061 7062 static void 7063 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 7064 { 7065 struct bdev_iostat_reset_ctx *ctx = _ctx; 7066 7067 ctx->cb(bdev, ctx->cb_arg, 0); 7068 7069 free(ctx); 7070 } 7071 7072 static void 7073 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7074 struct spdk_io_channel *ch, void *_ctx) 7075 { 7076 struct bdev_iostat_reset_ctx *ctx = _ctx; 7077 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7078 7079 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 7080 7081 spdk_bdev_for_each_channel_continue(i, 0); 7082 } 7083 7084 void 7085 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 7086 bdev_reset_device_stat_cb cb, void *cb_arg) 7087 { 7088 struct bdev_iostat_reset_ctx *ctx; 7089 7090 assert(bdev != NULL); 7091 assert(cb != NULL); 7092 7093 ctx = calloc(1, sizeof(*ctx)); 7094 if (ctx == NULL) { 7095 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 7096 cb(bdev, cb_arg, -ENOMEM); 7097 return; 7098 } 7099 7100 ctx->mode = mode; 7101 ctx->cb = cb; 7102 ctx->cb_arg = cb_arg; 7103 7104 spdk_spin_lock(&bdev->internal.spinlock); 7105 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 7106 spdk_spin_unlock(&bdev->internal.spinlock); 7107 7108 spdk_bdev_for_each_channel(bdev, 7109 bdev_reset_each_channel_stat, 7110 ctx, 7111 bdev_reset_device_stat_done); 7112 } 7113 7114 int 7115 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 7116 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 7117 spdk_bdev_io_completion_cb cb, void *cb_arg) 7118 { 7119 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7120 struct spdk_bdev_io *bdev_io; 7121 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7122 7123 if (!desc->write) { 7124 return -EBADF; 7125 } 7126 7127 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 7128 return -ENOTSUP; 7129 } 7130 7131 bdev_io = bdev_channel_get_io(channel); 7132 if (!bdev_io) { 7133 return -ENOMEM; 7134 } 7135 7136 bdev_io->internal.ch = channel; 7137 bdev_io->internal.desc = desc; 7138 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 7139 bdev_io->u.nvme_passthru.cmd = *cmd; 7140 bdev_io->u.nvme_passthru.buf = buf; 7141 bdev_io->u.nvme_passthru.nbytes = nbytes; 7142 bdev_io->u.nvme_passthru.md_buf = NULL; 7143 bdev_io->u.nvme_passthru.md_len = 0; 7144 7145 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7146 7147 bdev_io_submit(bdev_io); 7148 return 0; 7149 } 7150 7151 int 7152 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 7153 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 7154 spdk_bdev_io_completion_cb cb, void *cb_arg) 7155 { 7156 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7157 struct spdk_bdev_io *bdev_io; 7158 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7159 7160 if (!desc->write) { 7161 /* 7162 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 7163 * to easily determine if the command is a read or write, but for now just 7164 * do not allow io_passthru with a read-only descriptor. 
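		 *
		 * Illustrative caller-side sketch (hypothetical names and opcode, not part
		 * of this file), assuming a descriptor opened for write:
		 *
		 *   struct spdk_nvme_cmd cmd = {};
		 *
		 *   cmd.opc = SPDK_NVME_OPC_FLUSH;
		 *   cmd.nsid = spdk_bdev_get_nvme_nsid(bdev);
		 *   rc = spdk_bdev_nvme_io_passthru(desc, io_ch, &cmd, NULL, 0, io_done_cb, cb_ctx);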
7165 */ 7166 return -EBADF; 7167 } 7168 7169 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 7170 return -ENOTSUP; 7171 } 7172 7173 bdev_io = bdev_channel_get_io(channel); 7174 if (!bdev_io) { 7175 return -ENOMEM; 7176 } 7177 7178 bdev_io->internal.ch = channel; 7179 bdev_io->internal.desc = desc; 7180 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 7181 bdev_io->u.nvme_passthru.cmd = *cmd; 7182 bdev_io->u.nvme_passthru.buf = buf; 7183 bdev_io->u.nvme_passthru.nbytes = nbytes; 7184 bdev_io->u.nvme_passthru.md_buf = NULL; 7185 bdev_io->u.nvme_passthru.md_len = 0; 7186 7187 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7188 7189 bdev_io_submit(bdev_io); 7190 return 0; 7191 } 7192 7193 int 7194 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 7195 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 7196 spdk_bdev_io_completion_cb cb, void *cb_arg) 7197 { 7198 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7199 struct spdk_bdev_io *bdev_io; 7200 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7201 7202 if (!desc->write) { 7203 /* 7204 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 7205 * to easily determine if the command is a read or write, but for now just 7206 * do not allow io_passthru with a read-only descriptor. 7207 */ 7208 return -EBADF; 7209 } 7210 7211 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 7212 return -ENOTSUP; 7213 } 7214 7215 bdev_io = bdev_channel_get_io(channel); 7216 if (!bdev_io) { 7217 return -ENOMEM; 7218 } 7219 7220 bdev_io->internal.ch = channel; 7221 bdev_io->internal.desc = desc; 7222 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 7223 bdev_io->u.nvme_passthru.cmd = *cmd; 7224 bdev_io->u.nvme_passthru.buf = buf; 7225 bdev_io->u.nvme_passthru.nbytes = nbytes; 7226 bdev_io->u.nvme_passthru.md_buf = md_buf; 7227 bdev_io->u.nvme_passthru.md_len = md_len; 7228 7229 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7230 7231 bdev_io_submit(bdev_io); 7232 return 0; 7233 } 7234 7235 int 7236 spdk_bdev_nvme_iov_passthru_md(struct spdk_bdev_desc *desc, 7237 struct spdk_io_channel *ch, 7238 const struct spdk_nvme_cmd *cmd, 7239 struct iovec *iov, int iovcnt, size_t nbytes, 7240 void *md_buf, size_t md_len, 7241 spdk_bdev_io_completion_cb cb, void *cb_arg) 7242 { 7243 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7244 struct spdk_bdev_io *bdev_io; 7245 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7246 7247 if (!desc->write) { 7248 /* 7249 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 7250 * to easily determine if the command is a read or write, but for now just 7251 * do not allow io_passthru with a read-only descriptor. 
7252 */ 7253 return -EBADF; 7254 } 7255 7256 if (md_buf && spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 7257 return -ENOTSUP; 7258 } else if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 7259 return -ENOTSUP; 7260 } 7261 7262 bdev_io = bdev_channel_get_io(channel); 7263 if (!bdev_io) { 7264 return -ENOMEM; 7265 } 7266 7267 bdev_io->internal.ch = channel; 7268 bdev_io->internal.desc = desc; 7269 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IOV_MD; 7270 bdev_io->u.nvme_passthru.cmd = *cmd; 7271 bdev_io->u.nvme_passthru.iovs = iov; 7272 bdev_io->u.nvme_passthru.iovcnt = iovcnt; 7273 bdev_io->u.nvme_passthru.nbytes = nbytes; 7274 bdev_io->u.nvme_passthru.md_buf = md_buf; 7275 bdev_io->u.nvme_passthru.md_len = md_len; 7276 7277 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7278 7279 bdev_io_submit(bdev_io); 7280 return 0; 7281 } 7282 7283 static void bdev_abort_retry(void *ctx); 7284 static void bdev_abort(struct spdk_bdev_io *parent_io); 7285 7286 static void 7287 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 7288 { 7289 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 7290 struct spdk_bdev_io *parent_io = cb_arg; 7291 struct spdk_bdev_io *bio_to_abort, *tmp_io; 7292 7293 bio_to_abort = bdev_io->u.abort.bio_to_abort; 7294 7295 spdk_bdev_free_io(bdev_io); 7296 7297 if (!success) { 7298 /* Check if the target I/O completed in the meantime. */ 7299 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 7300 if (tmp_io == bio_to_abort) { 7301 break; 7302 } 7303 } 7304 7305 /* If the target I/O still exists, set the parent to failed. */ 7306 if (tmp_io != NULL) { 7307 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7308 } 7309 } 7310 7311 assert(parent_io->internal.f.split); 7312 7313 parent_io->internal.split.outstanding--; 7314 if (parent_io->internal.split.outstanding == 0) { 7315 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7316 bdev_abort_retry(parent_io); 7317 } else { 7318 bdev_io_complete(parent_io); 7319 } 7320 } 7321 } 7322 7323 static int 7324 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 7325 struct spdk_bdev_io *bio_to_abort, 7326 spdk_bdev_io_completion_cb cb, void *cb_arg) 7327 { 7328 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7329 struct spdk_bdev_io *bdev_io; 7330 7331 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 7332 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 7333 /* TODO: Abort reset or abort request. */ 7334 return -ENOTSUP; 7335 } 7336 7337 bdev_io = bdev_channel_get_io(channel); 7338 if (bdev_io == NULL) { 7339 return -ENOMEM; 7340 } 7341 7342 bdev_io->internal.ch = channel; 7343 bdev_io->internal.desc = desc; 7344 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7345 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7346 7347 if (bio_to_abort->internal.f.split) { 7348 assert(bdev_io_should_split(bio_to_abort)); 7349 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 7350 7351 /* Parent abort request is not submitted directly, but to manage its 7352 * execution add it to the submitted list here. 7353 */ 7354 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7355 bdev_ch_add_to_io_submitted(bdev_io); 7356 7357 bdev_abort(bdev_io); 7358 7359 return 0; 7360 } 7361 7362 bdev_io->u.abort.bio_to_abort = bio_to_abort; 7363 7364 /* Submit the abort request to the underlying bdev module. 
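	 * The completion callback (bdev_abort_io_done() when called from _bdev_abort())
	 * rolls the child's result up into the parent abort request.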
	 */
	bdev_io_submit(bdev_io);

	return 0;
}

static bool
bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq)
{
	struct spdk_bdev_io *iter;

	TAILQ_FOREACH(iter, tailq, internal.link) {
		if (iter == bdev_io) {
			return true;
		}
	}

	return false;
}

static uint32_t
_bdev_abort(struct spdk_bdev_io *parent_io)
{
	struct spdk_bdev_desc *desc = parent_io->internal.desc;
	struct spdk_bdev_channel *channel = parent_io->internal.ch;
	void *bio_cb_arg;
	struct spdk_bdev_io *bio_to_abort;
	uint32_t matched_ios;
	int rc;

	bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg;

	/* matched_ios is returned and will be kept by the caller.
	 *
	 * This function is used for two cases: 1) the same cb_arg is used for
	 * multiple I/Os, and 2) a single large I/O is split into smaller ones.
	 * Incrementing split_outstanding directly here may confuse readers,
	 * especially for the 1st case.
	 *
	 * Completion of I/O abort is processed after stack unwinding. Hence this trick
	 * works as expected.
	 */
	matched_ios = 0;
	parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;

	TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) {
		if (bio_to_abort->internal.caller_ctx != bio_cb_arg) {
			continue;
		}

		if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) {
			/* Any I/O which was submitted after this abort command should be excluded. */
			continue;
		}

		/* We can't abort a request that's being pushed/pulled or executed by accel */
		if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) ||
		    bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) {
			parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
			break;
		}

		rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io);
		if (rc != 0) {
			if (rc == -ENOMEM) {
				parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM;
			} else {
				parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
			}
			break;
		}
		matched_ios++;
	}

	return matched_ios;
}

static void
bdev_abort_retry(void *ctx)
{
	struct spdk_bdev_io *parent_io = ctx;
	uint32_t matched_ios;

	matched_ios = _bdev_abort(parent_io);

	if (matched_ios == 0) {
		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
			bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
		} else {
			/* For a retry, the case where no target I/O was found is a success,
			 * because it means the target I/Os completed in the meantime.
			 */
			bdev_io_complete(parent_io);
		}
		return;
	}

	/* Use split_outstanding to manage the progress of aborting I/Os. */
	parent_io->internal.f.split = true;
	parent_io->internal.split.outstanding = matched_ios;
}

static void
bdev_abort(struct spdk_bdev_io *parent_io)
{
	uint32_t matched_ios;

	matched_ios = _bdev_abort(parent_io);

	if (matched_ios == 0) {
		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
			bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
		} else {
			/* The case where no target I/O was found is a failure.
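			 * Unlike the retry path in bdev_abort_retry(), nothing matched the
			 * given bio_cb_arg in the first place.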
*/ 7478 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7479 bdev_io_complete(parent_io); 7480 } 7481 return; 7482 } 7483 7484 /* Use split_outstanding to manage the progress of aborting I/Os. */ 7485 parent_io->internal.f.split = true; 7486 parent_io->internal.split.outstanding = matched_ios; 7487 } 7488 7489 int 7490 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 7491 void *bio_cb_arg, 7492 spdk_bdev_io_completion_cb cb, void *cb_arg) 7493 { 7494 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7495 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7496 struct spdk_bdev_io *bdev_io; 7497 7498 if (bio_cb_arg == NULL) { 7499 return -EINVAL; 7500 } 7501 7502 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 7503 return -ENOTSUP; 7504 } 7505 7506 bdev_io = bdev_channel_get_io(channel); 7507 if (bdev_io == NULL) { 7508 return -ENOMEM; 7509 } 7510 7511 bdev_io->internal.ch = channel; 7512 bdev_io->internal.desc = desc; 7513 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7514 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7515 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7516 7517 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 7518 7519 /* Parent abort request is not submitted directly, but to manage its execution, 7520 * add it to the submitted list here. 7521 */ 7522 bdev_ch_add_to_io_submitted(bdev_io); 7523 7524 bdev_abort(bdev_io); 7525 7526 return 0; 7527 } 7528 7529 int 7530 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 7531 struct spdk_bdev_io_wait_entry *entry) 7532 { 7533 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7534 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 7535 7536 if (bdev != entry->bdev) { 7537 SPDK_ERRLOG("bdevs do not match\n"); 7538 return -EINVAL; 7539 } 7540 7541 if (mgmt_ch->per_thread_cache_count > 0) { 7542 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 7543 return -EINVAL; 7544 } 7545 7546 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 7547 return 0; 7548 } 7549 7550 static inline void 7551 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 7552 { 7553 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 7554 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 7555 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 7556 uint32_t blocklen = bdev_io->bdev->blocklen; 7557 7558 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7559 switch (bdev_io->type) { 7560 case SPDK_BDEV_IO_TYPE_READ: 7561 io_stat->bytes_read += num_blocks * blocklen; 7562 io_stat->num_read_ops++; 7563 io_stat->read_latency_ticks += tsc_diff; 7564 if (io_stat->max_read_latency_ticks < tsc_diff) { 7565 io_stat->max_read_latency_ticks = tsc_diff; 7566 } 7567 if (io_stat->min_read_latency_ticks > tsc_diff) { 7568 io_stat->min_read_latency_ticks = tsc_diff; 7569 } 7570 break; 7571 case SPDK_BDEV_IO_TYPE_WRITE: 7572 io_stat->bytes_written += num_blocks * blocklen; 7573 io_stat->num_write_ops++; 7574 io_stat->write_latency_ticks += tsc_diff; 7575 if (io_stat->max_write_latency_ticks < tsc_diff) { 7576 io_stat->max_write_latency_ticks = tsc_diff; 7577 } 7578 if (io_stat->min_write_latency_ticks > tsc_diff) { 7579 io_stat->min_write_latency_ticks = tsc_diff; 7580 } 7581 break; 7582 case SPDK_BDEV_IO_TYPE_UNMAP: 7583 io_stat->bytes_unmapped += num_blocks * blocklen; 7584 io_stat->num_unmap_ops++; 7585 io_stat->unmap_latency_ticks += tsc_diff; 7586 if 
(io_stat->max_unmap_latency_ticks < tsc_diff) { 7587 io_stat->max_unmap_latency_ticks = tsc_diff; 7588 } 7589 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 7590 io_stat->min_unmap_latency_ticks = tsc_diff; 7591 } 7592 break; 7593 case SPDK_BDEV_IO_TYPE_ZCOPY: 7594 /* Track the data in the start phase only */ 7595 if (bdev_io->u.bdev.zcopy.start) { 7596 if (bdev_io->u.bdev.zcopy.populate) { 7597 io_stat->bytes_read += num_blocks * blocklen; 7598 io_stat->num_read_ops++; 7599 io_stat->read_latency_ticks += tsc_diff; 7600 if (io_stat->max_read_latency_ticks < tsc_diff) { 7601 io_stat->max_read_latency_ticks = tsc_diff; 7602 } 7603 if (io_stat->min_read_latency_ticks > tsc_diff) { 7604 io_stat->min_read_latency_ticks = tsc_diff; 7605 } 7606 } else { 7607 io_stat->bytes_written += num_blocks * blocklen; 7608 io_stat->num_write_ops++; 7609 io_stat->write_latency_ticks += tsc_diff; 7610 if (io_stat->max_write_latency_ticks < tsc_diff) { 7611 io_stat->max_write_latency_ticks = tsc_diff; 7612 } 7613 if (io_stat->min_write_latency_ticks > tsc_diff) { 7614 io_stat->min_write_latency_ticks = tsc_diff; 7615 } 7616 } 7617 } 7618 break; 7619 case SPDK_BDEV_IO_TYPE_COPY: 7620 io_stat->bytes_copied += num_blocks * blocklen; 7621 io_stat->num_copy_ops++; 7622 bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; 7623 if (io_stat->max_copy_latency_ticks < tsc_diff) { 7624 io_stat->max_copy_latency_ticks = tsc_diff; 7625 } 7626 if (io_stat->min_copy_latency_ticks > tsc_diff) { 7627 io_stat->min_copy_latency_ticks = tsc_diff; 7628 } 7629 break; 7630 default: 7631 break; 7632 } 7633 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 7634 io_stat = bdev_io->bdev->internal.stat; 7635 assert(io_stat->io_error != NULL); 7636 7637 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 7638 io_stat->io_error->error_status[-io_status - 1]++; 7639 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 7640 } 7641 7642 #ifdef SPDK_CONFIG_VTUNE 7643 uint64_t now_tsc = spdk_get_ticks(); 7644 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 7645 uint64_t data[5]; 7646 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 7647 7648 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 7649 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 7650 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 7651 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 7652 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 
7653 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 7654 7655 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 7656 __itt_metadata_u64, 5, data); 7657 7658 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 7659 bdev_io->internal.ch->start_tsc = now_tsc; 7660 } 7661 #endif 7662 } 7663 7664 static inline void 7665 _bdev_io_complete(void *ctx) 7666 { 7667 struct spdk_bdev_io *bdev_io = ctx; 7668 7669 if (spdk_unlikely(bdev_io_use_accel_sequence(bdev_io))) { 7670 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7671 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 7672 } 7673 7674 assert(bdev_io->internal.cb != NULL); 7675 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 7676 7677 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 7678 bdev_io->internal.caller_ctx); 7679 } 7680 7681 static inline void 7682 bdev_io_complete(void *ctx) 7683 { 7684 struct spdk_bdev_io *bdev_io = ctx; 7685 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7686 uint64_t tsc, tsc_diff; 7687 7688 if (spdk_unlikely(bdev_io->internal.f.in_submit_request)) { 7689 /* 7690 * Defer completion to avoid potential infinite recursion if the 7691 * user's completion callback issues a new I/O. 7692 */ 7693 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7694 bdev_io_complete, bdev_io); 7695 return; 7696 } 7697 7698 tsc = spdk_get_ticks(); 7699 tsc_diff = tsc - bdev_io->internal.submit_tsc; 7700 7701 bdev_ch_remove_from_io_submitted(bdev_io); 7702 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, bdev_ch->trace_id, 0, (uintptr_t)bdev_io, 7703 bdev_io->internal.caller_ctx, bdev_ch->queue_depth); 7704 7705 if (bdev_ch->histogram) { 7706 if (bdev_io->bdev->internal.histogram_io_type == 0 || 7707 bdev_io->bdev->internal.histogram_io_type == bdev_io->type) { 7708 /* 7709 * Tally all I/O types if the histogram_io_type is set to 0. 7710 */ 7711 spdk_histogram_data_tally(bdev_ch->histogram, tsc_diff); 7712 } 7713 } 7714 7715 bdev_io_update_io_stat(bdev_io, tsc_diff); 7716 _bdev_io_complete(bdev_io); 7717 } 7718 7719 /* The difference between this function and bdev_io_complete() is that this should be called to 7720 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 7721 * io_submitted list and don't have submit_tsc updated. 7722 */ 7723 static inline void 7724 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 7725 { 7726 /* Since the IO hasn't been submitted it's bound to be failed */ 7727 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7728 7729 /* At this point we don't know if the IO is completed from submission context or not, but, 7730 * since this is an error path, we can always do an spdk_thread_send_msg(). */ 7731 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7732 _bdev_io_complete, bdev_io); 7733 } 7734 7735 static void bdev_destroy_cb(void *io_device); 7736 7737 static inline void 7738 _bdev_reset_complete(void *ctx) 7739 { 7740 struct spdk_bdev_io *bdev_io = ctx; 7741 7742 /* Put the channel reference we got in submission. 
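	 * The reference was taken when the reset was submitted, presumably to keep the
	 * channel alive while the reset was outstanding; it is safe to drop it now that
	 * the reset is completing.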
*/ 7743 assert(bdev_io->u.reset.ch_ref != NULL); 7744 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7745 bdev_io->u.reset.ch_ref = NULL; 7746 7747 bdev_io_complete(bdev_io); 7748 } 7749 7750 static void 7751 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7752 { 7753 struct spdk_bdev_io *bdev_io = _ctx; 7754 bdev_io_tailq_t queued_resets; 7755 struct spdk_bdev_io *queued_reset; 7756 7757 assert(bdev_io == bdev->internal.reset_in_progress); 7758 7759 TAILQ_INIT(&queued_resets); 7760 7761 spdk_spin_lock(&bdev->internal.spinlock); 7762 TAILQ_SWAP(&bdev->internal.queued_resets, &queued_resets, 7763 spdk_bdev_io, internal.link); 7764 bdev->internal.reset_in_progress = NULL; 7765 spdk_spin_unlock(&bdev->internal.spinlock); 7766 7767 while (!TAILQ_EMPTY(&queued_resets)) { 7768 queued_reset = TAILQ_FIRST(&queued_resets); 7769 TAILQ_REMOVE(&queued_resets, queued_reset, internal.link); 7770 queued_reset->internal.status = bdev_io->internal.status; 7771 spdk_thread_send_msg(spdk_bdev_io_get_thread(queued_reset), 7772 _bdev_reset_complete, queued_reset); 7773 } 7774 7775 _bdev_reset_complete(bdev_io); 7776 7777 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7778 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7779 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7780 } 7781 } 7782 7783 static void 7784 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7785 struct spdk_io_channel *_ch, void *_ctx) 7786 { 7787 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7788 7789 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7790 7791 spdk_bdev_for_each_channel_continue(i, 0); 7792 } 7793 7794 static void 7795 bdev_io_complete_sequence_cb(void *ctx, int status) 7796 { 7797 struct spdk_bdev_io *bdev_io = ctx; 7798 7799 /* u.bdev.accel_sequence should have already been cleared at this point */ 7800 assert(bdev_io->u.bdev.accel_sequence == NULL); 7801 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7802 bdev_io->internal.f.has_accel_sequence = false; 7803 7804 if (spdk_unlikely(status != 0)) { 7805 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7806 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7807 } 7808 7809 bdev_io_complete(bdev_io); 7810 } 7811 7812 void 7813 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7814 { 7815 struct spdk_bdev *bdev = bdev_io->bdev; 7816 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7817 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7818 7819 if (spdk_unlikely(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING)) { 7820 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7821 spdk_bdev_get_module_name(bdev), 7822 bdev_io_status_get_string(bdev_io->internal.status)); 7823 assert(false); 7824 } 7825 bdev_io->internal.status = status; 7826 7827 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7828 assert(bdev_io == bdev->internal.reset_in_progress); 7829 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 7830 bdev_reset_complete); 7831 return; 7832 } else { 7833 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7834 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7835 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7836 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7837 return; 7838 } else if (spdk_unlikely(bdev_io->internal.f.has_bounce_buf && 7839 
!bdev_io_use_accel_sequence(bdev_io))) { 7840 _bdev_io_push_bounce_data_buffer(bdev_io, 7841 _bdev_io_complete_push_bounce_done); 7842 /* bdev IO will be completed in the callback */ 7843 return; 7844 } 7845 } 7846 7847 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7848 return; 7849 } 7850 } 7851 7852 bdev_io_complete(bdev_io); 7853 } 7854 7855 void 7856 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 7857 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7858 { 7859 enum spdk_bdev_io_status status; 7860 7861 if (sc == SPDK_SCSI_STATUS_GOOD) { 7862 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7863 } else { 7864 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7865 bdev_io->internal.error.scsi.sc = sc; 7866 bdev_io->internal.error.scsi.sk = sk; 7867 bdev_io->internal.error.scsi.asc = asc; 7868 bdev_io->internal.error.scsi.ascq = ascq; 7869 } 7870 7871 spdk_bdev_io_complete(bdev_io, status); 7872 } 7873 7874 void 7875 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7876 int *sc, int *sk, int *asc, int *ascq) 7877 { 7878 assert(sc != NULL); 7879 assert(sk != NULL); 7880 assert(asc != NULL); 7881 assert(ascq != NULL); 7882 7883 switch (bdev_io->internal.status) { 7884 case SPDK_BDEV_IO_STATUS_SUCCESS: 7885 *sc = SPDK_SCSI_STATUS_GOOD; 7886 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7887 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7888 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7889 break; 7890 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7891 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7892 break; 7893 case SPDK_BDEV_IO_STATUS_MISCOMPARE: 7894 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7895 *sk = SPDK_SCSI_SENSE_MISCOMPARE; 7896 *asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION; 7897 *ascq = bdev_io->internal.error.scsi.ascq; 7898 break; 7899 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7900 *sc = bdev_io->internal.error.scsi.sc; 7901 *sk = bdev_io->internal.error.scsi.sk; 7902 *asc = bdev_io->internal.error.scsi.asc; 7903 *ascq = bdev_io->internal.error.scsi.ascq; 7904 break; 7905 default: 7906 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7907 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7908 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7909 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7910 break; 7911 } 7912 } 7913 7914 void 7915 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7916 { 7917 enum spdk_bdev_io_status status; 7918 7919 if (aio_result == 0) { 7920 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7921 } else { 7922 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7923 } 7924 7925 bdev_io->internal.error.aio_result = aio_result; 7926 7927 spdk_bdev_io_complete(bdev_io, status); 7928 } 7929 7930 void 7931 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7932 { 7933 assert(aio_result != NULL); 7934 7935 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7936 *aio_result = bdev_io->internal.error.aio_result; 7937 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7938 *aio_result = 0; 7939 } else { 7940 *aio_result = -EIO; 7941 } 7942 } 7943 7944 void 7945 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7946 { 7947 enum spdk_bdev_io_status status; 7948 7949 if (spdk_likely(sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS)) { 7950 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7951 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7952 status = SPDK_BDEV_IO_STATUS_ABORTED; 7953 
} else { 7954 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7955 } 7956 7957 bdev_io->internal.error.nvme.cdw0 = cdw0; 7958 bdev_io->internal.error.nvme.sct = sct; 7959 bdev_io->internal.error.nvme.sc = sc; 7960 7961 spdk_bdev_io_complete(bdev_io, status); 7962 } 7963 7964 void 7965 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7966 { 7967 assert(sct != NULL); 7968 assert(sc != NULL); 7969 assert(cdw0 != NULL); 7970 7971 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7972 *sct = SPDK_NVME_SCT_GENERIC; 7973 *sc = SPDK_NVME_SC_SUCCESS; 7974 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7975 *cdw0 = 0; 7976 } else { 7977 *cdw0 = 1U; 7978 } 7979 return; 7980 } 7981 7982 if (spdk_likely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7983 *sct = SPDK_NVME_SCT_GENERIC; 7984 *sc = SPDK_NVME_SC_SUCCESS; 7985 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7986 *sct = bdev_io->internal.error.nvme.sct; 7987 *sc = bdev_io->internal.error.nvme.sc; 7988 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7989 *sct = SPDK_NVME_SCT_GENERIC; 7990 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7991 } else { 7992 *sct = SPDK_NVME_SCT_GENERIC; 7993 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7994 } 7995 7996 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7997 } 7998 7999 void 8000 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 8001 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 8002 { 8003 assert(first_sct != NULL); 8004 assert(first_sc != NULL); 8005 assert(second_sct != NULL); 8006 assert(second_sc != NULL); 8007 assert(cdw0 != NULL); 8008 8009 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 8010 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 8011 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 8012 *first_sct = bdev_io->internal.error.nvme.sct; 8013 *first_sc = bdev_io->internal.error.nvme.sc; 8014 *second_sct = SPDK_NVME_SCT_GENERIC; 8015 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 8016 } else { 8017 *first_sct = SPDK_NVME_SCT_GENERIC; 8018 *first_sc = SPDK_NVME_SC_SUCCESS; 8019 *second_sct = bdev_io->internal.error.nvme.sct; 8020 *second_sc = bdev_io->internal.error.nvme.sc; 8021 } 8022 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 8023 *first_sct = SPDK_NVME_SCT_GENERIC; 8024 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 8025 *second_sct = SPDK_NVME_SCT_GENERIC; 8026 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 8027 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 8028 *first_sct = SPDK_NVME_SCT_GENERIC; 8029 *first_sc = SPDK_NVME_SC_SUCCESS; 8030 *second_sct = SPDK_NVME_SCT_GENERIC; 8031 *second_sc = SPDK_NVME_SC_SUCCESS; 8032 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 8033 *first_sct = SPDK_NVME_SCT_GENERIC; 8034 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 8035 *second_sct = SPDK_NVME_SCT_GENERIC; 8036 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 8037 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 8038 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 8039 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 8040 *second_sct = SPDK_NVME_SCT_GENERIC; 8041 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 8042 } else { 8043 *first_sct = SPDK_NVME_SCT_GENERIC; 8044 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 8045 *second_sct = SPDK_NVME_SCT_GENERIC; 8046 *second_sc = 
SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 8047 } 8048 8049 *cdw0 = bdev_io->internal.error.nvme.cdw0; 8050 } 8051 8052 void 8053 spdk_bdev_io_complete_base_io_status(struct spdk_bdev_io *bdev_io, 8054 const struct spdk_bdev_io *base_io) 8055 { 8056 switch (base_io->internal.status) { 8057 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 8058 spdk_bdev_io_complete_nvme_status(bdev_io, 8059 base_io->internal.error.nvme.cdw0, 8060 base_io->internal.error.nvme.sct, 8061 base_io->internal.error.nvme.sc); 8062 break; 8063 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 8064 spdk_bdev_io_complete_scsi_status(bdev_io, 8065 base_io->internal.error.scsi.sc, 8066 base_io->internal.error.scsi.sk, 8067 base_io->internal.error.scsi.asc, 8068 base_io->internal.error.scsi.ascq); 8069 break; 8070 case SPDK_BDEV_IO_STATUS_AIO_ERROR: 8071 spdk_bdev_io_complete_aio_status(bdev_io, base_io->internal.error.aio_result); 8072 break; 8073 default: 8074 spdk_bdev_io_complete(bdev_io, base_io->internal.status); 8075 break; 8076 } 8077 } 8078 8079 struct spdk_thread * 8080 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 8081 { 8082 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 8083 } 8084 8085 struct spdk_io_channel * 8086 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 8087 { 8088 return bdev_io->internal.ch->channel; 8089 } 8090 8091 static int 8092 bdev_register(struct spdk_bdev *bdev) 8093 { 8094 char *bdev_name; 8095 char uuid[SPDK_UUID_STRING_LEN]; 8096 struct spdk_iobuf_opts iobuf_opts; 8097 int ret; 8098 8099 assert(bdev->module != NULL); 8100 8101 if (!bdev->name) { 8102 SPDK_ERRLOG("Bdev name is NULL\n"); 8103 return -EINVAL; 8104 } 8105 8106 if (!strlen(bdev->name)) { 8107 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 8108 return -EINVAL; 8109 } 8110 8111 /* Users often register their own I/O devices using the bdev name. In 8112 * order to avoid conflicts, prepend bdev_. */ 8113 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 8114 if (!bdev_name) { 8115 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 8116 return -ENOMEM; 8117 } 8118 8119 bdev->internal.stat = bdev_alloc_io_stat(true); 8120 if (!bdev->internal.stat) { 8121 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 8122 free(bdev_name); 8123 return -ENOMEM; 8124 } 8125 8126 bdev->internal.status = SPDK_BDEV_STATUS_READY; 8127 bdev->internal.measured_queue_depth = UINT64_MAX; 8128 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8129 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 8130 bdev->internal.qd_poller = NULL; 8131 bdev->internal.qos = NULL; 8132 8133 TAILQ_INIT(&bdev->internal.open_descs); 8134 TAILQ_INIT(&bdev->internal.locked_ranges); 8135 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 8136 TAILQ_INIT(&bdev->internal.queued_resets); 8137 TAILQ_INIT(&bdev->aliases); 8138 8139 /* UUID may be specified by the user or defined by bdev itself. 8140 * Otherwise it will be generated here, so this field will never be empty. 
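	 * For example (illustrative), a module may fill bdev->uuid with spdk_uuid_parse()
	 * or spdk_uuid_copy() before calling spdk_bdev_register(); when it does not, the
	 * UUID generated here is also registered as an alias below, provided it differs
	 * from the bdev name.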
*/ 8141 if (spdk_uuid_is_null(&bdev->uuid)) { 8142 spdk_uuid_generate(&bdev->uuid); 8143 } 8144 8145 /* Add the UUID alias only if it's different than the name */ 8146 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 8147 if (strcmp(bdev->name, uuid) != 0) { 8148 ret = spdk_bdev_alias_add(bdev, uuid); 8149 if (ret != 0) { 8150 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 8151 bdev_free_io_stat(bdev->internal.stat); 8152 free(bdev_name); 8153 return ret; 8154 } 8155 } 8156 8157 spdk_iobuf_get_opts(&iobuf_opts, sizeof(iobuf_opts)); 8158 if (spdk_bdev_get_buf_align(bdev) > 1) { 8159 bdev->max_rw_size = spdk_min(bdev->max_rw_size ? bdev->max_rw_size : UINT32_MAX, 8160 iobuf_opts.large_bufsize / bdev->blocklen); 8161 } 8162 8163 /* If the user didn't specify a write unit size, set it to one. */ 8164 if (bdev->write_unit_size == 0) { 8165 bdev->write_unit_size = 1; 8166 } 8167 8168 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 8169 if (bdev->acwu == 0) { 8170 bdev->acwu = bdev->write_unit_size; 8171 } 8172 8173 if (bdev->phys_blocklen == 0) { 8174 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 8175 } 8176 8177 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 8178 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 8179 } 8180 8181 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 8182 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 8183 } 8184 8185 bdev->internal.reset_in_progress = NULL; 8186 bdev->internal.qd_poll_in_progress = false; 8187 bdev->internal.period = 0; 8188 bdev->internal.new_period = 0; 8189 bdev->internal.trace_id = spdk_trace_register_owner(OWNER_TYPE_BDEV, bdev_name); 8190 8191 /* 8192 * Initialize spinlock before registering IO device because spinlock is used in 8193 * bdev_channel_create 8194 */ 8195 spdk_spin_init(&bdev->internal.spinlock); 8196 8197 spdk_io_device_register(__bdev_to_io_dev(bdev), 8198 bdev_channel_create, bdev_channel_destroy, 8199 sizeof(struct spdk_bdev_channel), 8200 bdev_name); 8201 8202 /* 8203 * Register bdev name only after the bdev object is ready. 8204 * After bdev_name_add returns, it is possible for other threads to start using the bdev, 8205 * create IO channels... 
8206 */ 8207 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 8208 if (ret != 0) { 8209 spdk_io_device_unregister(__bdev_to_io_dev(bdev), NULL); 8210 bdev_free_io_stat(bdev->internal.stat); 8211 spdk_spin_destroy(&bdev->internal.spinlock); 8212 free(bdev_name); 8213 return ret; 8214 } 8215 8216 free(bdev_name); 8217 8218 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 8219 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 8220 8221 return 0; 8222 } 8223 8224 static void 8225 bdev_destroy_cb(void *io_device) 8226 { 8227 int rc; 8228 struct spdk_bdev *bdev; 8229 spdk_bdev_unregister_cb cb_fn; 8230 void *cb_arg; 8231 8232 bdev = __bdev_from_io_dev(io_device); 8233 8234 if (bdev->internal.unregister_td != spdk_get_thread()) { 8235 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 8236 return; 8237 } 8238 8239 cb_fn = bdev->internal.unregister_cb; 8240 cb_arg = bdev->internal.unregister_ctx; 8241 8242 spdk_spin_destroy(&bdev->internal.spinlock); 8243 free(bdev->internal.qos); 8244 bdev_free_io_stat(bdev->internal.stat); 8245 spdk_trace_unregister_owner(bdev->internal.trace_id); 8246 8247 rc = bdev->fn_table->destruct(bdev->ctxt); 8248 if (rc < 0) { 8249 SPDK_ERRLOG("destruct failed\n"); 8250 } 8251 if (rc <= 0 && cb_fn != NULL) { 8252 cb_fn(cb_arg, rc); 8253 } 8254 } 8255 8256 void 8257 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 8258 { 8259 if (bdev->internal.unregister_cb != NULL) { 8260 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 8261 } 8262 } 8263 8264 static void 8265 _remove_notify(void *arg) 8266 { 8267 struct spdk_bdev_desc *desc = arg; 8268 8269 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 8270 } 8271 8272 /* returns: 0 - bdev removed and ready to be destructed. 8273 * -EBUSY - bdev can't be destructed yet. */ 8274 static int 8275 bdev_unregister_unsafe(struct spdk_bdev *bdev) 8276 { 8277 struct spdk_bdev_desc *desc, *tmp; 8278 struct spdk_bdev_alias *alias; 8279 int rc = 0; 8280 char uuid[SPDK_UUID_STRING_LEN]; 8281 8282 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 8283 assert(spdk_spin_held(&bdev->internal.spinlock)); 8284 8285 /* Notify each descriptor about hotremoval */ 8286 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 8287 rc = -EBUSY; 8288 /* 8289 * Defer invocation of the event_cb to a separate message that will 8290 * run later on its thread. This ensures this context unwinds and 8291 * we don't recursively unregister this bdev again if the event_cb 8292 * immediately closes its descriptor. 8293 */ 8294 event_notify(desc, _remove_notify); 8295 } 8296 8297 /* If there are no descriptors, proceed removing the bdev */ 8298 if (rc == 0) { 8299 bdev_examine_allowlist_remove(bdev->name); 8300 TAILQ_FOREACH(alias, &bdev->aliases, tailq) { 8301 bdev_examine_allowlist_remove(alias->alias.name); 8302 } 8303 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 8304 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 8305 8306 /* Delete the name and the UUID alias */ 8307 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 8308 bdev_name_del_unsafe(&bdev->internal.bdev_name); 8309 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 8310 8311 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 8312 8313 if (bdev->internal.reset_in_progress != NULL) { 8314 /* If reset is in progress, let the completion callback for reset 8315 * unregister the bdev. 
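			 * bdev_reset_complete() performs the final spdk_io_device_unregister()
			 * once the reset finishes and no open descriptors remain.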
8316 */ 8317 rc = -EBUSY; 8318 } 8319 } 8320 8321 return rc; 8322 } 8323 8324 static void 8325 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8326 struct spdk_io_channel *io_ch, void *_ctx) 8327 { 8328 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 8329 8330 bdev_channel_abort_queued_ios(bdev_ch); 8331 spdk_bdev_for_each_channel_continue(i, 0); 8332 } 8333 8334 static void 8335 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 8336 { 8337 int rc; 8338 8339 spdk_spin_lock(&g_bdev_mgr.spinlock); 8340 spdk_spin_lock(&bdev->internal.spinlock); 8341 /* 8342 * Set the status to REMOVING after completing to abort channels. Otherwise, 8343 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 8344 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 8345 * may fail. 8346 */ 8347 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 8348 rc = bdev_unregister_unsafe(bdev); 8349 spdk_spin_unlock(&bdev->internal.spinlock); 8350 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8351 8352 if (rc == 0) { 8353 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8354 } 8355 } 8356 8357 void 8358 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 8359 { 8360 struct spdk_thread *thread; 8361 8362 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 8363 8364 thread = spdk_get_thread(); 8365 if (!thread) { 8366 /* The user called this from a non-SPDK thread. */ 8367 if (cb_fn != NULL) { 8368 cb_fn(cb_arg, -ENOTSUP); 8369 } 8370 return; 8371 } 8372 8373 spdk_spin_lock(&g_bdev_mgr.spinlock); 8374 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 8375 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8376 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8377 if (cb_fn) { 8378 cb_fn(cb_arg, -EBUSY); 8379 } 8380 return; 8381 } 8382 8383 spdk_spin_lock(&bdev->internal.spinlock); 8384 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 8385 bdev->internal.unregister_cb = cb_fn; 8386 bdev->internal.unregister_ctx = cb_arg; 8387 bdev->internal.unregister_td = thread; 8388 spdk_spin_unlock(&bdev->internal.spinlock); 8389 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8390 8391 spdk_bdev_set_qd_sampling_period(bdev, 0); 8392 8393 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 8394 bdev_unregister); 8395 } 8396 8397 int 8398 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 8399 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 8400 { 8401 struct spdk_bdev_desc *desc; 8402 struct spdk_bdev *bdev; 8403 int rc; 8404 8405 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 8406 if (rc != 0) { 8407 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 8408 return rc; 8409 } 8410 8411 bdev = spdk_bdev_desc_get_bdev(desc); 8412 8413 if (bdev->module != module) { 8414 spdk_bdev_close(desc); 8415 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 8416 bdev_name); 8417 return -ENODEV; 8418 } 8419 8420 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 8421 8422 spdk_bdev_close(desc); 8423 8424 return 0; 8425 } 8426 8427 static int 8428 bdev_start_qos(struct spdk_bdev *bdev) 8429 { 8430 struct set_qos_limit_ctx *ctx; 8431 8432 /* Enable QoS */ 8433 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 8434 ctx = calloc(1, sizeof(*ctx)); 8435 if (ctx == NULL) { 8436 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 8437 return -ENOMEM; 8438 } 8439 
ctx->bdev = bdev; 8440 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 8441 } 8442 8443 return 0; 8444 } 8445 8446 static void 8447 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 8448 struct spdk_bdev *bdev) 8449 { 8450 enum spdk_bdev_claim_type type; 8451 const char *typename, *modname; 8452 extern struct spdk_log_flag SPDK_LOG_bdev; 8453 8454 assert(spdk_spin_held(&bdev->internal.spinlock)); 8455 8456 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 8457 return; 8458 } 8459 8460 type = bdev->internal.claim_type; 8461 typename = spdk_bdev_claim_get_name(type); 8462 8463 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 8464 modname = bdev->internal.claim.v1.module->name; 8465 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 8466 bdev->name, detail, typename, modname); 8467 return; 8468 } 8469 8470 if (claim_type_is_v2(type)) { 8471 struct spdk_bdev_module_claim *claim; 8472 8473 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 8474 modname = claim->module->name; 8475 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 8476 bdev->name, detail, typename, modname); 8477 } 8478 return; 8479 } 8480 8481 assert(false); 8482 } 8483 8484 static int 8485 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 8486 { 8487 struct spdk_thread *thread; 8488 int rc = 0; 8489 8490 thread = spdk_get_thread(); 8491 if (!thread) { 8492 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 8493 return -ENOTSUP; 8494 } 8495 8496 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8497 spdk_get_thread()); 8498 8499 desc->bdev = bdev; 8500 desc->thread = thread; 8501 desc->write = write; 8502 8503 spdk_spin_lock(&bdev->internal.spinlock); 8504 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 8505 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8506 spdk_spin_unlock(&bdev->internal.spinlock); 8507 return -ENODEV; 8508 } 8509 8510 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8511 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8512 spdk_spin_unlock(&bdev->internal.spinlock); 8513 return -EPERM; 8514 } 8515 8516 rc = bdev_start_qos(bdev); 8517 if (rc != 0) { 8518 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 8519 spdk_spin_unlock(&bdev->internal.spinlock); 8520 return rc; 8521 } 8522 8523 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 8524 8525 spdk_spin_unlock(&bdev->internal.spinlock); 8526 8527 return 0; 8528 } 8529 8530 static void 8531 bdev_open_opts_get_defaults(struct spdk_bdev_open_opts *opts, size_t opts_size) 8532 { 8533 if (!opts) { 8534 SPDK_ERRLOG("opts should not be NULL.\n"); 8535 return; 8536 } 8537 8538 if (!opts_size) { 8539 SPDK_ERRLOG("opts_size should not be zero.\n"); 8540 return; 8541 } 8542 8543 memset(opts, 0, opts_size); 8544 opts->size = opts_size; 8545 8546 #define FIELD_OK(field) \ 8547 offsetof(struct spdk_bdev_open_opts, field) + sizeof(opts->field) <= opts_size 8548 8549 #define SET_FIELD(field, value) \ 8550 if (FIELD_OK(field)) { \ 8551 opts->field = value; \ 8552 } \ 8553 8554 SET_FIELD(hide_metadata, false); 8555 8556 #undef FIELD_OK 8557 #undef SET_FIELD 8558 } 8559 8560 static void 8561 bdev_open_opts_copy(struct spdk_bdev_open_opts *opts, 8562 const struct spdk_bdev_open_opts *opts_src, size_t opts_size) 8563 { 8564 assert(opts); 8565 assert(opts_src); 8566 8567 #define SET_FIELD(field) \ 8568 if 
(offsetof(struct spdk_bdev_open_opts, field) + sizeof(opts->field) <= opts_size) { \ 8569 opts->field = opts_src->field; \ 8570 } \ 8571 8572 SET_FIELD(hide_metadata); 8573 8574 opts->size = opts_src->size; 8575 8576 /* We should not remove this statement, but need to update the assert statement 8577 * if we add a new field, and also add a corresponding SET_FIELD statement. 8578 */ 8579 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_opts) == 16, "Incorrect size"); 8580 8581 #undef SET_FIELD 8582 } 8583 8584 void 8585 spdk_bdev_open_opts_init(struct spdk_bdev_open_opts *opts, size_t opts_size) 8586 { 8587 struct spdk_bdev_open_opts opts_local; 8588 8589 bdev_open_opts_get_defaults(&opts_local, sizeof(opts_local)); 8590 bdev_open_opts_copy(opts, &opts_local, opts_size); 8591 } 8592 8593 static int 8594 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 8595 struct spdk_bdev_open_opts *user_opts, struct spdk_bdev_desc **_desc) 8596 { 8597 struct spdk_bdev_desc *desc; 8598 struct spdk_bdev_open_opts opts; 8599 unsigned int i; 8600 8601 bdev_open_opts_get_defaults(&opts, sizeof(opts)); 8602 if (user_opts != NULL) { 8603 bdev_open_opts_copy(&opts, user_opts, user_opts->size); 8604 } 8605 8606 desc = calloc(1, sizeof(*desc)); 8607 if (desc == NULL) { 8608 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 8609 return -ENOMEM; 8610 } 8611 8612 desc->opts = opts; 8613 8614 TAILQ_INIT(&desc->pending_media_events); 8615 TAILQ_INIT(&desc->free_media_events); 8616 8617 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 8618 desc->callback.event_fn = event_cb; 8619 desc->callback.ctx = event_ctx; 8620 spdk_spin_init(&desc->spinlock); 8621 8622 if (desc->opts.hide_metadata) { 8623 if (spdk_bdev_is_md_separate(bdev)) { 8624 SPDK_ERRLOG("hide_metadata option is not supported with separate metadata.\n"); 8625 bdev_desc_free(desc); 8626 return -EINVAL; 8627 } 8628 } 8629 8630 if (bdev->media_events) { 8631 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 8632 sizeof(*desc->media_events_buffer)); 8633 if (desc->media_events_buffer == NULL) { 8634 SPDK_ERRLOG("Failed to initialize media event pool\n"); 8635 bdev_desc_free(desc); 8636 return -ENOMEM; 8637 } 8638 8639 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 8640 TAILQ_INSERT_TAIL(&desc->free_media_events, 8641 &desc->media_events_buffer[i], tailq); 8642 } 8643 } 8644 8645 if (bdev->fn_table->accel_sequence_supported != NULL) { 8646 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 8647 desc->accel_sequence_supported[i] = 8648 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 8649 (enum spdk_bdev_io_type)i); 8650 } 8651 } 8652 8653 *_desc = desc; 8654 8655 return 0; 8656 } 8657 8658 static int 8659 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8660 void *event_ctx, struct spdk_bdev_open_opts *opts, 8661 struct spdk_bdev_desc **_desc) 8662 { 8663 struct spdk_bdev_desc *desc; 8664 struct spdk_bdev *bdev; 8665 int rc; 8666 8667 bdev = bdev_get_by_name(bdev_name); 8668 8669 if (bdev == NULL) { 8670 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 8671 return -ENODEV; 8672 } 8673 8674 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, opts, &desc); 8675 if (rc != 0) { 8676 return rc; 8677 } 8678 8679 rc = bdev_open(bdev, write, desc); 8680 if (rc != 0) { 8681 bdev_desc_free(desc); 8682 desc = NULL; 8683 } 8684 8685 *_desc = desc; 8686 8687 return rc; 8688 } 8689 8690 int 8691 spdk_bdev_open_ext_v2(const char 
*bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8692 void *event_ctx, struct spdk_bdev_open_opts *opts, 8693 struct spdk_bdev_desc **_desc) 8694 { 8695 int rc; 8696 8697 if (event_cb == NULL) { 8698 SPDK_ERRLOG("Missing event callback function\n"); 8699 return -EINVAL; 8700 } 8701 8702 spdk_spin_lock(&g_bdev_mgr.spinlock); 8703 rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, opts, _desc); 8704 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8705 8706 return rc; 8707 } 8708 8709 int 8710 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8711 void *event_ctx, struct spdk_bdev_desc **_desc) 8712 { 8713 return spdk_bdev_open_ext_v2(bdev_name, write, event_cb, event_ctx, NULL, _desc); 8714 } 8715 8716 struct spdk_bdev_open_async_ctx { 8717 char *bdev_name; 8718 spdk_bdev_event_cb_t event_cb; 8719 void *event_ctx; 8720 bool write; 8721 int rc; 8722 spdk_bdev_open_async_cb_t cb_fn; 8723 void *cb_arg; 8724 struct spdk_bdev_desc *desc; 8725 struct spdk_bdev_open_async_opts opts; 8726 uint64_t start_ticks; 8727 struct spdk_thread *orig_thread; 8728 struct spdk_poller *poller; 8729 TAILQ_ENTRY(spdk_bdev_open_async_ctx) tailq; 8730 }; 8731 8732 static void 8733 bdev_open_async_done(void *arg) 8734 { 8735 struct spdk_bdev_open_async_ctx *ctx = arg; 8736 8737 ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg); 8738 8739 free(ctx->bdev_name); 8740 free(ctx); 8741 } 8742 8743 static void 8744 bdev_open_async_cancel(void *arg) 8745 { 8746 struct spdk_bdev_open_async_ctx *ctx = arg; 8747 8748 assert(ctx->rc == -ESHUTDOWN); 8749 8750 spdk_poller_unregister(&ctx->poller); 8751 8752 bdev_open_async_done(ctx); 8753 } 8754 8755 /* This is called when the bdev library finishes at shutdown. */ 8756 static void 8757 bdev_open_async_fini(void) 8758 { 8759 struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx; 8760 8761 spdk_spin_lock(&g_bdev_mgr.spinlock); 8762 TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) { 8763 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8764 /* 8765 * We have to move to ctx->orig_thread to unregister ctx->poller. 8766 * However, there is a chance that ctx->poller is executed before 8767 * message is executed, which could result in bdev_open_async_done() 8768 * being called twice. To avoid such race condition, set ctx->rc to 8769 * -ESHUTDOWN. 8770 */ 8771 ctx->rc = -ESHUTDOWN; 8772 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx); 8773 } 8774 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8775 } 8776 8777 static int bdev_open_async(void *arg); 8778 8779 static void 8780 _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx) 8781 { 8782 uint64_t timeout_ticks; 8783 8784 if (ctx->rc == -ESHUTDOWN) { 8785 /* This context is being canceled. Do nothing. */ 8786 return; 8787 } 8788 8789 ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx, 8790 NULL, &ctx->desc); 8791 if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) { 8792 goto exit; 8793 } 8794 8795 timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull; 8796 if (spdk_get_ticks() >= timeout_ticks) { 8797 SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name); 8798 ctx->rc = -ETIMEDOUT; 8799 goto exit; 8800 } 8801 8802 return; 8803 8804 exit: 8805 spdk_poller_unregister(&ctx->poller); 8806 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8807 8808 /* Completion callback is processed after stack unwinding. 
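	 * This ensures that cb_fn is never invoked before spdk_bdev_open_async() or the
	 * poller iteration that resolved the open has returned to its caller.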
*/ 8809 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx); 8810 } 8811 8812 static int 8813 bdev_open_async(void *arg) 8814 { 8815 struct spdk_bdev_open_async_ctx *ctx = arg; 8816 8817 spdk_spin_lock(&g_bdev_mgr.spinlock); 8818 8819 _bdev_open_async(ctx); 8820 8821 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8822 8823 return SPDK_POLLER_BUSY; 8824 } 8825 8826 static void 8827 bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts, 8828 struct spdk_bdev_open_async_opts *opts_src, 8829 size_t size) 8830 { 8831 assert(opts); 8832 assert(opts_src); 8833 8834 opts->size = size; 8835 8836 #define SET_FIELD(field) \ 8837 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8838 opts->field = opts_src->field; \ 8839 } \ 8840 8841 SET_FIELD(timeout_ms); 8842 8843 /* Do not remove this statement, you should always update this statement when you adding a new field, 8844 * and do not forget to add the SET_FIELD statement for your added field. */ 8845 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size"); 8846 8847 #undef SET_FIELD 8848 } 8849 8850 static void 8851 bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size) 8852 { 8853 assert(opts); 8854 8855 opts->size = size; 8856 8857 #define SET_FIELD(field, value) \ 8858 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8859 opts->field = value; \ 8860 } \ 8861 8862 SET_FIELD(timeout_ms, 0); 8863 8864 #undef SET_FIELD 8865 } 8866 8867 int 8868 spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8869 void *event_ctx, struct spdk_bdev_open_async_opts *opts, 8870 spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg) 8871 { 8872 struct spdk_bdev_open_async_ctx *ctx; 8873 8874 if (event_cb == NULL) { 8875 SPDK_ERRLOG("Missing event callback function\n"); 8876 return -EINVAL; 8877 } 8878 8879 if (open_cb == NULL) { 8880 SPDK_ERRLOG("Missing open callback function\n"); 8881 return -EINVAL; 8882 } 8883 8884 if (opts != NULL && opts->size == 0) { 8885 SPDK_ERRLOG("size in the options structure should not be zero\n"); 8886 return -EINVAL; 8887 } 8888 8889 ctx = calloc(1, sizeof(*ctx)); 8890 if (ctx == NULL) { 8891 SPDK_ERRLOG("Failed to allocate open context\n"); 8892 return -ENOMEM; 8893 } 8894 8895 ctx->bdev_name = strdup(bdev_name); 8896 if (ctx->bdev_name == NULL) { 8897 SPDK_ERRLOG("Failed to duplicate bdev_name\n"); 8898 free(ctx); 8899 return -ENOMEM; 8900 } 8901 8902 ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000); 8903 if (ctx->poller == NULL) { 8904 SPDK_ERRLOG("Failed to register bdev_open_async poller\n"); 8905 free(ctx->bdev_name); 8906 free(ctx); 8907 return -ENOMEM; 8908 } 8909 8910 ctx->cb_fn = open_cb; 8911 ctx->cb_arg = open_cb_arg; 8912 ctx->write = write; 8913 ctx->event_cb = event_cb; 8914 ctx->event_ctx = event_ctx; 8915 ctx->orig_thread = spdk_get_thread(); 8916 ctx->start_ticks = spdk_get_ticks(); 8917 8918 bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts)); 8919 if (opts != NULL) { 8920 bdev_open_async_opts_copy(&ctx->opts, opts, opts->size); 8921 } 8922 8923 spdk_spin_lock(&g_bdev_mgr.spinlock); 8924 8925 TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8926 _bdev_open_async(ctx); 8927 8928 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8929 8930 return 0; 8931 } 8932 8933 static void 8934 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 8935 { 8936 int rc; 8937 8938 
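	/* Callers (spdk_bdev_close() and bdev_register_finished()) already hold
	 * g_bdev_mgr.spinlock; the per-bdev spinlock and then the descriptor spinlock
	 * are taken here.
	 */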
spdk_spin_lock(&bdev->internal.spinlock); 8939 spdk_spin_lock(&desc->spinlock); 8940 8941 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 8942 8943 desc->closed = true; 8944 8945 if (desc->claim != NULL) { 8946 bdev_desc_release_claims(desc); 8947 } 8948 8949 if (0 == desc->refs) { 8950 spdk_spin_unlock(&desc->spinlock); 8951 bdev_desc_free(desc); 8952 } else { 8953 spdk_spin_unlock(&desc->spinlock); 8954 } 8955 8956 /* If no more descriptors, kill QoS channel */ 8957 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8958 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 8959 bdev->name, spdk_get_thread()); 8960 8961 if (bdev_qos_destroy(bdev)) { 8962 /* There isn't anything we can do to recover here. Just let the 8963 * old QoS poller keep running. The QoS handling won't change 8964 * cores when the user allocates a new channel, but it won't break. */ 8965 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 8966 } 8967 } 8968 8969 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8970 rc = bdev_unregister_unsafe(bdev); 8971 spdk_spin_unlock(&bdev->internal.spinlock); 8972 8973 if (rc == 0) { 8974 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8975 } 8976 } else { 8977 spdk_spin_unlock(&bdev->internal.spinlock); 8978 } 8979 } 8980 8981 void 8982 spdk_bdev_close(struct spdk_bdev_desc *desc) 8983 { 8984 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8985 8986 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8987 spdk_get_thread()); 8988 8989 assert(desc->thread == spdk_get_thread()); 8990 8991 spdk_poller_unregister(&desc->io_timeout_poller); 8992 8993 spdk_spin_lock(&g_bdev_mgr.spinlock); 8994 8995 bdev_close(bdev, desc); 8996 8997 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8998 } 8999 9000 int32_t 9001 spdk_bdev_get_numa_id(struct spdk_bdev *bdev) 9002 { 9003 if (bdev->numa.id_valid) { 9004 return bdev->numa.id; 9005 } else { 9006 return SPDK_ENV_NUMA_ID_ANY; 9007 } 9008 } 9009 9010 static void 9011 bdev_register_finished(void *arg) 9012 { 9013 struct spdk_bdev_desc *desc = arg; 9014 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9015 9016 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 9017 9018 spdk_spin_lock(&g_bdev_mgr.spinlock); 9019 9020 bdev_close(bdev, desc); 9021 9022 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9023 } 9024 9025 int 9026 spdk_bdev_register(struct spdk_bdev *bdev) 9027 { 9028 struct spdk_bdev_desc *desc; 9029 struct spdk_thread *thread = spdk_get_thread(); 9030 int rc; 9031 9032 if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) { 9033 SPDK_ERRLOG("Cannot register bdev %s on thread %p (%s)\n", bdev->name, thread, 9034 thread ? 
spdk_thread_get_name(thread) : "null"); 9035 return -EINVAL; 9036 } 9037 9038 rc = bdev_register(bdev); 9039 if (rc != 0) { 9040 return rc; 9041 } 9042 9043 /* A descriptor is opened to prevent bdev deletion during examination */ 9044 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc); 9045 if (rc != 0) { 9046 spdk_bdev_unregister(bdev, NULL, NULL); 9047 return rc; 9048 } 9049 9050 rc = bdev_open(bdev, false, desc); 9051 if (rc != 0) { 9052 bdev_desc_free(desc); 9053 spdk_bdev_unregister(bdev, NULL, NULL); 9054 return rc; 9055 } 9056 9057 /* Examine configuration before initializing I/O */ 9058 bdev_examine(bdev); 9059 9060 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 9061 if (rc != 0) { 9062 bdev_close(bdev, desc); 9063 spdk_bdev_unregister(bdev, NULL, NULL); 9064 } 9065 9066 return rc; 9067 } 9068 9069 int 9070 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 9071 struct spdk_bdev_module *module) 9072 { 9073 spdk_spin_lock(&bdev->internal.spinlock); 9074 9075 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 9076 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 9077 spdk_spin_unlock(&bdev->internal.spinlock); 9078 return -EPERM; 9079 } 9080 9081 if (desc && !desc->write) { 9082 desc->write = true; 9083 } 9084 9085 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 9086 bdev->internal.claim.v1.module = module; 9087 9088 spdk_spin_unlock(&bdev->internal.spinlock); 9089 return 0; 9090 } 9091 9092 void 9093 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 9094 { 9095 spdk_spin_lock(&bdev->internal.spinlock); 9096 9097 assert(bdev->internal.claim.v1.module != NULL); 9098 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 9099 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 9100 bdev->internal.claim.v1.module = NULL; 9101 9102 spdk_spin_unlock(&bdev->internal.spinlock); 9103 } 9104 9105 /* 9106 * Start claims v2 9107 */ 9108 9109 const char * 9110 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 9111 { 9112 switch (type) { 9113 case SPDK_BDEV_CLAIM_NONE: 9114 return "not_claimed"; 9115 case SPDK_BDEV_CLAIM_EXCL_WRITE: 9116 return "exclusive_write"; 9117 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 9118 return "read_many_write_one"; 9119 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 9120 return "read_many_write_none"; 9121 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 9122 return "read_many_write_many"; 9123 default: 9124 break; 9125 } 9126 return "invalid_claim"; 9127 } 9128 9129 static bool 9130 claim_type_is_v2(enum spdk_bdev_claim_type type) 9131 { 9132 switch (type) { 9133 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 9134 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 9135 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 9136 return true; 9137 default: 9138 break; 9139 } 9140 return false; 9141 } 9142 9143 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
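 * This is the case for READ_MANY_WRITE_ONE and READ_MANY_WRITE_SHARED claims.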
*/ 9144 static bool 9145 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 9146 { 9147 switch (type) { 9148 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 9149 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 9150 return true; 9151 default: 9152 break; 9153 } 9154 return false; 9155 } 9156 9157 void 9158 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 9159 { 9160 if (opts == NULL) { 9161 SPDK_ERRLOG("opts should not be NULL\n"); 9162 assert(opts != NULL); 9163 return; 9164 } 9165 if (size == 0) { 9166 SPDK_ERRLOG("size should not be zero\n"); 9167 assert(size != 0); 9168 return; 9169 } 9170 9171 memset(opts, 0, size); 9172 opts->opts_size = size; 9173 9174 #define FIELD_OK(field) \ 9175 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 9176 9177 #define SET_FIELD(field, value) \ 9178 if (FIELD_OK(field)) { \ 9179 opts->field = value; \ 9180 } \ 9181 9182 SET_FIELD(shared_claim_key, 0); 9183 9184 #undef FIELD_OK 9185 #undef SET_FIELD 9186 } 9187 9188 static int 9189 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 9190 { 9191 if (src->opts_size == 0) { 9192 SPDK_ERRLOG("size should not be zero\n"); 9193 return -1; 9194 } 9195 9196 memset(dst, 0, sizeof(*dst)); 9197 dst->opts_size = src->opts_size; 9198 9199 #define FIELD_OK(field) \ 9200 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 9201 9202 #define SET_FIELD(field) \ 9203 if (FIELD_OK(field)) { \ 9204 dst->field = src->field; \ 9205 } \ 9206 9207 if (FIELD_OK(name)) { 9208 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 9209 } 9210 9211 SET_FIELD(shared_claim_key); 9212 9213 /* You should not remove this statement, but need to update the assert statement 9214 * if you add a new field, and also add a corresponding SET_FIELD statement */ 9215 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 9216 9217 #undef FIELD_OK 9218 #undef SET_FIELD 9219 return 0; 9220 } 9221 9222 /* Returns 0 if a read-write-once claim can be taken. */ 9223 static int 9224 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 9225 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 9226 { 9227 struct spdk_bdev *bdev = desc->bdev; 9228 struct spdk_bdev_desc *open_desc; 9229 9230 assert(spdk_spin_held(&bdev->internal.spinlock)); 9231 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 9232 9233 if (opts->shared_claim_key != 0) { 9234 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 9235 bdev->name); 9236 return -EINVAL; 9237 } 9238 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 9239 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 9240 return -EPERM; 9241 } 9242 if (desc->claim != NULL) { 9243 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 9244 bdev->name, desc->claim->module->name); 9245 return -EPERM; 9246 } 9247 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 9248 if (desc != open_desc && open_desc->write) { 9249 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 9250 "another descriptor is open for writing\n", 9251 bdev->name); 9252 return -EPERM; 9253 } 9254 } 9255 9256 return 0; 9257 } 9258 9259 /* Returns 0 if a read-only-many claim can be taken. 
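 * A read-only-many claim cannot be taken on a writable descriptor and does not accept
 * a shared_claim_key.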
*/ 9260 static int 9261 claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 9262 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 9263 { 9264 struct spdk_bdev *bdev = desc->bdev; 9265 struct spdk_bdev_desc *open_desc; 9266 9267 assert(spdk_spin_held(&bdev->internal.spinlock)); 9268 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE); 9269 assert(desc->claim == NULL); 9270 9271 if (desc->write) { 9272 SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n", 9273 bdev->name); 9274 return -EINVAL; 9275 } 9276 if (opts->shared_claim_key != 0) { 9277 SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name); 9278 return -EINVAL; 9279 } 9280 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 9281 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 9282 if (open_desc->write) { 9283 SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while " 9284 "another descriptor is open for writing\n", 9285 bdev->name); 9286 return -EPERM; 9287 } 9288 } 9289 } 9290 9291 return 0; 9292 } 9293 9294 /* Returns 0 if a read-write-many claim can be taken. */ 9295 static int 9296 claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 9297 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 9298 { 9299 struct spdk_bdev *bdev = desc->bdev; 9300 struct spdk_bdev_desc *open_desc; 9301 9302 assert(spdk_spin_held(&bdev->internal.spinlock)); 9303 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED); 9304 assert(desc->claim == NULL); 9305 9306 if (opts->shared_claim_key == 0) { 9307 SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n", 9308 bdev->name); 9309 return -EINVAL; 9310 } 9311 switch (bdev->internal.claim_type) { 9312 case SPDK_BDEV_CLAIM_NONE: 9313 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 9314 if (open_desc == desc) { 9315 continue; 9316 } 9317 if (open_desc->write) { 9318 SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while " 9319 "another descriptor is open for writing without a " 9320 "claim\n", bdev->name); 9321 return -EPERM; 9322 } 9323 } 9324 break; 9325 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 9326 if (opts->shared_claim_key != bdev->internal.claim.v2.key) { 9327 LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev); 9328 return -EPERM; 9329 } 9330 break; 9331 default: 9332 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 9333 return -EBUSY; 9334 } 9335 9336 return 0; 9337 } 9338 9339 /* Updates desc and its bdev with a v2 claim. 
*/ 9340 static int 9341 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 9342 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 9343 { 9344 struct spdk_bdev *bdev = desc->bdev; 9345 struct spdk_bdev_module_claim *claim; 9346 9347 assert(spdk_spin_held(&bdev->internal.spinlock)); 9348 assert(claim_type_is_v2(type)); 9349 assert(desc->claim == NULL); 9350 9351 claim = calloc(1, sizeof(*desc->claim)); 9352 if (claim == NULL) { 9353 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 9354 return -ENOMEM; 9355 } 9356 claim->module = module; 9357 claim->desc = desc; 9358 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 9359 memcpy(claim->name, opts->name, sizeof(claim->name)); 9360 desc->claim = claim; 9361 9362 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 9363 bdev->internal.claim_type = type; 9364 TAILQ_INIT(&bdev->internal.claim.v2.claims); 9365 bdev->internal.claim.v2.key = opts->shared_claim_key; 9366 } 9367 assert(type == bdev->internal.claim_type); 9368 9369 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 9370 9371 if (!desc->write && claim_type_promotes_to_write(type)) { 9372 desc->write = true; 9373 } 9374 9375 return 0; 9376 } 9377 9378 int 9379 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 9380 struct spdk_bdev_claim_opts *_opts, 9381 struct spdk_bdev_module *module) 9382 { 9383 struct spdk_bdev *bdev; 9384 struct spdk_bdev_claim_opts opts; 9385 int rc = 0; 9386 9387 if (desc == NULL) { 9388 SPDK_ERRLOG("descriptor must not be NULL\n"); 9389 return -EINVAL; 9390 } 9391 9392 bdev = desc->bdev; 9393 9394 if (_opts == NULL) { 9395 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 9396 } else if (claim_opts_copy(_opts, &opts) != 0) { 9397 return -EINVAL; 9398 } 9399 9400 spdk_spin_lock(&bdev->internal.spinlock); 9401 9402 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 9403 bdev->internal.claim_type != type) { 9404 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 9405 spdk_spin_unlock(&bdev->internal.spinlock); 9406 return -EPERM; 9407 } 9408 9409 if (claim_type_is_v2(type) && desc->claim != NULL) { 9410 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 9411 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 9412 spdk_spin_unlock(&bdev->internal.spinlock); 9413 return -EPERM; 9414 } 9415 9416 switch (type) { 9417 case SPDK_BDEV_CLAIM_EXCL_WRITE: 9418 spdk_spin_unlock(&bdev->internal.spinlock); 9419 return spdk_bdev_module_claim_bdev(bdev, desc, module); 9420 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 9421 rc = claim_verify_rwo(desc, type, &opts, module); 9422 break; 9423 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 9424 rc = claim_verify_rom(desc, type, &opts, module); 9425 break; 9426 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 9427 rc = claim_verify_rwm(desc, type, &opts, module); 9428 break; 9429 default: 9430 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 9431 rc = -ENOTSUP; 9432 } 9433 9434 if (rc == 0) { 9435 rc = claim_bdev(desc, type, &opts, module); 9436 } 9437 9438 spdk_spin_unlock(&bdev->internal.spinlock); 9439 return rc; 9440 } 9441 9442 static void 9443 claim_reset(struct spdk_bdev *bdev) 9444 { 9445 assert(spdk_spin_held(&bdev->internal.spinlock)); 9446 assert(claim_type_is_v2(bdev->internal.claim_type)); 9447 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 9448 9449 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 9450 
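	/* With the claim union cleared above, drop back to the unclaimed state so that a
	 * claim of any type may be taken again.
	 */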
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 9451 } 9452 9453 static void 9454 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 9455 { 9456 struct spdk_bdev *bdev = desc->bdev; 9457 9458 assert(spdk_spin_held(&bdev->internal.spinlock)); 9459 assert(claim_type_is_v2(bdev->internal.claim_type)); 9460 9461 if (bdev->internal.examine_in_progress == 0) { 9462 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 9463 free(desc->claim); 9464 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 9465 claim_reset(bdev); 9466 } 9467 } else { 9468 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 9469 desc->claim->module = NULL; 9470 desc->claim->desc = NULL; 9471 } 9472 desc->claim = NULL; 9473 } 9474 9475 /* 9476 * End claims v2 9477 */ 9478 9479 struct spdk_bdev * 9480 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 9481 { 9482 assert(desc != NULL); 9483 return desc->bdev; 9484 } 9485 9486 int 9487 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 9488 { 9489 struct spdk_bdev *bdev, *tmp; 9490 struct spdk_bdev_desc *desc; 9491 int rc = 0; 9492 9493 assert(fn != NULL); 9494 9495 spdk_spin_lock(&g_bdev_mgr.spinlock); 9496 bdev = spdk_bdev_first(); 9497 while (bdev != NULL) { 9498 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc); 9499 if (rc != 0) { 9500 break; 9501 } 9502 rc = bdev_open(bdev, false, desc); 9503 if (rc != 0) { 9504 bdev_desc_free(desc); 9505 if (rc == -ENODEV) { 9506 /* Ignore the error and move to the next bdev. */ 9507 rc = 0; 9508 bdev = spdk_bdev_next(bdev); 9509 continue; 9510 } 9511 break; 9512 } 9513 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9514 9515 rc = fn(ctx, bdev); 9516 9517 spdk_spin_lock(&g_bdev_mgr.spinlock); 9518 tmp = spdk_bdev_next(bdev); 9519 bdev_close(bdev, desc); 9520 if (rc != 0) { 9521 break; 9522 } 9523 bdev = tmp; 9524 } 9525 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9526 9527 return rc; 9528 } 9529 9530 int 9531 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 9532 { 9533 struct spdk_bdev *bdev, *tmp; 9534 struct spdk_bdev_desc *desc; 9535 int rc = 0; 9536 9537 assert(fn != NULL); 9538 9539 spdk_spin_lock(&g_bdev_mgr.spinlock); 9540 bdev = spdk_bdev_first_leaf(); 9541 while (bdev != NULL) { 9542 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc); 9543 if (rc != 0) { 9544 break; 9545 } 9546 rc = bdev_open(bdev, false, desc); 9547 if (rc != 0) { 9548 bdev_desc_free(desc); 9549 if (rc == -ENODEV) { 9550 /* Ignore the error and move to the next bdev. 
*/ 9551 rc = 0; 9552 bdev = spdk_bdev_next_leaf(bdev); 9553 continue; 9554 } 9555 break; 9556 } 9557 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9558 9559 rc = fn(ctx, bdev); 9560 9561 spdk_spin_lock(&g_bdev_mgr.spinlock); 9562 tmp = spdk_bdev_next_leaf(bdev); 9563 bdev_close(bdev, desc); 9564 if (rc != 0) { 9565 break; 9566 } 9567 bdev = tmp; 9568 } 9569 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9570 9571 return rc; 9572 } 9573 9574 void 9575 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 9576 { 9577 struct iovec *iovs; 9578 int iovcnt; 9579 9580 if (bdev_io == NULL) { 9581 return; 9582 } 9583 9584 switch (bdev_io->type) { 9585 case SPDK_BDEV_IO_TYPE_READ: 9586 case SPDK_BDEV_IO_TYPE_WRITE: 9587 case SPDK_BDEV_IO_TYPE_ZCOPY: 9588 iovs = bdev_io->u.bdev.iovs; 9589 iovcnt = bdev_io->u.bdev.iovcnt; 9590 break; 9591 default: 9592 iovs = NULL; 9593 iovcnt = 0; 9594 break; 9595 } 9596 9597 if (iovp) { 9598 *iovp = iovs; 9599 } 9600 if (iovcntp) { 9601 *iovcntp = iovcnt; 9602 } 9603 } 9604 9605 void * 9606 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 9607 { 9608 if (bdev_io == NULL) { 9609 return NULL; 9610 } 9611 9612 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 9613 return NULL; 9614 } 9615 9616 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 9617 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 9618 return bdev_io->u.bdev.md_buf; 9619 } 9620 9621 return NULL; 9622 } 9623 9624 void * 9625 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 9626 { 9627 if (bdev_io == NULL) { 9628 assert(false); 9629 return NULL; 9630 } 9631 9632 return bdev_io->internal.caller_ctx; 9633 } 9634 9635 void 9636 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 9637 { 9638 9639 if (spdk_bdev_module_list_find(bdev_module->name)) { 9640 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 9641 assert(false); 9642 } 9643 9644 spdk_spin_init(&bdev_module->internal.spinlock); 9645 TAILQ_INIT(&bdev_module->internal.quiesced_ranges); 9646 9647 /* 9648 * Modules with examine callbacks must be initialized first, so they are 9649 * ready to handle examine callbacks from later modules that will 9650 * register physical bdevs. 
9651 */ 9652 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 9653 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9654 } else { 9655 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9656 } 9657 } 9658 9659 struct spdk_bdev_module * 9660 spdk_bdev_module_list_find(const char *name) 9661 { 9662 struct spdk_bdev_module *bdev_module; 9663 9664 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 9665 if (strcmp(name, bdev_module->name) == 0) { 9666 break; 9667 } 9668 } 9669 9670 return bdev_module; 9671 } 9672 9673 static int 9674 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 9675 { 9676 uint64_t num_blocks; 9677 void *md_buf = NULL; 9678 9679 num_blocks = bdev_io->u.bdev.num_blocks; 9680 9681 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 9682 md_buf = (char *)g_bdev_mgr.zero_buffer + 9683 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 9684 } 9685 9686 return bdev_write_blocks_with_md(bdev_io->internal.desc, 9687 spdk_io_channel_from_ctx(bdev_io->internal.ch), 9688 g_bdev_mgr.zero_buffer, md_buf, 9689 bdev_io->u.bdev.offset_blocks, num_blocks, 9690 bdev_write_zero_buffer_done, bdev_io); 9691 } 9692 9693 static void 9694 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9695 { 9696 struct spdk_bdev_io *parent_io = cb_arg; 9697 9698 spdk_bdev_free_io(bdev_io); 9699 9700 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 9701 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 9702 } 9703 9704 static void 9705 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 9706 { 9707 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9708 ctx->bdev->internal.qos_mod_in_progress = false; 9709 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9710 9711 if (ctx->cb_fn) { 9712 ctx->cb_fn(ctx->cb_arg, status); 9713 } 9714 free(ctx); 9715 } 9716 9717 static void 9718 bdev_disable_qos_done(void *cb_arg) 9719 { 9720 struct set_qos_limit_ctx *ctx = cb_arg; 9721 struct spdk_bdev *bdev = ctx->bdev; 9722 struct spdk_bdev_qos *qos; 9723 9724 spdk_spin_lock(&bdev->internal.spinlock); 9725 qos = bdev->internal.qos; 9726 bdev->internal.qos = NULL; 9727 spdk_spin_unlock(&bdev->internal.spinlock); 9728 9729 if (qos->thread != NULL) { 9730 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 9731 spdk_poller_unregister(&qos->poller); 9732 } 9733 9734 free(qos); 9735 9736 bdev_set_qos_limit_done(ctx, 0); 9737 } 9738 9739 static void 9740 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 9741 { 9742 struct set_qos_limit_ctx *ctx = _ctx; 9743 struct spdk_thread *thread; 9744 9745 spdk_spin_lock(&bdev->internal.spinlock); 9746 thread = bdev->internal.qos->thread; 9747 spdk_spin_unlock(&bdev->internal.spinlock); 9748 9749 if (thread != NULL) { 9750 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 9751 } else { 9752 bdev_disable_qos_done(ctx); 9753 } 9754 } 9755 9756 static void 9757 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9758 struct spdk_io_channel *ch, void *_ctx) 9759 { 9760 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9761 struct spdk_bdev_io *bdev_io; 9762 9763 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 9764 9765 while (!TAILQ_EMPTY(&bdev_ch->qos_queued_io)) { 9766 /* Re-submit the queued I/O. 
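 * Since BDEV_CH_QOS_ENABLED was cleared above, these I/O now go through the regular
 * submission path.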
*/ 9767 bdev_io = TAILQ_FIRST(&bdev_ch->qos_queued_io); 9768 TAILQ_REMOVE(&bdev_ch->qos_queued_io, bdev_io, internal.link); 9769 _bdev_io_submit(bdev_io); 9770 } 9771 9772 spdk_bdev_for_each_channel_continue(i, 0); 9773 } 9774 9775 static void 9776 bdev_update_qos_rate_limit_msg(void *cb_arg) 9777 { 9778 struct set_qos_limit_ctx *ctx = cb_arg; 9779 struct spdk_bdev *bdev = ctx->bdev; 9780 9781 spdk_spin_lock(&bdev->internal.spinlock); 9782 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 9783 spdk_spin_unlock(&bdev->internal.spinlock); 9784 9785 bdev_set_qos_limit_done(ctx, 0); 9786 } 9787 9788 static void 9789 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9790 struct spdk_io_channel *ch, void *_ctx) 9791 { 9792 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9793 9794 spdk_spin_lock(&bdev->internal.spinlock); 9795 bdev_enable_qos(bdev, bdev_ch); 9796 spdk_spin_unlock(&bdev->internal.spinlock); 9797 spdk_bdev_for_each_channel_continue(i, 0); 9798 } 9799 9800 static void 9801 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 9802 { 9803 struct set_qos_limit_ctx *ctx = _ctx; 9804 9805 bdev_set_qos_limit_done(ctx, status); 9806 } 9807 9808 static void 9809 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 9810 { 9811 int i; 9812 9813 assert(bdev->internal.qos != NULL); 9814 9815 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9816 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9817 bdev->internal.qos->rate_limits[i].limit = limits[i]; 9818 9819 if (limits[i] == 0) { 9820 bdev->internal.qos->rate_limits[i].limit = 9821 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 9822 } 9823 } 9824 } 9825 } 9826 9827 void 9828 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 9829 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 9830 { 9831 struct set_qos_limit_ctx *ctx; 9832 uint32_t limit_set_complement; 9833 uint64_t min_limit_per_sec; 9834 int i; 9835 bool disable_rate_limit = true; 9836 9837 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9838 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9839 continue; 9840 } 9841 9842 if (limits[i] > 0) { 9843 disable_rate_limit = false; 9844 } 9845 9846 if (bdev_qos_is_iops_rate_limit(i) == true) { 9847 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 9848 } else { 9849 if (limits[i] > SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC) { 9850 SPDK_WARNLOG("Requested rate limit %" PRIu64 " will result in uint64_t overflow, " 9851 "reset to %" PRIu64 "\n", limits[i], SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC); 9852 limits[i] = SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC; 9853 } 9854 /* Change from megabyte to byte rate limit */ 9855 limits[i] = limits[i] * 1024 * 1024; 9856 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 9857 } 9858 9859 limit_set_complement = limits[i] % min_limit_per_sec; 9860 if (limit_set_complement) { 9861 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 9862 limits[i], min_limit_per_sec); 9863 limits[i] += min_limit_per_sec - limit_set_complement; 9864 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 9865 } 9866 } 9867 9868 ctx = calloc(1, sizeof(*ctx)); 9869 if (ctx == NULL) { 9870 cb_fn(cb_arg, -ENOMEM); 9871 return; 9872 } 9873 9874 ctx->cb_fn = cb_fn; 9875 ctx->cb_arg = cb_arg; 9876 ctx->bdev = bdev; 9877 9878 spdk_spin_lock(&bdev->internal.spinlock); 9879 if (bdev->internal.qos_mod_in_progress) { 9880 spdk_spin_unlock(&bdev->internal.spinlock); 9881 free(ctx); 9882 cb_fn(cb_arg, 
-EAGAIN); 9883 return; 9884 } 9885 bdev->internal.qos_mod_in_progress = true; 9886 9887 if (disable_rate_limit == true && bdev->internal.qos) { 9888 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9889 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 9890 (bdev->internal.qos->rate_limits[i].limit > 0 && 9891 bdev->internal.qos->rate_limits[i].limit != 9892 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 9893 disable_rate_limit = false; 9894 break; 9895 } 9896 } 9897 } 9898 9899 if (disable_rate_limit == false) { 9900 if (bdev->internal.qos == NULL) { 9901 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 9902 if (!bdev->internal.qos) { 9903 spdk_spin_unlock(&bdev->internal.spinlock); 9904 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 9905 bdev_set_qos_limit_done(ctx, -ENOMEM); 9906 return; 9907 } 9908 } 9909 9910 if (bdev->internal.qos->thread == NULL) { 9911 /* Enabling */ 9912 bdev_set_qos_rate_limits(bdev, limits); 9913 9914 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 9915 bdev_enable_qos_done); 9916 } else { 9917 /* Updating */ 9918 bdev_set_qos_rate_limits(bdev, limits); 9919 9920 spdk_thread_send_msg(bdev->internal.qos->thread, 9921 bdev_update_qos_rate_limit_msg, ctx); 9922 } 9923 } else { 9924 if (bdev->internal.qos != NULL) { 9925 bdev_set_qos_rate_limits(bdev, limits); 9926 9927 /* Disabling */ 9928 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 9929 bdev_disable_qos_msg_done); 9930 } else { 9931 spdk_spin_unlock(&bdev->internal.spinlock); 9932 bdev_set_qos_limit_done(ctx, 0); 9933 return; 9934 } 9935 } 9936 9937 spdk_spin_unlock(&bdev->internal.spinlock); 9938 } 9939 9940 struct spdk_bdev_histogram_ctx { 9941 spdk_bdev_histogram_status_cb cb_fn; 9942 void *cb_arg; 9943 struct spdk_bdev *bdev; 9944 int status; 9945 }; 9946 9947 static void 9948 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9949 { 9950 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9951 9952 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9953 ctx->bdev->internal.histogram_in_progress = false; 9954 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9955 ctx->cb_fn(ctx->cb_arg, ctx->status); 9956 free(ctx); 9957 } 9958 9959 static void 9960 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9961 struct spdk_io_channel *_ch, void *_ctx) 9962 { 9963 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9964 9965 if (ch->histogram != NULL) { 9966 spdk_histogram_data_free(ch->histogram); 9967 ch->histogram = NULL; 9968 } 9969 spdk_bdev_for_each_channel_continue(i, 0); 9970 } 9971 9972 static void 9973 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9974 { 9975 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9976 9977 if (status != 0) { 9978 ctx->status = status; 9979 ctx->bdev->internal.histogram_enabled = false; 9980 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 9981 bdev_histogram_disable_channel_cb); 9982 } else { 9983 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9984 ctx->bdev->internal.histogram_in_progress = false; 9985 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9986 ctx->cb_fn(ctx->cb_arg, ctx->status); 9987 free(ctx); 9988 } 9989 } 9990 9991 static void 9992 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9993 struct spdk_io_channel *_ch, void *_ctx) 9994 { 9995 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9996 int status = 0; 9997 9998 if (ch->histogram == NULL) { 9999 
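	/* Lazily allocate a per-channel histogram; if allocation fails, the -ENOMEM status
	 * below is propagated through the channel iterator and the enable operation is
	 * rolled back in bdev_histogram_enable_channel_cb.
	 */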
ch->histogram = spdk_histogram_data_alloc(); 10000 if (ch->histogram == NULL) { 10001 status = -ENOMEM; 10002 } 10003 } 10004 10005 spdk_bdev_for_each_channel_continue(i, status); 10006 } 10007 10008 void 10009 spdk_bdev_histogram_enable_ext(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 10010 void *cb_arg, bool enable, struct spdk_bdev_enable_histogram_opts *opts) 10011 { 10012 struct spdk_bdev_histogram_ctx *ctx; 10013 10014 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 10015 if (ctx == NULL) { 10016 cb_fn(cb_arg, -ENOMEM); 10017 return; 10018 } 10019 10020 ctx->bdev = bdev; 10021 ctx->status = 0; 10022 ctx->cb_fn = cb_fn; 10023 ctx->cb_arg = cb_arg; 10024 10025 spdk_spin_lock(&bdev->internal.spinlock); 10026 if (bdev->internal.histogram_in_progress) { 10027 spdk_spin_unlock(&bdev->internal.spinlock); 10028 free(ctx); 10029 cb_fn(cb_arg, -EAGAIN); 10030 return; 10031 } 10032 10033 bdev->internal.histogram_in_progress = true; 10034 spdk_spin_unlock(&bdev->internal.spinlock); 10035 10036 bdev->internal.histogram_enabled = enable; 10037 bdev->internal.histogram_io_type = opts->io_type; 10038 10039 if (enable) { 10040 /* Allocate histogram for each channel */ 10041 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 10042 bdev_histogram_enable_channel_cb); 10043 } else { 10044 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 10045 bdev_histogram_disable_channel_cb); 10046 } 10047 } 10048 10049 void 10050 spdk_bdev_enable_histogram_opts_init(struct spdk_bdev_enable_histogram_opts *opts, size_t size) 10051 { 10052 if (opts == NULL) { 10053 SPDK_ERRLOG("opts should not be NULL\n"); 10054 assert(opts != NULL); 10055 return; 10056 } 10057 if (size == 0) { 10058 SPDK_ERRLOG("size should not be zero\n"); 10059 assert(size != 0); 10060 return; 10061 } 10062 10063 memset(opts, 0, size); 10064 opts->size = size; 10065 10066 #define FIELD_OK(field) \ 10067 offsetof(struct spdk_bdev_enable_histogram_opts, field) + sizeof(opts->field) <= size 10068 10069 #define SET_FIELD(field, value) \ 10070 if (FIELD_OK(field)) { \ 10071 opts->field = value; \ 10072 } \ 10073 10074 SET_FIELD(io_type, 0); 10075 10076 /* You should not remove this statement, but need to update the assert statement 10077 * if you add a new field, and also add a corresponding SET_FIELD statement */ 10078 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_enable_histogram_opts) == 9, "Incorrect size"); 10079 10080 #undef FIELD_OK 10081 #undef SET_FIELD 10082 } 10083 10084 void 10085 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 10086 void *cb_arg, bool enable) 10087 { 10088 struct spdk_bdev_enable_histogram_opts opts; 10089 10090 spdk_bdev_enable_histogram_opts_init(&opts, sizeof(opts)); 10091 spdk_bdev_histogram_enable_ext(bdev, cb_fn, cb_arg, enable, &opts); 10092 } 10093 10094 struct spdk_bdev_histogram_data_ctx { 10095 spdk_bdev_histogram_data_cb cb_fn; 10096 void *cb_arg; 10097 struct spdk_bdev *bdev; 10098 /** merged histogram data from all channels */ 10099 struct spdk_histogram_data *histogram; 10100 }; 10101 10102 static void 10103 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 10104 { 10105 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 10106 10107 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 10108 free(ctx); 10109 } 10110 10111 static void 10112 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10113 struct spdk_io_channel *_ch, void *_ctx) 10114 { 10115 struct 
spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10116 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 10117 int status = 0; 10118 10119 if (ch->histogram == NULL) { 10120 status = -EFAULT; 10121 } else { 10122 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 10123 } 10124 10125 spdk_bdev_for_each_channel_continue(i, status); 10126 } 10127 10128 void 10129 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 10130 spdk_bdev_histogram_data_cb cb_fn, 10131 void *cb_arg) 10132 { 10133 struct spdk_bdev_histogram_data_ctx *ctx; 10134 10135 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 10136 if (ctx == NULL) { 10137 cb_fn(cb_arg, -ENOMEM, NULL); 10138 return; 10139 } 10140 10141 ctx->bdev = bdev; 10142 ctx->cb_fn = cb_fn; 10143 ctx->cb_arg = cb_arg; 10144 10145 ctx->histogram = histogram; 10146 10147 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 10148 bdev_histogram_get_channel_cb); 10149 } 10150 10151 void 10152 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 10153 void *cb_arg) 10154 { 10155 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 10156 int status = 0; 10157 10158 assert(cb_fn != NULL); 10159 10160 if (bdev_ch->histogram == NULL) { 10161 status = -EFAULT; 10162 } 10163 cb_fn(cb_arg, status, bdev_ch->histogram); 10164 } 10165 10166 size_t 10167 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 10168 size_t max_events) 10169 { 10170 struct media_event_entry *entry; 10171 size_t num_events = 0; 10172 10173 for (; num_events < max_events; ++num_events) { 10174 entry = TAILQ_FIRST(&desc->pending_media_events); 10175 if (entry == NULL) { 10176 break; 10177 } 10178 10179 events[num_events] = entry->event; 10180 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 10181 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 10182 } 10183 10184 return num_events; 10185 } 10186 10187 int 10188 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 10189 size_t num_events) 10190 { 10191 struct spdk_bdev_desc *desc; 10192 struct media_event_entry *entry; 10193 size_t event_id; 10194 int rc = 0; 10195 10196 assert(bdev->media_events); 10197 10198 spdk_spin_lock(&bdev->internal.spinlock); 10199 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 10200 if (desc->write) { 10201 break; 10202 } 10203 } 10204 10205 if (desc == NULL || desc->media_events_buffer == NULL) { 10206 rc = -ENODEV; 10207 goto out; 10208 } 10209 10210 for (event_id = 0; event_id < num_events; ++event_id) { 10211 entry = TAILQ_FIRST(&desc->free_media_events); 10212 if (entry == NULL) { 10213 break; 10214 } 10215 10216 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 10217 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 10218 entry->event = events[event_id]; 10219 } 10220 10221 rc = event_id; 10222 out: 10223 spdk_spin_unlock(&bdev->internal.spinlock); 10224 return rc; 10225 } 10226 10227 static void 10228 _media_management_notify(void *arg) 10229 { 10230 struct spdk_bdev_desc *desc = arg; 10231 10232 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 10233 } 10234 10235 void 10236 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 10237 { 10238 struct spdk_bdev_desc *desc; 10239 10240 spdk_spin_lock(&bdev->internal.spinlock); 10241 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 10242 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 10243 event_notify(desc, 
_media_management_notify); 10244 } 10245 } 10246 spdk_spin_unlock(&bdev->internal.spinlock); 10247 } 10248 10249 struct locked_lba_range_ctx { 10250 struct lba_range range; 10251 struct lba_range *current_range; 10252 struct lba_range *owner_range; 10253 struct spdk_poller *poller; 10254 lock_range_cb cb_fn; 10255 void *cb_arg; 10256 }; 10257 10258 static void 10259 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 10260 { 10261 struct locked_lba_range_ctx *ctx = _ctx; 10262 10263 ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM); 10264 free(ctx); 10265 } 10266 10267 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 10268 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 10269 10270 static void 10271 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 10272 { 10273 struct locked_lba_range_ctx *ctx = _ctx; 10274 10275 if (status == -ENOMEM) { 10276 /* One of the channels could not allocate a range object. 10277 * So we have to go back and clean up any ranges that were 10278 * allocated successfully before we return error status to 10279 * the caller. We can reuse the unlock function to do that 10280 * clean up. 10281 */ 10282 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 10283 bdev_lock_error_cleanup_cb); 10284 return; 10285 } 10286 10287 /* All channels have locked this range and no I/O overlapping the range 10288 * are outstanding! Set the owner_ch for the range object for the 10289 * locking channel, so that this channel will know that it is allowed 10290 * to write to this range. 10291 */ 10292 if (ctx->owner_range != NULL) { 10293 ctx->owner_range->owner_ch = ctx->range.owner_ch; 10294 } 10295 10296 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 10297 10298 /* Don't free the ctx here. Its range is in the bdev's global list of 10299 * locked ranges still, and will be removed and freed when this range 10300 * is later unlocked. 10301 */ 10302 } 10303 10304 static int 10305 bdev_lock_lba_range_check_io(void *_i) 10306 { 10307 struct spdk_bdev_channel_iter *i = _i; 10308 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 10309 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10310 struct locked_lba_range_ctx *ctx = i->ctx; 10311 struct lba_range *range = ctx->current_range; 10312 struct spdk_bdev_io *bdev_io; 10313 10314 spdk_poller_unregister(&ctx->poller); 10315 10316 /* The range is now in the locked_ranges, so no new IO can be submitted to this 10317 * range. But we need to wait until any outstanding IO overlapping with this range 10318 * are completed. 
10319 */ 10320 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 10321 if (bdev_io_range_is_locked(bdev_io, range)) { 10322 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 10323 return SPDK_POLLER_BUSY; 10324 } 10325 } 10326 10327 spdk_bdev_for_each_channel_continue(i, 0); 10328 return SPDK_POLLER_BUSY; 10329 } 10330 10331 static void 10332 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10333 struct spdk_io_channel *_ch, void *_ctx) 10334 { 10335 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10336 struct locked_lba_range_ctx *ctx = _ctx; 10337 struct lba_range *range; 10338 10339 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10340 if (range->length == ctx->range.length && 10341 range->offset == ctx->range.offset && 10342 range->locked_ctx == ctx->range.locked_ctx) { 10343 /* This range already exists on this channel, so don't add 10344 * it again. This can happen when a new channel is created 10345 * while the for_each_channel operation is in progress. 10346 * Do not check for outstanding I/O in that case, since the 10347 * range was locked before any I/O could be submitted to the 10348 * new channel. 10349 */ 10350 spdk_bdev_for_each_channel_continue(i, 0); 10351 return; 10352 } 10353 } 10354 10355 range = calloc(1, sizeof(*range)); 10356 if (range == NULL) { 10357 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 10358 return; 10359 } 10360 10361 range->length = ctx->range.length; 10362 range->offset = ctx->range.offset; 10363 range->locked_ctx = ctx->range.locked_ctx; 10364 range->quiesce = ctx->range.quiesce; 10365 ctx->current_range = range; 10366 if (ctx->range.owner_ch == ch) { 10367 /* This is the range object for the channel that will hold 10368 * the lock. Store it in the ctx object so that we can easily 10369 * set its owner_ch after the lock is finally acquired. 10370 */ 10371 ctx->owner_range = range; 10372 } 10373 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 10374 bdev_lock_lba_range_check_io(i); 10375 } 10376 10377 static void 10378 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 10379 { 10380 assert(spdk_get_thread() == ctx->range.owner_thread); 10381 assert(ctx->range.owner_ch == NULL || 10382 spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread); 10383 10384 /* We will add a copy of this range to each channel now. 
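 * Each channel waits for any outstanding I/O overlapping the range to complete before
 * continuing, and bdev_lock_lba_range_cb runs once every channel has finished.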
*/ 10385 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 10386 bdev_lock_lba_range_cb); 10387 } 10388 10389 static bool 10390 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 10391 { 10392 struct lba_range *r; 10393 10394 TAILQ_FOREACH(r, tailq, tailq) { 10395 if (bdev_lba_range_overlapped(range, r)) { 10396 return true; 10397 } 10398 } 10399 return false; 10400 } 10401 10402 static void bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status); 10403 10404 static int 10405 _bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch, 10406 uint64_t offset, uint64_t length, 10407 lock_range_cb cb_fn, void *cb_arg) 10408 { 10409 struct locked_lba_range_ctx *ctx; 10410 10411 ctx = calloc(1, sizeof(*ctx)); 10412 if (ctx == NULL) { 10413 return -ENOMEM; 10414 } 10415 10416 ctx->range.offset = offset; 10417 ctx->range.length = length; 10418 ctx->range.owner_thread = spdk_get_thread(); 10419 ctx->range.owner_ch = ch; 10420 ctx->range.locked_ctx = cb_arg; 10421 ctx->range.bdev = bdev; 10422 ctx->range.quiesce = (cb_fn == bdev_quiesce_range_locked); 10423 ctx->cb_fn = cb_fn; 10424 ctx->cb_arg = cb_arg; 10425 10426 spdk_spin_lock(&bdev->internal.spinlock); 10427 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 10428 /* There is an active lock overlapping with this range. 10429 * Put it on the pending list until this range no 10430 * longer overlaps with another. 10431 */ 10432 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 10433 } else { 10434 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 10435 bdev_lock_lba_range_ctx(bdev, ctx); 10436 } 10437 spdk_spin_unlock(&bdev->internal.spinlock); 10438 return 0; 10439 } 10440 10441 static int 10442 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 10443 uint64_t offset, uint64_t length, 10444 lock_range_cb cb_fn, void *cb_arg) 10445 { 10446 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10447 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10448 10449 if (cb_arg == NULL) { 10450 SPDK_ERRLOG("cb_arg must not be NULL\n"); 10451 return -EINVAL; 10452 } 10453 10454 return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg); 10455 } 10456 10457 static void 10458 bdev_lock_lba_range_ctx_msg(void *_ctx) 10459 { 10460 struct locked_lba_range_ctx *ctx = _ctx; 10461 10462 bdev_lock_lba_range_ctx(ctx->range.bdev, ctx); 10463 } 10464 10465 static void 10466 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 10467 { 10468 struct locked_lba_range_ctx *ctx = _ctx; 10469 struct locked_lba_range_ctx *pending_ctx; 10470 struct lba_range *range, *tmp; 10471 10472 spdk_spin_lock(&bdev->internal.spinlock); 10473 /* Check if there are any pending locked ranges that overlap with this range 10474 * that was just unlocked. If there are, check that it doesn't overlap with any 10475 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 10476 * the lock process. 
10477 */ 10478 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 10479 if (bdev_lba_range_overlapped(range, &ctx->range) && 10480 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 10481 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 10482 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 10483 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 10484 spdk_thread_send_msg(pending_ctx->range.owner_thread, 10485 bdev_lock_lba_range_ctx_msg, pending_ctx); 10486 } 10487 } 10488 spdk_spin_unlock(&bdev->internal.spinlock); 10489 10490 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 10491 free(ctx); 10492 } 10493 10494 static void 10495 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10496 struct spdk_io_channel *_ch, void *_ctx) 10497 { 10498 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10499 struct locked_lba_range_ctx *ctx = _ctx; 10500 TAILQ_HEAD(, spdk_bdev_io) io_locked; 10501 struct spdk_bdev_io *bdev_io; 10502 struct lba_range *range; 10503 10504 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10505 if (ctx->range.offset == range->offset && 10506 ctx->range.length == range->length && 10507 ctx->range.locked_ctx == range->locked_ctx) { 10508 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 10509 free(range); 10510 break; 10511 } 10512 } 10513 10514 /* Note: we should almost always be able to assert that the range specified 10515 * was found. But there are some very rare corner cases where a new channel 10516 * gets created simultaneously with a range unlock, where this function 10517 * would execute on that new channel and wouldn't have the range. 10518 * We also use this to clean up range allocations when a later allocation 10519 * fails in the locking path. 10520 * So we can't actually assert() here. 10521 */ 10522 10523 /* Swap the locked IO into a temporary list, and then try to submit them again. 10524 * We could hyper-optimize this to only resubmit locked I/O that overlap 10525 * with the range that was just unlocked, but this isn't a performance path so 10526 * we go for simplicity here. 10527 */ 10528 TAILQ_INIT(&io_locked); 10529 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 10530 while (!TAILQ_EMPTY(&io_locked)) { 10531 bdev_io = TAILQ_FIRST(&io_locked); 10532 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 10533 bdev_io_submit(bdev_io); 10534 } 10535 10536 spdk_bdev_for_each_channel_continue(i, 0); 10537 } 10538 10539 static int 10540 _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length, 10541 lock_range_cb cb_fn, void *cb_arg) 10542 { 10543 struct locked_lba_range_ctx *ctx; 10544 struct lba_range *range; 10545 10546 spdk_spin_lock(&bdev->internal.spinlock); 10547 /* To start the unlock the process, we find the range in the bdev's locked_ranges 10548 * and remove it. This ensures new channels don't inherit the locked range. 10549 * Then we will send a message to each channel to remove the range from its 10550 * per-channel list. 
10551 */ 10552 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 10553 if (range->offset == offset && range->length == length && 10554 (range->owner_ch == NULL || range->locked_ctx == cb_arg)) { 10555 break; 10556 } 10557 } 10558 if (range == NULL) { 10559 assert(false); 10560 spdk_spin_unlock(&bdev->internal.spinlock); 10561 return -EINVAL; 10562 } 10563 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 10564 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 10565 spdk_spin_unlock(&bdev->internal.spinlock); 10566 10567 ctx->cb_fn = cb_fn; 10568 ctx->cb_arg = cb_arg; 10569 10570 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 10571 bdev_unlock_lba_range_cb); 10572 return 0; 10573 } 10574 10575 static int 10576 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 10577 uint64_t offset, uint64_t length, 10578 lock_range_cb cb_fn, void *cb_arg) 10579 { 10580 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10581 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10582 struct lba_range *range; 10583 bool range_found = false; 10584 10585 /* Let's make sure the specified channel actually has a lock on 10586 * the specified range. Note that the range must match exactly. 10587 */ 10588 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10589 if (range->offset == offset && range->length == length && 10590 range->owner_ch == ch && range->locked_ctx == cb_arg) { 10591 range_found = true; 10592 break; 10593 } 10594 } 10595 10596 if (!range_found) { 10597 return -EINVAL; 10598 } 10599 10600 return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg); 10601 } 10602 10603 struct bdev_quiesce_ctx { 10604 spdk_bdev_quiesce_cb cb_fn; 10605 void *cb_arg; 10606 }; 10607 10608 static void 10609 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status) 10610 { 10611 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 10612 10613 if (quiesce_ctx->cb_fn != NULL) { 10614 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 10615 } 10616 10617 free(quiesce_ctx); 10618 } 10619 10620 static void 10621 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status) 10622 { 10623 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 10624 struct spdk_bdev_module *module = range->bdev->module; 10625 10626 if (status != 0) { 10627 if (quiesce_ctx->cb_fn != NULL) { 10628 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 10629 } 10630 free(quiesce_ctx); 10631 return; 10632 } 10633 10634 spdk_spin_lock(&module->internal.spinlock); 10635 TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module); 10636 spdk_spin_unlock(&module->internal.spinlock); 10637 10638 if (quiesce_ctx->cb_fn != NULL) { 10639 /* copy the context in case the range is unlocked by the callback */ 10640 struct bdev_quiesce_ctx tmp = *quiesce_ctx; 10641 10642 quiesce_ctx->cb_fn = NULL; 10643 quiesce_ctx->cb_arg = NULL; 10644 10645 tmp.cb_fn(tmp.cb_arg, status); 10646 } 10647 /* quiesce_ctx will be freed on unquiesce */ 10648 } 10649 10650 static int 10651 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10652 uint64_t offset, uint64_t length, 10653 spdk_bdev_quiesce_cb cb_fn, void *cb_arg, 10654 bool unquiesce) 10655 { 10656 struct bdev_quiesce_ctx *quiesce_ctx; 10657 int rc; 10658 10659 if (module != bdev->module) { 10660 SPDK_ERRLOG("Bdev does not belong to specified module.\n"); 10661 return -EINVAL; 10662 } 10663 10664 if (!bdev_io_valid_blocks(bdev, offset, length)) { 10665 return -EINVAL; 10666 } 
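	/* Two paths follow: the unquiesce path looks up the range previously quiesced by this
	 * module and reuses its context to unlock it, while the quiesce path allocates a new
	 * context and locks the range, deferring the callback until all overlapping
	 * outstanding I/O has completed.
	 */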
10667 10668 if (unquiesce) { 10669 struct lba_range *range; 10670 10671 /* Make sure the specified range is actually quiesced in the specified module and 10672 * then remove it from the list. Note that the range must match exactly. 10673 */ 10674 spdk_spin_lock(&module->internal.spinlock); 10675 TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) { 10676 if (range->bdev == bdev && range->offset == offset && range->length == length) { 10677 TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module); 10678 break; 10679 } 10680 } 10681 spdk_spin_unlock(&module->internal.spinlock); 10682 10683 if (range == NULL) { 10684 SPDK_ERRLOG("The range to unquiesce was not found.\n"); 10685 return -EINVAL; 10686 } 10687 10688 quiesce_ctx = range->locked_ctx; 10689 quiesce_ctx->cb_fn = cb_fn; 10690 quiesce_ctx->cb_arg = cb_arg; 10691 10692 rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx); 10693 } else { 10694 quiesce_ctx = malloc(sizeof(*quiesce_ctx)); 10695 if (quiesce_ctx == NULL) { 10696 return -ENOMEM; 10697 } 10698 10699 quiesce_ctx->cb_fn = cb_fn; 10700 quiesce_ctx->cb_arg = cb_arg; 10701 10702 rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx); 10703 if (rc != 0) { 10704 free(quiesce_ctx); 10705 } 10706 } 10707 10708 return rc; 10709 } 10710 10711 int 10712 spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10713 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10714 { 10715 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false); 10716 } 10717 10718 int 10719 spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10720 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10721 { 10722 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true); 10723 } 10724 10725 int 10726 spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10727 uint64_t offset, uint64_t length, 10728 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10729 { 10730 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false); 10731 } 10732 10733 int 10734 spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10735 uint64_t offset, uint64_t length, 10736 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10737 { 10738 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true); 10739 } 10740 10741 int 10742 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 10743 int array_size) 10744 { 10745 if (!bdev) { 10746 return -EINVAL; 10747 } 10748 10749 if (bdev->fn_table->get_memory_domains) { 10750 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 10751 } 10752 10753 return 0; 10754 } 10755 10756 struct spdk_bdev_for_each_io_ctx { 10757 void *ctx; 10758 spdk_bdev_io_fn fn; 10759 spdk_bdev_for_each_io_cb cb; 10760 }; 10761 10762 static void 10763 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10764 struct spdk_io_channel *io_ch, void *_ctx) 10765 { 10766 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10767 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 10768 struct spdk_bdev_io *bdev_io; 10769 int rc = 0; 10770 10771 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 10772 rc = ctx->fn(ctx->ctx, bdev_io); 10773 if (rc != 0) { 10774 break; 10775 } 10776 } 10777 10778 spdk_bdev_for_each_channel_continue(i, rc); 10779 } 10780 10781 static void 10782 
bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 10783 { 10784 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10785 10786 ctx->cb(ctx->ctx, status); 10787 10788 free(ctx); 10789 } 10790 10791 void 10792 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 10793 spdk_bdev_for_each_io_cb cb) 10794 { 10795 struct spdk_bdev_for_each_io_ctx *ctx; 10796 10797 assert(fn != NULL && cb != NULL); 10798 10799 ctx = calloc(1, sizeof(*ctx)); 10800 if (ctx == NULL) { 10801 SPDK_ERRLOG("Failed to allocate context.\n"); 10802 cb(_ctx, -ENOMEM); 10803 return; 10804 } 10805 10806 ctx->ctx = _ctx; 10807 ctx->fn = fn; 10808 ctx->cb = cb; 10809 10810 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 10811 bdev_for_each_io_done); 10812 } 10813 10814 void 10815 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 10816 { 10817 spdk_for_each_channel_continue(iter->i, status); 10818 } 10819 10820 static struct spdk_bdev * 10821 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 10822 { 10823 void *io_device = spdk_io_channel_iter_get_io_device(i); 10824 10825 return __bdev_from_io_dev(io_device); 10826 } 10827 10828 static void 10829 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 10830 { 10831 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10832 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10833 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 10834 10835 iter->i = i; 10836 iter->fn(iter, bdev, ch, iter->ctx); 10837 } 10838 10839 static void 10840 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 10841 { 10842 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10843 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10844 10845 iter->i = i; 10846 iter->cpl(bdev, iter->ctx, status); 10847 10848 free(iter); 10849 } 10850 10851 void 10852 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 10853 void *ctx, spdk_bdev_for_each_channel_done cpl) 10854 { 10855 struct spdk_bdev_channel_iter *iter; 10856 10857 assert(bdev != NULL && fn != NULL && ctx != NULL); 10858 10859 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 10860 if (iter == NULL) { 10861 SPDK_ERRLOG("Unable to allocate iterator\n"); 10862 assert(false); 10863 return; 10864 } 10865 10866 iter->fn = fn; 10867 iter->cpl = cpl; 10868 iter->ctx = ctx; 10869 10870 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 10871 iter, bdev_each_channel_cpl); 10872 } 10873 10874 static void 10875 bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10876 { 10877 struct spdk_bdev_io *parent_io = cb_arg; 10878 10879 spdk_bdev_free_io(bdev_io); 10880 10881 /* Check return status of write */ 10882 parent_io->internal.status = success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 10883 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 10884 } 10885 10886 static void 10887 bdev_copy_do_write(void *_bdev_io) 10888 { 10889 struct spdk_bdev_io *bdev_io = _bdev_io; 10890 int rc; 10891 10892 /* Write blocks */ 10893 rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc, 10894 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10895 bdev_io->u.bdev.iovs[0].iov_base, 10896 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks, 10897 bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io); 10898 10899 if (rc == -ENOMEM) { 10900 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write); 10901 } else if (rc != 0) { 10902 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10903 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10904 } 10905 } 10906 10907 static void 10908 bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10909 { 10910 struct spdk_bdev_io *parent_io = cb_arg; 10911 10912 spdk_bdev_free_io(bdev_io); 10913 10914 /* Check return status of read */ 10915 if (!success) { 10916 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10917 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 10918 return; 10919 } 10920 10921 /* Do write */ 10922 bdev_copy_do_write(parent_io); 10923 } 10924 10925 static void 10926 bdev_copy_do_read(void *_bdev_io) 10927 { 10928 struct spdk_bdev_io *bdev_io = _bdev_io; 10929 int rc; 10930 10931 /* Read blocks */ 10932 rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc, 10933 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10934 bdev_io->u.bdev.iovs[0].iov_base, 10935 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks, 10936 bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io); 10937 10938 if (rc == -ENOMEM) { 10939 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read); 10940 } else if (rc != 0) { 10941 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10942 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10943 } 10944 } 10945 10946 static void 10947 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 10948 { 10949 if (!success) { 10950 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10951 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10952 return; 10953 } 10954 10955 bdev_copy_do_read(bdev_io); 10956 } 10957 10958 int 10959 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 10960 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 10961 spdk_bdev_io_completion_cb cb, void *cb_arg) 10962 { 10963 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10964 struct spdk_bdev_io *bdev_io; 10965 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 10966 10967 if (!desc->write) { 10968 return -EBADF; 10969 } 10970 10971 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 10972 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 10973 SPDK_DEBUGLOG(bdev, 10974 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 10975 dst_offset_blocks, src_offset_blocks, num_blocks); 10976 return -EINVAL; 10977 } 10978 10979 bdev_io = bdev_channel_get_io(channel); 10980 if (!bdev_io) { 10981 return -ENOMEM; 10982 } 10983 10984 bdev_io->internal.ch = channel; 10985 bdev_io->internal.desc = desc; 10986 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 10987 10988 
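	/* For a copy request, offset_blocks holds the destination offset; the source offset is
	 * carried separately in u.bdev.copy.src_offset_blocks.
	 */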
bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 10989 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 10990 bdev_io->u.bdev.num_blocks = num_blocks; 10991 bdev_io->u.bdev.memory_domain = NULL; 10992 bdev_io->u.bdev.memory_domain_ctx = NULL; 10993 bdev_io->u.bdev.iovs = NULL; 10994 bdev_io->u.bdev.iovcnt = 0; 10995 bdev_io->u.bdev.md_buf = NULL; 10996 bdev_io->u.bdev.accel_sequence = NULL; 10997 bdev_io_init(bdev_io, bdev, cb_arg, cb); 10998 10999 if (dst_offset_blocks == src_offset_blocks || num_blocks == 0) { 11000 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 11001 return 0; 11002 } 11003 11004 11005 /* If the copy size is large and should be split, use the generic split logic 11006 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not. 11007 * 11008 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or 11009 * emulate it using regular read and write requests otherwise. 11010 */ 11011 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) || 11012 bdev_io->internal.f.split) { 11013 bdev_io_submit(bdev_io); 11014 return 0; 11015 } 11016 11017 spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev)); 11018 11019 return 0; 11020 } 11021 11022 SPDK_LOG_REGISTER_COMPONENT(bdev) 11023 11024 static void 11025 bdev_trace(void) 11026 { 11027 struct spdk_trace_tpoint_opts opts[] = { 11028 { 11029 "BDEV_IO_START", TRACE_BDEV_IO_START, 11030 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 1, 11031 { 11032 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 11033 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 11034 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 11035 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 11036 } 11037 }, 11038 { 11039 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 11040 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 0, 11041 { 11042 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 11043 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 11044 } 11045 }, 11046 { 11047 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 11048 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 11049 { 11050 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 11051 } 11052 }, 11053 { 11054 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 11055 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 11056 { 11057 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 11058 } 11059 }, 11060 }; 11061 11062 11063 spdk_trace_register_owner_type(OWNER_TYPE_BDEV, 'b'); 11064 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 11065 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 11066 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 11067 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 11068 spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_START, OBJECT_BDEV_IO, 0); 11069 spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_COMPLETE, OBJECT_BDEV_IO, 0); 11070 spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_START, OBJECT_BDEV_IO, 0); 11071 spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_DONE, OBJECT_BDEV_IO, 0); 11072 } 11073 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 11074
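/*
 * Minimal usage sketch (illustration only, not part of the original source): issuing a
 * block copy on an already opened bdev descriptor. The function names, offsets and block
 * counts below are hypothetical placeholders chosen for the example; the descriptor must
 * have been opened for writing for the copy to be accepted.
 *
 *	static void
 *	example_copy_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		// Always release the bdev_io once the completion has been observed.
 *		spdk_bdev_free_io(bdev_io);
 *		SPDK_NOTICELOG("copy %s\n", success ? "succeeded" : "failed");
 *	}
 *
 *	static int
 *	example_submit_copy(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch)
 *	{
 *		// Copy 16 blocks starting at source block 0 to the range starting at
 *		// destination block 1024.
 *		return spdk_bdev_copy_blocks(desc, ch, 1024, 0, 16, example_copy_done, NULL);
 *	}
 */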