/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"
#include "spdk/dma.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"
#include "spdk_internal/assert.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define SPDK_BDEV_AUTO_EXAMINE			true
#define BUF_SMALL_CACHE_SIZE			128
#define BUF_LARGE_CACHE_SIZE			16
#define NOMEM_THRESHOLD_COUNT			8

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC	(UINT64_MAX / (1024 * 1024))
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000

/* The maximum number of child requests that a UNMAP or WRITE ZEROES command
 * is split into at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD	1000000
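
/* Note (added commentary, not from the original source): SPDK_BDEV_IO_POOL_SIZE
 * is 64 * 1024 - 1 = 65535, i.e. 2^16 - 1. Sizes of the form 2^n - 1 are the
 * commonly recommended sizing for DPDK/SPDK mempools: the ring backing the pool
 * is rounded up to the next power of two, so 2^n - 1 elements fill a 2^n ring
 * exactly instead of forcing a ring twice as large.
 */
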
/* The maximum number of child requests that a COPY command
 * is split into at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8)

#define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \
	log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev)
#ifdef DEBUG
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \
	log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev)
#else
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0)
#endif

static void log_already_claimed(enum spdk_log_level level, const int line, const char *func,
				const char *detail, struct spdk_bdev *bdev);

static const char *qos_rpc_type[] = {"rw_ios_per_sec",
				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
				    };

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

RB_HEAD(bdev_name_tree, spdk_bdev_name);

static int
bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
{
	return strcmp(name1->name, name2->name);
}

RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	void *zero_buffer;

	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

	struct spdk_bdev_list bdevs;
	struct bdev_name_tree bdev_names;

	bool init_complete;
	bool module_init_complete;

	struct spdk_spinlock spinlock;

	TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain *domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
	.init_complete = false,
	.module_init_complete = false,
	.async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens),
};

static void
__attribute__((constructor))
_bdev_init(void)
{
	spdk_spin_init(&g_bdev_mgr.spinlock);
}

typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status);

typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);

struct lba_range {
	struct spdk_bdev *bdev;
	uint64_t offset;
	uint64_t length;
	bool quiesce;
	void *locked_ctx;
	struct spdk_thread *owner_thread;
	struct spdk_bdev_channel *owner_ch;
	TAILQ_ENTRY(lba_range) tailq;
	TAILQ_ENTRY(lba_range) tailq_module;
};

static struct spdk_bdev_opts g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
	.iobuf_small_cache_size = BUF_SMALL_CACHE_SIZE,
	.iobuf_large_cache_size = BUF_LARGE_CACHE_SIZE,
};

static spdk_bdev_init_cb g_init_cb_fn = NULL;
static void *g_init_cb_arg = NULL;

static spdk_bdev_fini_cb g_fini_cb_fn = NULL;
static void *g_fini_cb_arg = NULL;
static struct spdk_thread *g_fini_thread = NULL;

struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
	 * For remaining bytes, allowed to run negative if an I/O is submitted when
	 * some bytes are remaining, but the I/O is bigger than that amount. The
	 * excess will be deducted from the next timeslice.
	 */
	int64_t remaining_this_timeslice;

	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO.
	 * If the IO is allowed to pass, the quota will be reduced correspondingly.
	 */
	bool (*queue_io)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to rewind the quota once the IO was allowed to be sent by this
	 * limit but queued due to one of the further limits.
	 */
	void (*rewind_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

struct spdk_bdev_qos {
	/** Rate limits, one entry per limit type. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache. Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t per_thread_cache_count;
	uint32_t bdev_io_cache_size;

	struct spdk_iobuf_channel iobuf;

	TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * queue their I/O awaiting retry here. This makes it possible to retry sending
 * I/O to one bdev after I/O from another bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel *shared_ch;

	struct spdk_poller *nomem_poller;

	/* Refcount of bdev channels using this resource */
	uint32_t ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev *bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel *channel;

	/* Accel channel */
	struct spdk_io_channel *accel_channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat *stat;

	/*
	 * Count of I/O submitted to the underlying dev module through this channel
	 * and waiting for completion.
	 */
	uint64_t io_outstanding;

	/*
	 * List of all submitted I/Os including I/O that are generated via splitting.
	 */
	bdev_io_tailq_t io_submitted;

	/*
	 * List of spdk_bdev_io that are currently queued because they write to a locked
	 * LBA range.
	 */
	bdev_io_tailq_t io_locked;

	/* List of I/Os with accel sequence being currently executed */
	bdev_io_tailq_t io_accel_exec;

	/* List of I/Os doing memory domain pull/push */
	bdev_io_tailq_t io_memory_domain;

	uint32_t flags;

	/* Counts number of bdev_io in the io_submitted TAILQ */
	uint16_t queue_depth;

	uint16_t trace_id;

	struct spdk_histogram_data *histogram;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t start_tsc;
	uint64_t interval_tsc;
	__itt_string_handle *handle;
	struct spdk_bdev_io_stat *prev_stat;
#endif

	lba_range_tailq_t locked_ranges;

	/** List of I/Os queued by QoS. */
	bdev_io_tailq_t qos_queued_io;
};

struct media_event_entry {
	struct spdk_bdev_media_event event;
	TAILQ_ENTRY(media_event_entry) tailq;
};

#define MEDIA_EVENT_POOL_SIZE 64

struct spdk_bdev_desc {
	struct spdk_bdev *bdev;
	bool write;
	bool memory_domains_supported;
	bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES];
	struct spdk_bdev_open_opts opts;
	struct spdk_thread *thread;
	struct {
		spdk_bdev_event_cb_t event_fn;
		void *ctx;
	} callback;
	bool closed;
	struct spdk_spinlock spinlock;
	uint32_t refs;
	TAILQ_HEAD(, media_event_entry) pending_media_events;
	TAILQ_HEAD(, media_event_entry) free_media_events;
	struct media_event_entry *media_events_buffer;
	TAILQ_ENTRY(spdk_bdev_desc) link;

	uint64_t timeout_in_sec;
	spdk_bdev_io_timeout_cb cb_fn;
	void *cb_arg;
	struct spdk_poller *io_timeout_poller;
	struct spdk_bdev_module_claim *claim;
};

struct spdk_bdev_iostat_ctx {
	struct spdk_bdev_io_stat *stat;
	enum spdk_bdev_reset_stat_mode reset_mode;
	spdk_bdev_get_device_stat_cb cb;
	void *cb_arg;
};

struct set_qos_limit_ctx {
	void (*cb_fn)(void *cb_arg, int status);
	void *cb_arg;
	struct spdk_bdev *bdev;
};

struct spdk_bdev_channel_iter {
	spdk_bdev_for_each_channel_msg fn;
	spdk_bdev_for_each_channel_done cpl;
	struct spdk_io_channel_iter *i;
	void *ctx;
};

struct spdk_bdev_io_error_stat {
	uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS];
};

enum bdev_io_retry_state {
	BDEV_IO_RETRY_STATE_INVALID,
	BDEV_IO_RETRY_STATE_PULL,
	BDEV_IO_RETRY_STATE_PULL_MD,
	BDEV_IO_RETRY_STATE_SUBMIT,
	BDEV_IO_RETRY_STATE_PUSH,
	BDEV_IO_RETRY_STATE_PUSH_MD,
	BDEV_IO_RETRY_STATE_GET_ACCEL_BUF,
};

#define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))
#define __io_ch_to_bdev_ch(io_ch)	((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch))
#define __io_ch_to_bdev_mgmt_ch(io_ch)	((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch))
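
/* Note (added commentary, not from the original source): __bdev_to_io_dev()
 * registers the bdev's io_device under (bdev pointer + 1 byte) rather than the
 * bdev pointer itself. The offset address is never dereferenced; it only acts
 * as a unique io_device key, presumably so it cannot collide with any other
 * spdk_io_device_register() call that uses the raw bdev pointer.
 * __bdev_from_io_dev() simply undoes the offset.
 */
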
static inline void bdev_io_complete(void *ctx);
static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io);
static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io);
static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io);
static void _bdev_io_get_accel_buf(struct spdk_bdev_io *bdev_io);

static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io);

static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				struct spdk_io_channel *ch, void *_ctx);
static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status);

static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				     struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
				     uint64_t num_blocks,
				     struct spdk_memory_domain *domain, void *domain_ctx,
				     struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
				     spdk_bdev_io_completion_cb cb, void *cb_arg);
static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				      struct iovec *iov, int iovcnt, void *md_buf,
				      uint64_t offset_blocks, uint64_t num_blocks,
				      struct spdk_memory_domain *domain, void *domain_ctx,
				      struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
				      uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw,
				      spdk_bdev_io_completion_cb cb, void *cb_arg);

static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
			       uint64_t offset, uint64_t length,
			       lock_range_cb cb_fn, void *cb_arg);

static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
				 uint64_t offset, uint64_t length,
				 lock_range_cb cb_fn, void *cb_arg);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);

static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
static void claim_reset(struct spdk_bdev *bdev);

static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch);

static bool bdev_io_should_split(struct spdk_bdev_io *bdev_io);

#define bdev_get_ext_io_opt(opts, field, defval) \
	((opts) != NULL ? SPDK_GET_FIELD(opts, field, defval) : (defval))

static inline void
bdev_ch_add_to_io_submitted(struct spdk_bdev_io *bdev_io)
{
	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
	bdev_io->internal.ch->queue_depth++;
}

static inline void
bdev_ch_remove_from_io_submitted(struct spdk_bdev_io *bdev_io)
{
	TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
	bdev_io->internal.ch->queue_depth--;
}

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(iobuf_small_cache_size);
	SET_FIELD(iobuf_large_cache_size);

	/* Do not remove this statement. You should always update it when adding a new field,
	 * and do not forget to add the SET_FIELD statement for your new field.
	 */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}
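
/* Illustrative sketch (added commentary, not part of the original file): a
 * caller typically pairs spdk_bdev_get_opts()/spdk_bdev_set_opts() like this,
 * with opts_size providing ABI compatibility so that older callers simply do
 * not see fields appended later:
 *
 *	struct spdk_bdev_opts opts = {};
 *
 *	spdk_bdev_get_opts(&opts, sizeof(opts));
 *	opts.bdev_io_pool_size = 128 * 1024 - 1;	// example value
 *	if (spdk_bdev_set_opts(&opts) != 0) {
 *		// handle invalid configuration
 *	}
 *
 * The SET_FIELD() macro above only copies a field when it lies entirely within
 * the caller-provided opts_size.
 */
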
int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	uint32_t min_pool_size;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
	 */
	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
	if (opts->bdev_io_pool_size < min_pool_size) {
		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
			    spdk_thread_get_count());
		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
		return -1;
	}

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
		g_bdev_opts.field = opts->field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(iobuf_small_cache_size);
	SET_FIELD(iobuf_large_cache_size);

	g_bdev_opts.opts_size = opts->opts_size;

#undef SET_FIELD

	return 0;
}

static struct spdk_bdev *
bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev_name find;
	struct spdk_bdev_name *res;

	find.name = (char *)bdev_name;
	res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find);
	if (res != NULL) {
		return res->bdev;
	}

	return NULL;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev *bdev;

	spdk_spin_lock(&g_bdev_mgr.spinlock);
	bdev = bdev_get_by_name(bdev_name);
	spdk_spin_unlock(&g_bdev_mgr.spinlock);

	return bdev;
}

struct bdev_io_status_string {
	enum spdk_bdev_io_status status;
	const char *str;
};

static const struct bdev_io_status_string bdev_io_status_strings[] = {
	{ SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" },
	{ SPDK_BDEV_IO_STATUS_ABORTED, "aborted" },
	{ SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" },
	{ SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" },
	{ SPDK_BDEV_IO_STATUS_NOMEM, "nomem" },
	{ SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" },
	{ SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" },
	{ SPDK_BDEV_IO_STATUS_FAILED, "failed" },
	{ SPDK_BDEV_IO_STATUS_PENDING, "pending" },
	{ SPDK_BDEV_IO_STATUS_SUCCESS, "success" },
};

static const char *
bdev_io_status_get_string(enum spdk_bdev_io_status status)
{
	uint32_t i;

	for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) {
		if (bdev_io_status_strings[i].status == status) {
			return bdev_io_status_strings[i].str;
		}
	}

	return "reserved";
}

struct spdk_bdev_wait_for_examine_ctx {
	struct spdk_poller *poller;
	spdk_bdev_wait_for_examine_cb cb_fn;
	void *cb_arg;
};

static bool bdev_module_all_actions_completed(void);

static int
bdev_wait_for_examine_cb(void *arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx = arg;

	if (!bdev_module_all_actions_completed()) {
		return SPDK_POLLER_IDLE;
	}

	spdk_poller_unregister(&ctx->poller);
	ctx->cb_fn(ctx->cb_arg);
	free(ctx);

	return SPDK_POLLER_BUSY;
}

int
spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0);

	return 0;
}
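
/* Illustrative sketch (added commentary, not part of the original file): a
 * caller that needs all pending examines to finish before continuing can
 * register a callback like this; examine_done_cb is a hypothetical caller
 * function, not something defined in this file:
 *
 *	static void
 *	examine_done_cb(void *cb_arg)
 *	{
 *		// all bdev modules have finished examine_config/examine_disk
 *	}
 *
 *	rc = spdk_bdev_wait_for_examine(examine_done_cb, NULL);
 *	if (rc != 0) {
 *		// -ENOMEM: could not allocate the polling context
 *	}
 *
 * The poller above fires the callback once bdev_module_all_actions_completed()
 * reports no outstanding module actions.
 */
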
struct spdk_bdev_examine_item {
	char *name;
	TAILQ_ENTRY(spdk_bdev_examine_item) link;
};

TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);

struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
			g_bdev_examine_allowlist);

static inline bool
bdev_examine_allowlist_check(const char *name)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		if (strcmp(name, item->name) == 0) {
			return true;
		}
	}
	return false;
}

static inline void
bdev_examine_allowlist_remove(const char *name)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		if (strcmp(name, item->name) == 0) {
			TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
			free(item->name);
			free(item);
			break;
		}
	}
}

static inline void
bdev_examine_allowlist_free(void)
{
	struct spdk_bdev_examine_item *item;
	while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
		item = TAILQ_FIRST(&g_bdev_examine_allowlist);
		TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
		free(item->name);
		free(item);
	}
}

static inline bool
bdev_in_examine_allowlist(struct spdk_bdev *bdev)
{
	struct spdk_bdev_alias *tmp;
	if (bdev_examine_allowlist_check(bdev->name)) {
		return true;
	}
	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
		if (bdev_examine_allowlist_check(tmp->alias.name)) {
			return true;
		}
	}
	return false;
}

static inline bool
bdev_ok_to_examine(struct spdk_bdev *bdev)
{
	/* Some bdevs may not support the READ command.
	 * Do not try to examine them.
	 */
	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_READ)) {
		return false;
	}

	if (g_bdev_opts.bdev_auto_examine) {
		return true;
	} else {
		return bdev_in_examine_allowlist(bdev);
	}
}

static void
bdev_examine(struct spdk_bdev *bdev)
{
	struct spdk_bdev_module *module;
	struct spdk_bdev_module_claim *claim, *tmpclaim;
	uint32_t action;

	if (!bdev_ok_to_examine(bdev)) {
		return;
	}

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (module->examine_config) {
			spdk_spin_lock(&module->internal.spinlock);
			action = module->internal.action_in_progress;
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			module->examine_config(bdev);
			if (action != module->internal.action_in_progress) {
				SPDK_ERRLOG("examine_config for module %s did not call "
					    "spdk_bdev_module_examine_done()\n", module->name);
			}
		}
	}

	spdk_spin_lock(&bdev->internal.spinlock);

	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		/* Examine by all bdev modules */
		TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (module->examine_disk) {
				spdk_spin_lock(&module->internal.spinlock);
				module->internal.action_in_progress++;
				spdk_spin_unlock(&module->internal.spinlock);
				spdk_spin_unlock(&bdev->internal.spinlock);
				module->examine_disk(bdev);
				spdk_spin_lock(&bdev->internal.spinlock);
			}
		}
		break;
	case SPDK_BDEV_CLAIM_EXCL_WRITE:
		/* Examine by the one bdev module with a v1 claim */
		module = bdev->internal.claim.v1.module;
		if (module->examine_disk) {
			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			return;
		}
		break;
	default:
		/* Examine by all bdev modules with a v2 claim */
		assert(claim_type_is_v2(bdev->internal.claim_type));
		/*
		 * Removal of tailq nodes while iterating can cause the iteration to jump out of the
		 * list, perhaps accessing freed memory. Without protection, this could happen
		 * while the lock is dropped during the examine callback.
		 */
		bdev->internal.examine_in_progress++;

		TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
			module = claim->module;

			if (module == NULL) {
				/* This is a vestigial claim, held by examine_count */
				continue;
			}

			if (module->examine_disk == NULL) {
				continue;
			}

			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);

			/* Call examine_disk without holding internal.spinlock. */
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			spdk_spin_lock(&bdev->internal.spinlock);
		}

		assert(bdev->internal.examine_in_progress > 0);
		bdev->internal.examine_in_progress--;
		if (bdev->internal.examine_in_progress == 0) {
			/* Remove any claims that were released during examine_disk */
			TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) {
				if (claim->desc != NULL) {
					continue;
				}

				TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link);
				free(claim);
			}
			if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
				claim_reset(bdev);
			}
		}
	}

	spdk_spin_unlock(&bdev->internal.spinlock);
}

int
spdk_bdev_examine(const char *name)
{
	struct spdk_bdev *bdev;
	struct spdk_bdev_examine_item *item;
	struct spdk_thread *thread = spdk_get_thread();

	if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) {
		SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread,
			    thread ? spdk_thread_get_name(thread) : "null");
		return -EINVAL;
	}

	if (g_bdev_opts.bdev_auto_examine) {
		SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n");
		return -EINVAL;
	}

	if (bdev_examine_allowlist_check(name)) {
		SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
		return -EEXIST;
	}

	item = calloc(1, sizeof(*item));
	if (!item) {
		return -ENOMEM;
	}
	item->name = strdup(name);
	if (!item->name) {
		free(item);
		return -ENOMEM;
	}
	TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);

	bdev = spdk_bdev_get_by_name(name);
	if (bdev) {
		bdev_examine(bdev);
	}
	return 0;
}

static inline void
bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "method", "bdev_examine");
		spdk_json_write_named_object_begin(w, "params");
		spdk_json_write_named_string(w, "name", item->name);
		spdk_json_write_object_end(w);
		spdk_json_write_object_end(w);
	}
}
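
/* Illustrative sketch (added commentary, not part of the original file): the
 * manual-examine path above is only reachable when auto examine is disabled,
 * e.g. roughly:
 *
 *	struct spdk_bdev_opts opts = {};
 *
 *	spdk_bdev_get_opts(&opts, sizeof(opts));
 *	opts.bdev_auto_examine = false;
 *	spdk_bdev_set_opts(&opts);
 *	...
 *	rc = spdk_bdev_examine("Nvme0n1");	// hypothetical bdev name
 *
 * The name is added to g_bdev_examine_allowlist first, so the examine also
 * happens later if a bdev with that name is registered after this call.
 */
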
struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, internal.link);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, internal.link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static inline bool
bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.f.has_memory_domain;
}

static inline bool
bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.f.has_accel_sequence;
}

static inline uint32_t
bdev_desc_get_block_size(struct spdk_bdev_desc *desc)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);

	if (spdk_unlikely(desc->opts.hide_metadata)) {
		return bdev->blocklen - bdev->md_len;
	} else {
		return bdev->blocklen;
	}
}

static inline uint32_t
bdev_io_get_block_size(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;

	if (bdev_io->u.bdev.dif_check_flags & SPDK_DIF_FLAGS_NVME_PRACT) {
		if (bdev->md_len == spdk_dif_pi_format_get_size(bdev->dif_pi_format)) {
			return bdev->blocklen - bdev->md_len;
		} else {
			return bdev->blocklen;
		}
	}

	return bdev_desc_get_block_size(bdev_io->internal.desc);
}

static inline void
bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource,
			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	/* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io.
	 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth
	 * channels we will instead wait for half to complete.
	 */
	shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
					   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);

	assert(state != BDEV_IO_RETRY_STATE_INVALID);
	bdev_io->internal.retry_state = state;
	TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
}

static inline void
bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource,
			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	/* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while
	 * the queue isn't empty, so we don't need to update the nomem_threshold here */
	assert(!TAILQ_EMPTY(&shared_resource->nomem_io));

	assert(state != BDEV_IO_RETRY_STATE_INVALID);
	bdev_io->internal.retry_state = state;
	TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
}

void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	struct iovec *iovs;

	if (bdev_io->u.bdev.iovs == NULL) {
		bdev_io->u.bdev.iovs = &bdev_io->iov;
		bdev_io->u.bdev.iovcnt = 1;
	}

	iovs = bdev_io->u.bdev.iovs;

	assert(iovs != NULL);
	assert(bdev_io->u.bdev.iovcnt >= 1);

	iovs[0].iov_base = buf;
	iovs[0].iov_len = len;
}

void
spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
	bdev_io->u.bdev.md_buf = md_buf;
}

static bool
_is_buf_allocated(const struct iovec *iovs)
{
	if (iovs == NULL) {
		return false;
	}

	return iovs[0].iov_base != NULL;
}

static bool
_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
{
	int i;
	uintptr_t iov_base;

	if (spdk_likely(alignment == 1)) {
		return true;
	}

	for (i = 0; i < iovcnt; i++) {
		iov_base = (uintptr_t)iovs[i].iov_base;
		if ((iov_base & (alignment - 1)) != 0) {
			return false;
		}
	}

	return true;
}
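
/* Worked example (added commentary, not part of the original file): the check
 * above relies on alignment being a power of two, so (alignment - 1) is a mask
 * of the low bits. For alignment = 512 (0x200) the mask is 0x1FF:
 *
 *	0x201000 & 0x1FF == 0      -> aligned
 *	0x201003 & 0x1FF == 0x003  -> not aligned, so the bdev layer would fall
 *	                              back to an aligned bounce buffer
 */
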
static inline bool
bdev_io_needs_metadata(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
{
	return (bdev_io->bdev->md_len != 0) &&
	       (desc->opts.hide_metadata ||
		(bdev_io->u.bdev.dif_check_flags & SPDK_DIF_FLAGS_NVME_PRACT));
}

static inline bool
bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
{
	if (!bdev_io_use_accel_sequence(bdev_io)) {
		return false;
	}

	/* For now, we don't allow splitting IOs with an accel sequence and will treat them as if
	 * bdev module didn't support accel sequences */
	return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.f.split;
}

static inline void
bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	bdev_ch->io_outstanding++;
	shared_resource->io_outstanding++;
}

static inline void
bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	assert(bdev_ch->io_outstanding > 0);
	assert(shared_resource->io_outstanding > 0);
	bdev_ch->io_outstanding--;
	shared_resource->io_outstanding--;
}

static void
bdev_io_submit_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;

	assert(bdev_io_use_accel_sequence(bdev_io));

	bdev_io->u.bdev.accel_sequence = NULL;
	bdev_io->internal.f.has_accel_sequence = false;

	if (spdk_unlikely(status != 0)) {
		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io_complete_unsubmitted(bdev_io);
		return;
	}

	bdev_io_submit(bdev_io);
}

static void
bdev_io_exec_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io->internal.data_transfer_cpl(bdev_io, status);
}

static void
bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status))
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
	assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
	assert(bdev_io_use_accel_sequence(bdev_io));

	/* Since the operations are appended during submission, they're in the opposite order than
	 * how we want to execute them for reads (i.e. we need to execute the most recently added
	 * operation first), so reverse the sequence before executing it.
	 */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence);
	}

	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	bdev_io_increment_outstanding(ch, ch->shared_resource);
	bdev_io->internal.data_transfer_cpl = cb_fn;

	spdk_accel_sequence_finish(bdev_io->internal.accel_sequence,
				   bdev_io_exec_sequence_cb, bdev_io);
}

static void
bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status)
{
	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
	void *buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		buf = bdev_io->internal.buf.ptr;
		bdev_io->internal.buf.ptr = NULL;
		bdev_io->internal.f.has_buf = false;
		bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
		bdev_io->internal.get_aux_buf_cb = NULL;
	} else {
		assert(bdev_io->internal.get_buf_cb != NULL);
		bdev_io->internal.get_buf_cb(ch, bdev_io, status);
		bdev_io->internal.get_buf_cb = NULL;
	}
}

static void
_bdev_io_pull_buffer_cpl(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;

	if (rc) {
		SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	bdev_io_get_buf_complete(bdev_io, !rc);
}

static void
bdev_io_pull_md_buf_done(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	assert(bdev_io->internal.data_transfer_cpl);
	bdev_io->internal.data_transfer_cpl(bdev_io, status);
}

static void
bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		assert(bdev_io->internal.f.has_bounce_buf);
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  &bdev_io->internal.bounce_buf.orig_md_iov, 1,
							  &bdev_io->internal.bounce_buf.md_iov, 1,
							  bdev_io_pull_md_buf_done, bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain), rc);
			}
		} else {
			memcpy(bdev_io->internal.bounce_buf.md_iov.iov_base,
			       bdev_io->internal.bounce_buf.orig_md_iov.iov_base,
			       bdev_io->internal.bounce_buf.orig_md_iov.iov_len);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD);
	} else {
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
	}
}
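
/* Note (added commentary, not from the original source): bdev_io_pull_md_buf()
 * only copies/pulls host metadata into the bounce buffer for WRITE I/O, where
 * data flows host -> device and must be staged before submission. For READ I/O
 * the device fills the bounce buffer first, so the metadata travels the other
 * way via bdev_io_push_bounce_md_buf() after the I/O completes.
 */
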
static void
_bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	assert(bdev_io->internal.f.has_bounce_buf);

	/* save original md_buf */
	bdev_io->internal.bounce_buf.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf;
	bdev_io->internal.bounce_buf.orig_md_iov.iov_len = len;
	bdev_io->internal.bounce_buf.md_iov.iov_base = md_buf;
	bdev_io->internal.bounce_buf.md_iov.iov_len = len;
	/* set bounce md_buf */
	bdev_io->u.bdev.md_buf = md_buf;

	bdev_io_pull_md_buf(bdev_io);
}

static void
_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len;
	void *buf;

	if (spdk_bdev_is_md_separate(bdev)) {
		assert(!bdev_io_use_accel_sequence(bdev_io));

		buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len;
		md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;

		assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0);

		if (bdev_io->u.bdev.md_buf != NULL) {
			_bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len);
			return;
		} else {
			spdk_bdev_io_set_md_buf(bdev_io, buf, md_len);
		}
	}

	bdev_io_get_buf_complete(bdev_io, true);
}

static inline void
bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc)
{
	if (rc) {
		SPDK_ERRLOG("Failed to get data buffer\n");
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
		return;
	}

	_bdev_io_set_md_buf(bdev_io);
}

static void
bdev_io_pull_data_done_and_track(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io_pull_data_done(bdev_io, status);
}

static void
bdev_io_pull_data(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	struct spdk_bdev_desc *desc = bdev_io->internal.desc;
	int rc = 0;

	assert(bdev_io->internal.f.has_bounce_buf);

	if (bdev_io_needs_metadata(desc, bdev_io)) {
		assert(bdev_io->bdev->md_interleave);

		bdev_io->u.bdev.dif_check_flags &= ~SPDK_DIF_FLAGS_NVME_PRACT;

		if (!bdev_io_use_accel_sequence(bdev_io)) {
			bdev_io->internal.accel_sequence = NULL;
		}

		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
			rc = spdk_accel_append_dif_generate_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
					bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
					bdev_io->u.bdev.memory_domain,
					bdev_io->u.bdev.memory_domain_ctx,
					bdev_io->internal.bounce_buf.orig_iovs,
					bdev_io->internal.bounce_buf.orig_iovcnt,
					bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
					bdev_io_use_memory_domain(bdev_io) ?
					bdev_io->internal.memory_domain_ctx : NULL,
					bdev_io->u.bdev.num_blocks,
					&bdev_io->u.bdev.dif_ctx,
					NULL, NULL);
		} else {
			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
			rc = spdk_accel_append_dif_verify_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
					bdev_io->internal.bounce_buf.orig_iovs,
					bdev_io->internal.bounce_buf.orig_iovcnt,
					bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
					bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
					bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
					bdev_io->u.bdev.memory_domain,
					bdev_io->u.bdev.memory_domain_ctx,
					bdev_io->u.bdev.num_blocks,
					&bdev_io->u.bdev.dif_ctx,
					&bdev_io->u.bdev.dif_err,
					NULL, NULL);
		}

		if (spdk_likely(rc == 0)) {
			bdev_io->internal.f.has_accel_sequence = true;
			bdev_io->u.bdev.accel_sequence = bdev_io->internal.accel_sequence;
		} else if (rc != -ENOMEM) {
			SPDK_ERRLOG("Failed to append generate/verify_copy to accel sequence: %p\n",
				    bdev_io->internal.accel_sequence);
		}
	} else if (bdev_io_needs_sequence_exec(desc, bdev_io) ||
		   (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) {
		/* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a
		 * sequence, append a copy operation making accel change the src/dst buffers of the previous
		 * operation */
		assert(bdev_io_use_accel_sequence(bdev_io));
		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL,
						    bdev_io->internal.bounce_buf.orig_iovs,
						    bdev_io->internal.bounce_buf.orig_iovcnt,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
						    NULL, NULL);
		} else {
			/* We need to reverse the src/dst for reads */
			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->internal.bounce_buf.orig_iovs,
						    bdev_io->internal.bounce_buf.orig_iovcnt,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL, NULL, NULL);
		}

		if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
			SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n",
				    bdev_io->internal.accel_sequence);
		}
	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		/* if this is write path, copy data from original buffer to bounce buffer */
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  bdev_io->internal.bounce_buf.orig_iovs,
							  (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt,
							  bdev_io->u.bdev.iovs, 1,
							  bdev_io_pull_data_done_and_track,
							  bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to pull data from memory domain %s\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain));
			}
		} else {
			assert(bdev_io->u.bdev.iovcnt == 1);
			spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base,
					      bdev_io->u.bdev.iovs[0].iov_len,
					      bdev_io->internal.bounce_buf.orig_iovs,
					      bdev_io->internal.bounce_buf.orig_iovcnt);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
	} else {
		bdev_io_pull_data_done(bdev_io, rc);
	}
}
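
/* Note (added commentary, not from the original source): in the plain bounce
 * buffer case above, the copy direction depends on the I/O type. For WRITE,
 * the original buffers are copied (or pulled from their memory domain) into
 * the bounce iov before submission. For READ, nothing is copied here; the
 * bounce buffer is filled by the device and later pushed back to the original
 * buffers in the bdev_io_push_bounce_data()/..._md_buf() completion path.
 */
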
static void
_bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len,
			      bdev_copy_bounce_buffer_cpl cpl_cb)
{
	struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource;

	assert(bdev_io->internal.f.has_bounce_buf == false);

	bdev_io->internal.data_transfer_cpl = cpl_cb;
	bdev_io->internal.f.has_bounce_buf = true;
	/* save original iovec */
	bdev_io->internal.bounce_buf.orig_iovs = bdev_io->u.bdev.iovs;
	bdev_io->internal.bounce_buf.orig_iovcnt = bdev_io->u.bdev.iovcnt;
	/* zero the other data members */
	bdev_io->internal.bounce_buf.iov.iov_base = NULL;
	bdev_io->internal.bounce_buf.md_iov.iov_base = NULL;
	bdev_io->internal.bounce_buf.orig_md_iov.iov_base = NULL;
	/* set bounce iov */
	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_buf.iov;
	bdev_io->u.bdev.iovcnt = 1;
	/* set bounce buffer for this operation */
	bdev_io->u.bdev.iovs[0].iov_base = buf;
	bdev_io->u.bdev.iovs[0].iov_len = len;
	/* Now we use 1 iov, the split condition could have been changed */
	bdev_io->internal.f.split = bdev_io_should_split(bdev_io);

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
	} else {
		bdev_io_pull_data(bdev_io);
	}
}

static void
_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	bool buf_allocated;
	uint64_t alignment;
	void *aligned_buf;

	bdev_io->internal.buf.ptr = buf;
	bdev_io->internal.f.has_buf = true;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		bdev_io_get_buf_complete(bdev_io, true);
		return;
	}

	alignment = spdk_bdev_get_buf_align(bdev);
	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
	aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));

	if (buf_allocated) {
		_bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl);
		/* Continue in completion callback */
		return;
	} else {
		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
	}

	_bdev_io_set_md_buf(bdev_io);
}

static inline uint64_t
bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len, alignment;

	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;

	/* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */
	alignment = spdk_bdev_get_buf_align(bdev) - 1;

	return len + alignment + md_len;
}

static void
bdev_io_put_accel_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	spdk_accel_put_buf(ch->accel_channel,
			   bdev_io->internal.buf.ptr,
			   bdev_io->u.bdev.memory_domain,
			   bdev_io->u.bdev.memory_domain_ctx);
}

static void
_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
{
	struct spdk_bdev_mgmt_channel *ch;

	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
	spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len));
}

static void
bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	assert(bdev_io->internal.f.has_buf);

	if (bdev_io->u.bdev.memory_domain == spdk_accel_get_memory_domain()) {
		bdev_io_put_accel_buf(bdev_io);
	} else {
		assert(bdev_io->u.bdev.memory_domain == NULL);
		_bdev_io_put_buf(bdev_io, bdev_io->internal.buf.ptr,
				 bdev_io->internal.buf.len);
	}
	bdev_io->internal.buf.ptr = NULL;
	bdev_io->internal.f.has_buf = false;
}

SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_put_aux_buf,
			      "spdk_bdev_io_put_aux_buf is deprecated", "v25.01", 0);

void
spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	SPDK_LOG_DEPRECATED(spdk_bdev_io_put_aux_buf);

	assert(buf != NULL);
	_bdev_io_put_buf(bdev_io, buf, len);
}
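
/* Worked example (added commentary, not part of the original file): suppose an
 * I/O of 8 blocks of 4096 bytes on a bdev with separate 8-byte metadata and a
 * 512-byte buffer alignment requirement. bdev_io_get_max_buf_len() then asks
 * the iobuf pool for
 *
 *	len + (alignment - 1) + md_len = 32768 + 511 + 64 = 33343 bytes,
 *
 * and _bdev_io_set_buf() rounds the raw pointer up with (buf + 511) & ~511 so
 * the data (and the metadata placed right behind it) start on a 512-byte
 * boundary regardless of where the allocation landed.
 */
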
static inline void
bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch,
		    struct spdk_bdev_io *bdev_io)
{
	/* After a request is submitted to a bdev module, the ownership of an accel sequence
	 * associated with that bdev_io is transferred to the bdev module. So, clear the internal
	 * sequence pointer to make sure we won't touch it anymore. */
	if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE ||
	     bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) {
		assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
		bdev_io->internal.f.has_accel_sequence = false;
	}

	/* The generic bdev layer should not pass an I/O with dif_check_flags bits set that
	 * the underlying bdev does not support. Add an assert to check this.
	 */
	assert((bdev_io->type != SPDK_BDEV_IO_TYPE_WRITE &&
		bdev_io->type != SPDK_BDEV_IO_TYPE_READ) ||
	       ((bdev_io->u.bdev.dif_check_flags & bdev->dif_check_flags) ==
		bdev_io->u.bdev.dif_check_flags));

	bdev->fn_table->submit_request(ioch, bdev_io);
}

static inline void
bdev_ch_resubmit_io(struct spdk_bdev_shared_resource *shared_resource, struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;

	bdev_io_increment_outstanding(bdev_io->internal.ch, shared_resource);
	bdev_io->internal.error.nvme.cdw0 = 0;
	bdev_io->num_retries++;
	bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
}

static void
bdev_shared_ch_retry_io(struct spdk_bdev_shared_resource *shared_resource)
{
	struct spdk_bdev_io *bdev_io;

	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 * Some drivers (such as nvme) cannot immediately take a new I/O in
		 * the context of a completion, because the resources for the I/O are
		 * not released until control returns to the bdev poller. Also, we
		 * may require several small I/O to complete before a larger I/O
		 * (that requires splitting) can be submitted.
		 */
		return;
	}

	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);

		switch (bdev_io->internal.retry_state) {
		case BDEV_IO_RETRY_STATE_SUBMIT:
			bdev_ch_resubmit_io(shared_resource, bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL:
			bdev_io_pull_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL_MD:
			bdev_io_pull_md_buf(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH:
			bdev_io_push_bounce_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH_MD:
			bdev_io_push_bounce_md_buf(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_GET_ACCEL_BUF:
			_bdev_io_get_accel_buf(bdev_io);
			break;
		default:
			assert(0 && "invalid retry state");
			break;
		}

		if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) {
			/* This IO completed again with NOMEM status, so break the loop and
			 * don't try anymore. Note that a bdev_io that fails with NOMEM
			 * always gets requeued at the front of the list, to maintain
			 * ordering.
			 */
			break;
		}
	}
}
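
/* Worked example (added commentary, not part of the original file): with
 * NOMEM_THRESHOLD_COUNT = 8, bdev_queue_nomem_io_head() sets
 *
 *	nomem_threshold = max(io_outstanding / 2, io_outstanding - 8)
 *
 * so a channel that hit NOMEM with 100 I/O outstanding waits until it drops to
 * 92 before bdev_shared_ch_retry_io() starts draining nomem_io, while a
 * low-queue-depth channel with 10 outstanding only waits until 5. This gives
 * the bdev module room to free per-I/O resources before retries resume.
 */
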
static void
bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	bdev_shared_ch_retry_io(bdev_ch->shared_resource);
}

static int
bdev_no_mem_poller(void *ctx)
{
	struct spdk_bdev_shared_resource *shared_resource = ctx;

	spdk_poller_unregister(&shared_resource->nomem_poller);

	if (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_shared_ch_retry_io(shared_resource);
	}
	/* the retry cb may re-register the poller so double check */
	if (!TAILQ_EMPTY(&shared_resource->nomem_io) &&
	    shared_resource->io_outstanding == 0 && shared_resource->nomem_poller == NULL) {
		/* No IOs were submitted, try again */
		shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
						SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
	}

	return SPDK_POLLER_BUSY;
}

static inline bool
_bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
		bdev_queue_nomem_io_head(shared_resource, bdev_io, state);

		if (shared_resource->io_outstanding == 0 && !shared_resource->nomem_poller) {
			/* Special case: we have nomem IOs but no outstanding IOs whose completions
			 * could trigger a retry of the queued IOs. Any newly submitted IO may trigger
			 * such a retry; this poller covers the case where no new IOs are submitted,
			 * e.g. qd == 1.
			 */
			shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
							SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
		}
		/* If bdev module completed an I/O that has an accel sequence with NOMEM status, the
		 * ownership of that sequence is transferred back to the bdev layer, so we need to
		 * restore internal.accel_sequence to make sure that the sequence is handled
		 * correctly in case the I/O is later aborted. */
		if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
		     bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) {
			assert(!bdev_io_use_accel_sequence(bdev_io));
			bdev_io->internal.f.has_accel_sequence = true;
			bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence;
		}

		return true;
	}

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_ch_retry_io(bdev_ch);
	}

	return false;
}

static void
_bdev_io_complete_push_bounce_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	if (rc) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	/* We want to free the bounce buffer here since we know we're done with it (as opposed
	 * to waiting for the conditional free of internal.buf.ptr in spdk_bdev_free_io()).
	 */
	bdev_io_put_buf(bdev_io);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	/* Continue with IO completion flow */
	bdev_io_complete(bdev_io);
}

1754 */ 1755 bdev_io_put_buf(bdev_io); 1756 1757 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1758 bdev_ch_retry_io(ch); 1759 } 1760 1761 /* Continue with IO completion flow */ 1762 bdev_io_complete(bdev_io); 1763 } 1764 1765 static void 1766 bdev_io_push_bounce_md_buf_done(void *ctx, int rc) 1767 { 1768 struct spdk_bdev_io *bdev_io = ctx; 1769 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1770 1771 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1772 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1773 bdev_io->internal.f.has_bounce_buf = false; 1774 1775 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1776 bdev_ch_retry_io(ch); 1777 } 1778 1779 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1780 } 1781 1782 static inline void 1783 bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io) 1784 { 1785 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1786 int rc = 0; 1787 1788 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1789 assert(bdev_io->internal.f.has_bounce_buf); 1790 1791 /* do the same for metadata buffer */ 1792 if (spdk_unlikely(bdev_io->internal.bounce_buf.orig_md_iov.iov_base != NULL)) { 1793 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1794 1795 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1796 if (bdev_io_use_memory_domain(bdev_io)) { 1797 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1798 bdev_io_increment_outstanding(ch, ch->shared_resource); 1799 /* If memory domain is used then we need to call async push function */ 1800 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1801 bdev_io->internal.memory_domain_ctx, 1802 &bdev_io->internal.bounce_buf.orig_md_iov, 1803 (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt, 1804 &bdev_io->internal.bounce_buf.md_iov, 1, 1805 bdev_io_push_bounce_md_buf_done, 1806 bdev_io); 1807 if (rc == 0) { 1808 /* Continue IO completion in async callback */ 1809 return; 1810 } 1811 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1812 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1813 if (rc != -ENOMEM) { 1814 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1815 spdk_memory_domain_get_dma_device_id( 1816 bdev_io->internal.memory_domain)); 1817 } 1818 } else { 1819 memcpy(bdev_io->internal.bounce_buf.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1820 bdev_io->internal.bounce_buf.orig_md_iov.iov_len); 1821 } 1822 } 1823 } 1824 1825 if (spdk_unlikely(rc == -ENOMEM)) { 1826 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD); 1827 } else { 1828 assert(bdev_io->internal.data_transfer_cpl); 1829 bdev_io->internal.f.has_bounce_buf = false; 1830 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1831 } 1832 } 1833 1834 static inline void 1835 bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc) 1836 { 1837 assert(bdev_io->internal.data_transfer_cpl); 1838 if (rc) { 1839 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1840 return; 1841 } 1842 1843 /* set original buffer for this io */ 1844 bdev_io->u.bdev.iovcnt = bdev_io->internal.bounce_buf.orig_iovcnt; 1845 bdev_io->u.bdev.iovs = bdev_io->internal.bounce_buf.orig_iovs; 1846 1847 /* We don't set bdev_io->internal.f.has_bounce_buf to false here because 1848 * we still need to clear the md buf */ 1849 1850 bdev_io_push_bounce_md_buf(bdev_io); 1851 } 1852 1853 static void 1854 bdev_io_push_bounce_data_done_and_track(void *ctx, int status) 1855 { 1856 struct spdk_bdev_io *bdev_io = ctx; 1857 struct 
spdk_bdev_channel *ch = bdev_io->internal.ch; 1858 1859 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1860 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1861 1862 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1863 bdev_ch_retry_io(ch); 1864 } 1865 1866 bdev_io_push_bounce_data_done(bdev_io, status); 1867 } 1868 1869 static inline void 1870 bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io) 1871 { 1872 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1873 int rc = 0; 1874 1875 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1876 assert(!bdev_io_use_accel_sequence(bdev_io)); 1877 assert(bdev_io->internal.f.has_bounce_buf); 1878 1879 /* if this is read path, copy data from bounce buffer to original buffer */ 1880 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1881 if (bdev_io_use_memory_domain(bdev_io)) { 1882 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1883 bdev_io_increment_outstanding(ch, ch->shared_resource); 1884 /* If memory domain is used then we need to call async push function */ 1885 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1886 bdev_io->internal.memory_domain_ctx, 1887 bdev_io->internal.bounce_buf.orig_iovs, 1888 (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt, 1889 &bdev_io->internal.bounce_buf.iov, 1, 1890 bdev_io_push_bounce_data_done_and_track, 1891 bdev_io); 1892 if (rc == 0) { 1893 /* Continue IO completion in async callback */ 1894 return; 1895 } 1896 1897 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1898 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1899 if (rc != -ENOMEM) { 1900 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1901 spdk_memory_domain_get_dma_device_id( 1902 bdev_io->internal.memory_domain)); 1903 } 1904 } else { 1905 spdk_copy_buf_to_iovs(bdev_io->internal.bounce_buf.orig_iovs, 1906 bdev_io->internal.bounce_buf.orig_iovcnt, 1907 bdev_io->internal.bounce_buf.iov.iov_base, 1908 bdev_io->internal.bounce_buf.iov.iov_len); 1909 } 1910 } 1911 1912 if (spdk_unlikely(rc == -ENOMEM)) { 1913 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH); 1914 } else { 1915 bdev_io_push_bounce_data_done(bdev_io, rc); 1916 } 1917 } 1918 1919 static inline void 1920 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1921 { 1922 bdev_io->internal.data_transfer_cpl = cpl_cb; 1923 bdev_io_push_bounce_data(bdev_io); 1924 } 1925 1926 static void 1927 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1928 { 1929 struct spdk_bdev_io *bdev_io; 1930 1931 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1932 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf.len); 1933 } 1934 1935 static void 1936 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1937 { 1938 struct spdk_bdev_mgmt_channel *mgmt_ch; 1939 uint64_t max_len; 1940 void *buf; 1941 1942 assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1943 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1944 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1945 1946 if (spdk_unlikely(max_len > mgmt_ch->iobuf.cache[0].large.bufsize)) { 1947 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1948 bdev_io_get_buf_complete(bdev_io, false); 1949 return; 1950 } 1951 1952 bdev_io->internal.buf.len = len; 1953 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1954 bdev_io_get_iobuf_cb); 1955 if (buf != NULL) { 1956 
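		/* The buffer was satisfied synchronously from the iobuf cache or pool;
		 * otherwise bdev_io_get_iobuf_cb() runs later, once a buffer of at least
		 * max_len is returned to the pool. */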
_bdev_io_set_buf(bdev_io, buf, len); 1957 } 1958 } 1959 1960 void 1961 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1962 { 1963 struct spdk_bdev *bdev = bdev_io->bdev; 1964 uint64_t alignment; 1965 1966 assert(cb != NULL); 1967 bdev_io->internal.get_buf_cb = cb; 1968 1969 alignment = spdk_bdev_get_buf_align(bdev); 1970 1971 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1972 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1973 /* Buffer already present and aligned */ 1974 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1975 return; 1976 } 1977 1978 bdev_io_get_buf(bdev_io, len); 1979 } 1980 1981 static void 1982 _bdev_io_get_bounce_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1983 uint64_t len) 1984 { 1985 assert(cb != NULL); 1986 bdev_io->internal.get_buf_cb = cb; 1987 1988 bdev_io_get_buf(bdev_io, len); 1989 } 1990 1991 static void 1992 _bdev_io_get_accel_buf(struct spdk_bdev_io *bdev_io) 1993 { 1994 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1995 void *buf; 1996 int rc; 1997 1998 rc = spdk_accel_get_buf(ch->accel_channel, 1999 bdev_io->internal.buf.len, 2000 &buf, 2001 &bdev_io->u.bdev.memory_domain, 2002 &bdev_io->u.bdev.memory_domain_ctx); 2003 if (rc != 0) { 2004 bdev_queue_nomem_io_tail(ch->shared_resource, bdev_io, 2005 BDEV_IO_RETRY_STATE_GET_ACCEL_BUF); 2006 return; 2007 } 2008 2009 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf.len); 2010 } 2011 2012 static inline void 2013 bdev_io_get_accel_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 2014 uint64_t len) 2015 { 2016 bdev_io->internal.buf.len = len; 2017 bdev_io->internal.get_buf_cb = cb; 2018 2019 _bdev_io_get_accel_buf(bdev_io); 2020 } 2021 2022 SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_get_aux_buf, 2023 "spdk_bdev_io_get_aux_buf is deprecated", "v25.01", 0); 2024 2025 void 2026 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 2027 { 2028 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 2029 2030 SPDK_LOG_DEPRECATED(spdk_bdev_io_get_aux_buf); 2031 2032 assert(cb != NULL); 2033 assert(bdev_io->internal.get_aux_buf_cb == NULL); 2034 bdev_io->internal.get_aux_buf_cb = cb; 2035 bdev_io_get_buf(bdev_io, len); 2036 } 2037 2038 static int 2039 bdev_module_get_max_ctx_size(void) 2040 { 2041 struct spdk_bdev_module *bdev_module; 2042 int max_bdev_module_size = 0; 2043 2044 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2045 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 2046 max_bdev_module_size = bdev_module->get_ctx_size(); 2047 } 2048 } 2049 2050 return max_bdev_module_size; 2051 } 2052 2053 static void 2054 bdev_enable_histogram_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 2055 { 2056 if (!bdev->internal.histogram_enabled) { 2057 return; 2058 } 2059 2060 spdk_json_write_object_begin(w); 2061 spdk_json_write_named_string(w, "method", "bdev_enable_histogram"); 2062 2063 spdk_json_write_named_object_begin(w, "params"); 2064 spdk_json_write_named_string(w, "name", bdev->name); 2065 2066 spdk_json_write_named_bool(w, "enable", bdev->internal.histogram_enabled); 2067 2068 if (bdev->internal.histogram_io_type) { 2069 spdk_json_write_named_string(w, "opc", 2070 spdk_bdev_get_io_type_name(bdev->internal.histogram_io_type)); 2071 } 2072 2073 spdk_json_write_object_end(w); 2074 2075 spdk_json_write_object_end(w); 2076 } 2077 2078 static void 2079 
bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 2080 { 2081 int i; 2082 struct spdk_bdev_qos *qos = bdev->internal.qos; 2083 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 2084 2085 if (!qos) { 2086 return; 2087 } 2088 2089 spdk_bdev_get_qos_rate_limits(bdev, limits); 2090 2091 spdk_json_write_object_begin(w); 2092 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 2093 2094 spdk_json_write_named_object_begin(w, "params"); 2095 spdk_json_write_named_string(w, "name", bdev->name); 2096 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2097 if (limits[i] > 0) { 2098 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 2099 } 2100 } 2101 spdk_json_write_object_end(w); 2102 2103 spdk_json_write_object_end(w); 2104 } 2105 2106 void 2107 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 2108 { 2109 struct spdk_bdev_module *bdev_module; 2110 struct spdk_bdev *bdev; 2111 2112 assert(w != NULL); 2113 2114 spdk_json_write_array_begin(w); 2115 2116 spdk_json_write_object_begin(w); 2117 spdk_json_write_named_string(w, "method", "bdev_set_options"); 2118 spdk_json_write_named_object_begin(w, "params"); 2119 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 2120 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 2121 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 2122 spdk_json_write_named_uint32(w, "iobuf_small_cache_size", g_bdev_opts.iobuf_small_cache_size); 2123 spdk_json_write_named_uint32(w, "iobuf_large_cache_size", g_bdev_opts.iobuf_large_cache_size); 2124 spdk_json_write_object_end(w); 2125 spdk_json_write_object_end(w); 2126 2127 bdev_examine_allowlist_config_json(w); 2128 2129 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2130 if (bdev_module->config_json) { 2131 bdev_module->config_json(w); 2132 } 2133 } 2134 2135 spdk_spin_lock(&g_bdev_mgr.spinlock); 2136 2137 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 2138 if (bdev->fn_table->write_config_json) { 2139 bdev->fn_table->write_config_json(bdev, w); 2140 } 2141 2142 bdev_qos_config_json(bdev, w); 2143 bdev_enable_histogram_config_json(bdev, w); 2144 } 2145 2146 spdk_spin_unlock(&g_bdev_mgr.spinlock); 2147 2148 /* This has to be last RPC in array to make sure all bdevs finished examine */ 2149 spdk_json_write_object_begin(w); 2150 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 2151 spdk_json_write_object_end(w); 2152 2153 spdk_json_write_array_end(w); 2154 } 2155 2156 static void 2157 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 2158 { 2159 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 2160 struct spdk_bdev_io *bdev_io; 2161 2162 spdk_iobuf_channel_fini(&ch->iobuf); 2163 2164 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 2165 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2166 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2167 ch->per_thread_cache_count--; 2168 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2169 } 2170 2171 assert(ch->per_thread_cache_count == 0); 2172 } 2173 2174 static int 2175 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 2176 { 2177 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 2178 struct spdk_bdev_io *bdev_io; 2179 uint32_t i; 2180 int rc; 2181 2182 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", 2183 g_bdev_opts.iobuf_small_cache_size, 2184 g_bdev_opts.iobuf_large_cache_size); 2185 if (rc != 0) { 2186 SPDK_ERRLOG("Failed to create 
iobuf channel: %s\n", spdk_strerror(-rc)); 2187 return -1; 2188 } 2189 2190 STAILQ_INIT(&ch->per_thread_cache); 2191 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 2192 2193 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 2194 ch->per_thread_cache_count = 0; 2195 for (i = 0; i < ch->bdev_io_cache_size; i++) { 2196 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2197 if (bdev_io == NULL) { 2198 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 2199 assert(false); 2200 bdev_mgmt_channel_destroy(io_device, ctx_buf); 2201 return -1; 2202 } 2203 ch->per_thread_cache_count++; 2204 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2205 } 2206 2207 TAILQ_INIT(&ch->shared_resources); 2208 TAILQ_INIT(&ch->io_wait_queue); 2209 2210 return 0; 2211 } 2212 2213 static void 2214 bdev_init_complete(int rc) 2215 { 2216 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 2217 void *cb_arg = g_init_cb_arg; 2218 struct spdk_bdev_module *m; 2219 2220 g_bdev_mgr.init_complete = true; 2221 g_init_cb_fn = NULL; 2222 g_init_cb_arg = NULL; 2223 2224 /* 2225 * For modules that need to know when subsystem init is complete, 2226 * inform them now. 2227 */ 2228 if (rc == 0) { 2229 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2230 if (m->init_complete) { 2231 m->init_complete(); 2232 } 2233 } 2234 } 2235 2236 cb_fn(cb_arg, rc); 2237 } 2238 2239 static bool 2240 bdev_module_all_actions_completed(void) 2241 { 2242 struct spdk_bdev_module *m; 2243 2244 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2245 if (m->internal.action_in_progress > 0) { 2246 return false; 2247 } 2248 } 2249 return true; 2250 } 2251 2252 static void 2253 bdev_module_action_complete(void) 2254 { 2255 /* 2256 * Don't finish bdev subsystem initialization if 2257 * module pre-initialization is still in progress, or 2258 * the subsystem been already initialized. 2259 */ 2260 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 2261 return; 2262 } 2263 2264 /* 2265 * Check all bdev modules for inits/examinations in progress. If any 2266 * exist, return immediately since we cannot finish bdev subsystem 2267 * initialization until all are completed. 2268 */ 2269 if (!bdev_module_all_actions_completed()) { 2270 return; 2271 } 2272 2273 /* 2274 * Modules already finished initialization - now that all 2275 * the bdev modules have finished their asynchronous I/O 2276 * processing, the entire bdev layer can be marked as complete. 
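 * As an illustrative sketch (hypothetical module, not part of this file), a module
 * that sets .async_init = true would return 0 from module_init() after kicking off
 * its asynchronous probe and later, from the probe completion path, call:
 *
 *     spdk_bdev_module_init_done(&my_bdev_module);
 *
 * which drops its action_in_progress count and lets bdev_module_action_complete()
 * finish subsystem initialization once every module has done the same.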
2277 */ 2278 bdev_init_complete(0); 2279 } 2280 2281 static void 2282 bdev_module_action_done(struct spdk_bdev_module *module) 2283 { 2284 spdk_spin_lock(&module->internal.spinlock); 2285 assert(module->internal.action_in_progress > 0); 2286 module->internal.action_in_progress--; 2287 spdk_spin_unlock(&module->internal.spinlock); 2288 bdev_module_action_complete(); 2289 } 2290 2291 void 2292 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 2293 { 2294 assert(module->async_init); 2295 bdev_module_action_done(module); 2296 } 2297 2298 void 2299 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 2300 { 2301 bdev_module_action_done(module); 2302 } 2303 2304 /** The last initialized bdev module */ 2305 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 2306 2307 static void 2308 bdev_init_failed(void *cb_arg) 2309 { 2310 struct spdk_bdev_module *module = cb_arg; 2311 2312 spdk_spin_lock(&module->internal.spinlock); 2313 assert(module->internal.action_in_progress > 0); 2314 module->internal.action_in_progress--; 2315 spdk_spin_unlock(&module->internal.spinlock); 2316 bdev_init_complete(-1); 2317 } 2318 2319 static int 2320 bdev_modules_init(void) 2321 { 2322 struct spdk_bdev_module *module; 2323 int rc = 0; 2324 2325 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2326 g_resume_bdev_module = module; 2327 if (module->async_init) { 2328 spdk_spin_lock(&module->internal.spinlock); 2329 module->internal.action_in_progress = 1; 2330 spdk_spin_unlock(&module->internal.spinlock); 2331 } 2332 rc = module->module_init(); 2333 if (rc != 0) { 2334 /* Bump action_in_progress to prevent other modules from completion of modules_init 2335 * Send message to defer application shutdown until resources are cleaned up */ 2336 spdk_spin_lock(&module->internal.spinlock); 2337 module->internal.action_in_progress = 1; 2338 spdk_spin_unlock(&module->internal.spinlock); 2339 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 2340 return rc; 2341 } 2342 } 2343 2344 g_resume_bdev_module = NULL; 2345 return 0; 2346 } 2347 2348 void 2349 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 2350 { 2351 int rc = 0; 2352 char mempool_name[32]; 2353 2354 assert(cb_fn != NULL); 2355 2356 g_init_cb_fn = cb_fn; 2357 g_init_cb_arg = cb_arg; 2358 2359 spdk_notify_type_register("bdev_register"); 2360 spdk_notify_type_register("bdev_unregister"); 2361 2362 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2363 2364 rc = spdk_iobuf_register_module("bdev"); 2365 if (rc != 0) { 2366 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2367 bdev_init_complete(-1); 2368 return; 2369 } 2370 2371 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2372 g_bdev_opts.bdev_io_pool_size, 2373 sizeof(struct spdk_bdev_io) + 2374 bdev_module_get_max_ctx_size(), 2375 0, 2376 SPDK_ENV_NUMA_ID_ANY); 2377 2378 if (g_bdev_mgr.bdev_io_pool == NULL) { 2379 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2380 bdev_init_complete(-1); 2381 return; 2382 } 2383 2384 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2385 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2386 if (!g_bdev_mgr.zero_buffer) { 2387 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2388 bdev_init_complete(-1); 2389 return; 2390 } 2391 2392 #ifdef SPDK_CONFIG_VTUNE 2393 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2394 #endif 2395 2396 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 2397 
bdev_mgmt_channel_destroy, 2398 sizeof(struct spdk_bdev_mgmt_channel), 2399 "bdev_mgr"); 2400 2401 rc = bdev_modules_init(); 2402 g_bdev_mgr.module_init_complete = true; 2403 if (rc != 0) { 2404 SPDK_ERRLOG("bdev modules init failed\n"); 2405 return; 2406 } 2407 2408 bdev_module_action_complete(); 2409 } 2410 2411 static void 2412 bdev_mgr_unregister_cb(void *io_device) 2413 { 2414 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2415 2416 if (g_bdev_mgr.bdev_io_pool) { 2417 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2418 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2419 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2420 g_bdev_opts.bdev_io_pool_size); 2421 } 2422 2423 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2424 } 2425 2426 spdk_free(g_bdev_mgr.zero_buffer); 2427 2428 bdev_examine_allowlist_free(); 2429 2430 cb_fn(g_fini_cb_arg); 2431 g_fini_cb_fn = NULL; 2432 g_fini_cb_arg = NULL; 2433 g_bdev_mgr.init_complete = false; 2434 g_bdev_mgr.module_init_complete = false; 2435 } 2436 2437 static void 2438 bdev_module_fini_iter(void *arg) 2439 { 2440 struct spdk_bdev_module *bdev_module; 2441 2442 /* FIXME: Handling initialization failures is broken now, 2443 * so we won't even try cleaning up after successfully 2444 * initialized modules. if module_init_complete is false, 2445 * just call spdk_bdev_mgr_unregister_cb 2446 */ 2447 if (!g_bdev_mgr.module_init_complete) { 2448 bdev_mgr_unregister_cb(NULL); 2449 return; 2450 } 2451 2452 /* Start iterating from the last touched module */ 2453 if (!g_resume_bdev_module) { 2454 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2455 } else { 2456 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2457 internal.tailq); 2458 } 2459 2460 while (bdev_module) { 2461 if (bdev_module->async_fini) { 2462 /* Save our place so we can resume later. We must 2463 * save the variable here, before calling module_fini() 2464 * below, because in some cases the module may immediately 2465 * call spdk_bdev_module_fini_done() and re-enter 2466 * this function to continue iterating. */ 2467 g_resume_bdev_module = bdev_module; 2468 } 2469 2470 if (bdev_module->module_fini) { 2471 bdev_module->module_fini(); 2472 } 2473 2474 if (bdev_module->async_fini) { 2475 return; 2476 } 2477 2478 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2479 internal.tailq); 2480 } 2481 2482 g_resume_bdev_module = NULL; 2483 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2484 } 2485 2486 void 2487 spdk_bdev_module_fini_done(void) 2488 { 2489 if (spdk_get_thread() != g_fini_thread) { 2490 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2491 } else { 2492 bdev_module_fini_iter(NULL); 2493 } 2494 } 2495 2496 static void 2497 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2498 { 2499 struct spdk_bdev *bdev = cb_arg; 2500 2501 if (bdeverrno && bdev) { 2502 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2503 bdev->name); 2504 2505 /* 2506 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2507 * bdev; try to continue by manually removing this bdev from the list and continue 2508 * with the next bdev in the list. 
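 * The removed bdev is knowingly leaked in that case; leaking one object is
 * preferable to stalling shutdown of the remaining bdevs.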
2509 */ 2510 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2511 } 2512 2513 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2514 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2515 /* 2516 * Bdev module finish need to be deferred as we might be in the middle of some context 2517 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2518 * after returning. 2519 */ 2520 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2521 return; 2522 } 2523 2524 /* 2525 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 2526 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2527 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2528 * base bdevs. 2529 * 2530 * Also, walk the list in the reverse order. 2531 */ 2532 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2533 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2534 spdk_spin_lock(&bdev->internal.spinlock); 2535 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2536 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2537 spdk_spin_unlock(&bdev->internal.spinlock); 2538 continue; 2539 } 2540 spdk_spin_unlock(&bdev->internal.spinlock); 2541 2542 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2543 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2544 return; 2545 } 2546 2547 /* 2548 * If any bdev fails to unclaim underlying bdev properly, we may face the 2549 * case of bdev list consisting of claimed bdevs only (if claims are managed 2550 * correctly, this would mean there's a loop in the claims graph which is 2551 * clearly impossible). Warn and unregister last bdev on the list then. 2552 */ 2553 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2554 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2555 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2556 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2557 return; 2558 } 2559 } 2560 2561 static void 2562 bdev_module_fini_start_iter(void *arg) 2563 { 2564 struct spdk_bdev_module *bdev_module; 2565 2566 if (!g_resume_bdev_module) { 2567 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2568 } else { 2569 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2570 } 2571 2572 while (bdev_module) { 2573 if (bdev_module->async_fini_start) { 2574 /* Save our place so we can resume later. We must 2575 * save the variable here, before calling fini_start() 2576 * below, because in some cases the module may immediately 2577 * call spdk_bdev_module_fini_start_done() and re-enter 2578 * this function to continue iterating. 
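 * Illustrative sketch (hypothetical module): with .async_fini_start = true,
 * fini_start() may simply begin tearing down its internal state and return; once
 * that work finishes, the module calls
 *
 *     spdk_bdev_module_fini_start_done();
 *
 * which is forwarded to g_fini_thread if needed and resumes iteration at
 * g_resume_bdev_module.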
*/ 2579 g_resume_bdev_module = bdev_module; 2580 } 2581 2582 if (bdev_module->fini_start) { 2583 bdev_module->fini_start(); 2584 } 2585 2586 if (bdev_module->async_fini_start) { 2587 return; 2588 } 2589 2590 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2591 } 2592 2593 g_resume_bdev_module = NULL; 2594 2595 bdev_finish_unregister_bdevs_iter(NULL, 0); 2596 } 2597 2598 void 2599 spdk_bdev_module_fini_start_done(void) 2600 { 2601 if (spdk_get_thread() != g_fini_thread) { 2602 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2603 } else { 2604 bdev_module_fini_start_iter(NULL); 2605 } 2606 } 2607 2608 static void 2609 bdev_finish_wait_for_examine_done(void *cb_arg) 2610 { 2611 bdev_module_fini_start_iter(NULL); 2612 } 2613 2614 static void bdev_open_async_fini(void); 2615 2616 void 2617 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2618 { 2619 int rc; 2620 2621 assert(cb_fn != NULL); 2622 2623 g_fini_thread = spdk_get_thread(); 2624 2625 g_fini_cb_fn = cb_fn; 2626 g_fini_cb_arg = cb_arg; 2627 2628 bdev_open_async_fini(); 2629 2630 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2631 if (rc != 0) { 2632 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2633 bdev_finish_wait_for_examine_done(NULL); 2634 } 2635 } 2636 2637 struct spdk_bdev_io * 2638 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2639 { 2640 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2641 struct spdk_bdev_io *bdev_io; 2642 2643 if (ch->per_thread_cache_count > 0) { 2644 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2645 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2646 ch->per_thread_cache_count--; 2647 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2648 /* 2649 * Don't try to look for bdev_ios in the global pool if there are 2650 * waiters on bdev_ios - we don't want this caller to jump the line. 2651 */ 2652 bdev_io = NULL; 2653 } else { 2654 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2655 } 2656 2657 return bdev_io; 2658 } 2659 2660 void 2661 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2662 { 2663 struct spdk_bdev_mgmt_channel *ch; 2664 2665 assert(bdev_io != NULL); 2666 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2667 2668 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2669 2670 if (bdev_io->internal.f.has_buf) { 2671 bdev_io_put_buf(bdev_io); 2672 } 2673 2674 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2675 ch->per_thread_cache_count++; 2676 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2677 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2678 struct spdk_bdev_io_wait_entry *entry; 2679 2680 entry = TAILQ_FIRST(&ch->io_wait_queue); 2681 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2682 entry->cb_fn(entry->cb_arg); 2683 } 2684 } else { 2685 /* We should never have a full cache with entries on the io wait queue. 
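 * Entries are only added while the per-thread cache is empty, and every
 * spdk_bdev_free_io() that lands a bdev_io back in the cache drains waiters right
 * away, so the cache cannot fill up while the wait queue is non-empty.
 *
 * Caller-side sketch of how an entry ends up on this queue (illustrative names,
 * not from this file):
 *
 *     rc = spdk_bdev_read_blocks(desc, io_ch, buf, offset, num_blocks, read_done, ctx);
 *     if (rc == -ENOMEM) {
 *             ctx->bdev_io_wait.bdev = spdk_bdev_desc_get_bdev(desc);
 *             ctx->bdev_io_wait.cb_fn = retry_read;
 *             ctx->bdev_io_wait.cb_arg = ctx;
 *             spdk_bdev_queue_io_wait(ctx->bdev_io_wait.bdev, io_ch, &ctx->bdev_io_wait);
 *     }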
*/
2686 assert(TAILQ_EMPTY(&ch->io_wait_queue));
2687 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
2688 }
2689 }
2690
2691 static bool
2692 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit)
2693 {
2694 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);
2695
2696 switch (limit) {
2697 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
2698 return true;
2699 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
2700 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
2701 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
2702 return false;
2703 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES:
2704 default:
2705 return false;
2706 }
2707 }
2708
2709 static bool
2710 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io)
2711 {
2712 switch (bdev_io->type) {
2713 case SPDK_BDEV_IO_TYPE_NVME_IO:
2714 case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2715 case SPDK_BDEV_IO_TYPE_READ:
2716 case SPDK_BDEV_IO_TYPE_WRITE:
2717 return true;
2718 case SPDK_BDEV_IO_TYPE_ZCOPY:
2719 if (bdev_io->u.bdev.zcopy.start) {
2720 return true;
2721 } else {
2722 return false;
2723 }
2724 default:
2725 return false;
2726 }
2727 }
2728
2729 static bool
2730 bdev_is_read_io(struct spdk_bdev_io *bdev_io)
2731 {
2732 switch (bdev_io->type) {
2733 case SPDK_BDEV_IO_TYPE_NVME_IO:
2734 case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2735 /* Bit 1 (0x2) set for read operation */
2736 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) {
2737 return true;
2738 } else {
2739 return false;
2740 }
2741 case SPDK_BDEV_IO_TYPE_READ:
2742 return true;
2743 case SPDK_BDEV_IO_TYPE_ZCOPY:
2744 /* Populate to read from disk */
2745 if (bdev_io->u.bdev.zcopy.populate) {
2746 return true;
2747 } else {
2748 return false;
2749 }
2750 default:
2751 return false;
2752 }
2753 }
2754
2755 static uint64_t
2756 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io)
2757 {
2758 uint32_t blocklen = bdev_io_get_block_size(bdev_io);
2759
2760 switch (bdev_io->type) {
2761 case SPDK_BDEV_IO_TYPE_NVME_IO:
2762 case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2763 return bdev_io->u.nvme_passthru.nbytes;
2764 case SPDK_BDEV_IO_TYPE_READ:
2765 case SPDK_BDEV_IO_TYPE_WRITE:
2766 return bdev_io->u.bdev.num_blocks * blocklen;
2767 case SPDK_BDEV_IO_TYPE_ZCOPY:
2768 /* Track the data in the start phase only */
2769 if (bdev_io->u.bdev.zcopy.start) {
2770 return bdev_io->u.bdev.num_blocks * blocklen;
2771 } else {
2772 return 0;
2773 }
2774 default:
2775 return 0;
2776 }
2777 }
2778
2779 static inline bool
2780 bdev_qos_rw_queue_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta)
2781 {
2782 int64_t remaining_this_timeslice;
2783
2784 if (!limit->max_per_timeslice) {
2785 /* The QoS is disabled */
2786 return false;
2787 }
2788
2789 remaining_this_timeslice = __atomic_sub_fetch(&limit->remaining_this_timeslice, delta,
2790 __ATOMIC_RELAXED);
2791 if (remaining_this_timeslice + (int64_t)delta > 0) {
2792 /* There was still quota left for this delta -> the IO shouldn't be queued.
2793 *
2794 * We allow a slight quota overrun here so an IO bigger than the per-timeslice
2795 * quota can be allowed once in a while. Such an overrun is then taken into account in
2796 * the QoS poller, where the next timeslice quota is calculated.
2797 */
2798 return false;
2799 }
2800
2801 /* There was no quota left for this delta -> the IO should be queued.
2802 * The remaining_this_timeslice must be rewound so it reflects the real
2803 * amount of IOs or bytes allowed.
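 * e.g. if remaining_this_timeslice was already zero or negative when this I/O
 * arrived, the __atomic_sub_fetch() above consumed quota that was never there;
 * adding the delta back leaves the counter reflecting only the I/O that were
 * actually admitted, and this I/O waits on qos_queued_io for the next timeslice.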
2804 */ 2805 __atomic_add_fetch( 2806 &limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2807 return true; 2808 } 2809 2810 static inline void 2811 bdev_qos_rw_rewind_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2812 { 2813 __atomic_add_fetch(&limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2814 } 2815 2816 static bool 2817 bdev_qos_rw_iops_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2818 { 2819 return bdev_qos_rw_queue_io(limit, io, 1); 2820 } 2821 2822 static void 2823 bdev_qos_rw_iops_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2824 { 2825 bdev_qos_rw_rewind_io(limit, io, 1); 2826 } 2827 2828 static bool 2829 bdev_qos_rw_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2830 { 2831 return bdev_qos_rw_queue_io(limit, io, bdev_get_io_size_in_byte(io)); 2832 } 2833 2834 static void 2835 bdev_qos_rw_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2836 { 2837 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2838 } 2839 2840 static bool 2841 bdev_qos_r_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2842 { 2843 if (bdev_is_read_io(io) == false) { 2844 return false; 2845 } 2846 2847 return bdev_qos_rw_bps_queue(limit, io); 2848 } 2849 2850 static void 2851 bdev_qos_r_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2852 { 2853 if (bdev_is_read_io(io) != false) { 2854 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2855 } 2856 } 2857 2858 static bool 2859 bdev_qos_w_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2860 { 2861 if (bdev_is_read_io(io) == true) { 2862 return false; 2863 } 2864 2865 return bdev_qos_rw_bps_queue(limit, io); 2866 } 2867 2868 static void 2869 bdev_qos_w_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2870 { 2871 if (bdev_is_read_io(io) != true) { 2872 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2873 } 2874 } 2875 2876 static void 2877 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2878 { 2879 int i; 2880 2881 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2882 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2883 qos->rate_limits[i].queue_io = NULL; 2884 continue; 2885 } 2886 2887 switch (i) { 2888 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2889 qos->rate_limits[i].queue_io = bdev_qos_rw_iops_queue; 2890 qos->rate_limits[i].rewind_quota = bdev_qos_rw_iops_rewind_quota; 2891 break; 2892 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2893 qos->rate_limits[i].queue_io = bdev_qos_rw_bps_queue; 2894 qos->rate_limits[i].rewind_quota = bdev_qos_rw_bps_rewind_quota; 2895 break; 2896 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2897 qos->rate_limits[i].queue_io = bdev_qos_r_bps_queue; 2898 qos->rate_limits[i].rewind_quota = bdev_qos_r_bps_rewind_quota; 2899 break; 2900 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2901 qos->rate_limits[i].queue_io = bdev_qos_w_bps_queue; 2902 qos->rate_limits[i].rewind_quota = bdev_qos_w_bps_rewind_quota; 2903 break; 2904 default: 2905 break; 2906 } 2907 } 2908 } 2909 2910 static void 2911 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2912 struct spdk_bdev_io *bdev_io, 2913 enum spdk_bdev_io_status status) 2914 { 2915 bdev_io->internal.f.in_submit_request = true; 2916 bdev_io_increment_outstanding(bdev_ch, bdev_ch->shared_resource); 2917 spdk_bdev_io_complete(bdev_io, status); 2918 bdev_io->internal.f.in_submit_request = false; 
2919 } 2920 2921 static inline void 2922 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2923 { 2924 struct spdk_bdev *bdev = bdev_io->bdev; 2925 struct spdk_io_channel *ch = bdev_ch->channel; 2926 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2927 2928 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2929 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2930 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2931 2932 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2933 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2934 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2935 SPDK_BDEV_IO_STATUS_SUCCESS); 2936 return; 2937 } 2938 } 2939 2940 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2941 bdev_io->bdev->split_on_write_unit && 2942 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2943 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2944 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2945 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2946 return; 2947 } 2948 2949 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2950 bdev_io_increment_outstanding(bdev_ch, shared_resource); 2951 bdev_io->internal.f.in_submit_request = true; 2952 bdev_submit_request(bdev, ch, bdev_io); 2953 bdev_io->internal.f.in_submit_request = false; 2954 } else { 2955 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT); 2956 if (shared_resource->nomem_threshold == 0 && shared_resource->io_outstanding == 0) { 2957 /* Special case when we have nomem IOs and no outstanding IOs which completions 2958 * could trigger retry of queued IOs */ 2959 bdev_shared_ch_retry_io(shared_resource); 2960 } 2961 } 2962 } 2963 2964 static bool 2965 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2966 { 2967 int i; 2968 2969 if (bdev_qos_io_to_limit(bdev_io) == true) { 2970 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2971 if (!qos->rate_limits[i].queue_io) { 2972 continue; 2973 } 2974 2975 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2976 bdev_io) == true) { 2977 for (i -= 1; i >= 0 ; i--) { 2978 if (!qos->rate_limits[i].queue_io) { 2979 continue; 2980 } 2981 2982 qos->rate_limits[i].rewind_quota(&qos->rate_limits[i], bdev_io); 2983 } 2984 return true; 2985 } 2986 } 2987 } 2988 2989 return false; 2990 } 2991 2992 static int 2993 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2994 { 2995 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2996 int submitted_ios = 0; 2997 2998 TAILQ_FOREACH_SAFE(bdev_io, &ch->qos_queued_io, internal.link, tmp) { 2999 if (!bdev_qos_queue_io(qos, bdev_io)) { 3000 TAILQ_REMOVE(&ch->qos_queued_io, bdev_io, internal.link); 3001 bdev_io_do_submit(ch, bdev_io); 3002 3003 submitted_ios++; 3004 } 3005 } 3006 3007 return submitted_ios; 3008 } 3009 3010 static void 3011 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 3012 { 3013 int rc; 3014 3015 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 3016 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 3017 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 3018 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 3019 &bdev_io->internal.waitq_entry); 3020 if (rc != 0) { 3021 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 3022 bdev_io->internal.status = 
SPDK_BDEV_IO_STATUS_FAILED; 3023 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3024 } 3025 } 3026 3027 static bool 3028 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 3029 { 3030 uint32_t io_boundary; 3031 struct spdk_bdev *bdev = bdev_io->bdev; 3032 uint32_t max_segment_size = bdev->max_segment_size; 3033 uint32_t max_size = bdev->max_rw_size; 3034 int max_segs = bdev->max_num_segments; 3035 3036 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 3037 io_boundary = bdev->write_unit_size; 3038 } else if (bdev->split_on_optimal_io_boundary) { 3039 io_boundary = bdev->optimal_io_boundary; 3040 } else { 3041 io_boundary = 0; 3042 } 3043 3044 if (spdk_likely(!io_boundary && !max_segs && !max_segment_size && !max_size)) { 3045 return false; 3046 } 3047 3048 if (io_boundary) { 3049 uint64_t start_stripe, end_stripe; 3050 3051 start_stripe = bdev_io->u.bdev.offset_blocks; 3052 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 3053 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 3054 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 3055 start_stripe >>= spdk_u32log2(io_boundary); 3056 end_stripe >>= spdk_u32log2(io_boundary); 3057 } else { 3058 start_stripe /= io_boundary; 3059 end_stripe /= io_boundary; 3060 } 3061 3062 if (start_stripe != end_stripe) { 3063 return true; 3064 } 3065 } 3066 3067 if (max_segs) { 3068 if (bdev_io->u.bdev.iovcnt > max_segs) { 3069 return true; 3070 } 3071 } 3072 3073 if (max_segment_size) { 3074 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 3075 if (bdev_io->u.bdev.iovs[i].iov_len > max_segment_size) { 3076 return true; 3077 } 3078 } 3079 } 3080 3081 if (max_size) { 3082 if (bdev_io->u.bdev.num_blocks > max_size) { 3083 return true; 3084 } 3085 } 3086 3087 return false; 3088 } 3089 3090 static bool 3091 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 3092 { 3093 uint32_t num_unmap_segments; 3094 3095 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 3096 return false; 3097 } 3098 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 3099 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 3100 return true; 3101 } 3102 3103 return false; 3104 } 3105 3106 static bool 3107 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 3108 { 3109 if (!bdev_io->bdev->max_write_zeroes) { 3110 return false; 3111 } 3112 3113 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 3114 return true; 3115 } 3116 3117 return false; 3118 } 3119 3120 static bool 3121 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 3122 { 3123 if (bdev_io->bdev->max_copy != 0 && 3124 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 3125 return true; 3126 } 3127 3128 return false; 3129 } 3130 3131 static bool 3132 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 3133 { 3134 switch (bdev_io->type) { 3135 case SPDK_BDEV_IO_TYPE_READ: 3136 case SPDK_BDEV_IO_TYPE_WRITE: 3137 return bdev_rw_should_split(bdev_io); 3138 case SPDK_BDEV_IO_TYPE_UNMAP: 3139 return bdev_unmap_should_split(bdev_io); 3140 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3141 return bdev_write_zeroes_should_split(bdev_io); 3142 case SPDK_BDEV_IO_TYPE_COPY: 3143 return bdev_copy_should_split(bdev_io); 3144 default: 3145 return false; 3146 } 3147 } 3148 3149 static uint32_t 3150 _to_next_boundary(uint64_t offset, uint32_t boundary) 3151 { 3152 return (boundary - (offset % boundary)); 3153 } 3154 3155 static void 
bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 3156 3157 static void _bdev_rw_split(void *_bdev_io); 3158 3159 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 3160 3161 static void 3162 _bdev_unmap_split(void *_bdev_io) 3163 { 3164 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 3165 } 3166 3167 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 3168 3169 static void 3170 _bdev_write_zeroes_split(void *_bdev_io) 3171 { 3172 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 3173 } 3174 3175 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 3176 3177 static void 3178 _bdev_copy_split(void *_bdev_io) 3179 { 3180 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 3181 } 3182 3183 static int 3184 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 3185 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 3186 { 3187 int rc; 3188 uint64_t current_offset, current_remaining, current_src_offset; 3189 spdk_bdev_io_wait_cb io_wait_fn; 3190 3191 current_offset = *offset; 3192 current_remaining = *remaining; 3193 3194 assert(bdev_io->internal.f.split); 3195 3196 bdev_io->internal.split.outstanding++; 3197 3198 io_wait_fn = _bdev_rw_split; 3199 switch (bdev_io->type) { 3200 case SPDK_BDEV_IO_TYPE_READ: 3201 assert(bdev_io->u.bdev.accel_sequence == NULL); 3202 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 3203 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3204 iov, iovcnt, md_buf, current_offset, 3205 num_blocks, 3206 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 3207 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL, 3208 NULL, 3209 bdev_io->u.bdev.dif_check_flags, 3210 bdev_io_split_done, bdev_io); 3211 break; 3212 case SPDK_BDEV_IO_TYPE_WRITE: 3213 assert(bdev_io->u.bdev.accel_sequence == NULL); 3214 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 3215 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3216 iov, iovcnt, md_buf, current_offset, 3217 num_blocks, 3218 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 3219 bdev_io_use_memory_domain(bdev_io) ? 
bdev_io->internal.memory_domain_ctx : NULL, 3220 NULL, 3221 bdev_io->u.bdev.dif_check_flags, 3222 bdev_io->u.bdev.nvme_cdw12.raw, 3223 bdev_io->u.bdev.nvme_cdw13.raw, 3224 bdev_io_split_done, bdev_io); 3225 break; 3226 case SPDK_BDEV_IO_TYPE_UNMAP: 3227 io_wait_fn = _bdev_unmap_split; 3228 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 3229 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3230 current_offset, num_blocks, 3231 bdev_io_split_done, bdev_io); 3232 break; 3233 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3234 io_wait_fn = _bdev_write_zeroes_split; 3235 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 3236 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3237 current_offset, num_blocks, 3238 bdev_io_split_done, bdev_io); 3239 break; 3240 case SPDK_BDEV_IO_TYPE_COPY: 3241 io_wait_fn = _bdev_copy_split; 3242 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 3243 (current_offset - bdev_io->u.bdev.offset_blocks); 3244 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 3245 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3246 current_offset, current_src_offset, num_blocks, 3247 bdev_io_split_done, bdev_io); 3248 break; 3249 default: 3250 assert(false); 3251 rc = -EINVAL; 3252 break; 3253 } 3254 3255 if (rc == 0) { 3256 current_offset += num_blocks; 3257 current_remaining -= num_blocks; 3258 bdev_io->internal.split.current_offset_blocks = current_offset; 3259 bdev_io->internal.split.remaining_num_blocks = current_remaining; 3260 *offset = current_offset; 3261 *remaining = current_remaining; 3262 } else { 3263 bdev_io->internal.split.outstanding--; 3264 if (rc == -ENOMEM) { 3265 if (bdev_io->internal.split.outstanding == 0) { 3266 /* No I/O is outstanding. Hence we should wait here. */ 3267 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 3268 } 3269 } else { 3270 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3271 if (bdev_io->internal.split.outstanding == 0) { 3272 bdev_ch_remove_from_io_submitted(bdev_io); 3273 spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id, 3274 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx, 3275 bdev_io->internal.ch->queue_depth); 3276 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3277 } 3278 } 3279 } 3280 3281 return rc; 3282 } 3283 3284 static void 3285 _bdev_rw_split(void *_bdev_io) 3286 { 3287 struct iovec *parent_iov, *iov; 3288 struct spdk_bdev_io *bdev_io = _bdev_io; 3289 struct spdk_bdev *bdev = bdev_io->bdev; 3290 uint64_t parent_offset, current_offset, remaining; 3291 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 3292 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 3293 uint32_t iovcnt, iov_len, child_iovsize; 3294 uint32_t blocklen; 3295 uint32_t io_boundary; 3296 uint32_t max_segment_size = bdev->max_segment_size; 3297 uint32_t max_child_iovcnt = bdev->max_num_segments; 3298 uint32_t max_size = bdev->max_rw_size; 3299 void *md_buf = NULL; 3300 int rc; 3301 3302 blocklen = bdev_io_get_block_size(bdev_io); 3303 3304 max_size = max_size ? max_size : UINT32_MAX; 3305 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 3306 max_child_iovcnt = max_child_iovcnt ? 
spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 3307 SPDK_BDEV_IO_NUM_CHILD_IOV; 3308 3309 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 3310 io_boundary = bdev->write_unit_size; 3311 } else if (bdev->split_on_optimal_io_boundary) { 3312 io_boundary = bdev->optimal_io_boundary; 3313 } else { 3314 io_boundary = UINT32_MAX; 3315 } 3316 3317 assert(bdev_io->internal.f.split); 3318 3319 remaining = bdev_io->internal.split.remaining_num_blocks; 3320 current_offset = bdev_io->internal.split.current_offset_blocks; 3321 parent_offset = bdev_io->u.bdev.offset_blocks; 3322 parent_iov_offset = (current_offset - parent_offset) * blocklen; 3323 parent_iovcnt = bdev_io->u.bdev.iovcnt; 3324 3325 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 3326 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3327 if (parent_iov_offset < parent_iov->iov_len) { 3328 break; 3329 } 3330 parent_iov_offset -= parent_iov->iov_len; 3331 } 3332 3333 child_iovcnt = 0; 3334 while (remaining > 0 && parent_iovpos < parent_iovcnt && 3335 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 3336 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3337 to_next_boundary = spdk_min(remaining, to_next_boundary); 3338 to_next_boundary = spdk_min(max_size, to_next_boundary); 3339 to_next_boundary_bytes = to_next_boundary * blocklen; 3340 3341 iov = &bdev_io->child_iov[child_iovcnt]; 3342 iovcnt = 0; 3343 3344 if (bdev_io->u.bdev.md_buf) { 3345 md_buf = (char *)bdev_io->u.bdev.md_buf + 3346 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3347 } 3348 3349 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3350 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3351 iovcnt < child_iovsize) { 3352 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3353 iov_len = parent_iov->iov_len - parent_iov_offset; 3354 3355 iov_len = spdk_min(iov_len, max_segment_size); 3356 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 3357 to_next_boundary_bytes -= iov_len; 3358 3359 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 3360 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 3361 3362 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 3363 parent_iov_offset += iov_len; 3364 } else { 3365 parent_iovpos++; 3366 parent_iov_offset = 0; 3367 } 3368 child_iovcnt++; 3369 iovcnt++; 3370 } 3371 3372 if (to_next_boundary_bytes > 0) { 3373 /* We had to stop this child I/O early because we ran out of 3374 * child_iov space or were limited by max_num_segments. 3375 * Ensure the iovs to be aligned with block size and 3376 * then adjust to_next_boundary before starting the 3377 * child I/O. 
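 * e.g. with 512-byte blocks, if the gathered iovs stop 700 bytes short of the
 * boundary, to_last_block_bytes is 700 % 512 = 188; the tail iovs are trimmed by
 * 512 - 188 = 324 bytes and to_next_boundary shrinks by (700 + 324) / 512 = 2
 * blocks, so the child I/O covers a whole number of blocks.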
3378 */
3379 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV ||
3380 iovcnt == child_iovsize);
3381 to_last_block_bytes = to_next_boundary_bytes % blocklen;
3382 if (to_last_block_bytes != 0) {
3383 uint32_t child_iovpos = child_iovcnt - 1;
3384 /* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV
3385 * so the loop will naturally end
3386 */
3387
3388 to_last_block_bytes = blocklen - to_last_block_bytes;
3389 to_next_boundary_bytes += to_last_block_bytes;
3390 while (to_last_block_bytes > 0 && iovcnt > 0) {
3391 iov_len = spdk_min(to_last_block_bytes,
3392 bdev_io->child_iov[child_iovpos].iov_len);
3393 bdev_io->child_iov[child_iovpos].iov_len -= iov_len;
3394 if (bdev_io->child_iov[child_iovpos].iov_len == 0) {
3395 child_iovpos--;
3396 if (--iovcnt == 0) {
3397 /* If the child IO covers less than a block, just return; the split resumes
3398 * once the outstanding children complete. If this was the first child IO of
3399 * the split round (nothing outstanding), fail the parent with an error.
3400 */
3401 if (bdev_io->internal.split.outstanding == 0) {
3402 SPDK_ERRLOG("The first child io was less than a block size\n");
3403 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3404 bdev_ch_remove_from_io_submitted(bdev_io);
3405 spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id,
3406 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx,
3407 bdev_io->internal.ch->queue_depth);
3408 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
3409 }
3410
3411 return;
3412 }
3413 }
3414
3415 to_last_block_bytes -= iov_len;
3416
3417 if (parent_iov_offset == 0) {
3418 parent_iovpos--;
3419 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len;
3420 }
3421 parent_iov_offset -= iov_len;
3422 }
3423
3424 assert(to_last_block_bytes == 0);
3425 }
3426 to_next_boundary -= to_next_boundary_bytes / blocklen;
3427 }
3428
3429 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary,
3430 &current_offset, &remaining);
3431 if (spdk_unlikely(rc)) {
3432 return;
3433 }
3434 }
3435 }
3436
3437 static void
3438 bdev_unmap_split(struct spdk_bdev_io *bdev_io)
3439 {
3440 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks;
3441 uint32_t num_children_reqs = 0;
3442 int rc;
3443
3444 assert(bdev_io->internal.f.split);
3445
3446 offset = bdev_io->internal.split.current_offset_blocks;
3447 remaining = bdev_io->internal.split.remaining_num_blocks;
3448 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments;
3449
3450 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
3451 unmap_blocks = spdk_min(remaining, max_unmap_blocks);
3452
3453 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks,
3454 &offset, &remaining);
3455 if (spdk_likely(rc == 0)) {
3456 num_children_reqs++;
3457 } else {
3458 return;
3459 }
3460 }
3461 }
3462
3463 static void
3464 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io)
3465 {
3466 uint64_t offset, write_zeroes_blocks, remaining;
3467 uint32_t num_children_reqs = 0;
3468 int rc;
3469
3470 assert(bdev_io->internal.f.split);
3471
3472 offset = bdev_io->internal.split.current_offset_blocks;
3473 remaining = bdev_io->internal.split.remaining_num_blocks;
3474
3475 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
3476 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes);
3477
3478 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks,
3479 &offset, &remaining);
3480 if (spdk_likely(rc == 0)) {
3481 num_children_reqs++;
3482 } else {
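			/* bdev_io_split_submit() has already arranged for the split to resume
			 * later (ENOMEM) or failed the parent I/O, so stop issuing children
			 * for now. */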
3483 return; 3484 } 3485 } 3486 } 3487 3488 static void 3489 bdev_copy_split(struct spdk_bdev_io *bdev_io) 3490 { 3491 uint64_t offset, copy_blocks, remaining; 3492 uint32_t num_children_reqs = 0; 3493 int rc; 3494 3495 assert(bdev_io->internal.f.split); 3496 3497 offset = bdev_io->internal.split.current_offset_blocks; 3498 remaining = bdev_io->internal.split.remaining_num_blocks; 3499 3500 assert(bdev_io->bdev->max_copy != 0); 3501 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3502 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3503 3504 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3505 &offset, &remaining); 3506 if (spdk_likely(rc == 0)) { 3507 num_children_reqs++; 3508 } else { 3509 return; 3510 } 3511 } 3512 } 3513 3514 static void 3515 parent_bdev_io_complete(void *ctx, int rc) 3516 { 3517 struct spdk_bdev_io *parent_io = ctx; 3518 3519 if (rc) { 3520 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3521 } 3522 3523 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3524 parent_io->internal.caller_ctx); 3525 } 3526 3527 static void 3528 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3529 { 3530 struct spdk_bdev_io *bdev_io = ctx; 3531 3532 /* u.bdev.accel_sequence should have already been cleared at this point */ 3533 assert(bdev_io->u.bdev.accel_sequence == NULL); 3534 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3535 bdev_io->internal.f.has_accel_sequence = false; 3536 3537 if (spdk_unlikely(status != 0)) { 3538 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3539 } 3540 3541 parent_bdev_io_complete(bdev_io, status); 3542 } 3543 3544 static void 3545 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3546 { 3547 struct spdk_bdev_io *parent_io = cb_arg; 3548 3549 spdk_bdev_free_io(bdev_io); 3550 3551 assert(parent_io->internal.f.split); 3552 3553 if (!success) { 3554 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3555 /* If any child I/O failed, stop further splitting process. */ 3556 parent_io->internal.split.current_offset_blocks += parent_io->internal.split.remaining_num_blocks; 3557 parent_io->internal.split.remaining_num_blocks = 0; 3558 } 3559 parent_io->internal.split.outstanding--; 3560 if (parent_io->internal.split.outstanding != 0) { 3561 return; 3562 } 3563 3564 /* 3565 * Parent I/O finishes when all blocks are consumed. 3566 */ 3567 if (parent_io->internal.split.remaining_num_blocks == 0) { 3568 assert(parent_io->internal.cb != bdev_io_split_done); 3569 bdev_ch_remove_from_io_submitted(parent_io); 3570 spdk_trace_record(TRACE_BDEV_IO_DONE, parent_io->internal.ch->trace_id, 3571 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx, 3572 parent_io->internal.ch->queue_depth); 3573 3574 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3575 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3576 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3577 return; 3578 } else if (parent_io->internal.f.has_bounce_buf && 3579 !bdev_io_use_accel_sequence(bdev_io)) { 3580 /* bdev IO will be completed in the callback */ 3581 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3582 return; 3583 } 3584 } 3585 3586 parent_bdev_io_complete(parent_io, 0); 3587 return; 3588 } 3589 3590 /* 3591 * Continue with the splitting process. This function will complete the parent I/O if the 3592 * splitting is done. 
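 * (Each *_split() helper below resubmits more children through bdev_io_split_submit();
 * their completions re-enter bdev_io_split_done() until remaining_num_blocks reaches
 * zero and the parent is completed above.)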
3593 */ 3594 switch (parent_io->type) { 3595 case SPDK_BDEV_IO_TYPE_READ: 3596 case SPDK_BDEV_IO_TYPE_WRITE: 3597 _bdev_rw_split(parent_io); 3598 break; 3599 case SPDK_BDEV_IO_TYPE_UNMAP: 3600 bdev_unmap_split(parent_io); 3601 break; 3602 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3603 bdev_write_zeroes_split(parent_io); 3604 break; 3605 case SPDK_BDEV_IO_TYPE_COPY: 3606 bdev_copy_split(parent_io); 3607 break; 3608 default: 3609 assert(false); 3610 break; 3611 } 3612 } 3613 3614 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3615 bool success); 3616 3617 static void 3618 bdev_io_split(struct spdk_bdev_io *bdev_io) 3619 { 3620 assert(bdev_io_should_split(bdev_io)); 3621 assert(bdev_io->internal.f.split); 3622 3623 bdev_io->internal.split.current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3624 bdev_io->internal.split.remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3625 bdev_io->internal.split.outstanding = 0; 3626 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3627 3628 switch (bdev_io->type) { 3629 case SPDK_BDEV_IO_TYPE_READ: 3630 case SPDK_BDEV_IO_TYPE_WRITE: 3631 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3632 _bdev_rw_split(bdev_io); 3633 } else { 3634 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3635 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3636 bdev_io->u.bdev.num_blocks * bdev_io_get_block_size(bdev_io)); 3637 } 3638 break; 3639 case SPDK_BDEV_IO_TYPE_UNMAP: 3640 bdev_unmap_split(bdev_io); 3641 break; 3642 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3643 bdev_write_zeroes_split(bdev_io); 3644 break; 3645 case SPDK_BDEV_IO_TYPE_COPY: 3646 bdev_copy_split(bdev_io); 3647 break; 3648 default: 3649 assert(false); 3650 break; 3651 } 3652 } 3653 3654 static void 3655 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3656 { 3657 if (!success) { 3658 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3659 return; 3660 } 3661 3662 _bdev_rw_split(bdev_io); 3663 } 3664 3665 static inline void 3666 _bdev_io_submit(struct spdk_bdev_io *bdev_io) 3667 { 3668 struct spdk_bdev *bdev = bdev_io->bdev; 3669 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3670 3671 if (spdk_likely(bdev_ch->flags == 0)) { 3672 bdev_io_do_submit(bdev_ch, bdev_io); 3673 return; 3674 } 3675 3676 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3677 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3678 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3679 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3680 bdev_abort_queued_io(&bdev_ch->qos_queued_io, bdev_io->u.abort.bio_to_abort)) { 3681 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3682 } else { 3683 TAILQ_INSERT_TAIL(&bdev_ch->qos_queued_io, bdev_io, internal.link); 3684 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3685 } 3686 } else { 3687 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3688 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3689 } 3690 } 3691 3692 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3693 3694 bool 3695 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3696 { 3697 if (range1->length == 0 || range2->length == 0) { 3698 return false; 3699 } 3700 3701 if (range1->offset + range1->length <= range2->offset) { 3702 return false; 3703 } 3704 3705 if (range2->offset + range2->length <= range1->offset) { 3706 return false; 3707 } 3708 
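/* Neither range ends at or before the start of the other, so the ranges overlap. */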
3709 return true; 3710 } 3711 3712 static bool 3713 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3714 { 3715 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3716 struct lba_range r; 3717 3718 switch (bdev_io->type) { 3719 case SPDK_BDEV_IO_TYPE_NVME_IO: 3720 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3721 /* Don't try to decode the NVMe command - just assume worst-case and that 3722 * it overlaps a locked range. 3723 */ 3724 return true; 3725 case SPDK_BDEV_IO_TYPE_READ: 3726 if (!range->quiesce) { 3727 return false; 3728 } 3729 /* fallthrough */ 3730 case SPDK_BDEV_IO_TYPE_WRITE: 3731 case SPDK_BDEV_IO_TYPE_UNMAP: 3732 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3733 case SPDK_BDEV_IO_TYPE_ZCOPY: 3734 case SPDK_BDEV_IO_TYPE_COPY: 3735 r.offset = bdev_io->u.bdev.offset_blocks; 3736 r.length = bdev_io->u.bdev.num_blocks; 3737 if (!bdev_lba_range_overlapped(range, &r)) { 3738 /* This I/O doesn't overlap the specified LBA range. */ 3739 return false; 3740 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3741 /* This I/O overlaps, but the I/O is on the same channel that locked this 3742 * range, and the caller_ctx is the same as the locked_ctx. This means 3743 * that this I/O is associated with the lock, and is allowed to execute. 3744 */ 3745 return false; 3746 } else { 3747 return true; 3748 } 3749 default: 3750 return false; 3751 } 3752 } 3753 3754 void 3755 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3756 { 3757 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3758 3759 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3760 3761 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3762 struct lba_range *range; 3763 3764 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3765 if (bdev_io_range_is_locked(bdev_io, range)) { 3766 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3767 return; 3768 } 3769 } 3770 } 3771 3772 bdev_ch_add_to_io_submitted(bdev_io); 3773 3774 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3775 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 3776 ch->trace_id, bdev_io->u.bdev.num_blocks, 3777 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3778 bdev_io->u.bdev.offset_blocks, ch->queue_depth); 3779 3780 if (bdev_io->internal.f.split) { 3781 bdev_io_split(bdev_io); 3782 return; 3783 } 3784 3785 _bdev_io_submit(bdev_io); 3786 } 3787 3788 static inline int 3789 bdev_io_init_dif_ctx(struct spdk_bdev_io *bdev_io) 3790 { 3791 struct spdk_bdev *bdev = bdev_io->bdev; 3792 struct spdk_dif_ctx_init_ext_opts dif_opts; 3793 3794 memset(&bdev_io->u.bdev.dif_err, 0, sizeof(struct spdk_dif_error)); 3795 3796 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 3797 dif_opts.dif_pi_format = bdev->dif_pi_format; 3798 3799 return spdk_dif_ctx_init(&bdev_io->u.bdev.dif_ctx, 3800 bdev->blocklen, 3801 bdev->md_len, 3802 bdev->md_interleave, 3803 bdev->dif_is_head_of_md, 3804 bdev->dif_type, 3805 bdev_io->u.bdev.dif_check_flags, 3806 bdev_io->u.bdev.offset_blocks & 0xFFFFFFFF, 3807 0xFFFF, 0, 0, 0, &dif_opts); 3808 } 3809 3810 static void 3811 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3812 bool success) 3813 { 3814 if (!success) { 3815 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 3816 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3817 bdev_io_complete_unsubmitted(bdev_io); 3818 return; 3819 } 3820 3821 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 3822 if 
(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
3823 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb);
3824 return;
3825 }
3826 /* For reads we'll execute the sequence after the data is read, so, for now, only
3827 * clear out accel_sequence pointer and submit the IO */
3828 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
3829 bdev_io->u.bdev.accel_sequence = NULL;
3830 }
3831
3832 bdev_io_submit(bdev_io);
3833 }
3834
3835 static inline void
3836 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io)
3837 {
3838 /* The bdev doesn't support memory domains, so the buffers in this IO request can't
3839 * be accessed directly. We need to allocate buffers before issuing the IO operation.
3840 * For a write operation we need to pull the buffers from the memory domain before submitting the IO.
3841 * Once a read operation completes, we need to use the memory_domain push functionality to
3842 * update the data in the original memory domain IO buffer.
3843 *
3844 * If this I/O request is not aware of metadata, the buffers in this IO request can't be
3845 * accessed directly either. We need to allocate buffers before issuing the IO operation.
3846 * For a write operation we need to insert metadata before submitting the IO. Once a read
3847 * operation completes, we need to strip the metadata in the original IO buffer.
3848 *
3849 * This IO request will go through a regular IO flow, so clear the memory domain pointers. */
3850 assert(bdev_io_use_memory_domain(bdev_io) ||
3851 bdev_io_needs_metadata(bdev_io->internal.desc, bdev_io));
3852
3853 bdev_io->u.bdev.memory_domain = NULL;
3854 bdev_io->u.bdev.memory_domain_ctx = NULL;
3855 _bdev_io_get_bounce_buf(bdev_io, _bdev_memory_domain_get_io_cb,
3856 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
3857 }
3858
3859 static inline void
3860 _bdev_io_ext_use_accel_buffer(struct spdk_bdev_io *bdev_io)
3861 {
3862 assert(bdev_io_use_memory_domain(bdev_io));
3863 assert(bdev_io_needs_metadata(bdev_io->internal.desc, bdev_io));
3864
3865 bdev_io->u.bdev.memory_domain = NULL;
3866 bdev_io->u.bdev.memory_domain_ctx = NULL;
3867 bdev_io_get_accel_buf(bdev_io, _bdev_memory_domain_get_io_cb,
3868 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
3869 }
3870
3871 /* We need to allocate a bounce buffer
3872 * - if the bdev doesn't support memory domains,
3873 * - if it does support them, but we need to execute an accel sequence and the data buffer is
3874 * from the accel memory domain (to avoid doing a push/pull from that domain), or
3875 * - if the IO is not aware of metadata.
3876 */
3877 static inline bool
3878 bdev_io_needs_bounce_buffer(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
3879 {
3880 if (bdev_io_use_memory_domain(bdev_io)) {
3881 if (!desc->memory_domains_supported ||
3882 (bdev_io_needs_sequence_exec(desc, bdev_io) &&
3883 (bdev_io->internal.memory_domain == spdk_accel_get_memory_domain() ||
3884 bdev_io_needs_metadata(desc, bdev_io)))) {
3885 return true;
3886 }
3887
3888 return false;
3889 }
3890
3891 if (bdev_io_needs_metadata(desc, bdev_io)) {
3892 return true;
3893 }
3894
3895 return false;
3896 }
3897
3898 /* We need to allocate a fake accel buffer if the bdev supports memory domains but the IO is not
3899 * aware of metadata.
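 * In that case bdev_io_get_accel_buf() supplies the data buffer and the memory domain
 * pointers are cleared, so the request follows the regular IO flow (see
 * _bdev_io_ext_use_accel_buffer() above).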
3900 */ 3901 static inline bool 3902 bdev_io_needs_accel_buffer(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3903 { 3904 if (bdev_io_needs_metadata(desc, bdev_io)) { 3905 assert(bdev_io_use_memory_domain(bdev_io)); 3906 return true; 3907 } 3908 3909 return false; 3910 } 3911 3912 static inline void 3913 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3914 { 3915 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3916 int rc; 3917 3918 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3919 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3920 bdev_io_complete_unsubmitted(bdev_io); 3921 return; 3922 } 3923 3924 if (bdev_io_needs_metadata(desc, bdev_io)) { 3925 rc = bdev_io_init_dif_ctx(bdev_io); 3926 if (spdk_unlikely(rc != 0)) { 3927 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3928 bdev_io_complete_unsubmitted(bdev_io); 3929 return; 3930 } 3931 } 3932 3933 if (bdev_io_needs_bounce_buffer(desc, bdev_io)) { 3934 _bdev_io_ext_use_bounce_buffer(bdev_io); 3935 return; 3936 } 3937 3938 if (bdev_io_needs_accel_buffer(desc, bdev_io)) { 3939 _bdev_io_ext_use_accel_buffer(bdev_io); 3940 return; 3941 } 3942 3943 if (bdev_io_needs_sequence_exec(desc, bdev_io)) { 3944 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3945 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3946 return; 3947 } 3948 /* For reads we'll execute the sequence after the data is read, so, for now, only 3949 * clear out accel_sequence pointer and submit the IO */ 3950 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3951 bdev_io->u.bdev.accel_sequence = NULL; 3952 } 3953 3954 bdev_io_submit(bdev_io); 3955 } 3956 3957 static void 3958 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3959 { 3960 struct spdk_bdev *bdev = bdev_io->bdev; 3961 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3962 struct spdk_io_channel *ch = bdev_ch->channel; 3963 3964 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3965 3966 bdev_io->internal.f.in_submit_request = true; 3967 bdev_submit_request(bdev, ch, bdev_io); 3968 bdev_io->internal.f.in_submit_request = false; 3969 } 3970 3971 void 3972 bdev_io_init(struct spdk_bdev_io *bdev_io, 3973 struct spdk_bdev *bdev, void *cb_arg, 3974 spdk_bdev_io_completion_cb cb) 3975 { 3976 bdev_io->bdev = bdev; 3977 bdev_io->internal.f.raw = 0; 3978 bdev_io->internal.caller_ctx = cb_arg; 3979 bdev_io->internal.cb = cb; 3980 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3981 bdev_io->internal.f.in_submit_request = false; 3982 bdev_io->internal.error.nvme.cdw0 = 0; 3983 bdev_io->num_retries = 0; 3984 bdev_io->internal.get_buf_cb = NULL; 3985 bdev_io->internal.get_aux_buf_cb = NULL; 3986 bdev_io->internal.data_transfer_cpl = NULL; 3987 bdev_io->internal.f.split = bdev_io_should_split(bdev_io); 3988 } 3989 3990 static bool 3991 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3992 { 3993 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3994 } 3995 3996 bool 3997 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3998 { 3999 bool supported; 4000 4001 supported = bdev_io_type_supported(bdev, io_type); 4002 4003 if (!supported) { 4004 switch (io_type) { 4005 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 4006 /* The bdev layer will emulate write zeroes as long as write is supported. 
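 * Callers can therefore rely on this function's answer rather than checking for
 * WRITE support themselves before issuing a write_zeroes request.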
*/
4007 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE);
4008 break;
4009 default:
4010 break;
4011 }
4012 }
4013
4014 return supported;
4015 }
4016
4017 static const char *g_io_type_strings[] = {
4018 [SPDK_BDEV_IO_TYPE_READ] = "read",
4019 [SPDK_BDEV_IO_TYPE_WRITE] = "write",
4020 [SPDK_BDEV_IO_TYPE_UNMAP] = "unmap",
4021 [SPDK_BDEV_IO_TYPE_FLUSH] = "flush",
4022 [SPDK_BDEV_IO_TYPE_RESET] = "reset",
4023 [SPDK_BDEV_IO_TYPE_NVME_ADMIN] = "nvme_admin",
4024 [SPDK_BDEV_IO_TYPE_NVME_IO] = "nvme_io",
4025 [SPDK_BDEV_IO_TYPE_NVME_IO_MD] = "nvme_io_md",
4026 [SPDK_BDEV_IO_TYPE_WRITE_ZEROES] = "write_zeroes",
4027 [SPDK_BDEV_IO_TYPE_ZCOPY] = "zcopy",
4028 [SPDK_BDEV_IO_TYPE_GET_ZONE_INFO] = "get_zone_info",
4029 [SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT] = "zone_management",
4030 [SPDK_BDEV_IO_TYPE_ZONE_APPEND] = "zone_append",
4031 [SPDK_BDEV_IO_TYPE_COMPARE] = "compare",
4032 [SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE] = "compare_and_write",
4033 [SPDK_BDEV_IO_TYPE_ABORT] = "abort",
4034 [SPDK_BDEV_IO_TYPE_SEEK_HOLE] = "seek_hole",
4035 [SPDK_BDEV_IO_TYPE_SEEK_DATA] = "seek_data",
4036 [SPDK_BDEV_IO_TYPE_COPY] = "copy",
4037 [SPDK_BDEV_IO_TYPE_NVME_IOV_MD] = "nvme_iov_md",
4038 };
4039
4040 const char *
4041 spdk_bdev_get_io_type_name(enum spdk_bdev_io_type io_type)
4042 {
4043 if (io_type <= SPDK_BDEV_IO_TYPE_INVALID || io_type >= SPDK_BDEV_NUM_IO_TYPES) {
4044 return NULL;
4045 }
4046
4047 return g_io_type_strings[io_type];
4048 }
4049
4050 int
4051 spdk_bdev_get_io_type(const char *io_type_string)
4052 {
4053 int i;
4054
4055 for (i = SPDK_BDEV_IO_TYPE_READ; i < SPDK_BDEV_NUM_IO_TYPES; ++i) {
4056 if (!strcmp(io_type_string, g_io_type_strings[i])) {
4057 return i;
4058 }
4059 }
4060
4061 return -1;
4062 }
4063
4064 uint64_t
4065 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io)
4066 {
4067 return bdev_io->internal.submit_tsc;
4068 }
4069
4070 bool
4071 spdk_bdev_io_hide_metadata(struct spdk_bdev_io *bdev_io)
4072 {
4073 return bdev_io->internal.desc->opts.hide_metadata;
4074 }
4075
4076 int
4077 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
4078 {
4079 if (bdev->fn_table->dump_info_json) {
4080 return bdev->fn_table->dump_info_json(bdev->ctxt, w);
4081 }
4082
4083 return 0;
4084 }
4085
4086 static void
4087 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos)
4088 {
4089 uint32_t max_per_timeslice = 0;
4090 int i;
4091
4092 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4093 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
4094 qos->rate_limits[i].max_per_timeslice = 0;
4095 continue;
4096 }
4097
4098 max_per_timeslice = qos->rate_limits[i].limit *
4099 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC;
4100
4101 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice,
4102 qos->rate_limits[i].min_per_timeslice);
4103
4104 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice,
4105 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELEASE);
4106 }
4107
4108 bdev_qos_set_ops(qos);
4109 }
4110
4111 static void
4112 bdev_channel_submit_qos_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
4113 struct spdk_io_channel *io_ch, void *ctx)
4114 {
4115 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
4116 int status;
4117
4118 bdev_qos_io_submit(bdev_ch, bdev->internal.qos);
4119
4120 /* If all IOs were sent, continue the iteration; otherwise stop it. */
4121 /* TODO: round-robin across channels */
4122 status = TAILQ_EMPTY(&bdev_ch->qos_queued_io) ?
0 : 1;
4123
4124 spdk_bdev_for_each_channel_continue(i, status);
4125 }
4126
4127
4128 static void
4129 bdev_channel_submit_qos_io_done(struct spdk_bdev *bdev, void *ctx, int status)
4130 {
4131
4132 }
4133
4134 static int
4135 bdev_channel_poll_qos(void *arg)
4136 {
4137 struct spdk_bdev *bdev = arg;
4138 struct spdk_bdev_qos *qos = bdev->internal.qos;
4139 uint64_t now = spdk_get_ticks();
4140 int i;
4141 int64_t remaining_last_timeslice;
4142
4143 if (spdk_unlikely(qos->thread == NULL)) {
4144 /* The old QoS was unbound so that it can be removed, and the new QoS is not enabled yet. */
4145 return SPDK_POLLER_IDLE;
4146 }
4147
4148 if (now < (qos->last_timeslice + qos->timeslice_size)) {
4149 /* We received our callback earlier than expected - return
4150 * immediately and wait to do accounting until at least one
4151 * timeslice has actually expired. This should never happen
4152 * with a well-behaved timer implementation.
4153 */
4154 return SPDK_POLLER_IDLE;
4155 }
4156
4157 /* Reset for the next round of rate limiting */
4158 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4159 /* We may have allowed the IOs or bytes to slightly overrun in the last
4160 * timeslice. remaining_this_timeslice is signed, so if it's negative
4161 * here, we'll account for the overrun so that the next timeslice will
4162 * be appropriately reduced.
4163 */
4164 remaining_last_timeslice = __atomic_exchange_n(&qos->rate_limits[i].remaining_this_timeslice,
4165 0, __ATOMIC_RELAXED);
4166 if (remaining_last_timeslice < 0) {
4167 /* There is a potential race here: both bdev_qos_rw_queue_io() and bdev_channel_poll_qos()
4168 * may use two atomic ops each, so their updates can interleave.
4169 * This can make the limits a little fuzzy, but it won't cause any real damage.
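 * Writing the negative remainder back here preserves the overrun, so it is charged
 * against the quota that the loop below adds for the new timeslice.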
4170 */ 4171 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 4172 remaining_last_timeslice, __ATOMIC_RELAXED); 4173 } 4174 } 4175 4176 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 4177 qos->last_timeslice += qos->timeslice_size; 4178 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4179 __atomic_add_fetch(&qos->rate_limits[i].remaining_this_timeslice, 4180 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELAXED); 4181 } 4182 } 4183 4184 spdk_bdev_for_each_channel(bdev, bdev_channel_submit_qos_io, qos, 4185 bdev_channel_submit_qos_io_done); 4186 4187 return SPDK_POLLER_BUSY; 4188 } 4189 4190 static void 4191 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 4192 { 4193 struct spdk_bdev_shared_resource *shared_resource; 4194 struct lba_range *range; 4195 4196 bdev_free_io_stat(ch->stat); 4197 #ifdef SPDK_CONFIG_VTUNE 4198 bdev_free_io_stat(ch->prev_stat); 4199 #endif 4200 4201 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 4202 range = TAILQ_FIRST(&ch->locked_ranges); 4203 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 4204 free(range); 4205 } 4206 4207 spdk_put_io_channel(ch->channel); 4208 spdk_put_io_channel(ch->accel_channel); 4209 4210 shared_resource = ch->shared_resource; 4211 4212 assert(TAILQ_EMPTY(&ch->io_locked)); 4213 assert(TAILQ_EMPTY(&ch->io_submitted)); 4214 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 4215 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 4216 assert(ch->io_outstanding == 0); 4217 assert(shared_resource->ref > 0); 4218 shared_resource->ref--; 4219 if (shared_resource->ref == 0) { 4220 assert(shared_resource->io_outstanding == 0); 4221 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 4222 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 4223 spdk_poller_unregister(&shared_resource->nomem_poller); 4224 free(shared_resource); 4225 } 4226 } 4227 4228 static void 4229 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 4230 { 4231 struct spdk_bdev_qos *qos = bdev->internal.qos; 4232 int i; 4233 4234 assert(spdk_spin_held(&bdev->internal.spinlock)); 4235 4236 /* Rate limiting on this bdev enabled */ 4237 if (qos) { 4238 if (qos->ch == NULL) { 4239 struct spdk_io_channel *io_ch; 4240 4241 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 4242 bdev->name, spdk_get_thread()); 4243 4244 /* No qos channel has been selected, so set one up */ 4245 4246 /* Take another reference to ch */ 4247 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 4248 assert(io_ch != NULL); 4249 qos->ch = ch; 4250 4251 qos->thread = spdk_io_channel_get_thread(io_ch); 4252 4253 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4254 if (bdev_qos_is_iops_rate_limit(i) == true) { 4255 qos->rate_limits[i].min_per_timeslice = 4256 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 4257 } else { 4258 qos->rate_limits[i].min_per_timeslice = 4259 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 4260 } 4261 4262 if (qos->rate_limits[i].limit == 0) { 4263 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 4264 } 4265 } 4266 bdev_qos_update_max_quota_per_timeslice(qos); 4267 qos->timeslice_size = 4268 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 4269 qos->last_timeslice = spdk_get_ticks(); 4270 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 4271 bdev, 4272 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 4273 } 4274 4275 ch->flags |= BDEV_CH_QOS_ENABLED; 4276 } 4277 } 4278 4279 struct poll_timeout_ctx { 4280 struct spdk_bdev_desc 
*desc; 4281 uint64_t timeout_in_sec; 4282 spdk_bdev_io_timeout_cb cb_fn; 4283 void *cb_arg; 4284 }; 4285 4286 static void 4287 bdev_desc_free(struct spdk_bdev_desc *desc) 4288 { 4289 spdk_spin_destroy(&desc->spinlock); 4290 free(desc->media_events_buffer); 4291 free(desc); 4292 } 4293 4294 static void 4295 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 4296 { 4297 struct poll_timeout_ctx *ctx = _ctx; 4298 struct spdk_bdev_desc *desc = ctx->desc; 4299 4300 free(ctx); 4301 4302 spdk_spin_lock(&desc->spinlock); 4303 desc->refs--; 4304 if (desc->closed == true && desc->refs == 0) { 4305 spdk_spin_unlock(&desc->spinlock); 4306 bdev_desc_free(desc); 4307 return; 4308 } 4309 spdk_spin_unlock(&desc->spinlock); 4310 } 4311 4312 static void 4313 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4314 struct spdk_io_channel *io_ch, void *_ctx) 4315 { 4316 struct poll_timeout_ctx *ctx = _ctx; 4317 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4318 struct spdk_bdev_desc *desc = ctx->desc; 4319 struct spdk_bdev_io *bdev_io; 4320 uint64_t now; 4321 4322 spdk_spin_lock(&desc->spinlock); 4323 if (desc->closed == true) { 4324 spdk_spin_unlock(&desc->spinlock); 4325 spdk_bdev_for_each_channel_continue(i, -1); 4326 return; 4327 } 4328 spdk_spin_unlock(&desc->spinlock); 4329 4330 now = spdk_get_ticks(); 4331 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 4332 /* Exclude any I/O that are generated via splitting. */ 4333 if (bdev_io->internal.cb == bdev_io_split_done) { 4334 continue; 4335 } 4336 4337 /* Once we find an I/O that has not timed out, we can immediately 4338 * exit the loop. 4339 */ 4340 if (now < (bdev_io->internal.submit_tsc + 4341 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 4342 goto end; 4343 } 4344 4345 if (bdev_io->internal.desc == desc) { 4346 ctx->cb_fn(ctx->cb_arg, bdev_io); 4347 } 4348 } 4349 4350 end: 4351 spdk_bdev_for_each_channel_continue(i, 0); 4352 } 4353 4354 static int 4355 bdev_poll_timeout_io(void *arg) 4356 { 4357 struct spdk_bdev_desc *desc = arg; 4358 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4359 struct poll_timeout_ctx *ctx; 4360 4361 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 4362 if (!ctx) { 4363 SPDK_ERRLOG("failed to allocate memory\n"); 4364 return SPDK_POLLER_BUSY; 4365 } 4366 ctx->desc = desc; 4367 ctx->cb_arg = desc->cb_arg; 4368 ctx->cb_fn = desc->cb_fn; 4369 ctx->timeout_in_sec = desc->timeout_in_sec; 4370 4371 /* Take a ref on the descriptor in case it gets closed while we are checking 4372 * all of the channels. 
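 * The ref is released in bdev_channel_poll_timeout_io_done(), which also frees the
 * descriptor if it was closed while the per-channel iteration was in flight.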
4373 */ 4374 spdk_spin_lock(&desc->spinlock); 4375 desc->refs++; 4376 spdk_spin_unlock(&desc->spinlock); 4377 4378 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 4379 bdev_channel_poll_timeout_io_done); 4380 4381 return SPDK_POLLER_BUSY; 4382 } 4383 4384 int 4385 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 4386 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 4387 { 4388 assert(desc->thread == spdk_get_thread()); 4389 4390 spdk_poller_unregister(&desc->io_timeout_poller); 4391 4392 if (timeout_in_sec) { 4393 assert(cb_fn != NULL); 4394 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 4395 desc, 4396 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 4397 1000); 4398 if (desc->io_timeout_poller == NULL) { 4399 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 4400 return -1; 4401 } 4402 } 4403 4404 desc->cb_fn = cb_fn; 4405 desc->cb_arg = cb_arg; 4406 desc->timeout_in_sec = timeout_in_sec; 4407 4408 return 0; 4409 } 4410 4411 static int 4412 bdev_channel_create(void *io_device, void *ctx_buf) 4413 { 4414 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 4415 struct spdk_bdev_channel *ch = ctx_buf; 4416 struct spdk_io_channel *mgmt_io_ch; 4417 struct spdk_bdev_mgmt_channel *mgmt_ch; 4418 struct spdk_bdev_shared_resource *shared_resource; 4419 struct lba_range *range; 4420 4421 ch->bdev = bdev; 4422 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 4423 if (!ch->channel) { 4424 return -1; 4425 } 4426 4427 ch->accel_channel = spdk_accel_get_io_channel(); 4428 if (!ch->accel_channel) { 4429 spdk_put_io_channel(ch->channel); 4430 return -1; 4431 } 4432 4433 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, bdev->internal.trace_id, 0, 0, 4434 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4435 4436 assert(ch->histogram == NULL); 4437 if (bdev->internal.histogram_enabled) { 4438 ch->histogram = spdk_histogram_data_alloc(); 4439 if (ch->histogram == NULL) { 4440 SPDK_ERRLOG("Could not allocate histogram\n"); 4441 } 4442 } 4443 4444 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 4445 if (!mgmt_io_ch) { 4446 spdk_put_io_channel(ch->channel); 4447 spdk_put_io_channel(ch->accel_channel); 4448 return -1; 4449 } 4450 4451 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 4452 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 4453 if (shared_resource->shared_ch == ch->channel) { 4454 spdk_put_io_channel(mgmt_io_ch); 4455 shared_resource->ref++; 4456 break; 4457 } 4458 } 4459 4460 if (shared_resource == NULL) { 4461 shared_resource = calloc(1, sizeof(*shared_resource)); 4462 if (shared_resource == NULL) { 4463 spdk_put_io_channel(ch->channel); 4464 spdk_put_io_channel(ch->accel_channel); 4465 spdk_put_io_channel(mgmt_io_ch); 4466 return -1; 4467 } 4468 4469 shared_resource->mgmt_ch = mgmt_ch; 4470 shared_resource->io_outstanding = 0; 4471 TAILQ_INIT(&shared_resource->nomem_io); 4472 shared_resource->nomem_threshold = 0; 4473 shared_resource->shared_ch = ch->channel; 4474 shared_resource->ref = 1; 4475 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 4476 } 4477 4478 ch->io_outstanding = 0; 4479 TAILQ_INIT(&ch->locked_ranges); 4480 TAILQ_INIT(&ch->qos_queued_io); 4481 ch->flags = 0; 4482 ch->trace_id = bdev->internal.trace_id; 4483 ch->shared_resource = shared_resource; 4484 4485 TAILQ_INIT(&ch->io_submitted); 4486 TAILQ_INIT(&ch->io_locked); 4487 TAILQ_INIT(&ch->io_accel_exec); 4488 TAILQ_INIT(&ch->io_memory_domain); 4489 4490 ch->stat = bdev_alloc_io_stat(false); 
4491 if (ch->stat == NULL) { 4492 bdev_channel_destroy_resource(ch); 4493 return -1; 4494 } 4495 4496 ch->stat->ticks_rate = spdk_get_ticks_hz(); 4497 4498 #ifdef SPDK_CONFIG_VTUNE 4499 { 4500 char *name; 4501 __itt_init_ittlib(NULL, 0); 4502 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 4503 if (!name) { 4504 bdev_channel_destroy_resource(ch); 4505 return -1; 4506 } 4507 ch->handle = __itt_string_handle_create(name); 4508 free(name); 4509 ch->start_tsc = spdk_get_ticks(); 4510 ch->interval_tsc = spdk_get_ticks_hz() / 100; 4511 ch->prev_stat = bdev_alloc_io_stat(false); 4512 if (ch->prev_stat == NULL) { 4513 bdev_channel_destroy_resource(ch); 4514 return -1; 4515 } 4516 } 4517 #endif 4518 4519 spdk_spin_lock(&bdev->internal.spinlock); 4520 bdev_enable_qos(bdev, ch); 4521 4522 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 4523 struct lba_range *new_range; 4524 4525 new_range = calloc(1, sizeof(*new_range)); 4526 if (new_range == NULL) { 4527 spdk_spin_unlock(&bdev->internal.spinlock); 4528 bdev_channel_destroy_resource(ch); 4529 return -1; 4530 } 4531 new_range->length = range->length; 4532 new_range->offset = range->offset; 4533 new_range->locked_ctx = range->locked_ctx; 4534 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 4535 } 4536 4537 spdk_spin_unlock(&bdev->internal.spinlock); 4538 4539 return 0; 4540 } 4541 4542 static int 4543 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 4544 void *cb_ctx) 4545 { 4546 struct spdk_bdev_channel *bdev_ch = cb_ctx; 4547 struct spdk_bdev_io *bdev_io; 4548 uint64_t buf_len; 4549 4550 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4551 if (bdev_io->internal.ch == bdev_ch) { 4552 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len); 4553 spdk_iobuf_entry_abort(ch, entry, buf_len); 4554 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4555 } 4556 4557 return 0; 4558 } 4559 4560 /* 4561 * Abort I/O that are waiting on a data buffer. 4562 */ 4563 static void 4564 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4565 { 4566 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, bdev_abort_all_buf_io_cb, ch); 4567 } 4568 4569 /* 4570 * Abort I/O that are queued waiting for submission. These types of I/O are 4571 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4572 */ 4573 static void 4574 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4575 { 4576 struct spdk_bdev_io *bdev_io, *tmp; 4577 4578 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4579 if (bdev_io->internal.ch == ch) { 4580 TAILQ_REMOVE(queue, bdev_io, internal.link); 4581 /* 4582 * spdk_bdev_io_complete() assumes that the completed I/O had 4583 * been submitted to the bdev module. Since in this case it 4584 * hadn't, bump io_outstanding to account for the decrement 4585 * that spdk_bdev_io_complete() will do. 
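 * Reset I/Os are excluded because their submission path (bdev_io_submit_reset()) does
 * not increment io_outstanding in the first place.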
4586 */ 4587 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4588 bdev_io_increment_outstanding(ch, ch->shared_resource); 4589 } 4590 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4591 } 4592 } 4593 } 4594 4595 static bool 4596 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4597 { 4598 struct spdk_bdev_io *bdev_io; 4599 4600 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4601 if (bdev_io == bio_to_abort) { 4602 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4603 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4604 return true; 4605 } 4606 } 4607 4608 return false; 4609 } 4610 4611 static int 4612 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4613 { 4614 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4615 uint64_t buf_len; 4616 4617 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4618 if (bdev_io == bio_to_abort) { 4619 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len); 4620 spdk_iobuf_entry_abort(ch, entry, buf_len); 4621 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4622 return 1; 4623 } 4624 4625 return 0; 4626 } 4627 4628 static bool 4629 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4630 { 4631 int rc; 4632 4633 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, bdev_abort_buf_io_cb, bio_to_abort); 4634 return rc == 1; 4635 } 4636 4637 static void 4638 bdev_qos_channel_destroy(void *cb_arg) 4639 { 4640 struct spdk_bdev_qos *qos = cb_arg; 4641 4642 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4643 spdk_poller_unregister(&qos->poller); 4644 4645 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4646 4647 free(qos); 4648 } 4649 4650 static int 4651 bdev_qos_destroy(struct spdk_bdev *bdev) 4652 { 4653 int i; 4654 4655 /* 4656 * Cleanly shutting down the QoS poller is tricky, because 4657 * during the asynchronous operation the user could open 4658 * a new descriptor and create a new channel, spawning 4659 * a new QoS poller. 4660 * 4661 * The strategy is to create a new QoS structure here and swap it 4662 * in. The shutdown path then continues to refer to the old one 4663 * until it completes and then releases it. 4664 */ 4665 struct spdk_bdev_qos *new_qos, *old_qos; 4666 4667 old_qos = bdev->internal.qos; 4668 4669 new_qos = calloc(1, sizeof(*new_qos)); 4670 if (!new_qos) { 4671 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4672 return -ENOMEM; 4673 } 4674 4675 /* Copy the old QoS data into the newly allocated structure */ 4676 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4677 4678 /* Zero out the key parts of the QoS structure */ 4679 new_qos->ch = NULL; 4680 new_qos->thread = NULL; 4681 new_qos->poller = NULL; 4682 /* 4683 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4684 * It will be used later for the new QoS structure. 4685 */ 4686 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4687 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4688 new_qos->rate_limits[i].min_per_timeslice = 0; 4689 new_qos->rate_limits[i].max_per_timeslice = 0; 4690 } 4691 4692 bdev->internal.qos = new_qos; 4693 4694 if (old_qos->thread == NULL) { 4695 free(old_qos); 4696 } else { 4697 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4698 } 4699 4700 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4701 * been destroyed yet. 
The destruction path will end up waiting for the final 4702 * channel to be put before it releases resources. */ 4703 4704 return 0; 4705 } 4706 4707 void 4708 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4709 { 4710 total->bytes_read += add->bytes_read; 4711 total->num_read_ops += add->num_read_ops; 4712 total->bytes_written += add->bytes_written; 4713 total->num_write_ops += add->num_write_ops; 4714 total->bytes_unmapped += add->bytes_unmapped; 4715 total->num_unmap_ops += add->num_unmap_ops; 4716 total->bytes_copied += add->bytes_copied; 4717 total->num_copy_ops += add->num_copy_ops; 4718 total->read_latency_ticks += add->read_latency_ticks; 4719 total->write_latency_ticks += add->write_latency_ticks; 4720 total->unmap_latency_ticks += add->unmap_latency_ticks; 4721 total->copy_latency_ticks += add->copy_latency_ticks; 4722 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4723 total->max_read_latency_ticks = add->max_read_latency_ticks; 4724 } 4725 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4726 total->min_read_latency_ticks = add->min_read_latency_ticks; 4727 } 4728 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4729 total->max_write_latency_ticks = add->max_write_latency_ticks; 4730 } 4731 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4732 total->min_write_latency_ticks = add->min_write_latency_ticks; 4733 } 4734 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4735 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4736 } 4737 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4738 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4739 } 4740 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4741 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4742 } 4743 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4744 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4745 } 4746 } 4747 4748 static void 4749 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4750 { 4751 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4752 4753 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4754 memcpy(to_stat->io_error, from_stat->io_error, 4755 sizeof(struct spdk_bdev_io_error_stat)); 4756 } 4757 } 4758 4759 void 4760 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4761 { 4762 if (mode == SPDK_BDEV_RESET_STAT_NONE) { 4763 return; 4764 } 4765 4766 stat->max_read_latency_ticks = 0; 4767 stat->min_read_latency_ticks = UINT64_MAX; 4768 stat->max_write_latency_ticks = 0; 4769 stat->min_write_latency_ticks = UINT64_MAX; 4770 stat->max_unmap_latency_ticks = 0; 4771 stat->min_unmap_latency_ticks = UINT64_MAX; 4772 stat->max_copy_latency_ticks = 0; 4773 stat->min_copy_latency_ticks = UINT64_MAX; 4774 4775 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4776 return; 4777 } 4778 4779 stat->bytes_read = 0; 4780 stat->num_read_ops = 0; 4781 stat->bytes_written = 0; 4782 stat->num_write_ops = 0; 4783 stat->bytes_unmapped = 0; 4784 stat->num_unmap_ops = 0; 4785 stat->bytes_copied = 0; 4786 stat->num_copy_ops = 0; 4787 stat->read_latency_ticks = 0; 4788 stat->write_latency_ticks = 0; 4789 stat->unmap_latency_ticks = 0; 4790 stat->copy_latency_ticks = 0; 4791 4792 if (stat->io_error != NULL) { 4793 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 
4794 } 4795 } 4796 4797 struct spdk_bdev_io_stat * 4798 bdev_alloc_io_stat(bool io_error_stat) 4799 { 4800 struct spdk_bdev_io_stat *stat; 4801 4802 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4803 if (stat == NULL) { 4804 return NULL; 4805 } 4806 4807 if (io_error_stat) { 4808 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4809 if (stat->io_error == NULL) { 4810 free(stat); 4811 return NULL; 4812 } 4813 } else { 4814 stat->io_error = NULL; 4815 } 4816 4817 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4818 4819 return stat; 4820 } 4821 4822 void 4823 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4824 { 4825 if (stat != NULL) { 4826 free(stat->io_error); 4827 free(stat); 4828 } 4829 } 4830 4831 void 4832 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4833 { 4834 int i; 4835 4836 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4837 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4838 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4839 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4840 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4841 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4842 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4843 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4844 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4845 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4846 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4847 stat->min_read_latency_ticks != UINT64_MAX ? 4848 stat->min_read_latency_ticks : 0); 4849 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4850 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4851 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4852 stat->min_write_latency_ticks != UINT64_MAX ? 4853 stat->min_write_latency_ticks : 0); 4854 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4855 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4856 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4857 stat->min_unmap_latency_ticks != UINT64_MAX ? 4858 stat->min_unmap_latency_ticks : 0); 4859 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4860 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4861 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4862 stat->min_copy_latency_ticks != UINT64_MAX ? 
4863 stat->min_copy_latency_ticks : 0); 4864 4865 if (stat->io_error != NULL) { 4866 spdk_json_write_named_object_begin(w, "io_error"); 4867 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4868 if (stat->io_error->error_status[i] != 0) { 4869 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4870 stat->io_error->error_status[i]); 4871 } 4872 } 4873 spdk_json_write_object_end(w); 4874 } 4875 } 4876 4877 static void 4878 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4879 { 4880 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4881 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4882 4883 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4884 bdev_abort_all_buf_io(mgmt_ch, ch); 4885 } 4886 4887 static void 4888 bdev_channel_destroy(void *io_device, void *ctx_buf) 4889 { 4890 struct spdk_bdev_channel *ch = ctx_buf; 4891 4892 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4893 spdk_get_thread()); 4894 4895 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, ch->bdev->internal.trace_id, 0, 0, 4896 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4897 4898 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4899 spdk_spin_lock(&ch->bdev->internal.spinlock); 4900 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4901 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4902 4903 bdev_channel_abort_queued_ios(ch); 4904 4905 if (ch->histogram) { 4906 spdk_histogram_data_free(ch->histogram); 4907 } 4908 4909 bdev_channel_destroy_resource(ch); 4910 } 4911 4912 /* 4913 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4914 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
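 * The lookup and the insert happen atomically under g_bdev_mgr.spinlock, so two
 * concurrent registrations of the same name cannot both succeed.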
4915 */ 4916 static int 4917 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4918 { 4919 struct spdk_bdev_name *tmp; 4920 4921 bdev_name->name = strdup(name); 4922 if (bdev_name->name == NULL) { 4923 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4924 return -ENOMEM; 4925 } 4926 4927 bdev_name->bdev = bdev; 4928 4929 spdk_spin_lock(&g_bdev_mgr.spinlock); 4930 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4931 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4932 4933 if (tmp != NULL) { 4934 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4935 free(bdev_name->name); 4936 return -EEXIST; 4937 } 4938 4939 return 0; 4940 } 4941 4942 static void 4943 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4944 { 4945 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4946 free(bdev_name->name); 4947 } 4948 4949 static void 4950 bdev_name_del(struct spdk_bdev_name *bdev_name) 4951 { 4952 spdk_spin_lock(&g_bdev_mgr.spinlock); 4953 bdev_name_del_unsafe(bdev_name); 4954 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4955 } 4956 4957 int 4958 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4959 { 4960 struct spdk_bdev_alias *tmp; 4961 int ret; 4962 4963 if (alias == NULL) { 4964 SPDK_ERRLOG("Empty alias passed\n"); 4965 return -EINVAL; 4966 } 4967 4968 tmp = calloc(1, sizeof(*tmp)); 4969 if (tmp == NULL) { 4970 SPDK_ERRLOG("Unable to allocate alias\n"); 4971 return -ENOMEM; 4972 } 4973 4974 ret = bdev_name_add(&tmp->alias, bdev, alias); 4975 if (ret != 0) { 4976 free(tmp); 4977 return ret; 4978 } 4979 4980 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4981 4982 return 0; 4983 } 4984 4985 static int 4986 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4987 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4988 { 4989 struct spdk_bdev_alias *tmp; 4990 4991 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4992 if (strcmp(alias, tmp->alias.name) == 0) { 4993 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4994 alias_del_fn(&tmp->alias); 4995 free(tmp); 4996 return 0; 4997 } 4998 } 4999 5000 return -ENOENT; 5001 } 5002 5003 int 5004 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 5005 { 5006 int rc; 5007 5008 rc = bdev_alias_del(bdev, alias, bdev_name_del); 5009 if (rc == -ENOENT) { 5010 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 5011 } 5012 5013 return rc; 5014 } 5015 5016 void 5017 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 5018 { 5019 struct spdk_bdev_alias *p, *tmp; 5020 5021 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 5022 TAILQ_REMOVE(&bdev->aliases, p, tailq); 5023 bdev_name_del(&p->alias); 5024 free(p); 5025 } 5026 } 5027 5028 struct spdk_io_channel * 5029 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 5030 { 5031 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 5032 } 5033 5034 void * 5035 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 5036 { 5037 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5038 void *ctx = NULL; 5039 5040 if (bdev->fn_table->get_module_ctx) { 5041 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 5042 } 5043 5044 return ctx; 5045 } 5046 5047 const char * 5048 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 5049 { 5050 return bdev->module->name; 5051 } 5052 5053 const char * 5054 spdk_bdev_get_name(const struct spdk_bdev *bdev) 5055 { 5056 return bdev->name; 5057 } 5058 5059 const char * 5060 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 5061 { 5062 return bdev->product_name; 5063 } 5064 5065 
const struct spdk_bdev_aliases_list * 5066 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 5067 { 5068 return &bdev->aliases; 5069 } 5070 5071 uint32_t 5072 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 5073 { 5074 return bdev->blocklen; 5075 } 5076 5077 uint32_t 5078 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 5079 { 5080 return bdev->write_unit_size; 5081 } 5082 5083 uint64_t 5084 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 5085 { 5086 return bdev->blockcnt; 5087 } 5088 5089 const char * 5090 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 5091 { 5092 return qos_rpc_type[type]; 5093 } 5094 5095 void 5096 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 5097 { 5098 int i; 5099 5100 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 5101 5102 spdk_spin_lock(&bdev->internal.spinlock); 5103 if (bdev->internal.qos) { 5104 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 5105 if (bdev->internal.qos->rate_limits[i].limit != 5106 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 5107 limits[i] = bdev->internal.qos->rate_limits[i].limit; 5108 if (bdev_qos_is_iops_rate_limit(i) == false) { 5109 /* Change from Byte to Megabyte which is user visible. */ 5110 limits[i] = limits[i] / 1024 / 1024; 5111 } 5112 } 5113 } 5114 } 5115 spdk_spin_unlock(&bdev->internal.spinlock); 5116 } 5117 5118 size_t 5119 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 5120 { 5121 return 1 << bdev->required_alignment; 5122 } 5123 5124 uint32_t 5125 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 5126 { 5127 return bdev->optimal_io_boundary; 5128 } 5129 5130 bool 5131 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 5132 { 5133 return bdev->write_cache; 5134 } 5135 5136 const struct spdk_uuid * 5137 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 5138 { 5139 return &bdev->uuid; 5140 } 5141 5142 uint16_t 5143 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 5144 { 5145 return bdev->acwu; 5146 } 5147 5148 uint32_t 5149 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 5150 { 5151 return bdev->md_len; 5152 } 5153 5154 bool 5155 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 5156 { 5157 return (bdev->md_len != 0) && bdev->md_interleave; 5158 } 5159 5160 bool 5161 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 5162 { 5163 return (bdev->md_len != 0) && !bdev->md_interleave; 5164 } 5165 5166 bool 5167 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 5168 { 5169 return bdev->zoned; 5170 } 5171 5172 uint32_t 5173 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 5174 { 5175 if (spdk_bdev_is_md_interleaved(bdev)) { 5176 return bdev->blocklen - bdev->md_len; 5177 } else { 5178 return bdev->blocklen; 5179 } 5180 } 5181 5182 uint32_t 5183 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 5184 { 5185 return bdev->phys_blocklen; 5186 } 5187 5188 static uint32_t 5189 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 5190 { 5191 if (!spdk_bdev_is_md_interleaved(bdev)) { 5192 return bdev->blocklen + bdev->md_len; 5193 } else { 5194 return bdev->blocklen; 5195 } 5196 } 5197 5198 /* We have to use the typedef in the function declaration to appease astyle. 
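 * The typedefs are used as return types for the DIF accessors below.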
*/ 5199 typedef enum spdk_dif_type spdk_dif_type_t; 5200 typedef enum spdk_dif_pi_format spdk_dif_pi_format_t; 5201 5202 spdk_dif_type_t 5203 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 5204 { 5205 if (bdev->md_len != 0) { 5206 return bdev->dif_type; 5207 } else { 5208 return SPDK_DIF_DISABLE; 5209 } 5210 } 5211 5212 spdk_dif_pi_format_t 5213 spdk_bdev_get_dif_pi_format(const struct spdk_bdev *bdev) 5214 { 5215 return bdev->dif_pi_format; 5216 } 5217 5218 bool 5219 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 5220 { 5221 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 5222 return bdev->dif_is_head_of_md; 5223 } else { 5224 return false; 5225 } 5226 } 5227 5228 bool 5229 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 5230 enum spdk_dif_check_type check_type) 5231 { 5232 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 5233 return false; 5234 } 5235 5236 switch (check_type) { 5237 case SPDK_DIF_CHECK_TYPE_REFTAG: 5238 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 5239 case SPDK_DIF_CHECK_TYPE_APPTAG: 5240 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 5241 case SPDK_DIF_CHECK_TYPE_GUARD: 5242 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 5243 default: 5244 return false; 5245 } 5246 } 5247 5248 static uint32_t 5249 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 5250 { 5251 uint64_t aligned_length, max_write_blocks; 5252 5253 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 5254 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 5255 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 5256 5257 return max_write_blocks; 5258 } 5259 5260 uint32_t 5261 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 5262 { 5263 return bdev->max_copy; 5264 } 5265 5266 uint64_t 5267 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 5268 { 5269 return bdev->internal.measured_queue_depth; 5270 } 5271 5272 uint64_t 5273 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 5274 { 5275 return bdev->internal.period; 5276 } 5277 5278 uint64_t 5279 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 5280 { 5281 return bdev->internal.weighted_io_time; 5282 } 5283 5284 uint64_t 5285 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 5286 { 5287 return bdev->internal.io_time; 5288 } 5289 5290 union spdk_bdev_nvme_ctratt spdk_bdev_get_nvme_ctratt(struct spdk_bdev *bdev) 5291 { 5292 return bdev->ctratt; 5293 } 5294 5295 uint32_t 5296 spdk_bdev_get_nvme_nsid(struct spdk_bdev *bdev) 5297 { 5298 return bdev->nsid; 5299 } 5300 5301 uint32_t 5302 spdk_bdev_desc_get_block_size(struct spdk_bdev_desc *desc) 5303 { 5304 struct spdk_bdev *bdev = desc->bdev; 5305 5306 return desc->opts.hide_metadata ? bdev->blocklen - bdev->md_len : bdev->blocklen; 5307 } 5308 5309 uint32_t 5310 spdk_bdev_desc_get_md_size(struct spdk_bdev_desc *desc) 5311 { 5312 struct spdk_bdev *bdev = desc->bdev; 5313 5314 return desc->opts.hide_metadata ? 0 : bdev->md_len; 5315 } 5316 5317 bool 5318 spdk_bdev_desc_is_md_interleaved(struct spdk_bdev_desc *desc) 5319 { 5320 struct spdk_bdev *bdev = desc->bdev; 5321 5322 return desc->opts.hide_metadata ? false : spdk_bdev_is_md_interleaved(bdev); 5323 } 5324 5325 bool 5326 spdk_bdev_desc_is_md_separate(struct spdk_bdev_desc *desc) 5327 { 5328 struct spdk_bdev *bdev = desc->bdev; 5329 5330 return desc->opts.hide_metadata ? 
false : spdk_bdev_is_md_separate(bdev); 5331 } 5332 5333 spdk_dif_type_t 5334 spdk_bdev_desc_get_dif_type(struct spdk_bdev_desc *desc) 5335 { 5336 struct spdk_bdev *bdev = desc->bdev; 5337 5338 return desc->opts.hide_metadata ? SPDK_DIF_DISABLE : spdk_bdev_get_dif_type(bdev); 5339 } 5340 5341 spdk_dif_pi_format_t 5342 spdk_bdev_desc_get_dif_pi_format(struct spdk_bdev_desc *desc) 5343 { 5344 struct spdk_bdev *bdev = desc->bdev; 5345 5346 return desc->opts.hide_metadata ? SPDK_DIF_PI_FORMAT_16 : spdk_bdev_get_dif_pi_format(bdev); 5347 } 5348 5349 bool 5350 spdk_bdev_desc_is_dif_head_of_md(struct spdk_bdev_desc *desc) 5351 { 5352 struct spdk_bdev *bdev = desc->bdev; 5353 5354 return desc->opts.hide_metadata ? false : spdk_bdev_is_dif_head_of_md(bdev); 5355 } 5356 5357 bool 5358 spdk_bdev_desc_is_dif_check_enabled(struct spdk_bdev_desc *desc, 5359 enum spdk_dif_check_type check_type) 5360 { 5361 struct spdk_bdev *bdev = desc->bdev; 5362 5363 return desc->opts.hide_metadata ? false : spdk_bdev_is_dif_check_enabled(bdev, check_type); 5364 } 5365 5366 static void bdev_update_qd_sampling_period(void *ctx); 5367 5368 static void 5369 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 5370 { 5371 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 5372 5373 if (bdev->internal.measured_queue_depth) { 5374 bdev->internal.io_time += bdev->internal.period; 5375 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 5376 } 5377 5378 bdev->internal.qd_poll_in_progress = false; 5379 5380 bdev_update_qd_sampling_period(bdev); 5381 } 5382 5383 static void 5384 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5385 struct spdk_io_channel *io_ch, void *_ctx) 5386 { 5387 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 5388 5389 bdev->internal.temporary_queue_depth += ch->io_outstanding; 5390 spdk_bdev_for_each_channel_continue(i, 0); 5391 } 5392 5393 static int 5394 bdev_calculate_measured_queue_depth(void *ctx) 5395 { 5396 struct spdk_bdev *bdev = ctx; 5397 5398 bdev->internal.qd_poll_in_progress = true; 5399 bdev->internal.temporary_queue_depth = 0; 5400 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 5401 return SPDK_POLLER_BUSY; 5402 } 5403 5404 static void 5405 bdev_update_qd_sampling_period(void *ctx) 5406 { 5407 struct spdk_bdev *bdev = ctx; 5408 5409 if (bdev->internal.period == bdev->internal.new_period) { 5410 return; 5411 } 5412 5413 if (bdev->internal.qd_poll_in_progress) { 5414 return; 5415 } 5416 5417 bdev->internal.period = bdev->internal.new_period; 5418 5419 spdk_poller_unregister(&bdev->internal.qd_poller); 5420 if (bdev->internal.period != 0) { 5421 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5422 bdev, bdev->internal.period); 5423 } else { 5424 spdk_bdev_close(bdev->internal.qd_desc); 5425 bdev->internal.qd_desc = NULL; 5426 } 5427 } 5428 5429 static void 5430 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 5431 { 5432 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 5433 } 5434 5435 void 5436 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 5437 { 5438 int rc; 5439 5440 if (bdev->internal.new_period == period) { 5441 return; 5442 } 5443 5444 bdev->internal.new_period = period; 5445 5446 if (bdev->internal.qd_desc != NULL) { 5447 assert(bdev->internal.period != 0); 5448 5449 
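/* Apply the new period on the thread that owns qd_desc; if a QD poll is in progress
 * there, the update is deferred until the poll completes. */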
spdk_thread_send_msg(bdev->internal.qd_desc->thread, 5450 bdev_update_qd_sampling_period, bdev); 5451 return; 5452 } 5453 5454 assert(bdev->internal.period == 0); 5455 5456 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 5457 NULL, &bdev->internal.qd_desc); 5458 if (rc != 0) { 5459 return; 5460 } 5461 5462 bdev->internal.period = period; 5463 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5464 bdev, period); 5465 } 5466 5467 struct bdev_get_current_qd_ctx { 5468 uint64_t current_qd; 5469 spdk_bdev_get_current_qd_cb cb_fn; 5470 void *cb_arg; 5471 }; 5472 5473 static void 5474 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 5475 { 5476 struct bdev_get_current_qd_ctx *ctx = _ctx; 5477 5478 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 5479 5480 free(ctx); 5481 } 5482 5483 static void 5484 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5485 struct spdk_io_channel *io_ch, void *_ctx) 5486 { 5487 struct bdev_get_current_qd_ctx *ctx = _ctx; 5488 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 5489 5490 ctx->current_qd += bdev_ch->io_outstanding; 5491 5492 spdk_bdev_for_each_channel_continue(i, 0); 5493 } 5494 5495 void 5496 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 5497 void *cb_arg) 5498 { 5499 struct bdev_get_current_qd_ctx *ctx; 5500 5501 assert(cb_fn != NULL); 5502 5503 ctx = calloc(1, sizeof(*ctx)); 5504 if (ctx == NULL) { 5505 cb_fn(bdev, 0, cb_arg, -ENOMEM); 5506 return; 5507 } 5508 5509 ctx->cb_fn = cb_fn; 5510 ctx->cb_arg = cb_arg; 5511 5512 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 5513 } 5514 5515 static void 5516 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 5517 { 5518 assert(desc->thread == spdk_get_thread()); 5519 5520 spdk_spin_lock(&desc->spinlock); 5521 desc->refs--; 5522 if (!desc->closed) { 5523 spdk_spin_unlock(&desc->spinlock); 5524 desc->callback.event_fn(type, 5525 desc->bdev, 5526 desc->callback.ctx); 5527 return; 5528 } else if (desc->refs == 0) { 5529 /* This descriptor was closed after this event_notify message was sent. 5530 * spdk_bdev_close() could not free the descriptor since this message was 5531 * in flight, so we free it now using bdev_desc_free(). 
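* (The matching refs++ is done in event_notify() below, right before the
* message is sent to this descriptor's thread.)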
5532 */ 5533 spdk_spin_unlock(&desc->spinlock); 5534 bdev_desc_free(desc); 5535 return; 5536 } 5537 spdk_spin_unlock(&desc->spinlock); 5538 } 5539 5540 static void 5541 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 5542 { 5543 spdk_spin_lock(&desc->spinlock); 5544 desc->refs++; 5545 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 5546 spdk_spin_unlock(&desc->spinlock); 5547 } 5548 5549 static void 5550 _resize_notify(void *ctx) 5551 { 5552 struct spdk_bdev_desc *desc = ctx; 5553 5554 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 5555 } 5556 5557 int 5558 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 5559 { 5560 struct spdk_bdev_desc *desc; 5561 int ret; 5562 5563 if (size == bdev->blockcnt) { 5564 return 0; 5565 } 5566 5567 spdk_spin_lock(&bdev->internal.spinlock); 5568 5569 /* bdev has open descriptors */ 5570 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 5571 bdev->blockcnt > size) { 5572 ret = -EBUSY; 5573 } else { 5574 bdev->blockcnt = size; 5575 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 5576 event_notify(desc, _resize_notify); 5577 } 5578 ret = 0; 5579 } 5580 5581 spdk_spin_unlock(&bdev->internal.spinlock); 5582 5583 return ret; 5584 } 5585 5586 /* 5587 * Convert I/O offset and length from bytes to blocks. 5588 * 5589 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 5590 */ 5591 static uint64_t 5592 bdev_bytes_to_blocks(struct spdk_bdev_desc *desc, uint64_t offset_bytes, 5593 uint64_t *offset_blocks, uint64_t num_bytes, uint64_t *num_blocks) 5594 { 5595 uint32_t block_size = bdev_desc_get_block_size(desc); 5596 uint8_t shift_cnt; 5597 5598 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
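* For example, with a 4096-byte (2^12) block size the pow2 path reduces to shifts:
* offset_bytes = 1048576 becomes offset_blocks = 256, while any byte value that is
* not a multiple of 4096 leaves a non-zero remainder in the return value, which the
* callers turn into -EINVAL.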
*/ 5599 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 5600 shift_cnt = spdk_u32log2(block_size); 5601 *offset_blocks = offset_bytes >> shift_cnt; 5602 *num_blocks = num_bytes >> shift_cnt; 5603 return (offset_bytes - (*offset_blocks << shift_cnt)) | 5604 (num_bytes - (*num_blocks << shift_cnt)); 5605 } else { 5606 *offset_blocks = offset_bytes / block_size; 5607 *num_blocks = num_bytes / block_size; 5608 return (offset_bytes % block_size) | (num_bytes % block_size); 5609 } 5610 } 5611 5612 static bool 5613 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 5614 { 5615 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 5616 * has been an overflow and hence the offset has been wrapped around */ 5617 if (offset_blocks + num_blocks < offset_blocks) { 5618 return false; 5619 } 5620 5621 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5622 if (offset_blocks + num_blocks > bdev->blockcnt) { 5623 return false; 5624 } 5625 5626 return true; 5627 } 5628 5629 static void 5630 bdev_seek_complete_cb(void *ctx) 5631 { 5632 struct spdk_bdev_io *bdev_io = ctx; 5633 5634 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5635 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5636 } 5637 5638 static int 5639 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5640 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5641 spdk_bdev_io_completion_cb cb, void *cb_arg) 5642 { 5643 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5644 struct spdk_bdev_io *bdev_io; 5645 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5646 5647 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5648 5649 /* Check if offset_blocks is valid looking at the validity of one block */ 5650 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5651 return -EINVAL; 5652 } 5653 5654 bdev_io = bdev_channel_get_io(channel); 5655 if (!bdev_io) { 5656 return -ENOMEM; 5657 } 5658 5659 bdev_io->internal.ch = channel; 5660 bdev_io->internal.desc = desc; 5661 bdev_io->type = io_type; 5662 bdev_io->u.bdev.offset_blocks = offset_blocks; 5663 bdev_io->u.bdev.memory_domain = NULL; 5664 bdev_io->u.bdev.memory_domain_ctx = NULL; 5665 bdev_io->u.bdev.accel_sequence = NULL; 5666 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5667 5668 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5669 /* In case bdev doesn't support seek to next data/hole offset, 5670 * it is assumed that only data and no holes are present */ 5671 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5672 bdev_io->u.bdev.seek.offset = offset_blocks; 5673 } else { 5674 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5675 } 5676 5677 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5678 return 0; 5679 } 5680 5681 bdev_io_submit(bdev_io); 5682 return 0; 5683 } 5684 5685 int 5686 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5687 uint64_t offset_blocks, 5688 spdk_bdev_io_completion_cb cb, void *cb_arg) 5689 { 5690 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5691 } 5692 5693 int 5694 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5695 uint64_t offset_blocks, 5696 spdk_bdev_io_completion_cb cb, void *cb_arg) 5697 { 5698 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5699 } 5700 5701 uint64_t 5702 spdk_bdev_io_get_seek_offset(const struct 
spdk_bdev_io *bdev_io) 5703 { 5704 return bdev_io->u.bdev.seek.offset; 5705 } 5706 5707 static int 5708 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5709 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5710 spdk_bdev_io_completion_cb cb, void *cb_arg) 5711 { 5712 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5713 struct spdk_bdev_io *bdev_io; 5714 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5715 5716 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5717 return -EINVAL; 5718 } 5719 5720 bdev_io = bdev_channel_get_io(channel); 5721 if (!bdev_io) { 5722 return -ENOMEM; 5723 } 5724 5725 bdev_io->internal.ch = channel; 5726 bdev_io->internal.desc = desc; 5727 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5728 bdev_io->u.bdev.iovs = &bdev_io->iov; 5729 bdev_io->u.bdev.iovs[0].iov_base = buf; 5730 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev_desc_get_block_size(desc); 5731 bdev_io->u.bdev.iovcnt = 1; 5732 bdev_io->u.bdev.md_buf = md_buf; 5733 bdev_io->u.bdev.num_blocks = num_blocks; 5734 bdev_io->u.bdev.offset_blocks = offset_blocks; 5735 bdev_io->u.bdev.memory_domain = NULL; 5736 bdev_io->u.bdev.memory_domain_ctx = NULL; 5737 bdev_io->u.bdev.accel_sequence = NULL; 5738 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5739 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5740 5741 bdev_io_submit(bdev_io); 5742 return 0; 5743 } 5744 5745 int 5746 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5747 void *buf, uint64_t offset, uint64_t nbytes, 5748 spdk_bdev_io_completion_cb cb, void *cb_arg) 5749 { 5750 uint64_t offset_blocks, num_blocks; 5751 5752 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 5753 return -EINVAL; 5754 } 5755 5756 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5757 } 5758 5759 int 5760 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5761 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5762 spdk_bdev_io_completion_cb cb, void *cb_arg) 5763 { 5764 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5765 } 5766 5767 int 5768 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5769 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5770 spdk_bdev_io_completion_cb cb, void *cb_arg) 5771 { 5772 struct iovec iov = { 5773 .iov_base = buf, 5774 }; 5775 5776 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5777 return -EINVAL; 5778 } 5779 5780 if ((md_buf || desc->opts.hide_metadata) && !_is_buf_allocated(&iov)) { 5781 return -EINVAL; 5782 } 5783 5784 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5785 cb, cb_arg); 5786 } 5787 5788 int 5789 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5790 struct iovec *iov, int iovcnt, 5791 uint64_t offset, uint64_t nbytes, 5792 spdk_bdev_io_completion_cb cb, void *cb_arg) 5793 { 5794 uint64_t offset_blocks, num_blocks; 5795 5796 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 5797 return -EINVAL; 5798 } 5799 5800 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5801 } 5802 5803 static int 5804 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5805 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 5806 uint64_t 
num_blocks, struct spdk_memory_domain *domain, void *domain_ctx, 5807 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 5808 spdk_bdev_io_completion_cb cb, void *cb_arg) 5809 { 5810 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5811 struct spdk_bdev_io *bdev_io; 5812 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5813 5814 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 5815 return -EINVAL; 5816 } 5817 5818 bdev_io = bdev_channel_get_io(channel); 5819 if (spdk_unlikely(!bdev_io)) { 5820 return -ENOMEM; 5821 } 5822 5823 bdev_io->internal.ch = channel; 5824 bdev_io->internal.desc = desc; 5825 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5826 bdev_io->u.bdev.iovs = iov; 5827 bdev_io->u.bdev.iovcnt = iovcnt; 5828 bdev_io->u.bdev.md_buf = md_buf; 5829 bdev_io->u.bdev.num_blocks = num_blocks; 5830 bdev_io->u.bdev.offset_blocks = offset_blocks; 5831 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5832 5833 if (seq != NULL) { 5834 bdev_io->internal.f.has_accel_sequence = true; 5835 bdev_io->internal.accel_sequence = seq; 5836 } 5837 5838 if (domain != NULL) { 5839 bdev_io->internal.f.has_memory_domain = true; 5840 bdev_io->internal.memory_domain = domain; 5841 bdev_io->internal.memory_domain_ctx = domain_ctx; 5842 } 5843 5844 bdev_io->u.bdev.memory_domain = domain; 5845 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5846 bdev_io->u.bdev.accel_sequence = seq; 5847 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 5848 5849 _bdev_io_submit_ext(desc, bdev_io); 5850 5851 return 0; 5852 } 5853 5854 int 5855 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5856 struct iovec *iov, int iovcnt, 5857 uint64_t offset_blocks, uint64_t num_blocks, 5858 spdk_bdev_io_completion_cb cb, void *cb_arg) 5859 { 5860 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5861 5862 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5863 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5864 } 5865 5866 int 5867 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5868 struct iovec *iov, int iovcnt, void *md_buf, 5869 uint64_t offset_blocks, uint64_t num_blocks, 5870 spdk_bdev_io_completion_cb cb, void *cb_arg) 5871 { 5872 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5873 5874 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 5875 return -EINVAL; 5876 } 5877 5878 if (md_buf && !_is_buf_allocated(iov)) { 5879 return -EINVAL; 5880 } 5881 5882 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5883 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5884 } 5885 5886 static inline bool 5887 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 5888 { 5889 /* 5890 * We check if opts size is at least the size it had when we first introduced 5891 * spdk_bdev_ext_io_opts (ac6f2bdd8d), since access to those members 5892 * is not checked internally.
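*
* A caller is expected to zero-initialize the structure and set its size field,
* roughly as follows (illustrative sketch only; desc, ch, iov, domain, domain_ctx
* and read_done are assumed to be set up elsewhere by the caller):
*
*   struct spdk_bdev_ext_io_opts opts = { .size = sizeof(opts) };
*
*   opts.memory_domain = domain;
*   opts.memory_domain_ctx = domain_ctx;
*   rc = spdk_bdev_readv_blocks_ext(desc, ch, iov, iovcnt, offset_blocks,
*                                   num_blocks, read_done, NULL, &opts);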
5893 */ 5894 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5895 sizeof(opts->metadata) && 5896 opts->size <= sizeof(*opts) && 5897 /* When memory domain is used, the user must provide data buffers */ 5898 (!opts->memory_domain || (iov && iov[0].iov_base)); 5899 } 5900 5901 int 5902 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5903 struct iovec *iov, int iovcnt, 5904 uint64_t offset_blocks, uint64_t num_blocks, 5905 spdk_bdev_io_completion_cb cb, void *cb_arg, 5906 struct spdk_bdev_ext_io_opts *opts) 5907 { 5908 struct spdk_memory_domain *domain = NULL; 5909 struct spdk_accel_sequence *seq = NULL; 5910 void *domain_ctx = NULL, *md = NULL; 5911 uint32_t dif_check_flags = 0; 5912 uint32_t nvme_cdw12_raw; 5913 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5914 5915 if (opts) { 5916 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5917 return -EINVAL; 5918 } 5919 5920 md = opts->metadata; 5921 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5922 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5923 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5924 nvme_cdw12_raw = bdev_get_ext_io_opt(opts, nvme_cdw12.raw, 0); 5925 if (md) { 5926 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5927 return -EINVAL; 5928 } 5929 5930 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5931 return -EINVAL; 5932 } 5933 5934 if (spdk_unlikely(seq != NULL)) { 5935 return -EINVAL; 5936 } 5937 5938 if (nvme_cdw12_raw & SPDK_DIF_FLAGS_NVME_PRACT) { 5939 SPDK_ERRLOG("Separate metadata with NVMe PRACT is not supported.\n"); 5940 return -ENOTSUP; 5941 } 5942 } 5943 5944 if (nvme_cdw12_raw & SPDK_DIF_FLAGS_NVME_PRACT) { 5945 dif_check_flags |= SPDK_DIF_FLAGS_NVME_PRACT; 5946 } 5947 } 5948 5949 dif_check_flags |= bdev->dif_check_flags & 5950 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5951 5952 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5953 num_blocks, domain, domain_ctx, seq, dif_check_flags, cb, cb_arg); 5954 } 5955 5956 static int 5957 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5958 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5959 spdk_bdev_io_completion_cb cb, void *cb_arg) 5960 { 5961 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5962 struct spdk_bdev_io *bdev_io; 5963 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5964 5965 if (!desc->write) { 5966 return -EBADF; 5967 } 5968 5969 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5970 return -EINVAL; 5971 } 5972 5973 bdev_io = bdev_channel_get_io(channel); 5974 if (!bdev_io) { 5975 return -ENOMEM; 5976 } 5977 5978 bdev_io->internal.ch = channel; 5979 bdev_io->internal.desc = desc; 5980 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5981 bdev_io->u.bdev.iovs = &bdev_io->iov; 5982 bdev_io->u.bdev.iovs[0].iov_base = buf; 5983 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev_desc_get_block_size(desc); 5984 bdev_io->u.bdev.iovcnt = 1; 5985 bdev_io->u.bdev.md_buf = md_buf; 5986 bdev_io->u.bdev.num_blocks = num_blocks; 5987 bdev_io->u.bdev.offset_blocks = offset_blocks; 5988 bdev_io->u.bdev.memory_domain = NULL; 5989 bdev_io->u.bdev.memory_domain_ctx = NULL; 5990 bdev_io->u.bdev.accel_sequence = NULL; 5991 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5992 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5993 5994 bdev_io_submit(bdev_io); 5995 return 0; 5996 } 5997 5998 int 5999 spdk_bdev_write(struct spdk_bdev_desc 
*desc, struct spdk_io_channel *ch, 6000 void *buf, uint64_t offset, uint64_t nbytes, 6001 spdk_bdev_io_completion_cb cb, void *cb_arg) 6002 { 6003 uint64_t offset_blocks, num_blocks; 6004 6005 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 6006 return -EINVAL; 6007 } 6008 6009 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 6010 } 6011 6012 int 6013 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6014 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 6015 spdk_bdev_io_completion_cb cb, void *cb_arg) 6016 { 6017 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 6018 cb, cb_arg); 6019 } 6020 6021 int 6022 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6023 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 6024 spdk_bdev_io_completion_cb cb, void *cb_arg) 6025 { 6026 struct iovec iov = { 6027 .iov_base = buf, 6028 }; 6029 6030 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 6031 return -EINVAL; 6032 } 6033 6034 if (md_buf && !_is_buf_allocated(&iov)) { 6035 return -EINVAL; 6036 } 6037 6038 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 6039 cb, cb_arg); 6040 } 6041 6042 static int 6043 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6044 struct iovec *iov, int iovcnt, void *md_buf, 6045 uint64_t offset_blocks, uint64_t num_blocks, 6046 struct spdk_memory_domain *domain, void *domain_ctx, 6047 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 6048 uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw, 6049 spdk_bdev_io_completion_cb cb, void *cb_arg) 6050 { 6051 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6052 struct spdk_bdev_io *bdev_io; 6053 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6054 6055 if (spdk_unlikely(!desc->write)) { 6056 return -EBADF; 6057 } 6058 6059 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 6060 return -EINVAL; 6061 } 6062 6063 bdev_io = bdev_channel_get_io(channel); 6064 if (spdk_unlikely(!bdev_io)) { 6065 return -ENOMEM; 6066 } 6067 6068 bdev_io->internal.ch = channel; 6069 bdev_io->internal.desc = desc; 6070 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 6071 bdev_io->u.bdev.iovs = iov; 6072 bdev_io->u.bdev.iovcnt = iovcnt; 6073 bdev_io->u.bdev.md_buf = md_buf; 6074 bdev_io->u.bdev.num_blocks = num_blocks; 6075 bdev_io->u.bdev.offset_blocks = offset_blocks; 6076 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6077 if (seq != NULL) { 6078 bdev_io->internal.f.has_accel_sequence = true; 6079 bdev_io->internal.accel_sequence = seq; 6080 } 6081 6082 if (domain != NULL) { 6083 bdev_io->internal.f.has_memory_domain = true; 6084 bdev_io->internal.memory_domain = domain; 6085 bdev_io->internal.memory_domain_ctx = domain_ctx; 6086 } 6087 6088 bdev_io->u.bdev.memory_domain = domain; 6089 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 6090 bdev_io->u.bdev.accel_sequence = seq; 6091 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 6092 bdev_io->u.bdev.nvme_cdw12.raw = nvme_cdw12_raw; 6093 bdev_io->u.bdev.nvme_cdw13.raw = nvme_cdw13_raw; 6094 6095 _bdev_io_submit_ext(desc, bdev_io); 6096 6097 return 0; 6098 } 6099 6100 int 6101 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6102 struct iovec *iov, int iovcnt, 6103 uint64_t offset, uint64_t len, 6104 spdk_bdev_io_completion_cb cb, void *cb_arg) 6105 { 6106 
uint64_t offset_blocks, num_blocks; 6107 6108 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, len, &num_blocks) != 0) { 6109 return -EINVAL; 6110 } 6111 6112 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 6113 } 6114 6115 int 6116 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6117 struct iovec *iov, int iovcnt, 6118 uint64_t offset_blocks, uint64_t num_blocks, 6119 spdk_bdev_io_completion_cb cb, void *cb_arg) 6120 { 6121 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6122 6123 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 6124 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 6125 cb, cb_arg); 6126 } 6127 6128 int 6129 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6130 struct iovec *iov, int iovcnt, void *md_buf, 6131 uint64_t offset_blocks, uint64_t num_blocks, 6132 spdk_bdev_io_completion_cb cb, void *cb_arg) 6133 { 6134 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6135 6136 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 6137 return -EINVAL; 6138 } 6139 6140 if (md_buf && !_is_buf_allocated(iov)) { 6141 return -EINVAL; 6142 } 6143 6144 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 6145 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 6146 cb, cb_arg); 6147 } 6148 6149 int 6150 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6151 struct iovec *iov, int iovcnt, 6152 uint64_t offset_blocks, uint64_t num_blocks, 6153 spdk_bdev_io_completion_cb cb, void *cb_arg, 6154 struct spdk_bdev_ext_io_opts *opts) 6155 { 6156 struct spdk_memory_domain *domain = NULL; 6157 struct spdk_accel_sequence *seq = NULL; 6158 void *domain_ctx = NULL, *md = NULL; 6159 uint32_t dif_check_flags = 0; 6160 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6161 uint32_t nvme_cdw12_raw = 0; 6162 uint32_t nvme_cdw13_raw = 0; 6163 6164 if (opts) { 6165 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 6166 return -EINVAL; 6167 } 6168 md = opts->metadata; 6169 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 6170 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 6171 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 6172 nvme_cdw12_raw = bdev_get_ext_io_opt(opts, nvme_cdw12.raw, 0); 6173 nvme_cdw13_raw = bdev_get_ext_io_opt(opts, nvme_cdw13.raw, 0); 6174 if (md) { 6175 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 6176 return -EINVAL; 6177 } 6178 6179 if (spdk_unlikely(!_is_buf_allocated(iov))) { 6180 return -EINVAL; 6181 } 6182 6183 if (spdk_unlikely(seq != NULL)) { 6184 return -EINVAL; 6185 } 6186 6187 if (nvme_cdw12_raw & SPDK_DIF_FLAGS_NVME_PRACT) { 6188 SPDK_ERRLOG("Separate metadata with NVMe PRACT is not supported.\n"); 6189 return -ENOTSUP; 6190 } 6191 } 6192 6193 if (nvme_cdw12_raw & SPDK_DIF_FLAGS_NVME_PRACT) { 6194 dif_check_flags |= SPDK_DIF_FLAGS_NVME_PRACT; 6195 } 6196 } 6197 6198 dif_check_flags |= bdev->dif_check_flags & 6199 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 6200 6201 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 6202 domain, domain_ctx, seq, dif_check_flags, 6203 nvme_cdw12_raw, nvme_cdw13_raw, cb, cb_arg); 6204 } 6205 6206 static void 6207 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6208 { 6209 struct spdk_bdev_io *parent_io = cb_arg; 6210 struct spdk_bdev *bdev = parent_io->bdev; 
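/* COMPARE emulation: bdev_compare_do_read() below read the whole range into a
 * single buffer (allocated by the bdev layer since NULL was passed for buf).
 * Compare that buffer against the caller's iovs chunk by chunk, then against
 * the separate metadata buffer, if one was supplied. */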
6211 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 6212 int i, rc = 0; 6213 6214 if (!success) { 6215 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6216 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 6217 spdk_bdev_free_io(bdev_io); 6218 return; 6219 } 6220 6221 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 6222 rc = memcmp(read_buf, 6223 parent_io->u.bdev.iovs[i].iov_base, 6224 parent_io->u.bdev.iovs[i].iov_len); 6225 if (rc) { 6226 break; 6227 } 6228 read_buf += parent_io->u.bdev.iovs[i].iov_len; 6229 } 6230 6231 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 6232 rc = memcmp(bdev_io->u.bdev.md_buf, 6233 parent_io->u.bdev.md_buf, 6234 spdk_bdev_get_md_size(bdev)); 6235 } 6236 6237 spdk_bdev_free_io(bdev_io); 6238 6239 if (rc == 0) { 6240 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6241 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 6242 } else { 6243 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 6244 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 6245 } 6246 } 6247 6248 static void 6249 bdev_compare_do_read(void *_bdev_io) 6250 { 6251 struct spdk_bdev_io *bdev_io = _bdev_io; 6252 int rc; 6253 6254 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 6255 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 6256 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6257 bdev_compare_do_read_done, bdev_io); 6258 6259 if (rc == -ENOMEM) { 6260 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 6261 } else if (rc != 0) { 6262 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6263 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6264 } 6265 } 6266 6267 static int 6268 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6269 struct iovec *iov, int iovcnt, void *md_buf, 6270 uint64_t offset_blocks, uint64_t num_blocks, 6271 spdk_bdev_io_completion_cb cb, void *cb_arg) 6272 { 6273 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6274 struct spdk_bdev_io *bdev_io; 6275 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6276 6277 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6278 return -EINVAL; 6279 } 6280 6281 bdev_io = bdev_channel_get_io(channel); 6282 if (!bdev_io) { 6283 return -ENOMEM; 6284 } 6285 6286 bdev_io->internal.ch = channel; 6287 bdev_io->internal.desc = desc; 6288 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 6289 bdev_io->u.bdev.iovs = iov; 6290 bdev_io->u.bdev.iovcnt = iovcnt; 6291 bdev_io->u.bdev.md_buf = md_buf; 6292 bdev_io->u.bdev.num_blocks = num_blocks; 6293 bdev_io->u.bdev.offset_blocks = offset_blocks; 6294 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6295 bdev_io->u.bdev.memory_domain = NULL; 6296 bdev_io->u.bdev.memory_domain_ctx = NULL; 6297 bdev_io->u.bdev.accel_sequence = NULL; 6298 6299 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 6300 bdev_io_submit(bdev_io); 6301 return 0; 6302 } 6303 6304 bdev_compare_do_read(bdev_io); 6305 6306 return 0; 6307 } 6308 6309 int 6310 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6311 struct iovec *iov, int iovcnt, 6312 uint64_t offset_blocks, uint64_t num_blocks, 6313 spdk_bdev_io_completion_cb cb, void *cb_arg) 6314 { 6315 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 6316 num_blocks, cb, cb_arg); 6317 } 6318 6319 int 6320 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct 
spdk_io_channel *ch, 6321 struct iovec *iov, int iovcnt, void *md_buf, 6322 uint64_t offset_blocks, uint64_t num_blocks, 6323 spdk_bdev_io_completion_cb cb, void *cb_arg) 6324 { 6325 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 6326 return -EINVAL; 6327 } 6328 6329 if (md_buf && !_is_buf_allocated(iov)) { 6330 return -EINVAL; 6331 } 6332 6333 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 6334 num_blocks, cb, cb_arg); 6335 } 6336 6337 static int 6338 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6339 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 6340 spdk_bdev_io_completion_cb cb, void *cb_arg) 6341 { 6342 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6343 struct spdk_bdev_io *bdev_io; 6344 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6345 6346 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6347 return -EINVAL; 6348 } 6349 6350 bdev_io = bdev_channel_get_io(channel); 6351 if (!bdev_io) { 6352 return -ENOMEM; 6353 } 6354 6355 bdev_io->internal.ch = channel; 6356 bdev_io->internal.desc = desc; 6357 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 6358 bdev_io->u.bdev.iovs = &bdev_io->iov; 6359 bdev_io->u.bdev.iovs[0].iov_base = buf; 6360 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev_desc_get_block_size(desc); 6361 bdev_io->u.bdev.iovcnt = 1; 6362 bdev_io->u.bdev.md_buf = md_buf; 6363 bdev_io->u.bdev.num_blocks = num_blocks; 6364 bdev_io->u.bdev.offset_blocks = offset_blocks; 6365 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6366 bdev_io->u.bdev.memory_domain = NULL; 6367 bdev_io->u.bdev.memory_domain_ctx = NULL; 6368 bdev_io->u.bdev.accel_sequence = NULL; 6369 6370 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 6371 bdev_io_submit(bdev_io); 6372 return 0; 6373 } 6374 6375 bdev_compare_do_read(bdev_io); 6376 6377 return 0; 6378 } 6379 6380 int 6381 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6382 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 6383 spdk_bdev_io_completion_cb cb, void *cb_arg) 6384 { 6385 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 6386 cb, cb_arg); 6387 } 6388 6389 int 6390 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6391 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 6392 spdk_bdev_io_completion_cb cb, void *cb_arg) 6393 { 6394 struct iovec iov = { 6395 .iov_base = buf, 6396 }; 6397 6398 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 6399 return -EINVAL; 6400 } 6401 6402 if (md_buf && !_is_buf_allocated(&iov)) { 6403 return -EINVAL; 6404 } 6405 6406 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 6407 cb, cb_arg); 6408 } 6409 6410 static void 6411 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status) 6412 { 6413 struct spdk_bdev_io *bdev_io = ctx; 6414 6415 if (unlock_status) { 6416 SPDK_ERRLOG("LBA range unlock failed\n"); 6417 } 6418 6419 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? 
true : 6420 false, bdev_io->internal.caller_ctx); 6421 } 6422 6423 static void 6424 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 6425 { 6426 bdev_io->internal.status = status; 6427 6428 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 6429 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6430 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 6431 } 6432 6433 static void 6434 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6435 { 6436 struct spdk_bdev_io *parent_io = cb_arg; 6437 6438 if (!success) { 6439 SPDK_ERRLOG("Compare and write operation failed\n"); 6440 } 6441 6442 spdk_bdev_free_io(bdev_io); 6443 6444 bdev_comparev_and_writev_blocks_unlock(parent_io, 6445 success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 6446 } 6447 6448 static void 6449 bdev_compare_and_write_do_write(void *_bdev_io) 6450 { 6451 struct spdk_bdev_io *bdev_io = _bdev_io; 6452 int rc; 6453 6454 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 6455 spdk_io_channel_from_ctx(bdev_io->internal.ch), 6456 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 6457 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6458 bdev_compare_and_write_do_write_done, bdev_io); 6459 6460 6461 if (rc == -ENOMEM) { 6462 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 6463 } else if (rc != 0) { 6464 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6465 } 6466 } 6467 6468 static void 6469 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6470 { 6471 struct spdk_bdev_io *parent_io = cb_arg; 6472 6473 spdk_bdev_free_io(bdev_io); 6474 6475 if (!success) { 6476 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 6477 return; 6478 } 6479 6480 bdev_compare_and_write_do_write(parent_io); 6481 } 6482 6483 static void 6484 bdev_compare_and_write_do_compare(void *_bdev_io) 6485 { 6486 struct spdk_bdev_io *bdev_io = _bdev_io; 6487 int rc; 6488 6489 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 6490 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 6491 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6492 bdev_compare_and_write_do_compare_done, bdev_io); 6493 6494 if (rc == -ENOMEM) { 6495 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 6496 } else if (rc != 0) { 6497 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 6498 } 6499 } 6500 6501 static void 6502 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status) 6503 { 6504 struct spdk_bdev_io *bdev_io = ctx; 6505 6506 if (status) { 6507 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 6508 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6509 return; 6510 } 6511 6512 bdev_compare_and_write_do_compare(bdev_io); 6513 } 6514 6515 int 6516 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6517 struct iovec *compare_iov, int compare_iovcnt, 6518 struct iovec *write_iov, int write_iovcnt, 6519 uint64_t offset_blocks, uint64_t num_blocks, 6520 spdk_bdev_io_completion_cb cb, void *cb_arg) 6521 { 6522 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6523 struct spdk_bdev_io *bdev_io; 6524 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6525 6526 if 
(!desc->write) { 6527 return -EBADF; 6528 } 6529 6530 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6531 return -EINVAL; 6532 } 6533 6534 if (num_blocks > bdev->acwu) { 6535 return -EINVAL; 6536 } 6537 6538 bdev_io = bdev_channel_get_io(channel); 6539 if (!bdev_io) { 6540 return -ENOMEM; 6541 } 6542 6543 bdev_io->internal.ch = channel; 6544 bdev_io->internal.desc = desc; 6545 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 6546 bdev_io->u.bdev.iovs = compare_iov; 6547 bdev_io->u.bdev.iovcnt = compare_iovcnt; 6548 bdev_io->u.bdev.fused_iovs = write_iov; 6549 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 6550 bdev_io->u.bdev.md_buf = NULL; 6551 bdev_io->u.bdev.num_blocks = num_blocks; 6552 bdev_io->u.bdev.offset_blocks = offset_blocks; 6553 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6554 bdev_io->u.bdev.memory_domain = NULL; 6555 bdev_io->u.bdev.memory_domain_ctx = NULL; 6556 bdev_io->u.bdev.accel_sequence = NULL; 6557 6558 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 6559 bdev_io_submit(bdev_io); 6560 return 0; 6561 } 6562 6563 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 6564 bdev_comparev_and_writev_blocks_locked, bdev_io); 6565 } 6566 6567 int 6568 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6569 struct iovec *iov, int iovcnt, 6570 uint64_t offset_blocks, uint64_t num_blocks, 6571 bool populate, 6572 spdk_bdev_io_completion_cb cb, void *cb_arg) 6573 { 6574 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6575 struct spdk_bdev_io *bdev_io; 6576 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6577 6578 if (!desc->write) { 6579 return -EBADF; 6580 } 6581 6582 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6583 return -EINVAL; 6584 } 6585 6586 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 6587 return -ENOTSUP; 6588 } 6589 6590 bdev_io = bdev_channel_get_io(channel); 6591 if (!bdev_io) { 6592 return -ENOMEM; 6593 } 6594 6595 bdev_io->internal.ch = channel; 6596 bdev_io->internal.desc = desc; 6597 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 6598 bdev_io->u.bdev.num_blocks = num_blocks; 6599 bdev_io->u.bdev.offset_blocks = offset_blocks; 6600 bdev_io->u.bdev.iovs = iov; 6601 bdev_io->u.bdev.iovcnt = iovcnt; 6602 bdev_io->u.bdev.md_buf = NULL; 6603 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 6604 bdev_io->u.bdev.zcopy.commit = 0; 6605 bdev_io->u.bdev.zcopy.start = 1; 6606 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6607 bdev_io->u.bdev.memory_domain = NULL; 6608 bdev_io->u.bdev.memory_domain_ctx = NULL; 6609 bdev_io->u.bdev.accel_sequence = NULL; 6610 6611 bdev_io_submit(bdev_io); 6612 6613 return 0; 6614 } 6615 6616 int 6617 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 6618 spdk_bdev_io_completion_cb cb, void *cb_arg) 6619 { 6620 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 6621 return -EINVAL; 6622 } 6623 6624 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 6625 bdev_io->u.bdev.zcopy.start = 0; 6626 bdev_io->internal.caller_ctx = cb_arg; 6627 bdev_io->internal.cb = cb; 6628 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 6629 6630 bdev_io_submit(bdev_io); 6631 6632 return 0; 6633 } 6634 6635 int 6636 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6637 uint64_t offset, uint64_t len, 6638 spdk_bdev_io_completion_cb cb, void *cb_arg) 6639 { 6640 uint64_t offset_blocks, num_blocks; 6641 6642 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, len, &num_blocks) != 0) { 6643 return -EINVAL; 6644 } 6645 6646 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6647 } 6648 6649 int 6650 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6651 uint64_t offset_blocks, uint64_t num_blocks, 6652 spdk_bdev_io_completion_cb cb, void *cb_arg) 6653 { 6654 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6655 struct spdk_bdev_io *bdev_io; 6656 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6657 6658 if (!desc->write) { 6659 return -EBADF; 6660 } 6661 6662 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6663 return -EINVAL; 6664 } 6665 6666 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 6667 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 6668 return -ENOTSUP; 6669 } 6670 6671 bdev_io = bdev_channel_get_io(channel); 6672 6673 if (!bdev_io) { 6674 return -ENOMEM; 6675 } 6676 6677 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 6678 bdev_io->internal.ch = channel; 6679 bdev_io->internal.desc = desc; 6680 bdev_io->u.bdev.offset_blocks = offset_blocks; 6681 bdev_io->u.bdev.num_blocks = num_blocks; 6682 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6683 bdev_io->u.bdev.memory_domain = NULL; 6684 bdev_io->u.bdev.memory_domain_ctx = NULL; 6685 bdev_io->u.bdev.accel_sequence = NULL; 6686 6687 /* If the write_zeroes size is large and should be split, use the generic split 6688 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported or not. 6689 * 6690 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported 6691 * or emulate it using a regular write request otherwise.
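* (The fallback below, bdev_write_zero_buffer(), emulates the command by writing
* from an internal zero buffer, which is why a single block, including metadata,
* must not exceed ZERO_BUFFER_SIZE; see the assert below.)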
6692 */ 6693 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 6694 bdev_io->internal.f.split) { 6695 bdev_io_submit(bdev_io); 6696 return 0; 6697 } 6698 6699 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6700 6701 return bdev_write_zero_buffer(bdev_io); 6702 } 6703 6704 int 6705 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6706 uint64_t offset, uint64_t nbytes, 6707 spdk_bdev_io_completion_cb cb, void *cb_arg) 6708 { 6709 uint64_t offset_blocks, num_blocks; 6710 6711 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 6712 return -EINVAL; 6713 } 6714 6715 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6716 } 6717 6718 static void 6719 bdev_io_complete_cb(void *ctx) 6720 { 6721 struct spdk_bdev_io *bdev_io = ctx; 6722 6723 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6724 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 6725 } 6726 6727 int 6728 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6729 uint64_t offset_blocks, uint64_t num_blocks, 6730 spdk_bdev_io_completion_cb cb, void *cb_arg) 6731 { 6732 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6733 struct spdk_bdev_io *bdev_io; 6734 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6735 6736 if (!desc->write) { 6737 return -EBADF; 6738 } 6739 6740 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6741 return -EINVAL; 6742 } 6743 6744 bdev_io = bdev_channel_get_io(channel); 6745 if (!bdev_io) { 6746 return -ENOMEM; 6747 } 6748 6749 bdev_io->internal.ch = channel; 6750 bdev_io->internal.desc = desc; 6751 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6752 6753 bdev_io->u.bdev.iovs = &bdev_io->iov; 6754 bdev_io->u.bdev.iovs[0].iov_base = NULL; 6755 bdev_io->u.bdev.iovs[0].iov_len = 0; 6756 bdev_io->u.bdev.iovcnt = 1; 6757 6758 bdev_io->u.bdev.offset_blocks = offset_blocks; 6759 bdev_io->u.bdev.num_blocks = num_blocks; 6760 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6761 bdev_io->u.bdev.memory_domain = NULL; 6762 bdev_io->u.bdev.memory_domain_ctx = NULL; 6763 bdev_io->u.bdev.accel_sequence = NULL; 6764 6765 if (num_blocks == 0) { 6766 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 6767 return 0; 6768 } 6769 6770 bdev_io_submit(bdev_io); 6771 return 0; 6772 } 6773 6774 int 6775 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6776 uint64_t offset, uint64_t length, 6777 spdk_bdev_io_completion_cb cb, void *cb_arg) 6778 { 6779 uint64_t offset_blocks, num_blocks; 6780 6781 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, length, &num_blocks) != 0) { 6782 return -EINVAL; 6783 } 6784 6785 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6786 } 6787 6788 int 6789 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6790 uint64_t offset_blocks, uint64_t num_blocks, 6791 spdk_bdev_io_completion_cb cb, void *cb_arg) 6792 { 6793 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6794 struct spdk_bdev_io *bdev_io; 6795 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6796 6797 if (!desc->write) { 6798 return -EBADF; 6799 } 6800 6801 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH))) { 6802 return -ENOTSUP; 6803 } 6804 6805 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6806 return -EINVAL; 6807 } 6808 6809 bdev_io = bdev_channel_get_io(channel); 6810 if (!bdev_io) { 6811 return 
-ENOMEM; 6812 } 6813 6814 bdev_io->internal.ch = channel; 6815 bdev_io->internal.desc = desc; 6816 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 6817 bdev_io->u.bdev.iovs = NULL; 6818 bdev_io->u.bdev.iovcnt = 0; 6819 bdev_io->u.bdev.offset_blocks = offset_blocks; 6820 bdev_io->u.bdev.num_blocks = num_blocks; 6821 bdev_io->u.bdev.memory_domain = NULL; 6822 bdev_io->u.bdev.memory_domain_ctx = NULL; 6823 bdev_io->u.bdev.accel_sequence = NULL; 6824 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6825 6826 bdev_io_submit(bdev_io); 6827 return 0; 6828 } 6829 6830 static int bdev_reset_poll_for_outstanding_io(void *ctx); 6831 6832 static void 6833 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 6834 { 6835 struct spdk_bdev_io *bdev_io = _ctx; 6836 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 6837 6838 if (status == -EBUSY) { 6839 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 6840 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 6841 bdev_io, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 6842 } else { 6843 if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) { 6844 /* If outstanding IOs are still present and reset_io_drain_timeout 6845 * seconds passed, start the reset. */ 6846 bdev_io_submit_reset(bdev_io); 6847 } else { 6848 /* We still have in progress memory domain pull/push or we're 6849 * executing accel sequence. Since we cannot abort either of those 6850 * operations, fail the reset request. */ 6851 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6852 } 6853 } 6854 } else { 6855 SPDK_DEBUGLOG(bdev, 6856 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 6857 ch->bdev->name); 6858 /* Mark the completion status as a SUCCESS and complete the reset. */ 6859 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 6860 } 6861 } 6862 6863 static void 6864 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6865 struct spdk_io_channel *io_ch, void *_ctx) 6866 { 6867 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 6868 int status = 0; 6869 6870 if (cur_ch->io_outstanding > 0 || 6871 !TAILQ_EMPTY(&cur_ch->io_memory_domain) || 6872 !TAILQ_EMPTY(&cur_ch->io_accel_exec)) { 6873 /* If a channel has outstanding IO, set status to -EBUSY code. This will stop 6874 * further iteration over the rest of the channels and pass non-zero status 6875 * to the callback function. 
*/ 6876 status = -EBUSY; 6877 } 6878 spdk_bdev_for_each_channel_continue(i, status); 6879 } 6880 6881 static int 6882 bdev_reset_poll_for_outstanding_io(void *ctx) 6883 { 6884 struct spdk_bdev_io *bdev_io = ctx; 6885 6886 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6887 spdk_bdev_for_each_channel(bdev_io->bdev, bdev_reset_check_outstanding_io, bdev_io, 6888 bdev_reset_check_outstanding_io_done); 6889 6890 return SPDK_POLLER_BUSY; 6891 } 6892 6893 static void 6894 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6895 { 6896 struct spdk_bdev_io *bdev_io = _ctx; 6897 6898 if (bdev->reset_io_drain_timeout == 0) { 6899 bdev_io_submit_reset(bdev_io); 6900 return; 6901 } 6902 6903 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6904 (bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6905 6906 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6907 * submit the reset to the underlying module only if outstanding I/O 6908 * remain after reset_io_drain_timeout seconds have passed. */ 6909 spdk_bdev_for_each_channel(bdev, bdev_reset_check_outstanding_io, bdev_io, 6910 bdev_reset_check_outstanding_io_done); 6911 } 6912 6913 static void 6914 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6915 struct spdk_io_channel *ch, void *_ctx) 6916 { 6917 struct spdk_bdev_channel *channel; 6918 struct spdk_bdev_mgmt_channel *mgmt_channel; 6919 struct spdk_bdev_shared_resource *shared_resource; 6920 bdev_io_tailq_t tmp_queued; 6921 6922 TAILQ_INIT(&tmp_queued); 6923 6924 channel = __io_ch_to_bdev_ch(ch); 6925 shared_resource = channel->shared_resource; 6926 mgmt_channel = shared_resource->mgmt_ch; 6927 6928 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6929 6930 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6931 TAILQ_SWAP(&channel->qos_queued_io, &tmp_queued, spdk_bdev_io, internal.link); 6932 } 6933 6934 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6935 bdev_abort_all_buf_io(mgmt_channel, channel); 6936 bdev_abort_all_queued_io(&tmp_queued, channel); 6937 6938 spdk_bdev_for_each_channel_continue(i, 0); 6939 } 6940 6941 static void 6942 bdev_start_reset(struct spdk_bdev_io *bdev_io) 6943 { 6944 struct spdk_bdev *bdev = bdev_io->bdev; 6945 bool freeze_channel = false; 6946 6947 bdev_ch_add_to_io_submitted(bdev_io); 6948 6949 /** 6950 * Take a channel reference for the target bdev for the life of this 6951 * reset. This guards against the channel getting destroyed before 6952 * the reset is completed. We will release the reference when this 6953 * reset is completed. 
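* (The reference is dropped again with spdk_put_io_channel() once this reset
* I/O completes.)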
6954 */ 6955 bdev_io->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6956 6957 spdk_spin_lock(&bdev->internal.spinlock); 6958 if (bdev->internal.reset_in_progress == NULL) { 6959 bdev->internal.reset_in_progress = bdev_io; 6960 freeze_channel = true; 6961 } else { 6962 TAILQ_INSERT_TAIL(&bdev->internal.queued_resets, bdev_io, internal.link); 6963 } 6964 spdk_spin_unlock(&bdev->internal.spinlock); 6965 6966 if (freeze_channel) { 6967 spdk_bdev_for_each_channel(bdev, bdev_reset_freeze_channel, bdev_io, 6968 bdev_reset_freeze_channel_done); 6969 } 6970 } 6971 6972 int 6973 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6974 spdk_bdev_io_completion_cb cb, void *cb_arg) 6975 { 6976 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6977 struct spdk_bdev_io *bdev_io; 6978 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6979 6980 bdev_io = bdev_channel_get_io(channel); 6981 if (!bdev_io) { 6982 return -ENOMEM; 6983 } 6984 6985 bdev_io->internal.ch = channel; 6986 bdev_io->internal.desc = desc; 6987 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6988 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6989 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6990 6991 bdev_start_reset(bdev_io); 6992 return 0; 6993 } 6994 6995 void 6996 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6997 struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode reset_mode) 6998 { 6999 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7000 7001 bdev_get_io_stat(stat, channel->stat); 7002 spdk_bdev_reset_io_stat(channel->stat, reset_mode); 7003 } 7004 7005 static void 7006 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 7007 { 7008 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 7009 7010 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 7011 bdev_iostat_ctx->cb_arg, 0); 7012 free(bdev_iostat_ctx); 7013 } 7014 7015 static void 7016 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7017 struct spdk_io_channel *ch, void *_ctx) 7018 { 7019 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 7020 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7021 7022 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 7023 spdk_bdev_reset_io_stat(channel->stat, bdev_iostat_ctx->reset_mode); 7024 spdk_bdev_for_each_channel_continue(i, 0); 7025 } 7026 7027 void 7028 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 7029 enum spdk_bdev_reset_stat_mode reset_mode, spdk_bdev_get_device_stat_cb cb, void *cb_arg) 7030 { 7031 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 7032 7033 assert(bdev != NULL); 7034 assert(stat != NULL); 7035 assert(cb != NULL); 7036 7037 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 7038 if (bdev_iostat_ctx == NULL) { 7039 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 7040 cb(bdev, stat, cb_arg, -ENOMEM); 7041 return; 7042 } 7043 7044 bdev_iostat_ctx->stat = stat; 7045 bdev_iostat_ctx->cb = cb; 7046 bdev_iostat_ctx->cb_arg = cb_arg; 7047 bdev_iostat_ctx->reset_mode = reset_mode; 7048 7049 /* Start with the statistics from previously deleted channels. */ 7050 spdk_spin_lock(&bdev->internal.spinlock); 7051 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 7052 spdk_bdev_reset_io_stat(bdev->internal.stat, reset_mode); 7053 spdk_spin_unlock(&bdev->internal.spinlock); 7054 7055 /* Then iterate and add the statistics from each existing channel. 
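*
* A typical caller provides its own struct spdk_bdev_io_stat and reads the
* aggregated counters from the callback, roughly as follows (stat_done and stat
* are the caller's; SPDK_BDEV_RESET_STAT_NONE leaves the counters untouched):
*
*   static void
*   stat_done(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
*             void *cb_arg, int rc)
*   {
*           if (rc == 0) {
*                   printf("%s: %" PRIu64 " read ops\n",
*                          spdk_bdev_get_name(bdev), stat->num_read_ops);
*           }
*   }
*
*   spdk_bdev_get_device_stat(bdev, stat, SPDK_BDEV_RESET_STAT_NONE,
*                             stat_done, NULL);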
*/ 7056 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 7057 bdev_get_device_stat_done); 7058 } 7059 7060 struct bdev_iostat_reset_ctx { 7061 enum spdk_bdev_reset_stat_mode mode; 7062 bdev_reset_device_stat_cb cb; 7063 void *cb_arg; 7064 }; 7065 7066 static void 7067 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 7068 { 7069 struct bdev_iostat_reset_ctx *ctx = _ctx; 7070 7071 ctx->cb(bdev, ctx->cb_arg, 0); 7072 7073 free(ctx); 7074 } 7075 7076 static void 7077 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7078 struct spdk_io_channel *ch, void *_ctx) 7079 { 7080 struct bdev_iostat_reset_ctx *ctx = _ctx; 7081 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7082 7083 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 7084 7085 spdk_bdev_for_each_channel_continue(i, 0); 7086 } 7087 7088 void 7089 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 7090 bdev_reset_device_stat_cb cb, void *cb_arg) 7091 { 7092 struct bdev_iostat_reset_ctx *ctx; 7093 7094 assert(bdev != NULL); 7095 assert(cb != NULL); 7096 7097 ctx = calloc(1, sizeof(*ctx)); 7098 if (ctx == NULL) { 7099 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 7100 cb(bdev, cb_arg, -ENOMEM); 7101 return; 7102 } 7103 7104 ctx->mode = mode; 7105 ctx->cb = cb; 7106 ctx->cb_arg = cb_arg; 7107 7108 spdk_spin_lock(&bdev->internal.spinlock); 7109 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 7110 spdk_spin_unlock(&bdev->internal.spinlock); 7111 7112 spdk_bdev_for_each_channel(bdev, 7113 bdev_reset_each_channel_stat, 7114 ctx, 7115 bdev_reset_device_stat_done); 7116 } 7117 7118 int 7119 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 7120 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 7121 spdk_bdev_io_completion_cb cb, void *cb_arg) 7122 { 7123 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7124 struct spdk_bdev_io *bdev_io; 7125 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7126 7127 if (!desc->write) { 7128 return -EBADF; 7129 } 7130 7131 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 7132 return -ENOTSUP; 7133 } 7134 7135 bdev_io = bdev_channel_get_io(channel); 7136 if (!bdev_io) { 7137 return -ENOMEM; 7138 } 7139 7140 bdev_io->internal.ch = channel; 7141 bdev_io->internal.desc = desc; 7142 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 7143 bdev_io->u.nvme_passthru.cmd = *cmd; 7144 bdev_io->u.nvme_passthru.buf = buf; 7145 bdev_io->u.nvme_passthru.nbytes = nbytes; 7146 bdev_io->u.nvme_passthru.md_buf = NULL; 7147 bdev_io->u.nvme_passthru.md_len = 0; 7148 7149 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7150 7151 bdev_io_submit(bdev_io); 7152 return 0; 7153 } 7154 7155 int 7156 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 7157 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 7158 spdk_bdev_io_completion_cb cb, void *cb_arg) 7159 { 7160 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7161 struct spdk_bdev_io *bdev_io; 7162 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7163 7164 if (!desc->write) { 7165 /* 7166 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 7167 * to easily determine if the command is a read or write, but for now just 7168 * do not allow io_passthru with a read-only descriptor. 
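* For example, a write-capable descriptor could issue a raw flush roughly like
* this (pt_done is the caller's completion callback; whether a given opcode is
* honored is up to the underlying bdev module):
*
*   struct spdk_nvme_cmd cmd = {};
*
*   cmd.opc = SPDK_NVME_OPC_FLUSH;
*   rc = spdk_bdev_nvme_io_passthru(desc, io_ch, &cmd, NULL, 0, pt_done, NULL);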
7169 */ 7170 return -EBADF; 7171 } 7172 7173 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 7174 return -ENOTSUP; 7175 } 7176 7177 bdev_io = bdev_channel_get_io(channel); 7178 if (!bdev_io) { 7179 return -ENOMEM; 7180 } 7181 7182 bdev_io->internal.ch = channel; 7183 bdev_io->internal.desc = desc; 7184 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 7185 bdev_io->u.nvme_passthru.cmd = *cmd; 7186 bdev_io->u.nvme_passthru.buf = buf; 7187 bdev_io->u.nvme_passthru.nbytes = nbytes; 7188 bdev_io->u.nvme_passthru.md_buf = NULL; 7189 bdev_io->u.nvme_passthru.md_len = 0; 7190 7191 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7192 7193 bdev_io_submit(bdev_io); 7194 return 0; 7195 } 7196 7197 int 7198 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 7199 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 7200 spdk_bdev_io_completion_cb cb, void *cb_arg) 7201 { 7202 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7203 struct spdk_bdev_io *bdev_io; 7204 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7205 7206 if (!desc->write) { 7207 /* 7208 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 7209 * to easily determine if the command is a read or write, but for now just 7210 * do not allow io_passthru with a read-only descriptor. 7211 */ 7212 return -EBADF; 7213 } 7214 7215 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 7216 return -ENOTSUP; 7217 } 7218 7219 bdev_io = bdev_channel_get_io(channel); 7220 if (!bdev_io) { 7221 return -ENOMEM; 7222 } 7223 7224 bdev_io->internal.ch = channel; 7225 bdev_io->internal.desc = desc; 7226 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 7227 bdev_io->u.nvme_passthru.cmd = *cmd; 7228 bdev_io->u.nvme_passthru.buf = buf; 7229 bdev_io->u.nvme_passthru.nbytes = nbytes; 7230 bdev_io->u.nvme_passthru.md_buf = md_buf; 7231 bdev_io->u.nvme_passthru.md_len = md_len; 7232 7233 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7234 7235 bdev_io_submit(bdev_io); 7236 return 0; 7237 } 7238 7239 int 7240 spdk_bdev_nvme_iov_passthru_md(struct spdk_bdev_desc *desc, 7241 struct spdk_io_channel *ch, 7242 const struct spdk_nvme_cmd *cmd, 7243 struct iovec *iov, int iovcnt, size_t nbytes, 7244 void *md_buf, size_t md_len, 7245 spdk_bdev_io_completion_cb cb, void *cb_arg) 7246 { 7247 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7248 struct spdk_bdev_io *bdev_io; 7249 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7250 7251 if (!desc->write) { 7252 /* 7253 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 7254 * to easily determine if the command is a read or write, but for now just 7255 * do not allow io_passthru with a read-only descriptor. 
7256 */ 7257 return -EBADF; 7258 } 7259 7260 if (md_buf && spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 7261 return -ENOTSUP; 7262 } else if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 7263 return -ENOTSUP; 7264 } 7265 7266 bdev_io = bdev_channel_get_io(channel); 7267 if (!bdev_io) { 7268 return -ENOMEM; 7269 } 7270 7271 bdev_io->internal.ch = channel; 7272 bdev_io->internal.desc = desc; 7273 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IOV_MD; 7274 bdev_io->u.nvme_passthru.cmd = *cmd; 7275 bdev_io->u.nvme_passthru.iovs = iov; 7276 bdev_io->u.nvme_passthru.iovcnt = iovcnt; 7277 bdev_io->u.nvme_passthru.nbytes = nbytes; 7278 bdev_io->u.nvme_passthru.md_buf = md_buf; 7279 bdev_io->u.nvme_passthru.md_len = md_len; 7280 7281 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7282 7283 bdev_io_submit(bdev_io); 7284 return 0; 7285 } 7286 7287 static void bdev_abort_retry(void *ctx); 7288 static void bdev_abort(struct spdk_bdev_io *parent_io); 7289 7290 static void 7291 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 7292 { 7293 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 7294 struct spdk_bdev_io *parent_io = cb_arg; 7295 struct spdk_bdev_io *bio_to_abort, *tmp_io; 7296 7297 bio_to_abort = bdev_io->u.abort.bio_to_abort; 7298 7299 spdk_bdev_free_io(bdev_io); 7300 7301 if (!success) { 7302 /* Check if the target I/O completed in the meantime. */ 7303 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 7304 if (tmp_io == bio_to_abort) { 7305 break; 7306 } 7307 } 7308 7309 /* If the target I/O still exists, set the parent to failed. */ 7310 if (tmp_io != NULL) { 7311 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7312 } 7313 } 7314 7315 assert(parent_io->internal.f.split); 7316 7317 parent_io->internal.split.outstanding--; 7318 if (parent_io->internal.split.outstanding == 0) { 7319 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7320 bdev_abort_retry(parent_io); 7321 } else { 7322 bdev_io_complete(parent_io); 7323 } 7324 } 7325 } 7326 7327 static int 7328 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 7329 struct spdk_bdev_io *bio_to_abort, 7330 spdk_bdev_io_completion_cb cb, void *cb_arg) 7331 { 7332 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7333 struct spdk_bdev_io *bdev_io; 7334 7335 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 7336 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 7337 /* TODO: Abort reset or abort request. */ 7338 return -ENOTSUP; 7339 } 7340 7341 bdev_io = bdev_channel_get_io(channel); 7342 if (bdev_io == NULL) { 7343 return -ENOMEM; 7344 } 7345 7346 bdev_io->internal.ch = channel; 7347 bdev_io->internal.desc = desc; 7348 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7349 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7350 7351 if (bio_to_abort->internal.f.split) { 7352 assert(bdev_io_should_split(bio_to_abort)); 7353 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 7354 7355 /* Parent abort request is not submitted directly, but to manage its 7356 * execution add it to the submitted list here. 7357 */ 7358 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7359 bdev_ch_add_to_io_submitted(bdev_io); 7360 7361 bdev_abort(bdev_io); 7362 7363 return 0; 7364 } 7365 7366 bdev_io->u.abort.bio_to_abort = bio_to_abort; 7367 7368 /* Submit the abort request to the underlying bdev module. 
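 * The backing module receives this as a normal submit_request() with
 * bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT and u.abort.bio_to_abort pointing
 * at the victim I/O, roughly (module-side sketch; my_cancel() is hypothetical):
 *
 *      case SPDK_BDEV_IO_TYPE_ABORT:
 *              my_cancel(ch, bdev_io->u.abort.bio_to_abort);
 *              break;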
*/ 7369 bdev_io_submit(bdev_io); 7370 7371 return 0; 7372 } 7373 7374 static bool 7375 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq) 7376 { 7377 struct spdk_bdev_io *iter; 7378 7379 TAILQ_FOREACH(iter, tailq, internal.link) { 7380 if (iter == bdev_io) { 7381 return true; 7382 } 7383 } 7384 7385 return false; 7386 } 7387 7388 static uint32_t 7389 _bdev_abort(struct spdk_bdev_io *parent_io) 7390 { 7391 struct spdk_bdev_desc *desc = parent_io->internal.desc; 7392 struct spdk_bdev_channel *channel = parent_io->internal.ch; 7393 void *bio_cb_arg; 7394 struct spdk_bdev_io *bio_to_abort; 7395 uint32_t matched_ios; 7396 int rc; 7397 7398 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 7399 7400 /* matched_ios is returned and will be kept by the caller. 7401 * 7402 * This function is used for two cases: 1) the same cb_arg is used for 7403 * multiple I/Os, and 2) a single large I/O is split into smaller ones. 7404 * Incrementing split_outstanding directly here may confuse readers, especially 7405 * in the 1st case. 7406 * 7407 * Completion of I/O abort is processed after stack unwinding. Hence this trick 7408 * works as expected. 7409 */ 7410 matched_ios = 0; 7411 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 7412 7413 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 7414 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 7415 continue; 7416 } 7417 7418 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 7419 /* Any I/O which was submitted after this abort command should be excluded. */ 7420 continue; 7421 } 7422 7423 /* We can't abort a request that's being pushed/pulled or executed by accel */ 7424 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) || 7425 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) { 7426 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7427 break; 7428 } 7429 7430 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 7431 if (rc != 0) { 7432 if (rc == -ENOMEM) { 7433 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 7434 } else { 7435 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7436 } 7437 break; 7438 } 7439 matched_ios++; 7440 } 7441 7442 return matched_ios; 7443 } 7444 7445 static void 7446 bdev_abort_retry(void *ctx) 7447 { 7448 struct spdk_bdev_io *parent_io = ctx; 7449 uint32_t matched_ios; 7450 7451 matched_ios = _bdev_abort(parent_io); 7452 7453 if (matched_ios == 0) { 7454 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7455 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 7456 } else { 7457 /* For retry, the case where no target I/O was found is a success 7458 * because it means the target I/Os completed in the meantime. 7459 */ 7460 bdev_io_complete(parent_io); 7461 } 7462 return; 7463 } 7464 7465 /* Use split_outstanding to manage the progress of aborting I/Os. */ 7466 parent_io->internal.f.split = true; 7467 parent_io->internal.split.outstanding = matched_ios; 7468 } 7469 7470 static void 7471 bdev_abort(struct spdk_bdev_io *parent_io) 7472 { 7473 uint32_t matched_ios; 7474 7475 matched_ios = _bdev_abort(parent_io); 7476 7477 if (matched_ios == 0) { 7478 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7479 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 7480 } else { 7481 /* The case where no target I/O was found is a failure.
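 * (Contrast with bdev_abort_retry() above, where finding no matching I/O is
 *  treated as success because the targets completed in the meantime.)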
*/ 7482 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7483 bdev_io_complete(parent_io); 7484 } 7485 return; 7486 } 7487 7488 /* Use split_outstanding to manage the progress of aborting I/Os. */ 7489 parent_io->internal.f.split = true; 7490 parent_io->internal.split.outstanding = matched_ios; 7491 } 7492 7493 int 7494 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 7495 void *bio_cb_arg, 7496 spdk_bdev_io_completion_cb cb, void *cb_arg) 7497 { 7498 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7499 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7500 struct spdk_bdev_io *bdev_io; 7501 7502 if (bio_cb_arg == NULL) { 7503 return -EINVAL; 7504 } 7505 7506 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 7507 return -ENOTSUP; 7508 } 7509 7510 bdev_io = bdev_channel_get_io(channel); 7511 if (bdev_io == NULL) { 7512 return -ENOMEM; 7513 } 7514 7515 bdev_io->internal.ch = channel; 7516 bdev_io->internal.desc = desc; 7517 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7518 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7519 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7520 7521 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 7522 7523 /* Parent abort request is not submitted directly, but to manage its execution, 7524 * add it to the submitted list here. 7525 */ 7526 bdev_ch_add_to_io_submitted(bdev_io); 7527 7528 bdev_abort(bdev_io); 7529 7530 return 0; 7531 } 7532 7533 int 7534 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 7535 struct spdk_bdev_io_wait_entry *entry) 7536 { 7537 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7538 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 7539 7540 if (bdev != entry->bdev) { 7541 SPDK_ERRLOG("bdevs do not match\n"); 7542 return -EINVAL; 7543 } 7544 7545 if (mgmt_ch->per_thread_cache_count > 0) { 7546 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 7547 return -EINVAL; 7548 } 7549 7550 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 7551 return 0; 7552 } 7553 7554 static inline void 7555 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 7556 { 7557 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 7558 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 7559 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 7560 uint32_t blocklen = bdev_io->bdev->blocklen; 7561 7562 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7563 switch (bdev_io->type) { 7564 case SPDK_BDEV_IO_TYPE_READ: 7565 io_stat->bytes_read += num_blocks * blocklen; 7566 io_stat->num_read_ops++; 7567 io_stat->read_latency_ticks += tsc_diff; 7568 if (io_stat->max_read_latency_ticks < tsc_diff) { 7569 io_stat->max_read_latency_ticks = tsc_diff; 7570 } 7571 if (io_stat->min_read_latency_ticks > tsc_diff) { 7572 io_stat->min_read_latency_ticks = tsc_diff; 7573 } 7574 break; 7575 case SPDK_BDEV_IO_TYPE_WRITE: 7576 io_stat->bytes_written += num_blocks * blocklen; 7577 io_stat->num_write_ops++; 7578 io_stat->write_latency_ticks += tsc_diff; 7579 if (io_stat->max_write_latency_ticks < tsc_diff) { 7580 io_stat->max_write_latency_ticks = tsc_diff; 7581 } 7582 if (io_stat->min_write_latency_ticks > tsc_diff) { 7583 io_stat->min_write_latency_ticks = tsc_diff; 7584 } 7585 break; 7586 case SPDK_BDEV_IO_TYPE_UNMAP: 7587 io_stat->bytes_unmapped += num_blocks * blocklen; 7588 io_stat->num_unmap_ops++; 7589 io_stat->unmap_latency_ticks += tsc_diff; 7590 if 
(io_stat->max_unmap_latency_ticks < tsc_diff) { 7591 io_stat->max_unmap_latency_ticks = tsc_diff; 7592 } 7593 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 7594 io_stat->min_unmap_latency_ticks = tsc_diff; 7595 } 7596 break; 7597 case SPDK_BDEV_IO_TYPE_ZCOPY: 7598 /* Track the data in the start phase only */ 7599 if (bdev_io->u.bdev.zcopy.start) { 7600 if (bdev_io->u.bdev.zcopy.populate) { 7601 io_stat->bytes_read += num_blocks * blocklen; 7602 io_stat->num_read_ops++; 7603 io_stat->read_latency_ticks += tsc_diff; 7604 if (io_stat->max_read_latency_ticks < tsc_diff) { 7605 io_stat->max_read_latency_ticks = tsc_diff; 7606 } 7607 if (io_stat->min_read_latency_ticks > tsc_diff) { 7608 io_stat->min_read_latency_ticks = tsc_diff; 7609 } 7610 } else { 7611 io_stat->bytes_written += num_blocks * blocklen; 7612 io_stat->num_write_ops++; 7613 io_stat->write_latency_ticks += tsc_diff; 7614 if (io_stat->max_write_latency_ticks < tsc_diff) { 7615 io_stat->max_write_latency_ticks = tsc_diff; 7616 } 7617 if (io_stat->min_write_latency_ticks > tsc_diff) { 7618 io_stat->min_write_latency_ticks = tsc_diff; 7619 } 7620 } 7621 } 7622 break; 7623 case SPDK_BDEV_IO_TYPE_COPY: 7624 io_stat->bytes_copied += num_blocks * blocklen; 7625 io_stat->num_copy_ops++; 7626 bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; 7627 if (io_stat->max_copy_latency_ticks < tsc_diff) { 7628 io_stat->max_copy_latency_ticks = tsc_diff; 7629 } 7630 if (io_stat->min_copy_latency_ticks > tsc_diff) { 7631 io_stat->min_copy_latency_ticks = tsc_diff; 7632 } 7633 break; 7634 default: 7635 break; 7636 } 7637 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 7638 io_stat = bdev_io->bdev->internal.stat; 7639 assert(io_stat->io_error != NULL); 7640 7641 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 7642 io_stat->io_error->error_status[-io_status - 1]++; 7643 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 7644 } 7645 7646 #ifdef SPDK_CONFIG_VTUNE 7647 uint64_t now_tsc = spdk_get_ticks(); 7648 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 7649 uint64_t data[5]; 7650 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 7651 7652 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 7653 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 7654 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 7655 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 7656 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 
7657 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 7658 7659 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 7660 __itt_metadata_u64, 5, data); 7661 7662 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 7663 bdev_io->internal.ch->start_tsc = now_tsc; 7664 } 7665 #endif 7666 } 7667 7668 static inline void 7669 _bdev_io_complete(void *ctx) 7670 { 7671 struct spdk_bdev_io *bdev_io = ctx; 7672 7673 if (spdk_unlikely(bdev_io_use_accel_sequence(bdev_io))) { 7674 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7675 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 7676 } 7677 7678 assert(bdev_io->internal.cb != NULL); 7679 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 7680 7681 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 7682 bdev_io->internal.caller_ctx); 7683 } 7684 7685 static inline void 7686 bdev_io_complete(void *ctx) 7687 { 7688 struct spdk_bdev_io *bdev_io = ctx; 7689 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7690 uint64_t tsc, tsc_diff; 7691 7692 if (spdk_unlikely(bdev_io->internal.f.in_submit_request)) { 7693 /* 7694 * Defer completion to avoid potential infinite recursion if the 7695 * user's completion callback issues a new I/O. 7696 */ 7697 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7698 bdev_io_complete, bdev_io); 7699 return; 7700 } 7701 7702 tsc = spdk_get_ticks(); 7703 tsc_diff = tsc - bdev_io->internal.submit_tsc; 7704 7705 bdev_ch_remove_from_io_submitted(bdev_io); 7706 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, bdev_ch->trace_id, 0, (uintptr_t)bdev_io, 7707 bdev_io->internal.caller_ctx, bdev_ch->queue_depth); 7708 7709 if (bdev_ch->histogram) { 7710 if (bdev_io->bdev->internal.histogram_io_type == 0 || 7711 bdev_io->bdev->internal.histogram_io_type == bdev_io->type) { 7712 /* 7713 * Tally all I/O types if the histogram_io_type is set to 0. 7714 */ 7715 spdk_histogram_data_tally(bdev_ch->histogram, tsc_diff); 7716 } 7717 } 7718 7719 bdev_io_update_io_stat(bdev_io, tsc_diff); 7720 _bdev_io_complete(bdev_io); 7721 } 7722 7723 /* The difference between this function and bdev_io_complete() is that this should be called to 7724 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 7725 * io_submitted list and don't have submit_tsc updated. 7726 */ 7727 static inline void 7728 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 7729 { 7730 /* Since the IO hasn't been submitted it's bound to be failed */ 7731 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7732 7733 /* At this point we don't know if the IO is completed from submission context or not, but, 7734 * since this is an error path, we can always do an spdk_thread_send_msg(). */ 7735 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7736 _bdev_io_complete, bdev_io); 7737 } 7738 7739 static void bdev_destroy_cb(void *io_device); 7740 7741 static inline void 7742 _bdev_reset_complete(void *ctx) 7743 { 7744 struct spdk_bdev_io *bdev_io = ctx; 7745 7746 /* Put the channel reference we got in submission. 
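 * (It pairs with the channel reference obtained when the reset was submitted,
 *  so each completed reset releases exactly one reference.)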
*/ 7747 assert(bdev_io->u.reset.ch_ref != NULL); 7748 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7749 bdev_io->u.reset.ch_ref = NULL; 7750 7751 bdev_io_complete(bdev_io); 7752 } 7753 7754 static void 7755 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7756 { 7757 struct spdk_bdev_io *bdev_io = _ctx; 7758 bdev_io_tailq_t queued_resets; 7759 struct spdk_bdev_io *queued_reset; 7760 7761 assert(bdev_io == bdev->internal.reset_in_progress); 7762 7763 TAILQ_INIT(&queued_resets); 7764 7765 spdk_spin_lock(&bdev->internal.spinlock); 7766 TAILQ_SWAP(&bdev->internal.queued_resets, &queued_resets, 7767 spdk_bdev_io, internal.link); 7768 bdev->internal.reset_in_progress = NULL; 7769 spdk_spin_unlock(&bdev->internal.spinlock); 7770 7771 while (!TAILQ_EMPTY(&queued_resets)) { 7772 queued_reset = TAILQ_FIRST(&queued_resets); 7773 TAILQ_REMOVE(&queued_resets, queued_reset, internal.link); 7774 queued_reset->internal.status = bdev_io->internal.status; 7775 spdk_thread_send_msg(spdk_bdev_io_get_thread(queued_reset), 7776 _bdev_reset_complete, queued_reset); 7777 } 7778 7779 _bdev_reset_complete(bdev_io); 7780 7781 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7782 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7783 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7784 } 7785 } 7786 7787 static void 7788 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7789 struct spdk_io_channel *_ch, void *_ctx) 7790 { 7791 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7792 7793 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7794 7795 spdk_bdev_for_each_channel_continue(i, 0); 7796 } 7797 7798 static void 7799 bdev_io_complete_sequence_cb(void *ctx, int status) 7800 { 7801 struct spdk_bdev_io *bdev_io = ctx; 7802 7803 /* u.bdev.accel_sequence should have already been cleared at this point */ 7804 assert(bdev_io->u.bdev.accel_sequence == NULL); 7805 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7806 bdev_io->internal.f.has_accel_sequence = false; 7807 7808 if (spdk_unlikely(status != 0)) { 7809 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7810 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7811 } 7812 7813 bdev_io_complete(bdev_io); 7814 } 7815 7816 void 7817 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7818 { 7819 struct spdk_bdev *bdev = bdev_io->bdev; 7820 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7821 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7822 7823 if (spdk_unlikely(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING)) { 7824 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7825 spdk_bdev_get_module_name(bdev), 7826 bdev_io_status_get_string(bdev_io->internal.status)); 7827 assert(false); 7828 } 7829 bdev_io->internal.status = status; 7830 7831 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7832 assert(bdev_io == bdev->internal.reset_in_progress); 7833 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 7834 bdev_reset_complete); 7835 return; 7836 } else { 7837 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7838 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7839 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7840 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7841 return; 7842 } else if (spdk_unlikely(bdev_io->internal.f.has_bounce_buf && 7843 
!bdev_io_use_accel_sequence(bdev_io))) { 7844 _bdev_io_push_bounce_data_buffer(bdev_io, 7845 _bdev_io_complete_push_bounce_done); 7846 /* bdev IO will be completed in the callback */ 7847 return; 7848 } 7849 } 7850 7851 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7852 return; 7853 } 7854 } 7855 7856 bdev_io_complete(bdev_io); 7857 } 7858 7859 void 7860 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 7861 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7862 { 7863 enum spdk_bdev_io_status status; 7864 7865 if (sc == SPDK_SCSI_STATUS_GOOD) { 7866 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7867 } else { 7868 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7869 bdev_io->internal.error.scsi.sc = sc; 7870 bdev_io->internal.error.scsi.sk = sk; 7871 bdev_io->internal.error.scsi.asc = asc; 7872 bdev_io->internal.error.scsi.ascq = ascq; 7873 } 7874 7875 spdk_bdev_io_complete(bdev_io, status); 7876 } 7877 7878 void 7879 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7880 int *sc, int *sk, int *asc, int *ascq) 7881 { 7882 assert(sc != NULL); 7883 assert(sk != NULL); 7884 assert(asc != NULL); 7885 assert(ascq != NULL); 7886 7887 switch (bdev_io->internal.status) { 7888 case SPDK_BDEV_IO_STATUS_SUCCESS: 7889 *sc = SPDK_SCSI_STATUS_GOOD; 7890 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7891 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7892 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7893 break; 7894 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7895 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7896 break; 7897 case SPDK_BDEV_IO_STATUS_MISCOMPARE: 7898 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7899 *sk = SPDK_SCSI_SENSE_MISCOMPARE; 7900 *asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION; 7901 *ascq = bdev_io->internal.error.scsi.ascq; 7902 break; 7903 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7904 *sc = bdev_io->internal.error.scsi.sc; 7905 *sk = bdev_io->internal.error.scsi.sk; 7906 *asc = bdev_io->internal.error.scsi.asc; 7907 *ascq = bdev_io->internal.error.scsi.ascq; 7908 break; 7909 default: 7910 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7911 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7912 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7913 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7914 break; 7915 } 7916 } 7917 7918 void 7919 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7920 { 7921 enum spdk_bdev_io_status status; 7922 7923 if (aio_result == 0) { 7924 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7925 } else { 7926 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7927 } 7928 7929 bdev_io->internal.error.aio_result = aio_result; 7930 7931 spdk_bdev_io_complete(bdev_io, status); 7932 } 7933 7934 void 7935 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7936 { 7937 assert(aio_result != NULL); 7938 7939 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7940 *aio_result = bdev_io->internal.error.aio_result; 7941 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7942 *aio_result = 0; 7943 } else { 7944 *aio_result = -EIO; 7945 } 7946 } 7947 7948 void 7949 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7950 { 7951 enum spdk_bdev_io_status status; 7952 7953 if (spdk_likely(sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS)) { 7954 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7955 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7956 status = SPDK_BDEV_IO_STATUS_ABORTED; 7957 
} else { 7958 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7959 } 7960 7961 bdev_io->internal.error.nvme.cdw0 = cdw0; 7962 bdev_io->internal.error.nvme.sct = sct; 7963 bdev_io->internal.error.nvme.sc = sc; 7964 7965 spdk_bdev_io_complete(bdev_io, status); 7966 } 7967 7968 void 7969 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7970 { 7971 assert(sct != NULL); 7972 assert(sc != NULL); 7973 assert(cdw0 != NULL); 7974 7975 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7976 *sct = SPDK_NVME_SCT_GENERIC; 7977 *sc = SPDK_NVME_SC_SUCCESS; 7978 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7979 *cdw0 = 0; 7980 } else { 7981 *cdw0 = 1U; 7982 } 7983 return; 7984 } 7985 7986 if (spdk_likely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7987 *sct = SPDK_NVME_SCT_GENERIC; 7988 *sc = SPDK_NVME_SC_SUCCESS; 7989 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7990 *sct = bdev_io->internal.error.nvme.sct; 7991 *sc = bdev_io->internal.error.nvme.sc; 7992 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7993 *sct = SPDK_NVME_SCT_GENERIC; 7994 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7995 } else { 7996 *sct = SPDK_NVME_SCT_GENERIC; 7997 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7998 } 7999 8000 *cdw0 = bdev_io->internal.error.nvme.cdw0; 8001 } 8002 8003 void 8004 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 8005 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 8006 { 8007 assert(first_sct != NULL); 8008 assert(first_sc != NULL); 8009 assert(second_sct != NULL); 8010 assert(second_sc != NULL); 8011 assert(cdw0 != NULL); 8012 8013 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 8014 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 8015 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 8016 *first_sct = bdev_io->internal.error.nvme.sct; 8017 *first_sc = bdev_io->internal.error.nvme.sc; 8018 *second_sct = SPDK_NVME_SCT_GENERIC; 8019 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 8020 } else { 8021 *first_sct = SPDK_NVME_SCT_GENERIC; 8022 *first_sc = SPDK_NVME_SC_SUCCESS; 8023 *second_sct = bdev_io->internal.error.nvme.sct; 8024 *second_sc = bdev_io->internal.error.nvme.sc; 8025 } 8026 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 8027 *first_sct = SPDK_NVME_SCT_GENERIC; 8028 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 8029 *second_sct = SPDK_NVME_SCT_GENERIC; 8030 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 8031 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 8032 *first_sct = SPDK_NVME_SCT_GENERIC; 8033 *first_sc = SPDK_NVME_SC_SUCCESS; 8034 *second_sct = SPDK_NVME_SCT_GENERIC; 8035 *second_sc = SPDK_NVME_SC_SUCCESS; 8036 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 8037 *first_sct = SPDK_NVME_SCT_GENERIC; 8038 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 8039 *second_sct = SPDK_NVME_SCT_GENERIC; 8040 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 8041 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 8042 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 8043 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 8044 *second_sct = SPDK_NVME_SCT_GENERIC; 8045 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 8046 } else { 8047 *first_sct = SPDK_NVME_SCT_GENERIC; 8048 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 8049 *second_sct = SPDK_NVME_SCT_GENERIC; 8050 *second_sc = 
SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 8051 } 8052 8053 *cdw0 = bdev_io->internal.error.nvme.cdw0; 8054 } 8055 8056 void 8057 spdk_bdev_io_complete_base_io_status(struct spdk_bdev_io *bdev_io, 8058 const struct spdk_bdev_io *base_io) 8059 { 8060 switch (base_io->internal.status) { 8061 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 8062 spdk_bdev_io_complete_nvme_status(bdev_io, 8063 base_io->internal.error.nvme.cdw0, 8064 base_io->internal.error.nvme.sct, 8065 base_io->internal.error.nvme.sc); 8066 break; 8067 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 8068 spdk_bdev_io_complete_scsi_status(bdev_io, 8069 base_io->internal.error.scsi.sc, 8070 base_io->internal.error.scsi.sk, 8071 base_io->internal.error.scsi.asc, 8072 base_io->internal.error.scsi.ascq); 8073 break; 8074 case SPDK_BDEV_IO_STATUS_AIO_ERROR: 8075 spdk_bdev_io_complete_aio_status(bdev_io, base_io->internal.error.aio_result); 8076 break; 8077 default: 8078 spdk_bdev_io_complete(bdev_io, base_io->internal.status); 8079 break; 8080 } 8081 } 8082 8083 struct spdk_thread * 8084 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 8085 { 8086 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 8087 } 8088 8089 struct spdk_io_channel * 8090 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 8091 { 8092 return bdev_io->internal.ch->channel; 8093 } 8094 8095 static int 8096 bdev_register(struct spdk_bdev *bdev) 8097 { 8098 char *bdev_name; 8099 char uuid[SPDK_UUID_STRING_LEN]; 8100 struct spdk_iobuf_opts iobuf_opts; 8101 int ret; 8102 8103 assert(bdev->module != NULL); 8104 8105 if (!bdev->name) { 8106 SPDK_ERRLOG("Bdev name is NULL\n"); 8107 return -EINVAL; 8108 } 8109 8110 if (!strlen(bdev->name)) { 8111 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 8112 return -EINVAL; 8113 } 8114 8115 /* Users often register their own I/O devices using the bdev name. In 8116 * order to avoid conflicts, prepend bdev_. */ 8117 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 8118 if (!bdev_name) { 8119 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 8120 return -ENOMEM; 8121 } 8122 8123 bdev->internal.stat = bdev_alloc_io_stat(true); 8124 if (!bdev->internal.stat) { 8125 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 8126 free(bdev_name); 8127 return -ENOMEM; 8128 } 8129 8130 bdev->internal.status = SPDK_BDEV_STATUS_READY; 8131 bdev->internal.measured_queue_depth = UINT64_MAX; 8132 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8133 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 8134 bdev->internal.qd_poller = NULL; 8135 bdev->internal.qos = NULL; 8136 8137 TAILQ_INIT(&bdev->internal.open_descs); 8138 TAILQ_INIT(&bdev->internal.locked_ranges); 8139 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 8140 TAILQ_INIT(&bdev->internal.queued_resets); 8141 TAILQ_INIT(&bdev->aliases); 8142 8143 /* UUID may be specified by the user or defined by bdev itself. 8144 * Otherwise it will be generated here, so this field will never be empty. 
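 * (Illustrative: a module that wants a stable, user-visible UUID can fill it in
 *  before calling spdk_bdev_register(), e.g.
 *
 *      spdk_uuid_parse(&bdev->uuid, "f0e9d8c7-0000-4000-8000-000000000001");
 *
 *  the UUID string above is only a placeholder.)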
*/ 8145 if (spdk_uuid_is_null(&bdev->uuid)) { 8146 spdk_uuid_generate(&bdev->uuid); 8147 } 8148 8149 /* Add the UUID alias only if it's different than the name */ 8150 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 8151 if (strcmp(bdev->name, uuid) != 0) { 8152 ret = spdk_bdev_alias_add(bdev, uuid); 8153 if (ret != 0) { 8154 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 8155 bdev_free_io_stat(bdev->internal.stat); 8156 free(bdev_name); 8157 return ret; 8158 } 8159 } 8160 8161 spdk_iobuf_get_opts(&iobuf_opts, sizeof(iobuf_opts)); 8162 if (spdk_bdev_get_buf_align(bdev) > 1) { 8163 bdev->max_rw_size = spdk_min(bdev->max_rw_size ? bdev->max_rw_size : UINT32_MAX, 8164 iobuf_opts.large_bufsize / bdev->blocklen); 8165 } 8166 8167 /* If the user didn't specify a write unit size, set it to one. */ 8168 if (bdev->write_unit_size == 0) { 8169 bdev->write_unit_size = 1; 8170 } 8171 8172 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 8173 if (bdev->acwu == 0) { 8174 bdev->acwu = bdev->write_unit_size; 8175 } 8176 8177 if (bdev->phys_blocklen == 0) { 8178 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 8179 } 8180 8181 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 8182 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 8183 } 8184 8185 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 8186 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 8187 } 8188 8189 bdev->internal.reset_in_progress = NULL; 8190 bdev->internal.qd_poll_in_progress = false; 8191 bdev->internal.period = 0; 8192 bdev->internal.new_period = 0; 8193 bdev->internal.trace_id = spdk_trace_register_owner(OWNER_TYPE_BDEV, bdev_name); 8194 8195 /* 8196 * Initialize spinlock before registering IO device because spinlock is used in 8197 * bdev_channel_create 8198 */ 8199 spdk_spin_init(&bdev->internal.spinlock); 8200 8201 spdk_io_device_register(__bdev_to_io_dev(bdev), 8202 bdev_channel_create, bdev_channel_destroy, 8203 sizeof(struct spdk_bdev_channel), 8204 bdev_name); 8205 8206 /* 8207 * Register bdev name only after the bdev object is ready. 8208 * After bdev_name_add returns, it is possible for other threads to start using the bdev, 8209 * create IO channels... 
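 * (For instance, a racing spdk_bdev_open_ext() on another thread may succeed
 *  as soon as bdev_name_add() returns.)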
8210 */ 8211 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 8212 if (ret != 0) { 8213 spdk_io_device_unregister(__bdev_to_io_dev(bdev), NULL); 8214 bdev_free_io_stat(bdev->internal.stat); 8215 spdk_spin_destroy(&bdev->internal.spinlock); 8216 free(bdev_name); 8217 return ret; 8218 } 8219 8220 free(bdev_name); 8221 8222 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 8223 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 8224 8225 return 0; 8226 } 8227 8228 static void 8229 bdev_destroy_cb(void *io_device) 8230 { 8231 int rc; 8232 struct spdk_bdev *bdev; 8233 spdk_bdev_unregister_cb cb_fn; 8234 void *cb_arg; 8235 8236 bdev = __bdev_from_io_dev(io_device); 8237 8238 if (bdev->internal.unregister_td != spdk_get_thread()) { 8239 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 8240 return; 8241 } 8242 8243 cb_fn = bdev->internal.unregister_cb; 8244 cb_arg = bdev->internal.unregister_ctx; 8245 8246 spdk_spin_destroy(&bdev->internal.spinlock); 8247 free(bdev->internal.qos); 8248 bdev_free_io_stat(bdev->internal.stat); 8249 spdk_trace_unregister_owner(bdev->internal.trace_id); 8250 8251 rc = bdev->fn_table->destruct(bdev->ctxt); 8252 if (rc < 0) { 8253 SPDK_ERRLOG("destruct failed\n"); 8254 } 8255 if (rc <= 0 && cb_fn != NULL) { 8256 cb_fn(cb_arg, rc); 8257 } 8258 } 8259 8260 void 8261 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 8262 { 8263 if (bdev->internal.unregister_cb != NULL) { 8264 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 8265 } 8266 } 8267 8268 static void 8269 _remove_notify(void *arg) 8270 { 8271 struct spdk_bdev_desc *desc = arg; 8272 8273 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 8274 } 8275 8276 /* returns: 0 - bdev removed and ready to be destructed. 8277 * -EBUSY - bdev can't be destructed yet. */ 8278 static int 8279 bdev_unregister_unsafe(struct spdk_bdev *bdev) 8280 { 8281 struct spdk_bdev_desc *desc, *tmp; 8282 struct spdk_bdev_alias *alias; 8283 int rc = 0; 8284 char uuid[SPDK_UUID_STRING_LEN]; 8285 8286 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 8287 assert(spdk_spin_held(&bdev->internal.spinlock)); 8288 8289 /* Notify each descriptor about hotremoval */ 8290 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 8291 rc = -EBUSY; 8292 /* 8293 * Defer invocation of the event_cb to a separate message that will 8294 * run later on its thread. This ensures this context unwinds and 8295 * we don't recursively unregister this bdev again if the event_cb 8296 * immediately closes its descriptor. 8297 */ 8298 event_notify(desc, _remove_notify); 8299 } 8300 8301 /* If there are no descriptors, proceed removing the bdev */ 8302 if (rc == 0) { 8303 bdev_examine_allowlist_remove(bdev->name); 8304 TAILQ_FOREACH(alias, &bdev->aliases, tailq) { 8305 bdev_examine_allowlist_remove(alias->alias.name); 8306 } 8307 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 8308 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 8309 8310 /* Delete the name and the UUID alias */ 8311 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 8312 bdev_name_del_unsafe(&bdev->internal.bdev_name); 8313 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 8314 8315 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 8316 8317 if (bdev->internal.reset_in_progress != NULL) { 8318 /* If reset is in progress, let the completion callback for reset 8319 * unregister the bdev. 
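 * (That callback is bdev_reset_complete() above: once the queued resets are
 *  drained, it re-checks SPDK_BDEV_STATUS_REMOVING with an empty open_descs
 *  list and then calls spdk_io_device_unregister().)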
8320 */ 8321 rc = -EBUSY; 8322 } 8323 } 8324 8325 return rc; 8326 } 8327 8328 static void 8329 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8330 struct spdk_io_channel *io_ch, void *_ctx) 8331 { 8332 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 8333 8334 bdev_channel_abort_queued_ios(bdev_ch); 8335 spdk_bdev_for_each_channel_continue(i, 0); 8336 } 8337 8338 static void 8339 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 8340 { 8341 int rc; 8342 8343 spdk_spin_lock(&g_bdev_mgr.spinlock); 8344 spdk_spin_lock(&bdev->internal.spinlock); 8345 /* 8346 * Set the status to REMOVING after completing to abort channels. Otherwise, 8347 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 8348 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 8349 * may fail. 8350 */ 8351 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 8352 rc = bdev_unregister_unsafe(bdev); 8353 spdk_spin_unlock(&bdev->internal.spinlock); 8354 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8355 8356 if (rc == 0) { 8357 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8358 } 8359 } 8360 8361 void 8362 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 8363 { 8364 struct spdk_thread *thread; 8365 8366 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 8367 8368 thread = spdk_get_thread(); 8369 if (!thread) { 8370 /* The user called this from a non-SPDK thread. */ 8371 if (cb_fn != NULL) { 8372 cb_fn(cb_arg, -ENOTSUP); 8373 } 8374 return; 8375 } 8376 8377 spdk_spin_lock(&g_bdev_mgr.spinlock); 8378 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 8379 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8380 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8381 if (cb_fn) { 8382 cb_fn(cb_arg, -EBUSY); 8383 } 8384 return; 8385 } 8386 8387 spdk_spin_lock(&bdev->internal.spinlock); 8388 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 8389 bdev->internal.unregister_cb = cb_fn; 8390 bdev->internal.unregister_ctx = cb_arg; 8391 bdev->internal.unregister_td = thread; 8392 spdk_spin_unlock(&bdev->internal.spinlock); 8393 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8394 8395 spdk_bdev_set_qd_sampling_period(bdev, 0); 8396 8397 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 8398 bdev_unregister); 8399 } 8400 8401 int 8402 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 8403 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 8404 { 8405 struct spdk_bdev_desc *desc; 8406 struct spdk_bdev *bdev; 8407 int rc; 8408 8409 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 8410 if (rc != 0) { 8411 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 8412 return rc; 8413 } 8414 8415 bdev = spdk_bdev_desc_get_bdev(desc); 8416 8417 if (bdev->module != module) { 8418 spdk_bdev_close(desc); 8419 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 8420 bdev_name); 8421 return -ENODEV; 8422 } 8423 8424 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 8425 8426 spdk_bdev_close(desc); 8427 8428 return 0; 8429 } 8430 8431 static int 8432 bdev_start_qos(struct spdk_bdev *bdev) 8433 { 8434 struct set_qos_limit_ctx *ctx; 8435 8436 /* Enable QoS */ 8437 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 8438 ctx = calloc(1, sizeof(*ctx)); 8439 if (ctx == NULL) { 8440 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 8441 return -ENOMEM; 8442 } 8443 
ctx->bdev = bdev; 8444 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 8445 } 8446 8447 return 0; 8448 } 8449 8450 static void 8451 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 8452 struct spdk_bdev *bdev) 8453 { 8454 enum spdk_bdev_claim_type type; 8455 const char *typename, *modname; 8456 extern struct spdk_log_flag SPDK_LOG_bdev; 8457 8458 assert(spdk_spin_held(&bdev->internal.spinlock)); 8459 8460 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 8461 return; 8462 } 8463 8464 type = bdev->internal.claim_type; 8465 typename = spdk_bdev_claim_get_name(type); 8466 8467 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 8468 modname = bdev->internal.claim.v1.module->name; 8469 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 8470 bdev->name, detail, typename, modname); 8471 return; 8472 } 8473 8474 if (claim_type_is_v2(type)) { 8475 struct spdk_bdev_module_claim *claim; 8476 8477 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 8478 modname = claim->module->name; 8479 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 8480 bdev->name, detail, typename, modname); 8481 } 8482 return; 8483 } 8484 8485 assert(false); 8486 } 8487 8488 static int 8489 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 8490 { 8491 struct spdk_thread *thread; 8492 int rc = 0; 8493 8494 thread = spdk_get_thread(); 8495 if (!thread) { 8496 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 8497 return -ENOTSUP; 8498 } 8499 8500 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8501 spdk_get_thread()); 8502 8503 desc->bdev = bdev; 8504 desc->thread = thread; 8505 desc->write = write; 8506 8507 spdk_spin_lock(&bdev->internal.spinlock); 8508 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 8509 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8510 spdk_spin_unlock(&bdev->internal.spinlock); 8511 return -ENODEV; 8512 } 8513 8514 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8515 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8516 spdk_spin_unlock(&bdev->internal.spinlock); 8517 return -EPERM; 8518 } 8519 8520 rc = bdev_start_qos(bdev); 8521 if (rc != 0) { 8522 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 8523 spdk_spin_unlock(&bdev->internal.spinlock); 8524 return rc; 8525 } 8526 8527 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 8528 8529 spdk_spin_unlock(&bdev->internal.spinlock); 8530 8531 return 0; 8532 } 8533 8534 static void 8535 bdev_open_opts_get_defaults(struct spdk_bdev_open_opts *opts, size_t opts_size) 8536 { 8537 if (!opts) { 8538 SPDK_ERRLOG("opts should not be NULL.\n"); 8539 return; 8540 } 8541 8542 if (!opts_size) { 8543 SPDK_ERRLOG("opts_size should not be zero.\n"); 8544 return; 8545 } 8546 8547 memset(opts, 0, opts_size); 8548 opts->size = opts_size; 8549 8550 #define FIELD_OK(field) \ 8551 offsetof(struct spdk_bdev_open_opts, field) + sizeof(opts->field) <= opts_size 8552 8553 #define SET_FIELD(field, value) \ 8554 if (FIELD_OK(field)) { \ 8555 opts->field = value; \ 8556 } \ 8557 8558 SET_FIELD(hide_metadata, false); 8559 8560 #undef FIELD_OK 8561 #undef SET_FIELD 8562 } 8563 8564 static void 8565 bdev_open_opts_copy(struct spdk_bdev_open_opts *opts, 8566 const struct spdk_bdev_open_opts *opts_src, size_t opts_size) 8567 { 8568 assert(opts); 8569 assert(opts_src); 8570 8571 #define SET_FIELD(field) \ 8572 if 
(offsetof(struct spdk_bdev_open_opts, field) + sizeof(opts->field) <= opts_size) { \ 8573 opts->field = opts_src->field; \ 8574 } \ 8575 8576 SET_FIELD(hide_metadata); 8577 8578 opts->size = opts_src->size; 8579 8580 /* We should not remove this statement, but need to update the assert statement 8581 * if we add a new field, and also add a corresponding SET_FIELD statement. 8582 */ 8583 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_opts) == 16, "Incorrect size"); 8584 8585 #undef SET_FIELD 8586 } 8587 8588 void 8589 spdk_bdev_open_opts_init(struct spdk_bdev_open_opts *opts, size_t opts_size) 8590 { 8591 struct spdk_bdev_open_opts opts_local; 8592 8593 bdev_open_opts_get_defaults(&opts_local, sizeof(opts_local)); 8594 bdev_open_opts_copy(opts, &opts_local, opts_size); 8595 } 8596 8597 static int 8598 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 8599 struct spdk_bdev_open_opts *user_opts, struct spdk_bdev_desc **_desc) 8600 { 8601 struct spdk_bdev_desc *desc; 8602 struct spdk_bdev_open_opts opts; 8603 unsigned int i; 8604 8605 bdev_open_opts_get_defaults(&opts, sizeof(opts)); 8606 if (user_opts != NULL) { 8607 bdev_open_opts_copy(&opts, user_opts, user_opts->size); 8608 } 8609 8610 desc = calloc(1, sizeof(*desc)); 8611 if (desc == NULL) { 8612 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 8613 return -ENOMEM; 8614 } 8615 8616 desc->opts = opts; 8617 8618 TAILQ_INIT(&desc->pending_media_events); 8619 TAILQ_INIT(&desc->free_media_events); 8620 8621 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 8622 desc->callback.event_fn = event_cb; 8623 desc->callback.ctx = event_ctx; 8624 spdk_spin_init(&desc->spinlock); 8625 8626 if (desc->opts.hide_metadata) { 8627 if (spdk_bdev_is_md_separate(bdev)) { 8628 SPDK_ERRLOG("hide_metadata option is not supported with separate metadata.\n"); 8629 bdev_desc_free(desc); 8630 return -EINVAL; 8631 } 8632 } 8633 8634 if (bdev->media_events) { 8635 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 8636 sizeof(*desc->media_events_buffer)); 8637 if (desc->media_events_buffer == NULL) { 8638 SPDK_ERRLOG("Failed to initialize media event pool\n"); 8639 bdev_desc_free(desc); 8640 return -ENOMEM; 8641 } 8642 8643 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 8644 TAILQ_INSERT_TAIL(&desc->free_media_events, 8645 &desc->media_events_buffer[i], tailq); 8646 } 8647 } 8648 8649 if (bdev->fn_table->accel_sequence_supported != NULL) { 8650 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 8651 desc->accel_sequence_supported[i] = 8652 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 8653 (enum spdk_bdev_io_type)i); 8654 } 8655 } 8656 8657 *_desc = desc; 8658 8659 return 0; 8660 } 8661 8662 static int 8663 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8664 void *event_ctx, struct spdk_bdev_open_opts *opts, 8665 struct spdk_bdev_desc **_desc) 8666 { 8667 struct spdk_bdev_desc *desc; 8668 struct spdk_bdev *bdev; 8669 int rc; 8670 8671 bdev = bdev_get_by_name(bdev_name); 8672 8673 if (bdev == NULL) { 8674 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 8675 return -ENODEV; 8676 } 8677 8678 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, opts, &desc); 8679 if (rc != 0) { 8680 return rc; 8681 } 8682 8683 rc = bdev_open(bdev, write, desc); 8684 if (rc != 0) { 8685 bdev_desc_free(desc); 8686 desc = NULL; 8687 } 8688 8689 *_desc = desc; 8690 8691 return rc; 8692 } 8693 8694 int 8695 spdk_bdev_open_ext_v2(const char 
*bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8696 void *event_ctx, struct spdk_bdev_open_opts *opts, 8697 struct spdk_bdev_desc **_desc) 8698 { 8699 int rc; 8700 8701 if (event_cb == NULL) { 8702 SPDK_ERRLOG("Missing event callback function\n"); 8703 return -EINVAL; 8704 } 8705 8706 spdk_spin_lock(&g_bdev_mgr.spinlock); 8707 rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, opts, _desc); 8708 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8709 8710 return rc; 8711 } 8712 8713 int 8714 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8715 void *event_ctx, struct spdk_bdev_desc **_desc) 8716 { 8717 return spdk_bdev_open_ext_v2(bdev_name, write, event_cb, event_ctx, NULL, _desc); 8718 } 8719 8720 struct spdk_bdev_open_async_ctx { 8721 char *bdev_name; 8722 spdk_bdev_event_cb_t event_cb; 8723 void *event_ctx; 8724 bool write; 8725 int rc; 8726 spdk_bdev_open_async_cb_t cb_fn; 8727 void *cb_arg; 8728 struct spdk_bdev_desc *desc; 8729 struct spdk_bdev_open_async_opts opts; 8730 uint64_t start_ticks; 8731 struct spdk_thread *orig_thread; 8732 struct spdk_poller *poller; 8733 TAILQ_ENTRY(spdk_bdev_open_async_ctx) tailq; 8734 }; 8735 8736 static void 8737 bdev_open_async_done(void *arg) 8738 { 8739 struct spdk_bdev_open_async_ctx *ctx = arg; 8740 8741 ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg); 8742 8743 free(ctx->bdev_name); 8744 free(ctx); 8745 } 8746 8747 static void 8748 bdev_open_async_cancel(void *arg) 8749 { 8750 struct spdk_bdev_open_async_ctx *ctx = arg; 8751 8752 assert(ctx->rc == -ESHUTDOWN); 8753 8754 spdk_poller_unregister(&ctx->poller); 8755 8756 bdev_open_async_done(ctx); 8757 } 8758 8759 /* This is called when the bdev library finishes at shutdown. */ 8760 static void 8761 bdev_open_async_fini(void) 8762 { 8763 struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx; 8764 8765 spdk_spin_lock(&g_bdev_mgr.spinlock); 8766 TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) { 8767 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8768 /* 8769 * We have to move to ctx->orig_thread to unregister ctx->poller. 8770 * However, there is a chance that ctx->poller is executed before 8771 * message is executed, which could result in bdev_open_async_done() 8772 * being called twice. To avoid such race condition, set ctx->rc to 8773 * -ESHUTDOWN. 8774 */ 8775 ctx->rc = -ESHUTDOWN; 8776 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx); 8777 } 8778 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8779 } 8780 8781 static int bdev_open_async(void *arg); 8782 8783 static void 8784 _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx) 8785 { 8786 uint64_t timeout_ticks; 8787 8788 if (ctx->rc == -ESHUTDOWN) { 8789 /* This context is being canceled. Do nothing. */ 8790 return; 8791 } 8792 8793 ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx, 8794 NULL, &ctx->desc); 8795 if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) { 8796 goto exit; 8797 } 8798 8799 timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull; 8800 if (spdk_get_ticks() >= timeout_ticks) { 8801 SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name); 8802 ctx->rc = -ETIMEDOUT; 8803 goto exit; 8804 } 8805 8806 return; 8807 8808 exit: 8809 spdk_poller_unregister(&ctx->poller); 8810 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8811 8812 /* Completion callback is processed after stack unwinding. 
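 * (bdev_open_async_done() runs via spdk_thread_send_msg() on ctx->orig_thread,
 *  so the user's open_cb is never invoked from inside spdk_bdev_open_async()
 *  or from this poller's stack.)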
*/ 8813 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx); 8814 } 8815 8816 static int 8817 bdev_open_async(void *arg) 8818 { 8819 struct spdk_bdev_open_async_ctx *ctx = arg; 8820 8821 spdk_spin_lock(&g_bdev_mgr.spinlock); 8822 8823 _bdev_open_async(ctx); 8824 8825 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8826 8827 return SPDK_POLLER_BUSY; 8828 } 8829 8830 static void 8831 bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts, 8832 struct spdk_bdev_open_async_opts *opts_src, 8833 size_t size) 8834 { 8835 assert(opts); 8836 assert(opts_src); 8837 8838 opts->size = size; 8839 8840 #define SET_FIELD(field) \ 8841 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8842 opts->field = opts_src->field; \ 8843 } \ 8844 8845 SET_FIELD(timeout_ms); 8846 8847 /* Do not remove this statement, you should always update this statement when you adding a new field, 8848 * and do not forget to add the SET_FIELD statement for your added field. */ 8849 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size"); 8850 8851 #undef SET_FIELD 8852 } 8853 8854 static void 8855 bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size) 8856 { 8857 assert(opts); 8858 8859 opts->size = size; 8860 8861 #define SET_FIELD(field, value) \ 8862 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8863 opts->field = value; \ 8864 } \ 8865 8866 SET_FIELD(timeout_ms, 0); 8867 8868 #undef SET_FIELD 8869 } 8870 8871 int 8872 spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8873 void *event_ctx, struct spdk_bdev_open_async_opts *opts, 8874 spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg) 8875 { 8876 struct spdk_bdev_open_async_ctx *ctx; 8877 8878 if (event_cb == NULL) { 8879 SPDK_ERRLOG("Missing event callback function\n"); 8880 return -EINVAL; 8881 } 8882 8883 if (open_cb == NULL) { 8884 SPDK_ERRLOG("Missing open callback function\n"); 8885 return -EINVAL; 8886 } 8887 8888 if (opts != NULL && opts->size == 0) { 8889 SPDK_ERRLOG("size in the options structure should not be zero\n"); 8890 return -EINVAL; 8891 } 8892 8893 ctx = calloc(1, sizeof(*ctx)); 8894 if (ctx == NULL) { 8895 SPDK_ERRLOG("Failed to allocate open context\n"); 8896 return -ENOMEM; 8897 } 8898 8899 ctx->bdev_name = strdup(bdev_name); 8900 if (ctx->bdev_name == NULL) { 8901 SPDK_ERRLOG("Failed to duplicate bdev_name\n"); 8902 free(ctx); 8903 return -ENOMEM; 8904 } 8905 8906 ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000); 8907 if (ctx->poller == NULL) { 8908 SPDK_ERRLOG("Failed to register bdev_open_async poller\n"); 8909 free(ctx->bdev_name); 8910 free(ctx); 8911 return -ENOMEM; 8912 } 8913 8914 ctx->cb_fn = open_cb; 8915 ctx->cb_arg = open_cb_arg; 8916 ctx->write = write; 8917 ctx->event_cb = event_cb; 8918 ctx->event_ctx = event_ctx; 8919 ctx->orig_thread = spdk_get_thread(); 8920 ctx->start_ticks = spdk_get_ticks(); 8921 8922 bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts)); 8923 if (opts != NULL) { 8924 bdev_open_async_opts_copy(&ctx->opts, opts, opts->size); 8925 } 8926 8927 spdk_spin_lock(&g_bdev_mgr.spinlock); 8928 8929 TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8930 _bdev_open_async(ctx); 8931 8932 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8933 8934 return 0; 8935 } 8936 8937 static void 8938 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 8939 { 8940 int rc; 8941 8942 
spdk_spin_lock(&bdev->internal.spinlock); 8943 spdk_spin_lock(&desc->spinlock); 8944 8945 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 8946 8947 desc->closed = true; 8948 8949 if (desc->claim != NULL) { 8950 bdev_desc_release_claims(desc); 8951 } 8952 8953 if (0 == desc->refs) { 8954 spdk_spin_unlock(&desc->spinlock); 8955 bdev_desc_free(desc); 8956 } else { 8957 spdk_spin_unlock(&desc->spinlock); 8958 } 8959 8960 /* If no more descriptors, kill QoS channel */ 8961 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8962 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 8963 bdev->name, spdk_get_thread()); 8964 8965 if (bdev_qos_destroy(bdev)) { 8966 /* There isn't anything we can do to recover here. Just let the 8967 * old QoS poller keep running. The QoS handling won't change 8968 * cores when the user allocates a new channel, but it won't break. */ 8969 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 8970 } 8971 } 8972 8973 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8974 rc = bdev_unregister_unsafe(bdev); 8975 spdk_spin_unlock(&bdev->internal.spinlock); 8976 8977 if (rc == 0) { 8978 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8979 } 8980 } else { 8981 spdk_spin_unlock(&bdev->internal.spinlock); 8982 } 8983 } 8984 8985 void 8986 spdk_bdev_close(struct spdk_bdev_desc *desc) 8987 { 8988 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8989 8990 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8991 spdk_get_thread()); 8992 8993 assert(desc->thread == spdk_get_thread()); 8994 8995 spdk_poller_unregister(&desc->io_timeout_poller); 8996 8997 spdk_spin_lock(&g_bdev_mgr.spinlock); 8998 8999 bdev_close(bdev, desc); 9000 9001 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9002 } 9003 9004 int32_t 9005 spdk_bdev_get_numa_id(struct spdk_bdev *bdev) 9006 { 9007 if (bdev->numa.id_valid) { 9008 return bdev->numa.id; 9009 } else { 9010 return SPDK_ENV_NUMA_ID_ANY; 9011 } 9012 } 9013 9014 static void 9015 bdev_register_finished(void *arg) 9016 { 9017 struct spdk_bdev_desc *desc = arg; 9018 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9019 9020 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 9021 9022 spdk_spin_lock(&g_bdev_mgr.spinlock); 9023 9024 bdev_close(bdev, desc); 9025 9026 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9027 } 9028 9029 int 9030 spdk_bdev_register(struct spdk_bdev *bdev) 9031 { 9032 struct spdk_bdev_desc *desc; 9033 struct spdk_thread *thread = spdk_get_thread(); 9034 int rc; 9035 9036 if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) { 9037 SPDK_ERRLOG("Cannot register bdev %s on thread %p (%s)\n", bdev->name, thread, 9038 thread ? 
spdk_thread_get_name(thread) : "null"); 9039 return -EINVAL; 9040 } 9041 9042 rc = bdev_register(bdev); 9043 if (rc != 0) { 9044 return rc; 9045 } 9046 9047 /* A descriptor is opened to prevent bdev deletion during examination */ 9048 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc); 9049 if (rc != 0) { 9050 spdk_bdev_unregister(bdev, NULL, NULL); 9051 return rc; 9052 } 9053 9054 rc = bdev_open(bdev, false, desc); 9055 if (rc != 0) { 9056 bdev_desc_free(desc); 9057 spdk_bdev_unregister(bdev, NULL, NULL); 9058 return rc; 9059 } 9060 9061 /* Examine configuration before initializing I/O */ 9062 bdev_examine(bdev); 9063 9064 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 9065 if (rc != 0) { 9066 bdev_close(bdev, desc); 9067 spdk_bdev_unregister(bdev, NULL, NULL); 9068 } 9069 9070 return rc; 9071 } 9072 9073 int 9074 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 9075 struct spdk_bdev_module *module) 9076 { 9077 spdk_spin_lock(&bdev->internal.spinlock); 9078 9079 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 9080 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 9081 spdk_spin_unlock(&bdev->internal.spinlock); 9082 return -EPERM; 9083 } 9084 9085 if (desc && !desc->write) { 9086 desc->write = true; 9087 } 9088 9089 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 9090 bdev->internal.claim.v1.module = module; 9091 9092 spdk_spin_unlock(&bdev->internal.spinlock); 9093 return 0; 9094 } 9095 9096 void 9097 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 9098 { 9099 spdk_spin_lock(&bdev->internal.spinlock); 9100 9101 assert(bdev->internal.claim.v1.module != NULL); 9102 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 9103 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 9104 bdev->internal.claim.v1.module = NULL; 9105 9106 spdk_spin_unlock(&bdev->internal.spinlock); 9107 } 9108 9109 /* 9110 * Start claims v2 9111 */ 9112 9113 const char * 9114 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 9115 { 9116 switch (type) { 9117 case SPDK_BDEV_CLAIM_NONE: 9118 return "not_claimed"; 9119 case SPDK_BDEV_CLAIM_EXCL_WRITE: 9120 return "exclusive_write"; 9121 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 9122 return "read_many_write_one"; 9123 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 9124 return "read_many_write_none"; 9125 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 9126 return "read_many_write_many"; 9127 default: 9128 break; 9129 } 9130 return "invalid_claim"; 9131 } 9132 9133 static bool 9134 claim_type_is_v2(enum spdk_bdev_claim_type type) 9135 { 9136 switch (type) { 9137 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 9138 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 9139 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 9140 return true; 9141 default: 9142 break; 9143 } 9144 return false; 9145 } 9146 9147 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
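 * (Per the switch below, that holds for SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE and
 *  SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED; read_many_write_none claims leave
 *  the descriptor read-only.)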
*/ 9148 static bool 9149 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 9150 { 9151 switch (type) { 9152 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 9153 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 9154 return true; 9155 default: 9156 break; 9157 } 9158 return false; 9159 } 9160 9161 void 9162 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 9163 { 9164 if (opts == NULL) { 9165 SPDK_ERRLOG("opts should not be NULL\n"); 9166 assert(opts != NULL); 9167 return; 9168 } 9169 if (size == 0) { 9170 SPDK_ERRLOG("size should not be zero\n"); 9171 assert(size != 0); 9172 return; 9173 } 9174 9175 memset(opts, 0, size); 9176 opts->opts_size = size; 9177 9178 #define FIELD_OK(field) \ 9179 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 9180 9181 #define SET_FIELD(field, value) \ 9182 if (FIELD_OK(field)) { \ 9183 opts->field = value; \ 9184 } \ 9185 9186 SET_FIELD(shared_claim_key, 0); 9187 9188 #undef FIELD_OK 9189 #undef SET_FIELD 9190 } 9191 9192 static int 9193 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 9194 { 9195 if (src->opts_size == 0) { 9196 SPDK_ERRLOG("size should not be zero\n"); 9197 return -1; 9198 } 9199 9200 memset(dst, 0, sizeof(*dst)); 9201 dst->opts_size = src->opts_size; 9202 9203 #define FIELD_OK(field) \ 9204 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 9205 9206 #define SET_FIELD(field) \ 9207 if (FIELD_OK(field)) { \ 9208 dst->field = src->field; \ 9209 } \ 9210 9211 if (FIELD_OK(name)) { 9212 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 9213 } 9214 9215 SET_FIELD(shared_claim_key); 9216 9217 /* You should not remove this statement, but need to update the assert statement 9218 * if you add a new field, and also add a corresponding SET_FIELD statement */ 9219 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 9220 9221 #undef FIELD_OK 9222 #undef SET_FIELD 9223 return 0; 9224 } 9225 9226 /* Returns 0 if a read-write-once claim can be taken. */ 9227 static int 9228 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 9229 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 9230 { 9231 struct spdk_bdev *bdev = desc->bdev; 9232 struct spdk_bdev_desc *open_desc; 9233 9234 assert(spdk_spin_held(&bdev->internal.spinlock)); 9235 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 9236 9237 if (opts->shared_claim_key != 0) { 9238 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 9239 bdev->name); 9240 return -EINVAL; 9241 } 9242 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 9243 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 9244 return -EPERM; 9245 } 9246 if (desc->claim != NULL) { 9247 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 9248 bdev->name, desc->claim->module->name); 9249 return -EPERM; 9250 } 9251 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 9252 if (desc != open_desc && open_desc->write) { 9253 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 9254 "another descriptor is open for writing\n", 9255 bdev->name); 9256 return -EPERM; 9257 } 9258 } 9259 9260 return 0; 9261 } 9262 9263 /* Returns 0 if a read-only-many claim can be taken. 
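 * The claim is refused with -EINVAL if this descriptor was opened writable or if
 * a shared_claim_key was supplied, and with -EPERM if any other descriptor is
 * currently open for writing.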
*/
static int
claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_desc *open_desc;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE);
	assert(desc->claim == NULL);

	if (desc->write) {
		SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n",
			    bdev->name);
		return -EINVAL;
	}
	if (opts->shared_claim_key != 0) {
		SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name);
		return -EINVAL;
	}
	if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
			if (open_desc->write) {
				SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while "
					       "another descriptor is open for writing\n",
					       bdev->name);
				return -EPERM;
			}
		}
	}

	return 0;
}

/* Returns 0 if a read-write-many claim can be taken. */
static int
claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_desc *open_desc;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED);
	assert(desc->claim == NULL);

	if (opts->shared_claim_key == 0) {
		SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n",
			    bdev->name);
		return -EINVAL;
	}
	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
			if (open_desc == desc) {
				continue;
			}
			if (open_desc->write) {
				SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while "
					       "another descriptor is open for writing without a "
					       "claim\n", bdev->name);
				return -EPERM;
			}
		}
		break;
	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
		if (opts->shared_claim_key != bdev->internal.claim.v2.key) {
			LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev);
			return -EPERM;
		}
		break;
	default:
		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
		return -EBUSY;
	}

	return 0;
}

/* Updates desc and its bdev with a v2 claim.
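 *
 * Illustrative example (hypothetical module "my_if" and key, not part of
 * upstream): a module typically reaches this point through
 * spdk_bdev_module_claim_bdev_desc(), e.g. for a shared claim:
 *
 *	struct spdk_bdev_claim_opts opts;
 *
 *	spdk_bdev_claim_opts_init(&opts, sizeof(opts));
 *	opts.shared_claim_key = MY_CLAIM_KEY;
 *	rc = spdk_bdev_module_claim_bdev_desc(desc, SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED,
 *					      &opts, &my_if);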
*/ 9344 static int 9345 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 9346 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 9347 { 9348 struct spdk_bdev *bdev = desc->bdev; 9349 struct spdk_bdev_module_claim *claim; 9350 9351 assert(spdk_spin_held(&bdev->internal.spinlock)); 9352 assert(claim_type_is_v2(type)); 9353 assert(desc->claim == NULL); 9354 9355 claim = calloc(1, sizeof(*desc->claim)); 9356 if (claim == NULL) { 9357 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 9358 return -ENOMEM; 9359 } 9360 claim->module = module; 9361 claim->desc = desc; 9362 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 9363 memcpy(claim->name, opts->name, sizeof(claim->name)); 9364 desc->claim = claim; 9365 9366 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 9367 bdev->internal.claim_type = type; 9368 TAILQ_INIT(&bdev->internal.claim.v2.claims); 9369 bdev->internal.claim.v2.key = opts->shared_claim_key; 9370 } 9371 assert(type == bdev->internal.claim_type); 9372 9373 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 9374 9375 if (!desc->write && claim_type_promotes_to_write(type)) { 9376 desc->write = true; 9377 } 9378 9379 return 0; 9380 } 9381 9382 int 9383 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 9384 struct spdk_bdev_claim_opts *_opts, 9385 struct spdk_bdev_module *module) 9386 { 9387 struct spdk_bdev *bdev; 9388 struct spdk_bdev_claim_opts opts; 9389 int rc = 0; 9390 9391 if (desc == NULL) { 9392 SPDK_ERRLOG("descriptor must not be NULL\n"); 9393 return -EINVAL; 9394 } 9395 9396 bdev = desc->bdev; 9397 9398 if (_opts == NULL) { 9399 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 9400 } else if (claim_opts_copy(_opts, &opts) != 0) { 9401 return -EINVAL; 9402 } 9403 9404 spdk_spin_lock(&bdev->internal.spinlock); 9405 9406 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 9407 bdev->internal.claim_type != type) { 9408 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 9409 spdk_spin_unlock(&bdev->internal.spinlock); 9410 return -EPERM; 9411 } 9412 9413 if (claim_type_is_v2(type) && desc->claim != NULL) { 9414 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 9415 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 9416 spdk_spin_unlock(&bdev->internal.spinlock); 9417 return -EPERM; 9418 } 9419 9420 switch (type) { 9421 case SPDK_BDEV_CLAIM_EXCL_WRITE: 9422 spdk_spin_unlock(&bdev->internal.spinlock); 9423 return spdk_bdev_module_claim_bdev(bdev, desc, module); 9424 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 9425 rc = claim_verify_rwo(desc, type, &opts, module); 9426 break; 9427 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 9428 rc = claim_verify_rom(desc, type, &opts, module); 9429 break; 9430 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 9431 rc = claim_verify_rwm(desc, type, &opts, module); 9432 break; 9433 default: 9434 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 9435 rc = -ENOTSUP; 9436 } 9437 9438 if (rc == 0) { 9439 rc = claim_bdev(desc, type, &opts, module); 9440 } 9441 9442 spdk_spin_unlock(&bdev->internal.spinlock); 9443 return rc; 9444 } 9445 9446 static void 9447 claim_reset(struct spdk_bdev *bdev) 9448 { 9449 assert(spdk_spin_held(&bdev->internal.spinlock)); 9450 assert(claim_type_is_v2(bdev->internal.claim_type)); 9451 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 9452 9453 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 9454 
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 9455 } 9456 9457 static void 9458 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 9459 { 9460 struct spdk_bdev *bdev = desc->bdev; 9461 9462 assert(spdk_spin_held(&bdev->internal.spinlock)); 9463 assert(claim_type_is_v2(bdev->internal.claim_type)); 9464 9465 if (bdev->internal.examine_in_progress == 0) { 9466 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 9467 free(desc->claim); 9468 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 9469 claim_reset(bdev); 9470 } 9471 } else { 9472 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 9473 desc->claim->module = NULL; 9474 desc->claim->desc = NULL; 9475 } 9476 desc->claim = NULL; 9477 } 9478 9479 /* 9480 * End claims v2 9481 */ 9482 9483 struct spdk_bdev * 9484 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 9485 { 9486 assert(desc != NULL); 9487 return desc->bdev; 9488 } 9489 9490 int 9491 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 9492 { 9493 struct spdk_bdev *bdev, *tmp; 9494 struct spdk_bdev_desc *desc; 9495 int rc = 0; 9496 9497 assert(fn != NULL); 9498 9499 spdk_spin_lock(&g_bdev_mgr.spinlock); 9500 bdev = spdk_bdev_first(); 9501 while (bdev != NULL) { 9502 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc); 9503 if (rc != 0) { 9504 break; 9505 } 9506 rc = bdev_open(bdev, false, desc); 9507 if (rc != 0) { 9508 bdev_desc_free(desc); 9509 if (rc == -ENODEV) { 9510 /* Ignore the error and move to the next bdev. */ 9511 rc = 0; 9512 bdev = spdk_bdev_next(bdev); 9513 continue; 9514 } 9515 break; 9516 } 9517 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9518 9519 rc = fn(ctx, bdev); 9520 9521 spdk_spin_lock(&g_bdev_mgr.spinlock); 9522 tmp = spdk_bdev_next(bdev); 9523 bdev_close(bdev, desc); 9524 if (rc != 0) { 9525 break; 9526 } 9527 bdev = tmp; 9528 } 9529 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9530 9531 return rc; 9532 } 9533 9534 int 9535 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 9536 { 9537 struct spdk_bdev *bdev, *tmp; 9538 struct spdk_bdev_desc *desc; 9539 int rc = 0; 9540 9541 assert(fn != NULL); 9542 9543 spdk_spin_lock(&g_bdev_mgr.spinlock); 9544 bdev = spdk_bdev_first_leaf(); 9545 while (bdev != NULL) { 9546 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc); 9547 if (rc != 0) { 9548 break; 9549 } 9550 rc = bdev_open(bdev, false, desc); 9551 if (rc != 0) { 9552 bdev_desc_free(desc); 9553 if (rc == -ENODEV) { 9554 /* Ignore the error and move to the next bdev. 
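 * -ENODEV here means the bdev is already being unregistered, so it is skipped
 * rather than treated as a failure.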
*/ 9555 rc = 0; 9556 bdev = spdk_bdev_next_leaf(bdev); 9557 continue; 9558 } 9559 break; 9560 } 9561 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9562 9563 rc = fn(ctx, bdev); 9564 9565 spdk_spin_lock(&g_bdev_mgr.spinlock); 9566 tmp = spdk_bdev_next_leaf(bdev); 9567 bdev_close(bdev, desc); 9568 if (rc != 0) { 9569 break; 9570 } 9571 bdev = tmp; 9572 } 9573 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9574 9575 return rc; 9576 } 9577 9578 void 9579 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 9580 { 9581 struct iovec *iovs; 9582 int iovcnt; 9583 9584 if (bdev_io == NULL) { 9585 return; 9586 } 9587 9588 switch (bdev_io->type) { 9589 case SPDK_BDEV_IO_TYPE_READ: 9590 case SPDK_BDEV_IO_TYPE_WRITE: 9591 case SPDK_BDEV_IO_TYPE_ZCOPY: 9592 iovs = bdev_io->u.bdev.iovs; 9593 iovcnt = bdev_io->u.bdev.iovcnt; 9594 break; 9595 default: 9596 iovs = NULL; 9597 iovcnt = 0; 9598 break; 9599 } 9600 9601 if (iovp) { 9602 *iovp = iovs; 9603 } 9604 if (iovcntp) { 9605 *iovcntp = iovcnt; 9606 } 9607 } 9608 9609 void * 9610 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 9611 { 9612 if (bdev_io == NULL) { 9613 return NULL; 9614 } 9615 9616 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 9617 return NULL; 9618 } 9619 9620 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 9621 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 9622 return bdev_io->u.bdev.md_buf; 9623 } 9624 9625 return NULL; 9626 } 9627 9628 void * 9629 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 9630 { 9631 if (bdev_io == NULL) { 9632 assert(false); 9633 return NULL; 9634 } 9635 9636 return bdev_io->internal.caller_ctx; 9637 } 9638 9639 void 9640 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 9641 { 9642 9643 if (spdk_bdev_module_list_find(bdev_module->name)) { 9644 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 9645 assert(false); 9646 } 9647 9648 spdk_spin_init(&bdev_module->internal.spinlock); 9649 TAILQ_INIT(&bdev_module->internal.quiesced_ranges); 9650 9651 /* 9652 * Modules with examine callbacks must be initialized first, so they are 9653 * ready to handle examine callbacks from later modules that will 9654 * register physical bdevs. 
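 *
 * Illustrative sketch (hypothetical module, not part of upstream): a module that
 * participates in examine registers its callbacks and is therefore placed at the
 * head of the list:
 *
 *	static struct spdk_bdev_module my_if = {
 *		.name = "my_module",
 *		.module_init = my_module_init,
 *		.examine_config = my_examine_config,
 *	};
 *	SPDK_BDEV_MODULE_REGISTER(my_module, &my_if)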
9655 */ 9656 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 9657 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9658 } else { 9659 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9660 } 9661 } 9662 9663 struct spdk_bdev_module * 9664 spdk_bdev_module_list_find(const char *name) 9665 { 9666 struct spdk_bdev_module *bdev_module; 9667 9668 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 9669 if (strcmp(name, bdev_module->name) == 0) { 9670 break; 9671 } 9672 } 9673 9674 return bdev_module; 9675 } 9676 9677 static int 9678 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 9679 { 9680 uint64_t num_blocks; 9681 void *md_buf = NULL; 9682 9683 num_blocks = bdev_io->u.bdev.num_blocks; 9684 9685 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 9686 md_buf = (char *)g_bdev_mgr.zero_buffer + 9687 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 9688 } 9689 9690 return bdev_write_blocks_with_md(bdev_io->internal.desc, 9691 spdk_io_channel_from_ctx(bdev_io->internal.ch), 9692 g_bdev_mgr.zero_buffer, md_buf, 9693 bdev_io->u.bdev.offset_blocks, num_blocks, 9694 bdev_write_zero_buffer_done, bdev_io); 9695 } 9696 9697 static void 9698 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9699 { 9700 struct spdk_bdev_io *parent_io = cb_arg; 9701 9702 spdk_bdev_free_io(bdev_io); 9703 9704 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 9705 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 9706 } 9707 9708 static void 9709 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 9710 { 9711 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9712 ctx->bdev->internal.qos_mod_in_progress = false; 9713 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9714 9715 if (ctx->cb_fn) { 9716 ctx->cb_fn(ctx->cb_arg, status); 9717 } 9718 free(ctx); 9719 } 9720 9721 static void 9722 bdev_disable_qos_done(void *cb_arg) 9723 { 9724 struct set_qos_limit_ctx *ctx = cb_arg; 9725 struct spdk_bdev *bdev = ctx->bdev; 9726 struct spdk_bdev_qos *qos; 9727 9728 spdk_spin_lock(&bdev->internal.spinlock); 9729 qos = bdev->internal.qos; 9730 bdev->internal.qos = NULL; 9731 spdk_spin_unlock(&bdev->internal.spinlock); 9732 9733 if (qos->thread != NULL) { 9734 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 9735 spdk_poller_unregister(&qos->poller); 9736 } 9737 9738 free(qos); 9739 9740 bdev_set_qos_limit_done(ctx, 0); 9741 } 9742 9743 static void 9744 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 9745 { 9746 struct set_qos_limit_ctx *ctx = _ctx; 9747 struct spdk_thread *thread; 9748 9749 spdk_spin_lock(&bdev->internal.spinlock); 9750 thread = bdev->internal.qos->thread; 9751 spdk_spin_unlock(&bdev->internal.spinlock); 9752 9753 if (thread != NULL) { 9754 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 9755 } else { 9756 bdev_disable_qos_done(ctx); 9757 } 9758 } 9759 9760 static void 9761 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9762 struct spdk_io_channel *ch, void *_ctx) 9763 { 9764 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9765 struct spdk_bdev_io *bdev_io; 9766 9767 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 9768 9769 while (!TAILQ_EMPTY(&bdev_ch->qos_queued_io)) { 9770 /* Re-submit the queued I/O. 
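 * With BDEV_CH_QOS_ENABLED cleared above, _bdev_io_submit() sends these I/Os
 * directly to the channel instead of queueing them for the QoS thread again.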
*/ 9771 bdev_io = TAILQ_FIRST(&bdev_ch->qos_queued_io); 9772 TAILQ_REMOVE(&bdev_ch->qos_queued_io, bdev_io, internal.link); 9773 _bdev_io_submit(bdev_io); 9774 } 9775 9776 spdk_bdev_for_each_channel_continue(i, 0); 9777 } 9778 9779 static void 9780 bdev_update_qos_rate_limit_msg(void *cb_arg) 9781 { 9782 struct set_qos_limit_ctx *ctx = cb_arg; 9783 struct spdk_bdev *bdev = ctx->bdev; 9784 9785 spdk_spin_lock(&bdev->internal.spinlock); 9786 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 9787 spdk_spin_unlock(&bdev->internal.spinlock); 9788 9789 bdev_set_qos_limit_done(ctx, 0); 9790 } 9791 9792 static void 9793 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9794 struct spdk_io_channel *ch, void *_ctx) 9795 { 9796 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9797 9798 spdk_spin_lock(&bdev->internal.spinlock); 9799 bdev_enable_qos(bdev, bdev_ch); 9800 spdk_spin_unlock(&bdev->internal.spinlock); 9801 spdk_bdev_for_each_channel_continue(i, 0); 9802 } 9803 9804 static void 9805 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 9806 { 9807 struct set_qos_limit_ctx *ctx = _ctx; 9808 9809 bdev_set_qos_limit_done(ctx, status); 9810 } 9811 9812 static void 9813 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 9814 { 9815 int i; 9816 9817 assert(bdev->internal.qos != NULL); 9818 9819 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9820 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9821 bdev->internal.qos->rate_limits[i].limit = limits[i]; 9822 9823 if (limits[i] == 0) { 9824 bdev->internal.qos->rate_limits[i].limit = 9825 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 9826 } 9827 } 9828 } 9829 } 9830 9831 void 9832 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 9833 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 9834 { 9835 struct set_qos_limit_ctx *ctx; 9836 uint32_t limit_set_complement; 9837 uint64_t min_limit_per_sec; 9838 int i; 9839 bool disable_rate_limit = true; 9840 9841 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9842 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9843 continue; 9844 } 9845 9846 if (limits[i] > 0) { 9847 disable_rate_limit = false; 9848 } 9849 9850 if (bdev_qos_is_iops_rate_limit(i) == true) { 9851 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 9852 } else { 9853 if (limits[i] > SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC) { 9854 SPDK_WARNLOG("Requested rate limit %" PRIu64 " will result in uint64_t overflow, " 9855 "reset to %" PRIu64 "\n", limits[i], SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC); 9856 limits[i] = SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC; 9857 } 9858 /* Change from megabyte to byte rate limit */ 9859 limits[i] = limits[i] * 1024 * 1024; 9860 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 9861 } 9862 9863 limit_set_complement = limits[i] % min_limit_per_sec; 9864 if (limit_set_complement) { 9865 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 9866 limits[i], min_limit_per_sec); 9867 limits[i] += min_limit_per_sec - limit_set_complement; 9868 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 9869 } 9870 } 9871 9872 ctx = calloc(1, sizeof(*ctx)); 9873 if (ctx == NULL) { 9874 cb_fn(cb_arg, -ENOMEM); 9875 return; 9876 } 9877 9878 ctx->cb_fn = cb_fn; 9879 ctx->cb_arg = cb_arg; 9880 ctx->bdev = bdev; 9881 9882 spdk_spin_lock(&bdev->internal.spinlock); 9883 if (bdev->internal.qos_mod_in_progress) { 9884 spdk_spin_unlock(&bdev->internal.spinlock); 9885 free(ctx); 9886 cb_fn(cb_arg, 
-EAGAIN); 9887 return; 9888 } 9889 bdev->internal.qos_mod_in_progress = true; 9890 9891 if (disable_rate_limit == true && bdev->internal.qos) { 9892 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9893 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 9894 (bdev->internal.qos->rate_limits[i].limit > 0 && 9895 bdev->internal.qos->rate_limits[i].limit != 9896 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 9897 disable_rate_limit = false; 9898 break; 9899 } 9900 } 9901 } 9902 9903 if (disable_rate_limit == false) { 9904 if (bdev->internal.qos == NULL) { 9905 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 9906 if (!bdev->internal.qos) { 9907 spdk_spin_unlock(&bdev->internal.spinlock); 9908 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 9909 bdev_set_qos_limit_done(ctx, -ENOMEM); 9910 return; 9911 } 9912 } 9913 9914 if (bdev->internal.qos->thread == NULL) { 9915 /* Enabling */ 9916 bdev_set_qos_rate_limits(bdev, limits); 9917 9918 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 9919 bdev_enable_qos_done); 9920 } else { 9921 /* Updating */ 9922 bdev_set_qos_rate_limits(bdev, limits); 9923 9924 spdk_thread_send_msg(bdev->internal.qos->thread, 9925 bdev_update_qos_rate_limit_msg, ctx); 9926 } 9927 } else { 9928 if (bdev->internal.qos != NULL) { 9929 bdev_set_qos_rate_limits(bdev, limits); 9930 9931 /* Disabling */ 9932 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 9933 bdev_disable_qos_msg_done); 9934 } else { 9935 spdk_spin_unlock(&bdev->internal.spinlock); 9936 bdev_set_qos_limit_done(ctx, 0); 9937 return; 9938 } 9939 } 9940 9941 spdk_spin_unlock(&bdev->internal.spinlock); 9942 } 9943 9944 struct spdk_bdev_histogram_ctx { 9945 spdk_bdev_histogram_status_cb cb_fn; 9946 void *cb_arg; 9947 struct spdk_bdev *bdev; 9948 int status; 9949 }; 9950 9951 static void 9952 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9953 { 9954 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9955 9956 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9957 ctx->bdev->internal.histogram_in_progress = false; 9958 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9959 ctx->cb_fn(ctx->cb_arg, ctx->status); 9960 free(ctx); 9961 } 9962 9963 static void 9964 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9965 struct spdk_io_channel *_ch, void *_ctx) 9966 { 9967 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9968 9969 if (ch->histogram != NULL) { 9970 spdk_histogram_data_free(ch->histogram); 9971 ch->histogram = NULL; 9972 } 9973 spdk_bdev_for_each_channel_continue(i, 0); 9974 } 9975 9976 static void 9977 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9978 { 9979 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9980 9981 if (status != 0) { 9982 ctx->status = status; 9983 ctx->bdev->internal.histogram_enabled = false; 9984 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 9985 bdev_histogram_disable_channel_cb); 9986 } else { 9987 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9988 ctx->bdev->internal.histogram_in_progress = false; 9989 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9990 ctx->cb_fn(ctx->cb_arg, ctx->status); 9991 free(ctx); 9992 } 9993 } 9994 9995 static void 9996 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9997 struct spdk_io_channel *_ch, void *_ctx) 9998 { 9999 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10000 int status = 0; 10001 10002 if (ch->histogram == NULL) { 
10003 ch->histogram = spdk_histogram_data_alloc(); 10004 if (ch->histogram == NULL) { 10005 status = -ENOMEM; 10006 } 10007 } 10008 10009 spdk_bdev_for_each_channel_continue(i, status); 10010 } 10011 10012 void 10013 spdk_bdev_histogram_enable_ext(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 10014 void *cb_arg, bool enable, struct spdk_bdev_enable_histogram_opts *opts) 10015 { 10016 struct spdk_bdev_histogram_ctx *ctx; 10017 10018 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 10019 if (ctx == NULL) { 10020 cb_fn(cb_arg, -ENOMEM); 10021 return; 10022 } 10023 10024 ctx->bdev = bdev; 10025 ctx->status = 0; 10026 ctx->cb_fn = cb_fn; 10027 ctx->cb_arg = cb_arg; 10028 10029 spdk_spin_lock(&bdev->internal.spinlock); 10030 if (bdev->internal.histogram_in_progress) { 10031 spdk_spin_unlock(&bdev->internal.spinlock); 10032 free(ctx); 10033 cb_fn(cb_arg, -EAGAIN); 10034 return; 10035 } 10036 10037 bdev->internal.histogram_in_progress = true; 10038 spdk_spin_unlock(&bdev->internal.spinlock); 10039 10040 bdev->internal.histogram_enabled = enable; 10041 bdev->internal.histogram_io_type = opts->io_type; 10042 10043 if (enable) { 10044 /* Allocate histogram for each channel */ 10045 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 10046 bdev_histogram_enable_channel_cb); 10047 } else { 10048 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 10049 bdev_histogram_disable_channel_cb); 10050 } 10051 } 10052 10053 void 10054 spdk_bdev_enable_histogram_opts_init(struct spdk_bdev_enable_histogram_opts *opts, size_t size) 10055 { 10056 if (opts == NULL) { 10057 SPDK_ERRLOG("opts should not be NULL\n"); 10058 assert(opts != NULL); 10059 return; 10060 } 10061 if (size == 0) { 10062 SPDK_ERRLOG("size should not be zero\n"); 10063 assert(size != 0); 10064 return; 10065 } 10066 10067 memset(opts, 0, size); 10068 opts->size = size; 10069 10070 #define FIELD_OK(field) \ 10071 offsetof(struct spdk_bdev_enable_histogram_opts, field) + sizeof(opts->field) <= size 10072 10073 #define SET_FIELD(field, value) \ 10074 if (FIELD_OK(field)) { \ 10075 opts->field = value; \ 10076 } \ 10077 10078 SET_FIELD(io_type, 0); 10079 10080 /* You should not remove this statement, but need to update the assert statement 10081 * if you add a new field, and also add a corresponding SET_FIELD statement */ 10082 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_enable_histogram_opts) == 9, "Incorrect size"); 10083 10084 #undef FIELD_OK 10085 #undef SET_FIELD 10086 } 10087 10088 void 10089 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 10090 void *cb_arg, bool enable) 10091 { 10092 struct spdk_bdev_enable_histogram_opts opts; 10093 10094 spdk_bdev_enable_histogram_opts_init(&opts, sizeof(opts)); 10095 spdk_bdev_histogram_enable_ext(bdev, cb_fn, cb_arg, enable, &opts); 10096 } 10097 10098 struct spdk_bdev_histogram_data_ctx { 10099 spdk_bdev_histogram_data_cb cb_fn; 10100 void *cb_arg; 10101 struct spdk_bdev *bdev; 10102 /** merged histogram data from all channels */ 10103 struct spdk_histogram_data *histogram; 10104 }; 10105 10106 static void 10107 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 10108 { 10109 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 10110 10111 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 10112 free(ctx); 10113 } 10114 10115 static void 10116 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10117 struct spdk_io_channel *_ch, void *_ctx) 10118 { 10119 
struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10120 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 10121 int status = 0; 10122 10123 if (ch->histogram == NULL) { 10124 status = -EFAULT; 10125 } else { 10126 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 10127 } 10128 10129 spdk_bdev_for_each_channel_continue(i, status); 10130 } 10131 10132 void 10133 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 10134 spdk_bdev_histogram_data_cb cb_fn, 10135 void *cb_arg) 10136 { 10137 struct spdk_bdev_histogram_data_ctx *ctx; 10138 10139 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 10140 if (ctx == NULL) { 10141 cb_fn(cb_arg, -ENOMEM, NULL); 10142 return; 10143 } 10144 10145 ctx->bdev = bdev; 10146 ctx->cb_fn = cb_fn; 10147 ctx->cb_arg = cb_arg; 10148 10149 ctx->histogram = histogram; 10150 10151 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 10152 bdev_histogram_get_channel_cb); 10153 } 10154 10155 void 10156 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 10157 void *cb_arg) 10158 { 10159 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 10160 int status = 0; 10161 10162 assert(cb_fn != NULL); 10163 10164 if (bdev_ch->histogram == NULL) { 10165 status = -EFAULT; 10166 } 10167 cb_fn(cb_arg, status, bdev_ch->histogram); 10168 } 10169 10170 size_t 10171 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 10172 size_t max_events) 10173 { 10174 struct media_event_entry *entry; 10175 size_t num_events = 0; 10176 10177 for (; num_events < max_events; ++num_events) { 10178 entry = TAILQ_FIRST(&desc->pending_media_events); 10179 if (entry == NULL) { 10180 break; 10181 } 10182 10183 events[num_events] = entry->event; 10184 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 10185 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 10186 } 10187 10188 return num_events; 10189 } 10190 10191 int 10192 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 10193 size_t num_events) 10194 { 10195 struct spdk_bdev_desc *desc; 10196 struct media_event_entry *entry; 10197 size_t event_id; 10198 int rc = 0; 10199 10200 assert(bdev->media_events); 10201 10202 spdk_spin_lock(&bdev->internal.spinlock); 10203 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 10204 if (desc->write) { 10205 break; 10206 } 10207 } 10208 10209 if (desc == NULL || desc->media_events_buffer == NULL) { 10210 rc = -ENODEV; 10211 goto out; 10212 } 10213 10214 for (event_id = 0; event_id < num_events; ++event_id) { 10215 entry = TAILQ_FIRST(&desc->free_media_events); 10216 if (entry == NULL) { 10217 break; 10218 } 10219 10220 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 10221 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 10222 entry->event = events[event_id]; 10223 } 10224 10225 rc = event_id; 10226 out: 10227 spdk_spin_unlock(&bdev->internal.spinlock); 10228 return rc; 10229 } 10230 10231 static void 10232 _media_management_notify(void *arg) 10233 { 10234 struct spdk_bdev_desc *desc = arg; 10235 10236 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 10237 } 10238 10239 void 10240 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 10241 { 10242 struct spdk_bdev_desc *desc; 10243 10244 spdk_spin_lock(&bdev->internal.spinlock); 10245 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 10246 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 10247 
event_notify(desc, _media_management_notify); 10248 } 10249 } 10250 spdk_spin_unlock(&bdev->internal.spinlock); 10251 } 10252 10253 struct locked_lba_range_ctx { 10254 struct lba_range range; 10255 struct lba_range *current_range; 10256 struct lba_range *owner_range; 10257 struct spdk_poller *poller; 10258 lock_range_cb cb_fn; 10259 void *cb_arg; 10260 }; 10261 10262 static void 10263 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 10264 { 10265 struct locked_lba_range_ctx *ctx = _ctx; 10266 10267 ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM); 10268 free(ctx); 10269 } 10270 10271 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 10272 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 10273 10274 static void 10275 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 10276 { 10277 struct locked_lba_range_ctx *ctx = _ctx; 10278 10279 if (status == -ENOMEM) { 10280 /* One of the channels could not allocate a range object. 10281 * So we have to go back and clean up any ranges that were 10282 * allocated successfully before we return error status to 10283 * the caller. We can reuse the unlock function to do that 10284 * clean up. 10285 */ 10286 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 10287 bdev_lock_error_cleanup_cb); 10288 return; 10289 } 10290 10291 /* All channels have locked this range and no I/O overlapping the range 10292 * are outstanding! Set the owner_ch for the range object for the 10293 * locking channel, so that this channel will know that it is allowed 10294 * to write to this range. 10295 */ 10296 if (ctx->owner_range != NULL) { 10297 ctx->owner_range->owner_ch = ctx->range.owner_ch; 10298 } 10299 10300 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 10301 10302 /* Don't free the ctx here. Its range is in the bdev's global list of 10303 * locked ranges still, and will be removed and freed when this range 10304 * is later unlocked. 10305 */ 10306 } 10307 10308 static int 10309 bdev_lock_lba_range_check_io(void *_i) 10310 { 10311 struct spdk_bdev_channel_iter *i = _i; 10312 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 10313 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10314 struct locked_lba_range_ctx *ctx = i->ctx; 10315 struct lba_range *range = ctx->current_range; 10316 struct spdk_bdev_io *bdev_io; 10317 10318 spdk_poller_unregister(&ctx->poller); 10319 10320 /* The range is now in the locked_ranges, so no new IO can be submitted to this 10321 * range. But we need to wait until any outstanding IO overlapping with this range 10322 * are completed. 
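 * If such an I/O is still in flight, re-arm the poller below and check again in
 * 100 microseconds rather than blocking this thread.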
10323 */ 10324 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 10325 if (bdev_io_range_is_locked(bdev_io, range)) { 10326 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 10327 return SPDK_POLLER_BUSY; 10328 } 10329 } 10330 10331 spdk_bdev_for_each_channel_continue(i, 0); 10332 return SPDK_POLLER_BUSY; 10333 } 10334 10335 static void 10336 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10337 struct spdk_io_channel *_ch, void *_ctx) 10338 { 10339 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10340 struct locked_lba_range_ctx *ctx = _ctx; 10341 struct lba_range *range; 10342 10343 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10344 if (range->length == ctx->range.length && 10345 range->offset == ctx->range.offset && 10346 range->locked_ctx == ctx->range.locked_ctx) { 10347 /* This range already exists on this channel, so don't add 10348 * it again. This can happen when a new channel is created 10349 * while the for_each_channel operation is in progress. 10350 * Do not check for outstanding I/O in that case, since the 10351 * range was locked before any I/O could be submitted to the 10352 * new channel. 10353 */ 10354 spdk_bdev_for_each_channel_continue(i, 0); 10355 return; 10356 } 10357 } 10358 10359 range = calloc(1, sizeof(*range)); 10360 if (range == NULL) { 10361 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 10362 return; 10363 } 10364 10365 range->length = ctx->range.length; 10366 range->offset = ctx->range.offset; 10367 range->locked_ctx = ctx->range.locked_ctx; 10368 range->quiesce = ctx->range.quiesce; 10369 ctx->current_range = range; 10370 if (ctx->range.owner_ch == ch) { 10371 /* This is the range object for the channel that will hold 10372 * the lock. Store it in the ctx object so that we can easily 10373 * set its owner_ch after the lock is finally acquired. 10374 */ 10375 ctx->owner_range = range; 10376 } 10377 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 10378 bdev_lock_lba_range_check_io(i); 10379 } 10380 10381 static void 10382 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 10383 { 10384 assert(spdk_get_thread() == ctx->range.owner_thread); 10385 assert(ctx->range.owner_ch == NULL || 10386 spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread); 10387 10388 /* We will add a copy of this range to each channel now. 
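 * Each channel keeps its own copy in ch->locked_ranges so that the submission
 * path can detect conflicting I/O without taking the bdev-level spinlock.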
*/ 10389 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 10390 bdev_lock_lba_range_cb); 10391 } 10392 10393 static bool 10394 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 10395 { 10396 struct lba_range *r; 10397 10398 TAILQ_FOREACH(r, tailq, tailq) { 10399 if (bdev_lba_range_overlapped(range, r)) { 10400 return true; 10401 } 10402 } 10403 return false; 10404 } 10405 10406 static void bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status); 10407 10408 static int 10409 _bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch, 10410 uint64_t offset, uint64_t length, 10411 lock_range_cb cb_fn, void *cb_arg) 10412 { 10413 struct locked_lba_range_ctx *ctx; 10414 10415 ctx = calloc(1, sizeof(*ctx)); 10416 if (ctx == NULL) { 10417 return -ENOMEM; 10418 } 10419 10420 ctx->range.offset = offset; 10421 ctx->range.length = length; 10422 ctx->range.owner_thread = spdk_get_thread(); 10423 ctx->range.owner_ch = ch; 10424 ctx->range.locked_ctx = cb_arg; 10425 ctx->range.bdev = bdev; 10426 ctx->range.quiesce = (cb_fn == bdev_quiesce_range_locked); 10427 ctx->cb_fn = cb_fn; 10428 ctx->cb_arg = cb_arg; 10429 10430 spdk_spin_lock(&bdev->internal.spinlock); 10431 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 10432 /* There is an active lock overlapping with this range. 10433 * Put it on the pending list until this range no 10434 * longer overlaps with another. 10435 */ 10436 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 10437 } else { 10438 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 10439 bdev_lock_lba_range_ctx(bdev, ctx); 10440 } 10441 spdk_spin_unlock(&bdev->internal.spinlock); 10442 return 0; 10443 } 10444 10445 static int 10446 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 10447 uint64_t offset, uint64_t length, 10448 lock_range_cb cb_fn, void *cb_arg) 10449 { 10450 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10451 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10452 10453 if (cb_arg == NULL) { 10454 SPDK_ERRLOG("cb_arg must not be NULL\n"); 10455 return -EINVAL; 10456 } 10457 10458 return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg); 10459 } 10460 10461 static void 10462 bdev_lock_lba_range_ctx_msg(void *_ctx) 10463 { 10464 struct locked_lba_range_ctx *ctx = _ctx; 10465 10466 bdev_lock_lba_range_ctx(ctx->range.bdev, ctx); 10467 } 10468 10469 static void 10470 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 10471 { 10472 struct locked_lba_range_ctx *ctx = _ctx; 10473 struct locked_lba_range_ctx *pending_ctx; 10474 struct lba_range *range, *tmp; 10475 10476 spdk_spin_lock(&bdev->internal.spinlock); 10477 /* Check if there are any pending locked ranges that overlap with this range 10478 * that was just unlocked. If there are, check that it doesn't overlap with any 10479 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 10480 * the lock process. 
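 * A range promoted off the pending list is locked from its owner thread via
 * spdk_thread_send_msg(), since bdev_lock_lba_range_ctx() must run there.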
10481 */ 10482 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 10483 if (bdev_lba_range_overlapped(range, &ctx->range) && 10484 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 10485 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 10486 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 10487 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 10488 spdk_thread_send_msg(pending_ctx->range.owner_thread, 10489 bdev_lock_lba_range_ctx_msg, pending_ctx); 10490 } 10491 } 10492 spdk_spin_unlock(&bdev->internal.spinlock); 10493 10494 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 10495 free(ctx); 10496 } 10497 10498 static void 10499 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10500 struct spdk_io_channel *_ch, void *_ctx) 10501 { 10502 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10503 struct locked_lba_range_ctx *ctx = _ctx; 10504 TAILQ_HEAD(, spdk_bdev_io) io_locked; 10505 struct spdk_bdev_io *bdev_io; 10506 struct lba_range *range; 10507 10508 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10509 if (ctx->range.offset == range->offset && 10510 ctx->range.length == range->length && 10511 ctx->range.locked_ctx == range->locked_ctx) { 10512 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 10513 free(range); 10514 break; 10515 } 10516 } 10517 10518 /* Note: we should almost always be able to assert that the range specified 10519 * was found. But there are some very rare corner cases where a new channel 10520 * gets created simultaneously with a range unlock, where this function 10521 * would execute on that new channel and wouldn't have the range. 10522 * We also use this to clean up range allocations when a later allocation 10523 * fails in the locking path. 10524 * So we can't actually assert() here. 10525 */ 10526 10527 /* Swap the locked IO into a temporary list, and then try to submit them again. 10528 * We could hyper-optimize this to only resubmit locked I/O that overlap 10529 * with the range that was just unlocked, but this isn't a performance path so 10530 * we go for simplicity here. 10531 */ 10532 TAILQ_INIT(&io_locked); 10533 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 10534 while (!TAILQ_EMPTY(&io_locked)) { 10535 bdev_io = TAILQ_FIRST(&io_locked); 10536 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 10537 bdev_io_submit(bdev_io); 10538 } 10539 10540 spdk_bdev_for_each_channel_continue(i, 0); 10541 } 10542 10543 static int 10544 _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length, 10545 lock_range_cb cb_fn, void *cb_arg) 10546 { 10547 struct locked_lba_range_ctx *ctx; 10548 struct lba_range *range; 10549 10550 spdk_spin_lock(&bdev->internal.spinlock); 10551 /* To start the unlock the process, we find the range in the bdev's locked_ranges 10552 * and remove it. This ensures new channels don't inherit the locked range. 10553 * Then we will send a message to each channel to remove the range from its 10554 * per-channel list. 
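 * As each channel drops its copy of the range, any I/O it had queued while the
 * range was locked is resubmitted (see bdev_unlock_lba_range_get_channel()).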
10555 */ 10556 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 10557 if (range->offset == offset && range->length == length && 10558 (range->owner_ch == NULL || range->locked_ctx == cb_arg)) { 10559 break; 10560 } 10561 } 10562 if (range == NULL) { 10563 assert(false); 10564 spdk_spin_unlock(&bdev->internal.spinlock); 10565 return -EINVAL; 10566 } 10567 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 10568 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 10569 spdk_spin_unlock(&bdev->internal.spinlock); 10570 10571 ctx->cb_fn = cb_fn; 10572 ctx->cb_arg = cb_arg; 10573 10574 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 10575 bdev_unlock_lba_range_cb); 10576 return 0; 10577 } 10578 10579 static int 10580 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 10581 uint64_t offset, uint64_t length, 10582 lock_range_cb cb_fn, void *cb_arg) 10583 { 10584 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10585 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10586 struct lba_range *range; 10587 bool range_found = false; 10588 10589 /* Let's make sure the specified channel actually has a lock on 10590 * the specified range. Note that the range must match exactly. 10591 */ 10592 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10593 if (range->offset == offset && range->length == length && 10594 range->owner_ch == ch && range->locked_ctx == cb_arg) { 10595 range_found = true; 10596 break; 10597 } 10598 } 10599 10600 if (!range_found) { 10601 return -EINVAL; 10602 } 10603 10604 return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg); 10605 } 10606 10607 struct bdev_quiesce_ctx { 10608 spdk_bdev_quiesce_cb cb_fn; 10609 void *cb_arg; 10610 }; 10611 10612 static void 10613 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status) 10614 { 10615 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 10616 10617 if (quiesce_ctx->cb_fn != NULL) { 10618 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 10619 } 10620 10621 free(quiesce_ctx); 10622 } 10623 10624 static void 10625 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status) 10626 { 10627 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 10628 struct spdk_bdev_module *module = range->bdev->module; 10629 10630 if (status != 0) { 10631 if (quiesce_ctx->cb_fn != NULL) { 10632 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 10633 } 10634 free(quiesce_ctx); 10635 return; 10636 } 10637 10638 spdk_spin_lock(&module->internal.spinlock); 10639 TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module); 10640 spdk_spin_unlock(&module->internal.spinlock); 10641 10642 if (quiesce_ctx->cb_fn != NULL) { 10643 /* copy the context in case the range is unlocked by the callback */ 10644 struct bdev_quiesce_ctx tmp = *quiesce_ctx; 10645 10646 quiesce_ctx->cb_fn = NULL; 10647 quiesce_ctx->cb_arg = NULL; 10648 10649 tmp.cb_fn(tmp.cb_arg, status); 10650 } 10651 /* quiesce_ctx will be freed on unquiesce */ 10652 } 10653 10654 static int 10655 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10656 uint64_t offset, uint64_t length, 10657 spdk_bdev_quiesce_cb cb_fn, void *cb_arg, 10658 bool unquiesce) 10659 { 10660 struct bdev_quiesce_ctx *quiesce_ctx; 10661 int rc; 10662 10663 if (module != bdev->module) { 10664 SPDK_ERRLOG("Bdev does not belong to specified module.\n"); 10665 return -EINVAL; 10666 } 10667 10668 if (!bdev_io_valid_blocks(bdev, offset, length)) { 10669 return -EINVAL; 10670 } 
10671 10672 if (unquiesce) { 10673 struct lba_range *range; 10674 10675 /* Make sure the specified range is actually quiesced in the specified module and 10676 * then remove it from the list. Note that the range must match exactly. 10677 */ 10678 spdk_spin_lock(&module->internal.spinlock); 10679 TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) { 10680 if (range->bdev == bdev && range->offset == offset && range->length == length) { 10681 TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module); 10682 break; 10683 } 10684 } 10685 spdk_spin_unlock(&module->internal.spinlock); 10686 10687 if (range == NULL) { 10688 SPDK_ERRLOG("The range to unquiesce was not found.\n"); 10689 return -EINVAL; 10690 } 10691 10692 quiesce_ctx = range->locked_ctx; 10693 quiesce_ctx->cb_fn = cb_fn; 10694 quiesce_ctx->cb_arg = cb_arg; 10695 10696 rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx); 10697 } else { 10698 quiesce_ctx = malloc(sizeof(*quiesce_ctx)); 10699 if (quiesce_ctx == NULL) { 10700 return -ENOMEM; 10701 } 10702 10703 quiesce_ctx->cb_fn = cb_fn; 10704 quiesce_ctx->cb_arg = cb_arg; 10705 10706 rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx); 10707 if (rc != 0) { 10708 free(quiesce_ctx); 10709 } 10710 } 10711 10712 return rc; 10713 } 10714 10715 int 10716 spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10717 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10718 { 10719 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false); 10720 } 10721 10722 int 10723 spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10724 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10725 { 10726 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true); 10727 } 10728 10729 int 10730 spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10731 uint64_t offset, uint64_t length, 10732 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10733 { 10734 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false); 10735 } 10736 10737 int 10738 spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10739 uint64_t offset, uint64_t length, 10740 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10741 { 10742 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true); 10743 } 10744 10745 int 10746 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 10747 int array_size) 10748 { 10749 if (!bdev) { 10750 return -EINVAL; 10751 } 10752 10753 if (bdev->fn_table->get_memory_domains) { 10754 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 10755 } 10756 10757 return 0; 10758 } 10759 10760 struct spdk_bdev_for_each_io_ctx { 10761 void *ctx; 10762 spdk_bdev_io_fn fn; 10763 spdk_bdev_for_each_io_cb cb; 10764 }; 10765 10766 static void 10767 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10768 struct spdk_io_channel *io_ch, void *_ctx) 10769 { 10770 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10771 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 10772 struct spdk_bdev_io *bdev_io; 10773 int rc = 0; 10774 10775 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 10776 rc = ctx->fn(ctx->ctx, bdev_io); 10777 if (rc != 0) { 10778 break; 10779 } 10780 } 10781 10782 spdk_bdev_for_each_channel_continue(i, rc); 10783 } 10784 10785 static void 10786 
bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 10787 { 10788 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10789 10790 ctx->cb(ctx->ctx, status); 10791 10792 free(ctx); 10793 } 10794 10795 void 10796 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 10797 spdk_bdev_for_each_io_cb cb) 10798 { 10799 struct spdk_bdev_for_each_io_ctx *ctx; 10800 10801 assert(fn != NULL && cb != NULL); 10802 10803 ctx = calloc(1, sizeof(*ctx)); 10804 if (ctx == NULL) { 10805 SPDK_ERRLOG("Failed to allocate context.\n"); 10806 cb(_ctx, -ENOMEM); 10807 return; 10808 } 10809 10810 ctx->ctx = _ctx; 10811 ctx->fn = fn; 10812 ctx->cb = cb; 10813 10814 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 10815 bdev_for_each_io_done); 10816 } 10817 10818 void 10819 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 10820 { 10821 spdk_for_each_channel_continue(iter->i, status); 10822 } 10823 10824 static struct spdk_bdev * 10825 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 10826 { 10827 void *io_device = spdk_io_channel_iter_get_io_device(i); 10828 10829 return __bdev_from_io_dev(io_device); 10830 } 10831 10832 static void 10833 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 10834 { 10835 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10836 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10837 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 10838 10839 iter->i = i; 10840 iter->fn(iter, bdev, ch, iter->ctx); 10841 } 10842 10843 static void 10844 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 10845 { 10846 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10847 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10848 10849 iter->i = i; 10850 iter->cpl(bdev, iter->ctx, status); 10851 10852 free(iter); 10853 } 10854 10855 void 10856 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 10857 void *ctx, spdk_bdev_for_each_channel_done cpl) 10858 { 10859 struct spdk_bdev_channel_iter *iter; 10860 10861 assert(bdev != NULL && fn != NULL && ctx != NULL); 10862 10863 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 10864 if (iter == NULL) { 10865 SPDK_ERRLOG("Unable to allocate iterator\n"); 10866 assert(false); 10867 return; 10868 } 10869 10870 iter->fn = fn; 10871 iter->cpl = cpl; 10872 iter->ctx = ctx; 10873 10874 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 10875 iter, bdev_each_channel_cpl); 10876 } 10877 10878 static void 10879 bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10880 { 10881 struct spdk_bdev_io *parent_io = cb_arg; 10882 10883 spdk_bdev_free_io(bdev_io); 10884 10885 /* Check return status of write */ 10886 parent_io->internal.status = success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 10887 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 10888 } 10889 10890 static void 10891 bdev_copy_do_write(void *_bdev_io) 10892 { 10893 struct spdk_bdev_io *bdev_io = _bdev_io; 10894 int rc; 10895 10896 /* Write blocks */ 10897 rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc, 10898 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10899 bdev_io->u.bdev.iovs[0].iov_base, 10900 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks, 10901 bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io); 10902 10903 if (rc == -ENOMEM) { 10904 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write); 10905 } else if (rc != 0) { 10906 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10907 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10908 } 10909 } 10910 10911 static void 10912 bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10913 { 10914 struct spdk_bdev_io *parent_io = cb_arg; 10915 10916 spdk_bdev_free_io(bdev_io); 10917 10918 /* Check return status of read */ 10919 if (!success) { 10920 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10921 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 10922 return; 10923 } 10924 10925 /* Do write */ 10926 bdev_copy_do_write(parent_io); 10927 } 10928 10929 static void 10930 bdev_copy_do_read(void *_bdev_io) 10931 { 10932 struct spdk_bdev_io *bdev_io = _bdev_io; 10933 int rc; 10934 10935 /* Read blocks */ 10936 rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc, 10937 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10938 bdev_io->u.bdev.iovs[0].iov_base, 10939 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks, 10940 bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io); 10941 10942 if (rc == -ENOMEM) { 10943 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read); 10944 } else if (rc != 0) { 10945 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10946 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10947 } 10948 } 10949 10950 static void 10951 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 10952 { 10953 if (!success) { 10954 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10955 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10956 return; 10957 } 10958 10959 bdev_copy_do_read(bdev_io); 10960 } 10961 10962 int 10963 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 10964 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 10965 spdk_bdev_io_completion_cb cb, void *cb_arg) 10966 { 10967 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10968 struct spdk_bdev_io *bdev_io; 10969 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 10970 10971 if (!desc->write) { 10972 return -EBADF; 10973 } 10974 10975 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 10976 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 10977 SPDK_DEBUGLOG(bdev, 10978 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 10979 dst_offset_blocks, src_offset_blocks, num_blocks); 10980 return -EINVAL; 10981 } 10982 10983 bdev_io = bdev_channel_get_io(channel); 10984 if (!bdev_io) { 10985 return -ENOMEM; 10986 } 10987 10988 bdev_io->internal.ch = channel; 10989 bdev_io->internal.desc = desc; 10990 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 10991 10992 
bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 10993 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 10994 bdev_io->u.bdev.num_blocks = num_blocks; 10995 bdev_io->u.bdev.memory_domain = NULL; 10996 bdev_io->u.bdev.memory_domain_ctx = NULL; 10997 bdev_io->u.bdev.iovs = NULL; 10998 bdev_io->u.bdev.iovcnt = 0; 10999 bdev_io->u.bdev.md_buf = NULL; 11000 bdev_io->u.bdev.accel_sequence = NULL; 11001 bdev_io_init(bdev_io, bdev, cb_arg, cb); 11002 11003 if (dst_offset_blocks == src_offset_blocks || num_blocks == 0) { 11004 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 11005 return 0; 11006 } 11007 11008 11009 /* If the copy size is large and should be split, use the generic split logic 11010 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not. 11011 * 11012 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or 11013 * emulate it using regular read and write requests otherwise. 11014 */ 11015 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) || 11016 bdev_io->internal.f.split) { 11017 bdev_io_submit(bdev_io); 11018 return 0; 11019 } 11020 11021 spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev)); 11022 11023 return 0; 11024 } 11025 11026 SPDK_LOG_REGISTER_COMPONENT(bdev) 11027 11028 static void 11029 bdev_trace(void) 11030 { 11031 struct spdk_trace_tpoint_opts opts[] = { 11032 { 11033 "BDEV_IO_START", TRACE_BDEV_IO_START, 11034 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 1, 11035 { 11036 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 11037 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 11038 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 11039 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 11040 } 11041 }, 11042 { 11043 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 11044 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 0, 11045 { 11046 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 11047 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 11048 } 11049 }, 11050 { 11051 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 11052 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 11053 { 11054 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 11055 } 11056 }, 11057 { 11058 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 11059 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 11060 { 11061 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 11062 } 11063 }, 11064 }; 11065 11066 11067 spdk_trace_register_owner_type(OWNER_TYPE_BDEV, 'b'); 11068 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 11069 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 11070 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 11071 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 11072 spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_START, OBJECT_BDEV_IO, 0); 11073 spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_COMPLETE, OBJECT_BDEV_IO, 0); 11074 spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_START, OBJECT_BDEV_IO, 0); 11075 spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_DONE, OBJECT_BDEV_IO, 0); 11076 } 11077 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 11078