1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #include "spdk/bdev.h" 10 11 #include "spdk/accel.h" 12 #include "spdk/config.h" 13 #include "spdk/env.h" 14 #include "spdk/thread.h" 15 #include "spdk/likely.h" 16 #include "spdk/queue.h" 17 #include "spdk/nvme_spec.h" 18 #include "spdk/scsi_spec.h" 19 #include "spdk/notify.h" 20 #include "spdk/util.h" 21 #include "spdk/trace.h" 22 #include "spdk/dma.h" 23 24 #include "spdk/bdev_module.h" 25 #include "spdk/log.h" 26 #include "spdk/string.h" 27 28 #include "bdev_internal.h" 29 #include "spdk_internal/trace_defs.h" 30 #include "spdk_internal/assert.h" 31 32 #ifdef SPDK_CONFIG_VTUNE 33 #include "ittnotify.h" 34 #include "ittnotify_types.h" 35 int __itt_init_ittlib(const char *, __itt_group_id); 36 #endif 37 38 #define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1) 39 #define SPDK_BDEV_IO_CACHE_SIZE 256 40 #define SPDK_BDEV_AUTO_EXAMINE true 41 #define BUF_SMALL_CACHE_SIZE 128 42 #define BUF_LARGE_CACHE_SIZE 16 43 #define NOMEM_THRESHOLD_COUNT 8 44 45 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 46 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 47 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512 48 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000 49 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024) 50 #define SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC (UINT64_MAX / (1024 * 1024)) 51 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX 52 #define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000 53 54 /* The maximum number of children requests for a UNMAP or WRITE ZEROES command 55 * when splitting into children requests at a time. 56 */ 57 #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8) 58 #define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000 59 60 /* The maximum number of children requests for a COPY command 61 * when splitting into children requests at a time. 
62 */ 63 #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8) 64 65 #define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \ 66 log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev) 67 #ifdef DEBUG 68 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \ 69 log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev) 70 #else 71 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0) 72 #endif 73 74 static void log_already_claimed(enum spdk_log_level level, const int line, const char *func, 75 const char *detail, struct spdk_bdev *bdev); 76 77 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 78 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 79 }; 80 81 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 82 83 RB_HEAD(bdev_name_tree, spdk_bdev_name); 84 85 static int 86 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2) 87 { 88 return strcmp(name1->name, name2->name); 89 } 90 91 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp); 92 93 struct spdk_bdev_mgr { 94 struct spdk_mempool *bdev_io_pool; 95 96 void *zero_buffer; 97 98 TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 99 100 struct spdk_bdev_list bdevs; 101 struct bdev_name_tree bdev_names; 102 103 bool init_complete; 104 bool module_init_complete; 105 106 struct spdk_spinlock spinlock; 107 108 TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens; 109 110 #ifdef SPDK_CONFIG_VTUNE 111 __itt_domain *domain; 112 #endif 113 }; 114 115 static struct spdk_bdev_mgr g_bdev_mgr = { 116 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 117 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 118 .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names), 119 .init_complete = false, 120 .module_init_complete = false, 121 .async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens), 122 }; 123 124 static void 125 __attribute__((constructor)) 126 _bdev_init(void) 127 { 128 spdk_spin_init(&g_bdev_mgr.spinlock); 129 } 130 131 typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status); 132 133 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc); 134 135 struct lba_range { 136 struct spdk_bdev *bdev; 137 uint64_t offset; 138 uint64_t length; 139 bool quiesce; 140 void *locked_ctx; 141 struct spdk_thread *owner_thread; 142 struct spdk_bdev_channel *owner_ch; 143 TAILQ_ENTRY(lba_range) tailq; 144 TAILQ_ENTRY(lba_range) tailq_module; 145 }; 146 147 static struct spdk_bdev_opts g_bdev_opts = { 148 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 149 .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, 150 .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, 151 .iobuf_small_cache_size = BUF_SMALL_CACHE_SIZE, 152 .iobuf_large_cache_size = BUF_LARGE_CACHE_SIZE, 153 }; 154 155 static spdk_bdev_init_cb g_init_cb_fn = NULL; 156 static void *g_init_cb_arg = NULL; 157 158 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 159 static void *g_fini_cb_arg = NULL; 160 static struct spdk_thread *g_fini_thread = NULL; 161 162 struct spdk_bdev_qos_limit { 163 /** IOs or bytes allowed per second (i.e., 1s). */ 164 uint64_t limit; 165 166 /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). 167 * For remaining bytes, allowed to run negative if an I/O is submitted when 168 * some bytes are remaining, but the I/O is bigger than that amount. The 169 * excess will be deducted from the next timeslice. 170 */ 171 int64_t remaining_this_timeslice; 172 173 /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). 
*/ 174 uint32_t min_per_timeslice; 175 176 /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 177 uint32_t max_per_timeslice; 178 179 /** Function to check whether to queue the IO. 180 * If The IO is allowed to pass, the quota will be reduced correspondingly. 181 */ 182 bool (*queue_io)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 183 184 /** Function to rewind the quota once the IO was allowed to be sent by this 185 * limit but queued due to one of the further limits. 186 */ 187 void (*rewind_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 188 }; 189 190 struct spdk_bdev_qos { 191 /** Types of structure of rate limits. */ 192 struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 193 194 /** The channel that all I/O are funneled through. */ 195 struct spdk_bdev_channel *ch; 196 197 /** The thread on which the poller is running. */ 198 struct spdk_thread *thread; 199 200 /** Size of a timeslice in tsc ticks. */ 201 uint64_t timeslice_size; 202 203 /** Timestamp of start of last timeslice. */ 204 uint64_t last_timeslice; 205 206 /** Poller that processes queued I/O commands each time slice. */ 207 struct spdk_poller *poller; 208 }; 209 210 struct spdk_bdev_mgmt_channel { 211 /* 212 * Each thread keeps a cache of bdev_io - this allows 213 * bdev threads which are *not* DPDK threads to still 214 * benefit from a per-thread bdev_io cache. Without 215 * this, non-DPDK threads fetching from the mempool 216 * incur a cmpxchg on get and put. 217 */ 218 bdev_io_stailq_t per_thread_cache; 219 uint32_t per_thread_cache_count; 220 uint32_t bdev_io_cache_size; 221 222 struct spdk_iobuf_channel iobuf; 223 224 TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; 225 TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; 226 }; 227 228 /* 229 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device 230 * will queue here their IO that awaits retry. It makes it possible to retry sending 231 * IO to one bdev after IO from other bdev completes. 232 */ 233 struct spdk_bdev_shared_resource { 234 /* The bdev management channel */ 235 struct spdk_bdev_mgmt_channel *mgmt_ch; 236 237 /* 238 * Count of I/O submitted to bdev module and waiting for completion. 239 * Incremented before submit_request() is called on an spdk_bdev_io. 240 */ 241 uint64_t io_outstanding; 242 243 /* 244 * Queue of IO awaiting retry because of a previous NOMEM status returned 245 * on this channel. 246 */ 247 bdev_io_tailq_t nomem_io; 248 249 /* 250 * Threshold which io_outstanding must drop to before retrying nomem_io. 251 */ 252 uint64_t nomem_threshold; 253 254 /* I/O channel allocated by a bdev module */ 255 struct spdk_io_channel *shared_ch; 256 257 struct spdk_poller *nomem_poller; 258 259 /* Refcount of bdev channels using this resource */ 260 uint32_t ref; 261 262 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 263 }; 264 265 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 266 #define BDEV_CH_QOS_ENABLED (1 << 1) 267 268 struct spdk_bdev_channel { 269 struct spdk_bdev *bdev; 270 271 /* The channel for the underlying device */ 272 struct spdk_io_channel *channel; 273 274 /* Accel channel */ 275 struct spdk_io_channel *accel_channel; 276 277 /* Per io_device per thread data */ 278 struct spdk_bdev_shared_resource *shared_resource; 279 280 struct spdk_bdev_io_stat *stat; 281 282 /* 283 * Count of I/O submitted to the underlying dev module through this channel 284 * and waiting for completion. 
285 */ 286 uint64_t io_outstanding; 287 288 /* 289 * List of all submitted I/Os including I/O that are generated via splitting. 290 */ 291 bdev_io_tailq_t io_submitted; 292 293 /* 294 * List of spdk_bdev_io that are currently queued because they write to a locked 295 * LBA range. 296 */ 297 bdev_io_tailq_t io_locked; 298 299 /* List of I/Os with accel sequence being currently executed */ 300 bdev_io_tailq_t io_accel_exec; 301 302 /* List of I/Os doing memory domain pull/push */ 303 bdev_io_tailq_t io_memory_domain; 304 305 uint32_t flags; 306 307 /* Counts number of bdev_io in the io_submitted TAILQ */ 308 uint16_t queue_depth; 309 310 uint16_t trace_id; 311 312 struct spdk_histogram_data *histogram; 313 314 #ifdef SPDK_CONFIG_VTUNE 315 uint64_t start_tsc; 316 uint64_t interval_tsc; 317 __itt_string_handle *handle; 318 struct spdk_bdev_io_stat *prev_stat; 319 #endif 320 321 lba_range_tailq_t locked_ranges; 322 323 /** List of I/Os queued by QoS. */ 324 bdev_io_tailq_t qos_queued_io; 325 }; 326 327 struct media_event_entry { 328 struct spdk_bdev_media_event event; 329 TAILQ_ENTRY(media_event_entry) tailq; 330 }; 331 332 #define MEDIA_EVENT_POOL_SIZE 64 333 334 struct spdk_bdev_desc { 335 struct spdk_bdev *bdev; 336 bool write; 337 bool memory_domains_supported; 338 bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES]; 339 struct spdk_bdev_open_opts opts; 340 struct spdk_thread *thread; 341 struct { 342 spdk_bdev_event_cb_t event_fn; 343 void *ctx; 344 } callback; 345 bool closed; 346 struct spdk_spinlock spinlock; 347 uint32_t refs; 348 TAILQ_HEAD(, media_event_entry) pending_media_events; 349 TAILQ_HEAD(, media_event_entry) free_media_events; 350 struct media_event_entry *media_events_buffer; 351 TAILQ_ENTRY(spdk_bdev_desc) link; 352 353 uint64_t timeout_in_sec; 354 spdk_bdev_io_timeout_cb cb_fn; 355 void *cb_arg; 356 struct spdk_poller *io_timeout_poller; 357 struct spdk_bdev_module_claim *claim; 358 }; 359 360 struct spdk_bdev_iostat_ctx { 361 struct spdk_bdev_io_stat *stat; 362 enum spdk_bdev_reset_stat_mode reset_mode; 363 spdk_bdev_get_device_stat_cb cb; 364 void *cb_arg; 365 }; 366 367 struct set_qos_limit_ctx { 368 void (*cb_fn)(void *cb_arg, int status); 369 void *cb_arg; 370 struct spdk_bdev *bdev; 371 }; 372 373 struct spdk_bdev_channel_iter { 374 spdk_bdev_for_each_channel_msg fn; 375 spdk_bdev_for_each_channel_done cpl; 376 struct spdk_io_channel_iter *i; 377 void *ctx; 378 }; 379 380 struct spdk_bdev_io_error_stat { 381 uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS]; 382 }; 383 384 enum bdev_io_retry_state { 385 BDEV_IO_RETRY_STATE_INVALID, 386 BDEV_IO_RETRY_STATE_PULL, 387 BDEV_IO_RETRY_STATE_PULL_MD, 388 BDEV_IO_RETRY_STATE_SUBMIT, 389 BDEV_IO_RETRY_STATE_PUSH, 390 BDEV_IO_RETRY_STATE_PUSH_MD, 391 BDEV_IO_RETRY_STATE_GET_ACCEL_BUF, 392 }; 393 394 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 395 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 396 #define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch)) 397 #define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch)) 398 399 static inline void bdev_io_complete(void *ctx); 400 static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io); 401 static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io); 402 static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io); 403 static void _bdev_io_get_accel_buf(struct spdk_bdev_io *bdev_io); 404 405 static void 
bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 406 static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io); 407 408 static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 409 struct spdk_io_channel *ch, void *_ctx); 410 static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status); 411 412 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 413 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 414 uint64_t num_blocks, 415 struct spdk_memory_domain *domain, void *domain_ctx, 416 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 417 spdk_bdev_io_completion_cb cb, void *cb_arg); 418 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 419 struct iovec *iov, int iovcnt, void *md_buf, 420 uint64_t offset_blocks, uint64_t num_blocks, 421 struct spdk_memory_domain *domain, void *domain_ctx, 422 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 423 uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw, 424 spdk_bdev_io_completion_cb cb, void *cb_arg); 425 426 static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 427 uint64_t offset, uint64_t length, 428 lock_range_cb cb_fn, void *cb_arg); 429 430 static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 431 uint64_t offset, uint64_t length, 432 lock_range_cb cb_fn, void *cb_arg); 433 434 static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort); 435 static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort); 436 437 static bool claim_type_is_v2(enum spdk_bdev_claim_type type); 438 static void bdev_desc_release_claims(struct spdk_bdev_desc *desc); 439 static void claim_reset(struct spdk_bdev *bdev); 440 441 static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch); 442 443 static bool bdev_io_should_split(struct spdk_bdev_io *bdev_io); 444 445 #define bdev_get_ext_io_opt(opts, field, defval) \ 446 ((opts) != NULL ? SPDK_GET_FIELD(opts, field, defval) : (defval)) 447 448 static inline void 449 bdev_ch_add_to_io_submitted(struct spdk_bdev_io *bdev_io) 450 { 451 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 452 bdev_io->internal.ch->queue_depth++; 453 } 454 455 static inline void 456 bdev_ch_remove_from_io_submitted(struct spdk_bdev_io *bdev_io) 457 { 458 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 459 bdev_io->internal.ch->queue_depth--; 460 } 461 462 void 463 spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size) 464 { 465 if (!opts) { 466 SPDK_ERRLOG("opts should not be NULL\n"); 467 return; 468 } 469 470 if (!opts_size) { 471 SPDK_ERRLOG("opts_size should not be zero value\n"); 472 return; 473 } 474 475 opts->opts_size = opts_size; 476 477 #define SET_FIELD(field) \ 478 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \ 479 opts->field = g_bdev_opts.field; \ 480 } \ 481 482 SET_FIELD(bdev_io_pool_size); 483 SET_FIELD(bdev_io_cache_size); 484 SET_FIELD(bdev_auto_examine); 485 SET_FIELD(iobuf_small_cache_size); 486 SET_FIELD(iobuf_large_cache_size); 487 488 /* Do not remove this statement, you should always update this statement when you adding a new field, 489 * and do not forget to add the SET_FIELD statement for your added field. 
*/ 490 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size"); 491 492 #undef SET_FIELD 493 } 494 495 int 496 spdk_bdev_set_opts(struct spdk_bdev_opts *opts) 497 { 498 uint32_t min_pool_size; 499 500 if (!opts) { 501 SPDK_ERRLOG("opts cannot be NULL\n"); 502 return -1; 503 } 504 505 if (!opts->opts_size) { 506 SPDK_ERRLOG("opts_size inside opts cannot be zero value\n"); 507 return -1; 508 } 509 510 /* 511 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem 512 * initialization. A second mgmt_ch will be created on the same thread when the application starts 513 * but before the deferred put_io_channel event is executed for the first mgmt_ch. 514 */ 515 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 516 if (opts->bdev_io_pool_size < min_pool_size) { 517 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 518 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 519 spdk_thread_get_count()); 520 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 521 return -1; 522 } 523 524 #define SET_FIELD(field) \ 525 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 526 g_bdev_opts.field = opts->field; \ 527 } \ 528 529 SET_FIELD(bdev_io_pool_size); 530 SET_FIELD(bdev_io_cache_size); 531 SET_FIELD(bdev_auto_examine); 532 SET_FIELD(iobuf_small_cache_size); 533 SET_FIELD(iobuf_large_cache_size); 534 535 g_bdev_opts.opts_size = opts->opts_size; 536 537 #undef SET_FIELD 538 539 return 0; 540 } 541 542 static struct spdk_bdev * 543 bdev_get_by_name(const char *bdev_name) 544 { 545 struct spdk_bdev_name find; 546 struct spdk_bdev_name *res; 547 548 find.name = (char *)bdev_name; 549 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 550 if (res != NULL) { 551 return res->bdev; 552 } 553 554 return NULL; 555 } 556 557 struct spdk_bdev * 558 spdk_bdev_get_by_name(const char *bdev_name) 559 { 560 struct spdk_bdev *bdev; 561 562 spdk_spin_lock(&g_bdev_mgr.spinlock); 563 bdev = bdev_get_by_name(bdev_name); 564 spdk_spin_unlock(&g_bdev_mgr.spinlock); 565 566 return bdev; 567 } 568 569 struct bdev_io_status_string { 570 enum spdk_bdev_io_status status; 571 const char *str; 572 }; 573 574 static const struct bdev_io_status_string bdev_io_status_strings[] = { 575 { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" }, 576 { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" }, 577 { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" }, 578 { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" }, 579 { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" }, 580 { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" }, 581 { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" }, 582 { SPDK_BDEV_IO_STATUS_FAILED, "failed" }, 583 { SPDK_BDEV_IO_STATUS_PENDING, "pending" }, 584 { SPDK_BDEV_IO_STATUS_SUCCESS, "success" }, 585 }; 586 587 static const char * 588 bdev_io_status_get_string(enum spdk_bdev_io_status status) 589 { 590 uint32_t i; 591 592 for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) { 593 if (bdev_io_status_strings[i].status == status) { 594 return bdev_io_status_strings[i].str; 595 } 596 } 597 598 return "reserved"; 599 } 600 601 struct spdk_bdev_wait_for_examine_ctx { 602 struct spdk_poller *poller; 603 spdk_bdev_wait_for_examine_cb cb_fn; 604 void *cb_arg; 605 }; 606 607 static bool bdev_module_all_actions_completed(void); 608 609 static int 610 bdev_wait_for_examine_cb(void *arg) 611 { 612 struct 
spdk_bdev_wait_for_examine_ctx *ctx = arg; 613 614 if (!bdev_module_all_actions_completed()) { 615 return SPDK_POLLER_IDLE; 616 } 617 618 spdk_poller_unregister(&ctx->poller); 619 ctx->cb_fn(ctx->cb_arg); 620 free(ctx); 621 622 return SPDK_POLLER_BUSY; 623 } 624 625 int 626 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 627 { 628 struct spdk_bdev_wait_for_examine_ctx *ctx; 629 630 ctx = calloc(1, sizeof(*ctx)); 631 if (ctx == NULL) { 632 return -ENOMEM; 633 } 634 ctx->cb_fn = cb_fn; 635 ctx->cb_arg = cb_arg; 636 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 637 638 return 0; 639 } 640 641 struct spdk_bdev_examine_item { 642 char *name; 643 TAILQ_ENTRY(spdk_bdev_examine_item) link; 644 }; 645 646 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); 647 648 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 649 g_bdev_examine_allowlist); 650 651 static inline bool 652 bdev_examine_allowlist_check(const char *name) 653 { 654 struct spdk_bdev_examine_item *item; 655 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 656 if (strcmp(name, item->name) == 0) { 657 return true; 658 } 659 } 660 return false; 661 } 662 663 static inline void 664 bdev_examine_allowlist_remove(const char *name) 665 { 666 struct spdk_bdev_examine_item *item; 667 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 668 if (strcmp(name, item->name) == 0) { 669 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 670 free(item->name); 671 free(item); 672 break; 673 } 674 } 675 } 676 677 static inline void 678 bdev_examine_allowlist_free(void) 679 { 680 struct spdk_bdev_examine_item *item; 681 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 682 item = TAILQ_FIRST(&g_bdev_examine_allowlist); 683 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 684 free(item->name); 685 free(item); 686 } 687 } 688 689 static inline bool 690 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 691 { 692 struct spdk_bdev_alias *tmp; 693 if (bdev_examine_allowlist_check(bdev->name)) { 694 return true; 695 } 696 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 697 if (bdev_examine_allowlist_check(tmp->alias.name)) { 698 return true; 699 } 700 } 701 return false; 702 } 703 704 static inline bool 705 bdev_ok_to_examine(struct spdk_bdev *bdev) 706 { 707 /* Some bdevs may not support the READ command. 708 * Do not try to examine them. 
709 */ 710 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_READ)) { 711 return false; 712 } 713 714 if (g_bdev_opts.bdev_auto_examine) { 715 return true; 716 } else { 717 return bdev_in_examine_allowlist(bdev); 718 } 719 } 720 721 static void 722 bdev_examine(struct spdk_bdev *bdev) 723 { 724 struct spdk_bdev_module *module; 725 struct spdk_bdev_module_claim *claim, *tmpclaim; 726 uint32_t action; 727 728 if (!bdev_ok_to_examine(bdev)) { 729 return; 730 } 731 732 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 733 if (module->examine_config) { 734 spdk_spin_lock(&module->internal.spinlock); 735 action = module->internal.action_in_progress; 736 module->internal.action_in_progress++; 737 spdk_spin_unlock(&module->internal.spinlock); 738 module->examine_config(bdev); 739 if (action != module->internal.action_in_progress) { 740 SPDK_ERRLOG("examine_config for module %s did not call " 741 "spdk_bdev_module_examine_done()\n", module->name); 742 } 743 } 744 } 745 746 spdk_spin_lock(&bdev->internal.spinlock); 747 748 switch (bdev->internal.claim_type) { 749 case SPDK_BDEV_CLAIM_NONE: 750 /* Examine by all bdev modules */ 751 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 752 if (module->examine_disk) { 753 spdk_spin_lock(&module->internal.spinlock); 754 module->internal.action_in_progress++; 755 spdk_spin_unlock(&module->internal.spinlock); 756 spdk_spin_unlock(&bdev->internal.spinlock); 757 module->examine_disk(bdev); 758 spdk_spin_lock(&bdev->internal.spinlock); 759 } 760 } 761 break; 762 case SPDK_BDEV_CLAIM_EXCL_WRITE: 763 /* Examine by the one bdev module with a v1 claim */ 764 module = bdev->internal.claim.v1.module; 765 if (module->examine_disk) { 766 spdk_spin_lock(&module->internal.spinlock); 767 module->internal.action_in_progress++; 768 spdk_spin_unlock(&module->internal.spinlock); 769 spdk_spin_unlock(&bdev->internal.spinlock); 770 module->examine_disk(bdev); 771 return; 772 } 773 break; 774 default: 775 /* Examine by all bdev modules with a v2 claim */ 776 assert(claim_type_is_v2(bdev->internal.claim_type)); 777 /* 778 * Removal of tailq nodes while iterating can cause the iteration to jump out of the 779 * list, perhaps accessing freed memory. Without protection, this could happen 780 * while the lock is dropped during the examine callback. 781 */ 782 bdev->internal.examine_in_progress++; 783 784 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 785 module = claim->module; 786 787 if (module == NULL) { 788 /* This is a vestigial claim, held by examine_count */ 789 continue; 790 } 791 792 if (module->examine_disk == NULL) { 793 continue; 794 } 795 796 spdk_spin_lock(&module->internal.spinlock); 797 module->internal.action_in_progress++; 798 spdk_spin_unlock(&module->internal.spinlock); 799 800 /* Call examine_disk without holding internal.spinlock. 
*/ 801 spdk_spin_unlock(&bdev->internal.spinlock); 802 module->examine_disk(bdev); 803 spdk_spin_lock(&bdev->internal.spinlock); 804 } 805 806 assert(bdev->internal.examine_in_progress > 0); 807 bdev->internal.examine_in_progress--; 808 if (bdev->internal.examine_in_progress == 0) { 809 /* Remove any claims that were released during examine_disk */ 810 TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) { 811 if (claim->desc != NULL) { 812 continue; 813 } 814 815 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link); 816 free(claim); 817 } 818 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 819 claim_reset(bdev); 820 } 821 } 822 } 823 824 spdk_spin_unlock(&bdev->internal.spinlock); 825 } 826 827 int 828 spdk_bdev_examine(const char *name) 829 { 830 struct spdk_bdev *bdev; 831 struct spdk_bdev_examine_item *item; 832 struct spdk_thread *thread = spdk_get_thread(); 833 834 if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) { 835 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread, 836 thread ? spdk_thread_get_name(thread) : "null"); 837 return -EINVAL; 838 } 839 840 if (g_bdev_opts.bdev_auto_examine) { 841 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n"); 842 return -EINVAL; 843 } 844 845 if (bdev_examine_allowlist_check(name)) { 846 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 847 return -EEXIST; 848 } 849 850 item = calloc(1, sizeof(*item)); 851 if (!item) { 852 return -ENOMEM; 853 } 854 item->name = strdup(name); 855 if (!item->name) { 856 free(item); 857 return -ENOMEM; 858 } 859 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 860 861 bdev = spdk_bdev_get_by_name(name); 862 if (bdev) { 863 bdev_examine(bdev); 864 } 865 return 0; 866 } 867 868 static inline void 869 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 870 { 871 struct spdk_bdev_examine_item *item; 872 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 873 spdk_json_write_object_begin(w); 874 spdk_json_write_named_string(w, "method", "bdev_examine"); 875 spdk_json_write_named_object_begin(w, "params"); 876 spdk_json_write_named_string(w, "name", item->name); 877 spdk_json_write_object_end(w); 878 spdk_json_write_object_end(w); 879 } 880 } 881 882 struct spdk_bdev * 883 spdk_bdev_first(void) 884 { 885 struct spdk_bdev *bdev; 886 887 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 888 if (bdev) { 889 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 890 } 891 892 return bdev; 893 } 894 895 struct spdk_bdev * 896 spdk_bdev_next(struct spdk_bdev *prev) 897 { 898 struct spdk_bdev *bdev; 899 900 bdev = TAILQ_NEXT(prev, internal.link); 901 if (bdev) { 902 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 903 } 904 905 return bdev; 906 } 907 908 static struct spdk_bdev * 909 _bdev_next_leaf(struct spdk_bdev *bdev) 910 { 911 while (bdev != NULL) { 912 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 913 return bdev; 914 } else { 915 bdev = TAILQ_NEXT(bdev, internal.link); 916 } 917 } 918 919 return bdev; 920 } 921 922 struct spdk_bdev * 923 spdk_bdev_first_leaf(void) 924 { 925 struct spdk_bdev *bdev; 926 927 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 928 929 if (bdev) { 930 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 931 } 932 933 return bdev; 934 } 935 936 struct spdk_bdev * 937 spdk_bdev_next_leaf(struct spdk_bdev *prev) 938 { 939 struct spdk_bdev *bdev; 940 941 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 942 943 if 
(bdev) { 944 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 945 } 946 947 return bdev; 948 } 949 950 static inline bool 951 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 952 { 953 return bdev_io->internal.f.has_memory_domain; 954 } 955 956 static inline bool 957 bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io) 958 { 959 return bdev_io->internal.f.has_accel_sequence; 960 } 961 962 static inline uint32_t 963 bdev_desc_get_block_size(struct spdk_bdev_desc *desc) 964 { 965 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 966 967 if (spdk_unlikely(desc->opts.hide_metadata)) { 968 return bdev->blocklen - bdev->md_len; 969 } else { 970 return bdev->blocklen; 971 } 972 } 973 974 static inline uint32_t 975 bdev_io_get_block_size(struct spdk_bdev_io *bdev_io) 976 { 977 struct spdk_bdev *bdev = bdev_io->bdev; 978 979 if (bdev_io->u.bdev.dif_check_flags & SPDK_DIF_FLAGS_NVME_PRACT) { 980 if (bdev->md_len == spdk_dif_pi_format_get_size(bdev->dif_pi_format)) { 981 return bdev->blocklen - bdev->md_len; 982 } else { 983 return bdev->blocklen; 984 } 985 } 986 987 return bdev_desc_get_block_size(bdev_io->internal.desc); 988 } 989 990 static inline void 991 bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource, 992 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 993 { 994 /* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io. 995 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth 996 * channels we will instead wait for half to complete. 997 */ 998 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 999 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 1000 1001 assert(state != BDEV_IO_RETRY_STATE_INVALID); 1002 bdev_io->internal.retry_state = state; 1003 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 1004 } 1005 1006 static inline void 1007 bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource, 1008 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 1009 { 1010 /* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while 1011 * the queue isn't empty, so we don't need to update the nomem_threshold here */ 1012 assert(!TAILQ_EMPTY(&shared_resource->nomem_io)); 1013 1014 assert(state != BDEV_IO_RETRY_STATE_INVALID); 1015 bdev_io->internal.retry_state = state; 1016 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 1017 } 1018 1019 void 1020 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 1021 { 1022 struct iovec *iovs; 1023 1024 if (bdev_io->u.bdev.iovs == NULL) { 1025 bdev_io->u.bdev.iovs = &bdev_io->iov; 1026 bdev_io->u.bdev.iovcnt = 1; 1027 } 1028 1029 iovs = bdev_io->u.bdev.iovs; 1030 1031 assert(iovs != NULL); 1032 assert(bdev_io->u.bdev.iovcnt >= 1); 1033 1034 iovs[0].iov_base = buf; 1035 iovs[0].iov_len = len; 1036 } 1037 1038 void 1039 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 1040 { 1041 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 1042 bdev_io->u.bdev.md_buf = md_buf; 1043 } 1044 1045 static bool 1046 _is_buf_allocated(const struct iovec *iovs) 1047 { 1048 if (iovs == NULL) { 1049 return false; 1050 } 1051 1052 return iovs[0].iov_base != NULL; 1053 } 1054 1055 static bool 1056 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 1057 { 1058 int i; 1059 uintptr_t 
iov_base; 1060 1061 if (spdk_likely(alignment == 1)) { 1062 return true; 1063 } 1064 1065 for (i = 0; i < iovcnt; i++) { 1066 iov_base = (uintptr_t)iovs[i].iov_base; 1067 if ((iov_base & (alignment - 1)) != 0) { 1068 return false; 1069 } 1070 } 1071 1072 return true; 1073 } 1074 1075 static inline bool 1076 bdev_io_needs_metadata(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 1077 { 1078 return (bdev_io->bdev->md_len != 0) && 1079 (desc->opts.hide_metadata || 1080 (bdev_io->u.bdev.dif_check_flags & SPDK_DIF_FLAGS_NVME_PRACT)); 1081 } 1082 1083 static inline bool 1084 bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 1085 { 1086 if (!bdev_io_use_accel_sequence(bdev_io)) { 1087 return false; 1088 } 1089 1090 /* For now, we don't allow splitting IOs with an accel sequence and will treat them as if 1091 * bdev module didn't support accel sequences */ 1092 return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.f.split; 1093 } 1094 1095 static inline void 1096 bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch, 1097 struct spdk_bdev_shared_resource *shared_resource) 1098 { 1099 bdev_ch->io_outstanding++; 1100 shared_resource->io_outstanding++; 1101 } 1102 1103 static inline void 1104 bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1105 struct spdk_bdev_shared_resource *shared_resource) 1106 { 1107 assert(bdev_ch->io_outstanding > 0); 1108 assert(shared_resource->io_outstanding > 0); 1109 bdev_ch->io_outstanding--; 1110 shared_resource->io_outstanding--; 1111 } 1112 1113 static void 1114 bdev_io_submit_sequence_cb(void *ctx, int status) 1115 { 1116 struct spdk_bdev_io *bdev_io = ctx; 1117 1118 assert(bdev_io_use_accel_sequence(bdev_io)); 1119 1120 bdev_io->u.bdev.accel_sequence = NULL; 1121 bdev_io->internal.f.has_accel_sequence = false; 1122 1123 if (spdk_unlikely(status != 0)) { 1124 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 1125 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1126 bdev_io_complete_unsubmitted(bdev_io); 1127 return; 1128 } 1129 1130 bdev_io_submit(bdev_io); 1131 } 1132 1133 static void 1134 bdev_io_exec_sequence_cb(void *ctx, int status) 1135 { 1136 struct spdk_bdev_io *bdev_io = ctx; 1137 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1138 1139 TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1140 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1141 1142 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1143 bdev_ch_retry_io(ch); 1144 } 1145 1146 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1147 } 1148 1149 static void 1150 bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status)) 1151 { 1152 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1153 1154 assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1155 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1156 assert(bdev_io_use_accel_sequence(bdev_io)); 1157 1158 /* Since the operations are appended during submission, they're in the opposite order than 1159 * how we want to execute them for reads (i.e. we need to execute the most recently added 1160 * operation first), so reverse the sequence before executing it. 
1161 */ 1162 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1163 spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence); 1164 } 1165 1166 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1167 bdev_io_increment_outstanding(ch, ch->shared_resource); 1168 bdev_io->internal.data_transfer_cpl = cb_fn; 1169 1170 spdk_accel_sequence_finish(bdev_io->internal.accel_sequence, 1171 bdev_io_exec_sequence_cb, bdev_io); 1172 } 1173 1174 static void 1175 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 1176 { 1177 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 1178 void *buf; 1179 1180 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1181 buf = bdev_io->internal.buf.ptr; 1182 bdev_io->internal.buf.ptr = NULL; 1183 bdev_io->internal.f.has_buf = false; 1184 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 1185 bdev_io->internal.get_aux_buf_cb = NULL; 1186 } else { 1187 assert(bdev_io->internal.get_buf_cb != NULL); 1188 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 1189 bdev_io->internal.get_buf_cb = NULL; 1190 } 1191 } 1192 1193 static void 1194 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 1195 { 1196 struct spdk_bdev_io *bdev_io = ctx; 1197 1198 if (rc) { 1199 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 1200 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1201 } 1202 bdev_io_get_buf_complete(bdev_io, !rc); 1203 } 1204 1205 static void 1206 bdev_io_pull_md_buf_done(void *ctx, int status) 1207 { 1208 struct spdk_bdev_io *bdev_io = ctx; 1209 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1210 1211 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1212 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1213 1214 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1215 bdev_ch_retry_io(ch); 1216 } 1217 1218 assert(bdev_io->internal.data_transfer_cpl); 1219 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1220 } 1221 1222 static void 1223 bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io) 1224 { 1225 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1226 int rc = 0; 1227 1228 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1229 assert(bdev_io->internal.f.has_bounce_buf); 1230 if (bdev_io_use_memory_domain(bdev_io)) { 1231 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1232 bdev_io_increment_outstanding(ch, ch->shared_resource); 1233 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1234 bdev_io->internal.memory_domain_ctx, 1235 &bdev_io->internal.bounce_buf.orig_md_iov, 1, 1236 &bdev_io->internal.bounce_buf.md_iov, 1, 1237 bdev_io_pull_md_buf_done, bdev_io); 1238 if (rc == 0) { 1239 /* Continue to submit IO in completion callback */ 1240 return; 1241 } 1242 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1243 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1244 if (rc != -ENOMEM) { 1245 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 1246 spdk_memory_domain_get_dma_device_id( 1247 bdev_io->internal.memory_domain), rc); 1248 } 1249 } else { 1250 memcpy(bdev_io->internal.bounce_buf.md_iov.iov_base, 1251 bdev_io->internal.bounce_buf.orig_md_iov.iov_base, 1252 bdev_io->internal.bounce_buf.orig_md_iov.iov_len); 1253 } 1254 } 1255 1256 if (spdk_unlikely(rc == -ENOMEM)) { 1257 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD); 1258 } else { 1259 assert(bdev_io->internal.data_transfer_cpl); 1260 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1261 } 
1262 } 1263 1264 static void 1265 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 1266 { 1267 assert(bdev_io->internal.f.has_bounce_buf); 1268 1269 /* save original md_buf */ 1270 bdev_io->internal.bounce_buf.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 1271 bdev_io->internal.bounce_buf.orig_md_iov.iov_len = len; 1272 bdev_io->internal.bounce_buf.md_iov.iov_base = md_buf; 1273 bdev_io->internal.bounce_buf.md_iov.iov_len = len; 1274 /* set bounce md_buf */ 1275 bdev_io->u.bdev.md_buf = md_buf; 1276 1277 bdev_io_pull_md_buf(bdev_io); 1278 } 1279 1280 static void 1281 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 1282 { 1283 struct spdk_bdev *bdev = bdev_io->bdev; 1284 uint64_t md_len; 1285 void *buf; 1286 1287 if (spdk_bdev_is_md_separate(bdev)) { 1288 assert(!bdev_io_use_accel_sequence(bdev_io)); 1289 1290 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 1291 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 1292 1293 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 1294 1295 if (bdev_io->u.bdev.md_buf != NULL) { 1296 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 1297 return; 1298 } else { 1299 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 1300 } 1301 } 1302 1303 bdev_io_get_buf_complete(bdev_io, true); 1304 } 1305 1306 static inline void 1307 bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc) 1308 { 1309 if (rc) { 1310 SPDK_ERRLOG("Failed to get data buffer\n"); 1311 assert(bdev_io->internal.data_transfer_cpl); 1312 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1313 return; 1314 } 1315 1316 _bdev_io_set_md_buf(bdev_io); 1317 } 1318 1319 static void 1320 bdev_io_pull_data_done_and_track(void *ctx, int status) 1321 { 1322 struct spdk_bdev_io *bdev_io = ctx; 1323 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1324 1325 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1326 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1327 1328 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1329 bdev_ch_retry_io(ch); 1330 } 1331 1332 bdev_io_pull_data_done(bdev_io, status); 1333 } 1334 1335 static void 1336 bdev_io_pull_data(struct spdk_bdev_io *bdev_io) 1337 { 1338 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1339 struct spdk_bdev_desc *desc = bdev_io->internal.desc; 1340 int rc = 0; 1341 1342 assert(bdev_io->internal.f.has_bounce_buf); 1343 1344 if (bdev_io_needs_metadata(desc, bdev_io)) { 1345 assert(bdev_io->bdev->md_interleave); 1346 1347 bdev_io->u.bdev.dif_check_flags &= ~SPDK_DIF_FLAGS_NVME_PRACT; 1348 1349 if (!bdev_io_use_accel_sequence(bdev_io)) { 1350 bdev_io->internal.accel_sequence = NULL; 1351 } 1352 1353 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1354 rc = spdk_accel_append_dif_generate_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1355 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1356 bdev_io->u.bdev.memory_domain, 1357 bdev_io->u.bdev.memory_domain_ctx, 1358 bdev_io->internal.bounce_buf.orig_iovs, 1359 bdev_io->internal.bounce_buf.orig_iovcnt, 1360 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 1361 bdev_io_use_memory_domain(bdev_io) ? 
bdev_io->internal.memory_domain_ctx : NULL, 1362 bdev_io->u.bdev.num_blocks, 1363 &bdev_io->u.bdev.dif_ctx, 1364 NULL, NULL); 1365 } else { 1366 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1367 rc = spdk_accel_append_dif_verify_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1368 bdev_io->internal.bounce_buf.orig_iovs, 1369 bdev_io->internal.bounce_buf.orig_iovcnt, 1370 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 1371 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL, 1372 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1373 bdev_io->u.bdev.memory_domain, 1374 bdev_io->u.bdev.memory_domain_ctx, 1375 bdev_io->u.bdev.num_blocks, 1376 &bdev_io->u.bdev.dif_ctx, 1377 &bdev_io->u.bdev.dif_err, 1378 NULL, NULL); 1379 } 1380 1381 if (spdk_likely(rc == 0)) { 1382 bdev_io->internal.f.has_accel_sequence = true; 1383 bdev_io->u.bdev.accel_sequence = bdev_io->internal.accel_sequence; 1384 } else if (rc != -ENOMEM) { 1385 SPDK_ERRLOG("Failed to append generate/verify_copy to accel sequence: %p\n", 1386 bdev_io->internal.accel_sequence); 1387 } 1388 } else if (bdev_io_needs_sequence_exec(desc, bdev_io) || 1389 (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) { 1390 /* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a 1391 * sequence, append a copy operation making accel change the src/dst buffers of the previous 1392 * operation */ 1393 assert(bdev_io_use_accel_sequence(bdev_io)); 1394 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1395 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1396 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1397 NULL, NULL, 1398 bdev_io->internal.bounce_buf.orig_iovs, 1399 bdev_io->internal.bounce_buf.orig_iovcnt, 1400 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 1401 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL, 1402 NULL, NULL); 1403 } else { 1404 /* We need to reverse the src/dst for reads */ 1405 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1406 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1407 bdev_io->internal.bounce_buf.orig_iovs, 1408 bdev_io->internal.bounce_buf.orig_iovcnt, 1409 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 1410 bdev_io_use_memory_domain(bdev_io) ? 
bdev_io->internal.memory_domain_ctx : NULL, 1411 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1412 NULL, NULL, NULL, NULL); 1413 } 1414 1415 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 1416 SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n", 1417 bdev_io->internal.accel_sequence); 1418 } 1419 } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1420 /* if this is write path, copy data from original buffer to bounce buffer */ 1421 if (bdev_io_use_memory_domain(bdev_io)) { 1422 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1423 bdev_io_increment_outstanding(ch, ch->shared_resource); 1424 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1425 bdev_io->internal.memory_domain_ctx, 1426 bdev_io->internal.bounce_buf.orig_iovs, 1427 (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt, 1428 bdev_io->u.bdev.iovs, 1, 1429 bdev_io_pull_data_done_and_track, 1430 bdev_io); 1431 if (rc == 0) { 1432 /* Continue to submit IO in completion callback */ 1433 return; 1434 } 1435 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1436 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1437 if (rc != -ENOMEM) { 1438 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 1439 spdk_memory_domain_get_dma_device_id( 1440 bdev_io->internal.memory_domain)); 1441 } 1442 } else { 1443 assert(bdev_io->u.bdev.iovcnt == 1); 1444 spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base, 1445 bdev_io->u.bdev.iovs[0].iov_len, 1446 bdev_io->internal.bounce_buf.orig_iovs, 1447 bdev_io->internal.bounce_buf.orig_iovcnt); 1448 } 1449 } 1450 1451 if (spdk_unlikely(rc == -ENOMEM)) { 1452 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1453 } else { 1454 bdev_io_pull_data_done(bdev_io, rc); 1455 } 1456 } 1457 1458 static void 1459 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 1460 bdev_copy_bounce_buffer_cpl cpl_cb) 1461 { 1462 struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource; 1463 1464 assert(bdev_io->internal.f.has_bounce_buf == false); 1465 1466 bdev_io->internal.data_transfer_cpl = cpl_cb; 1467 bdev_io->internal.f.has_bounce_buf = true; 1468 /* save original iovec */ 1469 bdev_io->internal.bounce_buf.orig_iovs = bdev_io->u.bdev.iovs; 1470 bdev_io->internal.bounce_buf.orig_iovcnt = bdev_io->u.bdev.iovcnt; 1471 /* zero the other data members */ 1472 bdev_io->internal.bounce_buf.iov.iov_base = NULL; 1473 bdev_io->internal.bounce_buf.md_iov.iov_base = NULL; 1474 bdev_io->internal.bounce_buf.orig_md_iov.iov_base = NULL; 1475 /* set bounce iov */ 1476 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_buf.iov; 1477 bdev_io->u.bdev.iovcnt = 1; 1478 /* set bounce buffer for this operation */ 1479 bdev_io->u.bdev.iovs[0].iov_base = buf; 1480 bdev_io->u.bdev.iovs[0].iov_len = len; 1481 /* Now we use 1 iov, the split condition could have been changed */ 1482 bdev_io->internal.f.split = bdev_io_should_split(bdev_io); 1483 1484 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1485 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1486 } else { 1487 bdev_io_pull_data(bdev_io); 1488 } 1489 } 1490 1491 static void 1492 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) 1493 { 1494 struct spdk_bdev *bdev = bdev_io->bdev; 1495 bool buf_allocated; 1496 uint64_t alignment; 1497 void *aligned_buf; 1498 1499 bdev_io->internal.buf.ptr = buf; 1500 bdev_io->internal.f.has_buf = true; 1501 1502 if 
(spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1503 bdev_io_get_buf_complete(bdev_io, true); 1504 return; 1505 } 1506 1507 alignment = spdk_bdev_get_buf_align(bdev); 1508 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 1509 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 1510 1511 if (buf_allocated) { 1512 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 1513 /* Continue in completion callback */ 1514 return; 1515 } else { 1516 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 1517 } 1518 1519 _bdev_io_set_md_buf(bdev_io); 1520 } 1521 1522 static inline uint64_t 1523 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len) 1524 { 1525 struct spdk_bdev *bdev = bdev_io->bdev; 1526 uint64_t md_len, alignment; 1527 1528 md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1529 1530 /* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */ 1531 alignment = spdk_bdev_get_buf_align(bdev) - 1; 1532 1533 return len + alignment + md_len; 1534 } 1535 1536 static void 1537 bdev_io_put_accel_buf(struct spdk_bdev_io *bdev_io) 1538 { 1539 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1540 1541 spdk_accel_put_buf(ch->accel_channel, 1542 bdev_io->internal.buf.ptr, 1543 bdev_io->u.bdev.memory_domain, 1544 bdev_io->u.bdev.memory_domain_ctx); 1545 } 1546 1547 static void 1548 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 1549 { 1550 struct spdk_bdev_mgmt_channel *ch; 1551 1552 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1553 spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len)); 1554 } 1555 1556 static void 1557 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 1558 { 1559 assert(bdev_io->internal.f.has_buf); 1560 1561 if (bdev_io->u.bdev.memory_domain == spdk_accel_get_memory_domain()) { 1562 bdev_io_put_accel_buf(bdev_io); 1563 } else { 1564 assert(bdev_io->u.bdev.memory_domain == NULL); 1565 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf.ptr, 1566 bdev_io->internal.buf.len); 1567 } 1568 bdev_io->internal.buf.ptr = NULL; 1569 bdev_io->internal.f.has_buf = false; 1570 } 1571 1572 SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_put_aux_buf, 1573 "spdk_bdev_io_put_aux_buf is deprecated", "v25.01", 0); 1574 1575 void 1576 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1577 { 1578 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1579 1580 SPDK_LOG_DEPRECATED(spdk_bdev_io_put_aux_buf); 1581 1582 assert(buf != NULL); 1583 _bdev_io_put_buf(bdev_io, buf, len); 1584 } 1585 1586 static inline void 1587 bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch, 1588 struct spdk_bdev_io *bdev_io) 1589 { 1590 /* After a request is submitted to a bdev module, the ownership of an accel sequence 1591 * associated with that bdev_io is transferred to the bdev module. So, clear the internal 1592 * sequence pointer to make sure we won't touch it anymore. 
*/ 1593 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || 1594 bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) { 1595 assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1596 bdev_io->internal.f.has_accel_sequence = false; 1597 } 1598 1599 bdev->fn_table->submit_request(ioch, bdev_io); 1600 } 1601 1602 static inline void 1603 bdev_ch_resubmit_io(struct spdk_bdev_shared_resource *shared_resource, struct spdk_bdev_io *bdev_io) 1604 { 1605 struct spdk_bdev *bdev = bdev_io->bdev; 1606 1607 bdev_io_increment_outstanding(bdev_io->internal.ch, shared_resource); 1608 bdev_io->internal.error.nvme.cdw0 = 0; 1609 bdev_io->num_retries++; 1610 bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 1611 } 1612 1613 static void 1614 bdev_shared_ch_retry_io(struct spdk_bdev_shared_resource *shared_resource) 1615 { 1616 struct spdk_bdev_io *bdev_io; 1617 1618 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 1619 /* 1620 * Allow some more I/O to complete before retrying the nomem_io queue. 1621 * Some drivers (such as nvme) cannot immediately take a new I/O in 1622 * the context of a completion, because the resources for the I/O are 1623 * not released until control returns to the bdev poller. Also, we 1624 * may require several small I/O to complete before a larger I/O 1625 * (that requires splitting) can be submitted. 1626 */ 1627 return; 1628 } 1629 1630 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1631 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 1632 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 1633 1634 switch (bdev_io->internal.retry_state) { 1635 case BDEV_IO_RETRY_STATE_SUBMIT: 1636 bdev_ch_resubmit_io(shared_resource, bdev_io); 1637 break; 1638 case BDEV_IO_RETRY_STATE_PULL: 1639 bdev_io_pull_data(bdev_io); 1640 break; 1641 case BDEV_IO_RETRY_STATE_PULL_MD: 1642 bdev_io_pull_md_buf(bdev_io); 1643 break; 1644 case BDEV_IO_RETRY_STATE_PUSH: 1645 bdev_io_push_bounce_data(bdev_io); 1646 break; 1647 case BDEV_IO_RETRY_STATE_PUSH_MD: 1648 bdev_io_push_bounce_md_buf(bdev_io); 1649 break; 1650 case BDEV_IO_RETRY_STATE_GET_ACCEL_BUF: 1651 _bdev_io_get_accel_buf(bdev_io); 1652 break; 1653 default: 1654 assert(0 && "invalid retry state"); 1655 break; 1656 } 1657 1658 if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) { 1659 /* This IO completed again with NOMEM status, so break the loop and 1660 * don't try anymore. Note that a bdev_io that fails with NOMEM 1661 * always gets requeued at the front of the list, to maintain 1662 * ordering. 
1663 */ 1664 break; 1665 } 1666 } 1667 } 1668 1669 static void 1670 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 1671 { 1672 bdev_shared_ch_retry_io(bdev_ch->shared_resource); 1673 } 1674 1675 static int 1676 bdev_no_mem_poller(void *ctx) 1677 { 1678 struct spdk_bdev_shared_resource *shared_resource = ctx; 1679 1680 spdk_poller_unregister(&shared_resource->nomem_poller); 1681 1682 if (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1683 bdev_shared_ch_retry_io(shared_resource); 1684 } 1685 /* the retry cb may re-register the poller so double check */ 1686 if (!TAILQ_EMPTY(&shared_resource->nomem_io) && 1687 shared_resource->io_outstanding == 0 && shared_resource->nomem_poller == NULL) { 1688 /* No IOs were submitted, try again */ 1689 shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource, 1690 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10); 1691 } 1692 1693 return SPDK_POLLER_BUSY; 1694 } 1695 1696 static inline bool 1697 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 1698 { 1699 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1700 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1701 1702 if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) { 1703 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1704 bdev_queue_nomem_io_head(shared_resource, bdev_io, state); 1705 1706 if (shared_resource->io_outstanding == 0 && !shared_resource->nomem_poller) { 1707 /* Special case when we have nomem IOs and no outstanding IOs which completions 1708 * could trigger retry of queued IOs 1709 * Any IOs submitted may trigger retry of queued IOs. This poller handles a case when no 1710 * new IOs submitted, e.g. qd==1 */ 1711 shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource, 1712 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10); 1713 } 1714 /* If bdev module completed an I/O that has an accel sequence with NOMEM status, the 1715 * ownership of that sequence is transferred back to the bdev layer, so we need to 1716 * restore internal.accel_sequence to make sure that the sequence is handled 1717 * correctly in case the I/O is later aborted. */ 1718 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 1719 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) { 1720 assert(!bdev_io_use_accel_sequence(bdev_io)); 1721 bdev_io->internal.f.has_accel_sequence = true; 1722 bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence; 1723 } 1724 1725 return true; 1726 } 1727 1728 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1729 bdev_ch_retry_io(bdev_ch); 1730 } 1731 1732 return false; 1733 } 1734 1735 static void 1736 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1737 { 1738 struct spdk_bdev_io *bdev_io = ctx; 1739 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1740 1741 if (rc) { 1742 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1743 } 1744 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1745 * to waiting for the conditional free of internal.buf.ptr in spdk_bdev_free_io()). 
1746 */ 1747 bdev_io_put_buf(bdev_io); 1748 1749 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1750 bdev_ch_retry_io(ch); 1751 } 1752 1753 /* Continue with IO completion flow */ 1754 bdev_io_complete(bdev_io); 1755 } 1756 1757 static void 1758 bdev_io_push_bounce_md_buf_done(void *ctx, int rc) 1759 { 1760 struct spdk_bdev_io *bdev_io = ctx; 1761 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1762 1763 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1764 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1765 bdev_io->internal.f.has_bounce_buf = false; 1766 1767 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1768 bdev_ch_retry_io(ch); 1769 } 1770 1771 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1772 } 1773 1774 static inline void 1775 bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io) 1776 { 1777 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1778 int rc = 0; 1779 1780 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1781 assert(bdev_io->internal.f.has_bounce_buf); 1782 1783 /* do the same for metadata buffer */ 1784 if (spdk_unlikely(bdev_io->internal.bounce_buf.orig_md_iov.iov_base != NULL)) { 1785 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1786 1787 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1788 if (bdev_io_use_memory_domain(bdev_io)) { 1789 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1790 bdev_io_increment_outstanding(ch, ch->shared_resource); 1791 /* If memory domain is used then we need to call async push function */ 1792 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1793 bdev_io->internal.memory_domain_ctx, 1794 &bdev_io->internal.bounce_buf.orig_md_iov, 1795 (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt, 1796 &bdev_io->internal.bounce_buf.md_iov, 1, 1797 bdev_io_push_bounce_md_buf_done, 1798 bdev_io); 1799 if (rc == 0) { 1800 /* Continue IO completion in async callback */ 1801 return; 1802 } 1803 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1804 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1805 if (rc != -ENOMEM) { 1806 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1807 spdk_memory_domain_get_dma_device_id( 1808 bdev_io->internal.memory_domain)); 1809 } 1810 } else { 1811 memcpy(bdev_io->internal.bounce_buf.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1812 bdev_io->internal.bounce_buf.orig_md_iov.iov_len); 1813 } 1814 } 1815 } 1816 1817 if (spdk_unlikely(rc == -ENOMEM)) { 1818 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD); 1819 } else { 1820 assert(bdev_io->internal.data_transfer_cpl); 1821 bdev_io->internal.f.has_bounce_buf = false; 1822 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1823 } 1824 } 1825 1826 static inline void 1827 bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc) 1828 { 1829 assert(bdev_io->internal.data_transfer_cpl); 1830 if (rc) { 1831 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1832 return; 1833 } 1834 1835 /* set original buffer for this io */ 1836 bdev_io->u.bdev.iovcnt = bdev_io->internal.bounce_buf.orig_iovcnt; 1837 bdev_io->u.bdev.iovs = bdev_io->internal.bounce_buf.orig_iovs; 1838 1839 /* We don't set bdev_io->internal.f.has_bounce_buf to false here because 1840 * we still need to clear the md buf */ 1841 1842 bdev_io_push_bounce_md_buf(bdev_io); 1843 } 1844 1845 static void 1846 bdev_io_push_bounce_data_done_and_track(void *ctx, int status) 1847 { 1848 struct spdk_bdev_io *bdev_io = ctx; 1849 struct 
spdk_bdev_channel *ch = bdev_io->internal.ch; 1850 1851 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1852 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1853 1854 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1855 bdev_ch_retry_io(ch); 1856 } 1857 1858 bdev_io_push_bounce_data_done(bdev_io, status); 1859 } 1860 1861 static inline void 1862 bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io) 1863 { 1864 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1865 int rc = 0; 1866 1867 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1868 assert(!bdev_io_use_accel_sequence(bdev_io)); 1869 assert(bdev_io->internal.f.has_bounce_buf); 1870 1871 /* if this is read path, copy data from bounce buffer to original buffer */ 1872 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1873 if (bdev_io_use_memory_domain(bdev_io)) { 1874 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1875 bdev_io_increment_outstanding(ch, ch->shared_resource); 1876 /* If memory domain is used then we need to call async push function */ 1877 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1878 bdev_io->internal.memory_domain_ctx, 1879 bdev_io->internal.bounce_buf.orig_iovs, 1880 (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt, 1881 &bdev_io->internal.bounce_buf.iov, 1, 1882 bdev_io_push_bounce_data_done_and_track, 1883 bdev_io); 1884 if (rc == 0) { 1885 /* Continue IO completion in async callback */ 1886 return; 1887 } 1888 1889 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1890 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1891 if (rc != -ENOMEM) { 1892 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1893 spdk_memory_domain_get_dma_device_id( 1894 bdev_io->internal.memory_domain)); 1895 } 1896 } else { 1897 spdk_copy_buf_to_iovs(bdev_io->internal.bounce_buf.orig_iovs, 1898 bdev_io->internal.bounce_buf.orig_iovcnt, 1899 bdev_io->internal.bounce_buf.iov.iov_base, 1900 bdev_io->internal.bounce_buf.iov.iov_len); 1901 } 1902 } 1903 1904 if (spdk_unlikely(rc == -ENOMEM)) { 1905 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH); 1906 } else { 1907 bdev_io_push_bounce_data_done(bdev_io, rc); 1908 } 1909 } 1910 1911 static inline void 1912 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1913 { 1914 bdev_io->internal.data_transfer_cpl = cpl_cb; 1915 bdev_io_push_bounce_data(bdev_io); 1916 } 1917 1918 static void 1919 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1920 { 1921 struct spdk_bdev_io *bdev_io; 1922 1923 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1924 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf.len); 1925 } 1926 1927 static void 1928 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1929 { 1930 struct spdk_bdev_mgmt_channel *mgmt_ch; 1931 uint64_t max_len; 1932 void *buf; 1933 1934 assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1935 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1936 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1937 1938 if (spdk_unlikely(max_len > mgmt_ch->iobuf.cache[0].large.bufsize)) { 1939 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1940 bdev_io_get_buf_complete(bdev_io, false); 1941 return; 1942 } 1943 1944 bdev_io->internal.buf.len = len; 1945 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1946 bdev_io_get_iobuf_cb); 1947 if (buf != NULL) { 1948 
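/* A buffer was available immediately from the per-thread iobuf cache or the global pool.
 * Otherwise spdk_iobuf_get() keeps internal.iobuf queued and bdev_io_get_iobuf_cb() above
 * finishes the setup once a buffer is freed. */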
_bdev_io_set_buf(bdev_io, buf, len); 1949 } 1950 } 1951 1952 void 1953 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1954 { 1955 struct spdk_bdev *bdev = bdev_io->bdev; 1956 uint64_t alignment; 1957 1958 assert(cb != NULL); 1959 bdev_io->internal.get_buf_cb = cb; 1960 1961 alignment = spdk_bdev_get_buf_align(bdev); 1962 1963 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1964 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1965 /* Buffer already present and aligned */ 1966 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1967 return; 1968 } 1969 1970 bdev_io_get_buf(bdev_io, len); 1971 } 1972 1973 static void 1974 _bdev_io_get_bounce_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1975 uint64_t len) 1976 { 1977 assert(cb != NULL); 1978 bdev_io->internal.get_buf_cb = cb; 1979 1980 bdev_io_get_buf(bdev_io, len); 1981 } 1982 1983 static void 1984 _bdev_io_get_accel_buf(struct spdk_bdev_io *bdev_io) 1985 { 1986 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1987 void *buf; 1988 int rc; 1989 1990 rc = spdk_accel_get_buf(ch->accel_channel, 1991 bdev_io->internal.buf.len, 1992 &buf, 1993 &bdev_io->u.bdev.memory_domain, 1994 &bdev_io->u.bdev.memory_domain_ctx); 1995 if (rc != 0) { 1996 bdev_queue_nomem_io_tail(ch->shared_resource, bdev_io, 1997 BDEV_IO_RETRY_STATE_GET_ACCEL_BUF); 1998 return; 1999 } 2000 2001 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf.len); 2002 } 2003 2004 static inline void 2005 bdev_io_get_accel_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 2006 uint64_t len) 2007 { 2008 bdev_io->internal.buf.len = len; 2009 bdev_io->internal.get_buf_cb = cb; 2010 2011 _bdev_io_get_accel_buf(bdev_io); 2012 } 2013 2014 SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_get_aux_buf, 2015 "spdk_bdev_io_get_aux_buf is deprecated", "v25.01", 0); 2016 2017 void 2018 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 2019 { 2020 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 2021 2022 SPDK_LOG_DEPRECATED(spdk_bdev_io_get_aux_buf); 2023 2024 assert(cb != NULL); 2025 assert(bdev_io->internal.get_aux_buf_cb == NULL); 2026 bdev_io->internal.get_aux_buf_cb = cb; 2027 bdev_io_get_buf(bdev_io, len); 2028 } 2029 2030 static int 2031 bdev_module_get_max_ctx_size(void) 2032 { 2033 struct spdk_bdev_module *bdev_module; 2034 int max_bdev_module_size = 0; 2035 2036 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2037 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 2038 max_bdev_module_size = bdev_module->get_ctx_size(); 2039 } 2040 } 2041 2042 return max_bdev_module_size; 2043 } 2044 2045 static void 2046 bdev_enable_histogram_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 2047 { 2048 if (!bdev->internal.histogram_enabled) { 2049 return; 2050 } 2051 2052 spdk_json_write_object_begin(w); 2053 spdk_json_write_named_string(w, "method", "bdev_enable_histogram"); 2054 2055 spdk_json_write_named_object_begin(w, "params"); 2056 spdk_json_write_named_string(w, "name", bdev->name); 2057 2058 spdk_json_write_named_bool(w, "enable", bdev->internal.histogram_enabled); 2059 2060 if (bdev->internal.histogram_io_type) { 2061 spdk_json_write_named_string(w, "opc", 2062 spdk_bdev_get_io_type_name(bdev->internal.histogram_io_type)); 2063 } 2064 2065 spdk_json_write_object_end(w); 2066 2067 spdk_json_write_object_end(w); 2068 } 2069 2070 static void 2071 
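/* Emits this bdev's QoS settings as a "bdev_set_qos_limit" RPC object, e.g. (illustrative
 * bdev name and value; the rate-limit keys come from qos_rpc_type):
 *   { "method": "bdev_set_qos_limit",
 *     "params": { "name": "Malloc0", "rw_ios_per_sec": 20000 } }
 */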
bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 2072 { 2073 int i; 2074 struct spdk_bdev_qos *qos = bdev->internal.qos; 2075 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 2076 2077 if (!qos) { 2078 return; 2079 } 2080 2081 spdk_bdev_get_qos_rate_limits(bdev, limits); 2082 2083 spdk_json_write_object_begin(w); 2084 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 2085 2086 spdk_json_write_named_object_begin(w, "params"); 2087 spdk_json_write_named_string(w, "name", bdev->name); 2088 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2089 if (limits[i] > 0) { 2090 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 2091 } 2092 } 2093 spdk_json_write_object_end(w); 2094 2095 spdk_json_write_object_end(w); 2096 } 2097 2098 void 2099 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 2100 { 2101 struct spdk_bdev_module *bdev_module; 2102 struct spdk_bdev *bdev; 2103 2104 assert(w != NULL); 2105 2106 spdk_json_write_array_begin(w); 2107 2108 spdk_json_write_object_begin(w); 2109 spdk_json_write_named_string(w, "method", "bdev_set_options"); 2110 spdk_json_write_named_object_begin(w, "params"); 2111 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 2112 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 2113 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 2114 spdk_json_write_named_uint32(w, "iobuf_small_cache_size", g_bdev_opts.iobuf_small_cache_size); 2115 spdk_json_write_named_uint32(w, "iobuf_large_cache_size", g_bdev_opts.iobuf_large_cache_size); 2116 spdk_json_write_object_end(w); 2117 spdk_json_write_object_end(w); 2118 2119 bdev_examine_allowlist_config_json(w); 2120 2121 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2122 if (bdev_module->config_json) { 2123 bdev_module->config_json(w); 2124 } 2125 } 2126 2127 spdk_spin_lock(&g_bdev_mgr.spinlock); 2128 2129 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 2130 if (bdev->fn_table->write_config_json) { 2131 bdev->fn_table->write_config_json(bdev, w); 2132 } 2133 2134 bdev_qos_config_json(bdev, w); 2135 bdev_enable_histogram_config_json(bdev, w); 2136 } 2137 2138 spdk_spin_unlock(&g_bdev_mgr.spinlock); 2139 2140 /* This has to be last RPC in array to make sure all bdevs finished examine */ 2141 spdk_json_write_object_begin(w); 2142 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 2143 spdk_json_write_object_end(w); 2144 2145 spdk_json_write_array_end(w); 2146 } 2147 2148 static void 2149 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 2150 { 2151 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 2152 struct spdk_bdev_io *bdev_io; 2153 2154 spdk_iobuf_channel_fini(&ch->iobuf); 2155 2156 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 2157 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2158 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2159 ch->per_thread_cache_count--; 2160 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2161 } 2162 2163 assert(ch->per_thread_cache_count == 0); 2164 } 2165 2166 static int 2167 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 2168 { 2169 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 2170 struct spdk_bdev_io *bdev_io; 2171 uint32_t i; 2172 int rc; 2173 2174 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", 2175 g_bdev_opts.iobuf_small_cache_size, 2176 g_bdev_opts.iobuf_large_cache_size); 2177 if (rc != 0) { 2178 SPDK_ERRLOG("Failed to create 
iobuf channel: %s\n", spdk_strerror(-rc)); 2179 return -1; 2180 } 2181 2182 STAILQ_INIT(&ch->per_thread_cache); 2183 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 2184 2185 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 2186 ch->per_thread_cache_count = 0; 2187 for (i = 0; i < ch->bdev_io_cache_size; i++) { 2188 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2189 if (bdev_io == NULL) { 2190 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 2191 assert(false); 2192 bdev_mgmt_channel_destroy(io_device, ctx_buf); 2193 return -1; 2194 } 2195 ch->per_thread_cache_count++; 2196 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2197 } 2198 2199 TAILQ_INIT(&ch->shared_resources); 2200 TAILQ_INIT(&ch->io_wait_queue); 2201 2202 return 0; 2203 } 2204 2205 static void 2206 bdev_init_complete(int rc) 2207 { 2208 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 2209 void *cb_arg = g_init_cb_arg; 2210 struct spdk_bdev_module *m; 2211 2212 g_bdev_mgr.init_complete = true; 2213 g_init_cb_fn = NULL; 2214 g_init_cb_arg = NULL; 2215 2216 /* 2217 * For modules that need to know when subsystem init is complete, 2218 * inform them now. 2219 */ 2220 if (rc == 0) { 2221 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2222 if (m->init_complete) { 2223 m->init_complete(); 2224 } 2225 } 2226 } 2227 2228 cb_fn(cb_arg, rc); 2229 } 2230 2231 static bool 2232 bdev_module_all_actions_completed(void) 2233 { 2234 struct spdk_bdev_module *m; 2235 2236 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2237 if (m->internal.action_in_progress > 0) { 2238 return false; 2239 } 2240 } 2241 return true; 2242 } 2243 2244 static void 2245 bdev_module_action_complete(void) 2246 { 2247 /* 2248 * Don't finish bdev subsystem initialization if 2249 * module pre-initialization is still in progress, or 2250 * the subsystem been already initialized. 2251 */ 2252 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 2253 return; 2254 } 2255 2256 /* 2257 * Check all bdev modules for inits/examinations in progress. If any 2258 * exist, return immediately since we cannot finish bdev subsystem 2259 * initialization until all are completed. 2260 */ 2261 if (!bdev_module_all_actions_completed()) { 2262 return; 2263 } 2264 2265 /* 2266 * Modules already finished initialization - now that all 2267 * the bdev modules have finished their asynchronous I/O 2268 * processing, the entire bdev layer can be marked as complete. 
2269 */ 2270 bdev_init_complete(0); 2271 } 2272 2273 static void 2274 bdev_module_action_done(struct spdk_bdev_module *module) 2275 { 2276 spdk_spin_lock(&module->internal.spinlock); 2277 assert(module->internal.action_in_progress > 0); 2278 module->internal.action_in_progress--; 2279 spdk_spin_unlock(&module->internal.spinlock); 2280 bdev_module_action_complete(); 2281 } 2282 2283 void 2284 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 2285 { 2286 assert(module->async_init); 2287 bdev_module_action_done(module); 2288 } 2289 2290 void 2291 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 2292 { 2293 bdev_module_action_done(module); 2294 } 2295 2296 /** The last initialized bdev module */ 2297 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 2298 2299 static void 2300 bdev_init_failed(void *cb_arg) 2301 { 2302 struct spdk_bdev_module *module = cb_arg; 2303 2304 spdk_spin_lock(&module->internal.spinlock); 2305 assert(module->internal.action_in_progress > 0); 2306 module->internal.action_in_progress--; 2307 spdk_spin_unlock(&module->internal.spinlock); 2308 bdev_init_complete(-1); 2309 } 2310 2311 static int 2312 bdev_modules_init(void) 2313 { 2314 struct spdk_bdev_module *module; 2315 int rc = 0; 2316 2317 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2318 g_resume_bdev_module = module; 2319 if (module->async_init) { 2320 spdk_spin_lock(&module->internal.spinlock); 2321 module->internal.action_in_progress = 1; 2322 spdk_spin_unlock(&module->internal.spinlock); 2323 } 2324 rc = module->module_init(); 2325 if (rc != 0) { 2326 /* Bump action_in_progress to prevent other modules from completion of modules_init 2327 * Send message to defer application shutdown until resources are cleaned up */ 2328 spdk_spin_lock(&module->internal.spinlock); 2329 module->internal.action_in_progress = 1; 2330 spdk_spin_unlock(&module->internal.spinlock); 2331 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 2332 return rc; 2333 } 2334 } 2335 2336 g_resume_bdev_module = NULL; 2337 return 0; 2338 } 2339 2340 void 2341 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 2342 { 2343 int rc = 0; 2344 char mempool_name[32]; 2345 2346 assert(cb_fn != NULL); 2347 2348 g_init_cb_fn = cb_fn; 2349 g_init_cb_arg = cb_arg; 2350 2351 spdk_notify_type_register("bdev_register"); 2352 spdk_notify_type_register("bdev_unregister"); 2353 2354 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2355 2356 rc = spdk_iobuf_register_module("bdev"); 2357 if (rc != 0) { 2358 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2359 bdev_init_complete(-1); 2360 return; 2361 } 2362 2363 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2364 g_bdev_opts.bdev_io_pool_size, 2365 sizeof(struct spdk_bdev_io) + 2366 bdev_module_get_max_ctx_size(), 2367 0, 2368 SPDK_ENV_NUMA_ID_ANY); 2369 2370 if (g_bdev_mgr.bdev_io_pool == NULL) { 2371 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2372 bdev_init_complete(-1); 2373 return; 2374 } 2375 2376 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2377 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2378 if (!g_bdev_mgr.zero_buffer) { 2379 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2380 bdev_init_complete(-1); 2381 return; 2382 } 2383 2384 #ifdef SPDK_CONFIG_VTUNE 2385 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2386 #endif 2387 2388 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 2389 
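/* g_bdev_mgr is registered as an io_device: each thread that opens it gets a
 * spdk_bdev_mgmt_channel constructed and torn down by the callbacks named here. */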
bdev_mgmt_channel_destroy, 2390 sizeof(struct spdk_bdev_mgmt_channel), 2391 "bdev_mgr"); 2392 2393 rc = bdev_modules_init(); 2394 g_bdev_mgr.module_init_complete = true; 2395 if (rc != 0) { 2396 SPDK_ERRLOG("bdev modules init failed\n"); 2397 return; 2398 } 2399 2400 bdev_module_action_complete(); 2401 } 2402 2403 static void 2404 bdev_mgr_unregister_cb(void *io_device) 2405 { 2406 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2407 2408 if (g_bdev_mgr.bdev_io_pool) { 2409 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2410 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2411 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2412 g_bdev_opts.bdev_io_pool_size); 2413 } 2414 2415 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2416 } 2417 2418 spdk_free(g_bdev_mgr.zero_buffer); 2419 2420 bdev_examine_allowlist_free(); 2421 2422 cb_fn(g_fini_cb_arg); 2423 g_fini_cb_fn = NULL; 2424 g_fini_cb_arg = NULL; 2425 g_bdev_mgr.init_complete = false; 2426 g_bdev_mgr.module_init_complete = false; 2427 } 2428 2429 static void 2430 bdev_module_fini_iter(void *arg) 2431 { 2432 struct spdk_bdev_module *bdev_module; 2433 2434 /* FIXME: Handling initialization failures is broken now, 2435 * so we won't even try cleaning up after successfully 2436 * initialized modules. if module_init_complete is false, 2437 * just call spdk_bdev_mgr_unregister_cb 2438 */ 2439 if (!g_bdev_mgr.module_init_complete) { 2440 bdev_mgr_unregister_cb(NULL); 2441 return; 2442 } 2443 2444 /* Start iterating from the last touched module */ 2445 if (!g_resume_bdev_module) { 2446 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2447 } else { 2448 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2449 internal.tailq); 2450 } 2451 2452 while (bdev_module) { 2453 if (bdev_module->async_fini) { 2454 /* Save our place so we can resume later. We must 2455 * save the variable here, before calling module_fini() 2456 * below, because in some cases the module may immediately 2457 * call spdk_bdev_module_fini_done() and re-enter 2458 * this function to continue iterating. */ 2459 g_resume_bdev_module = bdev_module; 2460 } 2461 2462 if (bdev_module->module_fini) { 2463 bdev_module->module_fini(); 2464 } 2465 2466 if (bdev_module->async_fini) { 2467 return; 2468 } 2469 2470 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2471 internal.tailq); 2472 } 2473 2474 g_resume_bdev_module = NULL; 2475 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2476 } 2477 2478 void 2479 spdk_bdev_module_fini_done(void) 2480 { 2481 if (spdk_get_thread() != g_fini_thread) { 2482 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2483 } else { 2484 bdev_module_fini_iter(NULL); 2485 } 2486 } 2487 2488 static void 2489 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2490 { 2491 struct spdk_bdev *bdev = cb_arg; 2492 2493 if (bdeverrno && bdev) { 2494 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2495 bdev->name); 2496 2497 /* 2498 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2499 * bdev; try to continue by manually removing this bdev from the list and continue 2500 * with the next bdev in the list. 
2501 */
2502 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
2503 }
2504 
2505 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
2506 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n");
2507 /*
2508 * Bdev module finish needs to be deferred as we might be in the middle of some context
2509 * (like bdev part free) that will use this bdev (or private bdev driver ctx data)
2510 * after returning.
2511 */
2512 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL);
2513 return;
2514 }
2515 
2516 /*
2517 * Unregister the last unclaimed bdev in the list, to ensure that bdev subsystem
2518 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity
2519 * to detect clean shutdown as opposed to run-time hot removal of the underlying
2520 * base bdevs.
2521 *
2522 * Also, walk the list in reverse order.
2523 */
2524 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
2525 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
2526 spdk_spin_lock(&bdev->internal.spinlock);
2527 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
2528 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev);
2529 spdk_spin_unlock(&bdev->internal.spinlock);
2530 continue;
2531 }
2532 spdk_spin_unlock(&bdev->internal.spinlock);
2533 
2534 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name);
2535 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
2536 return;
2537 }
2538 
2539 /*
2540 * If any bdev fails to unclaim its underlying bdev properly, we may be left with a
2541 * bdev list consisting of claimed bdevs only (if claims are managed
2542 * correctly, this would mean there's a loop in the claims graph, which is
2543 * clearly impossible). In that case, warn and unregister the last bdev on the list.
2544 */
2545 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
2546 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
2547 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name);
2548 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
2549 return;
2550 }
2551 }
2552 
2553 static void
2554 bdev_module_fini_start_iter(void *arg)
2555 {
2556 struct spdk_bdev_module *bdev_module;
2557 
2558 if (!g_resume_bdev_module) {
2559 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
2560 } else {
2561 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq);
2562 }
2563 
2564 while (bdev_module) {
2565 if (bdev_module->async_fini_start) {
2566 /* Save our place so we can resume later. We must
2567 * save the variable here, before calling fini_start()
2568 * below, because in some cases the module may immediately
2569 * call spdk_bdev_module_fini_start_done() and re-enter
2570 * this function to continue iterating.
*/ 2571 g_resume_bdev_module = bdev_module; 2572 } 2573 2574 if (bdev_module->fini_start) { 2575 bdev_module->fini_start(); 2576 } 2577 2578 if (bdev_module->async_fini_start) { 2579 return; 2580 } 2581 2582 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2583 } 2584 2585 g_resume_bdev_module = NULL; 2586 2587 bdev_finish_unregister_bdevs_iter(NULL, 0); 2588 } 2589 2590 void 2591 spdk_bdev_module_fini_start_done(void) 2592 { 2593 if (spdk_get_thread() != g_fini_thread) { 2594 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2595 } else { 2596 bdev_module_fini_start_iter(NULL); 2597 } 2598 } 2599 2600 static void 2601 bdev_finish_wait_for_examine_done(void *cb_arg) 2602 { 2603 bdev_module_fini_start_iter(NULL); 2604 } 2605 2606 static void bdev_open_async_fini(void); 2607 2608 void 2609 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2610 { 2611 int rc; 2612 2613 assert(cb_fn != NULL); 2614 2615 g_fini_thread = spdk_get_thread(); 2616 2617 g_fini_cb_fn = cb_fn; 2618 g_fini_cb_arg = cb_arg; 2619 2620 bdev_open_async_fini(); 2621 2622 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2623 if (rc != 0) { 2624 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2625 bdev_finish_wait_for_examine_done(NULL); 2626 } 2627 } 2628 2629 struct spdk_bdev_io * 2630 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2631 { 2632 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2633 struct spdk_bdev_io *bdev_io; 2634 2635 if (ch->per_thread_cache_count > 0) { 2636 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2637 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2638 ch->per_thread_cache_count--; 2639 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2640 /* 2641 * Don't try to look for bdev_ios in the global pool if there are 2642 * waiters on bdev_ios - we don't want this caller to jump the line. 2643 */ 2644 bdev_io = NULL; 2645 } else { 2646 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2647 } 2648 2649 return bdev_io; 2650 } 2651 2652 void 2653 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2654 { 2655 struct spdk_bdev_mgmt_channel *ch; 2656 2657 assert(bdev_io != NULL); 2658 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2659 2660 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2661 2662 if (bdev_io->internal.f.has_buf) { 2663 bdev_io_put_buf(bdev_io); 2664 } 2665 2666 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2667 ch->per_thread_cache_count++; 2668 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2669 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2670 struct spdk_bdev_io_wait_entry *entry; 2671 2672 entry = TAILQ_FIRST(&ch->io_wait_queue); 2673 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2674 entry->cb_fn(entry->cb_arg); 2675 } 2676 } else { 2677 /* We should never have a full cache with entries on the io wait queue. 
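* (Waiters are drained in the branch above each time a bdev_io goes back into the
* per-thread cache.)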
*/ 2678 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2679 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2680 } 2681 } 2682 2683 static bool 2684 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2685 { 2686 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2687 2688 switch (limit) { 2689 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2690 return true; 2691 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2692 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2693 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2694 return false; 2695 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2696 default: 2697 return false; 2698 } 2699 } 2700 2701 static bool 2702 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2703 { 2704 switch (bdev_io->type) { 2705 case SPDK_BDEV_IO_TYPE_NVME_IO: 2706 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2707 case SPDK_BDEV_IO_TYPE_READ: 2708 case SPDK_BDEV_IO_TYPE_WRITE: 2709 return true; 2710 case SPDK_BDEV_IO_TYPE_ZCOPY: 2711 if (bdev_io->u.bdev.zcopy.start) { 2712 return true; 2713 } else { 2714 return false; 2715 } 2716 default: 2717 return false; 2718 } 2719 } 2720 2721 static bool 2722 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2723 { 2724 switch (bdev_io->type) { 2725 case SPDK_BDEV_IO_TYPE_NVME_IO: 2726 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2727 /* Bit 1 (0x2) set for read operation */ 2728 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2729 return true; 2730 } else { 2731 return false; 2732 } 2733 case SPDK_BDEV_IO_TYPE_READ: 2734 return true; 2735 case SPDK_BDEV_IO_TYPE_ZCOPY: 2736 /* Populate to read from disk */ 2737 if (bdev_io->u.bdev.zcopy.populate) { 2738 return true; 2739 } else { 2740 return false; 2741 } 2742 default: 2743 return false; 2744 } 2745 } 2746 2747 static uint64_t 2748 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2749 { 2750 uint32_t blocklen = bdev_io_get_block_size(bdev_io); 2751 2752 switch (bdev_io->type) { 2753 case SPDK_BDEV_IO_TYPE_NVME_IO: 2754 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2755 return bdev_io->u.nvme_passthru.nbytes; 2756 case SPDK_BDEV_IO_TYPE_READ: 2757 case SPDK_BDEV_IO_TYPE_WRITE: 2758 return bdev_io->u.bdev.num_blocks * blocklen; 2759 case SPDK_BDEV_IO_TYPE_ZCOPY: 2760 /* Track the data in the start phase only */ 2761 if (bdev_io->u.bdev.zcopy.start) { 2762 return bdev_io->u.bdev.num_blocks * blocklen; 2763 } else { 2764 return 0; 2765 } 2766 default: 2767 return 0; 2768 } 2769 } 2770 2771 static inline bool 2772 bdev_qos_rw_queue_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2773 { 2774 int64_t remaining_this_timeslice; 2775 2776 if (!limit->max_per_timeslice) { 2777 /* The QoS is disabled */ 2778 return false; 2779 } 2780 2781 remaining_this_timeslice = __atomic_sub_fetch(&limit->remaining_this_timeslice, delta, 2782 __ATOMIC_RELAXED); 2783 if (remaining_this_timeslice + (int64_t)delta > 0) { 2784 /* There was still a quota for this delta -> the IO shouldn't be queued 2785 * 2786 * We allow a slight quota overrun here so an IO bigger than the per-timeslice 2787 * quota can be allowed once a while. Such overrun then taken into account in 2788 * the QoS poller, where the next timeslice quota is calculated. 2789 */ 2790 return false; 2791 } 2792 2793 /* There was no quota for this delta -> the IO should be queued 2794 * The remaining_this_timeslice must be rewinded so it reflects the real 2795 * amount of IOs or bytes allowed. 
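* Example (illustrative numbers): with 512 bytes left in a byte limit, an 8192-byte I/O
* is still allowed and drives the counter to -7680; that deficit is charged against the
* next timeslice. A second I/O in the same timeslice then sees a non-positive counter,
* is queued, and the add below rewinds the counter by the amount just subtracted.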
2796 */ 2797 __atomic_add_fetch( 2798 &limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2799 return true; 2800 } 2801 2802 static inline void 2803 bdev_qos_rw_rewind_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2804 { 2805 __atomic_add_fetch(&limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2806 } 2807 2808 static bool 2809 bdev_qos_rw_iops_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2810 { 2811 return bdev_qos_rw_queue_io(limit, io, 1); 2812 } 2813 2814 static void 2815 bdev_qos_rw_iops_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2816 { 2817 bdev_qos_rw_rewind_io(limit, io, 1); 2818 } 2819 2820 static bool 2821 bdev_qos_rw_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2822 { 2823 return bdev_qos_rw_queue_io(limit, io, bdev_get_io_size_in_byte(io)); 2824 } 2825 2826 static void 2827 bdev_qos_rw_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2828 { 2829 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2830 } 2831 2832 static bool 2833 bdev_qos_r_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2834 { 2835 if (bdev_is_read_io(io) == false) { 2836 return false; 2837 } 2838 2839 return bdev_qos_rw_bps_queue(limit, io); 2840 } 2841 2842 static void 2843 bdev_qos_r_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2844 { 2845 if (bdev_is_read_io(io) != false) { 2846 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2847 } 2848 } 2849 2850 static bool 2851 bdev_qos_w_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2852 { 2853 if (bdev_is_read_io(io) == true) { 2854 return false; 2855 } 2856 2857 return bdev_qos_rw_bps_queue(limit, io); 2858 } 2859 2860 static void 2861 bdev_qos_w_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2862 { 2863 if (bdev_is_read_io(io) != true) { 2864 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2865 } 2866 } 2867 2868 static void 2869 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2870 { 2871 int i; 2872 2873 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2874 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2875 qos->rate_limits[i].queue_io = NULL; 2876 continue; 2877 } 2878 2879 switch (i) { 2880 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2881 qos->rate_limits[i].queue_io = bdev_qos_rw_iops_queue; 2882 qos->rate_limits[i].rewind_quota = bdev_qos_rw_iops_rewind_quota; 2883 break; 2884 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2885 qos->rate_limits[i].queue_io = bdev_qos_rw_bps_queue; 2886 qos->rate_limits[i].rewind_quota = bdev_qos_rw_bps_rewind_quota; 2887 break; 2888 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2889 qos->rate_limits[i].queue_io = bdev_qos_r_bps_queue; 2890 qos->rate_limits[i].rewind_quota = bdev_qos_r_bps_rewind_quota; 2891 break; 2892 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2893 qos->rate_limits[i].queue_io = bdev_qos_w_bps_queue; 2894 qos->rate_limits[i].rewind_quota = bdev_qos_w_bps_rewind_quota; 2895 break; 2896 default: 2897 break; 2898 } 2899 } 2900 } 2901 2902 static void 2903 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2904 struct spdk_bdev_io *bdev_io, 2905 enum spdk_bdev_io_status status) 2906 { 2907 bdev_io->internal.f.in_submit_request = true; 2908 bdev_io_increment_outstanding(bdev_ch, bdev_ch->shared_resource); 2909 spdk_bdev_io_complete(bdev_io, status); 2910 bdev_io->internal.f.in_submit_request = false; 
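/* in_submit_request is raised only around the completion call above, so the completion
 * path can detect that this I/O was completed from within its own submission. */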
2911 } 2912 2913 static inline void 2914 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2915 { 2916 struct spdk_bdev *bdev = bdev_io->bdev; 2917 struct spdk_io_channel *ch = bdev_ch->channel; 2918 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2919 2920 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2921 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2922 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2923 2924 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2925 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2926 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2927 SPDK_BDEV_IO_STATUS_SUCCESS); 2928 return; 2929 } 2930 } 2931 2932 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2933 bdev_io->bdev->split_on_write_unit && 2934 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2935 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2936 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2937 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2938 return; 2939 } 2940 2941 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2942 bdev_io_increment_outstanding(bdev_ch, shared_resource); 2943 bdev_io->internal.f.in_submit_request = true; 2944 bdev_submit_request(bdev, ch, bdev_io); 2945 bdev_io->internal.f.in_submit_request = false; 2946 } else { 2947 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT); 2948 if (shared_resource->nomem_threshold == 0 && shared_resource->io_outstanding == 0) { 2949 /* Special case when we have nomem IOs and no outstanding IOs which completions 2950 * could trigger retry of queued IOs */ 2951 bdev_shared_ch_retry_io(shared_resource); 2952 } 2953 } 2954 } 2955 2956 static bool 2957 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2958 { 2959 int i; 2960 2961 if (bdev_qos_io_to_limit(bdev_io) == true) { 2962 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2963 if (!qos->rate_limits[i].queue_io) { 2964 continue; 2965 } 2966 2967 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2968 bdev_io) == true) { 2969 for (i -= 1; i >= 0 ; i--) { 2970 if (!qos->rate_limits[i].queue_io) { 2971 continue; 2972 } 2973 2974 qos->rate_limits[i].rewind_quota(&qos->rate_limits[i], bdev_io); 2975 } 2976 return true; 2977 } 2978 } 2979 } 2980 2981 return false; 2982 } 2983 2984 static int 2985 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2986 { 2987 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2988 int submitted_ios = 0; 2989 2990 TAILQ_FOREACH_SAFE(bdev_io, &ch->qos_queued_io, internal.link, tmp) { 2991 if (!bdev_qos_queue_io(qos, bdev_io)) { 2992 TAILQ_REMOVE(&ch->qos_queued_io, bdev_io, internal.link); 2993 bdev_io_do_submit(ch, bdev_io); 2994 2995 submitted_ios++; 2996 } 2997 } 2998 2999 return submitted_ios; 3000 } 3001 3002 static void 3003 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 3004 { 3005 int rc; 3006 3007 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 3008 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 3009 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 3010 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 3011 &bdev_io->internal.waitq_entry); 3012 if (rc != 0) { 3013 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 3014 bdev_io->internal.status = 
SPDK_BDEV_IO_STATUS_FAILED; 3015 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3016 } 3017 } 3018 3019 static bool 3020 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 3021 { 3022 uint32_t io_boundary; 3023 struct spdk_bdev *bdev = bdev_io->bdev; 3024 uint32_t max_segment_size = bdev->max_segment_size; 3025 uint32_t max_size = bdev->max_rw_size; 3026 int max_segs = bdev->max_num_segments; 3027 3028 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 3029 io_boundary = bdev->write_unit_size; 3030 } else if (bdev->split_on_optimal_io_boundary) { 3031 io_boundary = bdev->optimal_io_boundary; 3032 } else { 3033 io_boundary = 0; 3034 } 3035 3036 if (spdk_likely(!io_boundary && !max_segs && !max_segment_size && !max_size)) { 3037 return false; 3038 } 3039 3040 if (io_boundary) { 3041 uint64_t start_stripe, end_stripe; 3042 3043 start_stripe = bdev_io->u.bdev.offset_blocks; 3044 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 3045 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 3046 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 3047 start_stripe >>= spdk_u32log2(io_boundary); 3048 end_stripe >>= spdk_u32log2(io_boundary); 3049 } else { 3050 start_stripe /= io_boundary; 3051 end_stripe /= io_boundary; 3052 } 3053 3054 if (start_stripe != end_stripe) { 3055 return true; 3056 } 3057 } 3058 3059 if (max_segs) { 3060 if (bdev_io->u.bdev.iovcnt > max_segs) { 3061 return true; 3062 } 3063 } 3064 3065 if (max_segment_size) { 3066 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 3067 if (bdev_io->u.bdev.iovs[i].iov_len > max_segment_size) { 3068 return true; 3069 } 3070 } 3071 } 3072 3073 if (max_size) { 3074 if (bdev_io->u.bdev.num_blocks > max_size) { 3075 return true; 3076 } 3077 } 3078 3079 return false; 3080 } 3081 3082 static bool 3083 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 3084 { 3085 uint32_t num_unmap_segments; 3086 3087 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 3088 return false; 3089 } 3090 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 3091 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 3092 return true; 3093 } 3094 3095 return false; 3096 } 3097 3098 static bool 3099 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 3100 { 3101 if (!bdev_io->bdev->max_write_zeroes) { 3102 return false; 3103 } 3104 3105 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 3106 return true; 3107 } 3108 3109 return false; 3110 } 3111 3112 static bool 3113 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 3114 { 3115 if (bdev_io->bdev->max_copy != 0 && 3116 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 3117 return true; 3118 } 3119 3120 return false; 3121 } 3122 3123 static bool 3124 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 3125 { 3126 switch (bdev_io->type) { 3127 case SPDK_BDEV_IO_TYPE_READ: 3128 case SPDK_BDEV_IO_TYPE_WRITE: 3129 return bdev_rw_should_split(bdev_io); 3130 case SPDK_BDEV_IO_TYPE_UNMAP: 3131 return bdev_unmap_should_split(bdev_io); 3132 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3133 return bdev_write_zeroes_should_split(bdev_io); 3134 case SPDK_BDEV_IO_TYPE_COPY: 3135 return bdev_copy_should_split(bdev_io); 3136 default: 3137 return false; 3138 } 3139 } 3140 3141 static uint32_t 3142 _to_next_boundary(uint64_t offset, uint32_t boundary) 3143 { 3144 return (boundary - (offset % boundary)); 3145 } 3146 3147 static void 
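/* (Worked example for _to_next_boundary() above: offset 10 with an 8-block boundary
 * gives 8 - (10 % 8) = 6, i.e. at most 6 more blocks fit before the next stripe edge.) */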
bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 3148 3149 static void _bdev_rw_split(void *_bdev_io); 3150 3151 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 3152 3153 static void 3154 _bdev_unmap_split(void *_bdev_io) 3155 { 3156 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 3157 } 3158 3159 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 3160 3161 static void 3162 _bdev_write_zeroes_split(void *_bdev_io) 3163 { 3164 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 3165 } 3166 3167 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 3168 3169 static void 3170 _bdev_copy_split(void *_bdev_io) 3171 { 3172 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 3173 } 3174 3175 static int 3176 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 3177 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 3178 { 3179 int rc; 3180 uint64_t current_offset, current_remaining, current_src_offset; 3181 spdk_bdev_io_wait_cb io_wait_fn; 3182 3183 current_offset = *offset; 3184 current_remaining = *remaining; 3185 3186 assert(bdev_io->internal.f.split); 3187 3188 bdev_io->internal.split.outstanding++; 3189 3190 io_wait_fn = _bdev_rw_split; 3191 switch (bdev_io->type) { 3192 case SPDK_BDEV_IO_TYPE_READ: 3193 assert(bdev_io->u.bdev.accel_sequence == NULL); 3194 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 3195 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3196 iov, iovcnt, md_buf, current_offset, 3197 num_blocks, 3198 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 3199 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL, 3200 NULL, 3201 bdev_io->u.bdev.dif_check_flags, 3202 bdev_io_split_done, bdev_io); 3203 break; 3204 case SPDK_BDEV_IO_TYPE_WRITE: 3205 assert(bdev_io->u.bdev.accel_sequence == NULL); 3206 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 3207 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3208 iov, iovcnt, md_buf, current_offset, 3209 num_blocks, 3210 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 3211 bdev_io_use_memory_domain(bdev_io) ? 
bdev_io->internal.memory_domain_ctx : NULL, 3212 NULL, 3213 bdev_io->u.bdev.dif_check_flags, 3214 bdev_io->u.bdev.nvme_cdw12.raw, 3215 bdev_io->u.bdev.nvme_cdw13.raw, 3216 bdev_io_split_done, bdev_io); 3217 break; 3218 case SPDK_BDEV_IO_TYPE_UNMAP: 3219 io_wait_fn = _bdev_unmap_split; 3220 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 3221 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3222 current_offset, num_blocks, 3223 bdev_io_split_done, bdev_io); 3224 break; 3225 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3226 io_wait_fn = _bdev_write_zeroes_split; 3227 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 3228 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3229 current_offset, num_blocks, 3230 bdev_io_split_done, bdev_io); 3231 break; 3232 case SPDK_BDEV_IO_TYPE_COPY: 3233 io_wait_fn = _bdev_copy_split; 3234 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 3235 (current_offset - bdev_io->u.bdev.offset_blocks); 3236 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 3237 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3238 current_offset, current_src_offset, num_blocks, 3239 bdev_io_split_done, bdev_io); 3240 break; 3241 default: 3242 assert(false); 3243 rc = -EINVAL; 3244 break; 3245 } 3246 3247 if (rc == 0) { 3248 current_offset += num_blocks; 3249 current_remaining -= num_blocks; 3250 bdev_io->internal.split.current_offset_blocks = current_offset; 3251 bdev_io->internal.split.remaining_num_blocks = current_remaining; 3252 *offset = current_offset; 3253 *remaining = current_remaining; 3254 } else { 3255 bdev_io->internal.split.outstanding--; 3256 if (rc == -ENOMEM) { 3257 if (bdev_io->internal.split.outstanding == 0) { 3258 /* No I/O is outstanding. Hence we should wait here. */ 3259 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 3260 } 3261 } else { 3262 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3263 if (bdev_io->internal.split.outstanding == 0) { 3264 bdev_ch_remove_from_io_submitted(bdev_io); 3265 spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id, 3266 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx, 3267 bdev_io->internal.ch->queue_depth); 3268 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3269 } 3270 } 3271 } 3272 3273 return rc; 3274 } 3275 3276 static void 3277 _bdev_rw_split(void *_bdev_io) 3278 { 3279 struct iovec *parent_iov, *iov; 3280 struct spdk_bdev_io *bdev_io = _bdev_io; 3281 struct spdk_bdev *bdev = bdev_io->bdev; 3282 uint64_t parent_offset, current_offset, remaining; 3283 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 3284 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 3285 uint32_t iovcnt, iov_len, child_iovsize; 3286 uint32_t blocklen; 3287 uint32_t io_boundary; 3288 uint32_t max_segment_size = bdev->max_segment_size; 3289 uint32_t max_child_iovcnt = bdev->max_num_segments; 3290 uint32_t max_size = bdev->max_rw_size; 3291 void *md_buf = NULL; 3292 int rc; 3293 3294 blocklen = bdev_io_get_block_size(bdev_io); 3295 3296 max_size = max_size ? max_size : UINT32_MAX; 3297 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 3298 max_child_iovcnt = max_child_iovcnt ? 
spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 3299 SPDK_BDEV_IO_NUM_CHILD_IOV; 3300 3301 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 3302 io_boundary = bdev->write_unit_size; 3303 } else if (bdev->split_on_optimal_io_boundary) { 3304 io_boundary = bdev->optimal_io_boundary; 3305 } else { 3306 io_boundary = UINT32_MAX; 3307 } 3308 3309 assert(bdev_io->internal.f.split); 3310 3311 remaining = bdev_io->internal.split.remaining_num_blocks; 3312 current_offset = bdev_io->internal.split.current_offset_blocks; 3313 parent_offset = bdev_io->u.bdev.offset_blocks; 3314 parent_iov_offset = (current_offset - parent_offset) * blocklen; 3315 parent_iovcnt = bdev_io->u.bdev.iovcnt; 3316 3317 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 3318 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3319 if (parent_iov_offset < parent_iov->iov_len) { 3320 break; 3321 } 3322 parent_iov_offset -= parent_iov->iov_len; 3323 } 3324 3325 child_iovcnt = 0; 3326 while (remaining > 0 && parent_iovpos < parent_iovcnt && 3327 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 3328 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3329 to_next_boundary = spdk_min(remaining, to_next_boundary); 3330 to_next_boundary = spdk_min(max_size, to_next_boundary); 3331 to_next_boundary_bytes = to_next_boundary * blocklen; 3332 3333 iov = &bdev_io->child_iov[child_iovcnt]; 3334 iovcnt = 0; 3335 3336 if (bdev_io->u.bdev.md_buf) { 3337 md_buf = (char *)bdev_io->u.bdev.md_buf + 3338 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3339 } 3340 3341 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3342 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3343 iovcnt < child_iovsize) { 3344 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3345 iov_len = parent_iov->iov_len - parent_iov_offset; 3346 3347 iov_len = spdk_min(iov_len, max_segment_size); 3348 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 3349 to_next_boundary_bytes -= iov_len; 3350 3351 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 3352 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 3353 3354 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 3355 parent_iov_offset += iov_len; 3356 } else { 3357 parent_iovpos++; 3358 parent_iov_offset = 0; 3359 } 3360 child_iovcnt++; 3361 iovcnt++; 3362 } 3363 3364 if (to_next_boundary_bytes > 0) { 3365 /* We had to stop this child I/O early because we ran out of 3366 * child_iov space or were limited by max_num_segments. 3367 * Ensure the iovs to be aligned with block size and 3368 * then adjust to_next_boundary before starting the 3369 * child I/O. 
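* (The code below trims the partially filled tail block out of the child iovs and
* shrinks to_next_boundary so the child I/O covers whole blocks only.)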
3370 */
3371 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV ||
3372 iovcnt == child_iovsize);
3373 to_last_block_bytes = to_next_boundary_bytes % blocklen;
3374 if (to_last_block_bytes != 0) {
3375 uint32_t child_iovpos = child_iovcnt - 1;
3376 /* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV
3377 * so the loop will naturally end
3378 */
3379 
3380 to_last_block_bytes = blocklen - to_last_block_bytes;
3381 to_next_boundary_bytes += to_last_block_bytes;
3382 while (to_last_block_bytes > 0 && iovcnt > 0) {
3383 iov_len = spdk_min(to_last_block_bytes,
3384 bdev_io->child_iov[child_iovpos].iov_len);
3385 bdev_io->child_iov[child_iovpos].iov_len -= iov_len;
3386 if (bdev_io->child_iov[child_iovpos].iov_len == 0) {
3387 child_iovpos--;
3388 if (--iovcnt == 0) {
3389 /* If the child IO is less than a block size, just return.
3390 * If the first child IO of any split round is less than
3391 * a block size, fail the parent I/O with an error.
3392 */
3393 if (bdev_io->internal.split.outstanding == 0) {
3394 SPDK_ERRLOG("The first child io was less than a block size\n");
3395 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3396 bdev_ch_remove_from_io_submitted(bdev_io);
3397 spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id,
3398 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx,
3399 bdev_io->internal.ch->queue_depth);
3400 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
3401 }
3402 
3403 return;
3404 }
3405 }
3406 
3407 to_last_block_bytes -= iov_len;
3408 
3409 if (parent_iov_offset == 0) {
3410 parent_iovpos--;
3411 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len;
3412 }
3413 parent_iov_offset -= iov_len;
3414 }
3415 
3416 assert(to_last_block_bytes == 0);
3417 }
3418 to_next_boundary -= to_next_boundary_bytes / blocklen;
3419 }
3420 
3421 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary,
3422 &current_offset, &remaining);
3423 if (spdk_unlikely(rc)) {
3424 return;
3425 }
3426 }
3427 }
3428 
3429 static void
3430 bdev_unmap_split(struct spdk_bdev_io *bdev_io)
3431 {
3432 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks;
3433 uint32_t num_children_reqs = 0;
3434 int rc;
3435 
3436 assert(bdev_io->internal.f.split);
3437 
3438 offset = bdev_io->internal.split.current_offset_blocks;
3439 remaining = bdev_io->internal.split.remaining_num_blocks;
3440 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments;
3441 
3442 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
3443 unmap_blocks = spdk_min(remaining, max_unmap_blocks);
3444 
3445 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks,
3446 &offset, &remaining);
3447 if (spdk_likely(rc == 0)) {
3448 num_children_reqs++;
3449 } else {
3450 return;
3451 }
3452 }
3453 }
3454 
3455 static void
3456 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io)
3457 {
3458 uint64_t offset, write_zeroes_blocks, remaining;
3459 uint32_t num_children_reqs = 0;
3460 int rc;
3461 
3462 assert(bdev_io->internal.f.split);
3463 
3464 offset = bdev_io->internal.split.current_offset_blocks;
3465 remaining = bdev_io->internal.split.remaining_num_blocks;
3466 
3467 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
3468 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes);
3469 
3470 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks,
3471 &offset, &remaining);
3472 if (spdk_likely(rc == 0)) {
3473 num_children_reqs++;
3474 } else {
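/* Submitting this child failed (e.g. -ENOMEM): stop issuing more children now.
 * bdev_io_split_submit() has already queued a retry, left in-flight children to
 * resume the split, or failed the parent I/O. */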
3475 return; 3476 } 3477 } 3478 } 3479 3480 static void 3481 bdev_copy_split(struct spdk_bdev_io *bdev_io) 3482 { 3483 uint64_t offset, copy_blocks, remaining; 3484 uint32_t num_children_reqs = 0; 3485 int rc; 3486 3487 assert(bdev_io->internal.f.split); 3488 3489 offset = bdev_io->internal.split.current_offset_blocks; 3490 remaining = bdev_io->internal.split.remaining_num_blocks; 3491 3492 assert(bdev_io->bdev->max_copy != 0); 3493 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3494 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3495 3496 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3497 &offset, &remaining); 3498 if (spdk_likely(rc == 0)) { 3499 num_children_reqs++; 3500 } else { 3501 return; 3502 } 3503 } 3504 } 3505 3506 static void 3507 parent_bdev_io_complete(void *ctx, int rc) 3508 { 3509 struct spdk_bdev_io *parent_io = ctx; 3510 3511 if (rc) { 3512 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3513 } 3514 3515 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3516 parent_io->internal.caller_ctx); 3517 } 3518 3519 static void 3520 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3521 { 3522 struct spdk_bdev_io *bdev_io = ctx; 3523 3524 /* u.bdev.accel_sequence should have already been cleared at this point */ 3525 assert(bdev_io->u.bdev.accel_sequence == NULL); 3526 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3527 bdev_io->internal.f.has_accel_sequence = false; 3528 3529 if (spdk_unlikely(status != 0)) { 3530 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3531 } 3532 3533 parent_bdev_io_complete(bdev_io, status); 3534 } 3535 3536 static void 3537 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3538 { 3539 struct spdk_bdev_io *parent_io = cb_arg; 3540 3541 spdk_bdev_free_io(bdev_io); 3542 3543 assert(parent_io->internal.f.split); 3544 3545 if (!success) { 3546 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3547 /* If any child I/O failed, stop further splitting process. */ 3548 parent_io->internal.split.current_offset_blocks += parent_io->internal.split.remaining_num_blocks; 3549 parent_io->internal.split.remaining_num_blocks = 0; 3550 } 3551 parent_io->internal.split.outstanding--; 3552 if (parent_io->internal.split.outstanding != 0) { 3553 return; 3554 } 3555 3556 /* 3557 * Parent I/O finishes when all blocks are consumed. 3558 */ 3559 if (parent_io->internal.split.remaining_num_blocks == 0) { 3560 assert(parent_io->internal.cb != bdev_io_split_done); 3561 bdev_ch_remove_from_io_submitted(parent_io); 3562 spdk_trace_record(TRACE_BDEV_IO_DONE, parent_io->internal.ch->trace_id, 3563 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx, 3564 parent_io->internal.ch->queue_depth); 3565 3566 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3567 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3568 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3569 return; 3570 } else if (parent_io->internal.f.has_bounce_buf && 3571 !bdev_io_use_accel_sequence(bdev_io)) { 3572 /* bdev IO will be completed in the callback */ 3573 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3574 return; 3575 } 3576 } 3577 3578 parent_bdev_io_complete(parent_io, 0); 3579 return; 3580 } 3581 3582 /* 3583 * Continue with the splitting process. This function will complete the parent I/O if the 3584 * splitting is done. 
3585 */ 3586 switch (parent_io->type) { 3587 case SPDK_BDEV_IO_TYPE_READ: 3588 case SPDK_BDEV_IO_TYPE_WRITE: 3589 _bdev_rw_split(parent_io); 3590 break; 3591 case SPDK_BDEV_IO_TYPE_UNMAP: 3592 bdev_unmap_split(parent_io); 3593 break; 3594 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3595 bdev_write_zeroes_split(parent_io); 3596 break; 3597 case SPDK_BDEV_IO_TYPE_COPY: 3598 bdev_copy_split(parent_io); 3599 break; 3600 default: 3601 assert(false); 3602 break; 3603 } 3604 } 3605 3606 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3607 bool success); 3608 3609 static void 3610 bdev_io_split(struct spdk_bdev_io *bdev_io) 3611 { 3612 assert(bdev_io_should_split(bdev_io)); 3613 assert(bdev_io->internal.f.split); 3614 3615 bdev_io->internal.split.current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3616 bdev_io->internal.split.remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3617 bdev_io->internal.split.outstanding = 0; 3618 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3619 3620 switch (bdev_io->type) { 3621 case SPDK_BDEV_IO_TYPE_READ: 3622 case SPDK_BDEV_IO_TYPE_WRITE: 3623 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3624 _bdev_rw_split(bdev_io); 3625 } else { 3626 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3627 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3628 bdev_io->u.bdev.num_blocks * bdev_io_get_block_size(bdev_io)); 3629 } 3630 break; 3631 case SPDK_BDEV_IO_TYPE_UNMAP: 3632 bdev_unmap_split(bdev_io); 3633 break; 3634 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3635 bdev_write_zeroes_split(bdev_io); 3636 break; 3637 case SPDK_BDEV_IO_TYPE_COPY: 3638 bdev_copy_split(bdev_io); 3639 break; 3640 default: 3641 assert(false); 3642 break; 3643 } 3644 } 3645 3646 static void 3647 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3648 { 3649 if (!success) { 3650 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3651 return; 3652 } 3653 3654 _bdev_rw_split(bdev_io); 3655 } 3656 3657 static inline void 3658 _bdev_io_submit(struct spdk_bdev_io *bdev_io) 3659 { 3660 struct spdk_bdev *bdev = bdev_io->bdev; 3661 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3662 3663 if (spdk_likely(bdev_ch->flags == 0)) { 3664 bdev_io_do_submit(bdev_ch, bdev_io); 3665 return; 3666 } 3667 3668 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3669 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3670 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3671 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3672 bdev_abort_queued_io(&bdev_ch->qos_queued_io, bdev_io->u.abort.bio_to_abort)) { 3673 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3674 } else { 3675 TAILQ_INSERT_TAIL(&bdev_ch->qos_queued_io, bdev_io, internal.link); 3676 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3677 } 3678 } else { 3679 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3680 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3681 } 3682 } 3683 3684 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3685 3686 bool 3687 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3688 { 3689 if (range1->length == 0 || range2->length == 0) { 3690 return false; 3691 } 3692 3693 if (range1->offset + range1->length <= range2->offset) { 3694 return false; 3695 } 3696 3697 if (range2->offset + range2->length <= range1->offset) { 3698 return false; 3699 } 3700 
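/* Neither range ends at or before the start of the other, so the two half-open
 * block ranges [offset, offset + length) must intersect.
 */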
3701 return true; 3702 } 3703 3704 static bool 3705 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3706 { 3707 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3708 struct lba_range r; 3709 3710 switch (bdev_io->type) { 3711 case SPDK_BDEV_IO_TYPE_NVME_IO: 3712 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3713 /* Don't try to decode the NVMe command - just assume worst-case and that 3714 * it overlaps a locked range. 3715 */ 3716 return true; 3717 case SPDK_BDEV_IO_TYPE_READ: 3718 if (!range->quiesce) { 3719 return false; 3720 } 3721 /* fallthrough */ 3722 case SPDK_BDEV_IO_TYPE_WRITE: 3723 case SPDK_BDEV_IO_TYPE_UNMAP: 3724 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3725 case SPDK_BDEV_IO_TYPE_ZCOPY: 3726 case SPDK_BDEV_IO_TYPE_COPY: 3727 r.offset = bdev_io->u.bdev.offset_blocks; 3728 r.length = bdev_io->u.bdev.num_blocks; 3729 if (!bdev_lba_range_overlapped(range, &r)) { 3730 /* This I/O doesn't overlap the specified LBA range. */ 3731 return false; 3732 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3733 /* This I/O overlaps, but the I/O is on the same channel that locked this 3734 * range, and the caller_ctx is the same as the locked_ctx. This means 3735 * that this I/O is associated with the lock, and is allowed to execute. 3736 */ 3737 return false; 3738 } else { 3739 return true; 3740 } 3741 default: 3742 return false; 3743 } 3744 } 3745 3746 void 3747 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3748 { 3749 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3750 3751 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3752 3753 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3754 struct lba_range *range; 3755 3756 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3757 if (bdev_io_range_is_locked(bdev_io, range)) { 3758 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3759 return; 3760 } 3761 } 3762 } 3763 3764 bdev_ch_add_to_io_submitted(bdev_io); 3765 3766 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3767 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 3768 ch->trace_id, bdev_io->u.bdev.num_blocks, 3769 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3770 bdev_io->u.bdev.offset_blocks, ch->queue_depth); 3771 3772 if (bdev_io->internal.f.split) { 3773 bdev_io_split(bdev_io); 3774 return; 3775 } 3776 3777 _bdev_io_submit(bdev_io); 3778 } 3779 3780 static inline int 3781 bdev_io_init_dif_ctx(struct spdk_bdev_io *bdev_io) 3782 { 3783 struct spdk_bdev *bdev = bdev_io->bdev; 3784 struct spdk_dif_ctx_init_ext_opts dif_opts; 3785 3786 memset(&bdev_io->u.bdev.dif_err, 0, sizeof(struct spdk_dif_error)); 3787 3788 dif_opts.size = SPDK_SIZEOF(&dif_opts, dif_pi_format); 3789 dif_opts.dif_pi_format = bdev->dif_pi_format; 3790 3791 return spdk_dif_ctx_init(&bdev_io->u.bdev.dif_ctx, 3792 bdev->blocklen, 3793 bdev->md_len, 3794 bdev->md_interleave, 3795 bdev->dif_is_head_of_md, 3796 bdev->dif_type, 3797 bdev_io->u.bdev.dif_check_flags, 3798 bdev_io->u.bdev.offset_blocks & 0xFFFFFFFF, 3799 0xFFFF, 0, 0, 0, &dif_opts); 3800 } 3801 3802 static void 3803 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3804 bool success) 3805 { 3806 if (!success) { 3807 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 3808 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3809 bdev_io_complete_unsubmitted(bdev_io); 3810 return; 3811 } 3812 3813 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 3814 if 
(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
3815 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb);
3816 return;
3817 }
3818 /* For reads we'll execute the sequence after the data is read, so, for now, only
3819 * clear out accel_sequence pointer and submit the IO */
3820 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
3821 bdev_io->u.bdev.accel_sequence = NULL;
3822 }
3823
3824 bdev_io_submit(bdev_io);
3825 }
3826
3827 static inline void
3828 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io)
3829 {
3830 /* The bdev doesn't support memory domains, so the buffers in this IO request can't
3831 * be accessed directly. Bounce buffers must be allocated before issuing the IO operation.
3832 * For a write operation we need to pull the buffers from the memory domain before submitting the IO.
3833 * Once a read operation completes, we need to use the memory_domain push functionality to
3834 * update the data in the original memory domain IO buffer.
3835 *
3836 * If this I/O request is not aware of metadata, the buffers in this IO request can't be
3837 * accessed directly either, so buffers must be allocated before issuing the IO operation.
3838 * For a write operation we need to insert metadata before submitting the IO. Once a read
3839 * operation completes, we need to strip the metadata from the original IO buffer.
3840 *
3841 * This IO request will go through a regular IO flow, so clear the memory domain pointers. */
3842 assert(bdev_io_use_memory_domain(bdev_io) ||
3843 bdev_io_needs_metadata(bdev_io->internal.desc, bdev_io));
3844
3845 bdev_io->u.bdev.memory_domain = NULL;
3846 bdev_io->u.bdev.memory_domain_ctx = NULL;
3847 _bdev_io_get_bounce_buf(bdev_io, _bdev_memory_domain_get_io_cb,
3848 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
3849 }
3850
3851 static inline void
3852 _bdev_io_ext_use_accel_buffer(struct spdk_bdev_io *bdev_io)
3853 {
3854 assert(bdev_io_use_memory_domain(bdev_io));
3855 assert(bdev_io_needs_metadata(bdev_io->internal.desc, bdev_io));
3856
3857 bdev_io->u.bdev.memory_domain = NULL;
3858 bdev_io->u.bdev.memory_domain_ctx = NULL;
3859 bdev_io_get_accel_buf(bdev_io, _bdev_memory_domain_get_io_cb,
3860 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
3861 }
3862
3863 /* We need to allocate a bounce buffer
3864 * - if the bdev doesn't support memory domains,
3865 * - if it does support them, but we need to execute an accel sequence and the data buffer is
3866 * from the accel memory domain (to avoid doing a push/pull from that domain), or
3867 * - if the IO is not aware of metadata.
3868 */
3869 static inline bool
3870 bdev_io_needs_bounce_buffer(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
3871 {
3872 if (bdev_io_use_memory_domain(bdev_io)) {
3873 if (!desc->memory_domains_supported ||
3874 (bdev_io_needs_sequence_exec(desc, bdev_io) &&
3875 (bdev_io->internal.memory_domain == spdk_accel_get_memory_domain() ||
3876 bdev_io_needs_metadata(desc, bdev_io)))) {
3877 return true;
3878 }
3879
3880 return false;
3881 }
3882
3883 if (bdev_io_needs_metadata(desc, bdev_io)) {
3884 return true;
3885 }
3886
3887 return false;
3888 }
3889
3890 /* We need to allocate a fake accel buffer if the bdev supports memory domains but the IO is not
3891 * aware of metadata.
3892 */ 3893 static inline bool 3894 bdev_io_needs_accel_buffer(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3895 { 3896 if (bdev_io_needs_metadata(desc, bdev_io)) { 3897 assert(bdev_io_use_memory_domain(bdev_io)); 3898 return true; 3899 } 3900 3901 return false; 3902 } 3903 3904 static inline void 3905 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3906 { 3907 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3908 int rc; 3909 3910 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3911 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3912 bdev_io_complete_unsubmitted(bdev_io); 3913 return; 3914 } 3915 3916 if (bdev_io_needs_metadata(desc, bdev_io)) { 3917 rc = bdev_io_init_dif_ctx(bdev_io); 3918 if (spdk_unlikely(rc != 0)) { 3919 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3920 bdev_io_complete_unsubmitted(bdev_io); 3921 return; 3922 } 3923 } 3924 3925 if (bdev_io_needs_bounce_buffer(desc, bdev_io)) { 3926 _bdev_io_ext_use_bounce_buffer(bdev_io); 3927 return; 3928 } 3929 3930 if (bdev_io_needs_accel_buffer(desc, bdev_io)) { 3931 _bdev_io_ext_use_accel_buffer(bdev_io); 3932 return; 3933 } 3934 3935 if (bdev_io_needs_sequence_exec(desc, bdev_io)) { 3936 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3937 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3938 return; 3939 } 3940 /* For reads we'll execute the sequence after the data is read, so, for now, only 3941 * clear out accel_sequence pointer and submit the IO */ 3942 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3943 bdev_io->u.bdev.accel_sequence = NULL; 3944 } 3945 3946 bdev_io_submit(bdev_io); 3947 } 3948 3949 static void 3950 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3951 { 3952 struct spdk_bdev *bdev = bdev_io->bdev; 3953 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3954 struct spdk_io_channel *ch = bdev_ch->channel; 3955 3956 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3957 3958 bdev_io->internal.f.in_submit_request = true; 3959 bdev_submit_request(bdev, ch, bdev_io); 3960 bdev_io->internal.f.in_submit_request = false; 3961 } 3962 3963 void 3964 bdev_io_init(struct spdk_bdev_io *bdev_io, 3965 struct spdk_bdev *bdev, void *cb_arg, 3966 spdk_bdev_io_completion_cb cb) 3967 { 3968 bdev_io->bdev = bdev; 3969 bdev_io->internal.f.raw = 0; 3970 bdev_io->internal.caller_ctx = cb_arg; 3971 bdev_io->internal.cb = cb; 3972 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3973 bdev_io->internal.f.in_submit_request = false; 3974 bdev_io->internal.error.nvme.cdw0 = 0; 3975 bdev_io->num_retries = 0; 3976 bdev_io->internal.get_buf_cb = NULL; 3977 bdev_io->internal.get_aux_buf_cb = NULL; 3978 bdev_io->internal.data_transfer_cpl = NULL; 3979 bdev_io->internal.f.split = bdev_io_should_split(bdev_io); 3980 } 3981 3982 static bool 3983 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3984 { 3985 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3986 } 3987 3988 bool 3989 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3990 { 3991 bool supported; 3992 3993 supported = bdev_io_type_supported(bdev, io_type); 3994 3995 if (!supported) { 3996 switch (io_type) { 3997 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3998 /* The bdev layer will emulate write zeroes as long as write is supported. 
*/
3999 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE);
4000 break;
4001 default:
4002 break;
4003 }
4004 }
4005
4006 return supported;
4007 }
4008
4009 static const char *g_io_type_strings[] = {
4010 [SPDK_BDEV_IO_TYPE_READ] = "read",
4011 [SPDK_BDEV_IO_TYPE_WRITE] = "write",
4012 [SPDK_BDEV_IO_TYPE_UNMAP] = "unmap",
4013 [SPDK_BDEV_IO_TYPE_FLUSH] = "flush",
4014 [SPDK_BDEV_IO_TYPE_RESET] = "reset",
4015 [SPDK_BDEV_IO_TYPE_NVME_ADMIN] = "nvme_admin",
4016 [SPDK_BDEV_IO_TYPE_NVME_IO] = "nvme_io",
4017 [SPDK_BDEV_IO_TYPE_NVME_IO_MD] = "nvme_io_md",
4018 [SPDK_BDEV_IO_TYPE_WRITE_ZEROES] = "write_zeroes",
4019 [SPDK_BDEV_IO_TYPE_ZCOPY] = "zcopy",
4020 [SPDK_BDEV_IO_TYPE_GET_ZONE_INFO] = "get_zone_info",
4021 [SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT] = "zone_management",
4022 [SPDK_BDEV_IO_TYPE_ZONE_APPEND] = "zone_append",
4023 [SPDK_BDEV_IO_TYPE_COMPARE] = "compare",
4024 [SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE] = "compare_and_write",
4025 [SPDK_BDEV_IO_TYPE_ABORT] = "abort",
4026 [SPDK_BDEV_IO_TYPE_SEEK_HOLE] = "seek_hole",
4027 [SPDK_BDEV_IO_TYPE_SEEK_DATA] = "seek_data",
4028 [SPDK_BDEV_IO_TYPE_COPY] = "copy",
4029 [SPDK_BDEV_IO_TYPE_NVME_IOV_MD] = "nvme_iov_md",
4030 };
4031
4032 const char *
4033 spdk_bdev_get_io_type_name(enum spdk_bdev_io_type io_type)
4034 {
4035 if (io_type <= SPDK_BDEV_IO_TYPE_INVALID || io_type >= SPDK_BDEV_NUM_IO_TYPES) {
4036 return NULL;
4037 }
4038
4039 return g_io_type_strings[io_type];
4040 }
4041
4042 int
4043 spdk_bdev_get_io_type(const char *io_type_string)
4044 {
4045 int i;
4046
4047 for (i = SPDK_BDEV_IO_TYPE_READ; i < SPDK_BDEV_NUM_IO_TYPES; ++i) {
4048 if (!strcmp(io_type_string, g_io_type_strings[i])) {
4049 return i;
4050 }
4051 }
4052
4053 return -1;
4054 }
4055
4056 uint64_t
4057 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io)
4058 {
4059 return bdev_io->internal.submit_tsc;
4060 }
4061
4062 bool
4063 spdk_bdev_io_hide_metadata(struct spdk_bdev_io *bdev_io)
4064 {
4065 return bdev_io->internal.desc->opts.hide_metadata;
4066 }
4067
4068 int
4069 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
4070 {
4071 if (bdev->fn_table->dump_info_json) {
4072 return bdev->fn_table->dump_info_json(bdev->ctxt, w);
4073 }
4074
4075 return 0;
4076 }
4077
4078 static void
4079 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos)
4080 {
4081 uint32_t max_per_timeslice = 0;
4082 int i;
4083
4084 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4085 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
4086 qos->rate_limits[i].max_per_timeslice = 0;
4087 continue;
4088 }
4089
4090 max_per_timeslice = qos->rate_limits[i].limit *
4091 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC;
4092
4093 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice,
4094 qos->rate_limits[i].min_per_timeslice);
4095
4096 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice,
4097 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELEASE);
4098 }
4099
4100 bdev_qos_set_ops(qos);
4101 }
4102
4103 static void
4104 bdev_channel_submit_qos_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
4105 struct spdk_io_channel *io_ch, void *ctx)
4106 {
4107 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
4108 int status;
4109
4110 bdev_qos_io_submit(bdev_ch, bdev->internal.qos);
4111
4112 /* If all IOs were sent then continue the iteration, otherwise stop it. */
4113 /* TODO: round-robin across channels */
4114 status = TAILQ_EMPTY(&bdev_ch->qos_queued_io) ?
0 : 1;
4115
4116 spdk_bdev_for_each_channel_continue(i, status);
4117 }
4118
4119
4120 static void
4121 bdev_channel_submit_qos_io_done(struct spdk_bdev *bdev, void *ctx, int status)
4122 {
4123
4124 }
4125
4126 static int
4127 bdev_channel_poll_qos(void *arg)
4128 {
4129 struct spdk_bdev *bdev = arg;
4130 struct spdk_bdev_qos *qos = bdev->internal.qos;
4131 uint64_t now = spdk_get_ticks();
4132 int i;
4133 int64_t remaining_last_timeslice;
4134
4135 if (spdk_unlikely(qos->thread == NULL)) {
4136 /* The old QoS was unbound so it could be removed and the new QoS is not enabled yet. */
4137 return SPDK_POLLER_IDLE;
4138 }
4139
4140 if (now < (qos->last_timeslice + qos->timeslice_size)) {
4141 /* We received our callback earlier than expected - return
4142 * immediately and wait to do accounting until at least one
4143 * timeslice has actually expired. This should never happen
4144 * with a well-behaved timer implementation.
4145 */
4146 return SPDK_POLLER_IDLE;
4147 }
4148
4149 /* Reset for next round of rate limiting */
4150 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
4151 /* We may have allowed the IOs or bytes to slightly overrun in the last
4152 * timeslice. remaining_this_timeslice is signed, so if it's negative
4153 * here, we'll account for the overrun so that the next timeslice will
4154 * be appropriately reduced.
4155 */
4156 remaining_last_timeslice = __atomic_exchange_n(&qos->rate_limits[i].remaining_this_timeslice,
4157 0, __ATOMIC_RELAXED);
4158 if (remaining_last_timeslice < 0) {
4159 /* There could be a race condition here as both bdev_qos_rw_queue_io() and bdev_channel_poll_qos()
4160 * potentially use 2 atomic ops each, so they can intertwine.
4161 * This race can potentially cause the limits to be a little fuzzy but won't cause any real damage.
4162 */ 4163 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 4164 remaining_last_timeslice, __ATOMIC_RELAXED); 4165 } 4166 } 4167 4168 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 4169 qos->last_timeslice += qos->timeslice_size; 4170 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4171 __atomic_add_fetch(&qos->rate_limits[i].remaining_this_timeslice, 4172 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELAXED); 4173 } 4174 } 4175 4176 spdk_bdev_for_each_channel(bdev, bdev_channel_submit_qos_io, qos, 4177 bdev_channel_submit_qos_io_done); 4178 4179 return SPDK_POLLER_BUSY; 4180 } 4181 4182 static void 4183 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 4184 { 4185 struct spdk_bdev_shared_resource *shared_resource; 4186 struct lba_range *range; 4187 4188 bdev_free_io_stat(ch->stat); 4189 #ifdef SPDK_CONFIG_VTUNE 4190 bdev_free_io_stat(ch->prev_stat); 4191 #endif 4192 4193 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 4194 range = TAILQ_FIRST(&ch->locked_ranges); 4195 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 4196 free(range); 4197 } 4198 4199 spdk_put_io_channel(ch->channel); 4200 spdk_put_io_channel(ch->accel_channel); 4201 4202 shared_resource = ch->shared_resource; 4203 4204 assert(TAILQ_EMPTY(&ch->io_locked)); 4205 assert(TAILQ_EMPTY(&ch->io_submitted)); 4206 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 4207 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 4208 assert(ch->io_outstanding == 0); 4209 assert(shared_resource->ref > 0); 4210 shared_resource->ref--; 4211 if (shared_resource->ref == 0) { 4212 assert(shared_resource->io_outstanding == 0); 4213 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 4214 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 4215 spdk_poller_unregister(&shared_resource->nomem_poller); 4216 free(shared_resource); 4217 } 4218 } 4219 4220 static void 4221 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 4222 { 4223 struct spdk_bdev_qos *qos = bdev->internal.qos; 4224 int i; 4225 4226 assert(spdk_spin_held(&bdev->internal.spinlock)); 4227 4228 /* Rate limiting on this bdev enabled */ 4229 if (qos) { 4230 if (qos->ch == NULL) { 4231 struct spdk_io_channel *io_ch; 4232 4233 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 4234 bdev->name, spdk_get_thread()); 4235 4236 /* No qos channel has been selected, so set one up */ 4237 4238 /* Take another reference to ch */ 4239 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 4240 assert(io_ch != NULL); 4241 qos->ch = ch; 4242 4243 qos->thread = spdk_io_channel_get_thread(io_ch); 4244 4245 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4246 if (bdev_qos_is_iops_rate_limit(i) == true) { 4247 qos->rate_limits[i].min_per_timeslice = 4248 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 4249 } else { 4250 qos->rate_limits[i].min_per_timeslice = 4251 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 4252 } 4253 4254 if (qos->rate_limits[i].limit == 0) { 4255 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 4256 } 4257 } 4258 bdev_qos_update_max_quota_per_timeslice(qos); 4259 qos->timeslice_size = 4260 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 4261 qos->last_timeslice = spdk_get_ticks(); 4262 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 4263 bdev, 4264 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 4265 } 4266 4267 ch->flags |= BDEV_CH_QOS_ENABLED; 4268 } 4269 } 4270 4271 struct poll_timeout_ctx { 4272 struct spdk_bdev_desc 
*desc; 4273 uint64_t timeout_in_sec; 4274 spdk_bdev_io_timeout_cb cb_fn; 4275 void *cb_arg; 4276 }; 4277 4278 static void 4279 bdev_desc_free(struct spdk_bdev_desc *desc) 4280 { 4281 spdk_spin_destroy(&desc->spinlock); 4282 free(desc->media_events_buffer); 4283 free(desc); 4284 } 4285 4286 static void 4287 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 4288 { 4289 struct poll_timeout_ctx *ctx = _ctx; 4290 struct spdk_bdev_desc *desc = ctx->desc; 4291 4292 free(ctx); 4293 4294 spdk_spin_lock(&desc->spinlock); 4295 desc->refs--; 4296 if (desc->closed == true && desc->refs == 0) { 4297 spdk_spin_unlock(&desc->spinlock); 4298 bdev_desc_free(desc); 4299 return; 4300 } 4301 spdk_spin_unlock(&desc->spinlock); 4302 } 4303 4304 static void 4305 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4306 struct spdk_io_channel *io_ch, void *_ctx) 4307 { 4308 struct poll_timeout_ctx *ctx = _ctx; 4309 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4310 struct spdk_bdev_desc *desc = ctx->desc; 4311 struct spdk_bdev_io *bdev_io; 4312 uint64_t now; 4313 4314 spdk_spin_lock(&desc->spinlock); 4315 if (desc->closed == true) { 4316 spdk_spin_unlock(&desc->spinlock); 4317 spdk_bdev_for_each_channel_continue(i, -1); 4318 return; 4319 } 4320 spdk_spin_unlock(&desc->spinlock); 4321 4322 now = spdk_get_ticks(); 4323 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 4324 /* Exclude any I/O that are generated via splitting. */ 4325 if (bdev_io->internal.cb == bdev_io_split_done) { 4326 continue; 4327 } 4328 4329 /* Once we find an I/O that has not timed out, we can immediately 4330 * exit the loop. 4331 */ 4332 if (now < (bdev_io->internal.submit_tsc + 4333 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 4334 goto end; 4335 } 4336 4337 if (bdev_io->internal.desc == desc) { 4338 ctx->cb_fn(ctx->cb_arg, bdev_io); 4339 } 4340 } 4341 4342 end: 4343 spdk_bdev_for_each_channel_continue(i, 0); 4344 } 4345 4346 static int 4347 bdev_poll_timeout_io(void *arg) 4348 { 4349 struct spdk_bdev_desc *desc = arg; 4350 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4351 struct poll_timeout_ctx *ctx; 4352 4353 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 4354 if (!ctx) { 4355 SPDK_ERRLOG("failed to allocate memory\n"); 4356 return SPDK_POLLER_BUSY; 4357 } 4358 ctx->desc = desc; 4359 ctx->cb_arg = desc->cb_arg; 4360 ctx->cb_fn = desc->cb_fn; 4361 ctx->timeout_in_sec = desc->timeout_in_sec; 4362 4363 /* Take a ref on the descriptor in case it gets closed while we are checking 4364 * all of the channels. 
4365 */ 4366 spdk_spin_lock(&desc->spinlock); 4367 desc->refs++; 4368 spdk_spin_unlock(&desc->spinlock); 4369 4370 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 4371 bdev_channel_poll_timeout_io_done); 4372 4373 return SPDK_POLLER_BUSY; 4374 } 4375 4376 int 4377 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 4378 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 4379 { 4380 assert(desc->thread == spdk_get_thread()); 4381 4382 spdk_poller_unregister(&desc->io_timeout_poller); 4383 4384 if (timeout_in_sec) { 4385 assert(cb_fn != NULL); 4386 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 4387 desc, 4388 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 4389 1000); 4390 if (desc->io_timeout_poller == NULL) { 4391 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 4392 return -1; 4393 } 4394 } 4395 4396 desc->cb_fn = cb_fn; 4397 desc->cb_arg = cb_arg; 4398 desc->timeout_in_sec = timeout_in_sec; 4399 4400 return 0; 4401 } 4402 4403 static int 4404 bdev_channel_create(void *io_device, void *ctx_buf) 4405 { 4406 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 4407 struct spdk_bdev_channel *ch = ctx_buf; 4408 struct spdk_io_channel *mgmt_io_ch; 4409 struct spdk_bdev_mgmt_channel *mgmt_ch; 4410 struct spdk_bdev_shared_resource *shared_resource; 4411 struct lba_range *range; 4412 4413 ch->bdev = bdev; 4414 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 4415 if (!ch->channel) { 4416 return -1; 4417 } 4418 4419 ch->accel_channel = spdk_accel_get_io_channel(); 4420 if (!ch->accel_channel) { 4421 spdk_put_io_channel(ch->channel); 4422 return -1; 4423 } 4424 4425 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, bdev->internal.trace_id, 0, 0, 4426 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4427 4428 assert(ch->histogram == NULL); 4429 if (bdev->internal.histogram_enabled) { 4430 ch->histogram = spdk_histogram_data_alloc(); 4431 if (ch->histogram == NULL) { 4432 SPDK_ERRLOG("Could not allocate histogram\n"); 4433 } 4434 } 4435 4436 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 4437 if (!mgmt_io_ch) { 4438 spdk_put_io_channel(ch->channel); 4439 spdk_put_io_channel(ch->accel_channel); 4440 return -1; 4441 } 4442 4443 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 4444 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 4445 if (shared_resource->shared_ch == ch->channel) { 4446 spdk_put_io_channel(mgmt_io_ch); 4447 shared_resource->ref++; 4448 break; 4449 } 4450 } 4451 4452 if (shared_resource == NULL) { 4453 shared_resource = calloc(1, sizeof(*shared_resource)); 4454 if (shared_resource == NULL) { 4455 spdk_put_io_channel(ch->channel); 4456 spdk_put_io_channel(ch->accel_channel); 4457 spdk_put_io_channel(mgmt_io_ch); 4458 return -1; 4459 } 4460 4461 shared_resource->mgmt_ch = mgmt_ch; 4462 shared_resource->io_outstanding = 0; 4463 TAILQ_INIT(&shared_resource->nomem_io); 4464 shared_resource->nomem_threshold = 0; 4465 shared_resource->shared_ch = ch->channel; 4466 shared_resource->ref = 1; 4467 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 4468 } 4469 4470 ch->io_outstanding = 0; 4471 TAILQ_INIT(&ch->locked_ranges); 4472 TAILQ_INIT(&ch->qos_queued_io); 4473 ch->flags = 0; 4474 ch->trace_id = bdev->internal.trace_id; 4475 ch->shared_resource = shared_resource; 4476 4477 TAILQ_INIT(&ch->io_submitted); 4478 TAILQ_INIT(&ch->io_locked); 4479 TAILQ_INIT(&ch->io_accel_exec); 4480 TAILQ_INIT(&ch->io_memory_domain); 4481 4482 ch->stat = bdev_alloc_io_stat(false); 
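/* Per-channel statistics are allocated without per-status error counters; they get
 * merged into bdev->internal.stat when the channel is destroyed. From here on, any
 * setup failure is unwound with bdev_channel_destroy_resource().
 */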
4483 if (ch->stat == NULL) { 4484 bdev_channel_destroy_resource(ch); 4485 return -1; 4486 } 4487 4488 ch->stat->ticks_rate = spdk_get_ticks_hz(); 4489 4490 #ifdef SPDK_CONFIG_VTUNE 4491 { 4492 char *name; 4493 __itt_init_ittlib(NULL, 0); 4494 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 4495 if (!name) { 4496 bdev_channel_destroy_resource(ch); 4497 return -1; 4498 } 4499 ch->handle = __itt_string_handle_create(name); 4500 free(name); 4501 ch->start_tsc = spdk_get_ticks(); 4502 ch->interval_tsc = spdk_get_ticks_hz() / 100; 4503 ch->prev_stat = bdev_alloc_io_stat(false); 4504 if (ch->prev_stat == NULL) { 4505 bdev_channel_destroy_resource(ch); 4506 return -1; 4507 } 4508 } 4509 #endif 4510 4511 spdk_spin_lock(&bdev->internal.spinlock); 4512 bdev_enable_qos(bdev, ch); 4513 4514 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 4515 struct lba_range *new_range; 4516 4517 new_range = calloc(1, sizeof(*new_range)); 4518 if (new_range == NULL) { 4519 spdk_spin_unlock(&bdev->internal.spinlock); 4520 bdev_channel_destroy_resource(ch); 4521 return -1; 4522 } 4523 new_range->length = range->length; 4524 new_range->offset = range->offset; 4525 new_range->locked_ctx = range->locked_ctx; 4526 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 4527 } 4528 4529 spdk_spin_unlock(&bdev->internal.spinlock); 4530 4531 return 0; 4532 } 4533 4534 static int 4535 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 4536 void *cb_ctx) 4537 { 4538 struct spdk_bdev_channel *bdev_ch = cb_ctx; 4539 struct spdk_bdev_io *bdev_io; 4540 uint64_t buf_len; 4541 4542 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4543 if (bdev_io->internal.ch == bdev_ch) { 4544 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len); 4545 spdk_iobuf_entry_abort(ch, entry, buf_len); 4546 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4547 } 4548 4549 return 0; 4550 } 4551 4552 /* 4553 * Abort I/O that are waiting on a data buffer. 4554 */ 4555 static void 4556 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4557 { 4558 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, bdev_abort_all_buf_io_cb, ch); 4559 } 4560 4561 /* 4562 * Abort I/O that are queued waiting for submission. These types of I/O are 4563 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4564 */ 4565 static void 4566 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4567 { 4568 struct spdk_bdev_io *bdev_io, *tmp; 4569 4570 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4571 if (bdev_io->internal.ch == ch) { 4572 TAILQ_REMOVE(queue, bdev_io, internal.link); 4573 /* 4574 * spdk_bdev_io_complete() assumes that the completed I/O had 4575 * been submitted to the bdev module. Since in this case it 4576 * hadn't, bump io_outstanding to account for the decrement 4577 * that spdk_bdev_io_complete() will do. 
4578 */ 4579 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4580 bdev_io_increment_outstanding(ch, ch->shared_resource); 4581 } 4582 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4583 } 4584 } 4585 } 4586 4587 static bool 4588 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4589 { 4590 struct spdk_bdev_io *bdev_io; 4591 4592 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4593 if (bdev_io == bio_to_abort) { 4594 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4595 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4596 return true; 4597 } 4598 } 4599 4600 return false; 4601 } 4602 4603 static int 4604 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4605 { 4606 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4607 uint64_t buf_len; 4608 4609 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4610 if (bdev_io == bio_to_abort) { 4611 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len); 4612 spdk_iobuf_entry_abort(ch, entry, buf_len); 4613 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4614 return 1; 4615 } 4616 4617 return 0; 4618 } 4619 4620 static bool 4621 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4622 { 4623 int rc; 4624 4625 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, bdev_abort_buf_io_cb, bio_to_abort); 4626 return rc == 1; 4627 } 4628 4629 static void 4630 bdev_qos_channel_destroy(void *cb_arg) 4631 { 4632 struct spdk_bdev_qos *qos = cb_arg; 4633 4634 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4635 spdk_poller_unregister(&qos->poller); 4636 4637 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4638 4639 free(qos); 4640 } 4641 4642 static int 4643 bdev_qos_destroy(struct spdk_bdev *bdev) 4644 { 4645 int i; 4646 4647 /* 4648 * Cleanly shutting down the QoS poller is tricky, because 4649 * during the asynchronous operation the user could open 4650 * a new descriptor and create a new channel, spawning 4651 * a new QoS poller. 4652 * 4653 * The strategy is to create a new QoS structure here and swap it 4654 * in. The shutdown path then continues to refer to the old one 4655 * until it completes and then releases it. 4656 */ 4657 struct spdk_bdev_qos *new_qos, *old_qos; 4658 4659 old_qos = bdev->internal.qos; 4660 4661 new_qos = calloc(1, sizeof(*new_qos)); 4662 if (!new_qos) { 4663 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4664 return -ENOMEM; 4665 } 4666 4667 /* Copy the old QoS data into the newly allocated structure */ 4668 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4669 4670 /* Zero out the key parts of the QoS structure */ 4671 new_qos->ch = NULL; 4672 new_qos->thread = NULL; 4673 new_qos->poller = NULL; 4674 /* 4675 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4676 * It will be used later for the new QoS structure. 4677 */ 4678 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4679 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4680 new_qos->rate_limits[i].min_per_timeslice = 0; 4681 new_qos->rate_limits[i].max_per_timeslice = 0; 4682 } 4683 4684 bdev->internal.qos = new_qos; 4685 4686 if (old_qos->thread == NULL) { 4687 free(old_qos); 4688 } else { 4689 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4690 } 4691 4692 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4693 * been destroyed yet. 
The destruction path will end up waiting for the final 4694 * channel to be put before it releases resources. */ 4695 4696 return 0; 4697 } 4698 4699 void 4700 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4701 { 4702 total->bytes_read += add->bytes_read; 4703 total->num_read_ops += add->num_read_ops; 4704 total->bytes_written += add->bytes_written; 4705 total->num_write_ops += add->num_write_ops; 4706 total->bytes_unmapped += add->bytes_unmapped; 4707 total->num_unmap_ops += add->num_unmap_ops; 4708 total->bytes_copied += add->bytes_copied; 4709 total->num_copy_ops += add->num_copy_ops; 4710 total->read_latency_ticks += add->read_latency_ticks; 4711 total->write_latency_ticks += add->write_latency_ticks; 4712 total->unmap_latency_ticks += add->unmap_latency_ticks; 4713 total->copy_latency_ticks += add->copy_latency_ticks; 4714 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4715 total->max_read_latency_ticks = add->max_read_latency_ticks; 4716 } 4717 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4718 total->min_read_latency_ticks = add->min_read_latency_ticks; 4719 } 4720 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4721 total->max_write_latency_ticks = add->max_write_latency_ticks; 4722 } 4723 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4724 total->min_write_latency_ticks = add->min_write_latency_ticks; 4725 } 4726 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4727 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4728 } 4729 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4730 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4731 } 4732 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4733 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4734 } 4735 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4736 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4737 } 4738 } 4739 4740 static void 4741 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4742 { 4743 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4744 4745 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4746 memcpy(to_stat->io_error, from_stat->io_error, 4747 sizeof(struct spdk_bdev_io_error_stat)); 4748 } 4749 } 4750 4751 void 4752 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4753 { 4754 if (mode == SPDK_BDEV_RESET_STAT_NONE) { 4755 return; 4756 } 4757 4758 stat->max_read_latency_ticks = 0; 4759 stat->min_read_latency_ticks = UINT64_MAX; 4760 stat->max_write_latency_ticks = 0; 4761 stat->min_write_latency_ticks = UINT64_MAX; 4762 stat->max_unmap_latency_ticks = 0; 4763 stat->min_unmap_latency_ticks = UINT64_MAX; 4764 stat->max_copy_latency_ticks = 0; 4765 stat->min_copy_latency_ticks = UINT64_MAX; 4766 4767 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4768 return; 4769 } 4770 4771 stat->bytes_read = 0; 4772 stat->num_read_ops = 0; 4773 stat->bytes_written = 0; 4774 stat->num_write_ops = 0; 4775 stat->bytes_unmapped = 0; 4776 stat->num_unmap_ops = 0; 4777 stat->bytes_copied = 0; 4778 stat->num_copy_ops = 0; 4779 stat->read_latency_ticks = 0; 4780 stat->write_latency_ticks = 0; 4781 stat->unmap_latency_ticks = 0; 4782 stat->copy_latency_ticks = 0; 4783 4784 if (stat->io_error != NULL) { 4785 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 
4786 } 4787 } 4788 4789 struct spdk_bdev_io_stat * 4790 bdev_alloc_io_stat(bool io_error_stat) 4791 { 4792 struct spdk_bdev_io_stat *stat; 4793 4794 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4795 if (stat == NULL) { 4796 return NULL; 4797 } 4798 4799 if (io_error_stat) { 4800 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4801 if (stat->io_error == NULL) { 4802 free(stat); 4803 return NULL; 4804 } 4805 } else { 4806 stat->io_error = NULL; 4807 } 4808 4809 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4810 4811 return stat; 4812 } 4813 4814 void 4815 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4816 { 4817 if (stat != NULL) { 4818 free(stat->io_error); 4819 free(stat); 4820 } 4821 } 4822 4823 void 4824 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4825 { 4826 int i; 4827 4828 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4829 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4830 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4831 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4832 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4833 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4834 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4835 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4836 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4837 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4838 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4839 stat->min_read_latency_ticks != UINT64_MAX ? 4840 stat->min_read_latency_ticks : 0); 4841 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4842 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4843 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4844 stat->min_write_latency_ticks != UINT64_MAX ? 4845 stat->min_write_latency_ticks : 0); 4846 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4847 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4848 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4849 stat->min_unmap_latency_ticks != UINT64_MAX ? 4850 stat->min_unmap_latency_ticks : 0); 4851 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4852 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4853 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4854 stat->min_copy_latency_ticks != UINT64_MAX ? 
4855 stat->min_copy_latency_ticks : 0); 4856 4857 if (stat->io_error != NULL) { 4858 spdk_json_write_named_object_begin(w, "io_error"); 4859 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4860 if (stat->io_error->error_status[i] != 0) { 4861 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4862 stat->io_error->error_status[i]); 4863 } 4864 } 4865 spdk_json_write_object_end(w); 4866 } 4867 } 4868 4869 static void 4870 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4871 { 4872 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4873 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4874 4875 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4876 bdev_abort_all_buf_io(mgmt_ch, ch); 4877 } 4878 4879 static void 4880 bdev_channel_destroy(void *io_device, void *ctx_buf) 4881 { 4882 struct spdk_bdev_channel *ch = ctx_buf; 4883 4884 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4885 spdk_get_thread()); 4886 4887 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, ch->bdev->internal.trace_id, 0, 0, 4888 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4889 4890 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4891 spdk_spin_lock(&ch->bdev->internal.spinlock); 4892 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4893 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4894 4895 bdev_channel_abort_queued_ios(ch); 4896 4897 if (ch->histogram) { 4898 spdk_histogram_data_free(ch->histogram); 4899 } 4900 4901 bdev_channel_destroy_resource(ch); 4902 } 4903 4904 /* 4905 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4906 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
4907 */ 4908 static int 4909 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4910 { 4911 struct spdk_bdev_name *tmp; 4912 4913 bdev_name->name = strdup(name); 4914 if (bdev_name->name == NULL) { 4915 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4916 return -ENOMEM; 4917 } 4918 4919 bdev_name->bdev = bdev; 4920 4921 spdk_spin_lock(&g_bdev_mgr.spinlock); 4922 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4923 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4924 4925 if (tmp != NULL) { 4926 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4927 free(bdev_name->name); 4928 return -EEXIST; 4929 } 4930 4931 return 0; 4932 } 4933 4934 static void 4935 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4936 { 4937 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4938 free(bdev_name->name); 4939 } 4940 4941 static void 4942 bdev_name_del(struct spdk_bdev_name *bdev_name) 4943 { 4944 spdk_spin_lock(&g_bdev_mgr.spinlock); 4945 bdev_name_del_unsafe(bdev_name); 4946 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4947 } 4948 4949 int 4950 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4951 { 4952 struct spdk_bdev_alias *tmp; 4953 int ret; 4954 4955 if (alias == NULL) { 4956 SPDK_ERRLOG("Empty alias passed\n"); 4957 return -EINVAL; 4958 } 4959 4960 tmp = calloc(1, sizeof(*tmp)); 4961 if (tmp == NULL) { 4962 SPDK_ERRLOG("Unable to allocate alias\n"); 4963 return -ENOMEM; 4964 } 4965 4966 ret = bdev_name_add(&tmp->alias, bdev, alias); 4967 if (ret != 0) { 4968 free(tmp); 4969 return ret; 4970 } 4971 4972 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4973 4974 return 0; 4975 } 4976 4977 static int 4978 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4979 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4980 { 4981 struct spdk_bdev_alias *tmp; 4982 4983 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4984 if (strcmp(alias, tmp->alias.name) == 0) { 4985 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4986 alias_del_fn(&tmp->alias); 4987 free(tmp); 4988 return 0; 4989 } 4990 } 4991 4992 return -ENOENT; 4993 } 4994 4995 int 4996 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4997 { 4998 int rc; 4999 5000 rc = bdev_alias_del(bdev, alias, bdev_name_del); 5001 if (rc == -ENOENT) { 5002 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 5003 } 5004 5005 return rc; 5006 } 5007 5008 void 5009 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 5010 { 5011 struct spdk_bdev_alias *p, *tmp; 5012 5013 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 5014 TAILQ_REMOVE(&bdev->aliases, p, tailq); 5015 bdev_name_del(&p->alias); 5016 free(p); 5017 } 5018 } 5019 5020 struct spdk_io_channel * 5021 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 5022 { 5023 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 5024 } 5025 5026 void * 5027 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 5028 { 5029 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5030 void *ctx = NULL; 5031 5032 if (bdev->fn_table->get_module_ctx) { 5033 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 5034 } 5035 5036 return ctx; 5037 } 5038 5039 const char * 5040 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 5041 { 5042 return bdev->module->name; 5043 } 5044 5045 const char * 5046 spdk_bdev_get_name(const struct spdk_bdev *bdev) 5047 { 5048 return bdev->name; 5049 } 5050 5051 const char * 5052 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 5053 { 5054 return bdev->product_name; 5055 } 5056 5057 
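/*
 * Illustrative sketch (not part of the original code): how the alias helpers above are
 * typically used. Aliases are stored in the same global name tree as primary bdev names,
 * so they resolve through the normal lookup paths, and adding a duplicate fails with
 * -EEXIST. The bdev pointer and the "mydisk_compat" name below are hypothetical.
 *
 *	if (spdk_bdev_alias_add(bdev, "mydisk_compat") == 0) {
 *		... lookups by "mydisk_compat" now find this bdev ...
 *		spdk_bdev_alias_del(bdev, "mydisk_compat");
 *	}
 */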
const struct spdk_bdev_aliases_list * 5058 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 5059 { 5060 return &bdev->aliases; 5061 } 5062 5063 uint32_t 5064 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 5065 { 5066 return bdev->blocklen; 5067 } 5068 5069 uint32_t 5070 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 5071 { 5072 return bdev->write_unit_size; 5073 } 5074 5075 uint64_t 5076 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 5077 { 5078 return bdev->blockcnt; 5079 } 5080 5081 const char * 5082 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 5083 { 5084 return qos_rpc_type[type]; 5085 } 5086 5087 void 5088 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 5089 { 5090 int i; 5091 5092 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 5093 5094 spdk_spin_lock(&bdev->internal.spinlock); 5095 if (bdev->internal.qos) { 5096 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 5097 if (bdev->internal.qos->rate_limits[i].limit != 5098 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 5099 limits[i] = bdev->internal.qos->rate_limits[i].limit; 5100 if (bdev_qos_is_iops_rate_limit(i) == false) { 5101 /* Change from Byte to Megabyte which is user visible. */ 5102 limits[i] = limits[i] / 1024 / 1024; 5103 } 5104 } 5105 } 5106 } 5107 spdk_spin_unlock(&bdev->internal.spinlock); 5108 } 5109 5110 size_t 5111 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 5112 { 5113 return 1 << bdev->required_alignment; 5114 } 5115 5116 uint32_t 5117 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 5118 { 5119 return bdev->optimal_io_boundary; 5120 } 5121 5122 bool 5123 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 5124 { 5125 return bdev->write_cache; 5126 } 5127 5128 const struct spdk_uuid * 5129 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 5130 { 5131 return &bdev->uuid; 5132 } 5133 5134 uint16_t 5135 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 5136 { 5137 return bdev->acwu; 5138 } 5139 5140 uint32_t 5141 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 5142 { 5143 return bdev->md_len; 5144 } 5145 5146 bool 5147 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 5148 { 5149 return (bdev->md_len != 0) && bdev->md_interleave; 5150 } 5151 5152 bool 5153 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 5154 { 5155 return (bdev->md_len != 0) && !bdev->md_interleave; 5156 } 5157 5158 bool 5159 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 5160 { 5161 return bdev->zoned; 5162 } 5163 5164 uint32_t 5165 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 5166 { 5167 if (spdk_bdev_is_md_interleaved(bdev)) { 5168 return bdev->blocklen - bdev->md_len; 5169 } else { 5170 return bdev->blocklen; 5171 } 5172 } 5173 5174 uint32_t 5175 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 5176 { 5177 return bdev->phys_blocklen; 5178 } 5179 5180 static uint32_t 5181 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 5182 { 5183 if (!spdk_bdev_is_md_interleaved(bdev)) { 5184 return bdev->blocklen + bdev->md_len; 5185 } else { 5186 return bdev->blocklen; 5187 } 5188 } 5189 5190 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 5191 typedef enum spdk_dif_type spdk_dif_type_t; 5192 typedef enum spdk_dif_pi_format spdk_dif_pi_format_t; 5193 5194 spdk_dif_type_t 5195 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 5196 { 5197 if (bdev->md_len != 0) { 5198 return bdev->dif_type; 5199 } else { 5200 return SPDK_DIF_DISABLE; 5201 } 5202 } 5203 5204 spdk_dif_pi_format_t 5205 spdk_bdev_get_dif_pi_format(const struct spdk_bdev *bdev) 5206 { 5207 return bdev->dif_pi_format; 5208 } 5209 5210 bool 5211 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 5212 { 5213 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 5214 return bdev->dif_is_head_of_md; 5215 } else { 5216 return false; 5217 } 5218 } 5219 5220 bool 5221 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 5222 enum spdk_dif_check_type check_type) 5223 { 5224 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 5225 return false; 5226 } 5227 5228 switch (check_type) { 5229 case SPDK_DIF_CHECK_TYPE_REFTAG: 5230 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 5231 case SPDK_DIF_CHECK_TYPE_APPTAG: 5232 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 5233 case SPDK_DIF_CHECK_TYPE_GUARD: 5234 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 5235 default: 5236 return false; 5237 } 5238 } 5239 5240 static uint32_t 5241 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 5242 { 5243 uint64_t aligned_length, max_write_blocks; 5244 5245 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 5246 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 5247 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 5248 5249 return max_write_blocks; 5250 } 5251 5252 uint32_t 5253 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 5254 { 5255 return bdev->max_copy; 5256 } 5257 5258 uint64_t 5259 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 5260 { 5261 return bdev->internal.measured_queue_depth; 5262 } 5263 5264 uint64_t 5265 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 5266 { 5267 return bdev->internal.period; 5268 } 5269 5270 uint64_t 5271 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 5272 { 5273 return bdev->internal.weighted_io_time; 5274 } 5275 5276 uint64_t 5277 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 5278 { 5279 return bdev->internal.io_time; 5280 } 5281 5282 union spdk_bdev_nvme_ctratt spdk_bdev_get_nvme_ctratt(struct spdk_bdev *bdev) 5283 { 5284 return bdev->ctratt; 5285 } 5286 5287 uint32_t 5288 spdk_bdev_get_nvme_nsid(struct spdk_bdev *bdev) 5289 { 5290 return bdev->nsid; 5291 } 5292 5293 uint32_t 5294 spdk_bdev_desc_get_block_size(struct spdk_bdev_desc *desc) 5295 { 5296 struct spdk_bdev *bdev = desc->bdev; 5297 5298 return desc->opts.hide_metadata ? bdev->blocklen - bdev->md_len : bdev->blocklen; 5299 } 5300 5301 uint32_t 5302 spdk_bdev_desc_get_md_size(struct spdk_bdev_desc *desc) 5303 { 5304 struct spdk_bdev *bdev = desc->bdev; 5305 5306 return desc->opts.hide_metadata ? 0 : bdev->md_len; 5307 } 5308 5309 bool 5310 spdk_bdev_desc_is_md_interleaved(struct spdk_bdev_desc *desc) 5311 { 5312 struct spdk_bdev *bdev = desc->bdev; 5313 5314 return desc->opts.hide_metadata ? false : spdk_bdev_is_md_interleaved(bdev); 5315 } 5316 5317 bool 5318 spdk_bdev_desc_is_md_separate(struct spdk_bdev_desc *desc) 5319 { 5320 struct spdk_bdev *bdev = desc->bdev; 5321 5322 return desc->opts.hide_metadata ? 
false : spdk_bdev_is_md_separate(bdev); 5323 } 5324 5325 spdk_dif_type_t 5326 spdk_bdev_desc_get_dif_type(struct spdk_bdev_desc *desc) 5327 { 5328 struct spdk_bdev *bdev = desc->bdev; 5329 5330 return desc->opts.hide_metadata ? SPDK_DIF_DISABLE : spdk_bdev_get_dif_type(bdev); 5331 } 5332 5333 spdk_dif_pi_format_t 5334 spdk_bdev_desc_get_dif_pi_format(struct spdk_bdev_desc *desc) 5335 { 5336 struct spdk_bdev *bdev = desc->bdev; 5337 5338 return desc->opts.hide_metadata ? SPDK_DIF_PI_FORMAT_16 : spdk_bdev_get_dif_pi_format(bdev); 5339 } 5340 5341 bool 5342 spdk_bdev_desc_is_dif_head_of_md(struct spdk_bdev_desc *desc) 5343 { 5344 struct spdk_bdev *bdev = desc->bdev; 5345 5346 return desc->opts.hide_metadata ? false : spdk_bdev_is_dif_head_of_md(bdev); 5347 } 5348 5349 bool 5350 spdk_bdev_desc_is_dif_check_enabled(struct spdk_bdev_desc *desc, 5351 enum spdk_dif_check_type check_type) 5352 { 5353 struct spdk_bdev *bdev = desc->bdev; 5354 5355 return desc->opts.hide_metadata ? false : spdk_bdev_is_dif_check_enabled(bdev, check_type); 5356 } 5357 5358 static void bdev_update_qd_sampling_period(void *ctx); 5359 5360 static void 5361 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 5362 { 5363 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 5364 5365 if (bdev->internal.measured_queue_depth) { 5366 bdev->internal.io_time += bdev->internal.period; 5367 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 5368 } 5369 5370 bdev->internal.qd_poll_in_progress = false; 5371 5372 bdev_update_qd_sampling_period(bdev); 5373 } 5374 5375 static void 5376 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5377 struct spdk_io_channel *io_ch, void *_ctx) 5378 { 5379 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 5380 5381 bdev->internal.temporary_queue_depth += ch->io_outstanding; 5382 spdk_bdev_for_each_channel_continue(i, 0); 5383 } 5384 5385 static int 5386 bdev_calculate_measured_queue_depth(void *ctx) 5387 { 5388 struct spdk_bdev *bdev = ctx; 5389 5390 bdev->internal.qd_poll_in_progress = true; 5391 bdev->internal.temporary_queue_depth = 0; 5392 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 5393 return SPDK_POLLER_BUSY; 5394 } 5395 5396 static void 5397 bdev_update_qd_sampling_period(void *ctx) 5398 { 5399 struct spdk_bdev *bdev = ctx; 5400 5401 if (bdev->internal.period == bdev->internal.new_period) { 5402 return; 5403 } 5404 5405 if (bdev->internal.qd_poll_in_progress) { 5406 return; 5407 } 5408 5409 bdev->internal.period = bdev->internal.new_period; 5410 5411 spdk_poller_unregister(&bdev->internal.qd_poller); 5412 if (bdev->internal.period != 0) { 5413 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5414 bdev, bdev->internal.period); 5415 } else { 5416 spdk_bdev_close(bdev->internal.qd_desc); 5417 bdev->internal.qd_desc = NULL; 5418 } 5419 } 5420 5421 static void 5422 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 5423 { 5424 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 5425 } 5426 5427 void 5428 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 5429 { 5430 int rc; 5431 5432 if (bdev->internal.new_period == period) { 5433 return; 5434 } 5435 5436 bdev->internal.new_period = period; 5437 5438 if (bdev->internal.qd_desc != NULL) { 5439 assert(bdev->internal.period != 0); 5440 5441 
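/* The sampling poller and qd_desc are owned by the thread that opened the internal
 * descriptor, so hand the period change off to that thread instead of touching the
 * poller or closing the descriptor from here.
 */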
spdk_thread_send_msg(bdev->internal.qd_desc->thread, 5442 bdev_update_qd_sampling_period, bdev); 5443 return; 5444 } 5445 5446 assert(bdev->internal.period == 0); 5447 5448 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 5449 NULL, &bdev->internal.qd_desc); 5450 if (rc != 0) { 5451 return; 5452 } 5453 5454 bdev->internal.period = period; 5455 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5456 bdev, period); 5457 } 5458 5459 struct bdev_get_current_qd_ctx { 5460 uint64_t current_qd; 5461 spdk_bdev_get_current_qd_cb cb_fn; 5462 void *cb_arg; 5463 }; 5464 5465 static void 5466 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 5467 { 5468 struct bdev_get_current_qd_ctx *ctx = _ctx; 5469 5470 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 5471 5472 free(ctx); 5473 } 5474 5475 static void 5476 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5477 struct spdk_io_channel *io_ch, void *_ctx) 5478 { 5479 struct bdev_get_current_qd_ctx *ctx = _ctx; 5480 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 5481 5482 ctx->current_qd += bdev_ch->io_outstanding; 5483 5484 spdk_bdev_for_each_channel_continue(i, 0); 5485 } 5486 5487 void 5488 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 5489 void *cb_arg) 5490 { 5491 struct bdev_get_current_qd_ctx *ctx; 5492 5493 assert(cb_fn != NULL); 5494 5495 ctx = calloc(1, sizeof(*ctx)); 5496 if (ctx == NULL) { 5497 cb_fn(bdev, 0, cb_arg, -ENOMEM); 5498 return; 5499 } 5500 5501 ctx->cb_fn = cb_fn; 5502 ctx->cb_arg = cb_arg; 5503 5504 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 5505 } 5506 5507 static void 5508 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 5509 { 5510 assert(desc->thread == spdk_get_thread()); 5511 5512 spdk_spin_lock(&desc->spinlock); 5513 desc->refs--; 5514 if (!desc->closed) { 5515 spdk_spin_unlock(&desc->spinlock); 5516 desc->callback.event_fn(type, 5517 desc->bdev, 5518 desc->callback.ctx); 5519 return; 5520 } else if (desc->refs == 0) { 5521 /* This descriptor was closed after this event_notify message was sent. 5522 * spdk_bdev_close() could not free the descriptor since this message was 5523 * in flight, so we free it now using bdev_desc_free(). 
5524 */ 5525 spdk_spin_unlock(&desc->spinlock); 5526 bdev_desc_free(desc); 5527 return; 5528 } 5529 spdk_spin_unlock(&desc->spinlock); 5530 } 5531 5532 static void 5533 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 5534 { 5535 spdk_spin_lock(&desc->spinlock); 5536 desc->refs++; 5537 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 5538 spdk_spin_unlock(&desc->spinlock); 5539 } 5540 5541 static void 5542 _resize_notify(void *ctx) 5543 { 5544 struct spdk_bdev_desc *desc = ctx; 5545 5546 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 5547 } 5548 5549 int 5550 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 5551 { 5552 struct spdk_bdev_desc *desc; 5553 int ret; 5554 5555 if (size == bdev->blockcnt) { 5556 return 0; 5557 } 5558 5559 spdk_spin_lock(&bdev->internal.spinlock); 5560 5561 /* bdev has open descriptors */ 5562 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 5563 bdev->blockcnt > size) { 5564 ret = -EBUSY; 5565 } else { 5566 bdev->blockcnt = size; 5567 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 5568 event_notify(desc, _resize_notify); 5569 } 5570 ret = 0; 5571 } 5572 5573 spdk_spin_unlock(&bdev->internal.spinlock); 5574 5575 return ret; 5576 } 5577 5578 /* 5579 * Convert I/O offset and length from bytes to blocks. 5580 * 5581 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 5582 */ 5583 static uint64_t 5584 bdev_bytes_to_blocks(struct spdk_bdev_desc *desc, uint64_t offset_bytes, 5585 uint64_t *offset_blocks, uint64_t num_bytes, uint64_t *num_blocks) 5586 { 5587 uint32_t block_size = bdev_desc_get_block_size(desc); 5588 uint8_t shift_cnt; 5589 5590 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
*/ 5591 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 5592 shift_cnt = spdk_u32log2(block_size); 5593 *offset_blocks = offset_bytes >> shift_cnt; 5594 *num_blocks = num_bytes >> shift_cnt; 5595 return (offset_bytes - (*offset_blocks << shift_cnt)) | 5596 (num_bytes - (*num_blocks << shift_cnt)); 5597 } else { 5598 *offset_blocks = offset_bytes / block_size; 5599 *num_blocks = num_bytes / block_size; 5600 return (offset_bytes % block_size) | (num_bytes % block_size); 5601 } 5602 } 5603 5604 static bool 5605 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 5606 { 5607 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 5608 * has been an overflow and hence the offset has been wrapped around */ 5609 if (offset_blocks + num_blocks < offset_blocks) { 5610 return false; 5611 } 5612 5613 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5614 if (offset_blocks + num_blocks > bdev->blockcnt) { 5615 return false; 5616 } 5617 5618 return true; 5619 } 5620 5621 static void 5622 bdev_seek_complete_cb(void *ctx) 5623 { 5624 struct spdk_bdev_io *bdev_io = ctx; 5625 5626 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5627 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5628 } 5629 5630 static int 5631 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5632 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5633 spdk_bdev_io_completion_cb cb, void *cb_arg) 5634 { 5635 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5636 struct spdk_bdev_io *bdev_io; 5637 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5638 5639 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5640 5641 /* Check if offset_blocks is valid looking at the validity of one block */ 5642 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5643 return -EINVAL; 5644 } 5645 5646 bdev_io = bdev_channel_get_io(channel); 5647 if (!bdev_io) { 5648 return -ENOMEM; 5649 } 5650 5651 bdev_io->internal.ch = channel; 5652 bdev_io->internal.desc = desc; 5653 bdev_io->type = io_type; 5654 bdev_io->u.bdev.offset_blocks = offset_blocks; 5655 bdev_io->u.bdev.memory_domain = NULL; 5656 bdev_io->u.bdev.memory_domain_ctx = NULL; 5657 bdev_io->u.bdev.accel_sequence = NULL; 5658 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5659 5660 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5661 /* In case bdev doesn't support seek to next data/hole offset, 5662 * it is assumed that only data and no holes are present */ 5663 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5664 bdev_io->u.bdev.seek.offset = offset_blocks; 5665 } else { 5666 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5667 } 5668 5669 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5670 return 0; 5671 } 5672 5673 bdev_io_submit(bdev_io); 5674 return 0; 5675 } 5676 5677 int 5678 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5679 uint64_t offset_blocks, 5680 spdk_bdev_io_completion_cb cb, void *cb_arg) 5681 { 5682 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5683 } 5684 5685 int 5686 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5687 uint64_t offset_blocks, 5688 spdk_bdev_io_completion_cb cb, void *cb_arg) 5689 { 5690 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5691 } 5692 5693 uint64_t 5694 spdk_bdev_io_get_seek_offset(const struct 
spdk_bdev_io *bdev_io) 5695 { 5696 return bdev_io->u.bdev.seek.offset; 5697 } 5698 5699 static int 5700 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5701 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5702 spdk_bdev_io_completion_cb cb, void *cb_arg) 5703 { 5704 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5705 struct spdk_bdev_io *bdev_io; 5706 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5707 5708 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5709 return -EINVAL; 5710 } 5711 5712 bdev_io = bdev_channel_get_io(channel); 5713 if (!bdev_io) { 5714 return -ENOMEM; 5715 } 5716 5717 bdev_io->internal.ch = channel; 5718 bdev_io->internal.desc = desc; 5719 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5720 bdev_io->u.bdev.iovs = &bdev_io->iov; 5721 bdev_io->u.bdev.iovs[0].iov_base = buf; 5722 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev_desc_get_block_size(desc); 5723 bdev_io->u.bdev.iovcnt = 1; 5724 bdev_io->u.bdev.md_buf = md_buf; 5725 bdev_io->u.bdev.num_blocks = num_blocks; 5726 bdev_io->u.bdev.offset_blocks = offset_blocks; 5727 bdev_io->u.bdev.memory_domain = NULL; 5728 bdev_io->u.bdev.memory_domain_ctx = NULL; 5729 bdev_io->u.bdev.accel_sequence = NULL; 5730 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5731 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5732 5733 bdev_io_submit(bdev_io); 5734 return 0; 5735 } 5736 5737 int 5738 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5739 void *buf, uint64_t offset, uint64_t nbytes, 5740 spdk_bdev_io_completion_cb cb, void *cb_arg) 5741 { 5742 uint64_t offset_blocks, num_blocks; 5743 5744 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 5745 return -EINVAL; 5746 } 5747 5748 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5749 } 5750 5751 int 5752 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5753 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5754 spdk_bdev_io_completion_cb cb, void *cb_arg) 5755 { 5756 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5757 } 5758 5759 int 5760 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5761 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5762 spdk_bdev_io_completion_cb cb, void *cb_arg) 5763 { 5764 struct iovec iov = { 5765 .iov_base = buf, 5766 }; 5767 5768 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5769 return -EINVAL; 5770 } 5771 5772 if ((md_buf || desc->opts.hide_metadata) && !_is_buf_allocated(&iov)) { 5773 return -EINVAL; 5774 } 5775 5776 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5777 cb, cb_arg); 5778 } 5779 5780 int 5781 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5782 struct iovec *iov, int iovcnt, 5783 uint64_t offset, uint64_t nbytes, 5784 spdk_bdev_io_completion_cb cb, void *cb_arg) 5785 { 5786 uint64_t offset_blocks, num_blocks; 5787 5788 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 5789 return -EINVAL; 5790 } 5791 5792 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5793 } 5794 5795 static int 5796 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5797 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 5798 uint64_t 
num_blocks, struct spdk_memory_domain *domain, void *domain_ctx, 5799 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 5800 spdk_bdev_io_completion_cb cb, void *cb_arg) 5801 { 5802 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5803 struct spdk_bdev_io *bdev_io; 5804 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5805 5806 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 5807 return -EINVAL; 5808 } 5809 5810 bdev_io = bdev_channel_get_io(channel); 5811 if (spdk_unlikely(!bdev_io)) { 5812 return -ENOMEM; 5813 } 5814 5815 bdev_io->internal.ch = channel; 5816 bdev_io->internal.desc = desc; 5817 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5818 bdev_io->u.bdev.iovs = iov; 5819 bdev_io->u.bdev.iovcnt = iovcnt; 5820 bdev_io->u.bdev.md_buf = md_buf; 5821 bdev_io->u.bdev.num_blocks = num_blocks; 5822 bdev_io->u.bdev.offset_blocks = offset_blocks; 5823 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5824 5825 if (seq != NULL) { 5826 bdev_io->internal.f.has_accel_sequence = true; 5827 bdev_io->internal.accel_sequence = seq; 5828 } 5829 5830 if (domain != NULL) { 5831 bdev_io->internal.f.has_memory_domain = true; 5832 bdev_io->internal.memory_domain = domain; 5833 bdev_io->internal.memory_domain_ctx = domain_ctx; 5834 } 5835 5836 bdev_io->u.bdev.memory_domain = domain; 5837 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5838 bdev_io->u.bdev.accel_sequence = seq; 5839 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 5840 5841 _bdev_io_submit_ext(desc, bdev_io); 5842 5843 return 0; 5844 } 5845 5846 int 5847 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5848 struct iovec *iov, int iovcnt, 5849 uint64_t offset_blocks, uint64_t num_blocks, 5850 spdk_bdev_io_completion_cb cb, void *cb_arg) 5851 { 5852 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5853 5854 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5855 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5856 } 5857 5858 int 5859 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5860 struct iovec *iov, int iovcnt, void *md_buf, 5861 uint64_t offset_blocks, uint64_t num_blocks, 5862 spdk_bdev_io_completion_cb cb, void *cb_arg) 5863 { 5864 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5865 5866 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 5867 return -EINVAL; 5868 } 5869 5870 if (md_buf && !_is_buf_allocated(iov)) { 5871 return -EINVAL; 5872 } 5873 5874 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5875 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5876 } 5877 5878 static inline bool 5879 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 5880 { 5881 /* 5882 * We check that the opts size is at least as large as it was when 5883 * spdk_bdev_ext_io_opts was first introduced (ac6f2bdd8d), since access 5884 * to those members is not checked internally.
5885 */ 5886 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5887 sizeof(opts->metadata) && 5888 opts->size <= sizeof(*opts) && 5889 /* When memory domain is used, the user must provide data buffers */ 5890 (!opts->memory_domain || (iov && iov[0].iov_base)); 5891 } 5892 5893 int 5894 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5895 struct iovec *iov, int iovcnt, 5896 uint64_t offset_blocks, uint64_t num_blocks, 5897 spdk_bdev_io_completion_cb cb, void *cb_arg, 5898 struct spdk_bdev_ext_io_opts *opts) 5899 { 5900 struct spdk_memory_domain *domain = NULL; 5901 struct spdk_accel_sequence *seq = NULL; 5902 void *domain_ctx = NULL, *md = NULL; 5903 uint32_t dif_check_flags = 0; 5904 uint32_t nvme_cdw12_raw; 5905 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5906 5907 if (opts) { 5908 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5909 return -EINVAL; 5910 } 5911 5912 md = opts->metadata; 5913 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5914 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5915 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5916 nvme_cdw12_raw = bdev_get_ext_io_opt(opts, nvme_cdw12.raw, 0); 5917 if (md) { 5918 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5919 return -EINVAL; 5920 } 5921 5922 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5923 return -EINVAL; 5924 } 5925 5926 if (spdk_unlikely(seq != NULL)) { 5927 return -EINVAL; 5928 } 5929 5930 if (nvme_cdw12_raw & SPDK_DIF_FLAGS_NVME_PRACT) { 5931 SPDK_ERRLOG("Separate metadata with NVMe PRACT is not supported.\n"); 5932 return -ENOTSUP; 5933 } 5934 } 5935 5936 if (nvme_cdw12_raw & SPDK_DIF_FLAGS_NVME_PRACT) { 5937 dif_check_flags |= SPDK_DIF_FLAGS_NVME_PRACT; 5938 } 5939 } 5940 5941 dif_check_flags |= bdev->dif_check_flags & 5942 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5943 5944 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5945 num_blocks, domain, domain_ctx, seq, dif_check_flags, cb, cb_arg); 5946 } 5947 5948 static int 5949 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5950 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5951 spdk_bdev_io_completion_cb cb, void *cb_arg) 5952 { 5953 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5954 struct spdk_bdev_io *bdev_io; 5955 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5956 5957 if (!desc->write) { 5958 return -EBADF; 5959 } 5960 5961 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5962 return -EINVAL; 5963 } 5964 5965 bdev_io = bdev_channel_get_io(channel); 5966 if (!bdev_io) { 5967 return -ENOMEM; 5968 } 5969 5970 bdev_io->internal.ch = channel; 5971 bdev_io->internal.desc = desc; 5972 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5973 bdev_io->u.bdev.iovs = &bdev_io->iov; 5974 bdev_io->u.bdev.iovs[0].iov_base = buf; 5975 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev_desc_get_block_size(desc); 5976 bdev_io->u.bdev.iovcnt = 1; 5977 bdev_io->u.bdev.md_buf = md_buf; 5978 bdev_io->u.bdev.num_blocks = num_blocks; 5979 bdev_io->u.bdev.offset_blocks = offset_blocks; 5980 bdev_io->u.bdev.memory_domain = NULL; 5981 bdev_io->u.bdev.memory_domain_ctx = NULL; 5982 bdev_io->u.bdev.accel_sequence = NULL; 5983 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5984 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5985 5986 bdev_io_submit(bdev_io); 5987 return 0; 5988 } 5989 5990 int 5991 spdk_bdev_write(struct spdk_bdev_desc 
*desc, struct spdk_io_channel *ch, 5992 void *buf, uint64_t offset, uint64_t nbytes, 5993 spdk_bdev_io_completion_cb cb, void *cb_arg) 5994 { 5995 uint64_t offset_blocks, num_blocks; 5996 5997 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 5998 return -EINVAL; 5999 } 6000 6001 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 6002 } 6003 6004 int 6005 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6006 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 6007 spdk_bdev_io_completion_cb cb, void *cb_arg) 6008 { 6009 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 6010 cb, cb_arg); 6011 } 6012 6013 int 6014 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6015 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 6016 spdk_bdev_io_completion_cb cb, void *cb_arg) 6017 { 6018 struct iovec iov = { 6019 .iov_base = buf, 6020 }; 6021 6022 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 6023 return -EINVAL; 6024 } 6025 6026 if (md_buf && !_is_buf_allocated(&iov)) { 6027 return -EINVAL; 6028 } 6029 6030 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 6031 cb, cb_arg); 6032 } 6033 6034 static int 6035 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6036 struct iovec *iov, int iovcnt, void *md_buf, 6037 uint64_t offset_blocks, uint64_t num_blocks, 6038 struct spdk_memory_domain *domain, void *domain_ctx, 6039 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 6040 uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw, 6041 spdk_bdev_io_completion_cb cb, void *cb_arg) 6042 { 6043 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6044 struct spdk_bdev_io *bdev_io; 6045 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6046 6047 if (spdk_unlikely(!desc->write)) { 6048 return -EBADF; 6049 } 6050 6051 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 6052 return -EINVAL; 6053 } 6054 6055 bdev_io = bdev_channel_get_io(channel); 6056 if (spdk_unlikely(!bdev_io)) { 6057 return -ENOMEM; 6058 } 6059 6060 bdev_io->internal.ch = channel; 6061 bdev_io->internal.desc = desc; 6062 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 6063 bdev_io->u.bdev.iovs = iov; 6064 bdev_io->u.bdev.iovcnt = iovcnt; 6065 bdev_io->u.bdev.md_buf = md_buf; 6066 bdev_io->u.bdev.num_blocks = num_blocks; 6067 bdev_io->u.bdev.offset_blocks = offset_blocks; 6068 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6069 if (seq != NULL) { 6070 bdev_io->internal.f.has_accel_sequence = true; 6071 bdev_io->internal.accel_sequence = seq; 6072 } 6073 6074 if (domain != NULL) { 6075 bdev_io->internal.f.has_memory_domain = true; 6076 bdev_io->internal.memory_domain = domain; 6077 bdev_io->internal.memory_domain_ctx = domain_ctx; 6078 } 6079 6080 bdev_io->u.bdev.memory_domain = domain; 6081 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 6082 bdev_io->u.bdev.accel_sequence = seq; 6083 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 6084 bdev_io->u.bdev.nvme_cdw12.raw = nvme_cdw12_raw; 6085 bdev_io->u.bdev.nvme_cdw13.raw = nvme_cdw13_raw; 6086 6087 _bdev_io_submit_ext(desc, bdev_io); 6088 6089 return 0; 6090 } 6091 6092 int 6093 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6094 struct iovec *iov, int iovcnt, 6095 uint64_t offset, uint64_t len, 6096 spdk_bdev_io_completion_cb cb, void *cb_arg) 6097 { 6098 
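/* Byte-based variant of spdk_bdev_writev_blocks(): offset and len must be
 * multiples of the descriptor's block size, otherwise bdev_bytes_to_blocks()
 * below reports a remainder and the call fails with -EINVAL.
 *
 * Minimal caller-side sketch (illustrative only; my_ch, my_buf and write_done
 * are hypothetical caller names and a 512-byte block size is assumed):
 *
 *   struct iovec v = { .iov_base = my_buf, .iov_len = 8 * 512 };
 *   int rc = spdk_bdev_writev(desc, my_ch, &v, 1, 0, 8 * 512, write_done, NULL);
 *   if (rc == -ENOMEM) {
 *       // no spdk_bdev_io available right now; retry via spdk_bdev_queue_io_wait()
 *   }
 */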
uint64_t offset_blocks, num_blocks; 6099 6100 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, len, &num_blocks) != 0) { 6101 return -EINVAL; 6102 } 6103 6104 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 6105 } 6106 6107 int 6108 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6109 struct iovec *iov, int iovcnt, 6110 uint64_t offset_blocks, uint64_t num_blocks, 6111 spdk_bdev_io_completion_cb cb, void *cb_arg) 6112 { 6113 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6114 6115 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 6116 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 6117 cb, cb_arg); 6118 } 6119 6120 int 6121 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6122 struct iovec *iov, int iovcnt, void *md_buf, 6123 uint64_t offset_blocks, uint64_t num_blocks, 6124 spdk_bdev_io_completion_cb cb, void *cb_arg) 6125 { 6126 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6127 6128 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 6129 return -EINVAL; 6130 } 6131 6132 if (md_buf && !_is_buf_allocated(iov)) { 6133 return -EINVAL; 6134 } 6135 6136 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 6137 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 6138 cb, cb_arg); 6139 } 6140 6141 int 6142 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6143 struct iovec *iov, int iovcnt, 6144 uint64_t offset_blocks, uint64_t num_blocks, 6145 spdk_bdev_io_completion_cb cb, void *cb_arg, 6146 struct spdk_bdev_ext_io_opts *opts) 6147 { 6148 struct spdk_memory_domain *domain = NULL; 6149 struct spdk_accel_sequence *seq = NULL; 6150 void *domain_ctx = NULL, *md = NULL; 6151 uint32_t dif_check_flags = 0; 6152 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6153 uint32_t nvme_cdw12_raw = 0; 6154 uint32_t nvme_cdw13_raw = 0; 6155 6156 if (opts) { 6157 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 6158 return -EINVAL; 6159 } 6160 md = opts->metadata; 6161 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 6162 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 6163 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 6164 nvme_cdw12_raw = bdev_get_ext_io_opt(opts, nvme_cdw12.raw, 0); 6165 nvme_cdw13_raw = bdev_get_ext_io_opt(opts, nvme_cdw13.raw, 0); 6166 if (md) { 6167 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 6168 return -EINVAL; 6169 } 6170 6171 if (spdk_unlikely(!_is_buf_allocated(iov))) { 6172 return -EINVAL; 6173 } 6174 6175 if (spdk_unlikely(seq != NULL)) { 6176 return -EINVAL; 6177 } 6178 6179 if (nvme_cdw12_raw & SPDK_DIF_FLAGS_NVME_PRACT) { 6180 SPDK_ERRLOG("Separate metadata with NVMe PRACT is not supported.\n"); 6181 return -ENOTSUP; 6182 } 6183 } 6184 6185 if (nvme_cdw12_raw & SPDK_DIF_FLAGS_NVME_PRACT) { 6186 dif_check_flags |= SPDK_DIF_FLAGS_NVME_PRACT; 6187 } 6188 } 6189 6190 dif_check_flags |= bdev->dif_check_flags & 6191 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 6192 6193 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 6194 domain, domain_ctx, seq, dif_check_flags, 6195 nvme_cdw12_raw, nvme_cdw13_raw, cb, cb_arg); 6196 } 6197 6198 static void 6199 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6200 { 6201 struct spdk_bdev_io *parent_io = cb_arg; 6202 struct spdk_bdev *bdev = parent_io->bdev; 
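/* Completion of the emulated COMPARE path: the requested range was read back
 * into the child read I/O's single iovec (read_buf below) and is now compared
 * chunk by chunk against the parent COMPARE request's iovecs; separate
 * metadata, if present, is compared as well. Any mismatch completes the
 * parent with SPDK_BDEV_IO_STATUS_MISCOMPARE.
 */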
6203 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 6204 int i, rc = 0; 6205 6206 if (!success) { 6207 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6208 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 6209 spdk_bdev_free_io(bdev_io); 6210 return; 6211 } 6212 6213 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 6214 rc = memcmp(read_buf, 6215 parent_io->u.bdev.iovs[i].iov_base, 6216 parent_io->u.bdev.iovs[i].iov_len); 6217 if (rc) { 6218 break; 6219 } 6220 read_buf += parent_io->u.bdev.iovs[i].iov_len; 6221 } 6222 6223 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 6224 rc = memcmp(bdev_io->u.bdev.md_buf, 6225 parent_io->u.bdev.md_buf, 6226 spdk_bdev_get_md_size(bdev)); 6227 } 6228 6229 spdk_bdev_free_io(bdev_io); 6230 6231 if (rc == 0) { 6232 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6233 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 6234 } else { 6235 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 6236 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 6237 } 6238 } 6239 6240 static void 6241 bdev_compare_do_read(void *_bdev_io) 6242 { 6243 struct spdk_bdev_io *bdev_io = _bdev_io; 6244 int rc; 6245 6246 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 6247 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 6248 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6249 bdev_compare_do_read_done, bdev_io); 6250 6251 if (rc == -ENOMEM) { 6252 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 6253 } else if (rc != 0) { 6254 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6255 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6256 } 6257 } 6258 6259 static int 6260 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6261 struct iovec *iov, int iovcnt, void *md_buf, 6262 uint64_t offset_blocks, uint64_t num_blocks, 6263 spdk_bdev_io_completion_cb cb, void *cb_arg) 6264 { 6265 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6266 struct spdk_bdev_io *bdev_io; 6267 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6268 6269 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6270 return -EINVAL; 6271 } 6272 6273 bdev_io = bdev_channel_get_io(channel); 6274 if (!bdev_io) { 6275 return -ENOMEM; 6276 } 6277 6278 bdev_io->internal.ch = channel; 6279 bdev_io->internal.desc = desc; 6280 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 6281 bdev_io->u.bdev.iovs = iov; 6282 bdev_io->u.bdev.iovcnt = iovcnt; 6283 bdev_io->u.bdev.md_buf = md_buf; 6284 bdev_io->u.bdev.num_blocks = num_blocks; 6285 bdev_io->u.bdev.offset_blocks = offset_blocks; 6286 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6287 bdev_io->u.bdev.memory_domain = NULL; 6288 bdev_io->u.bdev.memory_domain_ctx = NULL; 6289 bdev_io->u.bdev.accel_sequence = NULL; 6290 6291 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 6292 bdev_io_submit(bdev_io); 6293 return 0; 6294 } 6295 6296 bdev_compare_do_read(bdev_io); 6297 6298 return 0; 6299 } 6300 6301 int 6302 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6303 struct iovec *iov, int iovcnt, 6304 uint64_t offset_blocks, uint64_t num_blocks, 6305 spdk_bdev_io_completion_cb cb, void *cb_arg) 6306 { 6307 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 6308 num_blocks, cb, cb_arg); 6309 } 6310 6311 int 6312 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct 
spdk_io_channel *ch, 6313 struct iovec *iov, int iovcnt, void *md_buf, 6314 uint64_t offset_blocks, uint64_t num_blocks, 6315 spdk_bdev_io_completion_cb cb, void *cb_arg) 6316 { 6317 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 6318 return -EINVAL; 6319 } 6320 6321 if (md_buf && !_is_buf_allocated(iov)) { 6322 return -EINVAL; 6323 } 6324 6325 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 6326 num_blocks, cb, cb_arg); 6327 } 6328 6329 static int 6330 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6331 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 6332 spdk_bdev_io_completion_cb cb, void *cb_arg) 6333 { 6334 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6335 struct spdk_bdev_io *bdev_io; 6336 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6337 6338 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6339 return -EINVAL; 6340 } 6341 6342 bdev_io = bdev_channel_get_io(channel); 6343 if (!bdev_io) { 6344 return -ENOMEM; 6345 } 6346 6347 bdev_io->internal.ch = channel; 6348 bdev_io->internal.desc = desc; 6349 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 6350 bdev_io->u.bdev.iovs = &bdev_io->iov; 6351 bdev_io->u.bdev.iovs[0].iov_base = buf; 6352 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev_desc_get_block_size(desc); 6353 bdev_io->u.bdev.iovcnt = 1; 6354 bdev_io->u.bdev.md_buf = md_buf; 6355 bdev_io->u.bdev.num_blocks = num_blocks; 6356 bdev_io->u.bdev.offset_blocks = offset_blocks; 6357 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6358 bdev_io->u.bdev.memory_domain = NULL; 6359 bdev_io->u.bdev.memory_domain_ctx = NULL; 6360 bdev_io->u.bdev.accel_sequence = NULL; 6361 6362 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 6363 bdev_io_submit(bdev_io); 6364 return 0; 6365 } 6366 6367 bdev_compare_do_read(bdev_io); 6368 6369 return 0; 6370 } 6371 6372 int 6373 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6374 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 6375 spdk_bdev_io_completion_cb cb, void *cb_arg) 6376 { 6377 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 6378 cb, cb_arg); 6379 } 6380 6381 int 6382 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6383 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 6384 spdk_bdev_io_completion_cb cb, void *cb_arg) 6385 { 6386 struct iovec iov = { 6387 .iov_base = buf, 6388 }; 6389 6390 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 6391 return -EINVAL; 6392 } 6393 6394 if (md_buf && !_is_buf_allocated(&iov)) { 6395 return -EINVAL; 6396 } 6397 6398 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 6399 cb, cb_arg); 6400 } 6401 6402 static void 6403 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status) 6404 { 6405 struct spdk_bdev_io *bdev_io = ctx; 6406 6407 if (unlock_status) { 6408 SPDK_ERRLOG("LBA range unlock failed\n"); 6409 } 6410 6411 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? 
true : 6412 false, bdev_io->internal.caller_ctx); 6413 } 6414 6415 static void 6416 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 6417 { 6418 bdev_io->internal.status = status; 6419 6420 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 6421 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6422 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 6423 } 6424 6425 static void 6426 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6427 { 6428 struct spdk_bdev_io *parent_io = cb_arg; 6429 6430 if (!success) { 6431 SPDK_ERRLOG("Compare and write operation failed\n"); 6432 } 6433 6434 spdk_bdev_free_io(bdev_io); 6435 6436 bdev_comparev_and_writev_blocks_unlock(parent_io, 6437 success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 6438 } 6439 6440 static void 6441 bdev_compare_and_write_do_write(void *_bdev_io) 6442 { 6443 struct spdk_bdev_io *bdev_io = _bdev_io; 6444 int rc; 6445 6446 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 6447 spdk_io_channel_from_ctx(bdev_io->internal.ch), 6448 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 6449 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6450 bdev_compare_and_write_do_write_done, bdev_io); 6451 6452 6453 if (rc == -ENOMEM) { 6454 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 6455 } else if (rc != 0) { 6456 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6457 } 6458 } 6459 6460 static void 6461 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6462 { 6463 struct spdk_bdev_io *parent_io = cb_arg; 6464 6465 spdk_bdev_free_io(bdev_io); 6466 6467 if (!success) { 6468 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 6469 return; 6470 } 6471 6472 bdev_compare_and_write_do_write(parent_io); 6473 } 6474 6475 static void 6476 bdev_compare_and_write_do_compare(void *_bdev_io) 6477 { 6478 struct spdk_bdev_io *bdev_io = _bdev_io; 6479 int rc; 6480 6481 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 6482 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 6483 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6484 bdev_compare_and_write_do_compare_done, bdev_io); 6485 6486 if (rc == -ENOMEM) { 6487 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 6488 } else if (rc != 0) { 6489 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 6490 } 6491 } 6492 6493 static void 6494 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status) 6495 { 6496 struct spdk_bdev_io *bdev_io = ctx; 6497 6498 if (status) { 6499 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 6500 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6501 return; 6502 } 6503 6504 bdev_compare_and_write_do_compare(bdev_io); 6505 } 6506 6507 int 6508 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6509 struct iovec *compare_iov, int compare_iovcnt, 6510 struct iovec *write_iov, int write_iovcnt, 6511 uint64_t offset_blocks, uint64_t num_blocks, 6512 spdk_bdev_io_completion_cb cb, void *cb_arg) 6513 { 6514 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6515 struct spdk_bdev_io *bdev_io; 6516 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6517 6518 if 
(!desc->write) { 6519 return -EBADF; 6520 } 6521 6522 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6523 return -EINVAL; 6524 } 6525 6526 if (num_blocks > bdev->acwu) { 6527 return -EINVAL; 6528 } 6529 6530 bdev_io = bdev_channel_get_io(channel); 6531 if (!bdev_io) { 6532 return -ENOMEM; 6533 } 6534 6535 bdev_io->internal.ch = channel; 6536 bdev_io->internal.desc = desc; 6537 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 6538 bdev_io->u.bdev.iovs = compare_iov; 6539 bdev_io->u.bdev.iovcnt = compare_iovcnt; 6540 bdev_io->u.bdev.fused_iovs = write_iov; 6541 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 6542 bdev_io->u.bdev.md_buf = NULL; 6543 bdev_io->u.bdev.num_blocks = num_blocks; 6544 bdev_io->u.bdev.offset_blocks = offset_blocks; 6545 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6546 bdev_io->u.bdev.memory_domain = NULL; 6547 bdev_io->u.bdev.memory_domain_ctx = NULL; 6548 bdev_io->u.bdev.accel_sequence = NULL; 6549 6550 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 6551 bdev_io_submit(bdev_io); 6552 return 0; 6553 } 6554 6555 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 6556 bdev_comparev_and_writev_blocks_locked, bdev_io); 6557 } 6558 6559 int 6560 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6561 struct iovec *iov, int iovcnt, 6562 uint64_t offset_blocks, uint64_t num_blocks, 6563 bool populate, 6564 spdk_bdev_io_completion_cb cb, void *cb_arg) 6565 { 6566 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6567 struct spdk_bdev_io *bdev_io; 6568 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6569 6570 if (!desc->write) { 6571 return -EBADF; 6572 } 6573 6574 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6575 return -EINVAL; 6576 } 6577 6578 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 6579 return -ENOTSUP; 6580 } 6581 6582 bdev_io = bdev_channel_get_io(channel); 6583 if (!bdev_io) { 6584 return -ENOMEM; 6585 } 6586 6587 bdev_io->internal.ch = channel; 6588 bdev_io->internal.desc = desc; 6589 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 6590 bdev_io->u.bdev.num_blocks = num_blocks; 6591 bdev_io->u.bdev.offset_blocks = offset_blocks; 6592 bdev_io->u.bdev.iovs = iov; 6593 bdev_io->u.bdev.iovcnt = iovcnt; 6594 bdev_io->u.bdev.md_buf = NULL; 6595 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 6596 bdev_io->u.bdev.zcopy.commit = 0; 6597 bdev_io->u.bdev.zcopy.start = 1; 6598 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6599 bdev_io->u.bdev.memory_domain = NULL; 6600 bdev_io->u.bdev.memory_domain_ctx = NULL; 6601 bdev_io->u.bdev.accel_sequence = NULL; 6602 6603 bdev_io_submit(bdev_io); 6604 6605 return 0; 6606 } 6607 6608 int 6609 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 6610 spdk_bdev_io_completion_cb cb, void *cb_arg) 6611 { 6612 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 6613 return -EINVAL; 6614 } 6615 6616 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 6617 bdev_io->u.bdev.zcopy.start = 0; 6618 bdev_io->internal.caller_ctx = cb_arg; 6619 bdev_io->internal.cb = cb; 6620 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 6621 6622 bdev_io_submit(bdev_io); 6623 6624 return 0; 6625 } 6626 6627 int 6628 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6629 uint64_t offset, uint64_t len, 6630 spdk_bdev_io_completion_cb cb, void *cb_arg) 6631 { 6632 uint64_t offset_blocks, num_blocks; 6633 6634 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, len, &num_blocks) != 0) { 6635 return -EINVAL; 6636 } 6637 6638 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6639 } 6640 6641 int 6642 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6643 uint64_t offset_blocks, uint64_t num_blocks, 6644 spdk_bdev_io_completion_cb cb, void *cb_arg) 6645 { 6646 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6647 struct spdk_bdev_io *bdev_io; 6648 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6649 6650 if (!desc->write) { 6651 return -EBADF; 6652 } 6653 6654 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6655 return -EINVAL; 6656 } 6657 6658 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 6659 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 6660 return -ENOTSUP; 6661 } 6662 6663 bdev_io = bdev_channel_get_io(channel); 6664 6665 if (!bdev_io) { 6666 return -ENOMEM; 6667 } 6668 6669 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 6670 bdev_io->internal.ch = channel; 6671 bdev_io->internal.desc = desc; 6672 bdev_io->u.bdev.offset_blocks = offset_blocks; 6673 bdev_io->u.bdev.num_blocks = num_blocks; 6674 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6675 bdev_io->u.bdev.memory_domain = NULL; 6676 bdev_io->u.bdev.memory_domain_ctx = NULL; 6677 bdev_io->u.bdev.accel_sequence = NULL; 6678 6679 /* If the write_zeroes size is large and should be split, use the generic split 6680 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported or not. 6681 * 6682 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported 6683 * or emulate it using regular write requests otherwise.
6684 */ 6685 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 6686 bdev_io->internal.f.split) { 6687 bdev_io_submit(bdev_io); 6688 return 0; 6689 } 6690 6691 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6692 6693 return bdev_write_zero_buffer(bdev_io); 6694 } 6695 6696 int 6697 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6698 uint64_t offset, uint64_t nbytes, 6699 spdk_bdev_io_completion_cb cb, void *cb_arg) 6700 { 6701 uint64_t offset_blocks, num_blocks; 6702 6703 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, nbytes, &num_blocks) != 0) { 6704 return -EINVAL; 6705 } 6706 6707 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6708 } 6709 6710 static void 6711 bdev_io_complete_cb(void *ctx) 6712 { 6713 struct spdk_bdev_io *bdev_io = ctx; 6714 6715 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6716 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 6717 } 6718 6719 int 6720 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6721 uint64_t offset_blocks, uint64_t num_blocks, 6722 spdk_bdev_io_completion_cb cb, void *cb_arg) 6723 { 6724 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6725 struct spdk_bdev_io *bdev_io; 6726 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6727 6728 if (!desc->write) { 6729 return -EBADF; 6730 } 6731 6732 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6733 return -EINVAL; 6734 } 6735 6736 bdev_io = bdev_channel_get_io(channel); 6737 if (!bdev_io) { 6738 return -ENOMEM; 6739 } 6740 6741 bdev_io->internal.ch = channel; 6742 bdev_io->internal.desc = desc; 6743 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6744 6745 bdev_io->u.bdev.iovs = &bdev_io->iov; 6746 bdev_io->u.bdev.iovs[0].iov_base = NULL; 6747 bdev_io->u.bdev.iovs[0].iov_len = 0; 6748 bdev_io->u.bdev.iovcnt = 1; 6749 6750 bdev_io->u.bdev.offset_blocks = offset_blocks; 6751 bdev_io->u.bdev.num_blocks = num_blocks; 6752 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6753 bdev_io->u.bdev.memory_domain = NULL; 6754 bdev_io->u.bdev.memory_domain_ctx = NULL; 6755 bdev_io->u.bdev.accel_sequence = NULL; 6756 6757 if (num_blocks == 0) { 6758 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 6759 return 0; 6760 } 6761 6762 bdev_io_submit(bdev_io); 6763 return 0; 6764 } 6765 6766 int 6767 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6768 uint64_t offset, uint64_t length, 6769 spdk_bdev_io_completion_cb cb, void *cb_arg) 6770 { 6771 uint64_t offset_blocks, num_blocks; 6772 6773 if (bdev_bytes_to_blocks(desc, offset, &offset_blocks, length, &num_blocks) != 0) { 6774 return -EINVAL; 6775 } 6776 6777 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6778 } 6779 6780 int 6781 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6782 uint64_t offset_blocks, uint64_t num_blocks, 6783 spdk_bdev_io_completion_cb cb, void *cb_arg) 6784 { 6785 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6786 struct spdk_bdev_io *bdev_io; 6787 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6788 6789 if (!desc->write) { 6790 return -EBADF; 6791 } 6792 6793 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_FLUSH))) { 6794 return -ENOTSUP; 6795 } 6796 6797 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6798 return -EINVAL; 6799 } 6800 6801 bdev_io = bdev_channel_get_io(channel); 6802 if (!bdev_io) { 6803 return 
-ENOMEM; 6804 } 6805 6806 bdev_io->internal.ch = channel; 6807 bdev_io->internal.desc = desc; 6808 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 6809 bdev_io->u.bdev.iovs = NULL; 6810 bdev_io->u.bdev.iovcnt = 0; 6811 bdev_io->u.bdev.offset_blocks = offset_blocks; 6812 bdev_io->u.bdev.num_blocks = num_blocks; 6813 bdev_io->u.bdev.memory_domain = NULL; 6814 bdev_io->u.bdev.memory_domain_ctx = NULL; 6815 bdev_io->u.bdev.accel_sequence = NULL; 6816 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6817 6818 bdev_io_submit(bdev_io); 6819 return 0; 6820 } 6821 6822 static int bdev_reset_poll_for_outstanding_io(void *ctx); 6823 6824 static void 6825 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 6826 { 6827 struct spdk_bdev_io *bdev_io = _ctx; 6828 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 6829 6830 if (status == -EBUSY) { 6831 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 6832 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 6833 bdev_io, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 6834 } else { 6835 if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) { 6836 /* If outstanding IOs are still present and reset_io_drain_timeout 6837 * seconds passed, start the reset. */ 6838 bdev_io_submit_reset(bdev_io); 6839 } else { 6840 /* We still have in progress memory domain pull/push or we're 6841 * executing accel sequence. Since we cannot abort either of those 6842 * operations, fail the reset request. */ 6843 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6844 } 6845 } 6846 } else { 6847 SPDK_DEBUGLOG(bdev, 6848 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 6849 ch->bdev->name); 6850 /* Mark the completion status as a SUCCESS and complete the reset. */ 6851 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 6852 } 6853 } 6854 6855 static void 6856 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6857 struct spdk_io_channel *io_ch, void *_ctx) 6858 { 6859 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 6860 int status = 0; 6861 6862 if (cur_ch->io_outstanding > 0 || 6863 !TAILQ_EMPTY(&cur_ch->io_memory_domain) || 6864 !TAILQ_EMPTY(&cur_ch->io_accel_exec)) { 6865 /* If a channel has outstanding IO, set status to -EBUSY code. This will stop 6866 * further iteration over the rest of the channels and pass non-zero status 6867 * to the callback function. 
*/ 6868 status = -EBUSY; 6869 } 6870 spdk_bdev_for_each_channel_continue(i, status); 6871 } 6872 6873 static int 6874 bdev_reset_poll_for_outstanding_io(void *ctx) 6875 { 6876 struct spdk_bdev_io *bdev_io = ctx; 6877 6878 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6879 spdk_bdev_for_each_channel(bdev_io->bdev, bdev_reset_check_outstanding_io, bdev_io, 6880 bdev_reset_check_outstanding_io_done); 6881 6882 return SPDK_POLLER_BUSY; 6883 } 6884 6885 static void 6886 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6887 { 6888 struct spdk_bdev_io *bdev_io = _ctx; 6889 6890 if (bdev->reset_io_drain_timeout == 0) { 6891 bdev_io_submit_reset(bdev_io); 6892 return; 6893 } 6894 6895 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6896 (bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6897 6898 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6899 * submit the reset to the underlying module only if outstanding I/O 6900 * remain after reset_io_drain_timeout seconds have passed. */ 6901 spdk_bdev_for_each_channel(bdev, bdev_reset_check_outstanding_io, bdev_io, 6902 bdev_reset_check_outstanding_io_done); 6903 } 6904 6905 static void 6906 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6907 struct spdk_io_channel *ch, void *_ctx) 6908 { 6909 struct spdk_bdev_channel *channel; 6910 struct spdk_bdev_mgmt_channel *mgmt_channel; 6911 struct spdk_bdev_shared_resource *shared_resource; 6912 bdev_io_tailq_t tmp_queued; 6913 6914 TAILQ_INIT(&tmp_queued); 6915 6916 channel = __io_ch_to_bdev_ch(ch); 6917 shared_resource = channel->shared_resource; 6918 mgmt_channel = shared_resource->mgmt_ch; 6919 6920 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6921 6922 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6923 TAILQ_SWAP(&channel->qos_queued_io, &tmp_queued, spdk_bdev_io, internal.link); 6924 } 6925 6926 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6927 bdev_abort_all_buf_io(mgmt_channel, channel); 6928 bdev_abort_all_queued_io(&tmp_queued, channel); 6929 6930 spdk_bdev_for_each_channel_continue(i, 0); 6931 } 6932 6933 static void 6934 bdev_start_reset(struct spdk_bdev_io *bdev_io) 6935 { 6936 struct spdk_bdev *bdev = bdev_io->bdev; 6937 bool freeze_channel = false; 6938 6939 bdev_ch_add_to_io_submitted(bdev_io); 6940 6941 /** 6942 * Take a channel reference for the target bdev for the life of this 6943 * reset. This guards against the channel getting destroyed before 6944 * the reset is completed. We will release the reference when this 6945 * reset is completed. 
6946 */ 6947 bdev_io->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6948 6949 spdk_spin_lock(&bdev->internal.spinlock); 6950 if (bdev->internal.reset_in_progress == NULL) { 6951 bdev->internal.reset_in_progress = bdev_io; 6952 freeze_channel = true; 6953 } else { 6954 TAILQ_INSERT_TAIL(&bdev->internal.queued_resets, bdev_io, internal.link); 6955 } 6956 spdk_spin_unlock(&bdev->internal.spinlock); 6957 6958 if (freeze_channel) { 6959 spdk_bdev_for_each_channel(bdev, bdev_reset_freeze_channel, bdev_io, 6960 bdev_reset_freeze_channel_done); 6961 } 6962 } 6963 6964 int 6965 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6966 spdk_bdev_io_completion_cb cb, void *cb_arg) 6967 { 6968 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6969 struct spdk_bdev_io *bdev_io; 6970 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6971 6972 bdev_io = bdev_channel_get_io(channel); 6973 if (!bdev_io) { 6974 return -ENOMEM; 6975 } 6976 6977 bdev_io->internal.ch = channel; 6978 bdev_io->internal.desc = desc; 6979 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6980 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6981 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6982 6983 bdev_start_reset(bdev_io); 6984 return 0; 6985 } 6986 6987 void 6988 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6989 struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode reset_mode) 6990 { 6991 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6992 6993 bdev_get_io_stat(stat, channel->stat); 6994 spdk_bdev_reset_io_stat(channel->stat, reset_mode); 6995 } 6996 6997 static void 6998 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6999 { 7000 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 7001 7002 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 7003 bdev_iostat_ctx->cb_arg, 0); 7004 free(bdev_iostat_ctx); 7005 } 7006 7007 static void 7008 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7009 struct spdk_io_channel *ch, void *_ctx) 7010 { 7011 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 7012 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7013 7014 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 7015 spdk_bdev_reset_io_stat(channel->stat, bdev_iostat_ctx->reset_mode); 7016 spdk_bdev_for_each_channel_continue(i, 0); 7017 } 7018 7019 void 7020 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 7021 enum spdk_bdev_reset_stat_mode reset_mode, spdk_bdev_get_device_stat_cb cb, void *cb_arg) 7022 { 7023 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 7024 7025 assert(bdev != NULL); 7026 assert(stat != NULL); 7027 assert(cb != NULL); 7028 7029 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 7030 if (bdev_iostat_ctx == NULL) { 7031 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 7032 cb(bdev, stat, cb_arg, -ENOMEM); 7033 return; 7034 } 7035 7036 bdev_iostat_ctx->stat = stat; 7037 bdev_iostat_ctx->cb = cb; 7038 bdev_iostat_ctx->cb_arg = cb_arg; 7039 bdev_iostat_ctx->reset_mode = reset_mode; 7040 7041 /* Start with the statistics from previously deleted channels. */ 7042 spdk_spin_lock(&bdev->internal.spinlock); 7043 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 7044 spdk_bdev_reset_io_stat(bdev->internal.stat, reset_mode); 7045 spdk_spin_unlock(&bdev->internal.spinlock); 7046 7047 /* Then iterate and add the statistics from each existing channel. 
*/ 7048 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 7049 bdev_get_device_stat_done); 7050 } 7051 7052 struct bdev_iostat_reset_ctx { 7053 enum spdk_bdev_reset_stat_mode mode; 7054 bdev_reset_device_stat_cb cb; 7055 void *cb_arg; 7056 }; 7057 7058 static void 7059 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 7060 { 7061 struct bdev_iostat_reset_ctx *ctx = _ctx; 7062 7063 ctx->cb(bdev, ctx->cb_arg, 0); 7064 7065 free(ctx); 7066 } 7067 7068 static void 7069 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7070 struct spdk_io_channel *ch, void *_ctx) 7071 { 7072 struct bdev_iostat_reset_ctx *ctx = _ctx; 7073 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7074 7075 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 7076 7077 spdk_bdev_for_each_channel_continue(i, 0); 7078 } 7079 7080 void 7081 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 7082 bdev_reset_device_stat_cb cb, void *cb_arg) 7083 { 7084 struct bdev_iostat_reset_ctx *ctx; 7085 7086 assert(bdev != NULL); 7087 assert(cb != NULL); 7088 7089 ctx = calloc(1, sizeof(*ctx)); 7090 if (ctx == NULL) { 7091 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 7092 cb(bdev, cb_arg, -ENOMEM); 7093 return; 7094 } 7095 7096 ctx->mode = mode; 7097 ctx->cb = cb; 7098 ctx->cb_arg = cb_arg; 7099 7100 spdk_spin_lock(&bdev->internal.spinlock); 7101 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 7102 spdk_spin_unlock(&bdev->internal.spinlock); 7103 7104 spdk_bdev_for_each_channel(bdev, 7105 bdev_reset_each_channel_stat, 7106 ctx, 7107 bdev_reset_device_stat_done); 7108 } 7109 7110 int 7111 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 7112 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 7113 spdk_bdev_io_completion_cb cb, void *cb_arg) 7114 { 7115 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7116 struct spdk_bdev_io *bdev_io; 7117 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7118 7119 if (!desc->write) { 7120 return -EBADF; 7121 } 7122 7123 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 7124 return -ENOTSUP; 7125 } 7126 7127 bdev_io = bdev_channel_get_io(channel); 7128 if (!bdev_io) { 7129 return -ENOMEM; 7130 } 7131 7132 bdev_io->internal.ch = channel; 7133 bdev_io->internal.desc = desc; 7134 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 7135 bdev_io->u.nvme_passthru.cmd = *cmd; 7136 bdev_io->u.nvme_passthru.buf = buf; 7137 bdev_io->u.nvme_passthru.nbytes = nbytes; 7138 bdev_io->u.nvme_passthru.md_buf = NULL; 7139 bdev_io->u.nvme_passthru.md_len = 0; 7140 7141 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7142 7143 bdev_io_submit(bdev_io); 7144 return 0; 7145 } 7146 7147 int 7148 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 7149 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 7150 spdk_bdev_io_completion_cb cb, void *cb_arg) 7151 { 7152 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7153 struct spdk_bdev_io *bdev_io; 7154 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7155 7156 if (!desc->write) { 7157 /* 7158 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 7159 * to easily determine if the command is a read or write, but for now just 7160 * do not allow io_passthru with a read-only descriptor. 
7161 */ 7162 return -EBADF; 7163 } 7164 7165 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 7166 return -ENOTSUP; 7167 } 7168 7169 bdev_io = bdev_channel_get_io(channel); 7170 if (!bdev_io) { 7171 return -ENOMEM; 7172 } 7173 7174 bdev_io->internal.ch = channel; 7175 bdev_io->internal.desc = desc; 7176 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 7177 bdev_io->u.nvme_passthru.cmd = *cmd; 7178 bdev_io->u.nvme_passthru.buf = buf; 7179 bdev_io->u.nvme_passthru.nbytes = nbytes; 7180 bdev_io->u.nvme_passthru.md_buf = NULL; 7181 bdev_io->u.nvme_passthru.md_len = 0; 7182 7183 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7184 7185 bdev_io_submit(bdev_io); 7186 return 0; 7187 } 7188 7189 int 7190 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 7191 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 7192 spdk_bdev_io_completion_cb cb, void *cb_arg) 7193 { 7194 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7195 struct spdk_bdev_io *bdev_io; 7196 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7197 7198 if (!desc->write) { 7199 /* 7200 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 7201 * to easily determine if the command is a read or write, but for now just 7202 * do not allow io_passthru with a read-only descriptor. 7203 */ 7204 return -EBADF; 7205 } 7206 7207 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 7208 return -ENOTSUP; 7209 } 7210 7211 bdev_io = bdev_channel_get_io(channel); 7212 if (!bdev_io) { 7213 return -ENOMEM; 7214 } 7215 7216 bdev_io->internal.ch = channel; 7217 bdev_io->internal.desc = desc; 7218 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 7219 bdev_io->u.nvme_passthru.cmd = *cmd; 7220 bdev_io->u.nvme_passthru.buf = buf; 7221 bdev_io->u.nvme_passthru.nbytes = nbytes; 7222 bdev_io->u.nvme_passthru.md_buf = md_buf; 7223 bdev_io->u.nvme_passthru.md_len = md_len; 7224 7225 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7226 7227 bdev_io_submit(bdev_io); 7228 return 0; 7229 } 7230 7231 int 7232 spdk_bdev_nvme_iov_passthru_md(struct spdk_bdev_desc *desc, 7233 struct spdk_io_channel *ch, 7234 const struct spdk_nvme_cmd *cmd, 7235 struct iovec *iov, int iovcnt, size_t nbytes, 7236 void *md_buf, size_t md_len, 7237 spdk_bdev_io_completion_cb cb, void *cb_arg) 7238 { 7239 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7240 struct spdk_bdev_io *bdev_io; 7241 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7242 7243 if (!desc->write) { 7244 /* 7245 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 7246 * to easily determine if the command is a read or write, but for now just 7247 * do not allow io_passthru with a read-only descriptor. 
7248 */ 7249 return -EBADF; 7250 } 7251 7252 if (md_buf && spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 7253 return -ENOTSUP; 7254 } else if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 7255 return -ENOTSUP; 7256 } 7257 7258 bdev_io = bdev_channel_get_io(channel); 7259 if (!bdev_io) { 7260 return -ENOMEM; 7261 } 7262 7263 bdev_io->internal.ch = channel; 7264 bdev_io->internal.desc = desc; 7265 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IOV_MD; 7266 bdev_io->u.nvme_passthru.cmd = *cmd; 7267 bdev_io->u.nvme_passthru.iovs = iov; 7268 bdev_io->u.nvme_passthru.iovcnt = iovcnt; 7269 bdev_io->u.nvme_passthru.nbytes = nbytes; 7270 bdev_io->u.nvme_passthru.md_buf = md_buf; 7271 bdev_io->u.nvme_passthru.md_len = md_len; 7272 7273 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7274 7275 bdev_io_submit(bdev_io); 7276 return 0; 7277 } 7278 7279 static void bdev_abort_retry(void *ctx); 7280 static void bdev_abort(struct spdk_bdev_io *parent_io); 7281 7282 static void 7283 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 7284 { 7285 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 7286 struct spdk_bdev_io *parent_io = cb_arg; 7287 struct spdk_bdev_io *bio_to_abort, *tmp_io; 7288 7289 bio_to_abort = bdev_io->u.abort.bio_to_abort; 7290 7291 spdk_bdev_free_io(bdev_io); 7292 7293 if (!success) { 7294 /* Check if the target I/O completed in the meantime. */ 7295 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 7296 if (tmp_io == bio_to_abort) { 7297 break; 7298 } 7299 } 7300 7301 /* If the target I/O still exists, set the parent to failed. */ 7302 if (tmp_io != NULL) { 7303 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7304 } 7305 } 7306 7307 assert(parent_io->internal.f.split); 7308 7309 parent_io->internal.split.outstanding--; 7310 if (parent_io->internal.split.outstanding == 0) { 7311 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7312 bdev_abort_retry(parent_io); 7313 } else { 7314 bdev_io_complete(parent_io); 7315 } 7316 } 7317 } 7318 7319 static int 7320 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 7321 struct spdk_bdev_io *bio_to_abort, 7322 spdk_bdev_io_completion_cb cb, void *cb_arg) 7323 { 7324 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7325 struct spdk_bdev_io *bdev_io; 7326 7327 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 7328 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 7329 /* TODO: Abort reset or abort request. */ 7330 return -ENOTSUP; 7331 } 7332 7333 bdev_io = bdev_channel_get_io(channel); 7334 if (bdev_io == NULL) { 7335 return -ENOMEM; 7336 } 7337 7338 bdev_io->internal.ch = channel; 7339 bdev_io->internal.desc = desc; 7340 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7341 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7342 7343 if (bio_to_abort->internal.f.split) { 7344 assert(bdev_io_should_split(bio_to_abort)); 7345 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 7346 7347 /* Parent abort request is not submitted directly, but to manage its 7348 * execution add it to the submitted list here. 7349 */ 7350 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7351 bdev_ch_add_to_io_submitted(bdev_io); 7352 7353 bdev_abort(bdev_io); 7354 7355 return 0; 7356 } 7357 7358 bdev_io->u.abort.bio_to_abort = bio_to_abort; 7359 7360 /* Submit the abort request to the underlying bdev module. 
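 * Only non-split target I/Os reach this point; split parents were re-driven
 * through bdev_abort() above.  The module sees a single
 * SPDK_BDEV_IO_TYPE_ABORT I/O whose u.abort.bio_to_abort identifies the
 * request it should try to cancel.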
*/ 7361 bdev_io_submit(bdev_io); 7362 7363 return 0; 7364 } 7365 7366 static bool 7367 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq) 7368 { 7369 struct spdk_bdev_io *iter; 7370 7371 TAILQ_FOREACH(iter, tailq, internal.link) { 7372 if (iter == bdev_io) { 7373 return true; 7374 } 7375 } 7376 7377 return false; 7378 } 7379 7380 static uint32_t 7381 _bdev_abort(struct spdk_bdev_io *parent_io) 7382 { 7383 struct spdk_bdev_desc *desc = parent_io->internal.desc; 7384 struct spdk_bdev_channel *channel = parent_io->internal.ch; 7385 void *bio_cb_arg; 7386 struct spdk_bdev_io *bio_to_abort; 7387 uint32_t matched_ios; 7388 int rc; 7389 7390 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 7391 7392 /* matched_ios is returned and will be kept by the caller. 7393 * 7394 * This function will be used for two cases, 1) the same cb_arg is used for 7395 * multiple I/Os, 2) a single large I/O is split into smaller ones. 7396 * Incrementing split_outstanding directly here may confuse readers especially 7397 * for the 1st case. 7398 * 7399 * Completion of I/O abort is processed after stack unwinding. Hence this trick 7400 * works as expected. 7401 */ 7402 matched_ios = 0; 7403 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 7404 7405 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 7406 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 7407 continue; 7408 } 7409 7410 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 7411 /* Any I/O which was submitted after this abort command should be excluded. */ 7412 continue; 7413 } 7414 7415 /* We can't abort a request that's being pushed/pulled or executed by accel */ 7416 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) || 7417 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) { 7418 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7419 break; 7420 } 7421 7422 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 7423 if (rc != 0) { 7424 if (rc == -ENOMEM) { 7425 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 7426 } else { 7427 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7428 } 7429 break; 7430 } 7431 matched_ios++; 7432 } 7433 7434 return matched_ios; 7435 } 7436 7437 static void 7438 bdev_abort_retry(void *ctx) 7439 { 7440 struct spdk_bdev_io *parent_io = ctx; 7441 uint32_t matched_ios; 7442 7443 matched_ios = _bdev_abort(parent_io); 7444 7445 if (matched_ios == 0) { 7446 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7447 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 7448 } else { 7449 /* For retry, the case that no target I/O was found is success 7450 * because it means target I/Os completed in the meantime. 7451 */ 7452 bdev_io_complete(parent_io); 7453 } 7454 return; 7455 } 7456 7457 /* Use split_outstanding to manage the progress of aborting I/Os. */ 7458 parent_io->internal.f.split = true; 7459 parent_io->internal.split.outstanding = matched_ios; 7460 } 7461 7462 static void 7463 bdev_abort(struct spdk_bdev_io *parent_io) 7464 { 7465 uint32_t matched_ios; 7466 7467 matched_ios = _bdev_abort(parent_io); 7468 7469 if (matched_ios == 0) { 7470 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7471 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 7472 } else { 7473 /* The case the no target I/O was found is failure. 
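 * Unlike bdev_abort_retry() above, finding no I/O that matches bio_cb_arg on
 * the first attempt means there was nothing to abort, so the abort request
 * itself completes with SPDK_BDEV_IO_STATUS_FAILED.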
*/ 7474 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7475 bdev_io_complete(parent_io); 7476 } 7477 return; 7478 } 7479 7480 /* Use split_outstanding to manage the progress of aborting I/Os. */ 7481 parent_io->internal.f.split = true; 7482 parent_io->internal.split.outstanding = matched_ios; 7483 } 7484 7485 int 7486 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 7487 void *bio_cb_arg, 7488 spdk_bdev_io_completion_cb cb, void *cb_arg) 7489 { 7490 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7491 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7492 struct spdk_bdev_io *bdev_io; 7493 7494 if (bio_cb_arg == NULL) { 7495 return -EINVAL; 7496 } 7497 7498 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 7499 return -ENOTSUP; 7500 } 7501 7502 bdev_io = bdev_channel_get_io(channel); 7503 if (bdev_io == NULL) { 7504 return -ENOMEM; 7505 } 7506 7507 bdev_io->internal.ch = channel; 7508 bdev_io->internal.desc = desc; 7509 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7510 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7511 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7512 7513 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 7514 7515 /* Parent abort request is not submitted directly, but to manage its execution, 7516 * add it to the submitted list here. 7517 */ 7518 bdev_ch_add_to_io_submitted(bdev_io); 7519 7520 bdev_abort(bdev_io); 7521 7522 return 0; 7523 } 7524 7525 int 7526 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 7527 struct spdk_bdev_io_wait_entry *entry) 7528 { 7529 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7530 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 7531 7532 if (bdev != entry->bdev) { 7533 SPDK_ERRLOG("bdevs do not match\n"); 7534 return -EINVAL; 7535 } 7536 7537 if (mgmt_ch->per_thread_cache_count > 0) { 7538 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 7539 return -EINVAL; 7540 } 7541 7542 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 7543 return 0; 7544 } 7545 7546 static inline void 7547 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 7548 { 7549 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 7550 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 7551 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 7552 uint32_t blocklen = bdev_io->bdev->blocklen; 7553 7554 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7555 switch (bdev_io->type) { 7556 case SPDK_BDEV_IO_TYPE_READ: 7557 io_stat->bytes_read += num_blocks * blocklen; 7558 io_stat->num_read_ops++; 7559 io_stat->read_latency_ticks += tsc_diff; 7560 if (io_stat->max_read_latency_ticks < tsc_diff) { 7561 io_stat->max_read_latency_ticks = tsc_diff; 7562 } 7563 if (io_stat->min_read_latency_ticks > tsc_diff) { 7564 io_stat->min_read_latency_ticks = tsc_diff; 7565 } 7566 break; 7567 case SPDK_BDEV_IO_TYPE_WRITE: 7568 io_stat->bytes_written += num_blocks * blocklen; 7569 io_stat->num_write_ops++; 7570 io_stat->write_latency_ticks += tsc_diff; 7571 if (io_stat->max_write_latency_ticks < tsc_diff) { 7572 io_stat->max_write_latency_ticks = tsc_diff; 7573 } 7574 if (io_stat->min_write_latency_ticks > tsc_diff) { 7575 io_stat->min_write_latency_ticks = tsc_diff; 7576 } 7577 break; 7578 case SPDK_BDEV_IO_TYPE_UNMAP: 7579 io_stat->bytes_unmapped += num_blocks * blocklen; 7580 io_stat->num_unmap_ops++; 7581 io_stat->unmap_latency_ticks += tsc_diff; 7582 if 
(io_stat->max_unmap_latency_ticks < tsc_diff) { 7583 io_stat->max_unmap_latency_ticks = tsc_diff; 7584 } 7585 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 7586 io_stat->min_unmap_latency_ticks = tsc_diff; 7587 } 7588 break; 7589 case SPDK_BDEV_IO_TYPE_ZCOPY: 7590 /* Track the data in the start phase only */ 7591 if (bdev_io->u.bdev.zcopy.start) { 7592 if (bdev_io->u.bdev.zcopy.populate) { 7593 io_stat->bytes_read += num_blocks * blocklen; 7594 io_stat->num_read_ops++; 7595 io_stat->read_latency_ticks += tsc_diff; 7596 if (io_stat->max_read_latency_ticks < tsc_diff) { 7597 io_stat->max_read_latency_ticks = tsc_diff; 7598 } 7599 if (io_stat->min_read_latency_ticks > tsc_diff) { 7600 io_stat->min_read_latency_ticks = tsc_diff; 7601 } 7602 } else { 7603 io_stat->bytes_written += num_blocks * blocklen; 7604 io_stat->num_write_ops++; 7605 io_stat->write_latency_ticks += tsc_diff; 7606 if (io_stat->max_write_latency_ticks < tsc_diff) { 7607 io_stat->max_write_latency_ticks = tsc_diff; 7608 } 7609 if (io_stat->min_write_latency_ticks > tsc_diff) { 7610 io_stat->min_write_latency_ticks = tsc_diff; 7611 } 7612 } 7613 } 7614 break; 7615 case SPDK_BDEV_IO_TYPE_COPY: 7616 io_stat->bytes_copied += num_blocks * blocklen; 7617 io_stat->num_copy_ops++; 7618 bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; 7619 if (io_stat->max_copy_latency_ticks < tsc_diff) { 7620 io_stat->max_copy_latency_ticks = tsc_diff; 7621 } 7622 if (io_stat->min_copy_latency_ticks > tsc_diff) { 7623 io_stat->min_copy_latency_ticks = tsc_diff; 7624 } 7625 break; 7626 default: 7627 break; 7628 } 7629 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 7630 io_stat = bdev_io->bdev->internal.stat; 7631 assert(io_stat->io_error != NULL); 7632 7633 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 7634 io_stat->io_error->error_status[-io_status - 1]++; 7635 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 7636 } 7637 7638 #ifdef SPDK_CONFIG_VTUNE 7639 uint64_t now_tsc = spdk_get_ticks(); 7640 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 7641 uint64_t data[5]; 7642 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 7643 7644 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 7645 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 7646 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 7647 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 7648 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 
7649 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 7650 7651 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 7652 __itt_metadata_u64, 5, data); 7653 7654 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 7655 bdev_io->internal.ch->start_tsc = now_tsc; 7656 } 7657 #endif 7658 } 7659 7660 static inline void 7661 _bdev_io_complete(void *ctx) 7662 { 7663 struct spdk_bdev_io *bdev_io = ctx; 7664 7665 if (spdk_unlikely(bdev_io_use_accel_sequence(bdev_io))) { 7666 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7667 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 7668 } 7669 7670 assert(bdev_io->internal.cb != NULL); 7671 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 7672 7673 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 7674 bdev_io->internal.caller_ctx); 7675 } 7676 7677 static inline void 7678 bdev_io_complete(void *ctx) 7679 { 7680 struct spdk_bdev_io *bdev_io = ctx; 7681 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7682 uint64_t tsc, tsc_diff; 7683 7684 if (spdk_unlikely(bdev_io->internal.f.in_submit_request)) { 7685 /* 7686 * Defer completion to avoid potential infinite recursion if the 7687 * user's completion callback issues a new I/O. 7688 */ 7689 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7690 bdev_io_complete, bdev_io); 7691 return; 7692 } 7693 7694 tsc = spdk_get_ticks(); 7695 tsc_diff = tsc - bdev_io->internal.submit_tsc; 7696 7697 bdev_ch_remove_from_io_submitted(bdev_io); 7698 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, bdev_ch->trace_id, 0, (uintptr_t)bdev_io, 7699 bdev_io->internal.caller_ctx, bdev_ch->queue_depth); 7700 7701 if (bdev_ch->histogram) { 7702 if (bdev_io->bdev->internal.histogram_io_type == 0 || 7703 bdev_io->bdev->internal.histogram_io_type == bdev_io->type) { 7704 /* 7705 * Tally all I/O types if the histogram_io_type is set to 0. 7706 */ 7707 spdk_histogram_data_tally(bdev_ch->histogram, tsc_diff); 7708 } 7709 } 7710 7711 bdev_io_update_io_stat(bdev_io, tsc_diff); 7712 _bdev_io_complete(bdev_io); 7713 } 7714 7715 /* The difference between this function and bdev_io_complete() is that this should be called to 7716 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 7717 * io_submitted list and don't have submit_tsc updated. 7718 */ 7719 static inline void 7720 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 7721 { 7722 /* Since the IO hasn't been submitted it's bound to be failed */ 7723 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7724 7725 /* At this point we don't know if the IO is completed from submission context or not, but, 7726 * since this is an error path, we can always do an spdk_thread_send_msg(). */ 7727 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7728 _bdev_io_complete, bdev_io); 7729 } 7730 7731 static void bdev_destroy_cb(void *io_device); 7732 7733 static inline void 7734 _bdev_reset_complete(void *ctx) 7735 { 7736 struct spdk_bdev_io *bdev_io = ctx; 7737 7738 /* Put the channel reference we got in submission. 
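 * The reference pins the bdev's io_device for the entire lifetime of the
 * reset, which is why bdev_reset_complete() performs its REMOVING check only
 * after this release has run for the in-progress reset.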
*/ 7739 assert(bdev_io->u.reset.ch_ref != NULL); 7740 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7741 bdev_io->u.reset.ch_ref = NULL; 7742 7743 bdev_io_complete(bdev_io); 7744 } 7745 7746 static void 7747 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7748 { 7749 struct spdk_bdev_io *bdev_io = _ctx; 7750 bdev_io_tailq_t queued_resets; 7751 struct spdk_bdev_io *queued_reset; 7752 7753 assert(bdev_io == bdev->internal.reset_in_progress); 7754 7755 TAILQ_INIT(&queued_resets); 7756 7757 spdk_spin_lock(&bdev->internal.spinlock); 7758 TAILQ_SWAP(&bdev->internal.queued_resets, &queued_resets, 7759 spdk_bdev_io, internal.link); 7760 bdev->internal.reset_in_progress = NULL; 7761 spdk_spin_unlock(&bdev->internal.spinlock); 7762 7763 while (!TAILQ_EMPTY(&queued_resets)) { 7764 queued_reset = TAILQ_FIRST(&queued_resets); 7765 TAILQ_REMOVE(&queued_resets, queued_reset, internal.link); 7766 queued_reset->internal.status = bdev_io->internal.status; 7767 spdk_thread_send_msg(spdk_bdev_io_get_thread(queued_reset), 7768 _bdev_reset_complete, queued_reset); 7769 } 7770 7771 _bdev_reset_complete(bdev_io); 7772 7773 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7774 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7775 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7776 } 7777 } 7778 7779 static void 7780 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7781 struct spdk_io_channel *_ch, void *_ctx) 7782 { 7783 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7784 7785 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7786 7787 spdk_bdev_for_each_channel_continue(i, 0); 7788 } 7789 7790 static void 7791 bdev_io_complete_sequence_cb(void *ctx, int status) 7792 { 7793 struct spdk_bdev_io *bdev_io = ctx; 7794 7795 /* u.bdev.accel_sequence should have already been cleared at this point */ 7796 assert(bdev_io->u.bdev.accel_sequence == NULL); 7797 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7798 bdev_io->internal.f.has_accel_sequence = false; 7799 7800 if (spdk_unlikely(status != 0)) { 7801 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7802 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7803 } 7804 7805 bdev_io_complete(bdev_io); 7806 } 7807 7808 void 7809 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7810 { 7811 struct spdk_bdev *bdev = bdev_io->bdev; 7812 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7813 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7814 7815 if (spdk_unlikely(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING)) { 7816 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7817 spdk_bdev_get_module_name(bdev), 7818 bdev_io_status_get_string(bdev_io->internal.status)); 7819 assert(false); 7820 } 7821 bdev_io->internal.status = status; 7822 7823 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7824 assert(bdev_io == bdev->internal.reset_in_progress); 7825 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 7826 bdev_reset_complete); 7827 return; 7828 } else { 7829 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7830 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7831 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7832 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7833 return; 7834 } else if (spdk_unlikely(bdev_io->internal.f.has_bounce_buf && 7835 
!bdev_io_use_accel_sequence(bdev_io))) { 7836 _bdev_io_push_bounce_data_buffer(bdev_io, 7837 _bdev_io_complete_push_bounce_done); 7838 /* bdev IO will be completed in the callback */ 7839 return; 7840 } 7841 } 7842 7843 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7844 return; 7845 } 7846 } 7847 7848 bdev_io_complete(bdev_io); 7849 } 7850 7851 void 7852 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 7853 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7854 { 7855 enum spdk_bdev_io_status status; 7856 7857 if (sc == SPDK_SCSI_STATUS_GOOD) { 7858 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7859 } else { 7860 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7861 bdev_io->internal.error.scsi.sc = sc; 7862 bdev_io->internal.error.scsi.sk = sk; 7863 bdev_io->internal.error.scsi.asc = asc; 7864 bdev_io->internal.error.scsi.ascq = ascq; 7865 } 7866 7867 spdk_bdev_io_complete(bdev_io, status); 7868 } 7869 7870 void 7871 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7872 int *sc, int *sk, int *asc, int *ascq) 7873 { 7874 assert(sc != NULL); 7875 assert(sk != NULL); 7876 assert(asc != NULL); 7877 assert(ascq != NULL); 7878 7879 switch (bdev_io->internal.status) { 7880 case SPDK_BDEV_IO_STATUS_SUCCESS: 7881 *sc = SPDK_SCSI_STATUS_GOOD; 7882 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7883 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7884 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7885 break; 7886 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7887 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7888 break; 7889 case SPDK_BDEV_IO_STATUS_MISCOMPARE: 7890 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7891 *sk = SPDK_SCSI_SENSE_MISCOMPARE; 7892 *asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION; 7893 *ascq = bdev_io->internal.error.scsi.ascq; 7894 break; 7895 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7896 *sc = bdev_io->internal.error.scsi.sc; 7897 *sk = bdev_io->internal.error.scsi.sk; 7898 *asc = bdev_io->internal.error.scsi.asc; 7899 *ascq = bdev_io->internal.error.scsi.ascq; 7900 break; 7901 default: 7902 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7903 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7904 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7905 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7906 break; 7907 } 7908 } 7909 7910 void 7911 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7912 { 7913 enum spdk_bdev_io_status status; 7914 7915 if (aio_result == 0) { 7916 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7917 } else { 7918 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7919 } 7920 7921 bdev_io->internal.error.aio_result = aio_result; 7922 7923 spdk_bdev_io_complete(bdev_io, status); 7924 } 7925 7926 void 7927 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7928 { 7929 assert(aio_result != NULL); 7930 7931 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7932 *aio_result = bdev_io->internal.error.aio_result; 7933 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7934 *aio_result = 0; 7935 } else { 7936 *aio_result = -EIO; 7937 } 7938 } 7939 7940 void 7941 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7942 { 7943 enum spdk_bdev_io_status status; 7944 7945 if (spdk_likely(sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS)) { 7946 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7947 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7948 status = SPDK_BDEV_IO_STATUS_ABORTED; 7949 
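	/* A generic ABORTED_BY_REQUEST completion is surfaced as the bdev-level
	 * ABORTED status; any other non-success pair becomes NVME_ERROR below. */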
} else { 7950 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7951 } 7952 7953 bdev_io->internal.error.nvme.cdw0 = cdw0; 7954 bdev_io->internal.error.nvme.sct = sct; 7955 bdev_io->internal.error.nvme.sc = sc; 7956 7957 spdk_bdev_io_complete(bdev_io, status); 7958 } 7959 7960 void 7961 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7962 { 7963 assert(sct != NULL); 7964 assert(sc != NULL); 7965 assert(cdw0 != NULL); 7966 7967 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7968 *sct = SPDK_NVME_SCT_GENERIC; 7969 *sc = SPDK_NVME_SC_SUCCESS; 7970 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7971 *cdw0 = 0; 7972 } else { 7973 *cdw0 = 1U; 7974 } 7975 return; 7976 } 7977 7978 if (spdk_likely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7979 *sct = SPDK_NVME_SCT_GENERIC; 7980 *sc = SPDK_NVME_SC_SUCCESS; 7981 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7982 *sct = bdev_io->internal.error.nvme.sct; 7983 *sc = bdev_io->internal.error.nvme.sc; 7984 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7985 *sct = SPDK_NVME_SCT_GENERIC; 7986 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7987 } else { 7988 *sct = SPDK_NVME_SCT_GENERIC; 7989 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7990 } 7991 7992 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7993 } 7994 7995 void 7996 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7997 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7998 { 7999 assert(first_sct != NULL); 8000 assert(first_sc != NULL); 8001 assert(second_sct != NULL); 8002 assert(second_sc != NULL); 8003 assert(cdw0 != NULL); 8004 8005 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 8006 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 8007 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 8008 *first_sct = bdev_io->internal.error.nvme.sct; 8009 *first_sc = bdev_io->internal.error.nvme.sc; 8010 *second_sct = SPDK_NVME_SCT_GENERIC; 8011 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 8012 } else { 8013 *first_sct = SPDK_NVME_SCT_GENERIC; 8014 *first_sc = SPDK_NVME_SC_SUCCESS; 8015 *second_sct = bdev_io->internal.error.nvme.sct; 8016 *second_sc = bdev_io->internal.error.nvme.sc; 8017 } 8018 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 8019 *first_sct = SPDK_NVME_SCT_GENERIC; 8020 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 8021 *second_sct = SPDK_NVME_SCT_GENERIC; 8022 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 8023 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 8024 *first_sct = SPDK_NVME_SCT_GENERIC; 8025 *first_sc = SPDK_NVME_SC_SUCCESS; 8026 *second_sct = SPDK_NVME_SCT_GENERIC; 8027 *second_sc = SPDK_NVME_SC_SUCCESS; 8028 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 8029 *first_sct = SPDK_NVME_SCT_GENERIC; 8030 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 8031 *second_sct = SPDK_NVME_SCT_GENERIC; 8032 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 8033 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 8034 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 8035 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 8036 *second_sct = SPDK_NVME_SCT_GENERIC; 8037 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 8038 } else { 8039 *first_sct = SPDK_NVME_SCT_GENERIC; 8040 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 8041 *second_sct = SPDK_NVME_SCT_GENERIC; 8042 *second_sc = 
SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 8043 } 8044 8045 *cdw0 = bdev_io->internal.error.nvme.cdw0; 8046 } 8047 8048 void 8049 spdk_bdev_io_complete_base_io_status(struct spdk_bdev_io *bdev_io, 8050 const struct spdk_bdev_io *base_io) 8051 { 8052 switch (base_io->internal.status) { 8053 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 8054 spdk_bdev_io_complete_nvme_status(bdev_io, 8055 base_io->internal.error.nvme.cdw0, 8056 base_io->internal.error.nvme.sct, 8057 base_io->internal.error.nvme.sc); 8058 break; 8059 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 8060 spdk_bdev_io_complete_scsi_status(bdev_io, 8061 base_io->internal.error.scsi.sc, 8062 base_io->internal.error.scsi.sk, 8063 base_io->internal.error.scsi.asc, 8064 base_io->internal.error.scsi.ascq); 8065 break; 8066 case SPDK_BDEV_IO_STATUS_AIO_ERROR: 8067 spdk_bdev_io_complete_aio_status(bdev_io, base_io->internal.error.aio_result); 8068 break; 8069 default: 8070 spdk_bdev_io_complete(bdev_io, base_io->internal.status); 8071 break; 8072 } 8073 } 8074 8075 struct spdk_thread * 8076 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 8077 { 8078 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 8079 } 8080 8081 struct spdk_io_channel * 8082 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 8083 { 8084 return bdev_io->internal.ch->channel; 8085 } 8086 8087 static int 8088 bdev_register(struct spdk_bdev *bdev) 8089 { 8090 char *bdev_name; 8091 char uuid[SPDK_UUID_STRING_LEN]; 8092 struct spdk_iobuf_opts iobuf_opts; 8093 int ret; 8094 8095 assert(bdev->module != NULL); 8096 8097 if (!bdev->name) { 8098 SPDK_ERRLOG("Bdev name is NULL\n"); 8099 return -EINVAL; 8100 } 8101 8102 if (!strlen(bdev->name)) { 8103 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 8104 return -EINVAL; 8105 } 8106 8107 /* Users often register their own I/O devices using the bdev name. In 8108 * order to avoid conflicts, prepend bdev_. */ 8109 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 8110 if (!bdev_name) { 8111 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 8112 return -ENOMEM; 8113 } 8114 8115 bdev->internal.stat = bdev_alloc_io_stat(true); 8116 if (!bdev->internal.stat) { 8117 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 8118 free(bdev_name); 8119 return -ENOMEM; 8120 } 8121 8122 bdev->internal.status = SPDK_BDEV_STATUS_READY; 8123 bdev->internal.measured_queue_depth = UINT64_MAX; 8124 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8125 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 8126 bdev->internal.qd_poller = NULL; 8127 bdev->internal.qos = NULL; 8128 8129 TAILQ_INIT(&bdev->internal.open_descs); 8130 TAILQ_INIT(&bdev->internal.locked_ranges); 8131 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 8132 TAILQ_INIT(&bdev->internal.queued_resets); 8133 TAILQ_INIT(&bdev->aliases); 8134 8135 /* UUID may be specified by the user or defined by bdev itself. 8136 * Otherwise it will be generated here, so this field will never be empty. 
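 *
 * Sketch only (my_bdev and the UUID string are hypothetical): a module that
 * wants a stable identity across restarts assigns the UUID itself before
 * registering, e.g.
 *
 *   spdk_uuid_parse(&my_bdev->uuid, "f1e2d3c4-b5a6-4788-99aa-bbccddeeff00");
 *   rc = spdk_bdev_register(my_bdev);
 *
 * whereas a bdev registered with a zeroed UUID gets a random one below.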
*/ 8137 if (spdk_uuid_is_null(&bdev->uuid)) { 8138 spdk_uuid_generate(&bdev->uuid); 8139 } 8140 8141 /* Add the UUID alias only if it's different than the name */ 8142 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 8143 if (strcmp(bdev->name, uuid) != 0) { 8144 ret = spdk_bdev_alias_add(bdev, uuid); 8145 if (ret != 0) { 8146 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 8147 bdev_free_io_stat(bdev->internal.stat); 8148 free(bdev_name); 8149 return ret; 8150 } 8151 } 8152 8153 spdk_iobuf_get_opts(&iobuf_opts, sizeof(iobuf_opts)); 8154 if (spdk_bdev_get_buf_align(bdev) > 1) { 8155 bdev->max_rw_size = spdk_min(bdev->max_rw_size ? bdev->max_rw_size : UINT32_MAX, 8156 iobuf_opts.large_bufsize / bdev->blocklen); 8157 } 8158 8159 /* If the user didn't specify a write unit size, set it to one. */ 8160 if (bdev->write_unit_size == 0) { 8161 bdev->write_unit_size = 1; 8162 } 8163 8164 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 8165 if (bdev->acwu == 0) { 8166 bdev->acwu = bdev->write_unit_size; 8167 } 8168 8169 if (bdev->phys_blocklen == 0) { 8170 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 8171 } 8172 8173 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 8174 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 8175 } 8176 8177 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 8178 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 8179 } 8180 8181 bdev->internal.reset_in_progress = NULL; 8182 bdev->internal.qd_poll_in_progress = false; 8183 bdev->internal.period = 0; 8184 bdev->internal.new_period = 0; 8185 bdev->internal.trace_id = spdk_trace_register_owner(OWNER_TYPE_BDEV, bdev_name); 8186 8187 /* 8188 * Initialize spinlock before registering IO device because spinlock is used in 8189 * bdev_channel_create 8190 */ 8191 spdk_spin_init(&bdev->internal.spinlock); 8192 8193 spdk_io_device_register(__bdev_to_io_dev(bdev), 8194 bdev_channel_create, bdev_channel_destroy, 8195 sizeof(struct spdk_bdev_channel), 8196 bdev_name); 8197 8198 /* 8199 * Register bdev name only after the bdev object is ready. 8200 * After bdev_name_add returns, it is possible for other threads to start using the bdev, 8201 * create IO channels... 
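 * or even unregister it.  That is why the spinlock, the io_device and the
 * per-bdev statistics are all fully set up before this point, and why the
 * error path below has to unwind each of them again.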
8202 */ 8203 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 8204 if (ret != 0) { 8205 spdk_io_device_unregister(__bdev_to_io_dev(bdev), NULL); 8206 bdev_free_io_stat(bdev->internal.stat); 8207 spdk_spin_destroy(&bdev->internal.spinlock); 8208 free(bdev_name); 8209 return ret; 8210 } 8211 8212 free(bdev_name); 8213 8214 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 8215 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 8216 8217 return 0; 8218 } 8219 8220 static void 8221 bdev_destroy_cb(void *io_device) 8222 { 8223 int rc; 8224 struct spdk_bdev *bdev; 8225 spdk_bdev_unregister_cb cb_fn; 8226 void *cb_arg; 8227 8228 bdev = __bdev_from_io_dev(io_device); 8229 8230 if (bdev->internal.unregister_td != spdk_get_thread()) { 8231 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 8232 return; 8233 } 8234 8235 cb_fn = bdev->internal.unregister_cb; 8236 cb_arg = bdev->internal.unregister_ctx; 8237 8238 spdk_spin_destroy(&bdev->internal.spinlock); 8239 free(bdev->internal.qos); 8240 bdev_free_io_stat(bdev->internal.stat); 8241 spdk_trace_unregister_owner(bdev->internal.trace_id); 8242 8243 rc = bdev->fn_table->destruct(bdev->ctxt); 8244 if (rc < 0) { 8245 SPDK_ERRLOG("destruct failed\n"); 8246 } 8247 if (rc <= 0 && cb_fn != NULL) { 8248 cb_fn(cb_arg, rc); 8249 } 8250 } 8251 8252 void 8253 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 8254 { 8255 if (bdev->internal.unregister_cb != NULL) { 8256 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 8257 } 8258 } 8259 8260 static void 8261 _remove_notify(void *arg) 8262 { 8263 struct spdk_bdev_desc *desc = arg; 8264 8265 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 8266 } 8267 8268 /* returns: 0 - bdev removed and ready to be destructed. 8269 * -EBUSY - bdev can't be destructed yet. */ 8270 static int 8271 bdev_unregister_unsafe(struct spdk_bdev *bdev) 8272 { 8273 struct spdk_bdev_desc *desc, *tmp; 8274 struct spdk_bdev_alias *alias; 8275 int rc = 0; 8276 char uuid[SPDK_UUID_STRING_LEN]; 8277 8278 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 8279 assert(spdk_spin_held(&bdev->internal.spinlock)); 8280 8281 /* Notify each descriptor about hotremoval */ 8282 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 8283 rc = -EBUSY; 8284 /* 8285 * Defer invocation of the event_cb to a separate message that will 8286 * run later on its thread. This ensures this context unwinds and 8287 * we don't recursively unregister this bdev again if the event_cb 8288 * immediately closes its descriptor. 8289 */ 8290 event_notify(desc, _remove_notify); 8291 } 8292 8293 /* If there are no descriptors, proceed removing the bdev */ 8294 if (rc == 0) { 8295 bdev_examine_allowlist_remove(bdev->name); 8296 TAILQ_FOREACH(alias, &bdev->aliases, tailq) { 8297 bdev_examine_allowlist_remove(alias->alias.name); 8298 } 8299 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 8300 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 8301 8302 /* Delete the name and the UUID alias */ 8303 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 8304 bdev_name_del_unsafe(&bdev->internal.bdev_name); 8305 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 8306 8307 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 8308 8309 if (bdev->internal.reset_in_progress != NULL) { 8310 /* If reset is in progress, let the completion callback for reset 8311 * unregister the bdev. 
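 * bdev_reset_complete() re-checks SPDK_BDEV_STATUS_REMOVING once the reset
 * finishes and calls spdk_io_device_unregister() from there.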
8312 */ 8313 rc = -EBUSY; 8314 } 8315 } 8316 8317 return rc; 8318 } 8319 8320 static void 8321 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8322 struct spdk_io_channel *io_ch, void *_ctx) 8323 { 8324 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 8325 8326 bdev_channel_abort_queued_ios(bdev_ch); 8327 spdk_bdev_for_each_channel_continue(i, 0); 8328 } 8329 8330 static void 8331 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 8332 { 8333 int rc; 8334 8335 spdk_spin_lock(&g_bdev_mgr.spinlock); 8336 spdk_spin_lock(&bdev->internal.spinlock); 8337 /* 8338 * Set the status to REMOVING after completing to abort channels. Otherwise, 8339 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 8340 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 8341 * may fail. 8342 */ 8343 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 8344 rc = bdev_unregister_unsafe(bdev); 8345 spdk_spin_unlock(&bdev->internal.spinlock); 8346 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8347 8348 if (rc == 0) { 8349 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8350 } 8351 } 8352 8353 void 8354 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 8355 { 8356 struct spdk_thread *thread; 8357 8358 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 8359 8360 thread = spdk_get_thread(); 8361 if (!thread) { 8362 /* The user called this from a non-SPDK thread. */ 8363 if (cb_fn != NULL) { 8364 cb_fn(cb_arg, -ENOTSUP); 8365 } 8366 return; 8367 } 8368 8369 spdk_spin_lock(&g_bdev_mgr.spinlock); 8370 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 8371 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8372 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8373 if (cb_fn) { 8374 cb_fn(cb_arg, -EBUSY); 8375 } 8376 return; 8377 } 8378 8379 spdk_spin_lock(&bdev->internal.spinlock); 8380 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 8381 bdev->internal.unregister_cb = cb_fn; 8382 bdev->internal.unregister_ctx = cb_arg; 8383 bdev->internal.unregister_td = thread; 8384 spdk_spin_unlock(&bdev->internal.spinlock); 8385 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8386 8387 spdk_bdev_set_qd_sampling_period(bdev, 0); 8388 8389 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 8390 bdev_unregister); 8391 } 8392 8393 int 8394 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 8395 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 8396 { 8397 struct spdk_bdev_desc *desc; 8398 struct spdk_bdev *bdev; 8399 int rc; 8400 8401 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 8402 if (rc != 0) { 8403 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 8404 return rc; 8405 } 8406 8407 bdev = spdk_bdev_desc_get_bdev(desc); 8408 8409 if (bdev->module != module) { 8410 spdk_bdev_close(desc); 8411 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 8412 bdev_name); 8413 return -ENODEV; 8414 } 8415 8416 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 8417 8418 spdk_bdev_close(desc); 8419 8420 return 0; 8421 } 8422 8423 static int 8424 bdev_start_qos(struct spdk_bdev *bdev) 8425 { 8426 struct set_qos_limit_ctx *ctx; 8427 8428 /* Enable QoS */ 8429 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 8430 ctx = calloc(1, sizeof(*ctx)); 8431 if (ctx == NULL) { 8432 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 8433 return -ENOMEM; 8434 } 8435 
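		/* bdev->internal.qos exists but no thread owns its poller yet, so
		 * enable QoS on every existing channel; bdev_enable_qos_done()
		 * completes the setup. */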
ctx->bdev = bdev; 8436 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 8437 } 8438 8439 return 0; 8440 } 8441 8442 static void 8443 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 8444 struct spdk_bdev *bdev) 8445 { 8446 enum spdk_bdev_claim_type type; 8447 const char *typename, *modname; 8448 extern struct spdk_log_flag SPDK_LOG_bdev; 8449 8450 assert(spdk_spin_held(&bdev->internal.spinlock)); 8451 8452 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 8453 return; 8454 } 8455 8456 type = bdev->internal.claim_type; 8457 typename = spdk_bdev_claim_get_name(type); 8458 8459 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 8460 modname = bdev->internal.claim.v1.module->name; 8461 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 8462 bdev->name, detail, typename, modname); 8463 return; 8464 } 8465 8466 if (claim_type_is_v2(type)) { 8467 struct spdk_bdev_module_claim *claim; 8468 8469 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 8470 modname = claim->module->name; 8471 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 8472 bdev->name, detail, typename, modname); 8473 } 8474 return; 8475 } 8476 8477 assert(false); 8478 } 8479 8480 static int 8481 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 8482 { 8483 struct spdk_thread *thread; 8484 int rc = 0; 8485 8486 thread = spdk_get_thread(); 8487 if (!thread) { 8488 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 8489 return -ENOTSUP; 8490 } 8491 8492 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8493 spdk_get_thread()); 8494 8495 desc->bdev = bdev; 8496 desc->thread = thread; 8497 desc->write = write; 8498 8499 spdk_spin_lock(&bdev->internal.spinlock); 8500 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 8501 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8502 spdk_spin_unlock(&bdev->internal.spinlock); 8503 return -ENODEV; 8504 } 8505 8506 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8507 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8508 spdk_spin_unlock(&bdev->internal.spinlock); 8509 return -EPERM; 8510 } 8511 8512 rc = bdev_start_qos(bdev); 8513 if (rc != 0) { 8514 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 8515 spdk_spin_unlock(&bdev->internal.spinlock); 8516 return rc; 8517 } 8518 8519 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 8520 8521 spdk_spin_unlock(&bdev->internal.spinlock); 8522 8523 return 0; 8524 } 8525 8526 static void 8527 bdev_open_opts_get_defaults(struct spdk_bdev_open_opts *opts, size_t opts_size) 8528 { 8529 if (!opts) { 8530 SPDK_ERRLOG("opts should not be NULL.\n"); 8531 return; 8532 } 8533 8534 if (!opts_size) { 8535 SPDK_ERRLOG("opts_size should not be zero.\n"); 8536 return; 8537 } 8538 8539 memset(opts, 0, opts_size); 8540 opts->size = opts_size; 8541 8542 #define FIELD_OK(field) \ 8543 offsetof(struct spdk_bdev_open_opts, field) + sizeof(opts->field) <= opts_size 8544 8545 #define SET_FIELD(field, value) \ 8546 if (FIELD_OK(field)) { \ 8547 opts->field = value; \ 8548 } \ 8549 8550 SET_FIELD(hide_metadata, false); 8551 8552 #undef FIELD_OK 8553 #undef SET_FIELD 8554 } 8555 8556 static void 8557 bdev_open_opts_copy(struct spdk_bdev_open_opts *opts, 8558 const struct spdk_bdev_open_opts *opts_src, size_t opts_size) 8559 { 8560 assert(opts); 8561 assert(opts_src); 8562 8563 #define SET_FIELD(field) \ 8564 if 
(offsetof(struct spdk_bdev_open_opts, field) + sizeof(opts->field) <= opts_size) { \ 8565 opts->field = opts_src->field; \ 8566 } \ 8567 8568 SET_FIELD(hide_metadata); 8569 8570 opts->size = opts_src->size; 8571 8572 /* We should not remove this statement, but need to update the assert statement 8573 * if we add a new field, and also add a corresponding SET_FIELD statement. 8574 */ 8575 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_opts) == 16, "Incorrect size"); 8576 8577 #undef SET_FIELD 8578 } 8579 8580 void 8581 spdk_bdev_open_opts_init(struct spdk_bdev_open_opts *opts, size_t opts_size) 8582 { 8583 struct spdk_bdev_open_opts opts_local; 8584 8585 bdev_open_opts_get_defaults(&opts_local, sizeof(opts_local)); 8586 bdev_open_opts_copy(opts, &opts_local, opts_size); 8587 } 8588 8589 static int 8590 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 8591 struct spdk_bdev_open_opts *user_opts, struct spdk_bdev_desc **_desc) 8592 { 8593 struct spdk_bdev_desc *desc; 8594 struct spdk_bdev_open_opts opts; 8595 unsigned int i; 8596 8597 bdev_open_opts_get_defaults(&opts, sizeof(opts)); 8598 if (user_opts != NULL) { 8599 bdev_open_opts_copy(&opts, user_opts, user_opts->size); 8600 } 8601 8602 desc = calloc(1, sizeof(*desc)); 8603 if (desc == NULL) { 8604 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 8605 return -ENOMEM; 8606 } 8607 8608 desc->opts = opts; 8609 8610 TAILQ_INIT(&desc->pending_media_events); 8611 TAILQ_INIT(&desc->free_media_events); 8612 8613 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 8614 desc->callback.event_fn = event_cb; 8615 desc->callback.ctx = event_ctx; 8616 spdk_spin_init(&desc->spinlock); 8617 8618 if (desc->opts.hide_metadata) { 8619 if (spdk_bdev_is_md_separate(bdev)) { 8620 SPDK_ERRLOG("hide_metadata option is not supported with separate metadata.\n"); 8621 bdev_desc_free(desc); 8622 return -EINVAL; 8623 } 8624 } 8625 8626 if (bdev->media_events) { 8627 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 8628 sizeof(*desc->media_events_buffer)); 8629 if (desc->media_events_buffer == NULL) { 8630 SPDK_ERRLOG("Failed to initialize media event pool\n"); 8631 bdev_desc_free(desc); 8632 return -ENOMEM; 8633 } 8634 8635 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 8636 TAILQ_INSERT_TAIL(&desc->free_media_events, 8637 &desc->media_events_buffer[i], tailq); 8638 } 8639 } 8640 8641 if (bdev->fn_table->accel_sequence_supported != NULL) { 8642 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 8643 desc->accel_sequence_supported[i] = 8644 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 8645 (enum spdk_bdev_io_type)i); 8646 } 8647 } 8648 8649 *_desc = desc; 8650 8651 return 0; 8652 } 8653 8654 static int 8655 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8656 void *event_ctx, struct spdk_bdev_open_opts *opts, 8657 struct spdk_bdev_desc **_desc) 8658 { 8659 struct spdk_bdev_desc *desc; 8660 struct spdk_bdev *bdev; 8661 int rc; 8662 8663 bdev = bdev_get_by_name(bdev_name); 8664 8665 if (bdev == NULL) { 8666 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 8667 return -ENODEV; 8668 } 8669 8670 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, opts, &desc); 8671 if (rc != 0) { 8672 return rc; 8673 } 8674 8675 rc = bdev_open(bdev, write, desc); 8676 if (rc != 0) { 8677 bdev_desc_free(desc); 8678 desc = NULL; 8679 } 8680 8681 *_desc = desc; 8682 8683 return rc; 8684 } 8685 8686 int 8687 spdk_bdev_open_ext_v2(const char 
*bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8688 void *event_ctx, struct spdk_bdev_open_opts *opts, 8689 struct spdk_bdev_desc **_desc) 8690 { 8691 int rc; 8692 8693 if (event_cb == NULL) { 8694 SPDK_ERRLOG("Missing event callback function\n"); 8695 return -EINVAL; 8696 } 8697 8698 spdk_spin_lock(&g_bdev_mgr.spinlock); 8699 rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, opts, _desc); 8700 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8701 8702 return rc; 8703 } 8704 8705 int 8706 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8707 void *event_ctx, struct spdk_bdev_desc **_desc) 8708 { 8709 return spdk_bdev_open_ext_v2(bdev_name, write, event_cb, event_ctx, NULL, _desc); 8710 } 8711 8712 struct spdk_bdev_open_async_ctx { 8713 char *bdev_name; 8714 spdk_bdev_event_cb_t event_cb; 8715 void *event_ctx; 8716 bool write; 8717 int rc; 8718 spdk_bdev_open_async_cb_t cb_fn; 8719 void *cb_arg; 8720 struct spdk_bdev_desc *desc; 8721 struct spdk_bdev_open_async_opts opts; 8722 uint64_t start_ticks; 8723 struct spdk_thread *orig_thread; 8724 struct spdk_poller *poller; 8725 TAILQ_ENTRY(spdk_bdev_open_async_ctx) tailq; 8726 }; 8727 8728 static void 8729 bdev_open_async_done(void *arg) 8730 { 8731 struct spdk_bdev_open_async_ctx *ctx = arg; 8732 8733 ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg); 8734 8735 free(ctx->bdev_name); 8736 free(ctx); 8737 } 8738 8739 static void 8740 bdev_open_async_cancel(void *arg) 8741 { 8742 struct spdk_bdev_open_async_ctx *ctx = arg; 8743 8744 assert(ctx->rc == -ESHUTDOWN); 8745 8746 spdk_poller_unregister(&ctx->poller); 8747 8748 bdev_open_async_done(ctx); 8749 } 8750 8751 /* This is called when the bdev library finishes at shutdown. */ 8752 static void 8753 bdev_open_async_fini(void) 8754 { 8755 struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx; 8756 8757 spdk_spin_lock(&g_bdev_mgr.spinlock); 8758 TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) { 8759 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8760 /* 8761 * We have to move to ctx->orig_thread to unregister ctx->poller. 8762 * However, there is a chance that ctx->poller is executed before 8763 * message is executed, which could result in bdev_open_async_done() 8764 * being called twice. To avoid such race condition, set ctx->rc to 8765 * -ESHUTDOWN. 8766 */ 8767 ctx->rc = -ESHUTDOWN; 8768 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx); 8769 } 8770 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8771 } 8772 8773 static int bdev_open_async(void *arg); 8774 8775 static void 8776 _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx) 8777 { 8778 uint64_t timeout_ticks; 8779 8780 if (ctx->rc == -ESHUTDOWN) { 8781 /* This context is being canceled. Do nothing. */ 8782 return; 8783 } 8784 8785 ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx, 8786 NULL, &ctx->desc); 8787 if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) { 8788 goto exit; 8789 } 8790 8791 timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull; 8792 if (spdk_get_ticks() >= timeout_ticks) { 8793 SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name); 8794 ctx->rc = -ETIMEDOUT; 8795 goto exit; 8796 } 8797 8798 return; 8799 8800 exit: 8801 spdk_poller_unregister(&ctx->poller); 8802 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8803 8804 /* Completion callback is processed after stack unwinding. 
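 * Deferring through spdk_thread_send_msg() also guarantees the user's open
 * callback never runs with g_bdev_mgr.spinlock held, whether the open
 * succeeded immediately or only after the poller retried.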
*/ 8805 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx); 8806 } 8807 8808 static int 8809 bdev_open_async(void *arg) 8810 { 8811 struct spdk_bdev_open_async_ctx *ctx = arg; 8812 8813 spdk_spin_lock(&g_bdev_mgr.spinlock); 8814 8815 _bdev_open_async(ctx); 8816 8817 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8818 8819 return SPDK_POLLER_BUSY; 8820 } 8821 8822 static void 8823 bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts, 8824 struct spdk_bdev_open_async_opts *opts_src, 8825 size_t size) 8826 { 8827 assert(opts); 8828 assert(opts_src); 8829 8830 opts->size = size; 8831 8832 #define SET_FIELD(field) \ 8833 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8834 opts->field = opts_src->field; \ 8835 } \ 8836 8837 SET_FIELD(timeout_ms); 8838 8839 /* Do not remove this statement, you should always update this statement when you adding a new field, 8840 * and do not forget to add the SET_FIELD statement for your added field. */ 8841 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size"); 8842 8843 #undef SET_FIELD 8844 } 8845 8846 static void 8847 bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size) 8848 { 8849 assert(opts); 8850 8851 opts->size = size; 8852 8853 #define SET_FIELD(field, value) \ 8854 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8855 opts->field = value; \ 8856 } \ 8857 8858 SET_FIELD(timeout_ms, 0); 8859 8860 #undef SET_FIELD 8861 } 8862 8863 int 8864 spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8865 void *event_ctx, struct spdk_bdev_open_async_opts *opts, 8866 spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg) 8867 { 8868 struct spdk_bdev_open_async_ctx *ctx; 8869 8870 if (event_cb == NULL) { 8871 SPDK_ERRLOG("Missing event callback function\n"); 8872 return -EINVAL; 8873 } 8874 8875 if (open_cb == NULL) { 8876 SPDK_ERRLOG("Missing open callback function\n"); 8877 return -EINVAL; 8878 } 8879 8880 if (opts != NULL && opts->size == 0) { 8881 SPDK_ERRLOG("size in the options structure should not be zero\n"); 8882 return -EINVAL; 8883 } 8884 8885 ctx = calloc(1, sizeof(*ctx)); 8886 if (ctx == NULL) { 8887 SPDK_ERRLOG("Failed to allocate open context\n"); 8888 return -ENOMEM; 8889 } 8890 8891 ctx->bdev_name = strdup(bdev_name); 8892 if (ctx->bdev_name == NULL) { 8893 SPDK_ERRLOG("Failed to duplicate bdev_name\n"); 8894 free(ctx); 8895 return -ENOMEM; 8896 } 8897 8898 ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000); 8899 if (ctx->poller == NULL) { 8900 SPDK_ERRLOG("Failed to register bdev_open_async poller\n"); 8901 free(ctx->bdev_name); 8902 free(ctx); 8903 return -ENOMEM; 8904 } 8905 8906 ctx->cb_fn = open_cb; 8907 ctx->cb_arg = open_cb_arg; 8908 ctx->write = write; 8909 ctx->event_cb = event_cb; 8910 ctx->event_ctx = event_ctx; 8911 ctx->orig_thread = spdk_get_thread(); 8912 ctx->start_ticks = spdk_get_ticks(); 8913 8914 bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts)); 8915 if (opts != NULL) { 8916 bdev_open_async_opts_copy(&ctx->opts, opts, opts->size); 8917 } 8918 8919 spdk_spin_lock(&g_bdev_mgr.spinlock); 8920 8921 TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8922 _bdev_open_async(ctx); 8923 8924 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8925 8926 return 0; 8927 } 8928 8929 static void 8930 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 8931 { 8932 int rc; 8933 8934 
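	/* Both callers, spdk_bdev_close() and bdev_register_finished(), invoke this
	 * with g_bdev_mgr.spinlock already held; bdev->internal.spinlock is taken
	 * below. */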
spdk_spin_lock(&bdev->internal.spinlock); 8935 spdk_spin_lock(&desc->spinlock); 8936 8937 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 8938 8939 desc->closed = true; 8940 8941 if (desc->claim != NULL) { 8942 bdev_desc_release_claims(desc); 8943 } 8944 8945 if (0 == desc->refs) { 8946 spdk_spin_unlock(&desc->spinlock); 8947 bdev_desc_free(desc); 8948 } else { 8949 spdk_spin_unlock(&desc->spinlock); 8950 } 8951 8952 /* If no more descriptors, kill QoS channel */ 8953 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8954 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 8955 bdev->name, spdk_get_thread()); 8956 8957 if (bdev_qos_destroy(bdev)) { 8958 /* There isn't anything we can do to recover here. Just let the 8959 * old QoS poller keep running. The QoS handling won't change 8960 * cores when the user allocates a new channel, but it won't break. */ 8961 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 8962 } 8963 } 8964 8965 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8966 rc = bdev_unregister_unsafe(bdev); 8967 spdk_spin_unlock(&bdev->internal.spinlock); 8968 8969 if (rc == 0) { 8970 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8971 } 8972 } else { 8973 spdk_spin_unlock(&bdev->internal.spinlock); 8974 } 8975 } 8976 8977 void 8978 spdk_bdev_close(struct spdk_bdev_desc *desc) 8979 { 8980 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8981 8982 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8983 spdk_get_thread()); 8984 8985 assert(desc->thread == spdk_get_thread()); 8986 8987 spdk_poller_unregister(&desc->io_timeout_poller); 8988 8989 spdk_spin_lock(&g_bdev_mgr.spinlock); 8990 8991 bdev_close(bdev, desc); 8992 8993 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8994 } 8995 8996 int32_t 8997 spdk_bdev_get_numa_id(struct spdk_bdev *bdev) 8998 { 8999 if (bdev->numa.id_valid) { 9000 return bdev->numa.id; 9001 } else { 9002 return SPDK_ENV_NUMA_ID_ANY; 9003 } 9004 } 9005 9006 static void 9007 bdev_register_finished(void *arg) 9008 { 9009 struct spdk_bdev_desc *desc = arg; 9010 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9011 9012 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 9013 9014 spdk_spin_lock(&g_bdev_mgr.spinlock); 9015 9016 bdev_close(bdev, desc); 9017 9018 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9019 } 9020 9021 int 9022 spdk_bdev_register(struct spdk_bdev *bdev) 9023 { 9024 struct spdk_bdev_desc *desc; 9025 struct spdk_thread *thread = spdk_get_thread(); 9026 int rc; 9027 9028 if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) { 9029 SPDK_ERRLOG("Cannot register bdev %s on thread %p (%s)\n", bdev->name, thread, 9030 thread ? 
spdk_thread_get_name(thread) : "null"); 9031 return -EINVAL; 9032 } 9033 9034 rc = bdev_register(bdev); 9035 if (rc != 0) { 9036 return rc; 9037 } 9038 9039 /* A descriptor is opened to prevent bdev deletion during examination */ 9040 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc); 9041 if (rc != 0) { 9042 spdk_bdev_unregister(bdev, NULL, NULL); 9043 return rc; 9044 } 9045 9046 rc = bdev_open(bdev, false, desc); 9047 if (rc != 0) { 9048 bdev_desc_free(desc); 9049 spdk_bdev_unregister(bdev, NULL, NULL); 9050 return rc; 9051 } 9052 9053 /* Examine configuration before initializing I/O */ 9054 bdev_examine(bdev); 9055 9056 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 9057 if (rc != 0) { 9058 bdev_close(bdev, desc); 9059 spdk_bdev_unregister(bdev, NULL, NULL); 9060 } 9061 9062 return rc; 9063 } 9064 9065 int 9066 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 9067 struct spdk_bdev_module *module) 9068 { 9069 spdk_spin_lock(&bdev->internal.spinlock); 9070 9071 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 9072 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 9073 spdk_spin_unlock(&bdev->internal.spinlock); 9074 return -EPERM; 9075 } 9076 9077 if (desc && !desc->write) { 9078 desc->write = true; 9079 } 9080 9081 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 9082 bdev->internal.claim.v1.module = module; 9083 9084 spdk_spin_unlock(&bdev->internal.spinlock); 9085 return 0; 9086 } 9087 9088 void 9089 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 9090 { 9091 spdk_spin_lock(&bdev->internal.spinlock); 9092 9093 assert(bdev->internal.claim.v1.module != NULL); 9094 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 9095 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 9096 bdev->internal.claim.v1.module = NULL; 9097 9098 spdk_spin_unlock(&bdev->internal.spinlock); 9099 } 9100 9101 /* 9102 * Start claims v2 9103 */ 9104 9105 const char * 9106 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 9107 { 9108 switch (type) { 9109 case SPDK_BDEV_CLAIM_NONE: 9110 return "not_claimed"; 9111 case SPDK_BDEV_CLAIM_EXCL_WRITE: 9112 return "exclusive_write"; 9113 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 9114 return "read_many_write_one"; 9115 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 9116 return "read_many_write_none"; 9117 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 9118 return "read_many_write_many"; 9119 default: 9120 break; 9121 } 9122 return "invalid_claim"; 9123 } 9124 9125 static bool 9126 claim_type_is_v2(enum spdk_bdev_claim_type type) 9127 { 9128 switch (type) { 9129 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 9130 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 9131 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 9132 return true; 9133 default: 9134 break; 9135 } 9136 return false; 9137 } 9138 9139 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
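 * This is the case for READ_MANY_WRITE_ONE and READ_MANY_WRITE_SHARED claims;
 * READ_MANY_WRITE_NONE deliberately leaves the descriptor read-only.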
*/ 9140 static bool 9141 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 9142 { 9143 switch (type) { 9144 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 9145 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 9146 return true; 9147 default: 9148 break; 9149 } 9150 return false; 9151 } 9152 9153 void 9154 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 9155 { 9156 if (opts == NULL) { 9157 SPDK_ERRLOG("opts should not be NULL\n"); 9158 assert(opts != NULL); 9159 return; 9160 } 9161 if (size == 0) { 9162 SPDK_ERRLOG("size should not be zero\n"); 9163 assert(size != 0); 9164 return; 9165 } 9166 9167 memset(opts, 0, size); 9168 opts->opts_size = size; 9169 9170 #define FIELD_OK(field) \ 9171 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 9172 9173 #define SET_FIELD(field, value) \ 9174 if (FIELD_OK(field)) { \ 9175 opts->field = value; \ 9176 } \ 9177 9178 SET_FIELD(shared_claim_key, 0); 9179 9180 #undef FIELD_OK 9181 #undef SET_FIELD 9182 } 9183 9184 static int 9185 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 9186 { 9187 if (src->opts_size == 0) { 9188 SPDK_ERRLOG("size should not be zero\n"); 9189 return -1; 9190 } 9191 9192 memset(dst, 0, sizeof(*dst)); 9193 dst->opts_size = src->opts_size; 9194 9195 #define FIELD_OK(field) \ 9196 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 9197 9198 #define SET_FIELD(field) \ 9199 if (FIELD_OK(field)) { \ 9200 dst->field = src->field; \ 9201 } \ 9202 9203 if (FIELD_OK(name)) { 9204 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 9205 } 9206 9207 SET_FIELD(shared_claim_key); 9208 9209 /* You should not remove this statement, but need to update the assert statement 9210 * if you add a new field, and also add a corresponding SET_FIELD statement */ 9211 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 9212 9213 #undef FIELD_OK 9214 #undef SET_FIELD 9215 return 0; 9216 } 9217 9218 /* Returns 0 if a read-write-once claim can be taken. */ 9219 static int 9220 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 9221 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 9222 { 9223 struct spdk_bdev *bdev = desc->bdev; 9224 struct spdk_bdev_desc *open_desc; 9225 9226 assert(spdk_spin_held(&bdev->internal.spinlock)); 9227 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 9228 9229 if (opts->shared_claim_key != 0) { 9230 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 9231 bdev->name); 9232 return -EINVAL; 9233 } 9234 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 9235 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 9236 return -EPERM; 9237 } 9238 if (desc->claim != NULL) { 9239 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 9240 bdev->name, desc->claim->module->name); 9241 return -EPERM; 9242 } 9243 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 9244 if (desc != open_desc && open_desc->write) { 9245 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 9246 "another descriptor is open for writing\n", 9247 bdev->name); 9248 return -EPERM; 9249 } 9250 } 9251 9252 return 0; 9253 } 9254 9255 /* Returns 0 if a read-only-many claim can be taken. 
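 * The descriptor must be read-only, opts->shared_claim_key must be unset, and, if the
 * bdev is not yet claimed, no other descriptor may be open for writing.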
*/ 9256 static int 9257 claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 9258 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 9259 { 9260 struct spdk_bdev *bdev = desc->bdev; 9261 struct spdk_bdev_desc *open_desc; 9262 9263 assert(spdk_spin_held(&bdev->internal.spinlock)); 9264 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE); 9265 assert(desc->claim == NULL); 9266 9267 if (desc->write) { 9268 SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n", 9269 bdev->name); 9270 return -EINVAL; 9271 } 9272 if (opts->shared_claim_key != 0) { 9273 SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name); 9274 return -EINVAL; 9275 } 9276 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 9277 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 9278 if (open_desc->write) { 9279 SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while " 9280 "another descriptor is open for writing\n", 9281 bdev->name); 9282 return -EPERM; 9283 } 9284 } 9285 } 9286 9287 return 0; 9288 } 9289 9290 /* Returns 0 if a read-write-many claim can be taken. */ 9291 static int 9292 claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 9293 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 9294 { 9295 struct spdk_bdev *bdev = desc->bdev; 9296 struct spdk_bdev_desc *open_desc; 9297 9298 assert(spdk_spin_held(&bdev->internal.spinlock)); 9299 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED); 9300 assert(desc->claim == NULL); 9301 9302 if (opts->shared_claim_key == 0) { 9303 SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n", 9304 bdev->name); 9305 return -EINVAL; 9306 } 9307 switch (bdev->internal.claim_type) { 9308 case SPDK_BDEV_CLAIM_NONE: 9309 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 9310 if (open_desc == desc) { 9311 continue; 9312 } 9313 if (open_desc->write) { 9314 SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while " 9315 "another descriptor is open for writing without a " 9316 "claim\n", bdev->name); 9317 return -EPERM; 9318 } 9319 } 9320 break; 9321 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 9322 if (opts->shared_claim_key != bdev->internal.claim.v2.key) { 9323 LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev); 9324 return -EPERM; 9325 } 9326 break; 9327 default: 9328 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 9329 return -EBUSY; 9330 } 9331 9332 return 0; 9333 } 9334 9335 /* Updates desc and its bdev with a v2 claim.
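 * Allocates a claim record, appends it to the bdev's v2 claim list, and promotes the
 * descriptor to writable when the claim type grants write access.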
*/ 9336 static int 9337 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 9338 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 9339 { 9340 struct spdk_bdev *bdev = desc->bdev; 9341 struct spdk_bdev_module_claim *claim; 9342 9343 assert(spdk_spin_held(&bdev->internal.spinlock)); 9344 assert(claim_type_is_v2(type)); 9345 assert(desc->claim == NULL); 9346 9347 claim = calloc(1, sizeof(*desc->claim)); 9348 if (claim == NULL) { 9349 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 9350 return -ENOMEM; 9351 } 9352 claim->module = module; 9353 claim->desc = desc; 9354 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 9355 memcpy(claim->name, opts->name, sizeof(claim->name)); 9356 desc->claim = claim; 9357 9358 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 9359 bdev->internal.claim_type = type; 9360 TAILQ_INIT(&bdev->internal.claim.v2.claims); 9361 bdev->internal.claim.v2.key = opts->shared_claim_key; 9362 } 9363 assert(type == bdev->internal.claim_type); 9364 9365 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 9366 9367 if (!desc->write && claim_type_promotes_to_write(type)) { 9368 desc->write = true; 9369 } 9370 9371 return 0; 9372 } 9373 9374 int 9375 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 9376 struct spdk_bdev_claim_opts *_opts, 9377 struct spdk_bdev_module *module) 9378 { 9379 struct spdk_bdev *bdev; 9380 struct spdk_bdev_claim_opts opts; 9381 int rc = 0; 9382 9383 if (desc == NULL) { 9384 SPDK_ERRLOG("descriptor must not be NULL\n"); 9385 return -EINVAL; 9386 } 9387 9388 bdev = desc->bdev; 9389 9390 if (_opts == NULL) { 9391 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 9392 } else if (claim_opts_copy(_opts, &opts) != 0) { 9393 return -EINVAL; 9394 } 9395 9396 spdk_spin_lock(&bdev->internal.spinlock); 9397 9398 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 9399 bdev->internal.claim_type != type) { 9400 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 9401 spdk_spin_unlock(&bdev->internal.spinlock); 9402 return -EPERM; 9403 } 9404 9405 if (claim_type_is_v2(type) && desc->claim != NULL) { 9406 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 9407 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 9408 spdk_spin_unlock(&bdev->internal.spinlock); 9409 return -EPERM; 9410 } 9411 9412 switch (type) { 9413 case SPDK_BDEV_CLAIM_EXCL_WRITE: 9414 spdk_spin_unlock(&bdev->internal.spinlock); 9415 return spdk_bdev_module_claim_bdev(bdev, desc, module); 9416 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 9417 rc = claim_verify_rwo(desc, type, &opts, module); 9418 break; 9419 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 9420 rc = claim_verify_rom(desc, type, &opts, module); 9421 break; 9422 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 9423 rc = claim_verify_rwm(desc, type, &opts, module); 9424 break; 9425 default: 9426 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 9427 rc = -ENOTSUP; 9428 } 9429 9430 if (rc == 0) { 9431 rc = claim_bdev(desc, type, &opts, module); 9432 } 9433 9434 spdk_spin_unlock(&bdev->internal.spinlock); 9435 return rc; 9436 } 9437 9438 static void 9439 claim_reset(struct spdk_bdev *bdev) 9440 { 9441 assert(spdk_spin_held(&bdev->internal.spinlock)); 9442 assert(claim_type_is_v2(bdev->internal.claim_type)); 9443 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 9444 9445 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 9446 
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 9447 } 9448 9449 static void 9450 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 9451 { 9452 struct spdk_bdev *bdev = desc->bdev; 9453 9454 assert(spdk_spin_held(&bdev->internal.spinlock)); 9455 assert(claim_type_is_v2(bdev->internal.claim_type)); 9456 9457 if (bdev->internal.examine_in_progress == 0) { 9458 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 9459 free(desc->claim); 9460 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 9461 claim_reset(bdev); 9462 } 9463 } else { 9464 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 9465 desc->claim->module = NULL; 9466 desc->claim->desc = NULL; 9467 } 9468 desc->claim = NULL; 9469 } 9470 9471 /* 9472 * End claims v2 9473 */ 9474 9475 struct spdk_bdev * 9476 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 9477 { 9478 assert(desc != NULL); 9479 return desc->bdev; 9480 } 9481 9482 int 9483 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 9484 { 9485 struct spdk_bdev *bdev, *tmp; 9486 struct spdk_bdev_desc *desc; 9487 int rc = 0; 9488 9489 assert(fn != NULL); 9490 9491 spdk_spin_lock(&g_bdev_mgr.spinlock); 9492 bdev = spdk_bdev_first(); 9493 while (bdev != NULL) { 9494 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc); 9495 if (rc != 0) { 9496 break; 9497 } 9498 rc = bdev_open(bdev, false, desc); 9499 if (rc != 0) { 9500 bdev_desc_free(desc); 9501 if (rc == -ENODEV) { 9502 /* Ignore the error and move to the next bdev. */ 9503 rc = 0; 9504 bdev = spdk_bdev_next(bdev); 9505 continue; 9506 } 9507 break; 9508 } 9509 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9510 9511 rc = fn(ctx, bdev); 9512 9513 spdk_spin_lock(&g_bdev_mgr.spinlock); 9514 tmp = spdk_bdev_next(bdev); 9515 bdev_close(bdev, desc); 9516 if (rc != 0) { 9517 break; 9518 } 9519 bdev = tmp; 9520 } 9521 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9522 9523 return rc; 9524 } 9525 9526 int 9527 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 9528 { 9529 struct spdk_bdev *bdev, *tmp; 9530 struct spdk_bdev_desc *desc; 9531 int rc = 0; 9532 9533 assert(fn != NULL); 9534 9535 spdk_spin_lock(&g_bdev_mgr.spinlock); 9536 bdev = spdk_bdev_first_leaf(); 9537 while (bdev != NULL) { 9538 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, NULL, &desc); 9539 if (rc != 0) { 9540 break; 9541 } 9542 rc = bdev_open(bdev, false, desc); 9543 if (rc != 0) { 9544 bdev_desc_free(desc); 9545 if (rc == -ENODEV) { 9546 /* Ignore the error and move to the next bdev. 
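 * (-ENODEV indicates the bdev is already being unregistered, so it is safe to skip.)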
*/ 9547 rc = 0; 9548 bdev = spdk_bdev_next_leaf(bdev); 9549 continue; 9550 } 9551 break; 9552 } 9553 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9554 9555 rc = fn(ctx, bdev); 9556 9557 spdk_spin_lock(&g_bdev_mgr.spinlock); 9558 tmp = spdk_bdev_next_leaf(bdev); 9559 bdev_close(bdev, desc); 9560 if (rc != 0) { 9561 break; 9562 } 9563 bdev = tmp; 9564 } 9565 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9566 9567 return rc; 9568 } 9569 9570 void 9571 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 9572 { 9573 struct iovec *iovs; 9574 int iovcnt; 9575 9576 if (bdev_io == NULL) { 9577 return; 9578 } 9579 9580 switch (bdev_io->type) { 9581 case SPDK_BDEV_IO_TYPE_READ: 9582 case SPDK_BDEV_IO_TYPE_WRITE: 9583 case SPDK_BDEV_IO_TYPE_ZCOPY: 9584 iovs = bdev_io->u.bdev.iovs; 9585 iovcnt = bdev_io->u.bdev.iovcnt; 9586 break; 9587 default: 9588 iovs = NULL; 9589 iovcnt = 0; 9590 break; 9591 } 9592 9593 if (iovp) { 9594 *iovp = iovs; 9595 } 9596 if (iovcntp) { 9597 *iovcntp = iovcnt; 9598 } 9599 } 9600 9601 void * 9602 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 9603 { 9604 if (bdev_io == NULL) { 9605 return NULL; 9606 } 9607 9608 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 9609 return NULL; 9610 } 9611 9612 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 9613 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 9614 return bdev_io->u.bdev.md_buf; 9615 } 9616 9617 return NULL; 9618 } 9619 9620 void * 9621 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 9622 { 9623 if (bdev_io == NULL) { 9624 assert(false); 9625 return NULL; 9626 } 9627 9628 return bdev_io->internal.caller_ctx; 9629 } 9630 9631 void 9632 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 9633 { 9634 9635 if (spdk_bdev_module_list_find(bdev_module->name)) { 9636 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 9637 assert(false); 9638 } 9639 9640 spdk_spin_init(&bdev_module->internal.spinlock); 9641 TAILQ_INIT(&bdev_module->internal.quiesced_ranges); 9642 9643 /* 9644 * Modules with examine callbacks must be initialized first, so they are 9645 * ready to handle examine callbacks from later modules that will 9646 * register physical bdevs. 
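 * Modules that provide examine callbacks are therefore inserted at the head of the
 * module list below, while all other modules are appended at the tail.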
9647 */ 9648 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 9649 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9650 } else { 9651 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9652 } 9653 } 9654 9655 struct spdk_bdev_module * 9656 spdk_bdev_module_list_find(const char *name) 9657 { 9658 struct spdk_bdev_module *bdev_module; 9659 9660 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 9661 if (strcmp(name, bdev_module->name) == 0) { 9662 break; 9663 } 9664 } 9665 9666 return bdev_module; 9667 } 9668 9669 static int 9670 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 9671 { 9672 uint64_t num_blocks; 9673 void *md_buf = NULL; 9674 9675 num_blocks = bdev_io->u.bdev.num_blocks; 9676 9677 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 9678 md_buf = (char *)g_bdev_mgr.zero_buffer + 9679 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 9680 } 9681 9682 return bdev_write_blocks_with_md(bdev_io->internal.desc, 9683 spdk_io_channel_from_ctx(bdev_io->internal.ch), 9684 g_bdev_mgr.zero_buffer, md_buf, 9685 bdev_io->u.bdev.offset_blocks, num_blocks, 9686 bdev_write_zero_buffer_done, bdev_io); 9687 } 9688 9689 static void 9690 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9691 { 9692 struct spdk_bdev_io *parent_io = cb_arg; 9693 9694 spdk_bdev_free_io(bdev_io); 9695 9696 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 9697 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 9698 } 9699 9700 static void 9701 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 9702 { 9703 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9704 ctx->bdev->internal.qos_mod_in_progress = false; 9705 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9706 9707 if (ctx->cb_fn) { 9708 ctx->cb_fn(ctx->cb_arg, status); 9709 } 9710 free(ctx); 9711 } 9712 9713 static void 9714 bdev_disable_qos_done(void *cb_arg) 9715 { 9716 struct set_qos_limit_ctx *ctx = cb_arg; 9717 struct spdk_bdev *bdev = ctx->bdev; 9718 struct spdk_bdev_qos *qos; 9719 9720 spdk_spin_lock(&bdev->internal.spinlock); 9721 qos = bdev->internal.qos; 9722 bdev->internal.qos = NULL; 9723 spdk_spin_unlock(&bdev->internal.spinlock); 9724 9725 if (qos->thread != NULL) { 9726 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 9727 spdk_poller_unregister(&qos->poller); 9728 } 9729 9730 free(qos); 9731 9732 bdev_set_qos_limit_done(ctx, 0); 9733 } 9734 9735 static void 9736 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 9737 { 9738 struct set_qos_limit_ctx *ctx = _ctx; 9739 struct spdk_thread *thread; 9740 9741 spdk_spin_lock(&bdev->internal.spinlock); 9742 thread = bdev->internal.qos->thread; 9743 spdk_spin_unlock(&bdev->internal.spinlock); 9744 9745 if (thread != NULL) { 9746 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 9747 } else { 9748 bdev_disable_qos_done(ctx); 9749 } 9750 } 9751 9752 static void 9753 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9754 struct spdk_io_channel *ch, void *_ctx) 9755 { 9756 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9757 struct spdk_bdev_io *bdev_io; 9758 9759 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 9760 9761 while (!TAILQ_EMPTY(&bdev_ch->qos_queued_io)) { 9762 /* Re-submit the queued I/O. 
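 * QoS is now disabled on this channel, so I/O that was previously throttled can be
 * submitted directly.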
*/ 9763 bdev_io = TAILQ_FIRST(&bdev_ch->qos_queued_io); 9764 TAILQ_REMOVE(&bdev_ch->qos_queued_io, bdev_io, internal.link); 9765 _bdev_io_submit(bdev_io); 9766 } 9767 9768 spdk_bdev_for_each_channel_continue(i, 0); 9769 } 9770 9771 static void 9772 bdev_update_qos_rate_limit_msg(void *cb_arg) 9773 { 9774 struct set_qos_limit_ctx *ctx = cb_arg; 9775 struct spdk_bdev *bdev = ctx->bdev; 9776 9777 spdk_spin_lock(&bdev->internal.spinlock); 9778 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 9779 spdk_spin_unlock(&bdev->internal.spinlock); 9780 9781 bdev_set_qos_limit_done(ctx, 0); 9782 } 9783 9784 static void 9785 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9786 struct spdk_io_channel *ch, void *_ctx) 9787 { 9788 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9789 9790 spdk_spin_lock(&bdev->internal.spinlock); 9791 bdev_enable_qos(bdev, bdev_ch); 9792 spdk_spin_unlock(&bdev->internal.spinlock); 9793 spdk_bdev_for_each_channel_continue(i, 0); 9794 } 9795 9796 static void 9797 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 9798 { 9799 struct set_qos_limit_ctx *ctx = _ctx; 9800 9801 bdev_set_qos_limit_done(ctx, status); 9802 } 9803 9804 static void 9805 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 9806 { 9807 int i; 9808 9809 assert(bdev->internal.qos != NULL); 9810 9811 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9812 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9813 bdev->internal.qos->rate_limits[i].limit = limits[i]; 9814 9815 if (limits[i] == 0) { 9816 bdev->internal.qos->rate_limits[i].limit = 9817 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 9818 } 9819 } 9820 } 9821 } 9822 9823 void 9824 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 9825 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 9826 { 9827 struct set_qos_limit_ctx *ctx; 9828 uint32_t limit_set_complement; 9829 uint64_t min_limit_per_sec; 9830 int i; 9831 bool disable_rate_limit = true; 9832 9833 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9834 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9835 continue; 9836 } 9837 9838 if (limits[i] > 0) { 9839 disable_rate_limit = false; 9840 } 9841 9842 if (bdev_qos_is_iops_rate_limit(i) == true) { 9843 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 9844 } else { 9845 if (limits[i] > SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC) { 9846 SPDK_WARNLOG("Requested rate limit %" PRIu64 " will result in uint64_t overflow, " 9847 "reset to %" PRIu64 "\n", limits[i], SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC); 9848 limits[i] = SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC; 9849 } 9850 /* Change from megabyte to byte rate limit */ 9851 limits[i] = limits[i] * 1024 * 1024; 9852 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 9853 } 9854 9855 limit_set_complement = limits[i] % min_limit_per_sec; 9856 if (limit_set_complement) { 9857 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 9858 limits[i], min_limit_per_sec); 9859 limits[i] += min_limit_per_sec - limit_set_complement; 9860 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 9861 } 9862 } 9863 9864 ctx = calloc(1, sizeof(*ctx)); 9865 if (ctx == NULL) { 9866 cb_fn(cb_arg, -ENOMEM); 9867 return; 9868 } 9869 9870 ctx->cb_fn = cb_fn; 9871 ctx->cb_arg = cb_arg; 9872 ctx->bdev = bdev; 9873 9874 spdk_spin_lock(&bdev->internal.spinlock); 9875 if (bdev->internal.qos_mod_in_progress) { 9876 spdk_spin_unlock(&bdev->internal.spinlock); 9877 free(ctx); 9878 cb_fn(cb_arg, 
-EAGAIN); 9879 return; 9880 } 9881 bdev->internal.qos_mod_in_progress = true; 9882 9883 if (disable_rate_limit == true && bdev->internal.qos) { 9884 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9885 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 9886 (bdev->internal.qos->rate_limits[i].limit > 0 && 9887 bdev->internal.qos->rate_limits[i].limit != 9888 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 9889 disable_rate_limit = false; 9890 break; 9891 } 9892 } 9893 } 9894 9895 if (disable_rate_limit == false) { 9896 if (bdev->internal.qos == NULL) { 9897 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 9898 if (!bdev->internal.qos) { 9899 spdk_spin_unlock(&bdev->internal.spinlock); 9900 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 9901 bdev_set_qos_limit_done(ctx, -ENOMEM); 9902 return; 9903 } 9904 } 9905 9906 if (bdev->internal.qos->thread == NULL) { 9907 /* Enabling */ 9908 bdev_set_qos_rate_limits(bdev, limits); 9909 9910 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 9911 bdev_enable_qos_done); 9912 } else { 9913 /* Updating */ 9914 bdev_set_qos_rate_limits(bdev, limits); 9915 9916 spdk_thread_send_msg(bdev->internal.qos->thread, 9917 bdev_update_qos_rate_limit_msg, ctx); 9918 } 9919 } else { 9920 if (bdev->internal.qos != NULL) { 9921 bdev_set_qos_rate_limits(bdev, limits); 9922 9923 /* Disabling */ 9924 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 9925 bdev_disable_qos_msg_done); 9926 } else { 9927 spdk_spin_unlock(&bdev->internal.spinlock); 9928 bdev_set_qos_limit_done(ctx, 0); 9929 return; 9930 } 9931 } 9932 9933 spdk_spin_unlock(&bdev->internal.spinlock); 9934 } 9935 9936 struct spdk_bdev_histogram_ctx { 9937 spdk_bdev_histogram_status_cb cb_fn; 9938 void *cb_arg; 9939 struct spdk_bdev *bdev; 9940 int status; 9941 }; 9942 9943 static void 9944 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9945 { 9946 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9947 9948 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9949 ctx->bdev->internal.histogram_in_progress = false; 9950 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9951 ctx->cb_fn(ctx->cb_arg, ctx->status); 9952 free(ctx); 9953 } 9954 9955 static void 9956 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9957 struct spdk_io_channel *_ch, void *_ctx) 9958 { 9959 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9960 9961 if (ch->histogram != NULL) { 9962 spdk_histogram_data_free(ch->histogram); 9963 ch->histogram = NULL; 9964 } 9965 spdk_bdev_for_each_channel_continue(i, 0); 9966 } 9967 9968 static void 9969 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9970 { 9971 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9972 9973 if (status != 0) { 9974 ctx->status = status; 9975 ctx->bdev->internal.histogram_enabled = false; 9976 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 9977 bdev_histogram_disable_channel_cb); 9978 } else { 9979 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9980 ctx->bdev->internal.histogram_in_progress = false; 9981 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9982 ctx->cb_fn(ctx->cb_arg, ctx->status); 9983 free(ctx); 9984 } 9985 } 9986 9987 static void 9988 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9989 struct spdk_io_channel *_ch, void *_ctx) 9990 { 9991 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9992 int status = 0; 9993 9994 if (ch->histogram == NULL) { 9995 
ch->histogram = spdk_histogram_data_alloc(); 9996 if (ch->histogram == NULL) { 9997 status = -ENOMEM; 9998 } 9999 } 10000 10001 spdk_bdev_for_each_channel_continue(i, status); 10002 } 10003 10004 void 10005 spdk_bdev_histogram_enable_ext(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 10006 void *cb_arg, bool enable, struct spdk_bdev_enable_histogram_opts *opts) 10007 { 10008 struct spdk_bdev_histogram_ctx *ctx; 10009 10010 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 10011 if (ctx == NULL) { 10012 cb_fn(cb_arg, -ENOMEM); 10013 return; 10014 } 10015 10016 ctx->bdev = bdev; 10017 ctx->status = 0; 10018 ctx->cb_fn = cb_fn; 10019 ctx->cb_arg = cb_arg; 10020 10021 spdk_spin_lock(&bdev->internal.spinlock); 10022 if (bdev->internal.histogram_in_progress) { 10023 spdk_spin_unlock(&bdev->internal.spinlock); 10024 free(ctx); 10025 cb_fn(cb_arg, -EAGAIN); 10026 return; 10027 } 10028 10029 bdev->internal.histogram_in_progress = true; 10030 spdk_spin_unlock(&bdev->internal.spinlock); 10031 10032 bdev->internal.histogram_enabled = enable; 10033 bdev->internal.histogram_io_type = opts->io_type; 10034 10035 if (enable) { 10036 /* Allocate histogram for each channel */ 10037 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 10038 bdev_histogram_enable_channel_cb); 10039 } else { 10040 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 10041 bdev_histogram_disable_channel_cb); 10042 } 10043 } 10044 10045 void 10046 spdk_bdev_enable_histogram_opts_init(struct spdk_bdev_enable_histogram_opts *opts, size_t size) 10047 { 10048 if (opts == NULL) { 10049 SPDK_ERRLOG("opts should not be NULL\n"); 10050 assert(opts != NULL); 10051 return; 10052 } 10053 if (size == 0) { 10054 SPDK_ERRLOG("size should not be zero\n"); 10055 assert(size != 0); 10056 return; 10057 } 10058 10059 memset(opts, 0, size); 10060 opts->size = size; 10061 10062 #define FIELD_OK(field) \ 10063 offsetof(struct spdk_bdev_enable_histogram_opts, field) + sizeof(opts->field) <= size 10064 10065 #define SET_FIELD(field, value) \ 10066 if (FIELD_OK(field)) { \ 10067 opts->field = value; \ 10068 } \ 10069 10070 SET_FIELD(io_type, 0); 10071 10072 /* You should not remove this statement, but need to update the assert statement 10073 * if you add a new field, and also add a corresponding SET_FIELD statement */ 10074 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_enable_histogram_opts) == 9, "Incorrect size"); 10075 10076 #undef FIELD_OK 10077 #undef SET_FIELD 10078 } 10079 10080 void 10081 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 10082 void *cb_arg, bool enable) 10083 { 10084 struct spdk_bdev_enable_histogram_opts opts; 10085 10086 spdk_bdev_enable_histogram_opts_init(&opts, sizeof(opts)); 10087 spdk_bdev_histogram_enable_ext(bdev, cb_fn, cb_arg, enable, &opts); 10088 } 10089 10090 struct spdk_bdev_histogram_data_ctx { 10091 spdk_bdev_histogram_data_cb cb_fn; 10092 void *cb_arg; 10093 struct spdk_bdev *bdev; 10094 /** merged histogram data from all channels */ 10095 struct spdk_histogram_data *histogram; 10096 }; 10097 10098 static void 10099 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 10100 { 10101 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 10102 10103 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 10104 free(ctx); 10105 } 10106 10107 static void 10108 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10109 struct spdk_io_channel *_ch, void *_ctx) 10110 { 10111 struct 
spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10112 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 10113 int status = 0; 10114 10115 if (ch->histogram == NULL) { 10116 status = -EFAULT; 10117 } else { 10118 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 10119 } 10120 10121 spdk_bdev_for_each_channel_continue(i, status); 10122 } 10123 10124 void 10125 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 10126 spdk_bdev_histogram_data_cb cb_fn, 10127 void *cb_arg) 10128 { 10129 struct spdk_bdev_histogram_data_ctx *ctx; 10130 10131 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 10132 if (ctx == NULL) { 10133 cb_fn(cb_arg, -ENOMEM, NULL); 10134 return; 10135 } 10136 10137 ctx->bdev = bdev; 10138 ctx->cb_fn = cb_fn; 10139 ctx->cb_arg = cb_arg; 10140 10141 ctx->histogram = histogram; 10142 10143 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 10144 bdev_histogram_get_channel_cb); 10145 } 10146 10147 void 10148 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 10149 void *cb_arg) 10150 { 10151 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 10152 int status = 0; 10153 10154 assert(cb_fn != NULL); 10155 10156 if (bdev_ch->histogram == NULL) { 10157 status = -EFAULT; 10158 } 10159 cb_fn(cb_arg, status, bdev_ch->histogram); 10160 } 10161 10162 size_t 10163 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 10164 size_t max_events) 10165 { 10166 struct media_event_entry *entry; 10167 size_t num_events = 0; 10168 10169 for (; num_events < max_events; ++num_events) { 10170 entry = TAILQ_FIRST(&desc->pending_media_events); 10171 if (entry == NULL) { 10172 break; 10173 } 10174 10175 events[num_events] = entry->event; 10176 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 10177 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 10178 } 10179 10180 return num_events; 10181 } 10182 10183 int 10184 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 10185 size_t num_events) 10186 { 10187 struct spdk_bdev_desc *desc; 10188 struct media_event_entry *entry; 10189 size_t event_id; 10190 int rc = 0; 10191 10192 assert(bdev->media_events); 10193 10194 spdk_spin_lock(&bdev->internal.spinlock); 10195 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 10196 if (desc->write) { 10197 break; 10198 } 10199 } 10200 10201 if (desc == NULL || desc->media_events_buffer == NULL) { 10202 rc = -ENODEV; 10203 goto out; 10204 } 10205 10206 for (event_id = 0; event_id < num_events; ++event_id) { 10207 entry = TAILQ_FIRST(&desc->free_media_events); 10208 if (entry == NULL) { 10209 break; 10210 } 10211 10212 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 10213 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 10214 entry->event = events[event_id]; 10215 } 10216 10217 rc = event_id; 10218 out: 10219 spdk_spin_unlock(&bdev->internal.spinlock); 10220 return rc; 10221 } 10222 10223 static void 10224 _media_management_notify(void *arg) 10225 { 10226 struct spdk_bdev_desc *desc = arg; 10227 10228 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 10229 } 10230 10231 void 10232 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 10233 { 10234 struct spdk_bdev_desc *desc; 10235 10236 spdk_spin_lock(&bdev->internal.spinlock); 10237 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 10238 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 10239 event_notify(desc, 
_media_management_notify); 10240 } 10241 } 10242 spdk_spin_unlock(&bdev->internal.spinlock); 10243 } 10244 10245 struct locked_lba_range_ctx { 10246 struct lba_range range; 10247 struct lba_range *current_range; 10248 struct lba_range *owner_range; 10249 struct spdk_poller *poller; 10250 lock_range_cb cb_fn; 10251 void *cb_arg; 10252 }; 10253 10254 static void 10255 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 10256 { 10257 struct locked_lba_range_ctx *ctx = _ctx; 10258 10259 ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM); 10260 free(ctx); 10261 } 10262 10263 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 10264 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 10265 10266 static void 10267 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 10268 { 10269 struct locked_lba_range_ctx *ctx = _ctx; 10270 10271 if (status == -ENOMEM) { 10272 /* One of the channels could not allocate a range object. 10273 * So we have to go back and clean up any ranges that were 10274 * allocated successfully before we return error status to 10275 * the caller. We can reuse the unlock function to do that 10276 * clean up. 10277 */ 10278 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 10279 bdev_lock_error_cleanup_cb); 10280 return; 10281 } 10282 10283 /* All channels have locked this range and no I/O overlapping the range 10284 * are outstanding! Set the owner_ch for the range object for the 10285 * locking channel, so that this channel will know that it is allowed 10286 * to write to this range. 10287 */ 10288 if (ctx->owner_range != NULL) { 10289 ctx->owner_range->owner_ch = ctx->range.owner_ch; 10290 } 10291 10292 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 10293 10294 /* Don't free the ctx here. Its range is in the bdev's global list of 10295 * locked ranges still, and will be removed and freed when this range 10296 * is later unlocked. 10297 */ 10298 } 10299 10300 static int 10301 bdev_lock_lba_range_check_io(void *_i) 10302 { 10303 struct spdk_bdev_channel_iter *i = _i; 10304 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 10305 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10306 struct locked_lba_range_ctx *ctx = i->ctx; 10307 struct lba_range *range = ctx->current_range; 10308 struct spdk_bdev_io *bdev_io; 10309 10310 spdk_poller_unregister(&ctx->poller); 10311 10312 /* The range is now in the locked_ranges, so no new IO can be submitted to this 10313 * range. But we need to wait until any outstanding IO overlapping with this range 10314 * are completed. 
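 * The poller below re-checks this channel's io_submitted list every 100 microseconds
 * until no overlapping I/O remains outstanding.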
10315 */ 10316 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 10317 if (bdev_io_range_is_locked(bdev_io, range)) { 10318 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 10319 return SPDK_POLLER_BUSY; 10320 } 10321 } 10322 10323 spdk_bdev_for_each_channel_continue(i, 0); 10324 return SPDK_POLLER_BUSY; 10325 } 10326 10327 static void 10328 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10329 struct spdk_io_channel *_ch, void *_ctx) 10330 { 10331 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10332 struct locked_lba_range_ctx *ctx = _ctx; 10333 struct lba_range *range; 10334 10335 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10336 if (range->length == ctx->range.length && 10337 range->offset == ctx->range.offset && 10338 range->locked_ctx == ctx->range.locked_ctx) { 10339 /* This range already exists on this channel, so don't add 10340 * it again. This can happen when a new channel is created 10341 * while the for_each_channel operation is in progress. 10342 * Do not check for outstanding I/O in that case, since the 10343 * range was locked before any I/O could be submitted to the 10344 * new channel. 10345 */ 10346 spdk_bdev_for_each_channel_continue(i, 0); 10347 return; 10348 } 10349 } 10350 10351 range = calloc(1, sizeof(*range)); 10352 if (range == NULL) { 10353 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 10354 return; 10355 } 10356 10357 range->length = ctx->range.length; 10358 range->offset = ctx->range.offset; 10359 range->locked_ctx = ctx->range.locked_ctx; 10360 range->quiesce = ctx->range.quiesce; 10361 ctx->current_range = range; 10362 if (ctx->range.owner_ch == ch) { 10363 /* This is the range object for the channel that will hold 10364 * the lock. Store it in the ctx object so that we can easily 10365 * set its owner_ch after the lock is finally acquired. 10366 */ 10367 ctx->owner_range = range; 10368 } 10369 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 10370 bdev_lock_lba_range_check_io(i); 10371 } 10372 10373 static void 10374 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 10375 { 10376 assert(spdk_get_thread() == ctx->range.owner_thread); 10377 assert(ctx->range.owner_ch == NULL || 10378 spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread); 10379 10380 /* We will add a copy of this range to each channel now. 
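 * Each channel gets its own copy of the range; bdev_lock_lba_range_cb runs once every
 * channel holds the range and any overlapping I/O has completed.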
*/ 10381 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 10382 bdev_lock_lba_range_cb); 10383 } 10384 10385 static bool 10386 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 10387 { 10388 struct lba_range *r; 10389 10390 TAILQ_FOREACH(r, tailq, tailq) { 10391 if (bdev_lba_range_overlapped(range, r)) { 10392 return true; 10393 } 10394 } 10395 return false; 10396 } 10397 10398 static void bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status); 10399 10400 static int 10401 _bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch, 10402 uint64_t offset, uint64_t length, 10403 lock_range_cb cb_fn, void *cb_arg) 10404 { 10405 struct locked_lba_range_ctx *ctx; 10406 10407 ctx = calloc(1, sizeof(*ctx)); 10408 if (ctx == NULL) { 10409 return -ENOMEM; 10410 } 10411 10412 ctx->range.offset = offset; 10413 ctx->range.length = length; 10414 ctx->range.owner_thread = spdk_get_thread(); 10415 ctx->range.owner_ch = ch; 10416 ctx->range.locked_ctx = cb_arg; 10417 ctx->range.bdev = bdev; 10418 ctx->range.quiesce = (cb_fn == bdev_quiesce_range_locked); 10419 ctx->cb_fn = cb_fn; 10420 ctx->cb_arg = cb_arg; 10421 10422 spdk_spin_lock(&bdev->internal.spinlock); 10423 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 10424 /* There is an active lock overlapping with this range. 10425 * Put it on the pending list until this range no 10426 * longer overlaps with another. 10427 */ 10428 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 10429 } else { 10430 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 10431 bdev_lock_lba_range_ctx(bdev, ctx); 10432 } 10433 spdk_spin_unlock(&bdev->internal.spinlock); 10434 return 0; 10435 } 10436 10437 static int 10438 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 10439 uint64_t offset, uint64_t length, 10440 lock_range_cb cb_fn, void *cb_arg) 10441 { 10442 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10443 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10444 10445 if (cb_arg == NULL) { 10446 SPDK_ERRLOG("cb_arg must not be NULL\n"); 10447 return -EINVAL; 10448 } 10449 10450 return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg); 10451 } 10452 10453 static void 10454 bdev_lock_lba_range_ctx_msg(void *_ctx) 10455 { 10456 struct locked_lba_range_ctx *ctx = _ctx; 10457 10458 bdev_lock_lba_range_ctx(ctx->range.bdev, ctx); 10459 } 10460 10461 static void 10462 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 10463 { 10464 struct locked_lba_range_ctx *ctx = _ctx; 10465 struct locked_lba_range_ctx *pending_ctx; 10466 struct lba_range *range, *tmp; 10467 10468 spdk_spin_lock(&bdev->internal.spinlock); 10469 /* Check if there are any pending locked ranges that overlap with this range 10470 * that was just unlocked. If there are, check that it doesn't overlap with any 10471 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 10472 * the lock process. 
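 * Any pending lock that can now proceed is moved to locked_ranges and restarted on
 * its owner thread via spdk_thread_send_msg().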
10473 */ 10474 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 10475 if (bdev_lba_range_overlapped(range, &ctx->range) && 10476 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 10477 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 10478 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 10479 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 10480 spdk_thread_send_msg(pending_ctx->range.owner_thread, 10481 bdev_lock_lba_range_ctx_msg, pending_ctx); 10482 } 10483 } 10484 spdk_spin_unlock(&bdev->internal.spinlock); 10485 10486 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 10487 free(ctx); 10488 } 10489 10490 static void 10491 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10492 struct spdk_io_channel *_ch, void *_ctx) 10493 { 10494 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10495 struct locked_lba_range_ctx *ctx = _ctx; 10496 TAILQ_HEAD(, spdk_bdev_io) io_locked; 10497 struct spdk_bdev_io *bdev_io; 10498 struct lba_range *range; 10499 10500 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10501 if (ctx->range.offset == range->offset && 10502 ctx->range.length == range->length && 10503 ctx->range.locked_ctx == range->locked_ctx) { 10504 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 10505 free(range); 10506 break; 10507 } 10508 } 10509 10510 /* Note: we should almost always be able to assert that the range specified 10511 * was found. But there are some very rare corner cases where a new channel 10512 * gets created simultaneously with a range unlock, where this function 10513 * would execute on that new channel and wouldn't have the range. 10514 * We also use this to clean up range allocations when a later allocation 10515 * fails in the locking path. 10516 * So we can't actually assert() here. 10517 */ 10518 10519 /* Swap the locked IO into a temporary list, and then try to submit them again. 10520 * We could hyper-optimize this to only resubmit locked I/O that overlap 10521 * with the range that was just unlocked, but this isn't a performance path so 10522 * we go for simplicity here. 10523 */ 10524 TAILQ_INIT(&io_locked); 10525 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 10526 while (!TAILQ_EMPTY(&io_locked)) { 10527 bdev_io = TAILQ_FIRST(&io_locked); 10528 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 10529 bdev_io_submit(bdev_io); 10530 } 10531 10532 spdk_bdev_for_each_channel_continue(i, 0); 10533 } 10534 10535 static int 10536 _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length, 10537 lock_range_cb cb_fn, void *cb_arg) 10538 { 10539 struct locked_lba_range_ctx *ctx; 10540 struct lba_range *range; 10541 10542 spdk_spin_lock(&bdev->internal.spinlock); 10543 /* To start the unlock process, we find the range in the bdev's locked_ranges 10544 * and remove it. This ensures new channels don't inherit the locked range. 10545 * Then we will send a message to each channel to remove the range from its 10546 * per-channel list.
10547 */ 10548 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 10549 if (range->offset == offset && range->length == length && 10550 (range->owner_ch == NULL || range->locked_ctx == cb_arg)) { 10551 break; 10552 } 10553 } 10554 if (range == NULL) { 10555 assert(false); 10556 spdk_spin_unlock(&bdev->internal.spinlock); 10557 return -EINVAL; 10558 } 10559 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 10560 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 10561 spdk_spin_unlock(&bdev->internal.spinlock); 10562 10563 ctx->cb_fn = cb_fn; 10564 ctx->cb_arg = cb_arg; 10565 10566 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 10567 bdev_unlock_lba_range_cb); 10568 return 0; 10569 } 10570 10571 static int 10572 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 10573 uint64_t offset, uint64_t length, 10574 lock_range_cb cb_fn, void *cb_arg) 10575 { 10576 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10577 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10578 struct lba_range *range; 10579 bool range_found = false; 10580 10581 /* Let's make sure the specified channel actually has a lock on 10582 * the specified range. Note that the range must match exactly. 10583 */ 10584 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10585 if (range->offset == offset && range->length == length && 10586 range->owner_ch == ch && range->locked_ctx == cb_arg) { 10587 range_found = true; 10588 break; 10589 } 10590 } 10591 10592 if (!range_found) { 10593 return -EINVAL; 10594 } 10595 10596 return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg); 10597 } 10598 10599 struct bdev_quiesce_ctx { 10600 spdk_bdev_quiesce_cb cb_fn; 10601 void *cb_arg; 10602 }; 10603 10604 static void 10605 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status) 10606 { 10607 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 10608 10609 if (quiesce_ctx->cb_fn != NULL) { 10610 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 10611 } 10612 10613 free(quiesce_ctx); 10614 } 10615 10616 static void 10617 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status) 10618 { 10619 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 10620 struct spdk_bdev_module *module = range->bdev->module; 10621 10622 if (status != 0) { 10623 if (quiesce_ctx->cb_fn != NULL) { 10624 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 10625 } 10626 free(quiesce_ctx); 10627 return; 10628 } 10629 10630 spdk_spin_lock(&module->internal.spinlock); 10631 TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module); 10632 spdk_spin_unlock(&module->internal.spinlock); 10633 10634 if (quiesce_ctx->cb_fn != NULL) { 10635 /* copy the context in case the range is unlocked by the callback */ 10636 struct bdev_quiesce_ctx tmp = *quiesce_ctx; 10637 10638 quiesce_ctx->cb_fn = NULL; 10639 quiesce_ctx->cb_arg = NULL; 10640 10641 tmp.cb_fn(tmp.cb_arg, status); 10642 } 10643 /* quiesce_ctx will be freed on unquiesce */ 10644 } 10645 10646 static int 10647 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10648 uint64_t offset, uint64_t length, 10649 spdk_bdev_quiesce_cb cb_fn, void *cb_arg, 10650 bool unquiesce) 10651 { 10652 struct bdev_quiesce_ctx *quiesce_ctx; 10653 int rc; 10654 10655 if (module != bdev->module) { 10656 SPDK_ERRLOG("Bdev does not belong to specified module.\n"); 10657 return -EINVAL; 10658 } 10659 10660 if (!bdev_io_valid_blocks(bdev, offset, length)) { 10661 return -EINVAL; 10662 } 
10663 10664 if (unquiesce) { 10665 struct lba_range *range; 10666 10667 /* Make sure the specified range is actually quiesced in the specified module and 10668 * then remove it from the list. Note that the range must match exactly. 10669 */ 10670 spdk_spin_lock(&module->internal.spinlock); 10671 TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) { 10672 if (range->bdev == bdev && range->offset == offset && range->length == length) { 10673 TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module); 10674 break; 10675 } 10676 } 10677 spdk_spin_unlock(&module->internal.spinlock); 10678 10679 if (range == NULL) { 10680 SPDK_ERRLOG("The range to unquiesce was not found.\n"); 10681 return -EINVAL; 10682 } 10683 10684 quiesce_ctx = range->locked_ctx; 10685 quiesce_ctx->cb_fn = cb_fn; 10686 quiesce_ctx->cb_arg = cb_arg; 10687 10688 rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx); 10689 } else { 10690 quiesce_ctx = malloc(sizeof(*quiesce_ctx)); 10691 if (quiesce_ctx == NULL) { 10692 return -ENOMEM; 10693 } 10694 10695 quiesce_ctx->cb_fn = cb_fn; 10696 quiesce_ctx->cb_arg = cb_arg; 10697 10698 rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx); 10699 if (rc != 0) { 10700 free(quiesce_ctx); 10701 } 10702 } 10703 10704 return rc; 10705 } 10706 10707 int 10708 spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10709 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10710 { 10711 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false); 10712 } 10713 10714 int 10715 spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10716 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10717 { 10718 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true); 10719 } 10720 10721 int 10722 spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10723 uint64_t offset, uint64_t length, 10724 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10725 { 10726 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false); 10727 } 10728 10729 int 10730 spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10731 uint64_t offset, uint64_t length, 10732 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10733 { 10734 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true); 10735 } 10736 10737 int 10738 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 10739 int array_size) 10740 { 10741 if (!bdev) { 10742 return -EINVAL; 10743 } 10744 10745 if (bdev->fn_table->get_memory_domains) { 10746 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 10747 } 10748 10749 return 0; 10750 } 10751 10752 struct spdk_bdev_for_each_io_ctx { 10753 void *ctx; 10754 spdk_bdev_io_fn fn; 10755 spdk_bdev_for_each_io_cb cb; 10756 }; 10757 10758 static void 10759 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10760 struct spdk_io_channel *io_ch, void *_ctx) 10761 { 10762 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10763 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 10764 struct spdk_bdev_io *bdev_io; 10765 int rc = 0; 10766 10767 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 10768 rc = ctx->fn(ctx->ctx, bdev_io); 10769 if (rc != 0) { 10770 break; 10771 } 10772 } 10773 10774 spdk_bdev_for_each_channel_continue(i, rc); 10775 } 10776 10777 static void 10778 
bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 10779 { 10780 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10781 10782 ctx->cb(ctx->ctx, status); 10783 10784 free(ctx); 10785 } 10786 10787 void 10788 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 10789 spdk_bdev_for_each_io_cb cb) 10790 { 10791 struct spdk_bdev_for_each_io_ctx *ctx; 10792 10793 assert(fn != NULL && cb != NULL); 10794 10795 ctx = calloc(1, sizeof(*ctx)); 10796 if (ctx == NULL) { 10797 SPDK_ERRLOG("Failed to allocate context.\n"); 10798 cb(_ctx, -ENOMEM); 10799 return; 10800 } 10801 10802 ctx->ctx = _ctx; 10803 ctx->fn = fn; 10804 ctx->cb = cb; 10805 10806 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 10807 bdev_for_each_io_done); 10808 } 10809 10810 void 10811 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 10812 { 10813 spdk_for_each_channel_continue(iter->i, status); 10814 } 10815 10816 static struct spdk_bdev * 10817 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 10818 { 10819 void *io_device = spdk_io_channel_iter_get_io_device(i); 10820 10821 return __bdev_from_io_dev(io_device); 10822 } 10823 10824 static void 10825 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 10826 { 10827 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10828 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10829 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 10830 10831 iter->i = i; 10832 iter->fn(iter, bdev, ch, iter->ctx); 10833 } 10834 10835 static void 10836 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 10837 { 10838 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10839 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10840 10841 iter->i = i; 10842 iter->cpl(bdev, iter->ctx, status); 10843 10844 free(iter); 10845 } 10846 10847 void 10848 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 10849 void *ctx, spdk_bdev_for_each_channel_done cpl) 10850 { 10851 struct spdk_bdev_channel_iter *iter; 10852 10853 assert(bdev != NULL && fn != NULL && ctx != NULL); 10854 10855 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 10856 if (iter == NULL) { 10857 SPDK_ERRLOG("Unable to allocate iterator\n"); 10858 assert(false); 10859 return; 10860 } 10861 10862 iter->fn = fn; 10863 iter->cpl = cpl; 10864 iter->ctx = ctx; 10865 10866 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 10867 iter, bdev_each_channel_cpl); 10868 } 10869 10870 static void 10871 bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10872 { 10873 struct spdk_bdev_io *parent_io = cb_arg; 10874 10875 spdk_bdev_free_io(bdev_io); 10876 10877 /* Check return status of write */ 10878 parent_io->internal.status = success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 10879 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 10880 } 10881 10882 static void 10883 bdev_copy_do_write(void *_bdev_io) 10884 { 10885 struct spdk_bdev_io *bdev_io = _bdev_io; 10886 int rc; 10887 10888 /* Write blocks */ 10889 rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc, 10890 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10891 bdev_io->u.bdev.iovs[0].iov_base, 10892 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks, 10893 bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io); 10894 10895 if (rc == -ENOMEM) { 10896 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write); 10897 } else if (rc != 0) { 10898 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10899 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10900 } 10901 } 10902 10903 static void 10904 bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10905 { 10906 struct spdk_bdev_io *parent_io = cb_arg; 10907 10908 spdk_bdev_free_io(bdev_io); 10909 10910 /* Check return status of read */ 10911 if (!success) { 10912 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10913 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 10914 return; 10915 } 10916 10917 /* Do write */ 10918 bdev_copy_do_write(parent_io); 10919 } 10920 10921 static void 10922 bdev_copy_do_read(void *_bdev_io) 10923 { 10924 struct spdk_bdev_io *bdev_io = _bdev_io; 10925 int rc; 10926 10927 /* Read blocks */ 10928 rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc, 10929 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10930 bdev_io->u.bdev.iovs[0].iov_base, 10931 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks, 10932 bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io); 10933 10934 if (rc == -ENOMEM) { 10935 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read); 10936 } else if (rc != 0) { 10937 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10938 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10939 } 10940 } 10941 10942 static void 10943 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 10944 { 10945 if (!success) { 10946 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10947 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10948 return; 10949 } 10950 10951 bdev_copy_do_read(bdev_io); 10952 } 10953 10954 int 10955 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 10956 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 10957 spdk_bdev_io_completion_cb cb, void *cb_arg) 10958 { 10959 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10960 struct spdk_bdev_io *bdev_io; 10961 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 10962 10963 if (!desc->write) { 10964 return -EBADF; 10965 } 10966 10967 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 10968 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 10969 SPDK_DEBUGLOG(bdev, 10970 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 10971 dst_offset_blocks, src_offset_blocks, num_blocks); 10972 return -EINVAL; 10973 } 10974 10975 bdev_io = bdev_channel_get_io(channel); 10976 if (!bdev_io) { 10977 return -ENOMEM; 10978 } 10979 10980 bdev_io->internal.ch = channel; 10981 bdev_io->internal.desc = desc; 10982 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 10983 10984 
bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 10985 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 10986 bdev_io->u.bdev.num_blocks = num_blocks; 10987 bdev_io->u.bdev.memory_domain = NULL; 10988 bdev_io->u.bdev.memory_domain_ctx = NULL; 10989 bdev_io->u.bdev.iovs = NULL; 10990 bdev_io->u.bdev.iovcnt = 0; 10991 bdev_io->u.bdev.md_buf = NULL; 10992 bdev_io->u.bdev.accel_sequence = NULL; 10993 bdev_io_init(bdev_io, bdev, cb_arg, cb); 10994 10995 if (dst_offset_blocks == src_offset_blocks || num_blocks == 0) { 10996 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 10997 return 0; 10998 } 10999 11000 11001 /* If the copy size is large and should be split, use the generic split logic 11002 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not. 11003 * 11004 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or 11005 * emulate it using regular read and write requests otherwise. 11006 */ 11007 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) || 11008 bdev_io->internal.f.split) { 11009 bdev_io_submit(bdev_io); 11010 return 0; 11011 } 11012 11013 spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev)); 11014 11015 return 0; 11016 } 11017 11018 SPDK_LOG_REGISTER_COMPONENT(bdev) 11019 11020 static void 11021 bdev_trace(void) 11022 { 11023 struct spdk_trace_tpoint_opts opts[] = { 11024 { 11025 "BDEV_IO_START", TRACE_BDEV_IO_START, 11026 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 1, 11027 { 11028 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 11029 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 11030 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 11031 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 11032 } 11033 }, 11034 { 11035 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 11036 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 0, 11037 { 11038 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 11039 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 11040 } 11041 }, 11042 { 11043 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 11044 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 11045 { 11046 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 11047 } 11048 }, 11049 { 11050 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 11051 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 11052 { 11053 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 11054 } 11055 }, 11056 }; 11057 11058 11059 spdk_trace_register_owner_type(OWNER_TYPE_BDEV, 'b'); 11060 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 11061 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 11062 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 11063 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 11064 spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_START, OBJECT_BDEV_IO, 0); 11065 spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_COMPLETE, OBJECT_BDEV_IO, 0); 11066 spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_START, OBJECT_BDEV_IO, 0); 11067 spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_DONE, OBJECT_BDEV_IO, 0); 11068 } 11069 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 11070
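/*
 * Illustrative sketch (not part of this file's implementation): one way an external
 * bdev module might take a read-many-write-shared (v2) claim using the public APIs
 * implemented above. The names example_event_cb, example_claim_shared and the key
 * value are hypothetical placeholders.
 *
 *	static void
 *	example_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
 *	{
 *		// Handle SPDK_BDEV_EVENT_REMOVE and related events here.
 *	}
 *
 *	static int
 *	example_claim_shared(const char *bdev_name, struct spdk_bdev_module *module)
 *	{
 *		struct spdk_bdev_desc *desc;
 *		struct spdk_bdev_claim_opts opts;
 *		int rc;
 *
 *		rc = spdk_bdev_open_ext(bdev_name, true, example_event_cb, NULL, &desc);
 *		if (rc != 0) {
 *			return rc;
 *		}
 *
 *		spdk_bdev_claim_opts_init(&opts, sizeof(opts));
 *		opts.shared_claim_key = 0x5500AA;	// shared claims require a non-zero key
 *
 *		rc = spdk_bdev_module_claim_bdev_desc(desc, SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED,
 *						      &opts, module);
 *		if (rc != 0) {
 *			spdk_bdev_close(desc);
 *		}
 *		return rc;
 *	}
 */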