/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"
#include "spdk/dma.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"
#include "spdk_internal/assert.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE 256
#define SPDK_BDEV_AUTO_EXAMINE true
#define BUF_SMALL_CACHE_SIZE 128
#define BUF_LARGE_CACHE_SIZE 16
#define NOMEM_THRESHOLD_COUNT 8

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024)
#define SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC (UINT64_MAX / (1024 * 1024))
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000

/* The maximum number of children requests for a UNMAP or WRITE ZEROES command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000
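/*
 * Worked example for the QoS defines above (illustrative numbers, not from the
 * original file): with SPDK_BDEV_QOS_TIMESLICE_IN_USEC == 1000, a 10000 IO/s
 * limit translates to roughly 10 I/Os per 1 ms timeslice, and a 100 MB/s limit
 * to roughly 100 * 1024 * 1024 / 1000 bytes (about 105 KB) per timeslice.
 * SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE and SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE
 * keep those per-timeslice quotas from rounding down to zero for very small
 * limits.
 */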
/* The maximum number of children requests for a COPY command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8)

#define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \
	log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev)
#ifdef DEBUG
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \
	log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev)
#else
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0)
#endif

static void log_already_claimed(enum spdk_log_level level, const int line, const char *func,
				const char *detail, struct spdk_bdev *bdev);

static const char *qos_rpc_type[] = {"rw_ios_per_sec",
	"rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
};

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

RB_HEAD(bdev_name_tree, spdk_bdev_name);

static int
bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
{
	return strcmp(name1->name, name2->name);
}

RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	void *zero_buffer;

	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

	struct spdk_bdev_list bdevs;
	struct bdev_name_tree bdev_names;

	bool init_complete;
	bool module_init_complete;

	struct spdk_spinlock spinlock;

	TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain *domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
	.init_complete = false,
	.module_init_complete = false,
	.async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens),
};

static void
__attribute__((constructor))
_bdev_init(void)
{
	spdk_spin_init(&g_bdev_mgr.spinlock);
}

typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status);

typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);

struct lba_range {
	struct spdk_bdev *bdev;
	uint64_t offset;
	uint64_t length;
	bool quiesce;
	void *locked_ctx;
	struct spdk_thread *owner_thread;
	struct spdk_bdev_channel *owner_ch;
	TAILQ_ENTRY(lba_range) tailq;
	TAILQ_ENTRY(lba_range) tailq_module;
};

static struct spdk_bdev_opts g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
	.iobuf_small_cache_size = BUF_SMALL_CACHE_SIZE,
	.iobuf_large_cache_size = BUF_LARGE_CACHE_SIZE,
};

static spdk_bdev_init_cb g_init_cb_fn = NULL;
static void *g_init_cb_arg = NULL;

static spdk_bdev_fini_cb g_fini_cb_fn = NULL;
static void *g_fini_cb_arg = NULL;
static struct spdk_thread *g_fini_thread = NULL;

struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
	 * For remaining bytes, allowed to run negative if an I/O is submitted when
	 * some bytes are remaining, but the I/O is bigger than that amount. The
	 * excess will be deducted from the next timeslice.
	 */
	int64_t remaining_this_timeslice;
	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO.
	 * If the IO is allowed to pass, the quota will be reduced correspondingly.
	 */
	bool (*queue_io)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to rewind the quota once the IO was allowed to be sent by this
	 * limit but was queued due to one of the other limits.
	 */
	void (*rewind_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

struct spdk_bdev_qos {
	/** Rate limits, one per limit type. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache. Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t per_thread_cache_count;
	uint32_t bdev_io_cache_size;

	struct spdk_iobuf_channel iobuf;

	TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * queue their I/O awaiting retry here, which makes it possible to retry sending
 * I/O to one bdev after I/O from another bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel *shared_ch;

	struct spdk_poller *nomem_poller;

	/* Refcount of bdev channels using this resource */
	uint32_t ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS (1 << 0)
#define BDEV_CH_QOS_ENABLED (1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev *bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel *channel;

	/* Accel channel */
	struct spdk_io_channel *accel_channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat *stat;

	/*
	 * Count of I/O submitted to the underlying dev module through this channel
	 * and waiting for completion.
	 */
	uint64_t io_outstanding;

	/*
	 * List of all submitted I/Os including I/O that are generated via splitting.
	 */
	bdev_io_tailq_t io_submitted;

	/*
	 * List of spdk_bdev_io that are currently queued because they write to a locked
	 * LBA range.
	 */
	bdev_io_tailq_t io_locked;

	/* List of I/Os with accel sequence being currently executed */
	bdev_io_tailq_t io_accel_exec;

	/* List of I/Os doing memory domain pull/push */
	bdev_io_tailq_t io_memory_domain;

	uint32_t flags;

	/* Counts number of bdev_io in the io_submitted TAILQ */
	uint16_t queue_depth;

	uint16_t trace_id;

	struct spdk_histogram_data *histogram;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t start_tsc;
	uint64_t interval_tsc;
	__itt_string_handle *handle;
	struct spdk_bdev_io_stat *prev_stat;
#endif

	bdev_io_tailq_t queued_resets;

	lba_range_tailq_t locked_ranges;

	/** List of I/Os queued by QoS. */
	bdev_io_tailq_t qos_queued_io;
};

struct media_event_entry {
	struct spdk_bdev_media_event event;
	TAILQ_ENTRY(media_event_entry) tailq;
};

#define MEDIA_EVENT_POOL_SIZE 64

struct spdk_bdev_desc {
	struct spdk_bdev *bdev;
	struct spdk_thread *thread;
	struct {
		spdk_bdev_event_cb_t event_fn;
		void *ctx;
	} callback;
	bool closed;
	bool write;
	bool memory_domains_supported;
	bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES];
	struct spdk_spinlock spinlock;
	uint32_t refs;
	TAILQ_HEAD(, media_event_entry) pending_media_events;
	TAILQ_HEAD(, media_event_entry) free_media_events;
	struct media_event_entry *media_events_buffer;
	TAILQ_ENTRY(spdk_bdev_desc) link;

	uint64_t timeout_in_sec;
	spdk_bdev_io_timeout_cb cb_fn;
	void *cb_arg;
	struct spdk_poller *io_timeout_poller;
	struct spdk_bdev_module_claim *claim;
};

struct spdk_bdev_iostat_ctx {
	struct spdk_bdev_io_stat *stat;
	spdk_bdev_get_device_stat_cb cb;
	void *cb_arg;
};

struct set_qos_limit_ctx {
	void (*cb_fn)(void *cb_arg, int status);
	void *cb_arg;
	struct spdk_bdev *bdev;
};

struct spdk_bdev_channel_iter {
	spdk_bdev_for_each_channel_msg fn;
	spdk_bdev_for_each_channel_done cpl;
	struct spdk_io_channel_iter *i;
	void *ctx;
};

struct spdk_bdev_io_error_stat {
	uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS];
};

enum bdev_io_retry_state {
	BDEV_IO_RETRY_STATE_INVALID,
	BDEV_IO_RETRY_STATE_PULL,
	BDEV_IO_RETRY_STATE_PULL_MD,
	BDEV_IO_RETRY_STATE_SUBMIT,
	BDEV_IO_RETRY_STATE_PUSH,
	BDEV_IO_RETRY_STATE_PUSH_MD,
};

#define __bdev_to_io_dev(bdev) (((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1))
#define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch))
#define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch))

static inline void bdev_io_complete(void *ctx);
static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io);
static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io);
static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io);

static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io);
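/*
 * Illustration for the __bdev_to_io_dev()/__bdev_from_io_dev() macros above
 * (not part of the original file): the io_device handle is the bdev pointer
 * offset by one byte, and the second macro simply undoes that offset, e.g.
 *
 *	struct spdk_bdev *bdev = ...;
 *	void *io_dev = __bdev_to_io_dev(bdev);		// (char *)bdev + 1
 *	assert(__bdev_from_io_dev(io_dev) == bdev);	// round-trips back
 *
 * The offset presumably keeps this io_device handle distinct from the raw bdev
 * pointer, which other code may register as an io_device of its own.
 */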
static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				struct spdk_io_channel *ch, void *_ctx);
static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status);

static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				     struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
				     uint64_t num_blocks,
				     struct spdk_memory_domain *domain, void *domain_ctx,
				     struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
				     spdk_bdev_io_completion_cb cb, void *cb_arg);
static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				      struct iovec *iov, int iovcnt, void *md_buf,
				      uint64_t offset_blocks, uint64_t num_blocks,
				      struct spdk_memory_domain *domain, void *domain_ctx,
				      struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
				      uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw,
				      spdk_bdev_io_completion_cb cb, void *cb_arg);

static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
			       uint64_t offset, uint64_t length,
			       lock_range_cb cb_fn, void *cb_arg);

static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
				 uint64_t offset, uint64_t length,
				 lock_range_cb cb_fn, void *cb_arg);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);

static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
static void claim_reset(struct spdk_bdev *bdev);

static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch);

#define bdev_get_ext_io_opt(opts, field, defval) \
	((opts) != NULL ? SPDK_GET_FIELD(opts, field, defval) : (defval))

static inline void
bdev_ch_add_to_io_submitted(struct spdk_bdev_io *bdev_io)
{
	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
	bdev_io->internal.ch->queue_depth++;
}

static inline void
bdev_ch_remove_from_io_submitted(struct spdk_bdev_io *bdev_io)
{
	TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
	bdev_io->internal.ch->queue_depth--;
}

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(iobuf_small_cache_size);
	SET_FIELD(iobuf_large_cache_size);

	/* Do not remove this statement. Always update it when adding a new field,
	 * and do not forget to add the SET_FIELD statement for your added field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}

int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	uint32_t min_pool_size;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
	 */
	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
	if (opts->bdev_io_pool_size < min_pool_size) {
		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
			    spdk_thread_get_count());
		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
		return -1;
	}

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
		g_bdev_opts.field = opts->field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(iobuf_small_cache_size);
	SET_FIELD(iobuf_large_cache_size);

	g_bdev_opts.opts_size = opts->opts_size;

#undef SET_FIELD

	return 0;
}

static struct spdk_bdev *
bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev_name find;
	struct spdk_bdev_name *res;

	find.name = (char *)bdev_name;
	res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find);
	if (res != NULL) {
		return res->bdev;
	}

	return NULL;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev *bdev;

	spdk_spin_lock(&g_bdev_mgr.spinlock);
	bdev = bdev_get_by_name(bdev_name);
	spdk_spin_unlock(&g_bdev_mgr.spinlock);

	return bdev;
}

struct bdev_io_status_string {
	enum spdk_bdev_io_status status;
	const char *str;
};

static const struct bdev_io_status_string bdev_io_status_strings[] = {
	{ SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" },
	{ SPDK_BDEV_IO_STATUS_ABORTED, "aborted" },
	{ SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" },
	{ SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" },
	{ SPDK_BDEV_IO_STATUS_NOMEM, "nomem" },
	{ SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" },
	{ SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" },
	{ SPDK_BDEV_IO_STATUS_FAILED, "failed" },
	{ SPDK_BDEV_IO_STATUS_PENDING, "pending" },
	{ SPDK_BDEV_IO_STATUS_SUCCESS, "success" },
};

static const char *
bdev_io_status_get_string(enum spdk_bdev_io_status status)
{
	uint32_t i;

	for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) {
		if (bdev_io_status_strings[i].status == status) {
			return bdev_io_status_strings[i].str;
		}
	}

	return "reserved";
}

struct spdk_bdev_wait_for_examine_ctx {
	struct spdk_poller *poller;
	spdk_bdev_wait_for_examine_cb cb_fn;
	void *cb_arg;
};

static bool bdev_module_all_actions_completed(void);
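/*
 * Minimal usage sketch for spdk_bdev_get_opts()/spdk_bdev_set_opts() above
 * (illustrative only, not part of the original file). The options structure is
 * size-versioned: callers pass sizeof(opts) and only the fields that fit are
 * copied in either direction.
 *
 *	struct spdk_bdev_opts opts = {};
 *
 *	spdk_bdev_get_opts(&opts, sizeof(opts));
 *	opts.bdev_io_pool_size = 4 * 1024 - 1;
 *	if (spdk_bdev_set_opts(&opts) != 0) {
 *		// rejected, e.g. pool size below bdev_io_cache_size * (threads + 1)
 *	}
 */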
static int
bdev_wait_for_examine_cb(void *arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx = arg;

	if (!bdev_module_all_actions_completed()) {
		return SPDK_POLLER_IDLE;
	}

	spdk_poller_unregister(&ctx->poller);
	ctx->cb_fn(ctx->cb_arg);
	free(ctx);

	return SPDK_POLLER_BUSY;
}

int
spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0);

	return 0;
}

struct spdk_bdev_examine_item {
	char *name;
	TAILQ_ENTRY(spdk_bdev_examine_item) link;
};

TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);

struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
			g_bdev_examine_allowlist);

static inline bool
bdev_examine_allowlist_check(const char *name)
{
	struct spdk_bdev_examine_item *item;

	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		if (strcmp(name, item->name) == 0) {
			return true;
		}
	}
	return false;
}

static inline void
bdev_examine_allowlist_free(void)
{
	struct spdk_bdev_examine_item *item;

	while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
		item = TAILQ_FIRST(&g_bdev_examine_allowlist);
		TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
		free(item->name);
		free(item);
	}
}

static inline bool
bdev_in_examine_allowlist(struct spdk_bdev *bdev)
{
	struct spdk_bdev_alias *tmp;

	if (bdev_examine_allowlist_check(bdev->name)) {
		return true;
	}
	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
		if (bdev_examine_allowlist_check(tmp->alias.name)) {
			return true;
		}
	}
	return false;
}

static inline bool
bdev_ok_to_examine(struct spdk_bdev *bdev)
{
	/* Some bdevs may not support the READ command.
	 * Do not try to examine them.
	 */
	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_READ)) {
		return false;
	}

	if (g_bdev_opts.bdev_auto_examine) {
		return true;
	} else {
		return bdev_in_examine_allowlist(bdev);
	}
}

static void
bdev_examine(struct spdk_bdev *bdev)
{
	struct spdk_bdev_module *module;
	struct spdk_bdev_module_claim *claim, *tmpclaim;
	uint32_t action;

	if (!bdev_ok_to_examine(bdev)) {
		return;
	}

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (module->examine_config) {
			spdk_spin_lock(&module->internal.spinlock);
			action = module->internal.action_in_progress;
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			module->examine_config(bdev);
			if (action != module->internal.action_in_progress) {
				SPDK_ERRLOG("examine_config for module %s did not call "
					    "spdk_bdev_module_examine_done()\n", module->name);
			}
		}
	}

	spdk_spin_lock(&bdev->internal.spinlock);

	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		/* Examine by all bdev modules */
		TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (module->examine_disk) {
				spdk_spin_lock(&module->internal.spinlock);
				module->internal.action_in_progress++;
				spdk_spin_unlock(&module->internal.spinlock);
				spdk_spin_unlock(&bdev->internal.spinlock);
				module->examine_disk(bdev);
				spdk_spin_lock(&bdev->internal.spinlock);
			}
		}
		break;
	case SPDK_BDEV_CLAIM_EXCL_WRITE:
		/* Examine by the one bdev module with a v1 claim */
		module = bdev->internal.claim.v1.module;
		if (module->examine_disk) {
			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			return;
		}
		break;
	default:
		/* Examine by all bdev modules with a v2 claim */
		assert(claim_type_is_v2(bdev->internal.claim_type));
		/*
		 * Removal of tailq nodes while iterating can cause the iteration to jump out of the
		 * list, perhaps accessing freed memory. Without protection, this could happen
		 * while the lock is dropped during the examine callback.
		 */
		bdev->internal.examine_in_progress++;

		TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
			module = claim->module;

			if (module == NULL) {
				/* This is a vestigial claim, held by examine_count */
				continue;
			}

			if (module->examine_disk == NULL) {
				continue;
			}

			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			/* Call examine_disk without holding internal.spinlock. */
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			spdk_spin_lock(&bdev->internal.spinlock);
		}

		assert(bdev->internal.examine_in_progress > 0);
		bdev->internal.examine_in_progress--;
		if (bdev->internal.examine_in_progress == 0) {
			/* Remove any claims that were released during examine_disk */
			TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) {
				if (claim->desc != NULL) {
					continue;
				}

				TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link);
				free(claim);
			}
			if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
				claim_reset(bdev);
			}
		}
	}

	spdk_spin_unlock(&bdev->internal.spinlock);
}

int
spdk_bdev_examine(const char *name)
{
	struct spdk_bdev *bdev;
	struct spdk_bdev_examine_item *item;
	struct spdk_thread *thread = spdk_get_thread();

	if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) {
		SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread,
			    thread ? spdk_thread_get_name(thread) : "null");
		return -EINVAL;
	}

	if (g_bdev_opts.bdev_auto_examine) {
		SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n");
		return -EINVAL;
	}

	if (bdev_examine_allowlist_check(name)) {
		SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
		return -EEXIST;
	}

	item = calloc(1, sizeof(*item));
	if (!item) {
		return -ENOMEM;
	}
	item->name = strdup(name);
	if (!item->name) {
		free(item);
		return -ENOMEM;
	}
	TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);

	bdev = spdk_bdev_get_by_name(name);
	if (bdev) {
		bdev_examine(bdev);
	}
	return 0;
}

static inline void
bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_examine_item *item;

	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "method", "bdev_examine");
		spdk_json_write_named_object_begin(w, "params");
		spdk_json_write_named_string(w, "name", item->name);
		spdk_json_write_object_end(w);
		spdk_json_write_object_end(w);
	}
}

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, internal.link);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, internal.link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}
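/*
 * Minimal iteration sketch (illustrative only, not part of the original file):
 * the spdk_bdev_first()/spdk_bdev_next() pair above walks the global bdev list,
 * e.g.
 *
 *	struct spdk_bdev *bdev;
 *
 *	for (bdev = spdk_bdev_first(); bdev != NULL; bdev = spdk_bdev_next(bdev)) {
 *		printf("%s\n", spdk_bdev_get_name(bdev));
 *	}
 *
 * The _leaf variants follow the same pattern but skip bdevs that are claimed by
 * another module.
 */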
struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static inline bool
bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.f.has_memory_domain;
}

static inline bool
bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.f.has_accel_sequence;
}

static inline void
bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource,
			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	/* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io.
	 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth
	 * channels we will instead wait for half to complete.
	 */
	shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
					   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);

	assert(state != BDEV_IO_RETRY_STATE_INVALID);
	bdev_io->internal.retry_state = state;
	TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
}

static inline void
bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource,
			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	/* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while
	 * the queue isn't empty, so we don't need to update the nomem_threshold here */
	assert(!TAILQ_EMPTY(&shared_resource->nomem_io));

	assert(state != BDEV_IO_RETRY_STATE_INVALID);
	bdev_io->internal.retry_state = state;
	TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
}

void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	struct iovec *iovs;

	if (bdev_io->u.bdev.iovs == NULL) {
		bdev_io->u.bdev.iovs = &bdev_io->iov;
		bdev_io->u.bdev.iovcnt = 1;
	}

	iovs = bdev_io->u.bdev.iovs;

	assert(iovs != NULL);
	assert(bdev_io->u.bdev.iovcnt >= 1);

	iovs[0].iov_base = buf;
	iovs[0].iov_len = len;
}

void
spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
	bdev_io->u.bdev.md_buf = md_buf;
}

static bool
_is_buf_allocated(const struct iovec *iovs)
{
	if (iovs == NULL) {
		return false;
	}

	return iovs[0].iov_base != NULL;
}

static bool
_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
{
	int i;
	uintptr_t iov_base;

	if (spdk_likely(alignment == 1)) {
		return true;
	}

	for (i = 0; i < iovcnt; i++) {
		iov_base = (uintptr_t)iovs[i].iov_base;
		if ((iov_base & (alignment - 1)) != 0) {
			return false;
		}
	}

	return true;
}

static inline bool
bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
{
	if (!bdev_io_use_accel_sequence(bdev_io)) {
		return false;
	}

	/* For now, we don't allow splitting IOs with an accel sequence and will treat them as if
	 * bdev module didn't support accel sequences */
	return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.f.split;
}
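/*
 * Worked example for bdev_queue_nomem_io_head() above (illustrative numbers,
 * not from the original file): with NOMEM_THRESHOLD_COUNT == 8 and 64 I/Os
 * outstanding, the threshold becomes max(64 / 2, 64 - 8) = 56, so retries start
 * once 8 I/Os complete. With only 6 outstanding it becomes max(3, -2) = 3,
 * i.e. a low-queue-depth channel instead waits for half of its I/Os to finish.
 */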
static inline void
bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	bdev_ch->io_outstanding++;
	shared_resource->io_outstanding++;
}

static inline void
bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	assert(bdev_ch->io_outstanding > 0);
	assert(shared_resource->io_outstanding > 0);
	bdev_ch->io_outstanding--;
	shared_resource->io_outstanding--;
}

static void
bdev_io_submit_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;

	assert(bdev_io_use_accel_sequence(bdev_io));

	bdev_io->u.bdev.accel_sequence = NULL;
	bdev_io->internal.f.has_accel_sequence = false;

	if (spdk_unlikely(status != 0)) {
		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io_complete_unsubmitted(bdev_io);
		return;
	}

	bdev_io_submit(bdev_io);
}

static void
bdev_io_exec_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io->internal.data_transfer_cpl(bdev_io, status);
}

static void
bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status))
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
	assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
	assert(bdev_io_use_accel_sequence(bdev_io));

	/* Since the operations are appended during submission, they're in the opposite order than
	 * how we want to execute them for reads (i.e. we need to execute the most recently added
	 * operation first), so reverse the sequence before executing it.
	 */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence);
	}

	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	bdev_io_increment_outstanding(ch, ch->shared_resource);
	bdev_io->internal.data_transfer_cpl = cb_fn;

	spdk_accel_sequence_finish(bdev_io->internal.accel_sequence,
				   bdev_io_exec_sequence_cb, bdev_io);
}

static void
bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status)
{
	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
	void *buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		buf = bdev_io->internal.buf.ptr;
		bdev_io->internal.buf.ptr = NULL;
		bdev_io->internal.f.has_buf = false;
		bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
		bdev_io->internal.get_aux_buf_cb = NULL;
	} else {
		assert(bdev_io->internal.get_buf_cb != NULL);
		bdev_io->internal.get_buf_cb(ch, bdev_io, status);
		bdev_io->internal.get_buf_cb = NULL;
	}
}

static void
_bdev_io_pull_buffer_cpl(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;

	if (rc) {
		SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	bdev_io_get_buf_complete(bdev_io, !rc);
}

static void
bdev_io_pull_md_buf_done(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	assert(bdev_io->internal.data_transfer_cpl);
	bdev_io->internal.data_transfer_cpl(bdev_io, status);
}

static void
bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  &bdev_io->internal.orig_md_iov, 1,
							  &bdev_io->internal.bounce_md_iov, 1,
							  bdev_io_pull_md_buf_done, bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain), rc);
			}
		} else {
			memcpy(bdev_io->internal.bounce_md_iov.iov_base,
			       bdev_io->internal.orig_md_iov.iov_base,
			       bdev_io->internal.orig_md_iov.iov_len);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD);
	} else {
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
	}
}
static void
_bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	/* save original md_buf */
	bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf;
	bdev_io->internal.orig_md_iov.iov_len = len;
	bdev_io->internal.bounce_md_iov.iov_base = md_buf;
	bdev_io->internal.bounce_md_iov.iov_len = len;
	/* set bounce md_buf */
	bdev_io->u.bdev.md_buf = md_buf;

	bdev_io_pull_md_buf(bdev_io);
}

static void
_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len;
	void *buf;

	if (spdk_bdev_is_md_separate(bdev)) {
		assert(!bdev_io_use_accel_sequence(bdev_io));

		buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len;
		md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;

		assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0);

		if (bdev_io->u.bdev.md_buf != NULL) {
			_bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len);
			return;
		} else {
			spdk_bdev_io_set_md_buf(bdev_io, buf, md_len);
		}
	}

	bdev_io_get_buf_complete(bdev_io, true);
}

static inline void
bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc)
{
	if (rc) {
		SPDK_ERRLOG("Failed to get data buffer\n");
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
		return;
	}

	_bdev_io_set_md_buf(bdev_io);
}

static void
bdev_io_pull_data_done_and_track(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io_pull_data_done(bdev_io, status);
}

static void
bdev_io_pull_data(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	/* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a
	 * sequence, append a copy operation that makes accel change the src/dst buffers of the
	 * previous operation */
	if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io) ||
	    (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) {
		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
			assert(bdev_io_use_accel_sequence(bdev_io));
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL,
						    bdev_io->internal.orig_iovs,
						    bdev_io->internal.orig_iovcnt,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
						    NULL, NULL);
		} else {
			/* We need to reverse the src/dst for reads */
			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
			assert(bdev_io_use_accel_sequence(bdev_io));
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->internal.orig_iovs,
						    bdev_io->internal.orig_iovcnt,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL, NULL, NULL);
		}

		if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
			SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n",
				    bdev_io->internal.accel_sequence);
		}
	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		/* if this is write path, copy data from original buffer to bounce buffer */
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  bdev_io->internal.orig_iovs,
							  (uint32_t) bdev_io->internal.orig_iovcnt,
							  bdev_io->u.bdev.iovs, 1,
							  bdev_io_pull_data_done_and_track,
							  bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to pull data from memory domain %s\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain));
			}
		} else {
			assert(bdev_io->u.bdev.iovcnt == 1);
			spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base,
					      bdev_io->u.bdev.iovs[0].iov_len,
					      bdev_io->internal.orig_iovs,
					      bdev_io->internal.orig_iovcnt);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
	} else {
		bdev_io_pull_data_done(bdev_io, rc);
	}
}

static void
_bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len,
			      bdev_copy_bounce_buffer_cpl cpl_cb)
{
	struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource;

	bdev_io->internal.data_transfer_cpl = cpl_cb;
	/* save original iovec */
	bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs;
	bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt;
	/* set bounce iov */
	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov;
	bdev_io->u.bdev.iovcnt = 1;
	/* set bounce buffer for this operation */
	bdev_io->u.bdev.iovs[0].iov_base = buf;
	bdev_io->u.bdev.iovs[0].iov_len = len;

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
	} else {
		bdev_io_pull_data(bdev_io);
	}
}

static void
_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	bool buf_allocated;
	uint64_t alignment;
	void *aligned_buf;

	bdev_io->internal.buf.ptr = buf;
	bdev_io->internal.f.has_buf = true;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		bdev_io_get_buf_complete(bdev_io, true);
		return;
	}

	alignment = spdk_bdev_get_buf_align(bdev);
	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
	aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));

	if (buf_allocated) {
		_bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl);
		/* Continue in completion callback */
		return;
	} else {
		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
	}

	_bdev_io_set_md_buf(bdev_io);
}

static inline uint64_t
bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len, alignment;

	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;

	/* 1-byte alignment needs 0 bytes of extra space, 64-byte alignment needs 63 bytes of extra space, etc. */
	alignment = spdk_bdev_get_buf_align(bdev) - 1;

	return len + alignment + md_len;
}

static void
_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
{
	struct spdk_bdev_mgmt_channel *ch;

	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
	spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len));
}

static void
bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	assert(bdev_io->internal.f.has_buf);
	_bdev_io_put_buf(bdev_io, bdev_io->internal.buf.ptr, bdev_io->internal.buf.len);
	bdev_io->internal.buf.ptr = NULL;
	bdev_io->internal.f.has_buf = false;
}

void
spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(buf != NULL);
	_bdev_io_put_buf(bdev_io, buf, len);
}

static inline void
bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch,
		    struct spdk_bdev_io *bdev_io)
{
	/* After a request is submitted to a bdev module, the ownership of an accel sequence
	 * associated with that bdev_io is transferred to the bdev module. So, clear the internal
	 * sequence pointer to make sure we won't touch it anymore. */
	if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE ||
	     bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) {
		assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
		bdev_io->internal.f.has_accel_sequence = false;
	}

	bdev->fn_table->submit_request(ioch, bdev_io);
}

static inline void
bdev_ch_resubmit_io(struct spdk_bdev_shared_resource *shared_resource, struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;

	bdev_io_increment_outstanding(bdev_io->internal.ch, shared_resource);
	bdev_io->internal.error.nvme.cdw0 = 0;
	bdev_io->num_retries++;
	bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
}

static void
bdev_shared_ch_retry_io(struct spdk_bdev_shared_resource *shared_resource)
{
	struct spdk_bdev_io *bdev_io;

	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 * Some drivers (such as nvme) cannot immediately take a new I/O in
		 * the context of a completion, because the resources for the I/O are
		 * not released until control returns to the bdev poller. Also, we
		 * may require several small I/O to complete before a larger I/O
		 * (that requires splitting) can be submitted.
		 */
		return;
	}

	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);

		switch (bdev_io->internal.retry_state) {
		case BDEV_IO_RETRY_STATE_SUBMIT:
			bdev_ch_resubmit_io(shared_resource, bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL:
			bdev_io_pull_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL_MD:
			bdev_io_pull_md_buf(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH:
			bdev_io_push_bounce_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH_MD:
			bdev_io_push_bounce_md_buf(bdev_io);
			break;
		default:
			assert(0 && "invalid retry state");
			break;
		}

		if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) {
			/* This IO completed again with NOMEM status, so break the loop and
			 * don't try anymore. Note that a bdev_io that fails with NOMEM
			 * always gets requeued at the front of the list, to maintain
			 * ordering.
			 */
			break;
		}
	}
}

static void
bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	bdev_shared_ch_retry_io(bdev_ch->shared_resource);
}

static int
bdev_no_mem_poller(void *ctx)
{
	struct spdk_bdev_shared_resource *shared_resource = ctx;

	spdk_poller_unregister(&shared_resource->nomem_poller);

	if (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_shared_ch_retry_io(shared_resource);
	}
	/* the retry cb may re-register the poller so double check */
	if (!TAILQ_EMPTY(&shared_resource->nomem_io) &&
	    shared_resource->io_outstanding == 0 && shared_resource->nomem_poller == NULL) {
		/* No IOs were submitted, try again */
		shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
						SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
	}

	return SPDK_POLLER_BUSY;
}

static inline bool
_bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
		bdev_queue_nomem_io_head(shared_resource, bdev_io, state);

		if (shared_resource->io_outstanding == 0 && !shared_resource->nomem_poller) {
			/* Special case: there are nomem I/Os queued but no outstanding I/Os whose
			 * completions could trigger a retry. Completion of any submitted I/O normally
			 * triggers the retry; this poller handles the case when no new I/Os are
			 * submitted, e.g. qd==1 */
			shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
							SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
		}
		/* If bdev module completed an I/O that has an accel sequence with NOMEM status, the
		 * ownership of that sequence is transferred back to the bdev layer, so we need to
		 * restore internal.accel_sequence to make sure that the sequence is handled
		 * correctly in case the I/O is later aborted.
		 */
		if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
		     bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) {
			assert(!bdev_io_use_accel_sequence(bdev_io));
			bdev_io->internal.f.has_accel_sequence = true;
			bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence;
		}

		return true;
	}

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_ch_retry_io(bdev_ch);
	}

	return false;
}

static void
_bdev_io_complete_push_bounce_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	if (rc) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	/* We want to free the bounce buffer here since we know we're done with it (as opposed
	 * to waiting for the conditional free of internal.buf.ptr in spdk_bdev_free_io()).
	 */
	bdev_io_put_buf(bdev_io);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	/* Continue with IO completion flow */
	bdev_io_complete(bdev_io);
}

static void
bdev_io_push_bounce_md_buf_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io->internal.data_transfer_cpl(bdev_io, rc);
}

static inline void
bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
	/* do the same for metadata buffer */
	if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) {
		assert(spdk_bdev_is_md_separate(bdev_io->bdev));

		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
			if (bdev_io_use_memory_domain(bdev_io)) {
				TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
				bdev_io_increment_outstanding(ch, ch->shared_resource);
				/* If memory domain is used then we need to call async push function */
				rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
								  bdev_io->internal.memory_domain_ctx,
								  &bdev_io->internal.orig_md_iov,
								  (uint32_t)bdev_io->internal.orig_iovcnt,
								  &bdev_io->internal.bounce_md_iov, 1,
								  bdev_io_push_bounce_md_buf_done,
								  bdev_io);
				if (rc == 0) {
					/* Continue IO completion in async callback */
					return;
				}
				TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
				bdev_io_decrement_outstanding(ch, ch->shared_resource);
				if (rc != -ENOMEM) {
					SPDK_ERRLOG("Failed to push md to memory domain %s\n",
						    spdk_memory_domain_get_dma_device_id(
							    bdev_io->internal.memory_domain));
				}
			} else {
				memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf,
				       bdev_io->internal.orig_md_iov.iov_len);
			}
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD);
	} else {
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
	}
}

static inline void
bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc)
{
	assert(bdev_io->internal.data_transfer_cpl);
	if (rc) {
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
		return;
	}

	/* set original buffer for this io */
	bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt;
	bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs;
	/* disable bouncing buffer for this io */
	bdev_io->internal.orig_iovcnt = 0;
	bdev_io->internal.orig_iovs = NULL;

	bdev_io_push_bounce_md_buf(bdev_io);
}

static void
bdev_io_push_bounce_data_done_and_track(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io_push_bounce_data_done(bdev_io, status);
}

static inline void
bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
	assert(!bdev_io_use_accel_sequence(bdev_io));

	/* if this is read path, copy data from bounce buffer to original buffer */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			/* If memory domain is used then we need to call async push function */
			rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  bdev_io->internal.orig_iovs,
							  (uint32_t)bdev_io->internal.orig_iovcnt,
							  &bdev_io->internal.bounce_iov, 1,
							  bdev_io_push_bounce_data_done_and_track,
							  bdev_io);
			if (rc == 0) {
				/* Continue IO completion in async callback */
				return;
			}

			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to push data to memory domain %s\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain));
			}
		} else {
			spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs,
					      bdev_io->internal.orig_iovcnt,
					      bdev_io->internal.bounce_iov.iov_base,
					      bdev_io->internal.bounce_iov.iov_len);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH);
	} else {
		bdev_io_push_bounce_data_done(bdev_io, rc);
	}
}

static inline void
_bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb)
{
	bdev_io->internal.data_transfer_cpl = cpl_cb;
	bdev_io_push_bounce_data(bdev_io);
}

static void
bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf)
{
	struct spdk_bdev_io *bdev_io;

	bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf);
	_bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf.len);
}
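/*
 * Usage sketch for spdk_bdev_io_get_buf() below (illustrative only, not part of
 * the original file). A bdev module that receives a READ without a data buffer
 * typically requests one and continues in the callback:
 *
 *	static void
 *	xxx_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
 *	{
 *		if (!success) {
 *			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
 *			return;
 *		}
 *		// bdev_io->u.bdev.iovs now points at an aligned buffer; issue the read
 *	}
 *
 *	spdk_bdev_io_get_buf(bdev_io, xxx_get_buf_cb,
 *			     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
 *
 * xxx_get_buf_cb is a hypothetical callback name used only for illustration.
 */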
max_len; 1789 void *buf; 1790 1791 assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1792 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1793 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1794 1795 if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) { 1796 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1797 bdev_io_get_buf_complete(bdev_io, false); 1798 return; 1799 } 1800 1801 bdev_io->internal.buf.len = len; 1802 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1803 bdev_io_get_iobuf_cb); 1804 if (buf != NULL) { 1805 _bdev_io_set_buf(bdev_io, buf, len); 1806 } 1807 } 1808 1809 void 1810 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1811 { 1812 struct spdk_bdev *bdev = bdev_io->bdev; 1813 uint64_t alignment; 1814 1815 assert(cb != NULL); 1816 bdev_io->internal.get_buf_cb = cb; 1817 1818 alignment = spdk_bdev_get_buf_align(bdev); 1819 1820 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1821 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1822 /* Buffer already present and aligned */ 1823 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1824 return; 1825 } 1826 1827 bdev_io_get_buf(bdev_io, len); 1828 } 1829 1830 static void 1831 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1832 bool success) 1833 { 1834 if (!success) { 1835 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1836 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1837 bdev_io_complete_unsubmitted(bdev_io); 1838 return; 1839 } 1840 1841 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 1842 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1843 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 1844 return; 1845 } 1846 /* For reads we'll execute the sequence after the data is read, so, for now, only 1847 * clear out accel_sequence pointer and submit the IO */ 1848 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1849 bdev_io->u.bdev.accel_sequence = NULL; 1850 } 1851 1852 bdev_io_submit(bdev_io); 1853 } 1854 1855 static void 1856 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1857 uint64_t len) 1858 { 1859 assert(cb != NULL); 1860 bdev_io->internal.get_buf_cb = cb; 1861 1862 bdev_io_get_buf(bdev_io, len); 1863 } 1864 1865 void 1866 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1867 { 1868 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1869 1870 assert(cb != NULL); 1871 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1872 bdev_io->internal.get_aux_buf_cb = cb; 1873 bdev_io_get_buf(bdev_io, len); 1874 } 1875 1876 static int 1877 bdev_module_get_max_ctx_size(void) 1878 { 1879 struct spdk_bdev_module *bdev_module; 1880 int max_bdev_module_size = 0; 1881 1882 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1883 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1884 max_bdev_module_size = bdev_module->get_ctx_size(); 1885 } 1886 } 1887 1888 return max_bdev_module_size; 1889 } 1890 1891 static void 1892 bdev_enable_histogram_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1893 { 1894 if (!bdev->internal.histogram_enabled) { 1895 return; 1896 } 1897 1898 spdk_json_write_object_begin(w); 1899 spdk_json_write_named_string(w, "method", "bdev_enable_histogram"); 1900 1901 spdk_json_write_named_object_begin(w, 
"params"); 1902 spdk_json_write_named_string(w, "name", bdev->name); 1903 1904 spdk_json_write_named_bool(w, "enable", bdev->internal.histogram_enabled); 1905 1906 if (bdev->internal.histogram_io_type) { 1907 spdk_json_write_named_string(w, "opc", 1908 spdk_bdev_get_io_type_name(bdev->internal.histogram_io_type)); 1909 } 1910 1911 spdk_json_write_object_end(w); 1912 1913 spdk_json_write_object_end(w); 1914 } 1915 1916 static void 1917 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1918 { 1919 int i; 1920 struct spdk_bdev_qos *qos = bdev->internal.qos; 1921 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1922 1923 if (!qos) { 1924 return; 1925 } 1926 1927 spdk_bdev_get_qos_rate_limits(bdev, limits); 1928 1929 spdk_json_write_object_begin(w); 1930 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1931 1932 spdk_json_write_named_object_begin(w, "params"); 1933 spdk_json_write_named_string(w, "name", bdev->name); 1934 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1935 if (limits[i] > 0) { 1936 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1937 } 1938 } 1939 spdk_json_write_object_end(w); 1940 1941 spdk_json_write_object_end(w); 1942 } 1943 1944 void 1945 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1946 { 1947 struct spdk_bdev_module *bdev_module; 1948 struct spdk_bdev *bdev; 1949 1950 assert(w != NULL); 1951 1952 spdk_json_write_array_begin(w); 1953 1954 spdk_json_write_object_begin(w); 1955 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1956 spdk_json_write_named_object_begin(w, "params"); 1957 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1958 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1959 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1960 spdk_json_write_named_uint32(w, "iobuf_small_cache_size", g_bdev_opts.iobuf_small_cache_size); 1961 spdk_json_write_named_uint32(w, "iobuf_large_cache_size", g_bdev_opts.iobuf_large_cache_size); 1962 spdk_json_write_object_end(w); 1963 spdk_json_write_object_end(w); 1964 1965 bdev_examine_allowlist_config_json(w); 1966 1967 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1968 if (bdev_module->config_json) { 1969 bdev_module->config_json(w); 1970 } 1971 } 1972 1973 spdk_spin_lock(&g_bdev_mgr.spinlock); 1974 1975 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1976 if (bdev->fn_table->write_config_json) { 1977 bdev->fn_table->write_config_json(bdev, w); 1978 } 1979 1980 bdev_qos_config_json(bdev, w); 1981 bdev_enable_histogram_config_json(bdev, w); 1982 } 1983 1984 spdk_spin_unlock(&g_bdev_mgr.spinlock); 1985 1986 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1987 spdk_json_write_object_begin(w); 1988 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1989 spdk_json_write_object_end(w); 1990 1991 spdk_json_write_array_end(w); 1992 } 1993 1994 static void 1995 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1996 { 1997 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1998 struct spdk_bdev_io *bdev_io; 1999 2000 spdk_iobuf_channel_fini(&ch->iobuf); 2001 2002 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 2003 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2004 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2005 ch->per_thread_cache_count--; 2006 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2007 } 2008 2009 
assert(ch->per_thread_cache_count == 0); 2010 } 2011 2012 static int 2013 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 2014 { 2015 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 2016 struct spdk_bdev_io *bdev_io; 2017 uint32_t i; 2018 int rc; 2019 2020 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", 2021 g_bdev_opts.iobuf_small_cache_size, 2022 g_bdev_opts.iobuf_large_cache_size); 2023 if (rc != 0) { 2024 SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 2025 return -1; 2026 } 2027 2028 STAILQ_INIT(&ch->per_thread_cache); 2029 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 2030 2031 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 2032 ch->per_thread_cache_count = 0; 2033 for (i = 0; i < ch->bdev_io_cache_size; i++) { 2034 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2035 if (bdev_io == NULL) { 2036 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 2037 assert(false); 2038 bdev_mgmt_channel_destroy(io_device, ctx_buf); 2039 return -1; 2040 } 2041 ch->per_thread_cache_count++; 2042 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2043 } 2044 2045 TAILQ_INIT(&ch->shared_resources); 2046 TAILQ_INIT(&ch->io_wait_queue); 2047 2048 return 0; 2049 } 2050 2051 static void 2052 bdev_init_complete(int rc) 2053 { 2054 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 2055 void *cb_arg = g_init_cb_arg; 2056 struct spdk_bdev_module *m; 2057 2058 g_bdev_mgr.init_complete = true; 2059 g_init_cb_fn = NULL; 2060 g_init_cb_arg = NULL; 2061 2062 /* 2063 * For modules that need to know when subsystem init is complete, 2064 * inform them now. 2065 */ 2066 if (rc == 0) { 2067 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2068 if (m->init_complete) { 2069 m->init_complete(); 2070 } 2071 } 2072 } 2073 2074 cb_fn(cb_arg, rc); 2075 } 2076 2077 static bool 2078 bdev_module_all_actions_completed(void) 2079 { 2080 struct spdk_bdev_module *m; 2081 2082 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2083 if (m->internal.action_in_progress > 0) { 2084 return false; 2085 } 2086 } 2087 return true; 2088 } 2089 2090 static void 2091 bdev_module_action_complete(void) 2092 { 2093 /* 2094 * Don't finish bdev subsystem initialization if 2095 * module pre-initialization is still in progress, or 2096 * the subsystem has already been initialized. 2097 */ 2098 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 2099 return; 2100 } 2101 2102 /* 2103 * Check all bdev modules for inits/examinations in progress. If any 2104 * exist, return immediately since we cannot finish bdev subsystem 2105 * initialization until all are completed. 2106 */ 2107 if (!bdev_module_all_actions_completed()) { 2108 return; 2109 } 2110 2111 /* 2112 * Modules already finished initialization - now that all 2113 * the bdev modules have finished their asynchronous I/O 2114 * processing, the entire bdev layer can be marked as complete.
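* bdev_init_complete(0) below invokes each module's optional init_complete() callback and then the callback registered via spdk_bdev_initialize().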
2115 */ 2116 bdev_init_complete(0); 2117 } 2118 2119 static void 2120 bdev_module_action_done(struct spdk_bdev_module *module) 2121 { 2122 spdk_spin_lock(&module->internal.spinlock); 2123 assert(module->internal.action_in_progress > 0); 2124 module->internal.action_in_progress--; 2125 spdk_spin_unlock(&module->internal.spinlock); 2126 bdev_module_action_complete(); 2127 } 2128 2129 void 2130 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 2131 { 2132 assert(module->async_init); 2133 bdev_module_action_done(module); 2134 } 2135 2136 void 2137 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 2138 { 2139 bdev_module_action_done(module); 2140 } 2141 2142 /** The last initialized bdev module */ 2143 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 2144 2145 static void 2146 bdev_init_failed(void *cb_arg) 2147 { 2148 struct spdk_bdev_module *module = cb_arg; 2149 2150 spdk_spin_lock(&module->internal.spinlock); 2151 assert(module->internal.action_in_progress > 0); 2152 module->internal.action_in_progress--; 2153 spdk_spin_unlock(&module->internal.spinlock); 2154 bdev_init_complete(-1); 2155 } 2156 2157 static int 2158 bdev_modules_init(void) 2159 { 2160 struct spdk_bdev_module *module; 2161 int rc = 0; 2162 2163 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2164 g_resume_bdev_module = module; 2165 if (module->async_init) { 2166 spdk_spin_lock(&module->internal.spinlock); 2167 module->internal.action_in_progress = 1; 2168 spdk_spin_unlock(&module->internal.spinlock); 2169 } 2170 rc = module->module_init(); 2171 if (rc != 0) { 2172 /* Bump action_in_progress to prevent other modules from completion of modules_init 2173 * Send message to defer application shutdown until resources are cleaned up */ 2174 spdk_spin_lock(&module->internal.spinlock); 2175 module->internal.action_in_progress = 1; 2176 spdk_spin_unlock(&module->internal.spinlock); 2177 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 2178 return rc; 2179 } 2180 } 2181 2182 g_resume_bdev_module = NULL; 2183 return 0; 2184 } 2185 2186 void 2187 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 2188 { 2189 int rc = 0; 2190 char mempool_name[32]; 2191 2192 assert(cb_fn != NULL); 2193 2194 g_init_cb_fn = cb_fn; 2195 g_init_cb_arg = cb_arg; 2196 2197 spdk_notify_type_register("bdev_register"); 2198 spdk_notify_type_register("bdev_unregister"); 2199 2200 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2201 2202 rc = spdk_iobuf_register_module("bdev"); 2203 if (rc != 0) { 2204 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2205 bdev_init_complete(-1); 2206 return; 2207 } 2208 2209 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2210 g_bdev_opts.bdev_io_pool_size, 2211 sizeof(struct spdk_bdev_io) + 2212 bdev_module_get_max_ctx_size(), 2213 0, 2214 SPDK_ENV_SOCKET_ID_ANY); 2215 2216 if (g_bdev_mgr.bdev_io_pool == NULL) { 2217 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2218 bdev_init_complete(-1); 2219 return; 2220 } 2221 2222 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2223 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2224 if (!g_bdev_mgr.zero_buffer) { 2225 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2226 bdev_init_complete(-1); 2227 return; 2228 } 2229 2230 #ifdef SPDK_CONFIG_VTUNE 2231 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2232 #endif 2233 2234 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 2235 
bdev_mgmt_channel_destroy, 2236 sizeof(struct spdk_bdev_mgmt_channel), 2237 "bdev_mgr"); 2238 2239 rc = bdev_modules_init(); 2240 g_bdev_mgr.module_init_complete = true; 2241 if (rc != 0) { 2242 SPDK_ERRLOG("bdev modules init failed\n"); 2243 return; 2244 } 2245 2246 bdev_module_action_complete(); 2247 } 2248 2249 static void 2250 bdev_mgr_unregister_cb(void *io_device) 2251 { 2252 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2253 2254 if (g_bdev_mgr.bdev_io_pool) { 2255 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2256 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2257 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2258 g_bdev_opts.bdev_io_pool_size); 2259 } 2260 2261 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2262 } 2263 2264 spdk_free(g_bdev_mgr.zero_buffer); 2265 2266 bdev_examine_allowlist_free(); 2267 2268 cb_fn(g_fini_cb_arg); 2269 g_fini_cb_fn = NULL; 2270 g_fini_cb_arg = NULL; 2271 g_bdev_mgr.init_complete = false; 2272 g_bdev_mgr.module_init_complete = false; 2273 } 2274 2275 static void 2276 bdev_module_fini_iter(void *arg) 2277 { 2278 struct spdk_bdev_module *bdev_module; 2279 2280 /* FIXME: Handling initialization failures is broken now, 2281 * so we won't even try cleaning up after successfully 2282 * initialized modules. if module_init_complete is false, 2283 * just call spdk_bdev_mgr_unregister_cb 2284 */ 2285 if (!g_bdev_mgr.module_init_complete) { 2286 bdev_mgr_unregister_cb(NULL); 2287 return; 2288 } 2289 2290 /* Start iterating from the last touched module */ 2291 if (!g_resume_bdev_module) { 2292 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2293 } else { 2294 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2295 internal.tailq); 2296 } 2297 2298 while (bdev_module) { 2299 if (bdev_module->async_fini) { 2300 /* Save our place so we can resume later. We must 2301 * save the variable here, before calling module_fini() 2302 * below, because in some cases the module may immediately 2303 * call spdk_bdev_module_fini_done() and re-enter 2304 * this function to continue iterating. */ 2305 g_resume_bdev_module = bdev_module; 2306 } 2307 2308 if (bdev_module->module_fini) { 2309 bdev_module->module_fini(); 2310 } 2311 2312 if (bdev_module->async_fini) { 2313 return; 2314 } 2315 2316 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2317 internal.tailq); 2318 } 2319 2320 g_resume_bdev_module = NULL; 2321 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2322 } 2323 2324 void 2325 spdk_bdev_module_fini_done(void) 2326 { 2327 if (spdk_get_thread() != g_fini_thread) { 2328 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2329 } else { 2330 bdev_module_fini_iter(NULL); 2331 } 2332 } 2333 2334 static void 2335 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2336 { 2337 struct spdk_bdev *bdev = cb_arg; 2338 2339 if (bdeverrno && bdev) { 2340 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2341 bdev->name); 2342 2343 /* 2344 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2345 * bdev; try to continue by manually removing this bdev from the list and continue 2346 * with the next bdev in the list. 
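* Note that the bdev structure itself is leaked in this case; only its list entry is dropped so that shutdown can still make progress.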
2347 */ 2348 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2349 } 2350 2351 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2352 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2353 /* 2354 * Bdev module finish needs to be deferred as we might be in the middle of some context 2355 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2356 * after returning. 2357 */ 2358 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2359 return; 2360 } 2361 2362 /* 2363 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 2364 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2365 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2366 * base bdevs. 2367 * 2368 * Also, walk the list in the reverse order. 2369 */ 2370 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2371 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2372 spdk_spin_lock(&bdev->internal.spinlock); 2373 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2374 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2375 spdk_spin_unlock(&bdev->internal.spinlock); 2376 continue; 2377 } 2378 spdk_spin_unlock(&bdev->internal.spinlock); 2379 2380 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2381 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2382 return; 2383 } 2384 2385 /* 2386 * If any bdev fails to unclaim its underlying bdev properly, we may end up with a 2387 * bdev list consisting of claimed bdevs only (if claims are managed correctly, this 2388 * would mean there is a loop in the claims graph, which is clearly impossible). In 2389 * that case, warn and unregister the last bdev on the list. 2390 */ 2391 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2392 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2393 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2394 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2395 return; 2396 } 2397 } 2398 2399 static void 2400 bdev_module_fini_start_iter(void *arg) 2401 { 2402 struct spdk_bdev_module *bdev_module; 2403 2404 if (!g_resume_bdev_module) { 2405 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2406 } else { 2407 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2408 } 2409 2410 while (bdev_module) { 2411 if (bdev_module->async_fini_start) { 2412 /* Save our place so we can resume later. We must 2413 * save the variable here, before calling fini_start() 2414 * below, because in some cases the module may immediately 2415 * call spdk_bdev_module_fini_start_done() and re-enter 2416 * this function to continue iterating.
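* The same resume pattern is used for module_fini() in bdev_module_fini_iter() above.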
*/ 2417 g_resume_bdev_module = bdev_module; 2418 } 2419 2420 if (bdev_module->fini_start) { 2421 bdev_module->fini_start(); 2422 } 2423 2424 if (bdev_module->async_fini_start) { 2425 return; 2426 } 2427 2428 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2429 } 2430 2431 g_resume_bdev_module = NULL; 2432 2433 bdev_finish_unregister_bdevs_iter(NULL, 0); 2434 } 2435 2436 void 2437 spdk_bdev_module_fini_start_done(void) 2438 { 2439 if (spdk_get_thread() != g_fini_thread) { 2440 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2441 } else { 2442 bdev_module_fini_start_iter(NULL); 2443 } 2444 } 2445 2446 static void 2447 bdev_finish_wait_for_examine_done(void *cb_arg) 2448 { 2449 bdev_module_fini_start_iter(NULL); 2450 } 2451 2452 static void bdev_open_async_fini(void); 2453 2454 void 2455 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2456 { 2457 int rc; 2458 2459 assert(cb_fn != NULL); 2460 2461 g_fini_thread = spdk_get_thread(); 2462 2463 g_fini_cb_fn = cb_fn; 2464 g_fini_cb_arg = cb_arg; 2465 2466 bdev_open_async_fini(); 2467 2468 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2469 if (rc != 0) { 2470 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2471 bdev_finish_wait_for_examine_done(NULL); 2472 } 2473 } 2474 2475 struct spdk_bdev_io * 2476 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2477 { 2478 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2479 struct spdk_bdev_io *bdev_io; 2480 2481 if (ch->per_thread_cache_count > 0) { 2482 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2483 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2484 ch->per_thread_cache_count--; 2485 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2486 /* 2487 * Don't try to look for bdev_ios in the global pool if there are 2488 * waiters on bdev_ios - we don't want this caller to jump the line. 2489 */ 2490 bdev_io = NULL; 2491 } else { 2492 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2493 } 2494 2495 return bdev_io; 2496 } 2497 2498 void 2499 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2500 { 2501 struct spdk_bdev_mgmt_channel *ch; 2502 2503 assert(bdev_io != NULL); 2504 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2505 2506 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2507 2508 if (bdev_io->internal.f.has_buf) { 2509 bdev_io_put_buf(bdev_io); 2510 } 2511 2512 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2513 ch->per_thread_cache_count++; 2514 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2515 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2516 struct spdk_bdev_io_wait_entry *entry; 2517 2518 entry = TAILQ_FIRST(&ch->io_wait_queue); 2519 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2520 entry->cb_fn(entry->cb_arg); 2521 } 2522 } else { 2523 /* We should never have a full cache with entries on the io wait queue. 
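* Entries are only added to io_wait_queue after a bdev_io allocation fails, i.e. while the per-thread cache is empty, and the drain loop above services them as soon as an entry is returned to a non-full cache.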
*/ 2524 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2525 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2526 } 2527 } 2528 2529 static bool 2530 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2531 { 2532 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2533 2534 switch (limit) { 2535 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2536 return true; 2537 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2538 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2539 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2540 return false; 2541 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2542 default: 2543 return false; 2544 } 2545 } 2546 2547 static bool 2548 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2549 { 2550 switch (bdev_io->type) { 2551 case SPDK_BDEV_IO_TYPE_NVME_IO: 2552 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2553 case SPDK_BDEV_IO_TYPE_READ: 2554 case SPDK_BDEV_IO_TYPE_WRITE: 2555 return true; 2556 case SPDK_BDEV_IO_TYPE_ZCOPY: 2557 if (bdev_io->u.bdev.zcopy.start) { 2558 return true; 2559 } else { 2560 return false; 2561 } 2562 default: 2563 return false; 2564 } 2565 } 2566 2567 static bool 2568 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2569 { 2570 switch (bdev_io->type) { 2571 case SPDK_BDEV_IO_TYPE_NVME_IO: 2572 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2573 /* Bit 1 (0x2) set for read operation */ 2574 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2575 return true; 2576 } else { 2577 return false; 2578 } 2579 case SPDK_BDEV_IO_TYPE_READ: 2580 return true; 2581 case SPDK_BDEV_IO_TYPE_ZCOPY: 2582 /* Populate to read from disk */ 2583 if (bdev_io->u.bdev.zcopy.populate) { 2584 return true; 2585 } else { 2586 return false; 2587 } 2588 default: 2589 return false; 2590 } 2591 } 2592 2593 static uint64_t 2594 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2595 { 2596 struct spdk_bdev *bdev = bdev_io->bdev; 2597 2598 switch (bdev_io->type) { 2599 case SPDK_BDEV_IO_TYPE_NVME_IO: 2600 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2601 return bdev_io->u.nvme_passthru.nbytes; 2602 case SPDK_BDEV_IO_TYPE_READ: 2603 case SPDK_BDEV_IO_TYPE_WRITE: 2604 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2605 case SPDK_BDEV_IO_TYPE_ZCOPY: 2606 /* Track the data in the start phase only */ 2607 if (bdev_io->u.bdev.zcopy.start) { 2608 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2609 } else { 2610 return 0; 2611 } 2612 default: 2613 return 0; 2614 } 2615 } 2616 2617 static inline bool 2618 bdev_qos_rw_queue_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2619 { 2620 int64_t remaining_this_timeslice; 2621 2622 if (!limit->max_per_timeslice) { 2623 /* The QoS is disabled */ 2624 return false; 2625 } 2626 2627 remaining_this_timeslice = __atomic_sub_fetch(&limit->remaining_this_timeslice, delta, 2628 __ATOMIC_RELAXED); 2629 if (remaining_this_timeslice + (int64_t)delta > 0) { 2630 /* There was still a quota for this delta -> the IO shouldn't be queued 2631 * 2632 * We allow a slight quota overrun here so an IO bigger than the per-timeslice 2633 * quota can be allowed once in a while. Such an overrun is then taken into account in 2634 * the QoS poller, where the next timeslice quota is calculated. 2635 */ 2636 return false; 2637 } 2638 2639 /* There was no quota for this delta -> the IO should be queued 2640 * The remaining_this_timeslice must be rewound so it reflects the real 2641 * amount of IOs or bytes allowed.
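* In other words, the IO is queued only when the counter was already exhausted (zero or negative) before this IO was accounted for; the add below simply undoes the speculative subtraction so that the queued IO is not charged twice.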
2642 */ 2643 __atomic_add_fetch( 2644 &limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2645 return true; 2646 } 2647 2648 static inline void 2649 bdev_qos_rw_rewind_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2650 { 2651 __atomic_add_fetch(&limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2652 } 2653 2654 static bool 2655 bdev_qos_rw_iops_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2656 { 2657 return bdev_qos_rw_queue_io(limit, io, 1); 2658 } 2659 2660 static void 2661 bdev_qos_rw_iops_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2662 { 2663 bdev_qos_rw_rewind_io(limit, io, 1); 2664 } 2665 2666 static bool 2667 bdev_qos_rw_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2668 { 2669 return bdev_qos_rw_queue_io(limit, io, bdev_get_io_size_in_byte(io)); 2670 } 2671 2672 static void 2673 bdev_qos_rw_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2674 { 2675 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2676 } 2677 2678 static bool 2679 bdev_qos_r_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2680 { 2681 if (bdev_is_read_io(io) == false) { 2682 return false; 2683 } 2684 2685 return bdev_qos_rw_bps_queue(limit, io); 2686 } 2687 2688 static void 2689 bdev_qos_r_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2690 { 2691 if (bdev_is_read_io(io) != false) { 2692 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2693 } 2694 } 2695 2696 static bool 2697 bdev_qos_w_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2698 { 2699 if (bdev_is_read_io(io) == true) { 2700 return false; 2701 } 2702 2703 return bdev_qos_rw_bps_queue(limit, io); 2704 } 2705 2706 static void 2707 bdev_qos_w_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2708 { 2709 if (bdev_is_read_io(io) != true) { 2710 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2711 } 2712 } 2713 2714 static void 2715 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2716 { 2717 int i; 2718 2719 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2720 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2721 qos->rate_limits[i].queue_io = NULL; 2722 continue; 2723 } 2724 2725 switch (i) { 2726 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2727 qos->rate_limits[i].queue_io = bdev_qos_rw_iops_queue; 2728 qos->rate_limits[i].rewind_quota = bdev_qos_rw_iops_rewind_quota; 2729 break; 2730 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2731 qos->rate_limits[i].queue_io = bdev_qos_rw_bps_queue; 2732 qos->rate_limits[i].rewind_quota = bdev_qos_rw_bps_rewind_quota; 2733 break; 2734 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2735 qos->rate_limits[i].queue_io = bdev_qos_r_bps_queue; 2736 qos->rate_limits[i].rewind_quota = bdev_qos_r_bps_rewind_quota; 2737 break; 2738 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2739 qos->rate_limits[i].queue_io = bdev_qos_w_bps_queue; 2740 qos->rate_limits[i].rewind_quota = bdev_qos_w_bps_rewind_quota; 2741 break; 2742 default: 2743 break; 2744 } 2745 } 2746 } 2747 2748 static void 2749 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2750 struct spdk_bdev_io *bdev_io, 2751 enum spdk_bdev_io_status status) 2752 { 2753 bdev_io->internal.in_submit_request = true; 2754 bdev_io_increment_outstanding(bdev_ch, bdev_ch->shared_resource); 2755 spdk_bdev_io_complete(bdev_io, status); 2756 bdev_io->internal.in_submit_request = false; 2757 
} 2758 2759 static inline void 2760 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2761 { 2762 struct spdk_bdev *bdev = bdev_io->bdev; 2763 struct spdk_io_channel *ch = bdev_ch->channel; 2764 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2765 2766 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2767 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2768 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2769 2770 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2771 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2772 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2773 SPDK_BDEV_IO_STATUS_SUCCESS); 2774 return; 2775 } 2776 } 2777 2778 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2779 bdev_io->bdev->split_on_write_unit && 2780 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2781 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2782 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2783 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2784 return; 2785 } 2786 2787 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2788 bdev_io_increment_outstanding(bdev_ch, shared_resource); 2789 bdev_io->internal.in_submit_request = true; 2790 bdev_submit_request(bdev, ch, bdev_io); 2791 bdev_io->internal.in_submit_request = false; 2792 } else { 2793 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT); 2794 if (shared_resource->nomem_threshold == 0 && shared_resource->io_outstanding == 0) { 2795 /* Special case when we have nomem IOs and no outstanding IOs which completions 2796 * could trigger retry of queued IOs */ 2797 bdev_shared_ch_retry_io(shared_resource); 2798 } 2799 } 2800 } 2801 2802 static bool 2803 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2804 { 2805 int i; 2806 2807 if (bdev_qos_io_to_limit(bdev_io) == true) { 2808 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2809 if (!qos->rate_limits[i].queue_io) { 2810 continue; 2811 } 2812 2813 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2814 bdev_io) == true) { 2815 for (i -= 1; i >= 0 ; i--) { 2816 if (!qos->rate_limits[i].queue_io) { 2817 continue; 2818 } 2819 2820 qos->rate_limits[i].rewind_quota(&qos->rate_limits[i], bdev_io); 2821 } 2822 return true; 2823 } 2824 } 2825 } 2826 2827 return false; 2828 } 2829 2830 static int 2831 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2832 { 2833 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2834 int submitted_ios = 0; 2835 2836 TAILQ_FOREACH_SAFE(bdev_io, &ch->qos_queued_io, internal.link, tmp) { 2837 if (!bdev_qos_queue_io(qos, bdev_io)) { 2838 TAILQ_REMOVE(&ch->qos_queued_io, bdev_io, internal.link); 2839 bdev_io_do_submit(ch, bdev_io); 2840 2841 submitted_ios++; 2842 } 2843 } 2844 2845 return submitted_ios; 2846 } 2847 2848 static void 2849 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2850 { 2851 int rc; 2852 2853 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2854 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2855 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2856 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2857 &bdev_io->internal.waitq_entry); 2858 if (rc != 0) { 2859 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2860 bdev_io->internal.status = 
SPDK_BDEV_IO_STATUS_FAILED; 2861 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2862 } 2863 } 2864 2865 static bool 2866 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2867 { 2868 uint32_t io_boundary; 2869 struct spdk_bdev *bdev = bdev_io->bdev; 2870 uint32_t max_segment_size = bdev->max_segment_size; 2871 uint32_t max_size = bdev->max_rw_size; 2872 int max_segs = bdev->max_num_segments; 2873 2874 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2875 io_boundary = bdev->write_unit_size; 2876 } else if (bdev->split_on_optimal_io_boundary) { 2877 io_boundary = bdev->optimal_io_boundary; 2878 } else { 2879 io_boundary = 0; 2880 } 2881 2882 if (spdk_likely(!io_boundary && !max_segs && !max_segment_size && !max_size)) { 2883 return false; 2884 } 2885 2886 if (io_boundary) { 2887 uint64_t start_stripe, end_stripe; 2888 2889 start_stripe = bdev_io->u.bdev.offset_blocks; 2890 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2891 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 2892 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2893 start_stripe >>= spdk_u32log2(io_boundary); 2894 end_stripe >>= spdk_u32log2(io_boundary); 2895 } else { 2896 start_stripe /= io_boundary; 2897 end_stripe /= io_boundary; 2898 } 2899 2900 if (start_stripe != end_stripe) { 2901 return true; 2902 } 2903 } 2904 2905 if (max_segs) { 2906 if (bdev_io->u.bdev.iovcnt > max_segs) { 2907 return true; 2908 } 2909 } 2910 2911 if (max_segment_size) { 2912 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2913 if (bdev_io->u.bdev.iovs[i].iov_len > max_segment_size) { 2914 return true; 2915 } 2916 } 2917 } 2918 2919 if (max_size) { 2920 if (bdev_io->u.bdev.num_blocks > max_size) { 2921 return true; 2922 } 2923 } 2924 2925 return false; 2926 } 2927 2928 static bool 2929 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2930 { 2931 uint32_t num_unmap_segments; 2932 2933 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2934 return false; 2935 } 2936 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2937 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2938 return true; 2939 } 2940 2941 return false; 2942 } 2943 2944 static bool 2945 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2946 { 2947 if (!bdev_io->bdev->max_write_zeroes) { 2948 return false; 2949 } 2950 2951 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2952 return true; 2953 } 2954 2955 return false; 2956 } 2957 2958 static bool 2959 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2960 { 2961 if (bdev_io->bdev->max_copy != 0 && 2962 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2963 return true; 2964 } 2965 2966 return false; 2967 } 2968 2969 static bool 2970 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2971 { 2972 switch (bdev_io->type) { 2973 case SPDK_BDEV_IO_TYPE_READ: 2974 case SPDK_BDEV_IO_TYPE_WRITE: 2975 return bdev_rw_should_split(bdev_io); 2976 case SPDK_BDEV_IO_TYPE_UNMAP: 2977 return bdev_unmap_should_split(bdev_io); 2978 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2979 return bdev_write_zeroes_should_split(bdev_io); 2980 case SPDK_BDEV_IO_TYPE_COPY: 2981 return bdev_copy_should_split(bdev_io); 2982 default: 2983 return false; 2984 } 2985 } 2986 2987 static uint32_t 2988 _to_next_boundary(uint64_t offset, uint32_t boundary) 2989 { 2990 return (boundary - (offset % boundary)); 2991 } 2992 2993 static void 
bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2994 2995 static void _bdev_rw_split(void *_bdev_io); 2996 2997 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2998 2999 static void 3000 _bdev_unmap_split(void *_bdev_io) 3001 { 3002 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 3003 } 3004 3005 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 3006 3007 static void 3008 _bdev_write_zeroes_split(void *_bdev_io) 3009 { 3010 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 3011 } 3012 3013 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 3014 3015 static void 3016 _bdev_copy_split(void *_bdev_io) 3017 { 3018 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 3019 } 3020 3021 static int 3022 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 3023 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 3024 { 3025 int rc; 3026 uint64_t current_offset, current_remaining, current_src_offset; 3027 spdk_bdev_io_wait_cb io_wait_fn; 3028 3029 current_offset = *offset; 3030 current_remaining = *remaining; 3031 3032 assert(bdev_io->internal.f.split); 3033 3034 bdev_io->internal.split.outstanding++; 3035 3036 io_wait_fn = _bdev_rw_split; 3037 switch (bdev_io->type) { 3038 case SPDK_BDEV_IO_TYPE_READ: 3039 assert(bdev_io->u.bdev.accel_sequence == NULL); 3040 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 3041 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3042 iov, iovcnt, md_buf, current_offset, 3043 num_blocks, 3044 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 3045 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL, 3046 NULL, 3047 bdev_io->u.bdev.dif_check_flags, 3048 bdev_io_split_done, bdev_io); 3049 break; 3050 case SPDK_BDEV_IO_TYPE_WRITE: 3051 assert(bdev_io->u.bdev.accel_sequence == NULL); 3052 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 3053 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3054 iov, iovcnt, md_buf, current_offset, 3055 num_blocks, 3056 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 3057 bdev_io_use_memory_domain(bdev_io) ? 
bdev_io->internal.memory_domain_ctx : NULL, 3058 NULL, 3059 bdev_io->u.bdev.dif_check_flags, 3060 bdev_io->u.bdev.nvme_cdw12.raw, 3061 bdev_io->u.bdev.nvme_cdw13.raw, 3062 bdev_io_split_done, bdev_io); 3063 break; 3064 case SPDK_BDEV_IO_TYPE_UNMAP: 3065 io_wait_fn = _bdev_unmap_split; 3066 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 3067 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3068 current_offset, num_blocks, 3069 bdev_io_split_done, bdev_io); 3070 break; 3071 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3072 io_wait_fn = _bdev_write_zeroes_split; 3073 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 3074 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3075 current_offset, num_blocks, 3076 bdev_io_split_done, bdev_io); 3077 break; 3078 case SPDK_BDEV_IO_TYPE_COPY: 3079 io_wait_fn = _bdev_copy_split; 3080 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 3081 (current_offset - bdev_io->u.bdev.offset_blocks); 3082 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 3083 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3084 current_offset, current_src_offset, num_blocks, 3085 bdev_io_split_done, bdev_io); 3086 break; 3087 default: 3088 assert(false); 3089 rc = -EINVAL; 3090 break; 3091 } 3092 3093 if (rc == 0) { 3094 current_offset += num_blocks; 3095 current_remaining -= num_blocks; 3096 bdev_io->internal.split.current_offset_blocks = current_offset; 3097 bdev_io->internal.split.remaining_num_blocks = current_remaining; 3098 *offset = current_offset; 3099 *remaining = current_remaining; 3100 } else { 3101 bdev_io->internal.split.outstanding--; 3102 if (rc == -ENOMEM) { 3103 if (bdev_io->internal.split.outstanding == 0) { 3104 /* No I/O is outstanding. Hence we should wait here. */ 3105 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 3106 } 3107 } else { 3108 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3109 if (bdev_io->internal.split.outstanding == 0) { 3110 bdev_ch_remove_from_io_submitted(bdev_io); 3111 spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id, 3112 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx, 3113 bdev_io->internal.ch->queue_depth); 3114 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3115 } 3116 } 3117 } 3118 3119 return rc; 3120 } 3121 3122 static void 3123 _bdev_rw_split(void *_bdev_io) 3124 { 3125 struct iovec *parent_iov, *iov; 3126 struct spdk_bdev_io *bdev_io = _bdev_io; 3127 struct spdk_bdev *bdev = bdev_io->bdev; 3128 uint64_t parent_offset, current_offset, remaining; 3129 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 3130 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 3131 uint32_t iovcnt, iov_len, child_iovsize; 3132 uint32_t blocklen = bdev->blocklen; 3133 uint32_t io_boundary; 3134 uint32_t max_segment_size = bdev->max_segment_size; 3135 uint32_t max_child_iovcnt = bdev->max_num_segments; 3136 uint32_t max_size = bdev->max_rw_size; 3137 void *md_buf = NULL; 3138 int rc; 3139 3140 max_size = max_size ? max_size : UINT32_MAX; 3141 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 3142 max_child_iovcnt = max_child_iovcnt ? 
spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 3143 SPDK_BDEV_IO_NUM_CHILD_IOV; 3144 3145 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 3146 io_boundary = bdev->write_unit_size; 3147 } else if (bdev->split_on_optimal_io_boundary) { 3148 io_boundary = bdev->optimal_io_boundary; 3149 } else { 3150 io_boundary = UINT32_MAX; 3151 } 3152 3153 assert(bdev_io->internal.f.split); 3154 3155 remaining = bdev_io->internal.split.remaining_num_blocks; 3156 current_offset = bdev_io->internal.split.current_offset_blocks; 3157 parent_offset = bdev_io->u.bdev.offset_blocks; 3158 parent_iov_offset = (current_offset - parent_offset) * blocklen; 3159 parent_iovcnt = bdev_io->u.bdev.iovcnt; 3160 3161 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 3162 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3163 if (parent_iov_offset < parent_iov->iov_len) { 3164 break; 3165 } 3166 parent_iov_offset -= parent_iov->iov_len; 3167 } 3168 3169 child_iovcnt = 0; 3170 while (remaining > 0 && parent_iovpos < parent_iovcnt && 3171 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 3172 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3173 to_next_boundary = spdk_min(remaining, to_next_boundary); 3174 to_next_boundary = spdk_min(max_size, to_next_boundary); 3175 to_next_boundary_bytes = to_next_boundary * blocklen; 3176 3177 iov = &bdev_io->child_iov[child_iovcnt]; 3178 iovcnt = 0; 3179 3180 if (bdev_io->u.bdev.md_buf) { 3181 md_buf = (char *)bdev_io->u.bdev.md_buf + 3182 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3183 } 3184 3185 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3186 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3187 iovcnt < child_iovsize) { 3188 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3189 iov_len = parent_iov->iov_len - parent_iov_offset; 3190 3191 iov_len = spdk_min(iov_len, max_segment_size); 3192 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 3193 to_next_boundary_bytes -= iov_len; 3194 3195 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 3196 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 3197 3198 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 3199 parent_iov_offset += iov_len; 3200 } else { 3201 parent_iovpos++; 3202 parent_iov_offset = 0; 3203 } 3204 child_iovcnt++; 3205 iovcnt++; 3206 } 3207 3208 if (to_next_boundary_bytes > 0) { 3209 /* We had to stop this child I/O early because we ran out of 3210 * child_iov space or were limited by max_num_segments. 3211 * Ensure the iovs to be aligned with block size and 3212 * then adjust to_next_boundary before starting the 3213 * child I/O. 
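* Any blocks trimmed off here remain part of the parent IO and will be picked up by a later iteration of the outer loop, or by a resubmission once this batch of children completes.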
3214 */ 3215 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || 3216 iovcnt == child_iovsize); 3217 to_last_block_bytes = to_next_boundary_bytes % blocklen; 3218 if (to_last_block_bytes != 0) { 3219 uint32_t child_iovpos = child_iovcnt - 1; 3220 /* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV, 3221 * so the loop will naturally end 3222 */ 3223 3224 to_last_block_bytes = blocklen - to_last_block_bytes; 3225 to_next_boundary_bytes += to_last_block_bytes; 3226 while (to_last_block_bytes > 0 && iovcnt > 0) { 3227 iov_len = spdk_min(to_last_block_bytes, 3228 bdev_io->child_iov[child_iovpos].iov_len); 3229 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 3230 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 3231 child_iovpos--; 3232 if (--iovcnt == 0) { 3233 /* If the child IO is less than a block size, just return. 3234 * If the first child IO of any split round is less than 3235 * a block size, exit with an error. 3236 */ 3237 if (bdev_io->internal.split.outstanding == 0) { 3238 SPDK_ERRLOG("The first child io was less than a block size\n"); 3239 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3240 bdev_ch_remove_from_io_submitted(bdev_io); 3241 spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id, 3242 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx, 3243 bdev_io->internal.ch->queue_depth); 3244 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3245 } 3246 3247 return; 3248 } 3249 } 3250 3251 to_last_block_bytes -= iov_len; 3252 3253 if (parent_iov_offset == 0) { 3254 parent_iovpos--; 3255 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 3256 } 3257 parent_iov_offset -= iov_len; 3258 } 3259 3260 assert(to_last_block_bytes == 0); 3261 } 3262 to_next_boundary -= to_next_boundary_bytes / blocklen; 3263 } 3264 3265 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 3266 &current_offset, &remaining); 3267 if (spdk_unlikely(rc)) { 3268 return; 3269 } 3270 } 3271 } 3272 3273 static void 3274 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 3275 { 3276 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 3277 uint32_t num_children_reqs = 0; 3278 int rc; 3279 3280 assert(bdev_io->internal.f.split); 3281 3282 offset = bdev_io->internal.split.current_offset_blocks; 3283 remaining = bdev_io->internal.split.remaining_num_blocks; 3284 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 3285 3286 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3287 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 3288 3289 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 3290 &offset, &remaining); 3291 if (spdk_likely(rc == 0)) { 3292 num_children_reqs++; 3293 } else { 3294 return; 3295 } 3296 } 3297 } 3298 3299 static void 3300 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 3301 { 3302 uint64_t offset, write_zeroes_blocks, remaining; 3303 uint32_t num_children_reqs = 0; 3304 int rc; 3305 3306 assert(bdev_io->internal.f.split); 3307 3308 offset = bdev_io->internal.split.current_offset_blocks; 3309 remaining = bdev_io->internal.split.remaining_num_blocks; 3310 3311 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3312 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 3313 3314 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 3315 &offset, &remaining); 3316 if (spdk_likely(rc == 0)) { 3317 num_children_reqs++; 3318 } else {
3319 return; 3320 } 3321 } 3322 } 3323 3324 static void 3325 bdev_copy_split(struct spdk_bdev_io *bdev_io) 3326 { 3327 uint64_t offset, copy_blocks, remaining; 3328 uint32_t num_children_reqs = 0; 3329 int rc; 3330 3331 assert(bdev_io->internal.f.split); 3332 3333 offset = bdev_io->internal.split.current_offset_blocks; 3334 remaining = bdev_io->internal.split.remaining_num_blocks; 3335 3336 assert(bdev_io->bdev->max_copy != 0); 3337 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3338 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3339 3340 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3341 &offset, &remaining); 3342 if (spdk_likely(rc == 0)) { 3343 num_children_reqs++; 3344 } else { 3345 return; 3346 } 3347 } 3348 } 3349 3350 static void 3351 parent_bdev_io_complete(void *ctx, int rc) 3352 { 3353 struct spdk_bdev_io *parent_io = ctx; 3354 3355 if (rc) { 3356 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3357 } 3358 3359 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3360 parent_io->internal.caller_ctx); 3361 } 3362 3363 static void 3364 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3365 { 3366 struct spdk_bdev_io *bdev_io = ctx; 3367 3368 /* u.bdev.accel_sequence should have already been cleared at this point */ 3369 assert(bdev_io->u.bdev.accel_sequence == NULL); 3370 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3371 bdev_io->internal.f.has_accel_sequence = false; 3372 3373 if (spdk_unlikely(status != 0)) { 3374 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3375 } 3376 3377 parent_bdev_io_complete(bdev_io, status); 3378 } 3379 3380 static void 3381 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3382 { 3383 struct spdk_bdev_io *parent_io = cb_arg; 3384 3385 spdk_bdev_free_io(bdev_io); 3386 3387 assert(parent_io->internal.f.split); 3388 3389 if (!success) { 3390 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3391 /* If any child I/O failed, stop further splitting process. */ 3392 parent_io->internal.split.current_offset_blocks += parent_io->internal.split.remaining_num_blocks; 3393 parent_io->internal.split.remaining_num_blocks = 0; 3394 } 3395 parent_io->internal.split.outstanding--; 3396 if (parent_io->internal.split.outstanding != 0) { 3397 return; 3398 } 3399 3400 /* 3401 * Parent I/O finishes when all blocks are consumed. 3402 */ 3403 if (parent_io->internal.split.remaining_num_blocks == 0) { 3404 assert(parent_io->internal.cb != bdev_io_split_done); 3405 bdev_ch_remove_from_io_submitted(parent_io); 3406 spdk_trace_record(TRACE_BDEV_IO_DONE, parent_io->internal.ch->trace_id, 3407 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx, 3408 parent_io->internal.ch->queue_depth); 3409 3410 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3411 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3412 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3413 return; 3414 } else if (parent_io->internal.orig_iovcnt != 0 && 3415 !bdev_io_use_accel_sequence(bdev_io)) { 3416 /* bdev IO will be completed in the callback */ 3417 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3418 return; 3419 } 3420 } 3421 3422 parent_bdev_io_complete(parent_io, 0); 3423 return; 3424 } 3425 3426 /* 3427 * Continue with the splitting process. This function will complete the parent I/O if the 3428 * splitting is done. 
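* Each of the *_split() helpers below submits a bounded batch of child IOs and returns; when those children complete, bdev_io_split_done() runs again and either continues splitting or finishes the parent.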
3429 */ 3430 switch (parent_io->type) { 3431 case SPDK_BDEV_IO_TYPE_READ: 3432 case SPDK_BDEV_IO_TYPE_WRITE: 3433 _bdev_rw_split(parent_io); 3434 break; 3435 case SPDK_BDEV_IO_TYPE_UNMAP: 3436 bdev_unmap_split(parent_io); 3437 break; 3438 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3439 bdev_write_zeroes_split(parent_io); 3440 break; 3441 case SPDK_BDEV_IO_TYPE_COPY: 3442 bdev_copy_split(parent_io); 3443 break; 3444 default: 3445 assert(false); 3446 break; 3447 } 3448 } 3449 3450 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3451 bool success); 3452 3453 static void 3454 bdev_io_split(struct spdk_bdev_io *bdev_io) 3455 { 3456 assert(bdev_io_should_split(bdev_io)); 3457 assert(bdev_io->internal.f.split); 3458 3459 bdev_io->internal.split.current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3460 bdev_io->internal.split.remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3461 bdev_io->internal.split.outstanding = 0; 3462 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3463 3464 switch (bdev_io->type) { 3465 case SPDK_BDEV_IO_TYPE_READ: 3466 case SPDK_BDEV_IO_TYPE_WRITE: 3467 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3468 _bdev_rw_split(bdev_io); 3469 } else { 3470 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3471 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3472 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3473 } 3474 break; 3475 case SPDK_BDEV_IO_TYPE_UNMAP: 3476 bdev_unmap_split(bdev_io); 3477 break; 3478 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3479 bdev_write_zeroes_split(bdev_io); 3480 break; 3481 case SPDK_BDEV_IO_TYPE_COPY: 3482 bdev_copy_split(bdev_io); 3483 break; 3484 default: 3485 assert(false); 3486 break; 3487 } 3488 } 3489 3490 static void 3491 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3492 { 3493 if (!success) { 3494 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3495 return; 3496 } 3497 3498 _bdev_rw_split(bdev_io); 3499 } 3500 3501 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 3502 * be inlined, at least on some compilers. 
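* On the fast path it is called directly from bdev_io_submit() below, which is where the inlining actually pays off.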
3503 */ 3504 static inline void 3505 _bdev_io_submit(void *ctx) 3506 { 3507 struct spdk_bdev_io *bdev_io = ctx; 3508 struct spdk_bdev *bdev = bdev_io->bdev; 3509 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3510 3511 if (spdk_likely(bdev_ch->flags == 0)) { 3512 bdev_io_do_submit(bdev_ch, bdev_io); 3513 return; 3514 } 3515 3516 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3517 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3518 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3519 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3520 bdev_abort_queued_io(&bdev_ch->qos_queued_io, bdev_io->u.abort.bio_to_abort)) { 3521 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3522 } else { 3523 TAILQ_INSERT_TAIL(&bdev_ch->qos_queued_io, bdev_io, internal.link); 3524 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3525 } 3526 } else { 3527 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3528 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3529 } 3530 } 3531 3532 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3533 3534 bool 3535 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3536 { 3537 if (range1->length == 0 || range2->length == 0) { 3538 return false; 3539 } 3540 3541 if (range1->offset + range1->length <= range2->offset) { 3542 return false; 3543 } 3544 3545 if (range2->offset + range2->length <= range1->offset) { 3546 return false; 3547 } 3548 3549 return true; 3550 } 3551 3552 static bool 3553 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3554 { 3555 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3556 struct lba_range r; 3557 3558 switch (bdev_io->type) { 3559 case SPDK_BDEV_IO_TYPE_NVME_IO: 3560 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3561 /* Don't try to decode the NVMe command - just assume worst-case and that 3562 * it overlaps a locked range. 3563 */ 3564 return true; 3565 case SPDK_BDEV_IO_TYPE_READ: 3566 if (!range->quiesce) { 3567 return false; 3568 } 3569 /* fallthrough */ 3570 case SPDK_BDEV_IO_TYPE_WRITE: 3571 case SPDK_BDEV_IO_TYPE_UNMAP: 3572 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3573 case SPDK_BDEV_IO_TYPE_ZCOPY: 3574 case SPDK_BDEV_IO_TYPE_COPY: 3575 r.offset = bdev_io->u.bdev.offset_blocks; 3576 r.length = bdev_io->u.bdev.num_blocks; 3577 if (!bdev_lba_range_overlapped(range, &r)) { 3578 /* This I/O doesn't overlap the specified LBA range. */ 3579 return false; 3580 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3581 /* This I/O overlaps, but the I/O is on the same channel that locked this 3582 * range, and the caller_ctx is the same as the locked_ctx. This means 3583 * that this I/O is associated with the lock, and is allowed to execute. 
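* This lets the lock owner keep issuing IO to the locked range, while IO from other submitters is parked on the channel's io_locked queue (see bdev_io_submit() below).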
3584 */ 3585 return false; 3586 } else { 3587 return true; 3588 } 3589 default: 3590 return false; 3591 } 3592 } 3593 3594 void 3595 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3596 { 3597 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3598 3599 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3600 3601 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3602 struct lba_range *range; 3603 3604 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3605 if (bdev_io_range_is_locked(bdev_io, range)) { 3606 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3607 return; 3608 } 3609 } 3610 } 3611 3612 bdev_ch_add_to_io_submitted(bdev_io); 3613 3614 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3615 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 3616 ch->trace_id, bdev_io->u.bdev.num_blocks, 3617 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3618 bdev_io->u.bdev.offset_blocks, ch->queue_depth); 3619 3620 if (bdev_io->internal.f.split) { 3621 bdev_io_split(bdev_io); 3622 return; 3623 } 3624 3625 _bdev_io_submit(bdev_io); 3626 } 3627 3628 static inline void 3629 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3630 { 3631 /* The bdev doesn't support memory domains, so the buffers in this I/O request can't 3632 * be accessed directly; bounce buffers must be allocated before issuing the I/O. 3633 * For a write, we pull the data out of the memory domain before submitting the I/O. 3634 * Once a read completes, we push the data back into the original memory domain buffer. 3635 * The request then goes through the regular I/O flow, so clear the memory domain 3636 * pointers here. */ 3637 assert(bdev_io->internal.f.has_memory_domain); 3638 bdev_io->u.bdev.memory_domain = NULL; 3639 bdev_io->u.bdev.memory_domain_ctx = NULL; 3640 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3641 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3642 } 3643 3644 static inline void 3645 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3646 { 3647 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3648 bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io); 3649 3650 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3651 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3652 bdev_io_complete_unsubmitted(bdev_io); 3653 return; 3654 } 3655 3656 /* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does 3657 * support them, but we need to execute an accel sequence and the data buffer is from accel 3658 * memory domain (to avoid doing a push/pull from that domain).
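 * If neither condition applies, the request keeps its memory domain pointers and is
 * submitted to the module unchanged.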
3659 */ 3660 if (bdev_io_use_memory_domain(bdev_io)) { 3661 if (!desc->memory_domains_supported || 3662 (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3663 _bdev_io_ext_use_bounce_buffer(bdev_io); 3664 return; 3665 } 3666 } 3667 3668 if (needs_exec) { 3669 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3670 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3671 return; 3672 } 3673 /* For reads we'll execute the sequence after the data is read, so, for now, only 3674 * clear out accel_sequence pointer and submit the IO */ 3675 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3676 bdev_io->u.bdev.accel_sequence = NULL; 3677 } 3678 3679 bdev_io_submit(bdev_io); 3680 } 3681 3682 static void 3683 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3684 { 3685 struct spdk_bdev *bdev = bdev_io->bdev; 3686 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3687 struct spdk_io_channel *ch = bdev_ch->channel; 3688 3689 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3690 3691 bdev_io->internal.in_submit_request = true; 3692 bdev_submit_request(bdev, ch, bdev_io); 3693 bdev_io->internal.in_submit_request = false; 3694 } 3695 3696 void 3697 bdev_io_init(struct spdk_bdev_io *bdev_io, 3698 struct spdk_bdev *bdev, void *cb_arg, 3699 spdk_bdev_io_completion_cb cb) 3700 { 3701 bdev_io->bdev = bdev; 3702 bdev_io->internal.f.raw = 0; 3703 bdev_io->internal.caller_ctx = cb_arg; 3704 bdev_io->internal.cb = cb; 3705 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3706 bdev_io->internal.in_submit_request = false; 3707 bdev_io->internal.orig_iovs = NULL; 3708 bdev_io->internal.orig_iovcnt = 0; 3709 bdev_io->internal.orig_md_iov.iov_base = NULL; 3710 bdev_io->internal.error.nvme.cdw0 = 0; 3711 bdev_io->num_retries = 0; 3712 bdev_io->internal.get_buf_cb = NULL; 3713 bdev_io->internal.get_aux_buf_cb = NULL; 3714 bdev_io->internal.data_transfer_cpl = NULL; 3715 bdev_io->internal.f.split = bdev_io_should_split(bdev_io); 3716 } 3717 3718 static bool 3719 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3720 { 3721 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3722 } 3723 3724 bool 3725 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3726 { 3727 bool supported; 3728 3729 supported = bdev_io_type_supported(bdev, io_type); 3730 3731 if (!supported) { 3732 switch (io_type) { 3733 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3734 /* The bdev layer will emulate write zeroes as long as write is supported. 
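 * (The emulated path turns the request into regular writes of zero-filled buffers
 * on the caller's behalf.)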
*/ 3735 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3736 break; 3737 default: 3738 break; 3739 } 3740 } 3741 3742 return supported; 3743 } 3744 3745 static const char *g_io_type_strings[] = { 3746 [SPDK_BDEV_IO_TYPE_READ] = "read", 3747 [SPDK_BDEV_IO_TYPE_WRITE] = "write", 3748 [SPDK_BDEV_IO_TYPE_UNMAP] = "unmap", 3749 [SPDK_BDEV_IO_TYPE_FLUSH] = "flush", 3750 [SPDK_BDEV_IO_TYPE_RESET] = "reset", 3751 [SPDK_BDEV_IO_TYPE_NVME_ADMIN] = "nvme_admin", 3752 [SPDK_BDEV_IO_TYPE_NVME_IO] = "nvme_io", 3753 [SPDK_BDEV_IO_TYPE_NVME_IO_MD] = "nvme_io_md", 3754 [SPDK_BDEV_IO_TYPE_WRITE_ZEROES] = "write_zeroes", 3755 [SPDK_BDEV_IO_TYPE_ZCOPY] = "zcopy", 3756 [SPDK_BDEV_IO_TYPE_GET_ZONE_INFO] = "get_zone_info", 3757 [SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT] = "zone_management", 3758 [SPDK_BDEV_IO_TYPE_ZONE_APPEND] = "zone_append", 3759 [SPDK_BDEV_IO_TYPE_COMPARE] = "compare", 3760 [SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE] = "compare_and_write", 3761 [SPDK_BDEV_IO_TYPE_ABORT] = "abort", 3762 [SPDK_BDEV_IO_TYPE_SEEK_HOLE] = "seek_hole", 3763 [SPDK_BDEV_IO_TYPE_SEEK_DATA] = "seek_data", 3764 [SPDK_BDEV_IO_TYPE_COPY] = "copy", 3765 [SPDK_BDEV_IO_TYPE_NVME_IOV_MD] = "nvme_iov_md", 3766 }; 3767 3768 const char * 3769 spdk_bdev_get_io_type_name(enum spdk_bdev_io_type io_type) 3770 { 3771 if (io_type <= SPDK_BDEV_IO_TYPE_INVALID || io_type >= SPDK_BDEV_NUM_IO_TYPES) { 3772 return NULL; 3773 } 3774 3775 return g_io_type_strings[io_type]; 3776 } 3777 3778 int 3779 spdk_bdev_get_io_type(const char *io_type_string) 3780 { 3781 int i; 3782 3783 for (i = SPDK_BDEV_IO_TYPE_READ; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 3784 if (!strcmp(io_type_string, g_io_type_strings[i])) { 3785 return i; 3786 } 3787 } 3788 3789 return -1; 3790 } 3791 3792 uint64_t 3793 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3794 { 3795 return bdev_io->internal.submit_tsc; 3796 } 3797 3798 int 3799 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3800 { 3801 if (bdev->fn_table->dump_info_json) { 3802 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3803 } 3804 3805 return 0; 3806 } 3807 3808 static void 3809 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3810 { 3811 uint32_t max_per_timeslice = 0; 3812 int i; 3813 3814 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3815 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3816 qos->rate_limits[i].max_per_timeslice = 0; 3817 continue; 3818 } 3819 3820 max_per_timeslice = qos->rate_limits[i].limit * 3821 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3822 3823 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3824 qos->rate_limits[i].min_per_timeslice); 3825 3826 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 3827 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELEASE); 3828 } 3829 3830 bdev_qos_set_ops(qos); 3831 } 3832 3833 static void 3834 bdev_channel_submit_qos_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3835 struct spdk_io_channel *io_ch, void *ctx) 3836 { 3837 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3838 int status; 3839 3840 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3841 3842 /* if all IOs were sent then continue the iteration, otherwise - stop it */ 3843 /* TODO: channels round robing */ 3844 status = TAILQ_EMPTY(&bdev_ch->qos_queued_io) ? 
0 : 1; 3845 3846 spdk_bdev_for_each_channel_continue(i, status); 3847 } 3848 3849 3850 static void 3851 bdev_channel_submit_qos_io_done(struct spdk_bdev *bdev, void *ctx, int status) 3852 { 3853 3854 } 3855 3856 static int 3857 bdev_channel_poll_qos(void *arg) 3858 { 3859 struct spdk_bdev *bdev = arg; 3860 struct spdk_bdev_qos *qos = bdev->internal.qos; 3861 uint64_t now = spdk_get_ticks(); 3862 int i; 3863 int64_t remaining_last_timeslice; 3864 3865 if (spdk_unlikely(qos->thread == NULL)) { 3866 /* The old QoS has been unbound from its thread so it can be torn down, and a new QoS has not been enabled yet. */ 3867 return SPDK_POLLER_IDLE; 3868 } 3869 3870 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3871 /* We received our callback earlier than expected - return 3872 * immediately and wait to do accounting until at least one 3873 * timeslice has actually expired. This should never happen 3874 * with a well-behaved timer implementation. 3875 */ 3876 return SPDK_POLLER_IDLE; 3877 } 3878 3879 /* Reset for next round of rate limiting */ 3880 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3881 /* We may have allowed the IOs or bytes to slightly overrun in the last 3882 * timeslice. remaining_this_timeslice is signed, so if it's negative 3883 * here, we'll account for the overrun so that the next timeslice will 3884 * be appropriately reduced. 3885 */ 3886 remaining_last_timeslice = __atomic_exchange_n(&qos->rate_limits[i].remaining_this_timeslice, 3887 0, __ATOMIC_RELAXED); 3888 if (remaining_last_timeslice < 0) { 3889 /* There could be a race condition here, since both bdev_qos_rw_queue_io() and bdev_channel_poll_qos() 3890 * potentially use two atomic ops each, so their updates can interleave. 3891 * This can make the limits slightly fuzzy, but it won't cause any real damage.
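 * At worst, a small negative carry-over is lost or applied twice for a single
 * timeslice, so the enforced rate stays approximately correct over time.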
3892 */ 3893 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 3894 remaining_last_timeslice, __ATOMIC_RELAXED); 3895 } 3896 } 3897 3898 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3899 qos->last_timeslice += qos->timeslice_size; 3900 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3901 __atomic_add_fetch(&qos->rate_limits[i].remaining_this_timeslice, 3902 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELAXED); 3903 } 3904 } 3905 3906 spdk_bdev_for_each_channel(bdev, bdev_channel_submit_qos_io, qos, 3907 bdev_channel_submit_qos_io_done); 3908 3909 return SPDK_POLLER_BUSY; 3910 } 3911 3912 static void 3913 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3914 { 3915 struct spdk_bdev_shared_resource *shared_resource; 3916 struct lba_range *range; 3917 3918 bdev_free_io_stat(ch->stat); 3919 #ifdef SPDK_CONFIG_VTUNE 3920 bdev_free_io_stat(ch->prev_stat); 3921 #endif 3922 3923 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3924 range = TAILQ_FIRST(&ch->locked_ranges); 3925 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3926 free(range); 3927 } 3928 3929 spdk_put_io_channel(ch->channel); 3930 spdk_put_io_channel(ch->accel_channel); 3931 3932 shared_resource = ch->shared_resource; 3933 3934 assert(TAILQ_EMPTY(&ch->io_locked)); 3935 assert(TAILQ_EMPTY(&ch->io_submitted)); 3936 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 3937 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 3938 assert(ch->io_outstanding == 0); 3939 assert(shared_resource->ref > 0); 3940 shared_resource->ref--; 3941 if (shared_resource->ref == 0) { 3942 assert(shared_resource->io_outstanding == 0); 3943 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3944 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3945 spdk_poller_unregister(&shared_resource->nomem_poller); 3946 free(shared_resource); 3947 } 3948 } 3949 3950 static void 3951 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3952 { 3953 struct spdk_bdev_qos *qos = bdev->internal.qos; 3954 int i; 3955 3956 assert(spdk_spin_held(&bdev->internal.spinlock)); 3957 3958 /* Rate limiting on this bdev enabled */ 3959 if (qos) { 3960 if (qos->ch == NULL) { 3961 struct spdk_io_channel *io_ch; 3962 3963 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3964 bdev->name, spdk_get_thread()); 3965 3966 /* No qos channel has been selected, so set one up */ 3967 3968 /* Take another reference to ch */ 3969 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3970 assert(io_ch != NULL); 3971 qos->ch = ch; 3972 3973 qos->thread = spdk_io_channel_get_thread(io_ch); 3974 3975 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3976 if (bdev_qos_is_iops_rate_limit(i) == true) { 3977 qos->rate_limits[i].min_per_timeslice = 3978 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3979 } else { 3980 qos->rate_limits[i].min_per_timeslice = 3981 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3982 } 3983 3984 if (qos->rate_limits[i].limit == 0) { 3985 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3986 } 3987 } 3988 bdev_qos_update_max_quota_per_timeslice(qos); 3989 qos->timeslice_size = 3990 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3991 qos->last_timeslice = spdk_get_ticks(); 3992 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3993 bdev, 3994 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3995 } 3996 3997 ch->flags |= BDEV_CH_QOS_ENABLED; 3998 } 3999 } 4000 4001 struct poll_timeout_ctx { 4002 struct spdk_bdev_desc 
*desc; 4003 uint64_t timeout_in_sec; 4004 spdk_bdev_io_timeout_cb cb_fn; 4005 void *cb_arg; 4006 }; 4007 4008 static void 4009 bdev_desc_free(struct spdk_bdev_desc *desc) 4010 { 4011 spdk_spin_destroy(&desc->spinlock); 4012 free(desc->media_events_buffer); 4013 free(desc); 4014 } 4015 4016 static void 4017 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 4018 { 4019 struct poll_timeout_ctx *ctx = _ctx; 4020 struct spdk_bdev_desc *desc = ctx->desc; 4021 4022 free(ctx); 4023 4024 spdk_spin_lock(&desc->spinlock); 4025 desc->refs--; 4026 if (desc->closed == true && desc->refs == 0) { 4027 spdk_spin_unlock(&desc->spinlock); 4028 bdev_desc_free(desc); 4029 return; 4030 } 4031 spdk_spin_unlock(&desc->spinlock); 4032 } 4033 4034 static void 4035 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4036 struct spdk_io_channel *io_ch, void *_ctx) 4037 { 4038 struct poll_timeout_ctx *ctx = _ctx; 4039 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4040 struct spdk_bdev_desc *desc = ctx->desc; 4041 struct spdk_bdev_io *bdev_io; 4042 uint64_t now; 4043 4044 spdk_spin_lock(&desc->spinlock); 4045 if (desc->closed == true) { 4046 spdk_spin_unlock(&desc->spinlock); 4047 spdk_bdev_for_each_channel_continue(i, -1); 4048 return; 4049 } 4050 spdk_spin_unlock(&desc->spinlock); 4051 4052 now = spdk_get_ticks(); 4053 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 4054 /* Exclude any I/O that are generated via splitting. */ 4055 if (bdev_io->internal.cb == bdev_io_split_done) { 4056 continue; 4057 } 4058 4059 /* Once we find an I/O that has not timed out, we can immediately 4060 * exit the loop. 4061 */ 4062 if (now < (bdev_io->internal.submit_tsc + 4063 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 4064 goto end; 4065 } 4066 4067 if (bdev_io->internal.desc == desc) { 4068 ctx->cb_fn(ctx->cb_arg, bdev_io); 4069 } 4070 } 4071 4072 end: 4073 spdk_bdev_for_each_channel_continue(i, 0); 4074 } 4075 4076 static int 4077 bdev_poll_timeout_io(void *arg) 4078 { 4079 struct spdk_bdev_desc *desc = arg; 4080 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4081 struct poll_timeout_ctx *ctx; 4082 4083 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 4084 if (!ctx) { 4085 SPDK_ERRLOG("failed to allocate memory\n"); 4086 return SPDK_POLLER_BUSY; 4087 } 4088 ctx->desc = desc; 4089 ctx->cb_arg = desc->cb_arg; 4090 ctx->cb_fn = desc->cb_fn; 4091 ctx->timeout_in_sec = desc->timeout_in_sec; 4092 4093 /* Take a ref on the descriptor in case it gets closed while we are checking 4094 * all of the channels. 
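 * The reference is released in bdev_channel_poll_timeout_io_done(), which also frees
 * the descriptor if it was closed while the per-channel iteration was still in flight.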
4095 */ 4096 spdk_spin_lock(&desc->spinlock); 4097 desc->refs++; 4098 spdk_spin_unlock(&desc->spinlock); 4099 4100 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 4101 bdev_channel_poll_timeout_io_done); 4102 4103 return SPDK_POLLER_BUSY; 4104 } 4105 4106 int 4107 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 4108 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 4109 { 4110 assert(desc->thread == spdk_get_thread()); 4111 4112 spdk_poller_unregister(&desc->io_timeout_poller); 4113 4114 if (timeout_in_sec) { 4115 assert(cb_fn != NULL); 4116 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 4117 desc, 4118 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 4119 1000); 4120 if (desc->io_timeout_poller == NULL) { 4121 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 4122 return -1; 4123 } 4124 } 4125 4126 desc->cb_fn = cb_fn; 4127 desc->cb_arg = cb_arg; 4128 desc->timeout_in_sec = timeout_in_sec; 4129 4130 return 0; 4131 } 4132 4133 static int 4134 bdev_channel_create(void *io_device, void *ctx_buf) 4135 { 4136 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 4137 struct spdk_bdev_channel *ch = ctx_buf; 4138 struct spdk_io_channel *mgmt_io_ch; 4139 struct spdk_bdev_mgmt_channel *mgmt_ch; 4140 struct spdk_bdev_shared_resource *shared_resource; 4141 struct lba_range *range; 4142 4143 ch->bdev = bdev; 4144 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 4145 if (!ch->channel) { 4146 return -1; 4147 } 4148 4149 ch->accel_channel = spdk_accel_get_io_channel(); 4150 if (!ch->accel_channel) { 4151 spdk_put_io_channel(ch->channel); 4152 return -1; 4153 } 4154 4155 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, bdev->internal.trace_id, 0, 0, 4156 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4157 4158 assert(ch->histogram == NULL); 4159 if (bdev->internal.histogram_enabled) { 4160 ch->histogram = spdk_histogram_data_alloc(); 4161 if (ch->histogram == NULL) { 4162 SPDK_ERRLOG("Could not allocate histogram\n"); 4163 } 4164 } 4165 4166 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 4167 if (!mgmt_io_ch) { 4168 spdk_put_io_channel(ch->channel); 4169 spdk_put_io_channel(ch->accel_channel); 4170 return -1; 4171 } 4172 4173 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 4174 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 4175 if (shared_resource->shared_ch == ch->channel) { 4176 spdk_put_io_channel(mgmt_io_ch); 4177 shared_resource->ref++; 4178 break; 4179 } 4180 } 4181 4182 if (shared_resource == NULL) { 4183 shared_resource = calloc(1, sizeof(*shared_resource)); 4184 if (shared_resource == NULL) { 4185 spdk_put_io_channel(ch->channel); 4186 spdk_put_io_channel(ch->accel_channel); 4187 spdk_put_io_channel(mgmt_io_ch); 4188 return -1; 4189 } 4190 4191 shared_resource->mgmt_ch = mgmt_ch; 4192 shared_resource->io_outstanding = 0; 4193 TAILQ_INIT(&shared_resource->nomem_io); 4194 shared_resource->nomem_threshold = 0; 4195 shared_resource->shared_ch = ch->channel; 4196 shared_resource->ref = 1; 4197 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 4198 } 4199 4200 ch->io_outstanding = 0; 4201 TAILQ_INIT(&ch->queued_resets); 4202 TAILQ_INIT(&ch->locked_ranges); 4203 TAILQ_INIT(&ch->qos_queued_io); 4204 ch->flags = 0; 4205 ch->trace_id = bdev->internal.trace_id; 4206 ch->shared_resource = shared_resource; 4207 4208 TAILQ_INIT(&ch->io_submitted); 4209 TAILQ_INIT(&ch->io_locked); 4210 TAILQ_INIT(&ch->io_accel_exec); 4211 TAILQ_INIT(&ch->io_memory_domain); 4212 4213 
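/* Allocate per-channel I/O statistics; these are merged back into the bdev's totals
 * when the channel is destroyed (see bdev_channel_destroy()). */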
ch->stat = bdev_alloc_io_stat(false); 4214 if (ch->stat == NULL) { 4215 bdev_channel_destroy_resource(ch); 4216 return -1; 4217 } 4218 4219 ch->stat->ticks_rate = spdk_get_ticks_hz(); 4220 4221 #ifdef SPDK_CONFIG_VTUNE 4222 { 4223 char *name; 4224 __itt_init_ittlib(NULL, 0); 4225 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 4226 if (!name) { 4227 bdev_channel_destroy_resource(ch); 4228 return -1; 4229 } 4230 ch->handle = __itt_string_handle_create(name); 4231 free(name); 4232 ch->start_tsc = spdk_get_ticks(); 4233 ch->interval_tsc = spdk_get_ticks_hz() / 100; 4234 ch->prev_stat = bdev_alloc_io_stat(false); 4235 if (ch->prev_stat == NULL) { 4236 bdev_channel_destroy_resource(ch); 4237 return -1; 4238 } 4239 } 4240 #endif 4241 4242 spdk_spin_lock(&bdev->internal.spinlock); 4243 bdev_enable_qos(bdev, ch); 4244 4245 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 4246 struct lba_range *new_range; 4247 4248 new_range = calloc(1, sizeof(*new_range)); 4249 if (new_range == NULL) { 4250 spdk_spin_unlock(&bdev->internal.spinlock); 4251 bdev_channel_destroy_resource(ch); 4252 return -1; 4253 } 4254 new_range->length = range->length; 4255 new_range->offset = range->offset; 4256 new_range->locked_ctx = range->locked_ctx; 4257 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 4258 } 4259 4260 spdk_spin_unlock(&bdev->internal.spinlock); 4261 4262 return 0; 4263 } 4264 4265 static int 4266 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 4267 void *cb_ctx) 4268 { 4269 struct spdk_bdev_channel *bdev_ch = cb_ctx; 4270 struct spdk_bdev_io *bdev_io; 4271 uint64_t buf_len; 4272 4273 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4274 if (bdev_io->internal.ch == bdev_ch) { 4275 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len); 4276 spdk_iobuf_entry_abort(ch, entry, buf_len); 4277 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4278 } 4279 4280 return 0; 4281 } 4282 4283 /* 4284 * Abort I/O that are waiting on a data buffer. 4285 */ 4286 static void 4287 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4288 { 4289 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4290 bdev_abort_all_buf_io_cb, ch); 4291 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4292 bdev_abort_all_buf_io_cb, ch); 4293 } 4294 4295 /* 4296 * Abort I/O that are queued waiting for submission. These types of I/O are 4297 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4298 */ 4299 static void 4300 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4301 { 4302 struct spdk_bdev_io *bdev_io, *tmp; 4303 4304 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4305 if (bdev_io->internal.ch == ch) { 4306 TAILQ_REMOVE(queue, bdev_io, internal.link); 4307 /* 4308 * spdk_bdev_io_complete() assumes that the completed I/O had 4309 * been submitted to the bdev module. Since in this case it 4310 * hadn't, bump io_outstanding to account for the decrement 4311 * that spdk_bdev_io_complete() will do. 
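 * Reset I/O is not tracked in io_outstanding, so it is completed below without this
 * adjustment.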
4312 */ 4313 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4314 bdev_io_increment_outstanding(ch, ch->shared_resource); 4315 } 4316 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4317 } 4318 } 4319 } 4320 4321 static bool 4322 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4323 { 4324 struct spdk_bdev_io *bdev_io; 4325 4326 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4327 if (bdev_io == bio_to_abort) { 4328 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4329 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4330 return true; 4331 } 4332 } 4333 4334 return false; 4335 } 4336 4337 static int 4338 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4339 { 4340 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4341 uint64_t buf_len; 4342 4343 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4344 if (bdev_io == bio_to_abort) { 4345 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len); 4346 spdk_iobuf_entry_abort(ch, entry, buf_len); 4347 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4348 return 1; 4349 } 4350 4351 return 0; 4352 } 4353 4354 static bool 4355 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4356 { 4357 int rc; 4358 4359 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4360 bdev_abort_buf_io_cb, bio_to_abort); 4361 if (rc == 1) { 4362 return true; 4363 } 4364 4365 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4366 bdev_abort_buf_io_cb, bio_to_abort); 4367 return rc == 1; 4368 } 4369 4370 static void 4371 bdev_qos_channel_destroy(void *cb_arg) 4372 { 4373 struct spdk_bdev_qos *qos = cb_arg; 4374 4375 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4376 spdk_poller_unregister(&qos->poller); 4377 4378 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4379 4380 free(qos); 4381 } 4382 4383 static int 4384 bdev_qos_destroy(struct spdk_bdev *bdev) 4385 { 4386 int i; 4387 4388 /* 4389 * Cleanly shutting down the QoS poller is tricky, because 4390 * during the asynchronous operation the user could open 4391 * a new descriptor and create a new channel, spawning 4392 * a new QoS poller. 4393 * 4394 * The strategy is to create a new QoS structure here and swap it 4395 * in. The shutdown path then continues to refer to the old one 4396 * until it completes and then releases it. 4397 */ 4398 struct spdk_bdev_qos *new_qos, *old_qos; 4399 4400 old_qos = bdev->internal.qos; 4401 4402 new_qos = calloc(1, sizeof(*new_qos)); 4403 if (!new_qos) { 4404 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4405 return -ENOMEM; 4406 } 4407 4408 /* Copy the old QoS data into the newly allocated structure */ 4409 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4410 4411 /* Zero out the key parts of the QoS structure */ 4412 new_qos->ch = NULL; 4413 new_qos->thread = NULL; 4414 new_qos->poller = NULL; 4415 /* 4416 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4417 * It will be used later for the new QoS structure. 
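 * Only the per-timeslice accounting below is reset; the configured rate limits carry
 * over to the new structure.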
4418 */ 4419 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4420 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4421 new_qos->rate_limits[i].min_per_timeslice = 0; 4422 new_qos->rate_limits[i].max_per_timeslice = 0; 4423 } 4424 4425 bdev->internal.qos = new_qos; 4426 4427 if (old_qos->thread == NULL) { 4428 free(old_qos); 4429 } else { 4430 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4431 } 4432 4433 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4434 * been destroyed yet. The destruction path will end up waiting for the final 4435 * channel to be put before it releases resources. */ 4436 4437 return 0; 4438 } 4439 4440 void 4441 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4442 { 4443 total->bytes_read += add->bytes_read; 4444 total->num_read_ops += add->num_read_ops; 4445 total->bytes_written += add->bytes_written; 4446 total->num_write_ops += add->num_write_ops; 4447 total->bytes_unmapped += add->bytes_unmapped; 4448 total->num_unmap_ops += add->num_unmap_ops; 4449 total->bytes_copied += add->bytes_copied; 4450 total->num_copy_ops += add->num_copy_ops; 4451 total->read_latency_ticks += add->read_latency_ticks; 4452 total->write_latency_ticks += add->write_latency_ticks; 4453 total->unmap_latency_ticks += add->unmap_latency_ticks; 4454 total->copy_latency_ticks += add->copy_latency_ticks; 4455 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4456 total->max_read_latency_ticks = add->max_read_latency_ticks; 4457 } 4458 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4459 total->min_read_latency_ticks = add->min_read_latency_ticks; 4460 } 4461 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4462 total->max_write_latency_ticks = add->max_write_latency_ticks; 4463 } 4464 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4465 total->min_write_latency_ticks = add->min_write_latency_ticks; 4466 } 4467 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4468 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4469 } 4470 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4471 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4472 } 4473 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4474 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4475 } 4476 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4477 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4478 } 4479 } 4480 4481 static void 4482 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4483 { 4484 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4485 4486 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4487 memcpy(to_stat->io_error, from_stat->io_error, 4488 sizeof(struct spdk_bdev_io_error_stat)); 4489 } 4490 } 4491 4492 void 4493 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4494 { 4495 stat->max_read_latency_ticks = 0; 4496 stat->min_read_latency_ticks = UINT64_MAX; 4497 stat->max_write_latency_ticks = 0; 4498 stat->min_write_latency_ticks = UINT64_MAX; 4499 stat->max_unmap_latency_ticks = 0; 4500 stat->min_unmap_latency_ticks = UINT64_MAX; 4501 stat->max_copy_latency_ticks = 0; 4502 stat->min_copy_latency_ticks = UINT64_MAX; 4503 4504 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4505 return; 4506 } 
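/* Only SPDK_BDEV_RESET_STAT_ALL clears the cumulative byte/op counters, the latency
 * totals and the error counts below; other modes stop after resetting the min/max
 * latencies above. */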
4507 4508 stat->bytes_read = 0; 4509 stat->num_read_ops = 0; 4510 stat->bytes_written = 0; 4511 stat->num_write_ops = 0; 4512 stat->bytes_unmapped = 0; 4513 stat->num_unmap_ops = 0; 4514 stat->bytes_copied = 0; 4515 stat->num_copy_ops = 0; 4516 stat->read_latency_ticks = 0; 4517 stat->write_latency_ticks = 0; 4518 stat->unmap_latency_ticks = 0; 4519 stat->copy_latency_ticks = 0; 4520 4521 if (stat->io_error != NULL) { 4522 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 4523 } 4524 } 4525 4526 struct spdk_bdev_io_stat * 4527 bdev_alloc_io_stat(bool io_error_stat) 4528 { 4529 struct spdk_bdev_io_stat *stat; 4530 4531 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4532 if (stat == NULL) { 4533 return NULL; 4534 } 4535 4536 if (io_error_stat) { 4537 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4538 if (stat->io_error == NULL) { 4539 free(stat); 4540 return NULL; 4541 } 4542 } else { 4543 stat->io_error = NULL; 4544 } 4545 4546 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4547 4548 return stat; 4549 } 4550 4551 void 4552 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4553 { 4554 if (stat != NULL) { 4555 free(stat->io_error); 4556 free(stat); 4557 } 4558 } 4559 4560 void 4561 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4562 { 4563 int i; 4564 4565 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4566 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4567 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4568 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4569 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4570 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4571 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4572 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4573 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4574 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4575 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4576 stat->min_read_latency_ticks != UINT64_MAX ? 4577 stat->min_read_latency_ticks : 0); 4578 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4579 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4580 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4581 stat->min_write_latency_ticks != UINT64_MAX ? 4582 stat->min_write_latency_ticks : 0); 4583 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4584 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4585 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4586 stat->min_unmap_latency_ticks != UINT64_MAX ? 4587 stat->min_unmap_latency_ticks : 0); 4588 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4589 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4590 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4591 stat->min_copy_latency_ticks != UINT64_MAX ? 
4592 stat->min_copy_latency_ticks : 0); 4593 4594 if (stat->io_error != NULL) { 4595 spdk_json_write_named_object_begin(w, "io_error"); 4596 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4597 if (stat->io_error->error_status[i] != 0) { 4598 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4599 stat->io_error->error_status[i]); 4600 } 4601 } 4602 spdk_json_write_object_end(w); 4603 } 4604 } 4605 4606 static void 4607 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4608 { 4609 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4610 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4611 4612 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4613 bdev_abort_all_buf_io(mgmt_ch, ch); 4614 } 4615 4616 static void 4617 bdev_channel_destroy(void *io_device, void *ctx_buf) 4618 { 4619 struct spdk_bdev_channel *ch = ctx_buf; 4620 4621 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4622 spdk_get_thread()); 4623 4624 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, ch->bdev->internal.trace_id, 0, 0, 4625 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4626 4627 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4628 spdk_spin_lock(&ch->bdev->internal.spinlock); 4629 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4630 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4631 4632 bdev_abort_all_queued_io(&ch->queued_resets, ch); 4633 4634 bdev_channel_abort_queued_ios(ch); 4635 4636 if (ch->histogram) { 4637 spdk_histogram_data_free(ch->histogram); 4638 } 4639 4640 bdev_channel_destroy_resource(ch); 4641 } 4642 4643 /* 4644 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4645 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
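 * A NULL return from RB_INSERT() means the insertion succeeded; a non-NULL return is
 * the already-existing entry, and the new name is rejected with -EEXIST below.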
4646 */ 4647 static int 4648 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4649 { 4650 struct spdk_bdev_name *tmp; 4651 4652 bdev_name->name = strdup(name); 4653 if (bdev_name->name == NULL) { 4654 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4655 return -ENOMEM; 4656 } 4657 4658 bdev_name->bdev = bdev; 4659 4660 spdk_spin_lock(&g_bdev_mgr.spinlock); 4661 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4662 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4663 4664 if (tmp != NULL) { 4665 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4666 free(bdev_name->name); 4667 return -EEXIST; 4668 } 4669 4670 return 0; 4671 } 4672 4673 static void 4674 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4675 { 4676 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4677 free(bdev_name->name); 4678 } 4679 4680 static void 4681 bdev_name_del(struct spdk_bdev_name *bdev_name) 4682 { 4683 spdk_spin_lock(&g_bdev_mgr.spinlock); 4684 bdev_name_del_unsafe(bdev_name); 4685 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4686 } 4687 4688 int 4689 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4690 { 4691 struct spdk_bdev_alias *tmp; 4692 int ret; 4693 4694 if (alias == NULL) { 4695 SPDK_ERRLOG("Empty alias passed\n"); 4696 return -EINVAL; 4697 } 4698 4699 tmp = calloc(1, sizeof(*tmp)); 4700 if (tmp == NULL) { 4701 SPDK_ERRLOG("Unable to allocate alias\n"); 4702 return -ENOMEM; 4703 } 4704 4705 ret = bdev_name_add(&tmp->alias, bdev, alias); 4706 if (ret != 0) { 4707 free(tmp); 4708 return ret; 4709 } 4710 4711 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4712 4713 return 0; 4714 } 4715 4716 static int 4717 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4718 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4719 { 4720 struct spdk_bdev_alias *tmp; 4721 4722 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4723 if (strcmp(alias, tmp->alias.name) == 0) { 4724 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4725 alias_del_fn(&tmp->alias); 4726 free(tmp); 4727 return 0; 4728 } 4729 } 4730 4731 return -ENOENT; 4732 } 4733 4734 int 4735 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4736 { 4737 int rc; 4738 4739 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4740 if (rc == -ENOENT) { 4741 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4742 } 4743 4744 return rc; 4745 } 4746 4747 void 4748 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4749 { 4750 struct spdk_bdev_alias *p, *tmp; 4751 4752 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4753 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4754 bdev_name_del(&p->alias); 4755 free(p); 4756 } 4757 } 4758 4759 struct spdk_io_channel * 4760 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4761 { 4762 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4763 } 4764 4765 void * 4766 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4767 { 4768 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4769 void *ctx = NULL; 4770 4771 if (bdev->fn_table->get_module_ctx) { 4772 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4773 } 4774 4775 return ctx; 4776 } 4777 4778 const char * 4779 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4780 { 4781 return bdev->module->name; 4782 } 4783 4784 const char * 4785 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4786 { 4787 return bdev->name; 4788 } 4789 4790 const char * 4791 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4792 { 4793 return bdev->product_name; 4794 } 4795 4796 
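/*
 * Illustrative sketch only, not part of the original source: a caller that already
 * holds a bdev pointer can combine the getters in this section to report basic
 * geometry (assuming <stdio.h> and <inttypes.h> are available), e.g.:
 *
 *	printf("%s (%s): %" PRIu64 " blocks of %" PRIu32 " bytes\n",
 *	       spdk_bdev_get_name(bdev), spdk_bdev_get_product_name(bdev),
 *	       spdk_bdev_get_num_blocks(bdev), spdk_bdev_get_block_size(bdev));
 */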
const struct spdk_bdev_aliases_list * 4797 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4798 { 4799 return &bdev->aliases; 4800 } 4801 4802 uint32_t 4803 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4804 { 4805 return bdev->blocklen; 4806 } 4807 4808 uint32_t 4809 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4810 { 4811 return bdev->write_unit_size; 4812 } 4813 4814 uint64_t 4815 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4816 { 4817 return bdev->blockcnt; 4818 } 4819 4820 const char * 4821 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4822 { 4823 return qos_rpc_type[type]; 4824 } 4825 4826 void 4827 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4828 { 4829 int i; 4830 4831 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4832 4833 spdk_spin_lock(&bdev->internal.spinlock); 4834 if (bdev->internal.qos) { 4835 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4836 if (bdev->internal.qos->rate_limits[i].limit != 4837 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4838 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4839 if (bdev_qos_is_iops_rate_limit(i) == false) { 4840 /* Change from Byte to Megabyte which is user visible. */ 4841 limits[i] = limits[i] / 1024 / 1024; 4842 } 4843 } 4844 } 4845 } 4846 spdk_spin_unlock(&bdev->internal.spinlock); 4847 } 4848 4849 size_t 4850 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4851 { 4852 return 1 << bdev->required_alignment; 4853 } 4854 4855 uint32_t 4856 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4857 { 4858 return bdev->optimal_io_boundary; 4859 } 4860 4861 bool 4862 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4863 { 4864 return bdev->write_cache; 4865 } 4866 4867 const struct spdk_uuid * 4868 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4869 { 4870 return &bdev->uuid; 4871 } 4872 4873 uint16_t 4874 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4875 { 4876 return bdev->acwu; 4877 } 4878 4879 uint32_t 4880 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4881 { 4882 return bdev->md_len; 4883 } 4884 4885 bool 4886 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4887 { 4888 return (bdev->md_len != 0) && bdev->md_interleave; 4889 } 4890 4891 bool 4892 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4893 { 4894 return (bdev->md_len != 0) && !bdev->md_interleave; 4895 } 4896 4897 bool 4898 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4899 { 4900 return bdev->zoned; 4901 } 4902 4903 uint32_t 4904 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4905 { 4906 if (spdk_bdev_is_md_interleaved(bdev)) { 4907 return bdev->blocklen - bdev->md_len; 4908 } else { 4909 return bdev->blocklen; 4910 } 4911 } 4912 4913 uint32_t 4914 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4915 { 4916 return bdev->phys_blocklen; 4917 } 4918 4919 static uint32_t 4920 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4921 { 4922 if (!spdk_bdev_is_md_interleaved(bdev)) { 4923 return bdev->blocklen + bdev->md_len; 4924 } else { 4925 return bdev->blocklen; 4926 } 4927 } 4928 4929 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4930 typedef enum spdk_dif_type spdk_dif_type_t; 4931 typedef enum spdk_dif_pi_format spdk_dif_pi_format_t; 4932 4933 spdk_dif_type_t 4934 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4935 { 4936 if (bdev->md_len != 0) { 4937 return bdev->dif_type; 4938 } else { 4939 return SPDK_DIF_DISABLE; 4940 } 4941 } 4942 4943 spdk_dif_pi_format_t 4944 spdk_bdev_get_dif_pi_format(const struct spdk_bdev *bdev) 4945 { 4946 return bdev->dif_pi_format; 4947 } 4948 4949 bool 4950 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4951 { 4952 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4953 return bdev->dif_is_head_of_md; 4954 } else { 4955 return false; 4956 } 4957 } 4958 4959 bool 4960 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4961 enum spdk_dif_check_type check_type) 4962 { 4963 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4964 return false; 4965 } 4966 4967 switch (check_type) { 4968 case SPDK_DIF_CHECK_TYPE_REFTAG: 4969 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4970 case SPDK_DIF_CHECK_TYPE_APPTAG: 4971 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4972 case SPDK_DIF_CHECK_TYPE_GUARD: 4973 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4974 default: 4975 return false; 4976 } 4977 } 4978 4979 static uint32_t 4980 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 4981 { 4982 uint64_t aligned_length, max_write_blocks; 4983 4984 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 4985 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 4986 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 4987 4988 return max_write_blocks; 4989 } 4990 4991 uint32_t 4992 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4993 { 4994 return bdev->max_copy; 4995 } 4996 4997 uint64_t 4998 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4999 { 5000 return bdev->internal.measured_queue_depth; 5001 } 5002 5003 uint64_t 5004 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 5005 { 5006 return bdev->internal.period; 5007 } 5008 5009 uint64_t 5010 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 5011 { 5012 return bdev->internal.weighted_io_time; 5013 } 5014 5015 uint64_t 5016 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 5017 { 5018 return bdev->internal.io_time; 5019 } 5020 5021 union spdk_bdev_nvme_ctratt spdk_bdev_get_nvme_ctratt(struct spdk_bdev *bdev) 5022 { 5023 return bdev->ctratt; 5024 } 5025 5026 static void bdev_update_qd_sampling_period(void *ctx); 5027 5028 static void 5029 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 5030 { 5031 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 5032 5033 if (bdev->internal.measured_queue_depth) { 5034 bdev->internal.io_time += bdev->internal.period; 5035 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 5036 } 5037 5038 bdev->internal.qd_poll_in_progress = false; 5039 5040 bdev_update_qd_sampling_period(bdev); 5041 } 5042 5043 static void 5044 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5045 struct spdk_io_channel *io_ch, void *_ctx) 5046 { 5047 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 5048 5049 bdev->internal.temporary_queue_depth += ch->io_outstanding; 5050 spdk_bdev_for_each_channel_continue(i, 0); 5051 } 5052 5053 static int 5054 bdev_calculate_measured_queue_depth(void *ctx) 5055 { 5056 struct spdk_bdev *bdev 
= ctx; 5057 5058 bdev->internal.qd_poll_in_progress = true; 5059 bdev->internal.temporary_queue_depth = 0; 5060 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 5061 return SPDK_POLLER_BUSY; 5062 } 5063 5064 static void 5065 bdev_update_qd_sampling_period(void *ctx) 5066 { 5067 struct spdk_bdev *bdev = ctx; 5068 5069 if (bdev->internal.period == bdev->internal.new_period) { 5070 return; 5071 } 5072 5073 if (bdev->internal.qd_poll_in_progress) { 5074 return; 5075 } 5076 5077 bdev->internal.period = bdev->internal.new_period; 5078 5079 spdk_poller_unregister(&bdev->internal.qd_poller); 5080 if (bdev->internal.period != 0) { 5081 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5082 bdev, bdev->internal.period); 5083 } else { 5084 spdk_bdev_close(bdev->internal.qd_desc); 5085 bdev->internal.qd_desc = NULL; 5086 } 5087 } 5088 5089 static void 5090 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 5091 { 5092 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 5093 } 5094 5095 void 5096 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 5097 { 5098 int rc; 5099 5100 if (bdev->internal.new_period == period) { 5101 return; 5102 } 5103 5104 bdev->internal.new_period = period; 5105 5106 if (bdev->internal.qd_desc != NULL) { 5107 assert(bdev->internal.period != 0); 5108 5109 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 5110 bdev_update_qd_sampling_period, bdev); 5111 return; 5112 } 5113 5114 assert(bdev->internal.period == 0); 5115 5116 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 5117 NULL, &bdev->internal.qd_desc); 5118 if (rc != 0) { 5119 return; 5120 } 5121 5122 bdev->internal.period = period; 5123 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5124 bdev, period); 5125 } 5126 5127 struct bdev_get_current_qd_ctx { 5128 uint64_t current_qd; 5129 spdk_bdev_get_current_qd_cb cb_fn; 5130 void *cb_arg; 5131 }; 5132 5133 static void 5134 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 5135 { 5136 struct bdev_get_current_qd_ctx *ctx = _ctx; 5137 5138 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 5139 5140 free(ctx); 5141 } 5142 5143 static void 5144 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5145 struct spdk_io_channel *io_ch, void *_ctx) 5146 { 5147 struct bdev_get_current_qd_ctx *ctx = _ctx; 5148 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 5149 5150 ctx->current_qd += bdev_ch->io_outstanding; 5151 5152 spdk_bdev_for_each_channel_continue(i, 0); 5153 } 5154 5155 void 5156 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 5157 void *cb_arg) 5158 { 5159 struct bdev_get_current_qd_ctx *ctx; 5160 5161 assert(cb_fn != NULL); 5162 5163 ctx = calloc(1, sizeof(*ctx)); 5164 if (ctx == NULL) { 5165 cb_fn(bdev, 0, cb_arg, -ENOMEM); 5166 return; 5167 } 5168 5169 ctx->cb_fn = cb_fn; 5170 ctx->cb_arg = cb_arg; 5171 5172 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 5173 } 5174 5175 static void 5176 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 5177 { 5178 assert(desc->thread == spdk_get_thread()); 5179 5180 spdk_spin_lock(&desc->spinlock); 5181 desc->refs--; 5182 if (!desc->closed) { 5183 spdk_spin_unlock(&desc->spinlock); 5184 desc->callback.event_fn(type, 5185 desc->bdev, 5186 desc->callback.ctx); 5187 return; 5188 } 
else if (desc->refs == 0) { 5189 /* This descriptor was closed after this event_notify message was sent. 5190 * spdk_bdev_close() could not free the descriptor since this message was 5191 * in flight, so we free it now using bdev_desc_free(). 5192 */ 5193 spdk_spin_unlock(&desc->spinlock); 5194 bdev_desc_free(desc); 5195 return; 5196 } 5197 spdk_spin_unlock(&desc->spinlock); 5198 } 5199 5200 static void 5201 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 5202 { 5203 spdk_spin_lock(&desc->spinlock); 5204 desc->refs++; 5205 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 5206 spdk_spin_unlock(&desc->spinlock); 5207 } 5208 5209 static void 5210 _resize_notify(void *ctx) 5211 { 5212 struct spdk_bdev_desc *desc = ctx; 5213 5214 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 5215 } 5216 5217 int 5218 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 5219 { 5220 struct spdk_bdev_desc *desc; 5221 int ret; 5222 5223 if (size == bdev->blockcnt) { 5224 return 0; 5225 } 5226 5227 spdk_spin_lock(&bdev->internal.spinlock); 5228 5229 /* bdev has open descriptors */ 5230 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 5231 bdev->blockcnt > size) { 5232 ret = -EBUSY; 5233 } else { 5234 bdev->blockcnt = size; 5235 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 5236 event_notify(desc, _resize_notify); 5237 } 5238 ret = 0; 5239 } 5240 5241 spdk_spin_unlock(&bdev->internal.spinlock); 5242 5243 return ret; 5244 } 5245 5246 /* 5247 * Convert I/O offset and length from bytes to blocks. 5248 * 5249 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 5250 */ 5251 static uint64_t 5252 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 5253 uint64_t num_bytes, uint64_t *num_blocks) 5254 { 5255 uint32_t block_size = bdev->blocklen; 5256 uint8_t shift_cnt; 5257 5258 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
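 * For example, with a 4096-byte block size spdk_u32log2() returns 12, so a 1 MiB byte
 * offset becomes block 1048576 >> 12 = 256, and the OR of the masked-off low bits below
 * is non-zero whenever either byte value is not block aligned.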
*/ 5259 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 5260 shift_cnt = spdk_u32log2(block_size); 5261 *offset_blocks = offset_bytes >> shift_cnt; 5262 *num_blocks = num_bytes >> shift_cnt; 5263 return (offset_bytes - (*offset_blocks << shift_cnt)) | 5264 (num_bytes - (*num_blocks << shift_cnt)); 5265 } else { 5266 *offset_blocks = offset_bytes / block_size; 5267 *num_blocks = num_bytes / block_size; 5268 return (offset_bytes % block_size) | (num_bytes % block_size); 5269 } 5270 } 5271 5272 static bool 5273 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 5274 { 5275 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 5276 * has been an overflow and hence the offset has been wrapped around */ 5277 if (offset_blocks + num_blocks < offset_blocks) { 5278 return false; 5279 } 5280 5281 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5282 if (offset_blocks + num_blocks > bdev->blockcnt) { 5283 return false; 5284 } 5285 5286 return true; 5287 } 5288 5289 static void 5290 bdev_seek_complete_cb(void *ctx) 5291 { 5292 struct spdk_bdev_io *bdev_io = ctx; 5293 5294 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5295 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5296 } 5297 5298 static int 5299 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5300 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5301 spdk_bdev_io_completion_cb cb, void *cb_arg) 5302 { 5303 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5304 struct spdk_bdev_io *bdev_io; 5305 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5306 5307 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5308 5309 /* Check if offset_blocks is valid looking at the validity of one block */ 5310 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5311 return -EINVAL; 5312 } 5313 5314 bdev_io = bdev_channel_get_io(channel); 5315 if (!bdev_io) { 5316 return -ENOMEM; 5317 } 5318 5319 bdev_io->internal.ch = channel; 5320 bdev_io->internal.desc = desc; 5321 bdev_io->type = io_type; 5322 bdev_io->u.bdev.offset_blocks = offset_blocks; 5323 bdev_io->u.bdev.memory_domain = NULL; 5324 bdev_io->u.bdev.memory_domain_ctx = NULL; 5325 bdev_io->u.bdev.accel_sequence = NULL; 5326 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5327 5328 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5329 /* In case bdev doesn't support seek to next data/hole offset, 5330 * it is assumed that only data and no holes are present */ 5331 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5332 bdev_io->u.bdev.seek.offset = offset_blocks; 5333 } else { 5334 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5335 } 5336 5337 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5338 return 0; 5339 } 5340 5341 bdev_io_submit(bdev_io); 5342 return 0; 5343 } 5344 5345 int 5346 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5347 uint64_t offset_blocks, 5348 spdk_bdev_io_completion_cb cb, void *cb_arg) 5349 { 5350 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5351 } 5352 5353 int 5354 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5355 uint64_t offset_blocks, 5356 spdk_bdev_io_completion_cb cb, void *cb_arg) 5357 { 5358 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5359 } 5360 5361 uint64_t 5362 spdk_bdev_io_get_seek_offset(const struct 
spdk_bdev_io *bdev_io) 5363 { 5364 return bdev_io->u.bdev.seek.offset; 5365 } 5366 5367 static int 5368 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5369 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5370 spdk_bdev_io_completion_cb cb, void *cb_arg) 5371 { 5372 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5373 struct spdk_bdev_io *bdev_io; 5374 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5375 5376 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5377 return -EINVAL; 5378 } 5379 5380 bdev_io = bdev_channel_get_io(channel); 5381 if (!bdev_io) { 5382 return -ENOMEM; 5383 } 5384 5385 bdev_io->internal.ch = channel; 5386 bdev_io->internal.desc = desc; 5387 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5388 bdev_io->u.bdev.iovs = &bdev_io->iov; 5389 bdev_io->u.bdev.iovs[0].iov_base = buf; 5390 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5391 bdev_io->u.bdev.iovcnt = 1; 5392 bdev_io->u.bdev.md_buf = md_buf; 5393 bdev_io->u.bdev.num_blocks = num_blocks; 5394 bdev_io->u.bdev.offset_blocks = offset_blocks; 5395 bdev_io->u.bdev.memory_domain = NULL; 5396 bdev_io->u.bdev.memory_domain_ctx = NULL; 5397 bdev_io->u.bdev.accel_sequence = NULL; 5398 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5399 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5400 5401 bdev_io_submit(bdev_io); 5402 return 0; 5403 } 5404 5405 int 5406 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5407 void *buf, uint64_t offset, uint64_t nbytes, 5408 spdk_bdev_io_completion_cb cb, void *cb_arg) 5409 { 5410 uint64_t offset_blocks, num_blocks; 5411 5412 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5413 nbytes, &num_blocks) != 0) { 5414 return -EINVAL; 5415 } 5416 5417 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5418 } 5419 5420 int 5421 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5422 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5423 spdk_bdev_io_completion_cb cb, void *cb_arg) 5424 { 5425 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5426 } 5427 5428 int 5429 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5430 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5431 spdk_bdev_io_completion_cb cb, void *cb_arg) 5432 { 5433 struct iovec iov = { 5434 .iov_base = buf, 5435 }; 5436 5437 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5438 return -EINVAL; 5439 } 5440 5441 if (md_buf && !_is_buf_allocated(&iov)) { 5442 return -EINVAL; 5443 } 5444 5445 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5446 cb, cb_arg); 5447 } 5448 5449 int 5450 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5451 struct iovec *iov, int iovcnt, 5452 uint64_t offset, uint64_t nbytes, 5453 spdk_bdev_io_completion_cb cb, void *cb_arg) 5454 { 5455 uint64_t offset_blocks, num_blocks; 5456 5457 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5458 nbytes, &num_blocks) != 0) { 5459 return -EINVAL; 5460 } 5461 5462 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5463 } 5464 5465 static int 5466 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5467 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 
5468 uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx, 5469 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 5470 spdk_bdev_io_completion_cb cb, void *cb_arg) 5471 { 5472 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5473 struct spdk_bdev_io *bdev_io; 5474 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5475 5476 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 5477 return -EINVAL; 5478 } 5479 5480 bdev_io = bdev_channel_get_io(channel); 5481 if (spdk_unlikely(!bdev_io)) { 5482 return -ENOMEM; 5483 } 5484 5485 bdev_io->internal.ch = channel; 5486 bdev_io->internal.desc = desc; 5487 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5488 bdev_io->u.bdev.iovs = iov; 5489 bdev_io->u.bdev.iovcnt = iovcnt; 5490 bdev_io->u.bdev.md_buf = md_buf; 5491 bdev_io->u.bdev.num_blocks = num_blocks; 5492 bdev_io->u.bdev.offset_blocks = offset_blocks; 5493 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5494 5495 if (seq != NULL) { 5496 bdev_io->internal.f.has_accel_sequence = true; 5497 bdev_io->internal.accel_sequence = seq; 5498 } 5499 5500 if (domain != NULL) { 5501 bdev_io->internal.f.has_memory_domain = true; 5502 bdev_io->internal.memory_domain = domain; 5503 bdev_io->internal.memory_domain_ctx = domain_ctx; 5504 } 5505 5506 bdev_io->u.bdev.memory_domain = domain; 5507 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5508 bdev_io->u.bdev.accel_sequence = seq; 5509 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 5510 5511 _bdev_io_submit_ext(desc, bdev_io); 5512 5513 return 0; 5514 } 5515 5516 int 5517 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5518 struct iovec *iov, int iovcnt, 5519 uint64_t offset_blocks, uint64_t num_blocks, 5520 spdk_bdev_io_completion_cb cb, void *cb_arg) 5521 { 5522 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5523 5524 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5525 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5526 } 5527 5528 int 5529 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5530 struct iovec *iov, int iovcnt, void *md_buf, 5531 uint64_t offset_blocks, uint64_t num_blocks, 5532 spdk_bdev_io_completion_cb cb, void *cb_arg) 5533 { 5534 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5535 5536 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 5537 return -EINVAL; 5538 } 5539 5540 if (md_buf && !_is_buf_allocated(iov)) { 5541 return -EINVAL; 5542 } 5543 5544 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5545 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5546 } 5547 5548 static inline bool 5549 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 5550 { 5551 /* 5552 * We check if opts size is at least of size when we first introduced 5553 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 5554 * are not checked internal. 
5555 */ 5556 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5557 sizeof(opts->metadata) && 5558 opts->size <= sizeof(*opts) && 5559 /* When memory domain is used, the user must provide data buffers */ 5560 (!opts->memory_domain || (iov && iov[0].iov_base)); 5561 } 5562 5563 int 5564 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5565 struct iovec *iov, int iovcnt, 5566 uint64_t offset_blocks, uint64_t num_blocks, 5567 spdk_bdev_io_completion_cb cb, void *cb_arg, 5568 struct spdk_bdev_ext_io_opts *opts) 5569 { 5570 struct spdk_memory_domain *domain = NULL; 5571 struct spdk_accel_sequence *seq = NULL; 5572 void *domain_ctx = NULL, *md = NULL; 5573 uint32_t dif_check_flags = 0; 5574 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5575 5576 if (opts) { 5577 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5578 return -EINVAL; 5579 } 5580 5581 md = opts->metadata; 5582 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5583 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5584 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5585 if (md) { 5586 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5587 return -EINVAL; 5588 } 5589 5590 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5591 return -EINVAL; 5592 } 5593 5594 if (spdk_unlikely(seq != NULL)) { 5595 return -EINVAL; 5596 } 5597 } 5598 } 5599 5600 dif_check_flags = bdev->dif_check_flags & 5601 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5602 5603 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5604 num_blocks, domain, domain_ctx, seq, dif_check_flags, cb, cb_arg); 5605 } 5606 5607 static int 5608 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5609 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5610 spdk_bdev_io_completion_cb cb, void *cb_arg) 5611 { 5612 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5613 struct spdk_bdev_io *bdev_io; 5614 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5615 5616 if (!desc->write) { 5617 return -EBADF; 5618 } 5619 5620 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5621 return -EINVAL; 5622 } 5623 5624 bdev_io = bdev_channel_get_io(channel); 5625 if (!bdev_io) { 5626 return -ENOMEM; 5627 } 5628 5629 bdev_io->internal.ch = channel; 5630 bdev_io->internal.desc = desc; 5631 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5632 bdev_io->u.bdev.iovs = &bdev_io->iov; 5633 bdev_io->u.bdev.iovs[0].iov_base = buf; 5634 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5635 bdev_io->u.bdev.iovcnt = 1; 5636 bdev_io->u.bdev.md_buf = md_buf; 5637 bdev_io->u.bdev.num_blocks = num_blocks; 5638 bdev_io->u.bdev.offset_blocks = offset_blocks; 5639 bdev_io->u.bdev.memory_domain = NULL; 5640 bdev_io->u.bdev.memory_domain_ctx = NULL; 5641 bdev_io->u.bdev.accel_sequence = NULL; 5642 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5643 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5644 5645 bdev_io_submit(bdev_io); 5646 return 0; 5647 } 5648 5649 int 5650 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5651 void *buf, uint64_t offset, uint64_t nbytes, 5652 spdk_bdev_io_completion_cb cb, void *cb_arg) 5653 { 5654 uint64_t offset_blocks, num_blocks; 5655 5656 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5657 nbytes, &num_blocks) != 0) { 5658 return -EINVAL; 5659 } 5660 5661 return spdk_bdev_write_blocks(desc, ch, buf, 
offset_blocks, num_blocks, cb, cb_arg); 5662 } 5663 5664 int 5665 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5666 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5667 spdk_bdev_io_completion_cb cb, void *cb_arg) 5668 { 5669 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5670 cb, cb_arg); 5671 } 5672 5673 int 5674 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5675 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5676 spdk_bdev_io_completion_cb cb, void *cb_arg) 5677 { 5678 struct iovec iov = { 5679 .iov_base = buf, 5680 }; 5681 5682 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5683 return -EINVAL; 5684 } 5685 5686 if (md_buf && !_is_buf_allocated(&iov)) { 5687 return -EINVAL; 5688 } 5689 5690 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5691 cb, cb_arg); 5692 } 5693 5694 static int 5695 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5696 struct iovec *iov, int iovcnt, void *md_buf, 5697 uint64_t offset_blocks, uint64_t num_blocks, 5698 struct spdk_memory_domain *domain, void *domain_ctx, 5699 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 5700 uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw, 5701 spdk_bdev_io_completion_cb cb, void *cb_arg) 5702 { 5703 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5704 struct spdk_bdev_io *bdev_io; 5705 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5706 5707 if (spdk_unlikely(!desc->write)) { 5708 return -EBADF; 5709 } 5710 5711 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 5712 return -EINVAL; 5713 } 5714 5715 bdev_io = bdev_channel_get_io(channel); 5716 if (spdk_unlikely(!bdev_io)) { 5717 return -ENOMEM; 5718 } 5719 5720 bdev_io->internal.ch = channel; 5721 bdev_io->internal.desc = desc; 5722 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5723 bdev_io->u.bdev.iovs = iov; 5724 bdev_io->u.bdev.iovcnt = iovcnt; 5725 bdev_io->u.bdev.md_buf = md_buf; 5726 bdev_io->u.bdev.num_blocks = num_blocks; 5727 bdev_io->u.bdev.offset_blocks = offset_blocks; 5728 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5729 if (seq != NULL) { 5730 bdev_io->internal.f.has_accel_sequence = true; 5731 bdev_io->internal.accel_sequence = seq; 5732 } 5733 5734 if (domain != NULL) { 5735 bdev_io->internal.f.has_memory_domain = true; 5736 bdev_io->internal.memory_domain = domain; 5737 bdev_io->internal.memory_domain_ctx = domain_ctx; 5738 } 5739 5740 bdev_io->u.bdev.memory_domain = domain; 5741 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5742 bdev_io->u.bdev.accel_sequence = seq; 5743 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 5744 bdev_io->u.bdev.nvme_cdw12.raw = nvme_cdw12_raw; 5745 bdev_io->u.bdev.nvme_cdw13.raw = nvme_cdw13_raw; 5746 5747 _bdev_io_submit_ext(desc, bdev_io); 5748 5749 return 0; 5750 } 5751 5752 int 5753 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5754 struct iovec *iov, int iovcnt, 5755 uint64_t offset, uint64_t len, 5756 spdk_bdev_io_completion_cb cb, void *cb_arg) 5757 { 5758 uint64_t offset_blocks, num_blocks; 5759 5760 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5761 len, &num_blocks) != 0) { 5762 return -EINVAL; 5763 } 5764 5765 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5766 } 5767 5768 int 5769 spdk_bdev_writev_blocks(struct spdk_bdev_desc 
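/*
 * Illustrative sketch of the *_with_md variants above, which only apply to bdevs
 * that expose metadata in a separate buffer. Buffer and callback names are
 * hypothetical and error handling is omitted.
 *
 *	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
 *
 *	if (spdk_bdev_is_md_separate(bdev)) {
 *		size_t md_len = num_blocks * spdk_bdev_get_md_size(bdev);
 *		void *md_buf = spdk_dma_zmalloc(md_len, 0, NULL);
 *
 *		rc = spdk_bdev_write_blocks_with_md(desc, io_ch, data_buf, md_buf,
 *						    offset_blocks, num_blocks,
 *						    write_done, NULL);
 *	}
 */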
*desc, struct spdk_io_channel *ch, 5770 struct iovec *iov, int iovcnt, 5771 uint64_t offset_blocks, uint64_t num_blocks, 5772 spdk_bdev_io_completion_cb cb, void *cb_arg) 5773 { 5774 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5775 5776 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5777 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 5778 cb, cb_arg); 5779 } 5780 5781 int 5782 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5783 struct iovec *iov, int iovcnt, void *md_buf, 5784 uint64_t offset_blocks, uint64_t num_blocks, 5785 spdk_bdev_io_completion_cb cb, void *cb_arg) 5786 { 5787 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5788 5789 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 5790 return -EINVAL; 5791 } 5792 5793 if (md_buf && !_is_buf_allocated(iov)) { 5794 return -EINVAL; 5795 } 5796 5797 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5798 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 5799 cb, cb_arg); 5800 } 5801 5802 int 5803 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5804 struct iovec *iov, int iovcnt, 5805 uint64_t offset_blocks, uint64_t num_blocks, 5806 spdk_bdev_io_completion_cb cb, void *cb_arg, 5807 struct spdk_bdev_ext_io_opts *opts) 5808 { 5809 struct spdk_memory_domain *domain = NULL; 5810 struct spdk_accel_sequence *seq = NULL; 5811 void *domain_ctx = NULL, *md = NULL; 5812 uint32_t dif_check_flags = 0; 5813 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5814 uint32_t nvme_cdw12_raw = 0; 5815 uint32_t nvme_cdw13_raw = 0; 5816 5817 if (opts) { 5818 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5819 return -EINVAL; 5820 } 5821 md = opts->metadata; 5822 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5823 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5824 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5825 nvme_cdw12_raw = bdev_get_ext_io_opt(opts, nvme_cdw12.raw, 0); 5826 nvme_cdw13_raw = bdev_get_ext_io_opt(opts, nvme_cdw13.raw, 0); 5827 if (md) { 5828 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5829 return -EINVAL; 5830 } 5831 5832 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5833 return -EINVAL; 5834 } 5835 5836 if (spdk_unlikely(seq != NULL)) { 5837 return -EINVAL; 5838 } 5839 } 5840 } 5841 5842 dif_check_flags = bdev->dif_check_flags & 5843 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5844 5845 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5846 domain, domain_ctx, seq, dif_check_flags, 5847 nvme_cdw12_raw, nvme_cdw13_raw, cb, cb_arg); 5848 } 5849 5850 static void 5851 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5852 { 5853 struct spdk_bdev_io *parent_io = cb_arg; 5854 struct spdk_bdev *bdev = parent_io->bdev; 5855 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5856 int i, rc = 0; 5857 5858 if (!success) { 5859 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5860 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5861 spdk_bdev_free_io(bdev_io); 5862 return; 5863 } 5864 5865 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5866 rc = memcmp(read_buf, 5867 parent_io->u.bdev.iovs[i].iov_base, 5868 parent_io->u.bdev.iovs[i].iov_len); 5869 if (rc) { 5870 break; 5871 } 5872 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5873 } 5874 5875 if (rc == 0 && parent_io->u.bdev.md_buf && 
spdk_bdev_is_md_separate(bdev)) { 5876 rc = memcmp(bdev_io->u.bdev.md_buf, 5877 parent_io->u.bdev.md_buf, 5878 spdk_bdev_get_md_size(bdev)); 5879 } 5880 5881 spdk_bdev_free_io(bdev_io); 5882 5883 if (rc == 0) { 5884 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5885 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5886 } else { 5887 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5888 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5889 } 5890 } 5891 5892 static void 5893 bdev_compare_do_read(void *_bdev_io) 5894 { 5895 struct spdk_bdev_io *bdev_io = _bdev_io; 5896 int rc; 5897 5898 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5899 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5900 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5901 bdev_compare_do_read_done, bdev_io); 5902 5903 if (rc == -ENOMEM) { 5904 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5905 } else if (rc != 0) { 5906 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5907 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5908 } 5909 } 5910 5911 static int 5912 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5913 struct iovec *iov, int iovcnt, void *md_buf, 5914 uint64_t offset_blocks, uint64_t num_blocks, 5915 spdk_bdev_io_completion_cb cb, void *cb_arg) 5916 { 5917 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5918 struct spdk_bdev_io *bdev_io; 5919 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5920 5921 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5922 return -EINVAL; 5923 } 5924 5925 bdev_io = bdev_channel_get_io(channel); 5926 if (!bdev_io) { 5927 return -ENOMEM; 5928 } 5929 5930 bdev_io->internal.ch = channel; 5931 bdev_io->internal.desc = desc; 5932 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5933 bdev_io->u.bdev.iovs = iov; 5934 bdev_io->u.bdev.iovcnt = iovcnt; 5935 bdev_io->u.bdev.md_buf = md_buf; 5936 bdev_io->u.bdev.num_blocks = num_blocks; 5937 bdev_io->u.bdev.offset_blocks = offset_blocks; 5938 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5939 bdev_io->u.bdev.memory_domain = NULL; 5940 bdev_io->u.bdev.memory_domain_ctx = NULL; 5941 bdev_io->u.bdev.accel_sequence = NULL; 5942 5943 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5944 bdev_io_submit(bdev_io); 5945 return 0; 5946 } 5947 5948 bdev_compare_do_read(bdev_io); 5949 5950 return 0; 5951 } 5952 5953 int 5954 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5955 struct iovec *iov, int iovcnt, 5956 uint64_t offset_blocks, uint64_t num_blocks, 5957 spdk_bdev_io_completion_cb cb, void *cb_arg) 5958 { 5959 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5960 num_blocks, cb, cb_arg); 5961 } 5962 5963 int 5964 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5965 struct iovec *iov, int iovcnt, void *md_buf, 5966 uint64_t offset_blocks, uint64_t num_blocks, 5967 spdk_bdev_io_completion_cb cb, void *cb_arg) 5968 { 5969 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5970 return -EINVAL; 5971 } 5972 5973 if (md_buf && !_is_buf_allocated(iov)) { 5974 return -EINVAL; 5975 } 5976 5977 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5978 num_blocks, cb, cb_arg); 5979 } 5980 5981 static int 5982 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5983 void 
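/*
 * Illustrative sketch of the compare path above. If the backing bdev does not
 * implement SPDK_BDEV_IO_TYPE_COMPARE, the code in this file emulates it by
 * reading the LBA range and memcmp()-ing it against the supplied buffers; a
 * mismatch completes the I/O with SPDK_BDEV_IO_STATUS_MISCOMPARE. Names below
 * are hypothetical.
 *
 *	static void
 *	compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		// success == false covers both I/O errors and miscompares.
 *		spdk_bdev_free_io(bdev_io);
 *	}
 *
 *	rc = spdk_bdev_comparev_blocks(desc, io_ch, iov, iovcnt,
 *				       offset_blocks, num_blocks,
 *				       compare_done, NULL);
 */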
*buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5984 spdk_bdev_io_completion_cb cb, void *cb_arg) 5985 { 5986 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5987 struct spdk_bdev_io *bdev_io; 5988 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5989 5990 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5991 return -EINVAL; 5992 } 5993 5994 bdev_io = bdev_channel_get_io(channel); 5995 if (!bdev_io) { 5996 return -ENOMEM; 5997 } 5998 5999 bdev_io->internal.ch = channel; 6000 bdev_io->internal.desc = desc; 6001 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 6002 bdev_io->u.bdev.iovs = &bdev_io->iov; 6003 bdev_io->u.bdev.iovs[0].iov_base = buf; 6004 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 6005 bdev_io->u.bdev.iovcnt = 1; 6006 bdev_io->u.bdev.md_buf = md_buf; 6007 bdev_io->u.bdev.num_blocks = num_blocks; 6008 bdev_io->u.bdev.offset_blocks = offset_blocks; 6009 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6010 bdev_io->u.bdev.memory_domain = NULL; 6011 bdev_io->u.bdev.memory_domain_ctx = NULL; 6012 bdev_io->u.bdev.accel_sequence = NULL; 6013 6014 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 6015 bdev_io_submit(bdev_io); 6016 return 0; 6017 } 6018 6019 bdev_compare_do_read(bdev_io); 6020 6021 return 0; 6022 } 6023 6024 int 6025 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6026 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 6027 spdk_bdev_io_completion_cb cb, void *cb_arg) 6028 { 6029 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 6030 cb, cb_arg); 6031 } 6032 6033 int 6034 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6035 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 6036 spdk_bdev_io_completion_cb cb, void *cb_arg) 6037 { 6038 struct iovec iov = { 6039 .iov_base = buf, 6040 }; 6041 6042 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 6043 return -EINVAL; 6044 } 6045 6046 if (md_buf && !_is_buf_allocated(&iov)) { 6047 return -EINVAL; 6048 } 6049 6050 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 6051 cb, cb_arg); 6052 } 6053 6054 static void 6055 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status) 6056 { 6057 struct spdk_bdev_io *bdev_io = ctx; 6058 6059 if (unlock_status) { 6060 SPDK_ERRLOG("LBA range unlock failed\n"); 6061 } 6062 6063 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 6064 false, bdev_io->internal.caller_ctx); 6065 } 6066 6067 static void 6068 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 6069 { 6070 bdev_io->internal.status = status; 6071 6072 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 6073 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6074 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 6075 } 6076 6077 static void 6078 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6079 { 6080 struct spdk_bdev_io *parent_io = cb_arg; 6081 6082 if (!success) { 6083 SPDK_ERRLOG("Compare and write operation failed\n"); 6084 } 6085 6086 spdk_bdev_free_io(bdev_io); 6087 6088 bdev_comparev_and_writev_blocks_unlock(parent_io, 6089 success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 6090 } 6091 6092 static void 6093 bdev_compare_and_write_do_write(void *_bdev_io) 6094 { 6095 struct spdk_bdev_io *bdev_io = _bdev_io; 6096 int rc; 6097 6098 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 6099 spdk_io_channel_from_ctx(bdev_io->internal.ch), 6100 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 6101 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6102 bdev_compare_and_write_do_write_done, bdev_io); 6103 6104 6105 if (rc == -ENOMEM) { 6106 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 6107 } else if (rc != 0) { 6108 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6109 } 6110 } 6111 6112 static void 6113 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6114 { 6115 struct spdk_bdev_io *parent_io = cb_arg; 6116 6117 spdk_bdev_free_io(bdev_io); 6118 6119 if (!success) { 6120 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 6121 return; 6122 } 6123 6124 bdev_compare_and_write_do_write(parent_io); 6125 } 6126 6127 static void 6128 bdev_compare_and_write_do_compare(void *_bdev_io) 6129 { 6130 struct spdk_bdev_io *bdev_io = _bdev_io; 6131 int rc; 6132 6133 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 6134 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 6135 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6136 bdev_compare_and_write_do_compare_done, bdev_io); 6137 6138 if (rc == -ENOMEM) { 6139 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 6140 } else if (rc != 0) { 6141 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 6142 } 6143 } 6144 6145 static void 6146 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status) 6147 { 6148 struct spdk_bdev_io *bdev_io = ctx; 6149 6150 if (status) { 6151 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 6152 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6153 return; 6154 } 6155 6156 bdev_compare_and_write_do_compare(bdev_io); 6157 } 6158 6159 int 6160 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6161 struct iovec *compare_iov, int compare_iovcnt, 6162 struct iovec *write_iov, int write_iovcnt, 6163 uint64_t offset_blocks, uint64_t num_blocks, 6164 spdk_bdev_io_completion_cb cb, void *cb_arg) 6165 { 6166 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6167 struct spdk_bdev_io *bdev_io; 6168 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6169 6170 if (!desc->write) { 6171 return -EBADF; 6172 } 6173 6174 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6175 return -EINVAL; 6176 } 6177 6178 if (num_blocks > bdev->acwu) { 6179 return -EINVAL; 6180 } 6181 6182 bdev_io = bdev_channel_get_io(channel); 6183 if (!bdev_io) { 6184 return -ENOMEM; 6185 } 6186 6187 bdev_io->internal.ch = channel; 6188 bdev_io->internal.desc = desc; 6189 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 6190 bdev_io->u.bdev.iovs = compare_iov; 6191 bdev_io->u.bdev.iovcnt = compare_iovcnt; 6192 bdev_io->u.bdev.fused_iovs = write_iov; 6193 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 6194 bdev_io->u.bdev.md_buf = NULL; 6195 bdev_io->u.bdev.num_blocks = num_blocks; 6196 bdev_io->u.bdev.offset_blocks = offset_blocks; 6197 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6198 
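/*
 * Illustrative sketch of how a caller might drive the compare-and-write API
 * defined here. num_blocks must not exceed the bdev's atomic compare-and-write
 * unit; when the bdev lacks native COMPARE_AND_WRITE support, the code below
 * emulates it under an LBA-range lock. The iovec and callback names are
 * hypothetical.
 *
 *	if (num_blocks <= spdk_bdev_get_acwu(bdev)) {
 *		rc = spdk_bdev_comparev_and_writev_blocks(desc, io_ch,
 *							  cmp_iov, cmp_iovcnt,
 *							  write_iov, write_iovcnt,
 *							  offset_blocks, num_blocks,
 *							  caw_done, NULL);
 *	}
 */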
bdev_io->u.bdev.memory_domain = NULL; 6199 bdev_io->u.bdev.memory_domain_ctx = NULL; 6200 bdev_io->u.bdev.accel_sequence = NULL; 6201 6202 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 6203 bdev_io_submit(bdev_io); 6204 return 0; 6205 } 6206 6207 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 6208 bdev_comparev_and_writev_blocks_locked, bdev_io); 6209 } 6210 6211 int 6212 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6213 struct iovec *iov, int iovcnt, 6214 uint64_t offset_blocks, uint64_t num_blocks, 6215 bool populate, 6216 spdk_bdev_io_completion_cb cb, void *cb_arg) 6217 { 6218 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6219 struct spdk_bdev_io *bdev_io; 6220 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6221 6222 if (!desc->write) { 6223 return -EBADF; 6224 } 6225 6226 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6227 return -EINVAL; 6228 } 6229 6230 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 6231 return -ENOTSUP; 6232 } 6233 6234 bdev_io = bdev_channel_get_io(channel); 6235 if (!bdev_io) { 6236 return -ENOMEM; 6237 } 6238 6239 bdev_io->internal.ch = channel; 6240 bdev_io->internal.desc = desc; 6241 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 6242 bdev_io->u.bdev.num_blocks = num_blocks; 6243 bdev_io->u.bdev.offset_blocks = offset_blocks; 6244 bdev_io->u.bdev.iovs = iov; 6245 bdev_io->u.bdev.iovcnt = iovcnt; 6246 bdev_io->u.bdev.md_buf = NULL; 6247 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 6248 bdev_io->u.bdev.zcopy.commit = 0; 6249 bdev_io->u.bdev.zcopy.start = 1; 6250 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6251 bdev_io->u.bdev.memory_domain = NULL; 6252 bdev_io->u.bdev.memory_domain_ctx = NULL; 6253 bdev_io->u.bdev.accel_sequence = NULL; 6254 6255 bdev_io_submit(bdev_io); 6256 6257 return 0; 6258 } 6259 6260 int 6261 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 6262 spdk_bdev_io_completion_cb cb, void *cb_arg) 6263 { 6264 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 6265 return -EINVAL; 6266 } 6267 6268 bdev_io->u.bdev.zcopy.commit = commit ? 
			      1 : 0;
	bdev_io->u.bdev.zcopy.start = 0;
	bdev_io->internal.caller_ctx = cb_arg;
	bdev_io->internal.cb = cb;
	bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;

	bdev_io_submit(bdev_io);

	return 0;
}

int
spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       uint64_t offset, uint64_t len,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	uint64_t offset_blocks, num_blocks;

	if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
				 len, &num_blocks) != 0) {
		return -EINVAL;
	}

	return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
}

int
spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			      uint64_t offset_blocks, uint64_t num_blocks,
			      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) &&
	    !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) {
		return -ENOTSUP;
	}

	bdev_io = bdev_channel_get_io(channel);

	if (!bdev_io) {
		return -ENOMEM;
	}

	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
	bdev_io->internal.ch = channel;
	bdev_io->internal.desc = desc;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io_init(bdev_io, bdev, cb_arg, cb);
	bdev_io->u.bdev.memory_domain = NULL;
	bdev_io->u.bdev.memory_domain_ctx = NULL;
	bdev_io->u.bdev.accel_sequence = NULL;

	/* If the write_zeroes request is large enough that it must be split, use the
	 * generic split logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEROES is
	 * supported or not.
	 *
	 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is
	 * supported, or emulate it using a regular write request otherwise.
6337 */ 6338 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 6339 bdev_io->internal.f.split) { 6340 bdev_io_submit(bdev_io); 6341 return 0; 6342 } 6343 6344 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6345 6346 return bdev_write_zero_buffer(bdev_io); 6347 } 6348 6349 int 6350 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6351 uint64_t offset, uint64_t nbytes, 6352 spdk_bdev_io_completion_cb cb, void *cb_arg) 6353 { 6354 uint64_t offset_blocks, num_blocks; 6355 6356 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6357 nbytes, &num_blocks) != 0) { 6358 return -EINVAL; 6359 } 6360 6361 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6362 } 6363 6364 static void 6365 bdev_io_complete_cb(void *ctx) 6366 { 6367 struct spdk_bdev_io *bdev_io = ctx; 6368 6369 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6370 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 6371 } 6372 6373 int 6374 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6375 uint64_t offset_blocks, uint64_t num_blocks, 6376 spdk_bdev_io_completion_cb cb, void *cb_arg) 6377 { 6378 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6379 struct spdk_bdev_io *bdev_io; 6380 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6381 6382 if (!desc->write) { 6383 return -EBADF; 6384 } 6385 6386 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6387 return -EINVAL; 6388 } 6389 6390 bdev_io = bdev_channel_get_io(channel); 6391 if (!bdev_io) { 6392 return -ENOMEM; 6393 } 6394 6395 bdev_io->internal.ch = channel; 6396 bdev_io->internal.desc = desc; 6397 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6398 6399 bdev_io->u.bdev.iovs = &bdev_io->iov; 6400 bdev_io->u.bdev.iovs[0].iov_base = NULL; 6401 bdev_io->u.bdev.iovs[0].iov_len = 0; 6402 bdev_io->u.bdev.iovcnt = 1; 6403 6404 bdev_io->u.bdev.offset_blocks = offset_blocks; 6405 bdev_io->u.bdev.num_blocks = num_blocks; 6406 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6407 bdev_io->u.bdev.memory_domain = NULL; 6408 bdev_io->u.bdev.memory_domain_ctx = NULL; 6409 bdev_io->u.bdev.accel_sequence = NULL; 6410 6411 if (num_blocks == 0) { 6412 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 6413 return 0; 6414 } 6415 6416 bdev_io_submit(bdev_io); 6417 return 0; 6418 } 6419 6420 int 6421 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6422 uint64_t offset, uint64_t length, 6423 spdk_bdev_io_completion_cb cb, void *cb_arg) 6424 { 6425 uint64_t offset_blocks, num_blocks; 6426 6427 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6428 length, &num_blocks) != 0) { 6429 return -EINVAL; 6430 } 6431 6432 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6433 } 6434 6435 int 6436 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6437 uint64_t offset_blocks, uint64_t num_blocks, 6438 spdk_bdev_io_completion_cb cb, void *cb_arg) 6439 { 6440 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6441 struct spdk_bdev_io *bdev_io; 6442 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6443 6444 if (!desc->write) { 6445 return -EBADF; 6446 } 6447 6448 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6449 return -EINVAL; 6450 } 6451 6452 bdev_io = bdev_channel_get_io(channel); 6453 if (!bdev_io) { 6454 return -ENOMEM; 6455 } 6456 6457 bdev_io->internal.ch = 
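/*
 * Illustrative sketch of the unmap path defined above. Callers typically probe
 * support first with spdk_bdev_io_type_supported(); a zero-length unmap is
 * completed successfully without ever reaching the bdev module. Names are
 * hypothetical.
 *
 *	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
 *		rc = spdk_bdev_unmap_blocks(desc, io_ch, offset_blocks, num_blocks,
 *					    unmap_done, NULL);
 *	}
 */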
channel; 6458 bdev_io->internal.desc = desc; 6459 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 6460 bdev_io->u.bdev.iovs = NULL; 6461 bdev_io->u.bdev.iovcnt = 0; 6462 bdev_io->u.bdev.offset_blocks = offset_blocks; 6463 bdev_io->u.bdev.num_blocks = num_blocks; 6464 bdev_io->u.bdev.memory_domain = NULL; 6465 bdev_io->u.bdev.memory_domain_ctx = NULL; 6466 bdev_io->u.bdev.accel_sequence = NULL; 6467 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6468 6469 bdev_io_submit(bdev_io); 6470 return 0; 6471 } 6472 6473 static int bdev_reset_poll_for_outstanding_io(void *ctx); 6474 6475 static void 6476 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 6477 { 6478 struct spdk_bdev_channel *ch = _ctx; 6479 struct spdk_bdev_io *bdev_io; 6480 6481 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6482 6483 if (status == -EBUSY) { 6484 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 6485 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 6486 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 6487 } else { 6488 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6489 6490 if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) { 6491 /* If outstanding IOs are still present and reset_io_drain_timeout 6492 * seconds passed, start the reset. */ 6493 bdev_io_submit_reset(bdev_io); 6494 } else { 6495 /* We still have in progress memory domain pull/push or we're 6496 * executing accel sequence. Since we cannot abort either of those 6497 * operations, fail the reset request. */ 6498 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6499 } 6500 } 6501 } else { 6502 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6503 SPDK_DEBUGLOG(bdev, 6504 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 6505 ch->bdev->name); 6506 /* Mark the completion status as a SUCCESS and complete the reset. */ 6507 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 6508 } 6509 } 6510 6511 static void 6512 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6513 struct spdk_io_channel *io_ch, void *_ctx) 6514 { 6515 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 6516 int status = 0; 6517 6518 if (cur_ch->io_outstanding > 0 || 6519 !TAILQ_EMPTY(&cur_ch->io_memory_domain) || 6520 !TAILQ_EMPTY(&cur_ch->io_accel_exec)) { 6521 /* If a channel has outstanding IO, set status to -EBUSY code. This will stop 6522 * further iteration over the rest of the channels and pass non-zero status 6523 * to the callback function. 
*/ 6524 status = -EBUSY; 6525 } 6526 spdk_bdev_for_each_channel_continue(i, status); 6527 } 6528 6529 static int 6530 bdev_reset_poll_for_outstanding_io(void *ctx) 6531 { 6532 struct spdk_bdev_channel *ch = ctx; 6533 struct spdk_bdev_io *bdev_io; 6534 6535 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6536 6537 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6538 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6539 bdev_reset_check_outstanding_io_done); 6540 6541 return SPDK_POLLER_BUSY; 6542 } 6543 6544 static void 6545 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6546 { 6547 struct spdk_bdev_channel *ch = _ctx; 6548 struct spdk_bdev_io *bdev_io; 6549 6550 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6551 6552 if (bdev->reset_io_drain_timeout == 0) { 6553 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6554 6555 bdev_io_submit_reset(bdev_io); 6556 return; 6557 } 6558 6559 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6560 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6561 6562 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6563 * submit the reset to the underlying module only if outstanding I/O 6564 * remain after reset_io_drain_timeout seconds have passed. */ 6565 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6566 bdev_reset_check_outstanding_io_done); 6567 } 6568 6569 static void 6570 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6571 struct spdk_io_channel *ch, void *_ctx) 6572 { 6573 struct spdk_bdev_channel *channel; 6574 struct spdk_bdev_mgmt_channel *mgmt_channel; 6575 struct spdk_bdev_shared_resource *shared_resource; 6576 bdev_io_tailq_t tmp_queued; 6577 6578 TAILQ_INIT(&tmp_queued); 6579 6580 channel = __io_ch_to_bdev_ch(ch); 6581 shared_resource = channel->shared_resource; 6582 mgmt_channel = shared_resource->mgmt_ch; 6583 6584 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6585 6586 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6587 TAILQ_SWAP(&channel->qos_queued_io, &tmp_queued, spdk_bdev_io, internal.link); 6588 } 6589 6590 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6591 bdev_abort_all_buf_io(mgmt_channel, channel); 6592 bdev_abort_all_queued_io(&tmp_queued, channel); 6593 6594 spdk_bdev_for_each_channel_continue(i, 0); 6595 } 6596 6597 static void 6598 bdev_start_reset(void *ctx) 6599 { 6600 struct spdk_bdev_channel *ch = ctx; 6601 6602 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, 6603 bdev_reset_freeze_channel_done); 6604 } 6605 6606 static void 6607 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 6608 { 6609 struct spdk_bdev *bdev = ch->bdev; 6610 6611 assert(!TAILQ_EMPTY(&ch->queued_resets)); 6612 6613 spdk_spin_lock(&bdev->internal.spinlock); 6614 if (bdev->internal.reset_in_progress == NULL) { 6615 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 6616 /* 6617 * Take a channel reference for the target bdev for the life of this 6618 * reset. This guards against the channel getting destroyed while 6619 * spdk_bdev_for_each_channel() calls related to this reset IO are in 6620 * progress. We will release the reference when this reset is 6621 * completed. 
6622 */ 6623 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6624 bdev_start_reset(ch); 6625 } 6626 spdk_spin_unlock(&bdev->internal.spinlock); 6627 } 6628 6629 int 6630 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6631 spdk_bdev_io_completion_cb cb, void *cb_arg) 6632 { 6633 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6634 struct spdk_bdev_io *bdev_io; 6635 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6636 6637 bdev_io = bdev_channel_get_io(channel); 6638 if (!bdev_io) { 6639 return -ENOMEM; 6640 } 6641 6642 bdev_io->internal.ch = channel; 6643 bdev_io->internal.desc = desc; 6644 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6645 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6646 bdev_io->u.reset.ch_ref = NULL; 6647 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6648 6649 spdk_spin_lock(&bdev->internal.spinlock); 6650 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 6651 spdk_spin_unlock(&bdev->internal.spinlock); 6652 6653 bdev_ch_add_to_io_submitted(bdev_io); 6654 6655 bdev_channel_start_reset(channel); 6656 6657 return 0; 6658 } 6659 6660 void 6661 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6662 struct spdk_bdev_io_stat *stat) 6663 { 6664 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6665 6666 bdev_get_io_stat(stat, channel->stat); 6667 } 6668 6669 static void 6670 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6671 { 6672 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6673 6674 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6675 bdev_iostat_ctx->cb_arg, 0); 6676 free(bdev_iostat_ctx); 6677 } 6678 6679 static void 6680 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6681 struct spdk_io_channel *ch, void *_ctx) 6682 { 6683 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6684 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6685 6686 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6687 spdk_bdev_for_each_channel_continue(i, 0); 6688 } 6689 6690 void 6691 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6692 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6693 { 6694 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6695 6696 assert(bdev != NULL); 6697 assert(stat != NULL); 6698 assert(cb != NULL); 6699 6700 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6701 if (bdev_iostat_ctx == NULL) { 6702 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6703 cb(bdev, stat, cb_arg, -ENOMEM); 6704 return; 6705 } 6706 6707 bdev_iostat_ctx->stat = stat; 6708 bdev_iostat_ctx->cb = cb; 6709 bdev_iostat_ctx->cb_arg = cb_arg; 6710 6711 /* Start with the statistics from previously deleted channels. */ 6712 spdk_spin_lock(&bdev->internal.spinlock); 6713 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6714 spdk_spin_unlock(&bdev->internal.spinlock); 6715 6716 /* Then iterate and add the statistics from each existing channel. 
*/ 6717 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6718 bdev_get_device_stat_done); 6719 } 6720 6721 struct bdev_iostat_reset_ctx { 6722 enum spdk_bdev_reset_stat_mode mode; 6723 bdev_reset_device_stat_cb cb; 6724 void *cb_arg; 6725 }; 6726 6727 static void 6728 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6729 { 6730 struct bdev_iostat_reset_ctx *ctx = _ctx; 6731 6732 ctx->cb(bdev, ctx->cb_arg, 0); 6733 6734 free(ctx); 6735 } 6736 6737 static void 6738 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6739 struct spdk_io_channel *ch, void *_ctx) 6740 { 6741 struct bdev_iostat_reset_ctx *ctx = _ctx; 6742 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6743 6744 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6745 6746 spdk_bdev_for_each_channel_continue(i, 0); 6747 } 6748 6749 void 6750 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6751 bdev_reset_device_stat_cb cb, void *cb_arg) 6752 { 6753 struct bdev_iostat_reset_ctx *ctx; 6754 6755 assert(bdev != NULL); 6756 assert(cb != NULL); 6757 6758 ctx = calloc(1, sizeof(*ctx)); 6759 if (ctx == NULL) { 6760 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6761 cb(bdev, cb_arg, -ENOMEM); 6762 return; 6763 } 6764 6765 ctx->mode = mode; 6766 ctx->cb = cb; 6767 ctx->cb_arg = cb_arg; 6768 6769 spdk_spin_lock(&bdev->internal.spinlock); 6770 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6771 spdk_spin_unlock(&bdev->internal.spinlock); 6772 6773 spdk_bdev_for_each_channel(bdev, 6774 bdev_reset_each_channel_stat, 6775 ctx, 6776 bdev_reset_device_stat_done); 6777 } 6778 6779 int 6780 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6781 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6782 spdk_bdev_io_completion_cb cb, void *cb_arg) 6783 { 6784 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6785 struct spdk_bdev_io *bdev_io; 6786 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6787 6788 if (!desc->write) { 6789 return -EBADF; 6790 } 6791 6792 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6793 return -ENOTSUP; 6794 } 6795 6796 bdev_io = bdev_channel_get_io(channel); 6797 if (!bdev_io) { 6798 return -ENOMEM; 6799 } 6800 6801 bdev_io->internal.ch = channel; 6802 bdev_io->internal.desc = desc; 6803 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6804 bdev_io->u.nvme_passthru.cmd = *cmd; 6805 bdev_io->u.nvme_passthru.buf = buf; 6806 bdev_io->u.nvme_passthru.nbytes = nbytes; 6807 bdev_io->u.nvme_passthru.md_buf = NULL; 6808 bdev_io->u.nvme_passthru.md_len = 0; 6809 6810 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6811 6812 bdev_io_submit(bdev_io); 6813 return 0; 6814 } 6815 6816 int 6817 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6818 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6819 spdk_bdev_io_completion_cb cb, void *cb_arg) 6820 { 6821 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6822 struct spdk_bdev_io *bdev_io; 6823 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6824 6825 if (!desc->write) { 6826 /* 6827 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6828 * to easily determine if the command is a read or write, but for now just 6829 * do not allow io_passthru with a read-only descriptor. 
6830 */ 6831 return -EBADF; 6832 } 6833 6834 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6835 return -ENOTSUP; 6836 } 6837 6838 bdev_io = bdev_channel_get_io(channel); 6839 if (!bdev_io) { 6840 return -ENOMEM; 6841 } 6842 6843 bdev_io->internal.ch = channel; 6844 bdev_io->internal.desc = desc; 6845 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6846 bdev_io->u.nvme_passthru.cmd = *cmd; 6847 bdev_io->u.nvme_passthru.buf = buf; 6848 bdev_io->u.nvme_passthru.nbytes = nbytes; 6849 bdev_io->u.nvme_passthru.md_buf = NULL; 6850 bdev_io->u.nvme_passthru.md_len = 0; 6851 6852 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6853 6854 bdev_io_submit(bdev_io); 6855 return 0; 6856 } 6857 6858 int 6859 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6860 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6861 spdk_bdev_io_completion_cb cb, void *cb_arg) 6862 { 6863 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6864 struct spdk_bdev_io *bdev_io; 6865 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6866 6867 if (!desc->write) { 6868 /* 6869 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6870 * to easily determine if the command is a read or write, but for now just 6871 * do not allow io_passthru with a read-only descriptor. 6872 */ 6873 return -EBADF; 6874 } 6875 6876 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6877 return -ENOTSUP; 6878 } 6879 6880 bdev_io = bdev_channel_get_io(channel); 6881 if (!bdev_io) { 6882 return -ENOMEM; 6883 } 6884 6885 bdev_io->internal.ch = channel; 6886 bdev_io->internal.desc = desc; 6887 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6888 bdev_io->u.nvme_passthru.cmd = *cmd; 6889 bdev_io->u.nvme_passthru.buf = buf; 6890 bdev_io->u.nvme_passthru.nbytes = nbytes; 6891 bdev_io->u.nvme_passthru.md_buf = md_buf; 6892 bdev_io->u.nvme_passthru.md_len = md_len; 6893 6894 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6895 6896 bdev_io_submit(bdev_io); 6897 return 0; 6898 } 6899 6900 int 6901 spdk_bdev_nvme_iov_passthru_md(struct spdk_bdev_desc *desc, 6902 struct spdk_io_channel *ch, 6903 const struct spdk_nvme_cmd *cmd, 6904 struct iovec *iov, int iovcnt, size_t nbytes, 6905 void *md_buf, size_t md_len, 6906 spdk_bdev_io_completion_cb cb, void *cb_arg) 6907 { 6908 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6909 struct spdk_bdev_io *bdev_io; 6910 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6911 6912 if (!desc->write) { 6913 /* 6914 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6915 * to easily determine if the command is a read or write, but for now just 6916 * do not allow io_passthru with a read-only descriptor. 
6917 */ 6918 return -EBADF; 6919 } 6920 6921 if (md_buf && spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6922 return -ENOTSUP; 6923 } else if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6924 return -ENOTSUP; 6925 } 6926 6927 bdev_io = bdev_channel_get_io(channel); 6928 if (!bdev_io) { 6929 return -ENOMEM; 6930 } 6931 6932 bdev_io->internal.ch = channel; 6933 bdev_io->internal.desc = desc; 6934 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IOV_MD; 6935 bdev_io->u.nvme_passthru.cmd = *cmd; 6936 bdev_io->u.nvme_passthru.iovs = iov; 6937 bdev_io->u.nvme_passthru.iovcnt = iovcnt; 6938 bdev_io->u.nvme_passthru.nbytes = nbytes; 6939 bdev_io->u.nvme_passthru.md_buf = md_buf; 6940 bdev_io->u.nvme_passthru.md_len = md_len; 6941 6942 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6943 6944 bdev_io_submit(bdev_io); 6945 return 0; 6946 } 6947 6948 static void bdev_abort_retry(void *ctx); 6949 static void bdev_abort(struct spdk_bdev_io *parent_io); 6950 6951 static void 6952 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6953 { 6954 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6955 struct spdk_bdev_io *parent_io = cb_arg; 6956 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6957 6958 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6959 6960 spdk_bdev_free_io(bdev_io); 6961 6962 if (!success) { 6963 /* Check if the target I/O completed in the meantime. */ 6964 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6965 if (tmp_io == bio_to_abort) { 6966 break; 6967 } 6968 } 6969 6970 /* If the target I/O still exists, set the parent to failed. */ 6971 if (tmp_io != NULL) { 6972 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6973 } 6974 } 6975 6976 assert(parent_io->internal.f.split); 6977 6978 parent_io->internal.split.outstanding--; 6979 if (parent_io->internal.split.outstanding == 0) { 6980 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6981 bdev_abort_retry(parent_io); 6982 } else { 6983 bdev_io_complete(parent_io); 6984 } 6985 } 6986 } 6987 6988 static int 6989 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6990 struct spdk_bdev_io *bio_to_abort, 6991 spdk_bdev_io_completion_cb cb, void *cb_arg) 6992 { 6993 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6994 struct spdk_bdev_io *bdev_io; 6995 6996 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 6997 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 6998 /* TODO: Abort reset or abort request. */ 6999 return -ENOTSUP; 7000 } 7001 7002 bdev_io = bdev_channel_get_io(channel); 7003 if (bdev_io == NULL) { 7004 return -ENOMEM; 7005 } 7006 7007 bdev_io->internal.ch = channel; 7008 bdev_io->internal.desc = desc; 7009 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7010 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7011 7012 if (bdev->split_on_optimal_io_boundary && bio_to_abort->internal.f.split) { 7013 assert(bdev_io_should_split(bio_to_abort)); 7014 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 7015 7016 /* Parent abort request is not submitted directly, but to manage its 7017 * execution add it to the submitted list here. 7018 */ 7019 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7020 bdev_ch_add_to_io_submitted(bdev_io); 7021 7022 bdev_abort(bdev_io); 7023 7024 return 0; 7025 } 7026 7027 bdev_io->u.abort.bio_to_abort = bio_to_abort; 7028 7029 /* Submit the abort request to the underlying bdev module. 
*/ 7030 bdev_io_submit(bdev_io); 7031 7032 return 0; 7033 } 7034 7035 static bool 7036 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq) 7037 { 7038 struct spdk_bdev_io *iter; 7039 7040 TAILQ_FOREACH(iter, tailq, internal.link) { 7041 if (iter == bdev_io) { 7042 return true; 7043 } 7044 } 7045 7046 return false; 7047 } 7048 7049 static uint32_t 7050 _bdev_abort(struct spdk_bdev_io *parent_io) 7051 { 7052 struct spdk_bdev_desc *desc = parent_io->internal.desc; 7053 struct spdk_bdev_channel *channel = parent_io->internal.ch; 7054 void *bio_cb_arg; 7055 struct spdk_bdev_io *bio_to_abort; 7056 uint32_t matched_ios; 7057 int rc; 7058 7059 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 7060 7061 /* matched_ios is returned and will be kept by the caller. 7062 * 7063 * This function will be used for two cases, 1) the same cb_arg is used for 7064 * multiple I/Os, 2) a single large I/O is split into smaller ones. 7065 * Incrementing split_outstanding directly here may confuse readers especially 7066 * for the 1st case. 7067 * 7068 * Completion of I/O abort is processed after stack unwinding. Hence this trick 7069 * works as expected. 7070 */ 7071 matched_ios = 0; 7072 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 7073 7074 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 7075 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 7076 continue; 7077 } 7078 7079 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 7080 /* Any I/O which was submitted after this abort command should be excluded. */ 7081 continue; 7082 } 7083 7084 /* We can't abort a request that's being pushed/pulled or executed by accel */ 7085 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) || 7086 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) { 7087 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7088 break; 7089 } 7090 7091 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 7092 if (rc != 0) { 7093 if (rc == -ENOMEM) { 7094 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 7095 } else { 7096 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7097 } 7098 break; 7099 } 7100 matched_ios++; 7101 } 7102 7103 return matched_ios; 7104 } 7105 7106 static void 7107 bdev_abort_retry(void *ctx) 7108 { 7109 struct spdk_bdev_io *parent_io = ctx; 7110 uint32_t matched_ios; 7111 7112 matched_ios = _bdev_abort(parent_io); 7113 7114 if (matched_ios == 0) { 7115 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7116 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 7117 } else { 7118 /* For retry, the case that no target I/O was found is success 7119 * because it means target I/Os completed in the meantime. 7120 */ 7121 bdev_io_complete(parent_io); 7122 } 7123 return; 7124 } 7125 7126 /* Use split_outstanding to manage the progress of aborting I/Os. */ 7127 parent_io->internal.f.split = true; 7128 parent_io->internal.split.outstanding = matched_ios; 7129 } 7130 7131 static void 7132 bdev_abort(struct spdk_bdev_io *parent_io) 7133 { 7134 uint32_t matched_ios; 7135 7136 matched_ios = _bdev_abort(parent_io); 7137 7138 if (matched_ios == 0) { 7139 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7140 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 7141 } else { 7142 /* The case the no target I/O was found is failure. 
*/ 7143 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7144 bdev_io_complete(parent_io); 7145 } 7146 return; 7147 } 7148 7149 /* Use split_outstanding to manage the progress of aborting I/Os. */ 7150 parent_io->internal.f.split = true; 7151 parent_io->internal.split.outstanding = matched_ios; 7152 } 7153 7154 int 7155 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 7156 void *bio_cb_arg, 7157 spdk_bdev_io_completion_cb cb, void *cb_arg) 7158 { 7159 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7160 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7161 struct spdk_bdev_io *bdev_io; 7162 7163 if (bio_cb_arg == NULL) { 7164 return -EINVAL; 7165 } 7166 7167 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 7168 return -ENOTSUP; 7169 } 7170 7171 bdev_io = bdev_channel_get_io(channel); 7172 if (bdev_io == NULL) { 7173 return -ENOMEM; 7174 } 7175 7176 bdev_io->internal.ch = channel; 7177 bdev_io->internal.desc = desc; 7178 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7179 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7180 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7181 7182 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 7183 7184 /* Parent abort request is not submitted directly, but to manage its execution, 7185 * add it to the submitted list here. 7186 */ 7187 bdev_ch_add_to_io_submitted(bdev_io); 7188 7189 bdev_abort(bdev_io); 7190 7191 return 0; 7192 } 7193 7194 int 7195 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 7196 struct spdk_bdev_io_wait_entry *entry) 7197 { 7198 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7199 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 7200 7201 if (bdev != entry->bdev) { 7202 SPDK_ERRLOG("bdevs do not match\n"); 7203 return -EINVAL; 7204 } 7205 7206 if (mgmt_ch->per_thread_cache_count > 0) { 7207 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 7208 return -EINVAL; 7209 } 7210 7211 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 7212 return 0; 7213 } 7214 7215 static inline void 7216 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 7217 { 7218 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 7219 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 7220 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 7221 uint32_t blocklen = bdev_io->bdev->blocklen; 7222 7223 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7224 switch (bdev_io->type) { 7225 case SPDK_BDEV_IO_TYPE_READ: 7226 io_stat->bytes_read += num_blocks * blocklen; 7227 io_stat->num_read_ops++; 7228 io_stat->read_latency_ticks += tsc_diff; 7229 if (io_stat->max_read_latency_ticks < tsc_diff) { 7230 io_stat->max_read_latency_ticks = tsc_diff; 7231 } 7232 if (io_stat->min_read_latency_ticks > tsc_diff) { 7233 io_stat->min_read_latency_ticks = tsc_diff; 7234 } 7235 break; 7236 case SPDK_BDEV_IO_TYPE_WRITE: 7237 io_stat->bytes_written += num_blocks * blocklen; 7238 io_stat->num_write_ops++; 7239 io_stat->write_latency_ticks += tsc_diff; 7240 if (io_stat->max_write_latency_ticks < tsc_diff) { 7241 io_stat->max_write_latency_ticks = tsc_diff; 7242 } 7243 if (io_stat->min_write_latency_ticks > tsc_diff) { 7244 io_stat->min_write_latency_ticks = tsc_diff; 7245 } 7246 break; 7247 case SPDK_BDEV_IO_TYPE_UNMAP: 7248 io_stat->bytes_unmapped += num_blocks * blocklen; 7249 io_stat->num_unmap_ops++; 7250 io_stat->unmap_latency_ticks += tsc_diff; 7251 if 
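/*
 * Illustrative sketch of the abort API defined above: every I/O that was
 * submitted earlier on this channel with the given cb_arg (io_ctx below, a
 * hypothetical name) and is still outstanding is aborted as a group.
 *
 *	rc = spdk_bdev_abort(desc, io_ch, io_ctx, abort_done, NULL);
 *	if (rc == -ENOTSUP) {
 *		// The underlying bdev module does not implement ABORT.
 *	}
 */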
(io_stat->max_unmap_latency_ticks < tsc_diff) { 7252 io_stat->max_unmap_latency_ticks = tsc_diff; 7253 } 7254 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 7255 io_stat->min_unmap_latency_ticks = tsc_diff; 7256 } 7257 break; 7258 case SPDK_BDEV_IO_TYPE_ZCOPY: 7259 /* Track the data in the start phase only */ 7260 if (bdev_io->u.bdev.zcopy.start) { 7261 if (bdev_io->u.bdev.zcopy.populate) { 7262 io_stat->bytes_read += num_blocks * blocklen; 7263 io_stat->num_read_ops++; 7264 io_stat->read_latency_ticks += tsc_diff; 7265 if (io_stat->max_read_latency_ticks < tsc_diff) { 7266 io_stat->max_read_latency_ticks = tsc_diff; 7267 } 7268 if (io_stat->min_read_latency_ticks > tsc_diff) { 7269 io_stat->min_read_latency_ticks = tsc_diff; 7270 } 7271 } else { 7272 io_stat->bytes_written += num_blocks * blocklen; 7273 io_stat->num_write_ops++; 7274 io_stat->write_latency_ticks += tsc_diff; 7275 if (io_stat->max_write_latency_ticks < tsc_diff) { 7276 io_stat->max_write_latency_ticks = tsc_diff; 7277 } 7278 if (io_stat->min_write_latency_ticks > tsc_diff) { 7279 io_stat->min_write_latency_ticks = tsc_diff; 7280 } 7281 } 7282 } 7283 break; 7284 case SPDK_BDEV_IO_TYPE_COPY: 7285 io_stat->bytes_copied += num_blocks * blocklen; 7286 io_stat->num_copy_ops++; 7287 bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; 7288 if (io_stat->max_copy_latency_ticks < tsc_diff) { 7289 io_stat->max_copy_latency_ticks = tsc_diff; 7290 } 7291 if (io_stat->min_copy_latency_ticks > tsc_diff) { 7292 io_stat->min_copy_latency_ticks = tsc_diff; 7293 } 7294 break; 7295 default: 7296 break; 7297 } 7298 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 7299 io_stat = bdev_io->bdev->internal.stat; 7300 assert(io_stat->io_error != NULL); 7301 7302 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 7303 io_stat->io_error->error_status[-io_status - 1]++; 7304 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 7305 } 7306 7307 #ifdef SPDK_CONFIG_VTUNE 7308 uint64_t now_tsc = spdk_get_ticks(); 7309 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 7310 uint64_t data[5]; 7311 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 7312 7313 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 7314 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 7315 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 7316 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 7317 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 
7318 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 7319 7320 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 7321 __itt_metadata_u64, 5, data); 7322 7323 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 7324 bdev_io->internal.ch->start_tsc = now_tsc; 7325 } 7326 #endif 7327 } 7328 7329 static inline void 7330 _bdev_io_complete(void *ctx) 7331 { 7332 struct spdk_bdev_io *bdev_io = ctx; 7333 7334 if (spdk_unlikely(bdev_io_use_accel_sequence(bdev_io))) { 7335 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7336 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 7337 } 7338 7339 assert(bdev_io->internal.cb != NULL); 7340 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 7341 7342 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 7343 bdev_io->internal.caller_ctx); 7344 } 7345 7346 static inline void 7347 bdev_io_complete(void *ctx) 7348 { 7349 struct spdk_bdev_io *bdev_io = ctx; 7350 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7351 uint64_t tsc, tsc_diff; 7352 7353 if (spdk_unlikely(bdev_io->internal.in_submit_request)) { 7354 /* 7355 * Defer completion to avoid potential infinite recursion if the 7356 * user's completion callback issues a new I/O. 7357 */ 7358 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7359 bdev_io_complete, bdev_io); 7360 return; 7361 } 7362 7363 tsc = spdk_get_ticks(); 7364 tsc_diff = tsc - bdev_io->internal.submit_tsc; 7365 7366 bdev_ch_remove_from_io_submitted(bdev_io); 7367 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, bdev_ch->trace_id, 0, (uintptr_t)bdev_io, 7368 bdev_io->internal.caller_ctx, bdev_ch->queue_depth); 7369 7370 if (bdev_ch->histogram) { 7371 if (bdev_io->bdev->internal.histogram_io_type == 0 || 7372 bdev_io->bdev->internal.histogram_io_type == bdev_io->type) { 7373 /* 7374 * Tally all I/O types if the histogram_io_type is set to 0. 7375 */ 7376 spdk_histogram_data_tally(bdev_ch->histogram, tsc_diff); 7377 } 7378 } 7379 7380 bdev_io_update_io_stat(bdev_io, tsc_diff); 7381 _bdev_io_complete(bdev_io); 7382 } 7383 7384 /* The difference between this function and bdev_io_complete() is that this should be called to 7385 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 7386 * io_submitted list and don't have submit_tsc updated. 7387 */ 7388 static inline void 7389 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 7390 { 7391 /* Since the IO hasn't been submitted it's bound to be failed */ 7392 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7393 7394 /* At this point we don't know if the IO is completed from submission context or not, but, 7395 * since this is an error path, we can always do an spdk_thread_send_msg(). 
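 * Deferring through spdk_thread_send_msg() also guarantees that the caller's
 * completion callback never runs before the submitting function has returned,
 * mirroring the deferral done in bdev_io_complete() for in_submit_request.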
*/ 7396 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7397 _bdev_io_complete, bdev_io); 7398 } 7399 7400 static void bdev_destroy_cb(void *io_device); 7401 7402 static void 7403 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7404 { 7405 struct spdk_bdev_io *bdev_io = _ctx; 7406 7407 if (bdev_io->u.reset.ch_ref != NULL) { 7408 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7409 bdev_io->u.reset.ch_ref = NULL; 7410 } 7411 7412 bdev_io_complete(bdev_io); 7413 7414 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7415 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7416 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7417 } 7418 } 7419 7420 static void 7421 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7422 struct spdk_io_channel *_ch, void *_ctx) 7423 { 7424 struct spdk_bdev_io *bdev_io = _ctx; 7425 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7426 struct spdk_bdev_io *queued_reset; 7427 7428 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7429 while (!TAILQ_EMPTY(&ch->queued_resets)) { 7430 queued_reset = TAILQ_FIRST(&ch->queued_resets); 7431 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 7432 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 7433 } 7434 7435 spdk_bdev_for_each_channel_continue(i, 0); 7436 } 7437 7438 static void 7439 bdev_io_complete_sequence_cb(void *ctx, int status) 7440 { 7441 struct spdk_bdev_io *bdev_io = ctx; 7442 7443 /* u.bdev.accel_sequence should have already been cleared at this point */ 7444 assert(bdev_io->u.bdev.accel_sequence == NULL); 7445 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7446 bdev_io->internal.f.has_accel_sequence = false; 7447 7448 if (spdk_unlikely(status != 0)) { 7449 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7450 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7451 } 7452 7453 bdev_io_complete(bdev_io); 7454 } 7455 7456 void 7457 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7458 { 7459 struct spdk_bdev *bdev = bdev_io->bdev; 7460 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7461 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7462 7463 if (spdk_unlikely(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING)) { 7464 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7465 spdk_bdev_get_module_name(bdev), 7466 bdev_io_status_get_string(bdev_io->internal.status)); 7467 assert(false); 7468 } 7469 bdev_io->internal.status = status; 7470 7471 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7472 bool unlock_channels = false; 7473 7474 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 7475 SPDK_ERRLOG("NOMEM returned for reset\n"); 7476 } 7477 spdk_spin_lock(&bdev->internal.spinlock); 7478 if (bdev_io == bdev->internal.reset_in_progress) { 7479 bdev->internal.reset_in_progress = NULL; 7480 unlock_channels = true; 7481 } 7482 spdk_spin_unlock(&bdev->internal.spinlock); 7483 7484 if (unlock_channels) { 7485 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 7486 bdev_reset_complete); 7487 return; 7488 } 7489 } else { 7490 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7491 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7492 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7493 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7494 return; 7495 } else if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 
0 && 7496 !bdev_io_use_accel_sequence(bdev_io))) { 7497 _bdev_io_push_bounce_data_buffer(bdev_io, 7498 _bdev_io_complete_push_bounce_done); 7499 /* bdev IO will be completed in the callback */ 7500 return; 7501 } 7502 } 7503 7504 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7505 return; 7506 } 7507 } 7508 7509 bdev_io_complete(bdev_io); 7510 } 7511 7512 void 7513 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 7514 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7515 { 7516 enum spdk_bdev_io_status status; 7517 7518 if (sc == SPDK_SCSI_STATUS_GOOD) { 7519 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7520 } else { 7521 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7522 bdev_io->internal.error.scsi.sc = sc; 7523 bdev_io->internal.error.scsi.sk = sk; 7524 bdev_io->internal.error.scsi.asc = asc; 7525 bdev_io->internal.error.scsi.ascq = ascq; 7526 } 7527 7528 spdk_bdev_io_complete(bdev_io, status); 7529 } 7530 7531 void 7532 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7533 int *sc, int *sk, int *asc, int *ascq) 7534 { 7535 assert(sc != NULL); 7536 assert(sk != NULL); 7537 assert(asc != NULL); 7538 assert(ascq != NULL); 7539 7540 switch (bdev_io->internal.status) { 7541 case SPDK_BDEV_IO_STATUS_SUCCESS: 7542 *sc = SPDK_SCSI_STATUS_GOOD; 7543 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7544 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7545 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7546 break; 7547 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7548 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7549 break; 7550 case SPDK_BDEV_IO_STATUS_MISCOMPARE: 7551 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7552 *sk = SPDK_SCSI_SENSE_MISCOMPARE; 7553 *asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION; 7554 *ascq = bdev_io->internal.error.scsi.ascq; 7555 break; 7556 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7557 *sc = bdev_io->internal.error.scsi.sc; 7558 *sk = bdev_io->internal.error.scsi.sk; 7559 *asc = bdev_io->internal.error.scsi.asc; 7560 *ascq = bdev_io->internal.error.scsi.ascq; 7561 break; 7562 default: 7563 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7564 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7565 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7566 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7567 break; 7568 } 7569 } 7570 7571 void 7572 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7573 { 7574 enum spdk_bdev_io_status status; 7575 7576 if (aio_result == 0) { 7577 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7578 } else { 7579 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7580 } 7581 7582 bdev_io->internal.error.aio_result = aio_result; 7583 7584 spdk_bdev_io_complete(bdev_io, status); 7585 } 7586 7587 void 7588 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7589 { 7590 assert(aio_result != NULL); 7591 7592 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7593 *aio_result = bdev_io->internal.error.aio_result; 7594 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7595 *aio_result = 0; 7596 } else { 7597 *aio_result = -EIO; 7598 } 7599 } 7600 7601 void 7602 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7603 { 7604 enum spdk_bdev_io_status status; 7605 7606 if (spdk_likely(sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS)) { 7607 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7608 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7609 status = 
SPDK_BDEV_IO_STATUS_ABORTED; 7610 } else { 7611 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7612 } 7613 7614 bdev_io->internal.error.nvme.cdw0 = cdw0; 7615 bdev_io->internal.error.nvme.sct = sct; 7616 bdev_io->internal.error.nvme.sc = sc; 7617 7618 spdk_bdev_io_complete(bdev_io, status); 7619 } 7620 7621 void 7622 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7623 { 7624 assert(sct != NULL); 7625 assert(sc != NULL); 7626 assert(cdw0 != NULL); 7627 7628 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7629 *sct = SPDK_NVME_SCT_GENERIC; 7630 *sc = SPDK_NVME_SC_SUCCESS; 7631 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7632 *cdw0 = 0; 7633 } else { 7634 *cdw0 = 1U; 7635 } 7636 return; 7637 } 7638 7639 if (spdk_likely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7640 *sct = SPDK_NVME_SCT_GENERIC; 7641 *sc = SPDK_NVME_SC_SUCCESS; 7642 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7643 *sct = bdev_io->internal.error.nvme.sct; 7644 *sc = bdev_io->internal.error.nvme.sc; 7645 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7646 *sct = SPDK_NVME_SCT_GENERIC; 7647 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7648 } else { 7649 *sct = SPDK_NVME_SCT_GENERIC; 7650 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7651 } 7652 7653 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7654 } 7655 7656 void 7657 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7658 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7659 { 7660 assert(first_sct != NULL); 7661 assert(first_sc != NULL); 7662 assert(second_sct != NULL); 7663 assert(second_sc != NULL); 7664 assert(cdw0 != NULL); 7665 7666 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7667 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7668 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7669 *first_sct = bdev_io->internal.error.nvme.sct; 7670 *first_sc = bdev_io->internal.error.nvme.sc; 7671 *second_sct = SPDK_NVME_SCT_GENERIC; 7672 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7673 } else { 7674 *first_sct = SPDK_NVME_SCT_GENERIC; 7675 *first_sc = SPDK_NVME_SC_SUCCESS; 7676 *second_sct = bdev_io->internal.error.nvme.sct; 7677 *second_sc = bdev_io->internal.error.nvme.sc; 7678 } 7679 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7680 *first_sct = SPDK_NVME_SCT_GENERIC; 7681 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7682 *second_sct = SPDK_NVME_SCT_GENERIC; 7683 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7684 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7685 *first_sct = SPDK_NVME_SCT_GENERIC; 7686 *first_sc = SPDK_NVME_SC_SUCCESS; 7687 *second_sct = SPDK_NVME_SCT_GENERIC; 7688 *second_sc = SPDK_NVME_SC_SUCCESS; 7689 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7690 *first_sct = SPDK_NVME_SCT_GENERIC; 7691 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7692 *second_sct = SPDK_NVME_SCT_GENERIC; 7693 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7694 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7695 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7696 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 7697 *second_sct = SPDK_NVME_SCT_GENERIC; 7698 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7699 } else { 7700 *first_sct = SPDK_NVME_SCT_GENERIC; 7701 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7702 *second_sct = 
SPDK_NVME_SCT_GENERIC; 7703 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7704 } 7705 7706 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7707 } 7708 7709 void 7710 spdk_bdev_io_complete_base_io_status(struct spdk_bdev_io *bdev_io, 7711 const struct spdk_bdev_io *base_io) 7712 { 7713 switch (base_io->internal.status) { 7714 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7715 spdk_bdev_io_complete_nvme_status(bdev_io, 7716 base_io->internal.error.nvme.cdw0, 7717 base_io->internal.error.nvme.sct, 7718 base_io->internal.error.nvme.sc); 7719 break; 7720 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7721 spdk_bdev_io_complete_scsi_status(bdev_io, 7722 base_io->internal.error.scsi.sc, 7723 base_io->internal.error.scsi.sk, 7724 base_io->internal.error.scsi.asc, 7725 base_io->internal.error.scsi.ascq); 7726 break; 7727 case SPDK_BDEV_IO_STATUS_AIO_ERROR: 7728 spdk_bdev_io_complete_aio_status(bdev_io, base_io->internal.error.aio_result); 7729 break; 7730 default: 7731 spdk_bdev_io_complete(bdev_io, base_io->internal.status); 7732 break; 7733 } 7734 } 7735 7736 struct spdk_thread * 7737 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7738 { 7739 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7740 } 7741 7742 struct spdk_io_channel * 7743 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7744 { 7745 return bdev_io->internal.ch->channel; 7746 } 7747 7748 static int 7749 bdev_register(struct spdk_bdev *bdev) 7750 { 7751 char *bdev_name; 7752 char uuid[SPDK_UUID_STRING_LEN]; 7753 struct spdk_iobuf_opts iobuf_opts; 7754 int ret; 7755 7756 assert(bdev->module != NULL); 7757 7758 if (!bdev->name) { 7759 SPDK_ERRLOG("Bdev name is NULL\n"); 7760 return -EINVAL; 7761 } 7762 7763 if (!strlen(bdev->name)) { 7764 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7765 return -EINVAL; 7766 } 7767 7768 /* Users often register their own I/O devices using the bdev name. In 7769 * order to avoid conflicts, prepend bdev_. */ 7770 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7771 if (!bdev_name) { 7772 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7773 return -ENOMEM; 7774 } 7775 7776 bdev->internal.stat = bdev_alloc_io_stat(true); 7777 if (!bdev->internal.stat) { 7778 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7779 free(bdev_name); 7780 return -ENOMEM; 7781 } 7782 7783 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7784 bdev->internal.measured_queue_depth = UINT64_MAX; 7785 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7786 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7787 bdev->internal.qd_poller = NULL; 7788 bdev->internal.qos = NULL; 7789 7790 TAILQ_INIT(&bdev->internal.open_descs); 7791 TAILQ_INIT(&bdev->internal.locked_ranges); 7792 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7793 TAILQ_INIT(&bdev->aliases); 7794 7795 /* UUID may be specified by the user or defined by bdev itself. 7796 * Otherwise it will be generated here, so this field will never be empty. 
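	 * The UUID is also added as an alias below when it differs from the bdev
	 * name, so every registered bdev can be looked up by its UUID string.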
*/ 7797 if (spdk_uuid_is_null(&bdev->uuid)) { 7798 spdk_uuid_generate(&bdev->uuid); 7799 } 7800 7801 /* Add the UUID alias only if it's different than the name */ 7802 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7803 if (strcmp(bdev->name, uuid) != 0) { 7804 ret = spdk_bdev_alias_add(bdev, uuid); 7805 if (ret != 0) { 7806 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7807 bdev_free_io_stat(bdev->internal.stat); 7808 free(bdev_name); 7809 return ret; 7810 } 7811 } 7812 7813 spdk_iobuf_get_opts(&iobuf_opts, sizeof(iobuf_opts)); 7814 if (spdk_bdev_get_buf_align(bdev) > 1) { 7815 bdev->max_rw_size = spdk_min(bdev->max_rw_size ? bdev->max_rw_size : UINT32_MAX, 7816 iobuf_opts.large_bufsize / bdev->blocklen); 7817 } 7818 7819 /* If the user didn't specify a write unit size, set it to one. */ 7820 if (bdev->write_unit_size == 0) { 7821 bdev->write_unit_size = 1; 7822 } 7823 7824 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7825 if (bdev->acwu == 0) { 7826 bdev->acwu = bdev->write_unit_size; 7827 } 7828 7829 if (bdev->phys_blocklen == 0) { 7830 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7831 } 7832 7833 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 7834 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 7835 } 7836 7837 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 7838 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 7839 } 7840 7841 bdev->internal.reset_in_progress = NULL; 7842 bdev->internal.qd_poll_in_progress = false; 7843 bdev->internal.period = 0; 7844 bdev->internal.new_period = 0; 7845 bdev->internal.trace_id = spdk_trace_register_owner(OWNER_TYPE_BDEV, bdev_name); 7846 7847 /* 7848 * Initialize spinlock before registering IO device because spinlock is used in 7849 * bdev_channel_create 7850 */ 7851 spdk_spin_init(&bdev->internal.spinlock); 7852 7853 spdk_io_device_register(__bdev_to_io_dev(bdev), 7854 bdev_channel_create, bdev_channel_destroy, 7855 sizeof(struct spdk_bdev_channel), 7856 bdev_name); 7857 7858 /* 7859 * Register bdev name only after the bdev object is ready. 7860 * After bdev_name_add returns, it is possible for other threads to start using the bdev, 7861 * create IO channels... 
7862 */ 7863 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 7864 if (ret != 0) { 7865 spdk_io_device_unregister(__bdev_to_io_dev(bdev), NULL); 7866 bdev_free_io_stat(bdev->internal.stat); 7867 spdk_spin_destroy(&bdev->internal.spinlock); 7868 free(bdev_name); 7869 return ret; 7870 } 7871 7872 free(bdev_name); 7873 7874 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 7875 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 7876 7877 return 0; 7878 } 7879 7880 static void 7881 bdev_destroy_cb(void *io_device) 7882 { 7883 int rc; 7884 struct spdk_bdev *bdev; 7885 spdk_bdev_unregister_cb cb_fn; 7886 void *cb_arg; 7887 7888 bdev = __bdev_from_io_dev(io_device); 7889 7890 if (bdev->internal.unregister_td != spdk_get_thread()) { 7891 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 7892 return; 7893 } 7894 7895 cb_fn = bdev->internal.unregister_cb; 7896 cb_arg = bdev->internal.unregister_ctx; 7897 7898 spdk_spin_destroy(&bdev->internal.spinlock); 7899 free(bdev->internal.qos); 7900 bdev_free_io_stat(bdev->internal.stat); 7901 spdk_trace_unregister_owner(bdev->internal.trace_id); 7902 7903 rc = bdev->fn_table->destruct(bdev->ctxt); 7904 if (rc < 0) { 7905 SPDK_ERRLOG("destruct failed\n"); 7906 } 7907 if (rc <= 0 && cb_fn != NULL) { 7908 cb_fn(cb_arg, rc); 7909 } 7910 } 7911 7912 void 7913 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 7914 { 7915 if (bdev->internal.unregister_cb != NULL) { 7916 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 7917 } 7918 } 7919 7920 static void 7921 _remove_notify(void *arg) 7922 { 7923 struct spdk_bdev_desc *desc = arg; 7924 7925 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 7926 } 7927 7928 /* returns: 0 - bdev removed and ready to be destructed. 7929 * -EBUSY - bdev can't be destructed yet. */ 7930 static int 7931 bdev_unregister_unsafe(struct spdk_bdev *bdev) 7932 { 7933 struct spdk_bdev_desc *desc, *tmp; 7934 int rc = 0; 7935 char uuid[SPDK_UUID_STRING_LEN]; 7936 7937 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 7938 assert(spdk_spin_held(&bdev->internal.spinlock)); 7939 7940 /* Notify each descriptor about hotremoval */ 7941 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 7942 rc = -EBUSY; 7943 /* 7944 * Defer invocation of the event_cb to a separate message that will 7945 * run later on its thread. This ensures this context unwinds and 7946 * we don't recursively unregister this bdev again if the event_cb 7947 * immediately closes its descriptor. 7948 */ 7949 event_notify(desc, _remove_notify); 7950 } 7951 7952 /* If there are no descriptors, proceed removing the bdev */ 7953 if (rc == 0) { 7954 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 7955 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 7956 7957 /* Delete the name and the UUID alias */ 7958 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7959 bdev_name_del_unsafe(&bdev->internal.bdev_name); 7960 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 7961 7962 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 7963 7964 if (bdev->internal.reset_in_progress != NULL) { 7965 /* If reset is in progress, let the completion callback for reset 7966 * unregister the bdev. 
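			 * bdev_reset_complete() re-checks SPDK_BDEV_STATUS_REMOVING with an
			 * empty open_descs list and then performs the
			 * spdk_io_device_unregister() itself.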
7967 */ 7968 rc = -EBUSY; 7969 } 7970 } 7971 7972 return rc; 7973 } 7974 7975 static void 7976 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7977 struct spdk_io_channel *io_ch, void *_ctx) 7978 { 7979 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 7980 7981 bdev_channel_abort_queued_ios(bdev_ch); 7982 spdk_bdev_for_each_channel_continue(i, 0); 7983 } 7984 7985 static void 7986 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 7987 { 7988 int rc; 7989 7990 spdk_spin_lock(&g_bdev_mgr.spinlock); 7991 spdk_spin_lock(&bdev->internal.spinlock); 7992 /* 7993 * Set the status to REMOVING after completing to abort channels. Otherwise, 7994 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 7995 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 7996 * may fail. 7997 */ 7998 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 7999 rc = bdev_unregister_unsafe(bdev); 8000 spdk_spin_unlock(&bdev->internal.spinlock); 8001 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8002 8003 if (rc == 0) { 8004 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8005 } 8006 } 8007 8008 void 8009 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 8010 { 8011 struct spdk_thread *thread; 8012 8013 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 8014 8015 thread = spdk_get_thread(); 8016 if (!thread) { 8017 /* The user called this from a non-SPDK thread. */ 8018 if (cb_fn != NULL) { 8019 cb_fn(cb_arg, -ENOTSUP); 8020 } 8021 return; 8022 } 8023 8024 spdk_spin_lock(&g_bdev_mgr.spinlock); 8025 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 8026 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8027 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8028 if (cb_fn) { 8029 cb_fn(cb_arg, -EBUSY); 8030 } 8031 return; 8032 } 8033 8034 spdk_spin_lock(&bdev->internal.spinlock); 8035 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 8036 bdev->internal.unregister_cb = cb_fn; 8037 bdev->internal.unregister_ctx = cb_arg; 8038 bdev->internal.unregister_td = thread; 8039 spdk_spin_unlock(&bdev->internal.spinlock); 8040 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8041 8042 spdk_bdev_set_qd_sampling_period(bdev, 0); 8043 8044 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 8045 bdev_unregister); 8046 } 8047 8048 int 8049 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 8050 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 8051 { 8052 struct spdk_bdev_desc *desc; 8053 struct spdk_bdev *bdev; 8054 int rc; 8055 8056 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 8057 if (rc != 0) { 8058 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 8059 return rc; 8060 } 8061 8062 bdev = spdk_bdev_desc_get_bdev(desc); 8063 8064 if (bdev->module != module) { 8065 spdk_bdev_close(desc); 8066 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 8067 bdev_name); 8068 return -ENODEV; 8069 } 8070 8071 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 8072 8073 spdk_bdev_close(desc); 8074 8075 return 0; 8076 } 8077 8078 static int 8079 bdev_start_qos(struct spdk_bdev *bdev) 8080 { 8081 struct set_qos_limit_ctx *ctx; 8082 8083 /* Enable QoS */ 8084 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 8085 ctx = calloc(1, sizeof(*ctx)); 8086 if (ctx == NULL) { 8087 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 8088 return -ENOMEM; 8089 } 8090 
ctx->bdev = bdev; 8091 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 8092 } 8093 8094 return 0; 8095 } 8096 8097 static void 8098 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 8099 struct spdk_bdev *bdev) 8100 { 8101 enum spdk_bdev_claim_type type; 8102 const char *typename, *modname; 8103 extern struct spdk_log_flag SPDK_LOG_bdev; 8104 8105 assert(spdk_spin_held(&bdev->internal.spinlock)); 8106 8107 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 8108 return; 8109 } 8110 8111 type = bdev->internal.claim_type; 8112 typename = spdk_bdev_claim_get_name(type); 8113 8114 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 8115 modname = bdev->internal.claim.v1.module->name; 8116 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 8117 bdev->name, detail, typename, modname); 8118 return; 8119 } 8120 8121 if (claim_type_is_v2(type)) { 8122 struct spdk_bdev_module_claim *claim; 8123 8124 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 8125 modname = claim->module->name; 8126 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 8127 bdev->name, detail, typename, modname); 8128 } 8129 return; 8130 } 8131 8132 assert(false); 8133 } 8134 8135 static int 8136 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 8137 { 8138 struct spdk_thread *thread; 8139 int rc = 0; 8140 8141 thread = spdk_get_thread(); 8142 if (!thread) { 8143 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 8144 return -ENOTSUP; 8145 } 8146 8147 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8148 spdk_get_thread()); 8149 8150 desc->bdev = bdev; 8151 desc->thread = thread; 8152 desc->write = write; 8153 8154 spdk_spin_lock(&bdev->internal.spinlock); 8155 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 8156 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8157 spdk_spin_unlock(&bdev->internal.spinlock); 8158 return -ENODEV; 8159 } 8160 8161 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8162 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8163 spdk_spin_unlock(&bdev->internal.spinlock); 8164 return -EPERM; 8165 } 8166 8167 rc = bdev_start_qos(bdev); 8168 if (rc != 0) { 8169 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 8170 spdk_spin_unlock(&bdev->internal.spinlock); 8171 return rc; 8172 } 8173 8174 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 8175 8176 spdk_spin_unlock(&bdev->internal.spinlock); 8177 8178 return 0; 8179 } 8180 8181 static int 8182 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 8183 struct spdk_bdev_desc **_desc) 8184 { 8185 struct spdk_bdev_desc *desc; 8186 unsigned int i; 8187 8188 desc = calloc(1, sizeof(*desc)); 8189 if (desc == NULL) { 8190 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 8191 return -ENOMEM; 8192 } 8193 8194 TAILQ_INIT(&desc->pending_media_events); 8195 TAILQ_INIT(&desc->free_media_events); 8196 8197 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 8198 desc->callback.event_fn = event_cb; 8199 desc->callback.ctx = event_ctx; 8200 spdk_spin_init(&desc->spinlock); 8201 8202 if (bdev->media_events) { 8203 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 8204 sizeof(*desc->media_events_buffer)); 8205 if (desc->media_events_buffer == NULL) { 8206 SPDK_ERRLOG("Failed to initialize media event pool\n"); 8207 
bdev_desc_free(desc); 8208 return -ENOMEM; 8209 } 8210 8211 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 8212 TAILQ_INSERT_TAIL(&desc->free_media_events, 8213 &desc->media_events_buffer[i], tailq); 8214 } 8215 } 8216 8217 if (bdev->fn_table->accel_sequence_supported != NULL) { 8218 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 8219 desc->accel_sequence_supported[i] = 8220 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 8221 (enum spdk_bdev_io_type)i); 8222 } 8223 } 8224 8225 *_desc = desc; 8226 8227 return 0; 8228 } 8229 8230 static int 8231 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8232 void *event_ctx, struct spdk_bdev_desc **_desc) 8233 { 8234 struct spdk_bdev_desc *desc; 8235 struct spdk_bdev *bdev; 8236 int rc; 8237 8238 bdev = bdev_get_by_name(bdev_name); 8239 8240 if (bdev == NULL) { 8241 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 8242 return -ENODEV; 8243 } 8244 8245 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 8246 if (rc != 0) { 8247 return rc; 8248 } 8249 8250 rc = bdev_open(bdev, write, desc); 8251 if (rc != 0) { 8252 bdev_desc_free(desc); 8253 desc = NULL; 8254 } 8255 8256 *_desc = desc; 8257 8258 return rc; 8259 } 8260 8261 int 8262 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8263 void *event_ctx, struct spdk_bdev_desc **_desc) 8264 { 8265 int rc; 8266 8267 if (event_cb == NULL) { 8268 SPDK_ERRLOG("Missing event callback function\n"); 8269 return -EINVAL; 8270 } 8271 8272 spdk_spin_lock(&g_bdev_mgr.spinlock); 8273 rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, _desc); 8274 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8275 8276 return rc; 8277 } 8278 8279 struct spdk_bdev_open_async_ctx { 8280 char *bdev_name; 8281 spdk_bdev_event_cb_t event_cb; 8282 void *event_ctx; 8283 bool write; 8284 int rc; 8285 spdk_bdev_open_async_cb_t cb_fn; 8286 void *cb_arg; 8287 struct spdk_bdev_desc *desc; 8288 struct spdk_bdev_open_async_opts opts; 8289 uint64_t start_ticks; 8290 struct spdk_thread *orig_thread; 8291 struct spdk_poller *poller; 8292 TAILQ_ENTRY(spdk_bdev_open_async_ctx) tailq; 8293 }; 8294 8295 static void 8296 bdev_open_async_done(void *arg) 8297 { 8298 struct spdk_bdev_open_async_ctx *ctx = arg; 8299 8300 ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg); 8301 8302 free(ctx->bdev_name); 8303 free(ctx); 8304 } 8305 8306 static void 8307 bdev_open_async_cancel(void *arg) 8308 { 8309 struct spdk_bdev_open_async_ctx *ctx = arg; 8310 8311 assert(ctx->rc == -ESHUTDOWN); 8312 8313 spdk_poller_unregister(&ctx->poller); 8314 8315 bdev_open_async_done(ctx); 8316 } 8317 8318 /* This is called when the bdev library finishes at shutdown. */ 8319 static void 8320 bdev_open_async_fini(void) 8321 { 8322 struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx; 8323 8324 spdk_spin_lock(&g_bdev_mgr.spinlock); 8325 TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) { 8326 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8327 /* 8328 * We have to move to ctx->orig_thread to unregister ctx->poller. 8329 * However, there is a chance that ctx->poller is executed before 8330 * message is executed, which could result in bdev_open_async_done() 8331 * being called twice. To avoid such race condition, set ctx->rc to 8332 * -ESHUTDOWN. 
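		 * _bdev_open_async() returns early when it sees -ESHUTDOWN, so only
		 * bdev_open_async_cancel() completes the context, and it does so once.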
8333 */ 8334 ctx->rc = -ESHUTDOWN; 8335 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx); 8336 } 8337 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8338 } 8339 8340 static int bdev_open_async(void *arg); 8341 8342 static void 8343 _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx) 8344 { 8345 uint64_t timeout_ticks; 8346 8347 if (ctx->rc == -ESHUTDOWN) { 8348 /* This context is being canceled. Do nothing. */ 8349 return; 8350 } 8351 8352 ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx, 8353 &ctx->desc); 8354 if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) { 8355 goto exit; 8356 } 8357 8358 timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull; 8359 if (spdk_get_ticks() >= timeout_ticks) { 8360 SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name); 8361 ctx->rc = -ETIMEDOUT; 8362 goto exit; 8363 } 8364 8365 return; 8366 8367 exit: 8368 spdk_poller_unregister(&ctx->poller); 8369 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8370 8371 /* Completion callback is processed after stack unwinding. */ 8372 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx); 8373 } 8374 8375 static int 8376 bdev_open_async(void *arg) 8377 { 8378 struct spdk_bdev_open_async_ctx *ctx = arg; 8379 8380 spdk_spin_lock(&g_bdev_mgr.spinlock); 8381 8382 _bdev_open_async(ctx); 8383 8384 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8385 8386 return SPDK_POLLER_BUSY; 8387 } 8388 8389 static void 8390 bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts, 8391 struct spdk_bdev_open_async_opts *opts_src, 8392 size_t size) 8393 { 8394 assert(opts); 8395 assert(opts_src); 8396 8397 opts->size = size; 8398 8399 #define SET_FIELD(field) \ 8400 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8401 opts->field = opts_src->field; \ 8402 } \ 8403 8404 SET_FIELD(timeout_ms); 8405 8406 /* Do not remove this statement, you should always update this statement when you adding a new field, 8407 * and do not forget to add the SET_FIELD statement for your added field. 
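	 * The SPDK_STATIC_ASSERT below pins the structure size, so forgetting to
	 * extend this copy helper when a field is added fails at compile time.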
*/ 8408 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size"); 8409 8410 #undef SET_FIELD 8411 } 8412 8413 static void 8414 bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size) 8415 { 8416 assert(opts); 8417 8418 opts->size = size; 8419 8420 #define SET_FIELD(field, value) \ 8421 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8422 opts->field = value; \ 8423 } \ 8424 8425 SET_FIELD(timeout_ms, 0); 8426 8427 #undef SET_FIELD 8428 } 8429 8430 int 8431 spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8432 void *event_ctx, struct spdk_bdev_open_async_opts *opts, 8433 spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg) 8434 { 8435 struct spdk_bdev_open_async_ctx *ctx; 8436 8437 if (event_cb == NULL) { 8438 SPDK_ERRLOG("Missing event callback function\n"); 8439 return -EINVAL; 8440 } 8441 8442 if (open_cb == NULL) { 8443 SPDK_ERRLOG("Missing open callback function\n"); 8444 return -EINVAL; 8445 } 8446 8447 if (opts != NULL && opts->size == 0) { 8448 SPDK_ERRLOG("size in the options structure should not be zero\n"); 8449 return -EINVAL; 8450 } 8451 8452 ctx = calloc(1, sizeof(*ctx)); 8453 if (ctx == NULL) { 8454 SPDK_ERRLOG("Failed to allocate open context\n"); 8455 return -ENOMEM; 8456 } 8457 8458 ctx->bdev_name = strdup(bdev_name); 8459 if (ctx->bdev_name == NULL) { 8460 SPDK_ERRLOG("Failed to duplicate bdev_name\n"); 8461 free(ctx); 8462 return -ENOMEM; 8463 } 8464 8465 ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000); 8466 if (ctx->poller == NULL) { 8467 SPDK_ERRLOG("Failed to register bdev_open_async poller\n"); 8468 free(ctx->bdev_name); 8469 free(ctx); 8470 return -ENOMEM; 8471 } 8472 8473 ctx->cb_fn = open_cb; 8474 ctx->cb_arg = open_cb_arg; 8475 ctx->write = write; 8476 ctx->event_cb = event_cb; 8477 ctx->event_ctx = event_ctx; 8478 ctx->orig_thread = spdk_get_thread(); 8479 ctx->start_ticks = spdk_get_ticks(); 8480 8481 bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts)); 8482 if (opts != NULL) { 8483 bdev_open_async_opts_copy(&ctx->opts, opts, opts->size); 8484 } 8485 8486 spdk_spin_lock(&g_bdev_mgr.spinlock); 8487 8488 TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8489 _bdev_open_async(ctx); 8490 8491 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8492 8493 return 0; 8494 } 8495 8496 static void 8497 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 8498 { 8499 int rc; 8500 8501 spdk_spin_lock(&bdev->internal.spinlock); 8502 spdk_spin_lock(&desc->spinlock); 8503 8504 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 8505 8506 desc->closed = true; 8507 8508 if (desc->claim != NULL) { 8509 bdev_desc_release_claims(desc); 8510 } 8511 8512 if (0 == desc->refs) { 8513 spdk_spin_unlock(&desc->spinlock); 8514 bdev_desc_free(desc); 8515 } else { 8516 spdk_spin_unlock(&desc->spinlock); 8517 } 8518 8519 /* If no more descriptors, kill QoS channel */ 8520 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8521 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 8522 bdev->name, spdk_get_thread()); 8523 8524 if (bdev_qos_destroy(bdev)) { 8525 /* There isn't anything we can do to recover here. Just let the 8526 * old QoS poller keep running. The QoS handling won't change 8527 * cores when the user allocates a new channel, but it won't break. */ 8528 SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); 8529 } 8530 } 8531 8532 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8533 rc = bdev_unregister_unsafe(bdev); 8534 spdk_spin_unlock(&bdev->internal.spinlock); 8535 8536 if (rc == 0) { 8537 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8538 } 8539 } else { 8540 spdk_spin_unlock(&bdev->internal.spinlock); 8541 } 8542 } 8543 8544 void 8545 spdk_bdev_close(struct spdk_bdev_desc *desc) 8546 { 8547 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8548 8549 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8550 spdk_get_thread()); 8551 8552 assert(desc->thread == spdk_get_thread()); 8553 8554 spdk_poller_unregister(&desc->io_timeout_poller); 8555 8556 spdk_spin_lock(&g_bdev_mgr.spinlock); 8557 8558 bdev_close(bdev, desc); 8559 8560 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8561 } 8562 8563 static void 8564 bdev_register_finished(void *arg) 8565 { 8566 struct spdk_bdev_desc *desc = arg; 8567 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8568 8569 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 8570 8571 spdk_spin_lock(&g_bdev_mgr.spinlock); 8572 8573 bdev_close(bdev, desc); 8574 8575 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8576 } 8577 8578 int 8579 spdk_bdev_register(struct spdk_bdev *bdev) 8580 { 8581 struct spdk_bdev_desc *desc; 8582 struct spdk_thread *thread = spdk_get_thread(); 8583 int rc; 8584 8585 if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) { 8586 SPDK_ERRLOG("Cannot register bdev %s on thread %p (%s)\n", bdev->name, thread, 8587 thread ? spdk_thread_get_name(thread) : "null"); 8588 return -EINVAL; 8589 } 8590 8591 rc = bdev_register(bdev); 8592 if (rc != 0) { 8593 return rc; 8594 } 8595 8596 /* A descriptor is opened to prevent bdev deletion during examination */ 8597 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8598 if (rc != 0) { 8599 spdk_bdev_unregister(bdev, NULL, NULL); 8600 return rc; 8601 } 8602 8603 rc = bdev_open(bdev, false, desc); 8604 if (rc != 0) { 8605 bdev_desc_free(desc); 8606 spdk_bdev_unregister(bdev, NULL, NULL); 8607 return rc; 8608 } 8609 8610 /* Examine configuration before initializing I/O */ 8611 bdev_examine(bdev); 8612 8613 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 8614 if (rc != 0) { 8615 bdev_close(bdev, desc); 8616 spdk_bdev_unregister(bdev, NULL, NULL); 8617 } 8618 8619 return rc; 8620 } 8621 8622 int 8623 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 8624 struct spdk_bdev_module *module) 8625 { 8626 spdk_spin_lock(&bdev->internal.spinlock); 8627 8628 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8629 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8630 spdk_spin_unlock(&bdev->internal.spinlock); 8631 return -EPERM; 8632 } 8633 8634 if (desc && !desc->write) { 8635 desc->write = true; 8636 } 8637 8638 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 8639 bdev->internal.claim.v1.module = module; 8640 8641 spdk_spin_unlock(&bdev->internal.spinlock); 8642 return 0; 8643 } 8644 8645 void 8646 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 8647 { 8648 spdk_spin_lock(&bdev->internal.spinlock); 8649 8650 assert(bdev->internal.claim.v1.module != NULL); 8651 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 8652 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8653 bdev->internal.claim.v1.module = NULL; 8654 8655 
spdk_spin_unlock(&bdev->internal.spinlock); 8656 } 8657 8658 /* 8659 * Start claims v2 8660 */ 8661 8662 const char * 8663 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 8664 { 8665 switch (type) { 8666 case SPDK_BDEV_CLAIM_NONE: 8667 return "not_claimed"; 8668 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8669 return "exclusive_write"; 8670 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8671 return "read_many_write_one"; 8672 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8673 return "read_many_write_none"; 8674 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8675 return "read_many_write_many"; 8676 default: 8677 break; 8678 } 8679 return "invalid_claim"; 8680 } 8681 8682 static bool 8683 claim_type_is_v2(enum spdk_bdev_claim_type type) 8684 { 8685 switch (type) { 8686 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8687 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8688 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8689 return true; 8690 default: 8691 break; 8692 } 8693 return false; 8694 } 8695 8696 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. */ 8697 static bool 8698 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 8699 { 8700 switch (type) { 8701 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8702 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8703 return true; 8704 default: 8705 break; 8706 } 8707 return false; 8708 } 8709 8710 void 8711 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 8712 { 8713 if (opts == NULL) { 8714 SPDK_ERRLOG("opts should not be NULL\n"); 8715 assert(opts != NULL); 8716 return; 8717 } 8718 if (size == 0) { 8719 SPDK_ERRLOG("size should not be zero\n"); 8720 assert(size != 0); 8721 return; 8722 } 8723 8724 memset(opts, 0, size); 8725 opts->opts_size = size; 8726 8727 #define FIELD_OK(field) \ 8728 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 8729 8730 #define SET_FIELD(field, value) \ 8731 if (FIELD_OK(field)) { \ 8732 opts->field = value; \ 8733 } \ 8734 8735 SET_FIELD(shared_claim_key, 0); 8736 8737 #undef FIELD_OK 8738 #undef SET_FIELD 8739 } 8740 8741 static int 8742 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 8743 { 8744 if (src->opts_size == 0) { 8745 SPDK_ERRLOG("size should not be zero\n"); 8746 return -1; 8747 } 8748 8749 memset(dst, 0, sizeof(*dst)); 8750 dst->opts_size = src->opts_size; 8751 8752 #define FIELD_OK(field) \ 8753 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 8754 8755 #define SET_FIELD(field) \ 8756 if (FIELD_OK(field)) { \ 8757 dst->field = src->field; \ 8758 } \ 8759 8760 if (FIELD_OK(name)) { 8761 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 8762 } 8763 8764 SET_FIELD(shared_claim_key); 8765 8766 /* You should not remove this statement, but need to update the assert statement 8767 * if you add a new field, and also add a corresponding SET_FIELD statement */ 8768 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 8769 8770 #undef FIELD_OK 8771 #undef SET_FIELD 8772 return 0; 8773 } 8774 8775 /* Returns 0 if a read-write-once claim can be taken. 
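 * A read-write-once claim requires that the bdev is not already claimed and
 * that no other descriptor is currently open for writing.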
 */
static int
claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_desc *open_desc;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE);

	if (opts->shared_claim_key != 0) {
		SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n",
			    bdev->name);
		return -EINVAL;
	}
	if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
		return -EPERM;
	}
	if (desc->claim != NULL) {
		SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n",
			       bdev->name, desc->claim->module->name);
		return -EPERM;
	}
	TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
		if (desc != open_desc && open_desc->write) {
			SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while "
				       "another descriptor is open for writing\n",
				       bdev->name);
			return -EPERM;
		}
	}

	return 0;
}

/* Returns 0 if a read-only-many claim can be taken. */
static int
claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_desc *open_desc;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE);
	assert(desc->claim == NULL);

	if (desc->write) {
		SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n",
			    bdev->name);
		return -EINVAL;
	}
	if (opts->shared_claim_key != 0) {
		SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name);
		return -EINVAL;
	}
	if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
			if (open_desc->write) {
				SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while "
					       "another descriptor is open for writing\n",
					       bdev->name);
				return -EPERM;
			}
		}
	}

	return 0;
}

/* Returns 0 if a read-write-many claim can be taken.
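 * Shared (read-write-many) claims require a non-zero shared_claim_key, and
 * every holder must present the same key. An illustrative caller (module and
 * key names hypothetical) would do:
 *
 *     spdk_bdev_claim_opts_init(&opts, sizeof(opts));
 *     opts.shared_claim_key = MY_CLAIM_KEY;
 *     rc = spdk_bdev_module_claim_bdev_desc(desc,
 *              SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED, &opts, &my_module);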
 */
static int
claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_desc *open_desc;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED);
	assert(desc->claim == NULL);

	if (opts->shared_claim_key == 0) {
		SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n",
			    bdev->name);
		return -EINVAL;
	}
	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
			if (open_desc == desc) {
				continue;
			}
			if (open_desc->write) {
				SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while "
					       "another descriptor is open for writing without a "
					       "claim\n", bdev->name);
				return -EPERM;
			}
		}
		break;
	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
		if (opts->shared_claim_key != bdev->internal.claim.v2.key) {
			LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev);
			return -EPERM;
		}
		break;
	default:
		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
		return -EBUSY;
	}

	return 0;
}

/* Updates desc and its bdev with a v2 claim. */
static int
claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
	   struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_module_claim *claim;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(claim_type_is_v2(type));
	assert(desc->claim == NULL);

	claim = calloc(1, sizeof(*desc->claim));
	if (claim == NULL) {
		SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name);
		return -ENOMEM;
	}
	claim->module = module;
	claim->desc = desc;
	SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match");
	memcpy(claim->name, opts->name, sizeof(claim->name));
	desc->claim = claim;

	if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
		bdev->internal.claim_type = type;
		TAILQ_INIT(&bdev->internal.claim.v2.claims);
		bdev->internal.claim.v2.key = opts->shared_claim_key;
	}
	assert(type == bdev->internal.claim_type);

	TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link);

	if (!desc->write && claim_type_promotes_to_write(type)) {
		desc->write = true;
	}

	return 0;
}

int
spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
				 struct spdk_bdev_claim_opts *_opts,
				 struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev;
	struct spdk_bdev_claim_opts opts;
	int rc = 0;

	if (desc == NULL) {
		SPDK_ERRLOG("descriptor must not be NULL\n");
		return -EINVAL;
	}

	bdev = desc->bdev;

	if (_opts == NULL) {
		spdk_bdev_claim_opts_init(&opts, sizeof(opts));
	} else if (claim_opts_copy(_opts, &opts) != 0) {
		return -EINVAL;
	}

	spdk_spin_lock(&bdev->internal.spinlock);

	if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE &&
	    bdev->internal.claim_type != type) {
		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
		spdk_spin_unlock(&bdev->internal.spinlock);
8959 return -EPERM; 8960 } 8961 8962 if (claim_type_is_v2(type) && desc->claim != NULL) { 8963 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 8964 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 8965 spdk_spin_unlock(&bdev->internal.spinlock); 8966 return -EPERM; 8967 } 8968 8969 switch (type) { 8970 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8971 spdk_spin_unlock(&bdev->internal.spinlock); 8972 return spdk_bdev_module_claim_bdev(bdev, desc, module); 8973 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8974 rc = claim_verify_rwo(desc, type, &opts, module); 8975 break; 8976 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8977 rc = claim_verify_rom(desc, type, &opts, module); 8978 break; 8979 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8980 rc = claim_verify_rwm(desc, type, &opts, module); 8981 break; 8982 default: 8983 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 8984 rc = -ENOTSUP; 8985 } 8986 8987 if (rc == 0) { 8988 rc = claim_bdev(desc, type, &opts, module); 8989 } 8990 8991 spdk_spin_unlock(&bdev->internal.spinlock); 8992 return rc; 8993 } 8994 8995 static void 8996 claim_reset(struct spdk_bdev *bdev) 8997 { 8998 assert(spdk_spin_held(&bdev->internal.spinlock)); 8999 assert(claim_type_is_v2(bdev->internal.claim_type)); 9000 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 9001 9002 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 9003 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 9004 } 9005 9006 static void 9007 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 9008 { 9009 struct spdk_bdev *bdev = desc->bdev; 9010 9011 assert(spdk_spin_held(&bdev->internal.spinlock)); 9012 assert(claim_type_is_v2(bdev->internal.claim_type)); 9013 9014 if (bdev->internal.examine_in_progress == 0) { 9015 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 9016 free(desc->claim); 9017 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 9018 claim_reset(bdev); 9019 } 9020 } else { 9021 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 9022 desc->claim->module = NULL; 9023 desc->claim->desc = NULL; 9024 } 9025 desc->claim = NULL; 9026 } 9027 9028 /* 9029 * End claims v2 9030 */ 9031 9032 struct spdk_bdev * 9033 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 9034 { 9035 assert(desc != NULL); 9036 return desc->bdev; 9037 } 9038 9039 int 9040 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 9041 { 9042 struct spdk_bdev *bdev, *tmp; 9043 struct spdk_bdev_desc *desc; 9044 int rc = 0; 9045 9046 assert(fn != NULL); 9047 9048 spdk_spin_lock(&g_bdev_mgr.spinlock); 9049 bdev = spdk_bdev_first(); 9050 while (bdev != NULL) { 9051 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 9052 if (rc != 0) { 9053 break; 9054 } 9055 rc = bdev_open(bdev, false, desc); 9056 if (rc != 0) { 9057 bdev_desc_free(desc); 9058 if (rc == -ENODEV) { 9059 /* Ignore the error and move to the next bdev. 
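				 * bdev_open() returns -ENODEV when the bdev is unregistering,
				 * so such a bdev is skipped rather than aborting the whole
				 * iteration.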
*/ 9060 rc = 0; 9061 bdev = spdk_bdev_next(bdev); 9062 continue; 9063 } 9064 break; 9065 } 9066 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9067 9068 rc = fn(ctx, bdev); 9069 9070 spdk_spin_lock(&g_bdev_mgr.spinlock); 9071 tmp = spdk_bdev_next(bdev); 9072 bdev_close(bdev, desc); 9073 if (rc != 0) { 9074 break; 9075 } 9076 bdev = tmp; 9077 } 9078 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9079 9080 return rc; 9081 } 9082 9083 int 9084 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 9085 { 9086 struct spdk_bdev *bdev, *tmp; 9087 struct spdk_bdev_desc *desc; 9088 int rc = 0; 9089 9090 assert(fn != NULL); 9091 9092 spdk_spin_lock(&g_bdev_mgr.spinlock); 9093 bdev = spdk_bdev_first_leaf(); 9094 while (bdev != NULL) { 9095 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 9096 if (rc != 0) { 9097 break; 9098 } 9099 rc = bdev_open(bdev, false, desc); 9100 if (rc != 0) { 9101 bdev_desc_free(desc); 9102 if (rc == -ENODEV) { 9103 /* Ignore the error and move to the next bdev. */ 9104 rc = 0; 9105 bdev = spdk_bdev_next_leaf(bdev); 9106 continue; 9107 } 9108 break; 9109 } 9110 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9111 9112 rc = fn(ctx, bdev); 9113 9114 spdk_spin_lock(&g_bdev_mgr.spinlock); 9115 tmp = spdk_bdev_next_leaf(bdev); 9116 bdev_close(bdev, desc); 9117 if (rc != 0) { 9118 break; 9119 } 9120 bdev = tmp; 9121 } 9122 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9123 9124 return rc; 9125 } 9126 9127 void 9128 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 9129 { 9130 struct iovec *iovs; 9131 int iovcnt; 9132 9133 if (bdev_io == NULL) { 9134 return; 9135 } 9136 9137 switch (bdev_io->type) { 9138 case SPDK_BDEV_IO_TYPE_READ: 9139 case SPDK_BDEV_IO_TYPE_WRITE: 9140 case SPDK_BDEV_IO_TYPE_ZCOPY: 9141 iovs = bdev_io->u.bdev.iovs; 9142 iovcnt = bdev_io->u.bdev.iovcnt; 9143 break; 9144 default: 9145 iovs = NULL; 9146 iovcnt = 0; 9147 break; 9148 } 9149 9150 if (iovp) { 9151 *iovp = iovs; 9152 } 9153 if (iovcntp) { 9154 *iovcntp = iovcnt; 9155 } 9156 } 9157 9158 void * 9159 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 9160 { 9161 if (bdev_io == NULL) { 9162 return NULL; 9163 } 9164 9165 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 9166 return NULL; 9167 } 9168 9169 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 9170 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 9171 return bdev_io->u.bdev.md_buf; 9172 } 9173 9174 return NULL; 9175 } 9176 9177 void * 9178 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 9179 { 9180 if (bdev_io == NULL) { 9181 assert(false); 9182 return NULL; 9183 } 9184 9185 return bdev_io->internal.caller_ctx; 9186 } 9187 9188 void 9189 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 9190 { 9191 9192 if (spdk_bdev_module_list_find(bdev_module->name)) { 9193 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 9194 assert(false); 9195 } 9196 9197 spdk_spin_init(&bdev_module->internal.spinlock); 9198 TAILQ_INIT(&bdev_module->internal.quiesced_ranges); 9199 9200 /* 9201 * Modules with examine callbacks must be initialized first, so they are 9202 * ready to handle examine callbacks from later modules that will 9203 * register physical bdevs. 
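	 * Such modules are therefore inserted at the head of the module list,
	 * while all other modules are appended at the tail.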
9204 */ 9205 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 9206 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9207 } else { 9208 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9209 } 9210 } 9211 9212 struct spdk_bdev_module * 9213 spdk_bdev_module_list_find(const char *name) 9214 { 9215 struct spdk_bdev_module *bdev_module; 9216 9217 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 9218 if (strcmp(name, bdev_module->name) == 0) { 9219 break; 9220 } 9221 } 9222 9223 return bdev_module; 9224 } 9225 9226 static int 9227 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 9228 { 9229 uint64_t num_blocks; 9230 void *md_buf = NULL; 9231 9232 num_blocks = bdev_io->u.bdev.num_blocks; 9233 9234 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 9235 md_buf = (char *)g_bdev_mgr.zero_buffer + 9236 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 9237 } 9238 9239 return bdev_write_blocks_with_md(bdev_io->internal.desc, 9240 spdk_io_channel_from_ctx(bdev_io->internal.ch), 9241 g_bdev_mgr.zero_buffer, md_buf, 9242 bdev_io->u.bdev.offset_blocks, num_blocks, 9243 bdev_write_zero_buffer_done, bdev_io); 9244 } 9245 9246 static void 9247 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9248 { 9249 struct spdk_bdev_io *parent_io = cb_arg; 9250 9251 spdk_bdev_free_io(bdev_io); 9252 9253 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 9254 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 9255 } 9256 9257 static void 9258 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 9259 { 9260 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9261 ctx->bdev->internal.qos_mod_in_progress = false; 9262 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9263 9264 if (ctx->cb_fn) { 9265 ctx->cb_fn(ctx->cb_arg, status); 9266 } 9267 free(ctx); 9268 } 9269 9270 static void 9271 bdev_disable_qos_done(void *cb_arg) 9272 { 9273 struct set_qos_limit_ctx *ctx = cb_arg; 9274 struct spdk_bdev *bdev = ctx->bdev; 9275 struct spdk_bdev_qos *qos; 9276 9277 spdk_spin_lock(&bdev->internal.spinlock); 9278 qos = bdev->internal.qos; 9279 bdev->internal.qos = NULL; 9280 spdk_spin_unlock(&bdev->internal.spinlock); 9281 9282 if (qos->thread != NULL) { 9283 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 9284 spdk_poller_unregister(&qos->poller); 9285 } 9286 9287 free(qos); 9288 9289 bdev_set_qos_limit_done(ctx, 0); 9290 } 9291 9292 static void 9293 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 9294 { 9295 struct set_qos_limit_ctx *ctx = _ctx; 9296 struct spdk_thread *thread; 9297 9298 spdk_spin_lock(&bdev->internal.spinlock); 9299 thread = bdev->internal.qos->thread; 9300 spdk_spin_unlock(&bdev->internal.spinlock); 9301 9302 if (thread != NULL) { 9303 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 9304 } else { 9305 bdev_disable_qos_done(ctx); 9306 } 9307 } 9308 9309 static void 9310 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9311 struct spdk_io_channel *ch, void *_ctx) 9312 { 9313 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9314 struct spdk_bdev_io *bdev_io; 9315 9316 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 9317 9318 while (!TAILQ_EMPTY(&bdev_ch->qos_queued_io)) { 9319 /* Re-submit the queued I/O. 
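		 * QoS is being disabled on this channel, so I/O that was queued
		 * waiting for a QoS credit is sent straight down the normal
		 * submission path.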
*/ 9320 bdev_io = TAILQ_FIRST(&bdev_ch->qos_queued_io); 9321 TAILQ_REMOVE(&bdev_ch->qos_queued_io, bdev_io, internal.link); 9322 _bdev_io_submit(bdev_io); 9323 } 9324 9325 spdk_bdev_for_each_channel_continue(i, 0); 9326 } 9327 9328 static void 9329 bdev_update_qos_rate_limit_msg(void *cb_arg) 9330 { 9331 struct set_qos_limit_ctx *ctx = cb_arg; 9332 struct spdk_bdev *bdev = ctx->bdev; 9333 9334 spdk_spin_lock(&bdev->internal.spinlock); 9335 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 9336 spdk_spin_unlock(&bdev->internal.spinlock); 9337 9338 bdev_set_qos_limit_done(ctx, 0); 9339 } 9340 9341 static void 9342 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9343 struct spdk_io_channel *ch, void *_ctx) 9344 { 9345 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9346 9347 spdk_spin_lock(&bdev->internal.spinlock); 9348 bdev_enable_qos(bdev, bdev_ch); 9349 spdk_spin_unlock(&bdev->internal.spinlock); 9350 spdk_bdev_for_each_channel_continue(i, 0); 9351 } 9352 9353 static void 9354 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 9355 { 9356 struct set_qos_limit_ctx *ctx = _ctx; 9357 9358 bdev_set_qos_limit_done(ctx, status); 9359 } 9360 9361 static void 9362 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 9363 { 9364 int i; 9365 9366 assert(bdev->internal.qos != NULL); 9367 9368 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9369 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9370 bdev->internal.qos->rate_limits[i].limit = limits[i]; 9371 9372 if (limits[i] == 0) { 9373 bdev->internal.qos->rate_limits[i].limit = 9374 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 9375 } 9376 } 9377 } 9378 } 9379 9380 void 9381 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 9382 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 9383 { 9384 struct set_qos_limit_ctx *ctx; 9385 uint32_t limit_set_complement; 9386 uint64_t min_limit_per_sec; 9387 int i; 9388 bool disable_rate_limit = true; 9389 9390 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9391 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9392 continue; 9393 } 9394 9395 if (limits[i] > 0) { 9396 disable_rate_limit = false; 9397 } 9398 9399 if (bdev_qos_is_iops_rate_limit(i) == true) { 9400 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 9401 } else { 9402 if (limits[i] > SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC) { 9403 SPDK_WARNLOG("Requested rate limit %" PRIu64 " will result in uint64_t overflow, " 9404 "reset to %" PRIu64 "\n", limits[i], SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC); 9405 limits[i] = SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC; 9406 } 9407 /* Change from megabyte to byte rate limit */ 9408 limits[i] = limits[i] * 1024 * 1024; 9409 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 9410 } 9411 9412 limit_set_complement = limits[i] % min_limit_per_sec; 9413 if (limit_set_complement) { 9414 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 9415 limits[i], min_limit_per_sec); 9416 limits[i] += min_limit_per_sec - limit_set_complement; 9417 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 9418 } 9419 } 9420 9421 ctx = calloc(1, sizeof(*ctx)); 9422 if (ctx == NULL) { 9423 cb_fn(cb_arg, -ENOMEM); 9424 return; 9425 } 9426 9427 ctx->cb_fn = cb_fn; 9428 ctx->cb_arg = cb_arg; 9429 ctx->bdev = bdev; 9430 9431 spdk_spin_lock(&bdev->internal.spinlock); 9432 if (bdev->internal.qos_mod_in_progress) { 9433 spdk_spin_unlock(&bdev->internal.spinlock); 9434 free(ctx); 9435 cb_fn(cb_arg, 
-EAGAIN); 9436 return; 9437 } 9438 bdev->internal.qos_mod_in_progress = true; 9439 9440 if (disable_rate_limit == true && bdev->internal.qos) { 9441 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9442 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 9443 (bdev->internal.qos->rate_limits[i].limit > 0 && 9444 bdev->internal.qos->rate_limits[i].limit != 9445 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 9446 disable_rate_limit = false; 9447 break; 9448 } 9449 } 9450 } 9451 9452 if (disable_rate_limit == false) { 9453 if (bdev->internal.qos == NULL) { 9454 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 9455 if (!bdev->internal.qos) { 9456 spdk_spin_unlock(&bdev->internal.spinlock); 9457 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 9458 bdev_set_qos_limit_done(ctx, -ENOMEM); 9459 return; 9460 } 9461 } 9462 9463 if (bdev->internal.qos->thread == NULL) { 9464 /* Enabling */ 9465 bdev_set_qos_rate_limits(bdev, limits); 9466 9467 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 9468 bdev_enable_qos_done); 9469 } else { 9470 /* Updating */ 9471 bdev_set_qos_rate_limits(bdev, limits); 9472 9473 spdk_thread_send_msg(bdev->internal.qos->thread, 9474 bdev_update_qos_rate_limit_msg, ctx); 9475 } 9476 } else { 9477 if (bdev->internal.qos != NULL) { 9478 bdev_set_qos_rate_limits(bdev, limits); 9479 9480 /* Disabling */ 9481 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 9482 bdev_disable_qos_msg_done); 9483 } else { 9484 spdk_spin_unlock(&bdev->internal.spinlock); 9485 bdev_set_qos_limit_done(ctx, 0); 9486 return; 9487 } 9488 } 9489 9490 spdk_spin_unlock(&bdev->internal.spinlock); 9491 } 9492 9493 struct spdk_bdev_histogram_ctx { 9494 spdk_bdev_histogram_status_cb cb_fn; 9495 void *cb_arg; 9496 struct spdk_bdev *bdev; 9497 int status; 9498 }; 9499 9500 static void 9501 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9502 { 9503 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9504 9505 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9506 ctx->bdev->internal.histogram_in_progress = false; 9507 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9508 ctx->cb_fn(ctx->cb_arg, ctx->status); 9509 free(ctx); 9510 } 9511 9512 static void 9513 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9514 struct spdk_io_channel *_ch, void *_ctx) 9515 { 9516 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9517 9518 if (ch->histogram != NULL) { 9519 spdk_histogram_data_free(ch->histogram); 9520 ch->histogram = NULL; 9521 } 9522 spdk_bdev_for_each_channel_continue(i, 0); 9523 } 9524 9525 static void 9526 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9527 { 9528 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9529 9530 if (status != 0) { 9531 ctx->status = status; 9532 ctx->bdev->internal.histogram_enabled = false; 9533 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 9534 bdev_histogram_disable_channel_cb); 9535 } else { 9536 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9537 ctx->bdev->internal.histogram_in_progress = false; 9538 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9539 ctx->cb_fn(ctx->cb_arg, ctx->status); 9540 free(ctx); 9541 } 9542 } 9543 9544 static void 9545 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9546 struct spdk_io_channel *_ch, void *_ctx) 9547 { 9548 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9549 int status = 0; 9550 9551 if (ch->histogram == NULL) { 9552 
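		/* Allocate this channel's histogram lazily, only if it does not already
		 * have one. An allocation failure is reported through the channel iterator
		 * and rolled back in bdev_histogram_enable_channel_cb().
		 */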
ch->histogram = spdk_histogram_data_alloc(); 9553 if (ch->histogram == NULL) { 9554 status = -ENOMEM; 9555 } 9556 } 9557 9558 spdk_bdev_for_each_channel_continue(i, status); 9559 } 9560 9561 void 9562 spdk_bdev_histogram_enable_ext(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9563 void *cb_arg, bool enable, struct spdk_bdev_enable_histogram_opts *opts) 9564 { 9565 struct spdk_bdev_histogram_ctx *ctx; 9566 9567 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 9568 if (ctx == NULL) { 9569 cb_fn(cb_arg, -ENOMEM); 9570 return; 9571 } 9572 9573 ctx->bdev = bdev; 9574 ctx->status = 0; 9575 ctx->cb_fn = cb_fn; 9576 ctx->cb_arg = cb_arg; 9577 9578 spdk_spin_lock(&bdev->internal.spinlock); 9579 if (bdev->internal.histogram_in_progress) { 9580 spdk_spin_unlock(&bdev->internal.spinlock); 9581 free(ctx); 9582 cb_fn(cb_arg, -EAGAIN); 9583 return; 9584 } 9585 9586 bdev->internal.histogram_in_progress = true; 9587 spdk_spin_unlock(&bdev->internal.spinlock); 9588 9589 bdev->internal.histogram_enabled = enable; 9590 bdev->internal.histogram_io_type = opts->io_type; 9591 9592 if (enable) { 9593 /* Allocate histogram for each channel */ 9594 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 9595 bdev_histogram_enable_channel_cb); 9596 } else { 9597 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 9598 bdev_histogram_disable_channel_cb); 9599 } 9600 } 9601 9602 void 9603 spdk_bdev_enable_histogram_opts_init(struct spdk_bdev_enable_histogram_opts *opts, size_t size) 9604 { 9605 if (opts == NULL) { 9606 SPDK_ERRLOG("opts should not be NULL\n"); 9607 assert(opts != NULL); 9608 return; 9609 } 9610 if (size == 0) { 9611 SPDK_ERRLOG("size should not be zero\n"); 9612 assert(size != 0); 9613 return; 9614 } 9615 9616 memset(opts, 0, size); 9617 opts->size = size; 9618 9619 #define FIELD_OK(field) \ 9620 offsetof(struct spdk_bdev_enable_histogram_opts, field) + sizeof(opts->field) <= size 9621 9622 #define SET_FIELD(field, value) \ 9623 if (FIELD_OK(field)) { \ 9624 opts->field = value; \ 9625 } \ 9626 9627 SET_FIELD(io_type, 0); 9628 9629 /* You should not remove this statement, but need to update the assert statement 9630 * if you add a new field, and also add a corresponding SET_FIELD statement */ 9631 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_enable_histogram_opts) == 9, "Incorrect size"); 9632 9633 #undef FIELD_OK 9634 #undef SET_FIELD 9635 } 9636 9637 void 9638 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9639 void *cb_arg, bool enable) 9640 { 9641 struct spdk_bdev_enable_histogram_opts opts; 9642 9643 spdk_bdev_enable_histogram_opts_init(&opts, sizeof(opts)); 9644 spdk_bdev_histogram_enable_ext(bdev, cb_fn, cb_arg, enable, &opts); 9645 } 9646 9647 struct spdk_bdev_histogram_data_ctx { 9648 spdk_bdev_histogram_data_cb cb_fn; 9649 void *cb_arg; 9650 struct spdk_bdev *bdev; 9651 /** merged histogram data from all channels */ 9652 struct spdk_histogram_data *histogram; 9653 }; 9654 9655 static void 9656 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9657 { 9658 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9659 9660 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 9661 free(ctx); 9662 } 9663 9664 static void 9665 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9666 struct spdk_io_channel *_ch, void *_ctx) 9667 { 9668 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9669 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9670 int 
status = 0; 9671 9672 if (ch->histogram == NULL) { 9673 status = -EFAULT; 9674 } else { 9675 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 9676 } 9677 9678 spdk_bdev_for_each_channel_continue(i, status); 9679 } 9680 9681 void 9682 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 9683 spdk_bdev_histogram_data_cb cb_fn, 9684 void *cb_arg) 9685 { 9686 struct spdk_bdev_histogram_data_ctx *ctx; 9687 9688 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 9689 if (ctx == NULL) { 9690 cb_fn(cb_arg, -ENOMEM, NULL); 9691 return; 9692 } 9693 9694 ctx->bdev = bdev; 9695 ctx->cb_fn = cb_fn; 9696 ctx->cb_arg = cb_arg; 9697 9698 ctx->histogram = histogram; 9699 9700 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 9701 bdev_histogram_get_channel_cb); 9702 } 9703 9704 void 9705 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 9706 void *cb_arg) 9707 { 9708 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9709 int status = 0; 9710 9711 assert(cb_fn != NULL); 9712 9713 if (bdev_ch->histogram == NULL) { 9714 status = -EFAULT; 9715 } 9716 cb_fn(cb_arg, status, bdev_ch->histogram); 9717 } 9718 9719 size_t 9720 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 9721 size_t max_events) 9722 { 9723 struct media_event_entry *entry; 9724 size_t num_events = 0; 9725 9726 for (; num_events < max_events; ++num_events) { 9727 entry = TAILQ_FIRST(&desc->pending_media_events); 9728 if (entry == NULL) { 9729 break; 9730 } 9731 9732 events[num_events] = entry->event; 9733 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 9734 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 9735 } 9736 9737 return num_events; 9738 } 9739 9740 int 9741 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 9742 size_t num_events) 9743 { 9744 struct spdk_bdev_desc *desc; 9745 struct media_event_entry *entry; 9746 size_t event_id; 9747 int rc = 0; 9748 9749 assert(bdev->media_events); 9750 9751 spdk_spin_lock(&bdev->internal.spinlock); 9752 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9753 if (desc->write) { 9754 break; 9755 } 9756 } 9757 9758 if (desc == NULL || desc->media_events_buffer == NULL) { 9759 rc = -ENODEV; 9760 goto out; 9761 } 9762 9763 for (event_id = 0; event_id < num_events; ++event_id) { 9764 entry = TAILQ_FIRST(&desc->free_media_events); 9765 if (entry == NULL) { 9766 break; 9767 } 9768 9769 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 9770 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 9771 entry->event = events[event_id]; 9772 } 9773 9774 rc = event_id; 9775 out: 9776 spdk_spin_unlock(&bdev->internal.spinlock); 9777 return rc; 9778 } 9779 9780 static void 9781 _media_management_notify(void *arg) 9782 { 9783 struct spdk_bdev_desc *desc = arg; 9784 9785 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 9786 } 9787 9788 void 9789 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 9790 { 9791 struct spdk_bdev_desc *desc; 9792 9793 spdk_spin_lock(&bdev->internal.spinlock); 9794 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9795 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 9796 event_notify(desc, _media_management_notify); 9797 } 9798 } 9799 spdk_spin_unlock(&bdev->internal.spinlock); 9800 } 9801 9802 struct locked_lba_range_ctx { 9803 struct lba_range range; 9804 struct lba_range *current_range; 9805 struct lba_range *owner_range; 9806 
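	/* Poller used by bdev_lock_lba_range_check_io() to wait until outstanding
	 * I/O that overlaps the range being locked has drained from the channel.
	 */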
	struct spdk_poller *poller;
	lock_range_cb cb_fn;
	void *cb_arg;
};

static void
bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct locked_lba_range_ctx *ctx = _ctx;

	ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM);
	free(ctx);
}

static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i,
		struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx);

static void
bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct locked_lba_range_ctx *ctx = _ctx;

	if (status == -ENOMEM) {
		/* One of the channels could not allocate a range object.
		 * So we have to go back and clean up any ranges that were
		 * allocated successfully before we return error status to
		 * the caller. We can reuse the unlock function to do that
		 * clean up.
		 */
		spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
					   bdev_lock_error_cleanup_cb);
		return;
	}

	/* All channels have locked this range and no I/O overlapping the range
	 * is outstanding! Set the owner_ch for the range object for the
	 * locking channel, so that this channel will know that it is allowed
	 * to write to this range.
	 */
	if (ctx->owner_range != NULL) {
		ctx->owner_range->owner_ch = ctx->range.owner_ch;
	}

	ctx->cb_fn(&ctx->range, ctx->cb_arg, status);

	/* Don't free the ctx here. Its range is in the bdev's global list of
	 * locked ranges still, and will be removed and freed when this range
	 * is later unlocked.
	 */
}

static int
bdev_lock_lba_range_check_io(void *_i)
{
	struct spdk_bdev_channel_iter *i = _i;
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i);
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
	struct locked_lba_range_ctx *ctx = i->ctx;
	struct lba_range *range = ctx->current_range;
	struct spdk_bdev_io *bdev_io;

	spdk_poller_unregister(&ctx->poller);

	/* The range is now in the locked_ranges, so no new I/O can be submitted to this
	 * range. But we need to wait until any outstanding I/O overlapping with this range
	 * has completed.
	 */
	TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) {
		if (bdev_io_range_is_locked(bdev_io, range)) {
			ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100);
			return SPDK_POLLER_BUSY;
		}
	}

	spdk_bdev_for_each_channel_continue(i, 0);
	return SPDK_POLLER_BUSY;
}

static void
bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				struct spdk_io_channel *_ch, void *_ctx)
{
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
	struct locked_lba_range_ctx *ctx = _ctx;
	struct lba_range *range;

	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (range->length == ctx->range.length &&
		    range->offset == ctx->range.offset &&
		    range->locked_ctx == ctx->range.locked_ctx) {
			/* This range already exists on this channel, so don't add
			 * it again. This can happen when a new channel is created
			 * while the for_each_channel operation is in progress.
			 * Do not check for outstanding I/O in that case, since the
			 * range was locked before any I/O could be submitted to the
			 * new channel.
			 */
			spdk_bdev_for_each_channel_continue(i, 0);
			return;
		}
	}

	range = calloc(1, sizeof(*range));
	if (range == NULL) {
		spdk_bdev_for_each_channel_continue(i, -ENOMEM);
		return;
	}

	range->length = ctx->range.length;
	range->offset = ctx->range.offset;
	range->locked_ctx = ctx->range.locked_ctx;
	range->quiesce = ctx->range.quiesce;
	ctx->current_range = range;
	if (ctx->range.owner_ch == ch) {
		/* This is the range object for the channel that will hold
		 * the lock. Store it in the ctx object so that we can easily
		 * set its owner_ch after the lock is finally acquired.
		 */
		ctx->owner_range = range;
	}
	TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq);
	bdev_lock_lba_range_check_io(i);
}

static void
bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx)
{
	assert(spdk_get_thread() == ctx->range.owner_thread);
	assert(ctx->range.owner_ch == NULL ||
	       spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread);

	/* We will add a copy of this range to each channel now. */
	spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx,
				   bdev_lock_lba_range_cb);
}

static bool
bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq)
{
	struct lba_range *r;

	TAILQ_FOREACH(r, tailq, tailq) {
		if (bdev_lba_range_overlapped(range, r)) {
			return true;
		}
	}
	return false;
}

static void bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status);

static int
_bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch,
		     uint64_t offset, uint64_t length,
		     lock_range_cb cb_fn, void *cb_arg)
{
	struct locked_lba_range_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}

	ctx->range.offset = offset;
	ctx->range.length = length;
	ctx->range.owner_thread = spdk_get_thread();
	ctx->range.owner_ch = ch;
	ctx->range.locked_ctx = cb_arg;
	ctx->range.bdev = bdev;
	ctx->range.quiesce = (cb_fn == bdev_quiesce_range_locked);
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	spdk_spin_lock(&bdev->internal.spinlock);
	if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) {
		/* There is an active lock overlapping with this range.
		 * Put it on the pending list until this range no
		 * longer overlaps with another.
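		 * A range parked on the pending list is picked up again by
		 * bdev_unlock_lba_range_cb() once the conflicting range is unlocked
		 * and no other overlap remains.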
		 */
		TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq);
	} else {
		TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq);
		bdev_lock_lba_range_ctx(bdev, ctx);
	}
	spdk_spin_unlock(&bdev->internal.spinlock);
	return 0;
}

static int
bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		    uint64_t offset, uint64_t length,
		    lock_range_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);

	if (cb_arg == NULL) {
		SPDK_ERRLOG("cb_arg must not be NULL\n");
		return -EINVAL;
	}

	return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg);
}

static void
bdev_lock_lba_range_ctx_msg(void *_ctx)
{
	struct locked_lba_range_ctx *ctx = _ctx;

	bdev_lock_lba_range_ctx(ctx->range.bdev, ctx);
}

static void
bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct locked_lba_range_ctx *ctx = _ctx;
	struct locked_lba_range_ctx *pending_ctx;
	struct lba_range *range, *tmp;

	spdk_spin_lock(&bdev->internal.spinlock);
	/* Check if there are any pending locked ranges that overlap with this range
	 * that was just unlocked. If there are, check that each such pending range
	 * does not overlap with any other locked range before calling
	 * bdev_lock_lba_range_ctx(), which will start the lock process.
	 */
	TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) {
		if (bdev_lba_range_overlapped(range, &ctx->range) &&
		    !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) {
			TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq);
			pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
			TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq);
			spdk_thread_send_msg(pending_ctx->range.owner_thread,
					     bdev_lock_lba_range_ctx_msg, pending_ctx);
		}
	}
	spdk_spin_unlock(&bdev->internal.spinlock);

	ctx->cb_fn(&ctx->range, ctx->cb_arg, status);
	free(ctx);
}

static void
bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				  struct spdk_io_channel *_ch, void *_ctx)
{
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
	struct locked_lba_range_ctx *ctx = _ctx;
	TAILQ_HEAD(, spdk_bdev_io) io_locked;
	struct spdk_bdev_io *bdev_io;
	struct lba_range *range;

	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (ctx->range.offset == range->offset &&
		    ctx->range.length == range->length &&
		    ctx->range.locked_ctx == range->locked_ctx) {
			TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
			free(range);
			break;
		}
	}

	/* Note: we should almost always be able to assert that the range specified
	 * was found. But there are some very rare corner cases where a new channel
	 * gets created simultaneously with a range unlock, where this function
	 * would execute on that new channel and wouldn't have the range.
	 * We also use this to clean up range allocations when a later allocation
	 * fails in the locking path.
	 * So we can't actually assert() here.
	 */

	/* Swap the locked I/O into a temporary list, and then try to submit them again.
	 * We could hyper-optimize this to only resubmit locked I/O that overlaps
	 * with the range that was just unlocked, but this isn't a performance path so
	 * we go for simplicity here.
	 */
	TAILQ_INIT(&io_locked);
	TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link);
	while (!TAILQ_EMPTY(&io_locked)) {
		bdev_io = TAILQ_FIRST(&io_locked);
		TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link);
		bdev_io_submit(bdev_io);
	}

	spdk_bdev_for_each_channel_continue(i, 0);
}

static int
_bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length,
		       lock_range_cb cb_fn, void *cb_arg)
{
	struct locked_lba_range_ctx *ctx;
	struct lba_range *range;

	spdk_spin_lock(&bdev->internal.spinlock);
	/* To start the unlock process, we find the range in the bdev's locked_ranges
	 * and remove it. This ensures new channels don't inherit the locked range.
	 * Then we will send a message to each channel to remove the range from its
	 * per-channel list.
	 */
	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    (range->owner_ch == NULL || range->locked_ctx == cb_arg)) {
			break;
		}
	}
	if (range == NULL) {
		assert(false);
		spdk_spin_unlock(&bdev->internal.spinlock);
		return -EINVAL;
	}
	TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq);
	ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
	spdk_spin_unlock(&bdev->internal.spinlock);

	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
				   bdev_unlock_lba_range_cb);
	return 0;
}

static int
bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		      uint64_t offset, uint64_t length,
		      lock_range_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
	struct lba_range *range;
	bool range_found = false;

	/* Let's make sure the specified channel actually has a lock on
	 * the specified range. Note that the range must match exactly.
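	 * That is, the offset, length, owning channel, and locked context (cb_arg)
	 * must be the same values that were used when the range was locked.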
10140 */ 10141 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10142 if (range->offset == offset && range->length == length && 10143 range->owner_ch == ch && range->locked_ctx == cb_arg) { 10144 range_found = true; 10145 break; 10146 } 10147 } 10148 10149 if (!range_found) { 10150 return -EINVAL; 10151 } 10152 10153 return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg); 10154 } 10155 10156 struct bdev_quiesce_ctx { 10157 spdk_bdev_quiesce_cb cb_fn; 10158 void *cb_arg; 10159 }; 10160 10161 static void 10162 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status) 10163 { 10164 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 10165 10166 if (quiesce_ctx->cb_fn != NULL) { 10167 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 10168 } 10169 10170 free(quiesce_ctx); 10171 } 10172 10173 static void 10174 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status) 10175 { 10176 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 10177 struct spdk_bdev_module *module = range->bdev->module; 10178 10179 if (status != 0) { 10180 if (quiesce_ctx->cb_fn != NULL) { 10181 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 10182 } 10183 free(quiesce_ctx); 10184 return; 10185 } 10186 10187 spdk_spin_lock(&module->internal.spinlock); 10188 TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module); 10189 spdk_spin_unlock(&module->internal.spinlock); 10190 10191 if (quiesce_ctx->cb_fn != NULL) { 10192 /* copy the context in case the range is unlocked by the callback */ 10193 struct bdev_quiesce_ctx tmp = *quiesce_ctx; 10194 10195 quiesce_ctx->cb_fn = NULL; 10196 quiesce_ctx->cb_arg = NULL; 10197 10198 tmp.cb_fn(tmp.cb_arg, status); 10199 } 10200 /* quiesce_ctx will be freed on unquiesce */ 10201 } 10202 10203 static int 10204 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10205 uint64_t offset, uint64_t length, 10206 spdk_bdev_quiesce_cb cb_fn, void *cb_arg, 10207 bool unquiesce) 10208 { 10209 struct bdev_quiesce_ctx *quiesce_ctx; 10210 int rc; 10211 10212 if (module != bdev->module) { 10213 SPDK_ERRLOG("Bdev does not belong to specified module.\n"); 10214 return -EINVAL; 10215 } 10216 10217 if (!bdev_io_valid_blocks(bdev, offset, length)) { 10218 return -EINVAL; 10219 } 10220 10221 if (unquiesce) { 10222 struct lba_range *range; 10223 10224 /* Make sure the specified range is actually quiesced in the specified module and 10225 * then remove it from the list. Note that the range must match exactly. 
10226 */ 10227 spdk_spin_lock(&module->internal.spinlock); 10228 TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) { 10229 if (range->bdev == bdev && range->offset == offset && range->length == length) { 10230 TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module); 10231 break; 10232 } 10233 } 10234 spdk_spin_unlock(&module->internal.spinlock); 10235 10236 if (range == NULL) { 10237 SPDK_ERRLOG("The range to unquiesce was not found.\n"); 10238 return -EINVAL; 10239 } 10240 10241 quiesce_ctx = range->locked_ctx; 10242 quiesce_ctx->cb_fn = cb_fn; 10243 quiesce_ctx->cb_arg = cb_arg; 10244 10245 rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx); 10246 } else { 10247 quiesce_ctx = malloc(sizeof(*quiesce_ctx)); 10248 if (quiesce_ctx == NULL) { 10249 return -ENOMEM; 10250 } 10251 10252 quiesce_ctx->cb_fn = cb_fn; 10253 quiesce_ctx->cb_arg = cb_arg; 10254 10255 rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx); 10256 if (rc != 0) { 10257 free(quiesce_ctx); 10258 } 10259 } 10260 10261 return rc; 10262 } 10263 10264 int 10265 spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10266 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10267 { 10268 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false); 10269 } 10270 10271 int 10272 spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10273 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10274 { 10275 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true); 10276 } 10277 10278 int 10279 spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10280 uint64_t offset, uint64_t length, 10281 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10282 { 10283 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false); 10284 } 10285 10286 int 10287 spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10288 uint64_t offset, uint64_t length, 10289 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10290 { 10291 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true); 10292 } 10293 10294 int 10295 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 10296 int array_size) 10297 { 10298 if (!bdev) { 10299 return -EINVAL; 10300 } 10301 10302 if (bdev->fn_table->get_memory_domains) { 10303 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 10304 } 10305 10306 return 0; 10307 } 10308 10309 struct spdk_bdev_for_each_io_ctx { 10310 void *ctx; 10311 spdk_bdev_io_fn fn; 10312 spdk_bdev_for_each_io_cb cb; 10313 }; 10314 10315 static void 10316 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10317 struct spdk_io_channel *io_ch, void *_ctx) 10318 { 10319 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10320 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 10321 struct spdk_bdev_io *bdev_io; 10322 int rc = 0; 10323 10324 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 10325 rc = ctx->fn(ctx->ctx, bdev_io); 10326 if (rc != 0) { 10327 break; 10328 } 10329 } 10330 10331 spdk_bdev_for_each_channel_continue(i, rc); 10332 } 10333 10334 static void 10335 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 10336 { 10337 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10338 10339 ctx->cb(ctx->ctx, status); 10340 10341 free(ctx); 10342 } 10343 10344 void 10345 
spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 10346 spdk_bdev_for_each_io_cb cb) 10347 { 10348 struct spdk_bdev_for_each_io_ctx *ctx; 10349 10350 assert(fn != NULL && cb != NULL); 10351 10352 ctx = calloc(1, sizeof(*ctx)); 10353 if (ctx == NULL) { 10354 SPDK_ERRLOG("Failed to allocate context.\n"); 10355 cb(_ctx, -ENOMEM); 10356 return; 10357 } 10358 10359 ctx->ctx = _ctx; 10360 ctx->fn = fn; 10361 ctx->cb = cb; 10362 10363 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 10364 bdev_for_each_io_done); 10365 } 10366 10367 void 10368 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 10369 { 10370 spdk_for_each_channel_continue(iter->i, status); 10371 } 10372 10373 static struct spdk_bdev * 10374 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 10375 { 10376 void *io_device = spdk_io_channel_iter_get_io_device(i); 10377 10378 return __bdev_from_io_dev(io_device); 10379 } 10380 10381 static void 10382 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 10383 { 10384 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10385 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10386 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 10387 10388 iter->i = i; 10389 iter->fn(iter, bdev, ch, iter->ctx); 10390 } 10391 10392 static void 10393 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 10394 { 10395 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10396 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10397 10398 iter->i = i; 10399 iter->cpl(bdev, iter->ctx, status); 10400 10401 free(iter); 10402 } 10403 10404 void 10405 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 10406 void *ctx, spdk_bdev_for_each_channel_done cpl) 10407 { 10408 struct spdk_bdev_channel_iter *iter; 10409 10410 assert(bdev != NULL && fn != NULL && ctx != NULL); 10411 10412 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 10413 if (iter == NULL) { 10414 SPDK_ERRLOG("Unable to allocate iterator\n"); 10415 assert(false); 10416 return; 10417 } 10418 10419 iter->fn = fn; 10420 iter->cpl = cpl; 10421 iter->ctx = ctx; 10422 10423 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 10424 iter, bdev_each_channel_cpl); 10425 } 10426 10427 static void 10428 bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10429 { 10430 struct spdk_bdev_io *parent_io = cb_arg; 10431 10432 spdk_bdev_free_io(bdev_io); 10433 10434 /* Check return status of write */ 10435 parent_io->internal.status = success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 10436 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 10437 } 10438 10439 static void 10440 bdev_copy_do_write(void *_bdev_io) 10441 { 10442 struct spdk_bdev_io *bdev_io = _bdev_io; 10443 int rc; 10444 10445 /* Write blocks */ 10446 rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc, 10447 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10448 bdev_io->u.bdev.iovs[0].iov_base, 10449 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks, 10450 bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io); 10451 10452 if (rc == -ENOMEM) { 10453 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write); 10454 } else if (rc != 0) { 10455 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10456 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10457 } 10458 } 10459 10460 static void 10461 bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10462 { 10463 struct spdk_bdev_io *parent_io = cb_arg; 10464 10465 spdk_bdev_free_io(bdev_io); 10466 10467 /* Check return status of read */ 10468 if (!success) { 10469 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10470 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 10471 return; 10472 } 10473 10474 /* Do write */ 10475 bdev_copy_do_write(parent_io); 10476 } 10477 10478 static void 10479 bdev_copy_do_read(void *_bdev_io) 10480 { 10481 struct spdk_bdev_io *bdev_io = _bdev_io; 10482 int rc; 10483 10484 /* Read blocks */ 10485 rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc, 10486 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10487 bdev_io->u.bdev.iovs[0].iov_base, 10488 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks, 10489 bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io); 10490 10491 if (rc == -ENOMEM) { 10492 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read); 10493 } else if (rc != 0) { 10494 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10495 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10496 } 10497 } 10498 10499 static void 10500 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 10501 { 10502 if (!success) { 10503 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10504 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10505 return; 10506 } 10507 10508 bdev_copy_do_read(bdev_io); 10509 } 10510 10511 int 10512 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 10513 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 10514 spdk_bdev_io_completion_cb cb, void *cb_arg) 10515 { 10516 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10517 struct spdk_bdev_io *bdev_io; 10518 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 10519 10520 if (!desc->write) { 10521 return -EBADF; 10522 } 10523 10524 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 10525 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 10526 SPDK_DEBUGLOG(bdev, 10527 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 10528 dst_offset_blocks, src_offset_blocks, num_blocks); 10529 return -EINVAL; 10530 } 10531 10532 bdev_io = bdev_channel_get_io(channel); 10533 if (!bdev_io) { 10534 return -ENOMEM; 10535 } 10536 10537 bdev_io->internal.ch = channel; 10538 bdev_io->internal.desc = desc; 10539 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 10540 10541 
bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 10542 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 10543 bdev_io->u.bdev.num_blocks = num_blocks; 10544 bdev_io->u.bdev.memory_domain = NULL; 10545 bdev_io->u.bdev.memory_domain_ctx = NULL; 10546 bdev_io->u.bdev.iovs = NULL; 10547 bdev_io->u.bdev.iovcnt = 0; 10548 bdev_io->u.bdev.md_buf = NULL; 10549 bdev_io->u.bdev.accel_sequence = NULL; 10550 bdev_io_init(bdev_io, bdev, cb_arg, cb); 10551 10552 if (dst_offset_blocks == src_offset_blocks || num_blocks == 0) { 10553 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 10554 return 0; 10555 } 10556 10557 10558 /* If the copy size is large and should be split, use the generic split logic 10559 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not. 10560 * 10561 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or 10562 * emulate it using regular read and write requests otherwise. 10563 */ 10564 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) || 10565 bdev_io->internal.f.split) { 10566 bdev_io_submit(bdev_io); 10567 return 0; 10568 } 10569 10570 spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev)); 10571 10572 return 0; 10573 } 10574 10575 SPDK_LOG_REGISTER_COMPONENT(bdev) 10576 10577 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 10578 { 10579 struct spdk_trace_tpoint_opts opts[] = { 10580 { 10581 "BDEV_IO_START", TRACE_BDEV_IO_START, 10582 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 1, 10583 { 10584 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10585 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 10586 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10587 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 10588 } 10589 }, 10590 { 10591 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 10592 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 0, 10593 { 10594 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 10595 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 10596 } 10597 }, 10598 { 10599 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 10600 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 10601 { 10602 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 10603 } 10604 }, 10605 { 10606 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 10607 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 10608 { 10609 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 10610 } 10611 }, 10612 }; 10613 10614 10615 spdk_trace_register_owner_type(OWNER_TYPE_BDEV, 'b'); 10616 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 10617 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 10618 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 10619 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 10620 spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_START, OBJECT_BDEV_IO, 0); 10621 spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_COMPLETE, OBJECT_BDEV_IO, 0); 10622 } 10623
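
/*
 * Illustrative usage sketch (kept in a comment so it is not compiled): one way a
 * caller might drive spdk_bdev_copy_blocks(), defined above, from an SPDK thread
 * that already holds a writable descriptor (spdk_bdev_open_ext()) and an I/O
 * channel (spdk_bdev_get_io_channel()). The helper names, offsets, and block
 * counts below are hypothetical and are not part of this file.
 *
 *	static void
 *	example_copy_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		spdk_bdev_free_io(bdev_io);
 *		SPDK_NOTICELOG("copy %s\n", success ? "completed" : "failed");
 *	}
 *
 *	static int
 *	example_start_copy(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch)
 *	{
 *		// Copy 16 blocks starting at block 0 to block 1024 on the same bdev.
 *		return spdk_bdev_copy_blocks(desc, ch, 1024, 0, 16,
 *					     example_copy_done, NULL);
 *	}
 *
 * A return value of -ENOMEM means no spdk_bdev_io was available; callers
 * typically retry via spdk_bdev_queue_io_wait().
 */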