/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"
#include "spdk/dma.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"
#include "spdk_internal/assert.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define SPDK_BDEV_AUTO_EXAMINE			true
#define BUF_SMALL_CACHE_SIZE			128
#define BUF_LARGE_CACHE_SIZE			16
#define NOMEM_THRESHOLD_COUNT			8

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC	(UINT64_MAX / (1024 * 1024))
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000

/* The maximum number of children requests for a UNMAP or WRITE ZEROES command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD	1000000

/* The maximum number of children requests for a COPY command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8)

#define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \
	log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev)
#ifdef DEBUG
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \
	log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev)
#else
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0)
#endif

static void log_already_claimed(enum spdk_log_level level, const int line, const char *func,
				const char *detail, struct spdk_bdev *bdev);

static const char *qos_rpc_type[] = {"rw_ios_per_sec",
				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
				    };

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

RB_HEAD(bdev_name_tree, spdk_bdev_name);

static int
bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
{
	return strcmp(name1->name, name2->name);
}

RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	void *zero_buffer;

	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

	struct spdk_bdev_list bdevs;
	struct bdev_name_tree bdev_names;

	bool init_complete;
	bool module_init_complete;

	struct spdk_spinlock spinlock;

	TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain *domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
	.init_complete = false,
	.module_init_complete = false,
	.async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens),
};

static void
__attribute__((constructor))
_bdev_init(void)
{
	spdk_spin_init(&g_bdev_mgr.spinlock);
}

typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status);

typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);

struct lba_range {
	struct spdk_bdev *bdev;
	uint64_t offset;
	uint64_t length;
	bool quiesce;
	void *locked_ctx;
	struct spdk_thread *owner_thread;
	struct spdk_bdev_channel *owner_ch;
	TAILQ_ENTRY(lba_range) tailq;
	TAILQ_ENTRY(lba_range) tailq_module;
};

static struct spdk_bdev_opts g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
	.iobuf_small_cache_size = BUF_SMALL_CACHE_SIZE,
	.iobuf_large_cache_size = BUF_LARGE_CACHE_SIZE,
};

static spdk_bdev_init_cb g_init_cb_fn = NULL;
static void *g_init_cb_arg = NULL;

static spdk_bdev_fini_cb g_fini_cb_fn = NULL;
static void *g_fini_cb_arg = NULL;
static struct spdk_thread *g_fini_thread = NULL;
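
/* A rough sketch of the per-timeslice accounting implemented by the fields below.
 * The exact refill and rounding logic lives later in this file; this is only
 * illustrative, assuming SPDK_SEC_TO_USEC microseconds per second:
 *
 *	max_per_timeslice = limit * SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC;
 *	max_per_timeslice = spdk_max(max_per_timeslice, min_per_timeslice);
 *	remaining_this_timeslice = max_per_timeslice;	(refilled at each timeslice start)
 *
 * For example, with rw_ios_per_sec = 10000 and a 1000 usec timeslice, roughly 10 IOs
 * are admitted per timeslice; further IOs are queued until the next refill.
 */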
struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
	 * For remaining bytes, allowed to run negative if an I/O is submitted when
	 * some bytes are remaining, but the I/O is bigger than that amount. The
	 * excess will be deducted from the next timeslice.
	 */
	int64_t remaining_this_timeslice;

	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO.
	 * If the IO is allowed to pass, the quota will be reduced correspondingly.
	 */
	bool (*queue_io)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to rewind the quota once the IO was allowed to be sent by this
	 * limit but queued due to one of the further limits.
	 */
	void (*rewind_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

struct spdk_bdev_qos {
	/** Rate limit state, one entry per rate limit type. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache.  Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t per_thread_cache_count;
	uint32_t bdev_io_cache_size;

	struct spdk_iobuf_channel iobuf;

	TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * will queue their IO awaiting retry here. This makes it possible to retry sending
 * IO to one bdev after IO from another bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel *shared_ch;

	struct spdk_poller *nomem_poller;

	/* Refcount of bdev channels using this resource */
	uint32_t ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev *bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel *channel;

	/* Accel channel */
	struct spdk_io_channel *accel_channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat *stat;

	/*
	 * Count of I/O submitted to the underlying dev module through this channel
	 * and waiting for completion.
	 */
	uint64_t io_outstanding;

	/*
	 * List of all submitted I/Os including I/O that are generated via splitting.
	 */
	bdev_io_tailq_t io_submitted;

	/*
	 * List of spdk_bdev_io that are currently queued because they write to a locked
	 * LBA range.
	 */
	bdev_io_tailq_t io_locked;

	/* List of I/Os with accel sequence being currently executed */
	bdev_io_tailq_t io_accel_exec;

	/* List of I/Os doing memory domain pull/push */
	bdev_io_tailq_t io_memory_domain;

	uint32_t flags;

	/* Counts number of bdev_io in the io_submitted TAILQ */
	uint16_t queue_depth;

	uint16_t trace_id;

	struct spdk_histogram_data *histogram;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t start_tsc;
	uint64_t interval_tsc;
	__itt_string_handle *handle;
	struct spdk_bdev_io_stat *prev_stat;
#endif

	bdev_io_tailq_t queued_resets;

	lba_range_tailq_t locked_ranges;

	/** List of I/Os queued by QoS. */
	bdev_io_tailq_t qos_queued_io;
};

struct media_event_entry {
	struct spdk_bdev_media_event event;
	TAILQ_ENTRY(media_event_entry) tailq;
};

#define MEDIA_EVENT_POOL_SIZE 64

struct spdk_bdev_desc {
	struct spdk_bdev *bdev;
	struct spdk_thread *thread;
	struct {
		spdk_bdev_event_cb_t event_fn;
		void *ctx;
	} callback;
	bool closed;
	bool write;
	bool memory_domains_supported;
	bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES];
	struct spdk_spinlock spinlock;
	uint32_t refs;
	TAILQ_HEAD(, media_event_entry) pending_media_events;
	TAILQ_HEAD(, media_event_entry) free_media_events;
	struct media_event_entry *media_events_buffer;
	TAILQ_ENTRY(spdk_bdev_desc) link;

	uint64_t timeout_in_sec;
	spdk_bdev_io_timeout_cb cb_fn;
	void *cb_arg;
	struct spdk_poller *io_timeout_poller;
	struct spdk_bdev_module_claim *claim;
};

struct spdk_bdev_iostat_ctx {
	struct spdk_bdev_io_stat *stat;
	enum spdk_bdev_reset_stat_mode reset_mode;
	spdk_bdev_get_device_stat_cb cb;
	void *cb_arg;
};

struct set_qos_limit_ctx {
	void (*cb_fn)(void *cb_arg, int status);
	void *cb_arg;
	struct spdk_bdev *bdev;
};

struct spdk_bdev_channel_iter {
	spdk_bdev_for_each_channel_msg fn;
	spdk_bdev_for_each_channel_done cpl;
	struct spdk_io_channel_iter *i;
	void *ctx;
};

struct spdk_bdev_io_error_stat {
	uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS];
};

enum bdev_io_retry_state {
	BDEV_IO_RETRY_STATE_INVALID,
	BDEV_IO_RETRY_STATE_PULL,
	BDEV_IO_RETRY_STATE_PULL_MD,
	BDEV_IO_RETRY_STATE_SUBMIT,
	BDEV_IO_RETRY_STATE_PUSH,
	BDEV_IO_RETRY_STATE_PUSH_MD,
};
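
/* The address (bdev + 1) is used as the bdev layer's io_device handle so that it
 * cannot collide with an io_device that the bdev module itself may register using
 * the plain bdev pointer. The +1/-1 offsets in the two macros below must stay in
 * sync with each other.
 */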
#define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))
#define __io_ch_to_bdev_ch(io_ch)	((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch))
#define __io_ch_to_bdev_mgmt_ch(io_ch)	((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch))

static inline void bdev_io_complete(void *ctx);
static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io);
static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io);
static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io);

static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io);

static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				struct spdk_io_channel *ch, void *_ctx);
static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status);

static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				     struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
				     uint64_t num_blocks,
				     struct spdk_memory_domain *domain, void *domain_ctx,
				     struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
				     spdk_bdev_io_completion_cb cb, void *cb_arg);
static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				      struct iovec *iov, int iovcnt, void *md_buf,
				      uint64_t offset_blocks, uint64_t num_blocks,
				      struct spdk_memory_domain *domain, void *domain_ctx,
				      struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
				      uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw,
				      spdk_bdev_io_completion_cb cb, void *cb_arg);

static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
			       uint64_t offset, uint64_t length,
			       lock_range_cb cb_fn, void *cb_arg);

static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
				 uint64_t offset, uint64_t length,
				 lock_range_cb cb_fn, void *cb_arg);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);

static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
static void claim_reset(struct spdk_bdev *bdev);

static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch);

static bool bdev_io_should_split(struct spdk_bdev_io *bdev_io);

#define bdev_get_ext_io_opt(opts, field, defval) \
	((opts) != NULL ? SPDK_GET_FIELD(opts, field, defval) : (defval))

static inline void
bdev_ch_add_to_io_submitted(struct spdk_bdev_io *bdev_io)
{
	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
	bdev_io->internal.ch->queue_depth++;
}

static inline void
bdev_ch_remove_from_io_submitted(struct spdk_bdev_io *bdev_io)
{
	TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
	bdev_io->internal.ch->queue_depth--;
}

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(iobuf_small_cache_size);
	SET_FIELD(iobuf_large_cache_size);

	/* Do not remove this statement. Always update it when adding a new field, and do not
	 * forget to add a SET_FIELD statement for the new field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}
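
/* The SET_FIELD/offsetof pattern above (and in spdk_bdev_set_opts() below) only copies
 * fields that fit within the caller-provided opts_size, so binaries built against an
 * older, smaller struct spdk_bdev_opts keep working. A minimal caller sketch (local
 * variable names are illustrative only):
 *
 *	struct spdk_bdev_opts opts = {};
 *
 *	spdk_bdev_get_opts(&opts, sizeof(opts));
 *	opts.bdev_io_pool_size = 128 * 1024 - 1;
 *	if (spdk_bdev_set_opts(&opts) != 0) {
 *		// handle invalid combination, e.g. pool smaller than per-thread caches
 *	}
 */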
int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	uint32_t min_pool_size;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
	 */
	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
	if (opts->bdev_io_pool_size < min_pool_size) {
		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
			    spdk_thread_get_count());
		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
		return -1;
	}

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
		g_bdev_opts.field = opts->field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(iobuf_small_cache_size);
	SET_FIELD(iobuf_large_cache_size);

	g_bdev_opts.opts_size = opts->opts_size;

#undef SET_FIELD

	return 0;
}

static struct spdk_bdev *
bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev_name find;
	struct spdk_bdev_name *res;

	find.name = (char *)bdev_name;
	res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find);
	if (res != NULL) {
		return res->bdev;
	}

	return NULL;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev *bdev;

	spdk_spin_lock(&g_bdev_mgr.spinlock);
	bdev = bdev_get_by_name(bdev_name);
	spdk_spin_unlock(&g_bdev_mgr.spinlock);

	return bdev;
}

struct bdev_io_status_string {
	enum spdk_bdev_io_status status;
	const char *str;
};

static const struct bdev_io_status_string bdev_io_status_strings[] = {
	{ SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" },
	{ SPDK_BDEV_IO_STATUS_ABORTED, "aborted" },
	{ SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" },
	{ SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" },
	{ SPDK_BDEV_IO_STATUS_NOMEM, "nomem" },
	{ SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" },
	{ SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" },
	{ SPDK_BDEV_IO_STATUS_FAILED, "failed" },
	{ SPDK_BDEV_IO_STATUS_PENDING, "pending" },
	{ SPDK_BDEV_IO_STATUS_SUCCESS, "success" },
};

static const char *
bdev_io_status_get_string(enum spdk_bdev_io_status status)
{
	uint32_t i;

	for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) {
		if (bdev_io_status_strings[i].status == status) {
			return bdev_io_status_strings[i].str;
		}
	}

	return "reserved";
}

struct spdk_bdev_wait_for_examine_ctx {
	struct spdk_poller *poller;
	spdk_bdev_wait_for_examine_cb cb_fn;
	void *cb_arg;
};

static bool bdev_module_all_actions_completed(void);

static int
bdev_wait_for_examine_cb(void *arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx = arg;

	if (!bdev_module_all_actions_completed()) {
		return SPDK_POLLER_IDLE;
	}

	spdk_poller_unregister(&ctx->poller);
	ctx->cb_fn(ctx->cb_arg);
	free(ctx);

	return SPDK_POLLER_BUSY;
}

int
spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0);

	return 0;
}

struct spdk_bdev_examine_item {
	char *name;
	TAILQ_ENTRY(spdk_bdev_examine_item) link;
};

TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);

struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
			g_bdev_examine_allowlist);

static inline bool
bdev_examine_allowlist_check(const char *name)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		if (strcmp(name, item->name) == 0) {
			return true;
		}
	}
	return false;
}

static inline void
bdev_examine_allowlist_free(void)
{
	struct spdk_bdev_examine_item *item;
	while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
		item = TAILQ_FIRST(&g_bdev_examine_allowlist);
		TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
		free(item->name);
		free(item);
	}
}

static inline bool
bdev_in_examine_allowlist(struct spdk_bdev *bdev)
{
	struct spdk_bdev_alias *tmp;
	if (bdev_examine_allowlist_check(bdev->name)) {
		return true;
	}
	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
		if (bdev_examine_allowlist_check(tmp->alias.name)) {
			return true;
		}
	}
	return false;
}

static inline bool
bdev_ok_to_examine(struct spdk_bdev *bdev)
{
	/* Some bdevs may not support the READ command.
	 * Do not try to examine them.
	 */
	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_READ)) {
		return false;
	}

	if (g_bdev_opts.bdev_auto_examine) {
		return true;
	} else {
		return bdev_in_examine_allowlist(bdev);
	}
}
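
/* Examine flow, in brief: each registered module first gets an examine_config()
 * callback, which runs synchronously and may claim the bdev; the action_in_progress
 * check below expects the module to have called spdk_bdev_module_examine_done()
 * before examine_config() returns. Afterwards examine_disk() is invoked - on every
 * module if the bdev is unclaimed, or only on the claiming module(s) otherwise - and
 * may finish asynchronously, signalling completion through the same
 * spdk_bdev_module_examine_done() call.
 */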
static void
bdev_examine(struct spdk_bdev *bdev)
{
	struct spdk_bdev_module *module;
	struct spdk_bdev_module_claim *claim, *tmpclaim;
	uint32_t action;

	if (!bdev_ok_to_examine(bdev)) {
		return;
	}

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (module->examine_config) {
			spdk_spin_lock(&module->internal.spinlock);
			action = module->internal.action_in_progress;
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			module->examine_config(bdev);
			if (action != module->internal.action_in_progress) {
				SPDK_ERRLOG("examine_config for module %s did not call "
					    "spdk_bdev_module_examine_done()\n", module->name);
			}
		}
	}

	spdk_spin_lock(&bdev->internal.spinlock);

	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		/* Examine by all bdev modules */
		TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (module->examine_disk) {
				spdk_spin_lock(&module->internal.spinlock);
				module->internal.action_in_progress++;
				spdk_spin_unlock(&module->internal.spinlock);
				spdk_spin_unlock(&bdev->internal.spinlock);
				module->examine_disk(bdev);
				spdk_spin_lock(&bdev->internal.spinlock);
			}
		}
		break;
	case SPDK_BDEV_CLAIM_EXCL_WRITE:
		/* Examine by the one bdev module with a v1 claim */
		module = bdev->internal.claim.v1.module;
		if (module->examine_disk) {
			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			return;
		}
		break;
	default:
		/* Examine by all bdev modules with a v2 claim */
		assert(claim_type_is_v2(bdev->internal.claim_type));
		/*
		 * Removal of tailq nodes while iterating can cause the iteration to jump out of the
		 * list, perhaps accessing freed memory. Without protection, this could happen
		 * while the lock is dropped during the examine callback.
		 */
		bdev->internal.examine_in_progress++;

		TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
			module = claim->module;

			if (module == NULL) {
				/* This is a vestigial claim, held by examine_count */
				continue;
			}

			if (module->examine_disk == NULL) {
				continue;
			}

			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);

			/* Call examine_disk without holding internal.spinlock. */
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			spdk_spin_lock(&bdev->internal.spinlock);
		}

		assert(bdev->internal.examine_in_progress > 0);
		bdev->internal.examine_in_progress--;
		if (bdev->internal.examine_in_progress == 0) {
			/* Remove any claims that were released during examine_disk */
			TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) {
				if (claim->desc != NULL) {
					continue;
				}

				TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link);
				free(claim);
			}
			if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
				claim_reset(bdev);
			}
		}
	}

	spdk_spin_unlock(&bdev->internal.spinlock);
}

int
spdk_bdev_examine(const char *name)
{
	struct spdk_bdev *bdev;
	struct spdk_bdev_examine_item *item;
	struct spdk_thread *thread = spdk_get_thread();

	if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) {
		SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread,
			    thread ? spdk_thread_get_name(thread) : "null");
		return -EINVAL;
	}

	if (g_bdev_opts.bdev_auto_examine) {
		SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n");
		return -EINVAL;
	}

	if (bdev_examine_allowlist_check(name)) {
		SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
		return -EEXIST;
	}

	item = calloc(1, sizeof(*item));
	if (!item) {
		return -ENOMEM;
	}
	item->name = strdup(name);
	if (!item->name) {
		free(item);
		return -ENOMEM;
	}
	TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);

	bdev = spdk_bdev_get_by_name(name);
	if (bdev) {
		bdev_examine(bdev);
	}
	return 0;
}

static inline void
bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "method", "bdev_examine");
		spdk_json_write_named_object_begin(w, "params");
		spdk_json_write_named_string(w, "name", item->name);
		spdk_json_write_object_end(w);
		spdk_json_write_object_end(w);
	}
}

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, internal.link);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}
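
/* "Leaf" iteration skips bdevs that have been claimed by a module (i.e. bdevs consumed
 * as the base of some virtual bdev) and returns only bdevs with SPDK_BDEV_CLAIM_NONE.
 * A rough usage sketch, assuming it runs on the app thread while no bdevs are being
 * added or removed:
 *
 *	struct spdk_bdev *bdev;
 *
 *	for (bdev = spdk_bdev_first_leaf(); bdev != NULL; bdev = spdk_bdev_next_leaf(bdev)) {
 *		printf("usable bdev: %s\n", spdk_bdev_get_name(bdev));
 *	}
 */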
static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, internal.link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static inline bool
bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.f.has_memory_domain;
}

static inline bool
bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.f.has_accel_sequence;
}

static inline void
bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource,
			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	/* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io.
	 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth
	 * channels we will instead wait for half to complete.
	 */
	shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
					   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);

	assert(state != BDEV_IO_RETRY_STATE_INVALID);
	bdev_io->internal.retry_state = state;
	TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
}

static inline void
bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource,
			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	/* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while
	 * the queue isn't empty, so we don't need to update the nomem_threshold here */
	assert(!TAILQ_EMPTY(&shared_resource->nomem_io));

	assert(state != BDEV_IO_RETRY_STATE_INVALID);
	bdev_io->internal.retry_state = state;
	TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
}

void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	struct iovec *iovs;

	if (bdev_io->u.bdev.iovs == NULL) {
		bdev_io->u.bdev.iovs = &bdev_io->iov;
		bdev_io->u.bdev.iovcnt = 1;
	}

	iovs = bdev_io->u.bdev.iovs;

	assert(iovs != NULL);
	assert(bdev_io->u.bdev.iovcnt >= 1);

	iovs[0].iov_base = buf;
	iovs[0].iov_len = len;
}

void
spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
	bdev_io->u.bdev.md_buf = md_buf;
}

static bool
_is_buf_allocated(const struct iovec *iovs)
{
	if (iovs == NULL) {
		return false;
	}

	return iovs[0].iov_base != NULL;
}

static bool
_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
{
	int i;
	uintptr_t iov_base;

	if (spdk_likely(alignment == 1)) {
		return true;
	}

	for (i = 0; i < iovcnt; i++) {
		iov_base = (uintptr_t)iovs[i].iov_base;
		if ((iov_base & (alignment - 1)) != 0) {
			return false;
		}
	}

	return true;
}

static inline bool
bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
{
	if (!bdev_io_use_accel_sequence(bdev_io)) {
		return false;
	}

	/* For now, we don't allow splitting IOs with an accel sequence and will treat them as if
	 * bdev module didn't support accel sequences */
	return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.f.split;
}

static inline void
bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	bdev_ch->io_outstanding++;
	shared_resource->io_outstanding++;
}

static inline void
bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	assert(bdev_ch->io_outstanding > 0);
	assert(shared_resource->io_outstanding > 0);
	bdev_ch->io_outstanding--;
	shared_resource->io_outstanding--;
}

static void
bdev_io_submit_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;

	assert(bdev_io_use_accel_sequence(bdev_io));

	bdev_io->u.bdev.accel_sequence = NULL;
	bdev_io->internal.f.has_accel_sequence = false;

	if (spdk_unlikely(status != 0)) {
		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io_complete_unsubmitted(bdev_io);
		return;
	}

	bdev_io_submit(bdev_io);
}

static void
bdev_io_exec_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io->internal.data_transfer_cpl(bdev_io, status);
}

static void
bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status))
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
	assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
	assert(bdev_io_use_accel_sequence(bdev_io));

	/* Since the operations are appended during submission, they're in the opposite order than
	 * how we want to execute them for reads (i.e. we need to execute the most recently added
	 * operation first), so reverse the sequence before executing it.
	 */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence);
	}

	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	bdev_io_increment_outstanding(ch, ch->shared_resource);
	bdev_io->internal.data_transfer_cpl = cb_fn;

	spdk_accel_sequence_finish(bdev_io->internal.accel_sequence,
				   bdev_io_exec_sequence_cb, bdev_io);
}

static void
bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status)
{
	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
	void *buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		buf = bdev_io->internal.buf.ptr;
		bdev_io->internal.buf.ptr = NULL;
		bdev_io->internal.f.has_buf = false;
		bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
		bdev_io->internal.get_aux_buf_cb = NULL;
	} else {
		assert(bdev_io->internal.get_buf_cb != NULL);
		bdev_io->internal.get_buf_cb(ch, bdev_io, status);
		bdev_io->internal.get_buf_cb = NULL;
	}
}

static void
_bdev_io_pull_buffer_cpl(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;

	if (rc) {
		SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	bdev_io_get_buf_complete(bdev_io, !rc);
}

static void
bdev_io_pull_md_buf_done(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	assert(bdev_io->internal.data_transfer_cpl);
	bdev_io->internal.data_transfer_cpl(bdev_io, status);
}

static void
bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		assert(bdev_io->internal.f.has_bounce_buf);
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  &bdev_io->internal.bounce_buf.orig_md_iov, 1,
							  &bdev_io->internal.bounce_buf.md_iov, 1,
							  bdev_io_pull_md_buf_done, bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain), rc);
			}
		} else {
			memcpy(bdev_io->internal.bounce_buf.md_iov.iov_base,
			       bdev_io->internal.bounce_buf.orig_md_iov.iov_base,
			       bdev_io->internal.bounce_buf.orig_md_iov.iov_len);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD);
	} else {
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
	}
}
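
/* Bounce-buffer handling, in brief: when the request needs a locally accessible buffer
 * (for example because the caller's buffers are unaligned or live behind a memory
 * domain), the original iovecs are saved in internal.bounce_buf and the request is
 * redirected to a locally allocated buffer. "Pull" copies data (and separate metadata)
 * into the bounce buffer before a write is submitted; "push" copies it back to the
 * original buffers after a read completes. Both directions go through
 * spdk_memory_domain_pull_data()/push_data() when a memory domain is involved, or plain
 * memcpy()/iovec copies otherwise, and fall back to the nomem queue on -ENOMEM.
 */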
static void
_bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	assert(bdev_io->internal.f.has_bounce_buf);

	/* save original md_buf */
	bdev_io->internal.bounce_buf.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf;
	bdev_io->internal.bounce_buf.orig_md_iov.iov_len = len;
	bdev_io->internal.bounce_buf.md_iov.iov_base = md_buf;
	bdev_io->internal.bounce_buf.md_iov.iov_len = len;
	/* set bounce md_buf */
	bdev_io->u.bdev.md_buf = md_buf;

	bdev_io_pull_md_buf(bdev_io);
}

static void
_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len;
	void *buf;

	if (spdk_bdev_is_md_separate(bdev)) {
		assert(!bdev_io_use_accel_sequence(bdev_io));

		buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len;
		md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;

		assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0);

		if (bdev_io->u.bdev.md_buf != NULL) {
			_bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len);
			return;
		} else {
			spdk_bdev_io_set_md_buf(bdev_io, buf, md_len);
		}
	}

	bdev_io_get_buf_complete(bdev_io, true);
}

static inline void
bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc)
{
	if (rc) {
		SPDK_ERRLOG("Failed to get data buffer\n");
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
		return;
	}

	_bdev_io_set_md_buf(bdev_io);
}

static void
bdev_io_pull_data_done_and_track(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io_pull_data_done(bdev_io, status);
}

static void
bdev_io_pull_data(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	/* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a
	 * sequence, append a copy operation so that accel changes the src/dst buffers of the
	 * previous operation */
	if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io) ||
	    (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) {
		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
			assert(bdev_io_use_accel_sequence(bdev_io));
			assert(bdev_io->internal.f.has_bounce_buf);
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL,
						    bdev_io->internal.bounce_buf.orig_iovs,
						    bdev_io->internal.bounce_buf.orig_iovcnt,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
						    NULL, NULL);
		} else {
			/* We need to reverse the src/dst for reads */
			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
			assert(bdev_io_use_accel_sequence(bdev_io));
			assert(bdev_io->internal.f.has_bounce_buf);
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->internal.bounce_buf.orig_iovs,
						    bdev_io->internal.bounce_buf.orig_iovcnt,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL, NULL, NULL);
		}

		if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
			SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n",
				    bdev_io->internal.accel_sequence);
		}
	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		/* if this is write path, copy data from original buffer to bounce buffer */
		if (bdev_io_use_memory_domain(bdev_io)) {
			assert(bdev_io->internal.f.has_bounce_buf);
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  bdev_io->internal.bounce_buf.orig_iovs,
							  (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt,
							  bdev_io->u.bdev.iovs, 1,
							  bdev_io_pull_data_done_and_track,
							  bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to pull data from memory domain %s\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain));
			}
		} else {
			assert(bdev_io->u.bdev.iovcnt == 1);
			assert(bdev_io->internal.f.has_bounce_buf);
			spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base,
					      bdev_io->u.bdev.iovs[0].iov_len,
					      bdev_io->internal.bounce_buf.orig_iovs,
					      bdev_io->internal.bounce_buf.orig_iovcnt);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
	} else {
		bdev_io_pull_data_done(bdev_io, rc);
	}
}

static void
_bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len,
			      bdev_copy_bounce_buffer_cpl cpl_cb)
{
	struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource;

	assert(bdev_io->internal.f.has_bounce_buf == false);

	bdev_io->internal.data_transfer_cpl = cpl_cb;
	bdev_io->internal.f.has_bounce_buf = true;
	/* save original iovec */
	bdev_io->internal.bounce_buf.orig_iovs = bdev_io->u.bdev.iovs;
	bdev_io->internal.bounce_buf.orig_iovcnt = bdev_io->u.bdev.iovcnt;
	/* zero the other data members */
	bdev_io->internal.bounce_buf.iov.iov_base = NULL;
	bdev_io->internal.bounce_buf.md_iov.iov_base = NULL;
	bdev_io->internal.bounce_buf.orig_md_iov.iov_base = NULL;
	/* set bounce iov */
	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_buf.iov;
	bdev_io->u.bdev.iovcnt = 1;
	/* set bounce buffer for this operation */
	bdev_io->u.bdev.iovs[0].iov_base = buf;
	bdev_io->u.bdev.iovs[0].iov_len = len;
	/* Now we use 1 iov, the split condition could have been changed */
	bdev_io->internal.f.split = bdev_io_should_split(bdev_io);

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
	} else {
		bdev_io_pull_data(bdev_io);
	}
}

static void
_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	bool buf_allocated;
	uint64_t alignment;
	void *aligned_buf;

	bdev_io->internal.buf.ptr = buf;
	bdev_io->internal.f.has_buf = true;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		bdev_io_get_buf_complete(bdev_io, true);
		return;
	}

	alignment = spdk_bdev_get_buf_align(bdev);
	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
	aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));

	if (buf_allocated) {
		_bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl);
		/* Continue in completion callback */
		return;
	} else {
		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
	}

	_bdev_io_set_md_buf(bdev_io);
}

static inline uint64_t
bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len, alignment;

	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;

	/* 1-byte alignment needs 0 bytes of extra space, 64-byte alignment needs 63 bytes, etc. */
	alignment = spdk_bdev_get_buf_align(bdev) - 1;

	return len + alignment + md_len;
}

static void
_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
{
	struct spdk_bdev_mgmt_channel *ch;

	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
	spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len));
}

static void
bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	assert(bdev_io->internal.f.has_buf);
	_bdev_io_put_buf(bdev_io, bdev_io->internal.buf.ptr, bdev_io->internal.buf.len);
	bdev_io->internal.buf.ptr = NULL;
	bdev_io->internal.f.has_buf = false;
}

SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_put_aux_buf,
			      "spdk_bdev_io_put_aux_buf is deprecated", "v25.01", 0);

void
spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	SPDK_LOG_DEPRECATED(spdk_bdev_io_put_aux_buf);

	assert(buf != NULL);
	_bdev_io_put_buf(bdev_io, buf, len);
}

static inline void
bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch,
		    struct spdk_bdev_io *bdev_io)
{
	/* After a request is submitted to a bdev module, the ownership of an accel sequence
	 * associated with that bdev_io is transferred to the bdev module. So, clear the internal
	 * sequence pointer to make sure we won't touch it anymore. */
	if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE ||
	     bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) {
		assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
		bdev_io->internal.f.has_accel_sequence = false;
	}

	bdev->fn_table->submit_request(ioch, bdev_io);
}

static inline void
bdev_ch_resubmit_io(struct spdk_bdev_shared_resource *shared_resource, struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;

	bdev_io_increment_outstanding(bdev_io->internal.ch, shared_resource);
	bdev_io->internal.error.nvme.cdw0 = 0;
	bdev_io->num_retries++;
	bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
}

static void
bdev_shared_ch_retry_io(struct spdk_bdev_shared_resource *shared_resource)
{
	struct spdk_bdev_io *bdev_io;

	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 * Some drivers (such as nvme) cannot immediately take a new I/O in
		 * the context of a completion, because the resources for the I/O are
		 * not released until control returns to the bdev poller. Also, we
		 * may require several small I/O to complete before a larger I/O
		 * (that requires splitting) can be submitted.
		 */
		return;
	}

	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);

		switch (bdev_io->internal.retry_state) {
		case BDEV_IO_RETRY_STATE_SUBMIT:
			bdev_ch_resubmit_io(shared_resource, bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL:
			bdev_io_pull_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL_MD:
			bdev_io_pull_md_buf(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH:
			bdev_io_push_bounce_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH_MD:
			bdev_io_push_bounce_md_buf(bdev_io);
			break;
		default:
			assert(0 && "invalid retry state");
			break;
		}

		if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) {
			/* This IO completed again with NOMEM status, so break the loop and
			 * don't try anymore. Note that a bdev_io that fails with NOMEM
			 * always gets requeued at the front of the list, to maintain
			 * ordering.
			 */
			break;
		}
	}
}

static void
bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	bdev_shared_ch_retry_io(bdev_ch->shared_resource);
}

static int
bdev_no_mem_poller(void *ctx)
{
	struct spdk_bdev_shared_resource *shared_resource = ctx;

	spdk_poller_unregister(&shared_resource->nomem_poller);

	if (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_shared_ch_retry_io(shared_resource);
	}
	/* the retry cb may re-register the poller so double check */
	if (!TAILQ_EMPTY(&shared_resource->nomem_io) &&
	    shared_resource->io_outstanding == 0 && shared_resource->nomem_poller == NULL) {
		/* No IOs were submitted, try again */
		shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
						SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
	}

	return SPDK_POLLER_BUSY;
}

static inline bool
_bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
		bdev_queue_nomem_io_head(shared_resource, bdev_io, state);

		if (shared_resource->io_outstanding == 0 && !shared_resource->nomem_poller) {
			/* Special case when we have nomem IOs and no outstanding IOs whose completions
			 * could trigger retry of the queued IOs.
			 * Any IOs submitted may trigger retry of queued IOs; this poller handles the case
			 * when no new IOs are submitted, e.g. qd==1 */
			shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
							SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
		}
		/* If bdev module completed an I/O that has an accel sequence with NOMEM status, the
		 * ownership of that sequence is transferred back to the bdev layer, so we need to
		 * restore internal.accel_sequence to make sure that the sequence is handled
		 * correctly in case the I/O is later aborted. */
		if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
		     bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) {
			assert(!bdev_io_use_accel_sequence(bdev_io));
			bdev_io->internal.f.has_accel_sequence = true;
			bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence;
		}

		return true;
	}

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_ch_retry_io(bdev_ch);
	}

	return false;
}

static void
_bdev_io_complete_push_bounce_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	if (rc) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	/* We want to free the bounce buffer here since we know we're done with it (as opposed
	 * to waiting for the conditional free of internal.buf.ptr in spdk_bdev_free_io()). */
1632 */ 1633 bdev_io_put_buf(bdev_io); 1634 1635 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1636 bdev_ch_retry_io(ch); 1637 } 1638 1639 /* Continue with IO completion flow */ 1640 bdev_io_complete(bdev_io); 1641 } 1642 1643 static void 1644 bdev_io_push_bounce_md_buf_done(void *ctx, int rc) 1645 { 1646 struct spdk_bdev_io *bdev_io = ctx; 1647 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1648 1649 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1650 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1651 bdev_io->internal.f.has_bounce_buf = false; 1652 1653 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1654 bdev_ch_retry_io(ch); 1655 } 1656 1657 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1658 } 1659 1660 static inline void 1661 bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io) 1662 { 1663 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1664 int rc = 0; 1665 1666 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1667 assert(bdev_io->internal.f.has_bounce_buf); 1668 1669 /* do the same for metadata buffer */ 1670 if (spdk_unlikely(bdev_io->internal.bounce_buf.orig_md_iov.iov_base != NULL)) { 1671 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1672 1673 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1674 if (bdev_io_use_memory_domain(bdev_io)) { 1675 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1676 bdev_io_increment_outstanding(ch, ch->shared_resource); 1677 /* If memory domain is used then we need to call async push function */ 1678 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1679 bdev_io->internal.memory_domain_ctx, 1680 &bdev_io->internal.bounce_buf.orig_md_iov, 1681 (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt, 1682 &bdev_io->internal.bounce_buf.md_iov, 1, 1683 bdev_io_push_bounce_md_buf_done, 1684 bdev_io); 1685 if (rc == 0) { 1686 /* Continue IO completion in async callback */ 1687 return; 1688 } 1689 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1690 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1691 if (rc != -ENOMEM) { 1692 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1693 spdk_memory_domain_get_dma_device_id( 1694 bdev_io->internal.memory_domain)); 1695 } 1696 } else { 1697 memcpy(bdev_io->internal.bounce_buf.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1698 bdev_io->internal.bounce_buf.orig_md_iov.iov_len); 1699 } 1700 } 1701 } 1702 1703 if (spdk_unlikely(rc == -ENOMEM)) { 1704 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD); 1705 } else { 1706 assert(bdev_io->internal.data_transfer_cpl); 1707 bdev_io->internal.f.has_bounce_buf = false; 1708 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1709 } 1710 } 1711 1712 static inline void 1713 bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc) 1714 { 1715 assert(bdev_io->internal.data_transfer_cpl); 1716 if (rc) { 1717 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1718 return; 1719 } 1720 1721 /* set original buffer for this io */ 1722 bdev_io->u.bdev.iovcnt = bdev_io->internal.bounce_buf.orig_iovcnt; 1723 bdev_io->u.bdev.iovs = bdev_io->internal.bounce_buf.orig_iovs; 1724 1725 /* We don't set bdev_io->internal.f.has_bounce_buf to false here because 1726 * we still need to clear the md buf */ 1727 1728 bdev_io_push_bounce_md_buf(bdev_io); 1729 } 1730 1731 static void 1732 bdev_io_push_bounce_data_done_and_track(void *ctx, int status) 1733 { 1734 struct spdk_bdev_io *bdev_io = ctx; 1735 struct 
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io_push_bounce_data_done(bdev_io, status);
}

static inline void
bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
	assert(!bdev_io_use_accel_sequence(bdev_io));
	assert(bdev_io->internal.f.has_bounce_buf);

	/* if this is read path, copy data from bounce buffer to original buffer */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			/* If memory domain is used then we need to call async push function */
			rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  bdev_io->internal.bounce_buf.orig_iovs,
							  (uint32_t)bdev_io->internal.bounce_buf.orig_iovcnt,
							  &bdev_io->internal.bounce_buf.iov, 1,
							  bdev_io_push_bounce_data_done_and_track,
							  bdev_io);
			if (rc == 0) {
				/* Continue IO completion in async callback */
				return;
			}

			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to push data to memory domain %s\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain));
			}
		} else {
			spdk_copy_buf_to_iovs(bdev_io->internal.bounce_buf.orig_iovs,
					      bdev_io->internal.bounce_buf.orig_iovcnt,
					      bdev_io->internal.bounce_buf.iov.iov_base,
					      bdev_io->internal.bounce_buf.iov.iov_len);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH);
	} else {
		bdev_io_push_bounce_data_done(bdev_io, rc);
	}
}

static inline void
_bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb)
{
	bdev_io->internal.data_transfer_cpl = cpl_cb;
	bdev_io_push_bounce_data(bdev_io);
}

static void
bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf)
{
	struct spdk_bdev_io *bdev_io;

	bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf);
	_bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf.len);
}

static void
bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev_mgmt_channel *mgmt_ch;
	uint64_t max_len;
	void *buf;

	assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread());
	mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
	max_len = bdev_io_get_max_buf_len(bdev_io, len);

	if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) {
		SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len);
		bdev_io_get_buf_complete(bdev_io, false);
		return;
	}

	bdev_io->internal.buf.len = len;
	buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf,
			     bdev_io_get_iobuf_cb);
	if (buf != NULL) {
		_bdev_io_set_buf(bdev_io, buf, len);
	}
}
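
/* spdk_bdev_io_get_buf() below is what bdev modules typically call from their
 * submit_request() path for reads when bdev_io->u.bdev.iovs may not be populated yet.
 * A minimal sketch of that pattern (function names in the sketch are illustrative):
 *
 *	static void
 *	example_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
 *	{
 *		if (!success) {
 *			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
 *			return;
 *		}
 *		// bdev_io->u.bdev.iovs now points at a buffer of at least the requested length
 *	}
 *
 *	...
 *	spdk_bdev_io_get_buf(bdev_io, example_get_buf_cb,
 *			     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
 */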
_bdev_io_set_buf(bdev_io, buf, len); 1835 } 1836 } 1837 1838 void 1839 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1840 { 1841 struct spdk_bdev *bdev = bdev_io->bdev; 1842 uint64_t alignment; 1843 1844 assert(cb != NULL); 1845 bdev_io->internal.get_buf_cb = cb; 1846 1847 alignment = spdk_bdev_get_buf_align(bdev); 1848 1849 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1850 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1851 /* Buffer already present and aligned */ 1852 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1853 return; 1854 } 1855 1856 bdev_io_get_buf(bdev_io, len); 1857 } 1858 1859 static void 1860 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1861 bool success) 1862 { 1863 if (!success) { 1864 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1865 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1866 bdev_io_complete_unsubmitted(bdev_io); 1867 return; 1868 } 1869 1870 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 1871 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1872 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 1873 return; 1874 } 1875 /* For reads we'll execute the sequence after the data is read, so, for now, only 1876 * clear out accel_sequence pointer and submit the IO */ 1877 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1878 bdev_io->u.bdev.accel_sequence = NULL; 1879 } 1880 1881 bdev_io_submit(bdev_io); 1882 } 1883 1884 static void 1885 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1886 uint64_t len) 1887 { 1888 assert(cb != NULL); 1889 bdev_io->internal.get_buf_cb = cb; 1890 1891 bdev_io_get_buf(bdev_io, len); 1892 } 1893 1894 1895 SPDK_LOG_DEPRECATION_REGISTER(spdk_bdev_io_get_aux_buf, 1896 "spdk_bdev_io_get_aux_buf is deprecated", "v25.01", 0); 1897 1898 void 1899 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1900 { 1901 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1902 1903 SPDK_LOG_DEPRECATED(spdk_bdev_io_get_aux_buf); 1904 1905 assert(cb != NULL); 1906 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1907 bdev_io->internal.get_aux_buf_cb = cb; 1908 bdev_io_get_buf(bdev_io, len); 1909 } 1910 1911 static int 1912 bdev_module_get_max_ctx_size(void) 1913 { 1914 struct spdk_bdev_module *bdev_module; 1915 int max_bdev_module_size = 0; 1916 1917 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1918 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1919 max_bdev_module_size = bdev_module->get_ctx_size(); 1920 } 1921 } 1922 1923 return max_bdev_module_size; 1924 } 1925 1926 static void 1927 bdev_enable_histogram_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1928 { 1929 if (!bdev->internal.histogram_enabled) { 1930 return; 1931 } 1932 1933 spdk_json_write_object_begin(w); 1934 spdk_json_write_named_string(w, "method", "bdev_enable_histogram"); 1935 1936 spdk_json_write_named_object_begin(w, "params"); 1937 spdk_json_write_named_string(w, "name", bdev->name); 1938 1939 spdk_json_write_named_bool(w, "enable", bdev->internal.histogram_enabled); 1940 1941 if (bdev->internal.histogram_io_type) { 1942 spdk_json_write_named_string(w, "opc", 1943 spdk_bdev_get_io_type_name(bdev->internal.histogram_io_type)); 1944 } 1945 1946 spdk_json_write_object_end(w); 1947 1948 spdk_json_write_object_end(w); 1949 } 1950 
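/* Emit a "bdev_set_qos_limit" RPC entry for this bdev if any QoS rate limits
 * are configured; only limits that are currently non-zero are written. As an
 * illustrative example (the bdev name and value below are hypothetical), a
 * bdev "Malloc0" capped at 20000 read/write IOs per second would produce
 * roughly:
 *   { "method": "bdev_set_qos_limit",
 *     "params": { "name": "Malloc0", "rw_ios_per_sec": 20000 } }
 */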
1951 static void 1952 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1953 { 1954 int i; 1955 struct spdk_bdev_qos *qos = bdev->internal.qos; 1956 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1957 1958 if (!qos) { 1959 return; 1960 } 1961 1962 spdk_bdev_get_qos_rate_limits(bdev, limits); 1963 1964 spdk_json_write_object_begin(w); 1965 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1966 1967 spdk_json_write_named_object_begin(w, "params"); 1968 spdk_json_write_named_string(w, "name", bdev->name); 1969 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1970 if (limits[i] > 0) { 1971 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1972 } 1973 } 1974 spdk_json_write_object_end(w); 1975 1976 spdk_json_write_object_end(w); 1977 } 1978 1979 void 1980 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1981 { 1982 struct spdk_bdev_module *bdev_module; 1983 struct spdk_bdev *bdev; 1984 1985 assert(w != NULL); 1986 1987 spdk_json_write_array_begin(w); 1988 1989 spdk_json_write_object_begin(w); 1990 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1991 spdk_json_write_named_object_begin(w, "params"); 1992 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1993 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1994 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1995 spdk_json_write_named_uint32(w, "iobuf_small_cache_size", g_bdev_opts.iobuf_small_cache_size); 1996 spdk_json_write_named_uint32(w, "iobuf_large_cache_size", g_bdev_opts.iobuf_large_cache_size); 1997 spdk_json_write_object_end(w); 1998 spdk_json_write_object_end(w); 1999 2000 bdev_examine_allowlist_config_json(w); 2001 2002 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2003 if (bdev_module->config_json) { 2004 bdev_module->config_json(w); 2005 } 2006 } 2007 2008 spdk_spin_lock(&g_bdev_mgr.spinlock); 2009 2010 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 2011 if (bdev->fn_table->write_config_json) { 2012 bdev->fn_table->write_config_json(bdev, w); 2013 } 2014 2015 bdev_qos_config_json(bdev, w); 2016 bdev_enable_histogram_config_json(bdev, w); 2017 } 2018 2019 spdk_spin_unlock(&g_bdev_mgr.spinlock); 2020 2021 /* This has to be last RPC in array to make sure all bdevs finished examine */ 2022 spdk_json_write_object_begin(w); 2023 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 2024 spdk_json_write_object_end(w); 2025 2026 spdk_json_write_array_end(w); 2027 } 2028 2029 static void 2030 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 2031 { 2032 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 2033 struct spdk_bdev_io *bdev_io; 2034 2035 spdk_iobuf_channel_fini(&ch->iobuf); 2036 2037 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 2038 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2039 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2040 ch->per_thread_cache_count--; 2041 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2042 } 2043 2044 assert(ch->per_thread_cache_count == 0); 2045 } 2046 2047 static int 2048 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 2049 { 2050 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 2051 struct spdk_bdev_io *bdev_io; 2052 uint32_t i; 2053 int rc; 2054 2055 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", 2056 g_bdev_opts.iobuf_small_cache_size, 2057 g_bdev_opts.iobuf_large_cache_size); 2058 if (rc != 0) { 2059 
SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 2060 return -1; 2061 } 2062 2063 STAILQ_INIT(&ch->per_thread_cache); 2064 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 2065 2066 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 2067 ch->per_thread_cache_count = 0; 2068 for (i = 0; i < ch->bdev_io_cache_size; i++) { 2069 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2070 if (bdev_io == NULL) { 2071 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 2072 assert(false); 2073 bdev_mgmt_channel_destroy(io_device, ctx_buf); 2074 return -1; 2075 } 2076 ch->per_thread_cache_count++; 2077 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2078 } 2079 2080 TAILQ_INIT(&ch->shared_resources); 2081 TAILQ_INIT(&ch->io_wait_queue); 2082 2083 return 0; 2084 } 2085 2086 static void 2087 bdev_init_complete(int rc) 2088 { 2089 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 2090 void *cb_arg = g_init_cb_arg; 2091 struct spdk_bdev_module *m; 2092 2093 g_bdev_mgr.init_complete = true; 2094 g_init_cb_fn = NULL; 2095 g_init_cb_arg = NULL; 2096 2097 /* 2098 * For modules that need to know when subsystem init is complete, 2099 * inform them now. 2100 */ 2101 if (rc == 0) { 2102 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2103 if (m->init_complete) { 2104 m->init_complete(); 2105 } 2106 } 2107 } 2108 2109 cb_fn(cb_arg, rc); 2110 } 2111 2112 static bool 2113 bdev_module_all_actions_completed(void) 2114 { 2115 struct spdk_bdev_module *m; 2116 2117 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2118 if (m->internal.action_in_progress > 0) { 2119 return false; 2120 } 2121 } 2122 return true; 2123 } 2124 2125 static void 2126 bdev_module_action_complete(void) 2127 { 2128 /* 2129 * Don't finish bdev subsystem initialization if 2130 * module pre-initialization is still in progress, or 2131 * the subsystem been already initialized. 2132 */ 2133 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 2134 return; 2135 } 2136 2137 /* 2138 * Check all bdev modules for inits/examinations in progress. If any 2139 * exist, return immediately since we cannot finish bdev subsystem 2140 * initialization until all are completed. 2141 */ 2142 if (!bdev_module_all_actions_completed()) { 2143 return; 2144 } 2145 2146 /* 2147 * Modules already finished initialization - now that all 2148 * the bdev modules have finished their asynchronous I/O 2149 * processing, the entire bdev layer can be marked as complete. 
2150 */ 2151 bdev_init_complete(0); 2152 } 2153 2154 static void 2155 bdev_module_action_done(struct spdk_bdev_module *module) 2156 { 2157 spdk_spin_lock(&module->internal.spinlock); 2158 assert(module->internal.action_in_progress > 0); 2159 module->internal.action_in_progress--; 2160 spdk_spin_unlock(&module->internal.spinlock); 2161 bdev_module_action_complete(); 2162 } 2163 2164 void 2165 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 2166 { 2167 assert(module->async_init); 2168 bdev_module_action_done(module); 2169 } 2170 2171 void 2172 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 2173 { 2174 bdev_module_action_done(module); 2175 } 2176 2177 /** The last initialized bdev module */ 2178 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 2179 2180 static void 2181 bdev_init_failed(void *cb_arg) 2182 { 2183 struct spdk_bdev_module *module = cb_arg; 2184 2185 spdk_spin_lock(&module->internal.spinlock); 2186 assert(module->internal.action_in_progress > 0); 2187 module->internal.action_in_progress--; 2188 spdk_spin_unlock(&module->internal.spinlock); 2189 bdev_init_complete(-1); 2190 } 2191 2192 static int 2193 bdev_modules_init(void) 2194 { 2195 struct spdk_bdev_module *module; 2196 int rc = 0; 2197 2198 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2199 g_resume_bdev_module = module; 2200 if (module->async_init) { 2201 spdk_spin_lock(&module->internal.spinlock); 2202 module->internal.action_in_progress = 1; 2203 spdk_spin_unlock(&module->internal.spinlock); 2204 } 2205 rc = module->module_init(); 2206 if (rc != 0) { 2207 /* Bump action_in_progress to prevent other modules from completion of modules_init 2208 * Send message to defer application shutdown until resources are cleaned up */ 2209 spdk_spin_lock(&module->internal.spinlock); 2210 module->internal.action_in_progress = 1; 2211 spdk_spin_unlock(&module->internal.spinlock); 2212 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 2213 return rc; 2214 } 2215 } 2216 2217 g_resume_bdev_module = NULL; 2218 return 0; 2219 } 2220 2221 void 2222 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 2223 { 2224 int rc = 0; 2225 char mempool_name[32]; 2226 2227 assert(cb_fn != NULL); 2228 2229 g_init_cb_fn = cb_fn; 2230 g_init_cb_arg = cb_arg; 2231 2232 spdk_notify_type_register("bdev_register"); 2233 spdk_notify_type_register("bdev_unregister"); 2234 2235 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2236 2237 rc = spdk_iobuf_register_module("bdev"); 2238 if (rc != 0) { 2239 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2240 bdev_init_complete(-1); 2241 return; 2242 } 2243 2244 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2245 g_bdev_opts.bdev_io_pool_size, 2246 sizeof(struct spdk_bdev_io) + 2247 bdev_module_get_max_ctx_size(), 2248 0, 2249 SPDK_ENV_NUMA_ID_ANY); 2250 2251 if (g_bdev_mgr.bdev_io_pool == NULL) { 2252 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2253 bdev_init_complete(-1); 2254 return; 2255 } 2256 2257 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2258 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2259 if (!g_bdev_mgr.zero_buffer) { 2260 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2261 bdev_init_complete(-1); 2262 return; 2263 } 2264 2265 #ifdef SPDK_CONFIG_VTUNE 2266 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2267 #endif 2268 2269 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 2270 
bdev_mgmt_channel_destroy, 2271 sizeof(struct spdk_bdev_mgmt_channel), 2272 "bdev_mgr"); 2273 2274 rc = bdev_modules_init(); 2275 g_bdev_mgr.module_init_complete = true; 2276 if (rc != 0) { 2277 SPDK_ERRLOG("bdev modules init failed\n"); 2278 return; 2279 } 2280 2281 bdev_module_action_complete(); 2282 } 2283 2284 static void 2285 bdev_mgr_unregister_cb(void *io_device) 2286 { 2287 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2288 2289 if (g_bdev_mgr.bdev_io_pool) { 2290 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2291 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2292 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2293 g_bdev_opts.bdev_io_pool_size); 2294 } 2295 2296 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2297 } 2298 2299 spdk_free(g_bdev_mgr.zero_buffer); 2300 2301 bdev_examine_allowlist_free(); 2302 2303 cb_fn(g_fini_cb_arg); 2304 g_fini_cb_fn = NULL; 2305 g_fini_cb_arg = NULL; 2306 g_bdev_mgr.init_complete = false; 2307 g_bdev_mgr.module_init_complete = false; 2308 } 2309 2310 static void 2311 bdev_module_fini_iter(void *arg) 2312 { 2313 struct spdk_bdev_module *bdev_module; 2314 2315 /* FIXME: Handling initialization failures is broken now, 2316 * so we won't even try cleaning up after successfully 2317 * initialized modules. if module_init_complete is false, 2318 * just call spdk_bdev_mgr_unregister_cb 2319 */ 2320 if (!g_bdev_mgr.module_init_complete) { 2321 bdev_mgr_unregister_cb(NULL); 2322 return; 2323 } 2324 2325 /* Start iterating from the last touched module */ 2326 if (!g_resume_bdev_module) { 2327 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2328 } else { 2329 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2330 internal.tailq); 2331 } 2332 2333 while (bdev_module) { 2334 if (bdev_module->async_fini) { 2335 /* Save our place so we can resume later. We must 2336 * save the variable here, before calling module_fini() 2337 * below, because in some cases the module may immediately 2338 * call spdk_bdev_module_fini_done() and re-enter 2339 * this function to continue iterating. */ 2340 g_resume_bdev_module = bdev_module; 2341 } 2342 2343 if (bdev_module->module_fini) { 2344 bdev_module->module_fini(); 2345 } 2346 2347 if (bdev_module->async_fini) { 2348 return; 2349 } 2350 2351 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2352 internal.tailq); 2353 } 2354 2355 g_resume_bdev_module = NULL; 2356 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2357 } 2358 2359 void 2360 spdk_bdev_module_fini_done(void) 2361 { 2362 if (spdk_get_thread() != g_fini_thread) { 2363 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2364 } else { 2365 bdev_module_fini_iter(NULL); 2366 } 2367 } 2368 2369 static void 2370 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2371 { 2372 struct spdk_bdev *bdev = cb_arg; 2373 2374 if (bdeverrno && bdev) { 2375 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2376 bdev->name); 2377 2378 /* 2379 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2380 * bdev; try to continue by manually removing this bdev from the list and continue 2381 * with the next bdev in the list. 
2382 */ 2383 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2384 } 2385 2386 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2387 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2388 /* 2389 * Bdev module finish need to be deferred as we might be in the middle of some context 2390 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2391 * after returning. 2392 */ 2393 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2394 return; 2395 } 2396 2397 /* 2398 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 2399 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2400 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2401 * base bdevs. 2402 * 2403 * Also, walk the list in the reverse order. 2404 */ 2405 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2406 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2407 spdk_spin_lock(&bdev->internal.spinlock); 2408 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2409 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2410 spdk_spin_unlock(&bdev->internal.spinlock); 2411 continue; 2412 } 2413 spdk_spin_unlock(&bdev->internal.spinlock); 2414 2415 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2416 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2417 return; 2418 } 2419 2420 /* 2421 * If any bdev fails to unclaim underlying bdev properly, we may face the 2422 * case of bdev list consisting of claimed bdevs only (if claims are managed 2423 * correctly, this would mean there's a loop in the claims graph which is 2424 * clearly impossible). Warn and unregister last bdev on the list then. 2425 */ 2426 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2427 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2428 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2429 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2430 return; 2431 } 2432 } 2433 2434 static void 2435 bdev_module_fini_start_iter(void *arg) 2436 { 2437 struct spdk_bdev_module *bdev_module; 2438 2439 if (!g_resume_bdev_module) { 2440 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2441 } else { 2442 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2443 } 2444 2445 while (bdev_module) { 2446 if (bdev_module->async_fini_start) { 2447 /* Save our place so we can resume later. We must 2448 * save the variable here, before calling fini_start() 2449 * below, because in some cases the module may immediately 2450 * call spdk_bdev_module_fini_start_done() and re-enter 2451 * this function to continue iterating. 
*/ 2452 g_resume_bdev_module = bdev_module; 2453 } 2454 2455 if (bdev_module->fini_start) { 2456 bdev_module->fini_start(); 2457 } 2458 2459 if (bdev_module->async_fini_start) { 2460 return; 2461 } 2462 2463 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2464 } 2465 2466 g_resume_bdev_module = NULL; 2467 2468 bdev_finish_unregister_bdevs_iter(NULL, 0); 2469 } 2470 2471 void 2472 spdk_bdev_module_fini_start_done(void) 2473 { 2474 if (spdk_get_thread() != g_fini_thread) { 2475 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2476 } else { 2477 bdev_module_fini_start_iter(NULL); 2478 } 2479 } 2480 2481 static void 2482 bdev_finish_wait_for_examine_done(void *cb_arg) 2483 { 2484 bdev_module_fini_start_iter(NULL); 2485 } 2486 2487 static void bdev_open_async_fini(void); 2488 2489 void 2490 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2491 { 2492 int rc; 2493 2494 assert(cb_fn != NULL); 2495 2496 g_fini_thread = spdk_get_thread(); 2497 2498 g_fini_cb_fn = cb_fn; 2499 g_fini_cb_arg = cb_arg; 2500 2501 bdev_open_async_fini(); 2502 2503 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2504 if (rc != 0) { 2505 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2506 bdev_finish_wait_for_examine_done(NULL); 2507 } 2508 } 2509 2510 struct spdk_bdev_io * 2511 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2512 { 2513 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2514 struct spdk_bdev_io *bdev_io; 2515 2516 if (ch->per_thread_cache_count > 0) { 2517 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2518 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2519 ch->per_thread_cache_count--; 2520 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2521 /* 2522 * Don't try to look for bdev_ios in the global pool if there are 2523 * waiters on bdev_ios - we don't want this caller to jump the line. 2524 */ 2525 bdev_io = NULL; 2526 } else { 2527 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2528 } 2529 2530 return bdev_io; 2531 } 2532 2533 void 2534 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2535 { 2536 struct spdk_bdev_mgmt_channel *ch; 2537 2538 assert(bdev_io != NULL); 2539 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2540 2541 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2542 2543 if (bdev_io->internal.f.has_buf) { 2544 bdev_io_put_buf(bdev_io); 2545 } 2546 2547 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2548 ch->per_thread_cache_count++; 2549 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2550 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2551 struct spdk_bdev_io_wait_entry *entry; 2552 2553 entry = TAILQ_FIRST(&ch->io_wait_queue); 2554 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2555 entry->cb_fn(entry->cb_arg); 2556 } 2557 } else { 2558 /* We should never have a full cache with entries on the io wait queue. 
*/ 2559 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2560 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2561 } 2562 } 2563 2564 static bool 2565 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2566 { 2567 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2568 2569 switch (limit) { 2570 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2571 return true; 2572 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2573 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2574 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2575 return false; 2576 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2577 default: 2578 return false; 2579 } 2580 } 2581 2582 static bool 2583 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2584 { 2585 switch (bdev_io->type) { 2586 case SPDK_BDEV_IO_TYPE_NVME_IO: 2587 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2588 case SPDK_BDEV_IO_TYPE_READ: 2589 case SPDK_BDEV_IO_TYPE_WRITE: 2590 return true; 2591 case SPDK_BDEV_IO_TYPE_ZCOPY: 2592 if (bdev_io->u.bdev.zcopy.start) { 2593 return true; 2594 } else { 2595 return false; 2596 } 2597 default: 2598 return false; 2599 } 2600 } 2601 2602 static bool 2603 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2604 { 2605 switch (bdev_io->type) { 2606 case SPDK_BDEV_IO_TYPE_NVME_IO: 2607 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2608 /* Bit 1 (0x2) set for read operation */ 2609 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2610 return true; 2611 } else { 2612 return false; 2613 } 2614 case SPDK_BDEV_IO_TYPE_READ: 2615 return true; 2616 case SPDK_BDEV_IO_TYPE_ZCOPY: 2617 /* Populate to read from disk */ 2618 if (bdev_io->u.bdev.zcopy.populate) { 2619 return true; 2620 } else { 2621 return false; 2622 } 2623 default: 2624 return false; 2625 } 2626 } 2627 2628 static uint64_t 2629 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2630 { 2631 struct spdk_bdev *bdev = bdev_io->bdev; 2632 2633 switch (bdev_io->type) { 2634 case SPDK_BDEV_IO_TYPE_NVME_IO: 2635 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2636 return bdev_io->u.nvme_passthru.nbytes; 2637 case SPDK_BDEV_IO_TYPE_READ: 2638 case SPDK_BDEV_IO_TYPE_WRITE: 2639 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2640 case SPDK_BDEV_IO_TYPE_ZCOPY: 2641 /* Track the data in the start phase only */ 2642 if (bdev_io->u.bdev.zcopy.start) { 2643 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2644 } else { 2645 return 0; 2646 } 2647 default: 2648 return 0; 2649 } 2650 } 2651 2652 static inline bool 2653 bdev_qos_rw_queue_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2654 { 2655 int64_t remaining_this_timeslice; 2656 2657 if (!limit->max_per_timeslice) { 2658 /* The QoS is disabled */ 2659 return false; 2660 } 2661 2662 remaining_this_timeslice = __atomic_sub_fetch(&limit->remaining_this_timeslice, delta, 2663 __ATOMIC_RELAXED); 2664 if (remaining_this_timeslice + (int64_t)delta > 0) { 2665 /* There was still a quota for this delta -> the IO shouldn't be queued 2666 * 2667 * We allow a slight quota overrun here so an IO bigger than the per-timeslice 2668 * quota can be allowed once in a while. Such an overrun is then taken into account in 2669 * the QoS poller, where the next timeslice quota is calculated. 2670 */ 2671 return false; 2672 } 2673 2674 /* There was no quota for this delta -> the IO should be queued 2675 * The remaining_this_timeslice must be rewound so it reflects the real 2676 * amount of IOs or bytes allowed. 
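 * As an illustrative example (byte counts are hypothetical): if 4096 bytes are
 * left in the timeslice and a 16384-byte I/O arrives, the subtraction above
 * leaves -12288, but since the pre-subtraction value was still positive the
 * I/O is allowed and the 12288-byte overrun is charged against the next
 * timeslice. Only when the quota was already exhausted (pre-subtraction value
 * <= 0) do we get here: the delta is added back below and the I/O is queued
 * until the QoS poller refills the timeslice.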
2677 */ 2678 __atomic_add_fetch( 2679 &limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2680 return true; 2681 } 2682 2683 static inline void 2684 bdev_qos_rw_rewind_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2685 { 2686 __atomic_add_fetch(&limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2687 } 2688 2689 static bool 2690 bdev_qos_rw_iops_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2691 { 2692 return bdev_qos_rw_queue_io(limit, io, 1); 2693 } 2694 2695 static void 2696 bdev_qos_rw_iops_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2697 { 2698 bdev_qos_rw_rewind_io(limit, io, 1); 2699 } 2700 2701 static bool 2702 bdev_qos_rw_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2703 { 2704 return bdev_qos_rw_queue_io(limit, io, bdev_get_io_size_in_byte(io)); 2705 } 2706 2707 static void 2708 bdev_qos_rw_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2709 { 2710 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2711 } 2712 2713 static bool 2714 bdev_qos_r_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2715 { 2716 if (bdev_is_read_io(io) == false) { 2717 return false; 2718 } 2719 2720 return bdev_qos_rw_bps_queue(limit, io); 2721 } 2722 2723 static void 2724 bdev_qos_r_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2725 { 2726 if (bdev_is_read_io(io) != false) { 2727 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2728 } 2729 } 2730 2731 static bool 2732 bdev_qos_w_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2733 { 2734 if (bdev_is_read_io(io) == true) { 2735 return false; 2736 } 2737 2738 return bdev_qos_rw_bps_queue(limit, io); 2739 } 2740 2741 static void 2742 bdev_qos_w_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2743 { 2744 if (bdev_is_read_io(io) != true) { 2745 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2746 } 2747 } 2748 2749 static void 2750 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2751 { 2752 int i; 2753 2754 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2755 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2756 qos->rate_limits[i].queue_io = NULL; 2757 continue; 2758 } 2759 2760 switch (i) { 2761 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2762 qos->rate_limits[i].queue_io = bdev_qos_rw_iops_queue; 2763 qos->rate_limits[i].rewind_quota = bdev_qos_rw_iops_rewind_quota; 2764 break; 2765 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2766 qos->rate_limits[i].queue_io = bdev_qos_rw_bps_queue; 2767 qos->rate_limits[i].rewind_quota = bdev_qos_rw_bps_rewind_quota; 2768 break; 2769 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2770 qos->rate_limits[i].queue_io = bdev_qos_r_bps_queue; 2771 qos->rate_limits[i].rewind_quota = bdev_qos_r_bps_rewind_quota; 2772 break; 2773 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2774 qos->rate_limits[i].queue_io = bdev_qos_w_bps_queue; 2775 qos->rate_limits[i].rewind_quota = bdev_qos_w_bps_rewind_quota; 2776 break; 2777 default: 2778 break; 2779 } 2780 } 2781 } 2782 2783 static void 2784 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2785 struct spdk_bdev_io *bdev_io, 2786 enum spdk_bdev_io_status status) 2787 { 2788 bdev_io->internal.f.in_submit_request = true; 2789 bdev_io_increment_outstanding(bdev_ch, bdev_ch->shared_resource); 2790 spdk_bdev_io_complete(bdev_io, status); 2791 bdev_io->internal.f.in_submit_request = false; 
2792 } 2793 2794 static inline void 2795 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2796 { 2797 struct spdk_bdev *bdev = bdev_io->bdev; 2798 struct spdk_io_channel *ch = bdev_ch->channel; 2799 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2800 2801 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2802 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2803 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2804 2805 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2806 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2807 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2808 SPDK_BDEV_IO_STATUS_SUCCESS); 2809 return; 2810 } 2811 } 2812 2813 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2814 bdev_io->bdev->split_on_write_unit && 2815 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2816 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2817 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2818 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2819 return; 2820 } 2821 2822 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2823 bdev_io_increment_outstanding(bdev_ch, shared_resource); 2824 bdev_io->internal.f.in_submit_request = true; 2825 bdev_submit_request(bdev, ch, bdev_io); 2826 bdev_io->internal.f.in_submit_request = false; 2827 } else { 2828 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT); 2829 if (shared_resource->nomem_threshold == 0 && shared_resource->io_outstanding == 0) { 2830 /* Special case when we have nomem IOs and no outstanding IOs which completions 2831 * could trigger retry of queued IOs */ 2832 bdev_shared_ch_retry_io(shared_resource); 2833 } 2834 } 2835 } 2836 2837 static bool 2838 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2839 { 2840 int i; 2841 2842 if (bdev_qos_io_to_limit(bdev_io) == true) { 2843 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2844 if (!qos->rate_limits[i].queue_io) { 2845 continue; 2846 } 2847 2848 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2849 bdev_io) == true) { 2850 for (i -= 1; i >= 0 ; i--) { 2851 if (!qos->rate_limits[i].queue_io) { 2852 continue; 2853 } 2854 2855 qos->rate_limits[i].rewind_quota(&qos->rate_limits[i], bdev_io); 2856 } 2857 return true; 2858 } 2859 } 2860 } 2861 2862 return false; 2863 } 2864 2865 static int 2866 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2867 { 2868 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2869 int submitted_ios = 0; 2870 2871 TAILQ_FOREACH_SAFE(bdev_io, &ch->qos_queued_io, internal.link, tmp) { 2872 if (!bdev_qos_queue_io(qos, bdev_io)) { 2873 TAILQ_REMOVE(&ch->qos_queued_io, bdev_io, internal.link); 2874 bdev_io_do_submit(ch, bdev_io); 2875 2876 submitted_ios++; 2877 } 2878 } 2879 2880 return submitted_ios; 2881 } 2882 2883 static void 2884 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2885 { 2886 int rc; 2887 2888 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2889 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2890 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2891 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2892 &bdev_io->internal.waitq_entry); 2893 if (rc != 0) { 2894 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2895 bdev_io->internal.status = 
SPDK_BDEV_IO_STATUS_FAILED; 2896 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2897 } 2898 } 2899 2900 static bool 2901 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2902 { 2903 uint32_t io_boundary; 2904 struct spdk_bdev *bdev = bdev_io->bdev; 2905 uint32_t max_segment_size = bdev->max_segment_size; 2906 uint32_t max_size = bdev->max_rw_size; 2907 int max_segs = bdev->max_num_segments; 2908 2909 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2910 io_boundary = bdev->write_unit_size; 2911 } else if (bdev->split_on_optimal_io_boundary) { 2912 io_boundary = bdev->optimal_io_boundary; 2913 } else { 2914 io_boundary = 0; 2915 } 2916 2917 if (spdk_likely(!io_boundary && !max_segs && !max_segment_size && !max_size)) { 2918 return false; 2919 } 2920 2921 if (io_boundary) { 2922 uint64_t start_stripe, end_stripe; 2923 2924 start_stripe = bdev_io->u.bdev.offset_blocks; 2925 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2926 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 2927 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2928 start_stripe >>= spdk_u32log2(io_boundary); 2929 end_stripe >>= spdk_u32log2(io_boundary); 2930 } else { 2931 start_stripe /= io_boundary; 2932 end_stripe /= io_boundary; 2933 } 2934 2935 if (start_stripe != end_stripe) { 2936 return true; 2937 } 2938 } 2939 2940 if (max_segs) { 2941 if (bdev_io->u.bdev.iovcnt > max_segs) { 2942 return true; 2943 } 2944 } 2945 2946 if (max_segment_size) { 2947 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2948 if (bdev_io->u.bdev.iovs[i].iov_len > max_segment_size) { 2949 return true; 2950 } 2951 } 2952 } 2953 2954 if (max_size) { 2955 if (bdev_io->u.bdev.num_blocks > max_size) { 2956 return true; 2957 } 2958 } 2959 2960 return false; 2961 } 2962 2963 static bool 2964 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2965 { 2966 uint32_t num_unmap_segments; 2967 2968 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2969 return false; 2970 } 2971 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2972 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2973 return true; 2974 } 2975 2976 return false; 2977 } 2978 2979 static bool 2980 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2981 { 2982 if (!bdev_io->bdev->max_write_zeroes) { 2983 return false; 2984 } 2985 2986 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2987 return true; 2988 } 2989 2990 return false; 2991 } 2992 2993 static bool 2994 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2995 { 2996 if (bdev_io->bdev->max_copy != 0 && 2997 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2998 return true; 2999 } 3000 3001 return false; 3002 } 3003 3004 static bool 3005 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 3006 { 3007 switch (bdev_io->type) { 3008 case SPDK_BDEV_IO_TYPE_READ: 3009 case SPDK_BDEV_IO_TYPE_WRITE: 3010 return bdev_rw_should_split(bdev_io); 3011 case SPDK_BDEV_IO_TYPE_UNMAP: 3012 return bdev_unmap_should_split(bdev_io); 3013 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3014 return bdev_write_zeroes_should_split(bdev_io); 3015 case SPDK_BDEV_IO_TYPE_COPY: 3016 return bdev_copy_should_split(bdev_io); 3017 default: 3018 return false; 3019 } 3020 } 3021 3022 static uint32_t 3023 _to_next_boundary(uint64_t offset, uint32_t boundary) 3024 { 3025 return (boundary - (offset % boundary)); 3026 } 3027 3028 static void 
bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 3029 3030 static void _bdev_rw_split(void *_bdev_io); 3031 3032 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 3033 3034 static void 3035 _bdev_unmap_split(void *_bdev_io) 3036 { 3037 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 3038 } 3039 3040 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 3041 3042 static void 3043 _bdev_write_zeroes_split(void *_bdev_io) 3044 { 3045 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 3046 } 3047 3048 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 3049 3050 static void 3051 _bdev_copy_split(void *_bdev_io) 3052 { 3053 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 3054 } 3055 3056 static int 3057 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 3058 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 3059 { 3060 int rc; 3061 uint64_t current_offset, current_remaining, current_src_offset; 3062 spdk_bdev_io_wait_cb io_wait_fn; 3063 3064 current_offset = *offset; 3065 current_remaining = *remaining; 3066 3067 assert(bdev_io->internal.f.split); 3068 3069 bdev_io->internal.split.outstanding++; 3070 3071 io_wait_fn = _bdev_rw_split; 3072 switch (bdev_io->type) { 3073 case SPDK_BDEV_IO_TYPE_READ: 3074 assert(bdev_io->u.bdev.accel_sequence == NULL); 3075 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 3076 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3077 iov, iovcnt, md_buf, current_offset, 3078 num_blocks, 3079 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 3080 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL, 3081 NULL, 3082 bdev_io->u.bdev.dif_check_flags, 3083 bdev_io_split_done, bdev_io); 3084 break; 3085 case SPDK_BDEV_IO_TYPE_WRITE: 3086 assert(bdev_io->u.bdev.accel_sequence == NULL); 3087 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 3088 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3089 iov, iovcnt, md_buf, current_offset, 3090 num_blocks, 3091 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 3092 bdev_io_use_memory_domain(bdev_io) ? 
bdev_io->internal.memory_domain_ctx : NULL, 3093 NULL, 3094 bdev_io->u.bdev.dif_check_flags, 3095 bdev_io->u.bdev.nvme_cdw12.raw, 3096 bdev_io->u.bdev.nvme_cdw13.raw, 3097 bdev_io_split_done, bdev_io); 3098 break; 3099 case SPDK_BDEV_IO_TYPE_UNMAP: 3100 io_wait_fn = _bdev_unmap_split; 3101 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 3102 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3103 current_offset, num_blocks, 3104 bdev_io_split_done, bdev_io); 3105 break; 3106 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3107 io_wait_fn = _bdev_write_zeroes_split; 3108 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 3109 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3110 current_offset, num_blocks, 3111 bdev_io_split_done, bdev_io); 3112 break; 3113 case SPDK_BDEV_IO_TYPE_COPY: 3114 io_wait_fn = _bdev_copy_split; 3115 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 3116 (current_offset - bdev_io->u.bdev.offset_blocks); 3117 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 3118 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3119 current_offset, current_src_offset, num_blocks, 3120 bdev_io_split_done, bdev_io); 3121 break; 3122 default: 3123 assert(false); 3124 rc = -EINVAL; 3125 break; 3126 } 3127 3128 if (rc == 0) { 3129 current_offset += num_blocks; 3130 current_remaining -= num_blocks; 3131 bdev_io->internal.split.current_offset_blocks = current_offset; 3132 bdev_io->internal.split.remaining_num_blocks = current_remaining; 3133 *offset = current_offset; 3134 *remaining = current_remaining; 3135 } else { 3136 bdev_io->internal.split.outstanding--; 3137 if (rc == -ENOMEM) { 3138 if (bdev_io->internal.split.outstanding == 0) { 3139 /* No I/O is outstanding. Hence we should wait here. */ 3140 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 3141 } 3142 } else { 3143 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3144 if (bdev_io->internal.split.outstanding == 0) { 3145 bdev_ch_remove_from_io_submitted(bdev_io); 3146 spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id, 3147 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx, 3148 bdev_io->internal.ch->queue_depth); 3149 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3150 } 3151 } 3152 } 3153 3154 return rc; 3155 } 3156 3157 static void 3158 _bdev_rw_split(void *_bdev_io) 3159 { 3160 struct iovec *parent_iov, *iov; 3161 struct spdk_bdev_io *bdev_io = _bdev_io; 3162 struct spdk_bdev *bdev = bdev_io->bdev; 3163 uint64_t parent_offset, current_offset, remaining; 3164 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 3165 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 3166 uint32_t iovcnt, iov_len, child_iovsize; 3167 uint32_t blocklen = bdev->blocklen; 3168 uint32_t io_boundary; 3169 uint32_t max_segment_size = bdev->max_segment_size; 3170 uint32_t max_child_iovcnt = bdev->max_num_segments; 3171 uint32_t max_size = bdev->max_rw_size; 3172 void *md_buf = NULL; 3173 int rc; 3174 3175 max_size = max_size ? max_size : UINT32_MAX; 3176 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 3177 max_child_iovcnt = max_child_iovcnt ? 
spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 3178 SPDK_BDEV_IO_NUM_CHILD_IOV; 3179 3180 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 3181 io_boundary = bdev->write_unit_size; 3182 } else if (bdev->split_on_optimal_io_boundary) { 3183 io_boundary = bdev->optimal_io_boundary; 3184 } else { 3185 io_boundary = UINT32_MAX; 3186 } 3187 3188 assert(bdev_io->internal.f.split); 3189 3190 remaining = bdev_io->internal.split.remaining_num_blocks; 3191 current_offset = bdev_io->internal.split.current_offset_blocks; 3192 parent_offset = bdev_io->u.bdev.offset_blocks; 3193 parent_iov_offset = (current_offset - parent_offset) * blocklen; 3194 parent_iovcnt = bdev_io->u.bdev.iovcnt; 3195 3196 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 3197 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3198 if (parent_iov_offset < parent_iov->iov_len) { 3199 break; 3200 } 3201 parent_iov_offset -= parent_iov->iov_len; 3202 } 3203 3204 child_iovcnt = 0; 3205 while (remaining > 0 && parent_iovpos < parent_iovcnt && 3206 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 3207 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3208 to_next_boundary = spdk_min(remaining, to_next_boundary); 3209 to_next_boundary = spdk_min(max_size, to_next_boundary); 3210 to_next_boundary_bytes = to_next_boundary * blocklen; 3211 3212 iov = &bdev_io->child_iov[child_iovcnt]; 3213 iovcnt = 0; 3214 3215 if (bdev_io->u.bdev.md_buf) { 3216 md_buf = (char *)bdev_io->u.bdev.md_buf + 3217 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3218 } 3219 3220 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3221 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3222 iovcnt < child_iovsize) { 3223 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3224 iov_len = parent_iov->iov_len - parent_iov_offset; 3225 3226 iov_len = spdk_min(iov_len, max_segment_size); 3227 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 3228 to_next_boundary_bytes -= iov_len; 3229 3230 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 3231 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 3232 3233 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 3234 parent_iov_offset += iov_len; 3235 } else { 3236 parent_iovpos++; 3237 parent_iov_offset = 0; 3238 } 3239 child_iovcnt++; 3240 iovcnt++; 3241 } 3242 3243 if (to_next_boundary_bytes > 0) { 3244 /* We had to stop this child I/O early because we ran out of 3245 * child_iov space or were limited by max_num_segments. 3246 * Ensure the iovs to be aligned with block size and 3247 * then adjust to_next_boundary before starting the 3248 * child I/O. 
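 * For example (hypothetical numbers): with a 512-byte block and an 8-block
 * (4096-byte) child target, if only 3900 bytes of parent iov space could be
 * described, 196 bytes of the last block are left over; the tail of the child
 * iovs is then trimmed by 512 - 196 = 316 bytes so the child transfers exactly
 * 7 blocks (3584 bytes), and to_next_boundary is reduced accordingly.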
3249 */ 3250 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || 3251 iovcnt == child_iovsize); 3252 to_last_block_bytes = to_next_boundary_bytes % blocklen; 3253 if (to_last_block_bytes != 0) { 3254 uint32_t child_iovpos = child_iovcnt - 1; 3255 /* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV, 3256 * so the loop will naturally end 3257 */ 3258 3259 to_last_block_bytes = blocklen - to_last_block_bytes; 3260 to_next_boundary_bytes += to_last_block_bytes; 3261 while (to_last_block_bytes > 0 && iovcnt > 0) { 3262 iov_len = spdk_min(to_last_block_bytes, 3263 bdev_io->child_iov[child_iovpos].iov_len); 3264 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 3265 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 3266 child_iovpos--; 3267 if (--iovcnt == 0) { 3268 /* If the child IO is less than a block size, just return. 3269 * If the first child IO of any split round is less than 3270 * a block size, exit with an error. 3271 */ 3272 if (bdev_io->internal.split.outstanding == 0) { 3273 SPDK_ERRLOG("The first child io was less than a block size\n"); 3274 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3275 bdev_ch_remove_from_io_submitted(bdev_io); 3276 spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id, 3277 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx, 3278 bdev_io->internal.ch->queue_depth); 3279 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3280 } 3281 3282 return; 3283 } 3284 } 3285 3286 to_last_block_bytes -= iov_len; 3287 3288 if (parent_iov_offset == 0) { 3289 parent_iovpos--; 3290 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 3291 } 3292 parent_iov_offset -= iov_len; 3293 } 3294 3295 assert(to_last_block_bytes == 0); 3296 } 3297 to_next_boundary -= to_next_boundary_bytes / blocklen; 3298 } 3299 3300 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 3301 &current_offset, &remaining); 3302 if (spdk_unlikely(rc)) { 3303 return; 3304 } 3305 } 3306 } 3307 3308 static void 3309 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 3310 { 3311 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 3312 uint32_t num_children_reqs = 0; 3313 int rc; 3314 3315 assert(bdev_io->internal.f.split); 3316 3317 offset = bdev_io->internal.split.current_offset_blocks; 3318 remaining = bdev_io->internal.split.remaining_num_blocks; 3319 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 3320 3321 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3322 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 3323 3324 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 3325 &offset, &remaining); 3326 if (spdk_likely(rc == 0)) { 3327 num_children_reqs++; 3328 } else { 3329 return; 3330 } 3331 } 3332 } 3333 3334 static void 3335 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 3336 { 3337 uint64_t offset, write_zeroes_blocks, remaining; 3338 uint32_t num_children_reqs = 0; 3339 int rc; 3340 3341 assert(bdev_io->internal.f.split); 3342 3343 offset = bdev_io->internal.split.current_offset_blocks; 3344 remaining = bdev_io->internal.split.remaining_num_blocks; 3345 3346 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3347 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 3348 3349 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 3350 &offset, &remaining); 3351 if (spdk_likely(rc == 0)) { 3352 num_children_reqs++; 3353 } else { 
3354 return; 3355 } 3356 } 3357 } 3358 3359 static void 3360 bdev_copy_split(struct spdk_bdev_io *bdev_io) 3361 { 3362 uint64_t offset, copy_blocks, remaining; 3363 uint32_t num_children_reqs = 0; 3364 int rc; 3365 3366 assert(bdev_io->internal.f.split); 3367 3368 offset = bdev_io->internal.split.current_offset_blocks; 3369 remaining = bdev_io->internal.split.remaining_num_blocks; 3370 3371 assert(bdev_io->bdev->max_copy != 0); 3372 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3373 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3374 3375 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3376 &offset, &remaining); 3377 if (spdk_likely(rc == 0)) { 3378 num_children_reqs++; 3379 } else { 3380 return; 3381 } 3382 } 3383 } 3384 3385 static void 3386 parent_bdev_io_complete(void *ctx, int rc) 3387 { 3388 struct spdk_bdev_io *parent_io = ctx; 3389 3390 if (rc) { 3391 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3392 } 3393 3394 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3395 parent_io->internal.caller_ctx); 3396 } 3397 3398 static void 3399 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3400 { 3401 struct spdk_bdev_io *bdev_io = ctx; 3402 3403 /* u.bdev.accel_sequence should have already been cleared at this point */ 3404 assert(bdev_io->u.bdev.accel_sequence == NULL); 3405 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3406 bdev_io->internal.f.has_accel_sequence = false; 3407 3408 if (spdk_unlikely(status != 0)) { 3409 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3410 } 3411 3412 parent_bdev_io_complete(bdev_io, status); 3413 } 3414 3415 static void 3416 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3417 { 3418 struct spdk_bdev_io *parent_io = cb_arg; 3419 3420 spdk_bdev_free_io(bdev_io); 3421 3422 assert(parent_io->internal.f.split); 3423 3424 if (!success) { 3425 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3426 /* If any child I/O failed, stop further splitting process. */ 3427 parent_io->internal.split.current_offset_blocks += parent_io->internal.split.remaining_num_blocks; 3428 parent_io->internal.split.remaining_num_blocks = 0; 3429 } 3430 parent_io->internal.split.outstanding--; 3431 if (parent_io->internal.split.outstanding != 0) { 3432 return; 3433 } 3434 3435 /* 3436 * Parent I/O finishes when all blocks are consumed. 3437 */ 3438 if (parent_io->internal.split.remaining_num_blocks == 0) { 3439 assert(parent_io->internal.cb != bdev_io_split_done); 3440 bdev_ch_remove_from_io_submitted(parent_io); 3441 spdk_trace_record(TRACE_BDEV_IO_DONE, parent_io->internal.ch->trace_id, 3442 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx, 3443 parent_io->internal.ch->queue_depth); 3444 3445 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3446 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3447 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3448 return; 3449 } else if (parent_io->internal.f.has_bounce_buf && 3450 !bdev_io_use_accel_sequence(bdev_io)) { 3451 /* bdev IO will be completed in the callback */ 3452 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3453 return; 3454 } 3455 } 3456 3457 parent_bdev_io_complete(parent_io, 0); 3458 return; 3459 } 3460 3461 /* 3462 * Continue with the splitting process. This function will complete the parent I/O if the 3463 * splitting is done. 
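 * (The unmap, write-zeroes and copy paths cap the number of child requests
 * issued per round, so this callback re-enters the corresponding *_split()
 * helper to submit the next batch until remaining_num_blocks reaches zero.)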
3464 */ 3465 switch (parent_io->type) { 3466 case SPDK_BDEV_IO_TYPE_READ: 3467 case SPDK_BDEV_IO_TYPE_WRITE: 3468 _bdev_rw_split(parent_io); 3469 break; 3470 case SPDK_BDEV_IO_TYPE_UNMAP: 3471 bdev_unmap_split(parent_io); 3472 break; 3473 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3474 bdev_write_zeroes_split(parent_io); 3475 break; 3476 case SPDK_BDEV_IO_TYPE_COPY: 3477 bdev_copy_split(parent_io); 3478 break; 3479 default: 3480 assert(false); 3481 break; 3482 } 3483 } 3484 3485 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3486 bool success); 3487 3488 static void 3489 bdev_io_split(struct spdk_bdev_io *bdev_io) 3490 { 3491 assert(bdev_io_should_split(bdev_io)); 3492 assert(bdev_io->internal.f.split); 3493 3494 bdev_io->internal.split.current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3495 bdev_io->internal.split.remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3496 bdev_io->internal.split.outstanding = 0; 3497 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3498 3499 switch (bdev_io->type) { 3500 case SPDK_BDEV_IO_TYPE_READ: 3501 case SPDK_BDEV_IO_TYPE_WRITE: 3502 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3503 _bdev_rw_split(bdev_io); 3504 } else { 3505 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3506 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3507 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3508 } 3509 break; 3510 case SPDK_BDEV_IO_TYPE_UNMAP: 3511 bdev_unmap_split(bdev_io); 3512 break; 3513 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3514 bdev_write_zeroes_split(bdev_io); 3515 break; 3516 case SPDK_BDEV_IO_TYPE_COPY: 3517 bdev_copy_split(bdev_io); 3518 break; 3519 default: 3520 assert(false); 3521 break; 3522 } 3523 } 3524 3525 static void 3526 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3527 { 3528 if (!success) { 3529 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3530 return; 3531 } 3532 3533 _bdev_rw_split(bdev_io); 3534 } 3535 3536 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 3537 * be inlined, at least on some compilers. 
3538 */ 3539 static inline void 3540 _bdev_io_submit(void *ctx) 3541 { 3542 struct spdk_bdev_io *bdev_io = ctx; 3543 struct spdk_bdev *bdev = bdev_io->bdev; 3544 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3545 3546 if (spdk_likely(bdev_ch->flags == 0)) { 3547 bdev_io_do_submit(bdev_ch, bdev_io); 3548 return; 3549 } 3550 3551 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3552 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3553 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3554 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3555 bdev_abort_queued_io(&bdev_ch->qos_queued_io, bdev_io->u.abort.bio_to_abort)) { 3556 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3557 } else { 3558 TAILQ_INSERT_TAIL(&bdev_ch->qos_queued_io, bdev_io, internal.link); 3559 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3560 } 3561 } else { 3562 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3563 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3564 } 3565 } 3566 3567 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3568 3569 bool 3570 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3571 { 3572 if (range1->length == 0 || range2->length == 0) { 3573 return false; 3574 } 3575 3576 if (range1->offset + range1->length <= range2->offset) { 3577 return false; 3578 } 3579 3580 if (range2->offset + range2->length <= range1->offset) { 3581 return false; 3582 } 3583 3584 return true; 3585 } 3586 3587 static bool 3588 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3589 { 3590 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3591 struct lba_range r; 3592 3593 switch (bdev_io->type) { 3594 case SPDK_BDEV_IO_TYPE_NVME_IO: 3595 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3596 /* Don't try to decode the NVMe command - just assume worst-case and that 3597 * it overlaps a locked range. 3598 */ 3599 return true; 3600 case SPDK_BDEV_IO_TYPE_READ: 3601 if (!range->quiesce) { 3602 return false; 3603 } 3604 /* fallthrough */ 3605 case SPDK_BDEV_IO_TYPE_WRITE: 3606 case SPDK_BDEV_IO_TYPE_UNMAP: 3607 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3608 case SPDK_BDEV_IO_TYPE_ZCOPY: 3609 case SPDK_BDEV_IO_TYPE_COPY: 3610 r.offset = bdev_io->u.bdev.offset_blocks; 3611 r.length = bdev_io->u.bdev.num_blocks; 3612 if (!bdev_lba_range_overlapped(range, &r)) { 3613 /* This I/O doesn't overlap the specified LBA range. */ 3614 return false; 3615 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3616 /* This I/O overlaps, but the I/O is on the same channel that locked this 3617 * range, and the caller_ctx is the same as the locked_ctx. This means 3618 * that this I/O is associated with the lock, and is allowed to execute. 
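 * As a general illustration (hypothetical ranges): with a locked range
 * covering blocks 0-99, a write to blocks 100-199 does not overlap and
 * proceeds normally, while a write to blocks 50-149 issued from a different
 * channel or context does overlap and is held on the channel's io_locked
 * queue by bdev_io_submit() until the range is unlocked.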
3619 */ 3620 return false; 3621 } else { 3622 return true; 3623 } 3624 default: 3625 return false; 3626 } 3627 } 3628 3629 void 3630 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3631 { 3632 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3633 3634 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3635 3636 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3637 struct lba_range *range; 3638 3639 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3640 if (bdev_io_range_is_locked(bdev_io, range)) { 3641 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3642 return; 3643 } 3644 } 3645 } 3646 3647 bdev_ch_add_to_io_submitted(bdev_io); 3648 3649 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3650 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 3651 ch->trace_id, bdev_io->u.bdev.num_blocks, 3652 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3653 bdev_io->u.bdev.offset_blocks, ch->queue_depth); 3654 3655 if (bdev_io->internal.f.split) { 3656 bdev_io_split(bdev_io); 3657 return; 3658 } 3659 3660 _bdev_io_submit(bdev_io); 3661 } 3662 3663 static inline void 3664 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3665 { 3666 /* The bdev doesn't support memory domains, so the buffers in this IO request can't 3667 * be accessed directly; we need to allocate buffers before issuing the IO operation. 3668 * For a write operation we need to pull the data from the memory domain before submitting the IO. 3669 * Once a read operation completes, we need to use the memory domain push functionality to 3670 * update the data in the original memory domain IO buffer. 3671 * This IO request will go through a regular IO flow, so clear the memory domain pointers. */ 3672 assert(bdev_io->internal.f.has_memory_domain); 3673 bdev_io->u.bdev.memory_domain = NULL; 3674 bdev_io->u.bdev.memory_domain_ctx = NULL; 3675 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3676 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3677 } 3678 3679 static inline void 3680 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3681 { 3682 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3683 bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io); 3684 3685 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3686 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3687 bdev_io_complete_unsubmitted(bdev_io); 3688 return; 3689 } 3690 3691 /* We need to allocate a bounce buffer if the bdev doesn't support memory domains, or if it does 3692 * support them, but we need to execute an accel sequence and the data buffer is from accel 3693 * memory domain (to avoid doing a push/pull from that domain). 
3694 */ 3695 if (bdev_io_use_memory_domain(bdev_io)) { 3696 if (!desc->memory_domains_supported || 3697 (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3698 _bdev_io_ext_use_bounce_buffer(bdev_io); 3699 return; 3700 } 3701 } 3702 3703 if (needs_exec) { 3704 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3705 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3706 return; 3707 } 3708 /* For reads we'll execute the sequence after the data is read, so, for now, only 3709 * clear out accel_sequence pointer and submit the IO */ 3710 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3711 bdev_io->u.bdev.accel_sequence = NULL; 3712 } 3713 3714 bdev_io_submit(bdev_io); 3715 } 3716 3717 static void 3718 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3719 { 3720 struct spdk_bdev *bdev = bdev_io->bdev; 3721 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3722 struct spdk_io_channel *ch = bdev_ch->channel; 3723 3724 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3725 3726 bdev_io->internal.f.in_submit_request = true; 3727 bdev_submit_request(bdev, ch, bdev_io); 3728 bdev_io->internal.f.in_submit_request = false; 3729 } 3730 3731 void 3732 bdev_io_init(struct spdk_bdev_io *bdev_io, 3733 struct spdk_bdev *bdev, void *cb_arg, 3734 spdk_bdev_io_completion_cb cb) 3735 { 3736 bdev_io->bdev = bdev; 3737 bdev_io->internal.f.raw = 0; 3738 bdev_io->internal.caller_ctx = cb_arg; 3739 bdev_io->internal.cb = cb; 3740 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3741 bdev_io->internal.f.in_submit_request = false; 3742 bdev_io->internal.error.nvme.cdw0 = 0; 3743 bdev_io->num_retries = 0; 3744 bdev_io->internal.get_buf_cb = NULL; 3745 bdev_io->internal.get_aux_buf_cb = NULL; 3746 bdev_io->internal.data_transfer_cpl = NULL; 3747 bdev_io->internal.f.split = bdev_io_should_split(bdev_io); 3748 } 3749 3750 static bool 3751 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3752 { 3753 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3754 } 3755 3756 bool 3757 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3758 { 3759 bool supported; 3760 3761 supported = bdev_io_type_supported(bdev, io_type); 3762 3763 if (!supported) { 3764 switch (io_type) { 3765 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3766 /* The bdev layer will emulate write zeroes as long as write is supported. 
*/ 3767 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3768 break; 3769 default: 3770 break; 3771 } 3772 } 3773 3774 return supported; 3775 } 3776 3777 static const char *g_io_type_strings[] = { 3778 [SPDK_BDEV_IO_TYPE_READ] = "read", 3779 [SPDK_BDEV_IO_TYPE_WRITE] = "write", 3780 [SPDK_BDEV_IO_TYPE_UNMAP] = "unmap", 3781 [SPDK_BDEV_IO_TYPE_FLUSH] = "flush", 3782 [SPDK_BDEV_IO_TYPE_RESET] = "reset", 3783 [SPDK_BDEV_IO_TYPE_NVME_ADMIN] = "nvme_admin", 3784 [SPDK_BDEV_IO_TYPE_NVME_IO] = "nvme_io", 3785 [SPDK_BDEV_IO_TYPE_NVME_IO_MD] = "nvme_io_md", 3786 [SPDK_BDEV_IO_TYPE_WRITE_ZEROES] = "write_zeroes", 3787 [SPDK_BDEV_IO_TYPE_ZCOPY] = "zcopy", 3788 [SPDK_BDEV_IO_TYPE_GET_ZONE_INFO] = "get_zone_info", 3789 [SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT] = "zone_management", 3790 [SPDK_BDEV_IO_TYPE_ZONE_APPEND] = "zone_append", 3791 [SPDK_BDEV_IO_TYPE_COMPARE] = "compare", 3792 [SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE] = "compare_and_write", 3793 [SPDK_BDEV_IO_TYPE_ABORT] = "abort", 3794 [SPDK_BDEV_IO_TYPE_SEEK_HOLE] = "seek_hole", 3795 [SPDK_BDEV_IO_TYPE_SEEK_DATA] = "seek_data", 3796 [SPDK_BDEV_IO_TYPE_COPY] = "copy", 3797 [SPDK_BDEV_IO_TYPE_NVME_IOV_MD] = "nvme_iov_md", 3798 }; 3799 3800 const char * 3801 spdk_bdev_get_io_type_name(enum spdk_bdev_io_type io_type) 3802 { 3803 if (io_type <= SPDK_BDEV_IO_TYPE_INVALID || io_type >= SPDK_BDEV_NUM_IO_TYPES) { 3804 return NULL; 3805 } 3806 3807 return g_io_type_strings[io_type]; 3808 } 3809 3810 int 3811 spdk_bdev_get_io_type(const char *io_type_string) 3812 { 3813 int i; 3814 3815 for (i = SPDK_BDEV_IO_TYPE_READ; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 3816 if (!strcmp(io_type_string, g_io_type_strings[i])) { 3817 return i; 3818 } 3819 } 3820 3821 return -1; 3822 } 3823 3824 uint64_t 3825 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3826 { 3827 return bdev_io->internal.submit_tsc; 3828 } 3829 3830 int 3831 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3832 { 3833 if (bdev->fn_table->dump_info_json) { 3834 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3835 } 3836 3837 return 0; 3838 } 3839 3840 static void 3841 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3842 { 3843 uint32_t max_per_timeslice = 0; 3844 int i; 3845 3846 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3847 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3848 qos->rate_limits[i].max_per_timeslice = 0; 3849 continue; 3850 } 3851 3852 max_per_timeslice = qos->rate_limits[i].limit * 3853 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3854 3855 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3856 qos->rate_limits[i].min_per_timeslice); 3857 3858 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 3859 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELEASE); 3860 } 3861 3862 bdev_qos_set_ops(qos); 3863 } 3864 3865 static void 3866 bdev_channel_submit_qos_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3867 struct spdk_io_channel *io_ch, void *ctx) 3868 { 3869 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3870 int status; 3871 3872 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3873 3874 /* if all IOs were sent then continue the iteration, otherwise - stop it */ 3875 /* TODO: channels round robing */ 3876 status = TAILQ_EMPTY(&bdev_ch->qos_queued_io) ? 
0 : 1; 3877 3878 spdk_bdev_for_each_channel_continue(i, status); 3879 } 3880 3881 3882 static void 3883 bdev_channel_submit_qos_io_done(struct spdk_bdev *bdev, void *ctx, int status) 3884 { 3885 3886 } 3887 3888 static int 3889 bdev_channel_poll_qos(void *arg) 3890 { 3891 struct spdk_bdev *bdev = arg; 3892 struct spdk_bdev_qos *qos = bdev->internal.qos; 3893 uint64_t now = spdk_get_ticks(); 3894 int i; 3895 int64_t remaining_last_timeslice; 3896 3897 if (spdk_unlikely(qos->thread == NULL)) { 3898 /* Old QoS was unbound to remove and new QoS is not enabled yet. */ 3899 return SPDK_POLLER_IDLE; 3900 } 3901 3902 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3903 /* We received our callback earlier than expected - return 3904 * immediately and wait to do accounting until at least one 3905 * timeslice has actually expired. This should never happen 3906 * with a well-behaved timer implementation. 3907 */ 3908 return SPDK_POLLER_IDLE; 3909 } 3910 3911 /* Reset for next round of rate limiting */ 3912 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3913 /* We may have allowed the IOs or bytes to slightly overrun in the last 3914 * timeslice. remaining_this_timeslice is signed, so if it's negative 3915 * here, we'll account for the overrun so that the next timeslice will 3916 * be appropriately reduced. 3917 */ 3918 remaining_last_timeslice = __atomic_exchange_n(&qos->rate_limits[i].remaining_this_timeslice, 3919 0, __ATOMIC_RELAXED); 3920 if (remaining_last_timeslice < 0) { 3921 /* There could be a race condition here as both bdev_qos_rw_queue_io() and bdev_channel_poll_qos() 3922 * potentially use 2 atomic ops each, so they can intertwine. 3923 * This race can potentially cause the limits to be a little fuzzy but won't cause any real damage. 
3924 */ 3925 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 3926 remaining_last_timeslice, __ATOMIC_RELAXED); 3927 } 3928 } 3929 3930 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3931 qos->last_timeslice += qos->timeslice_size; 3932 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3933 __atomic_add_fetch(&qos->rate_limits[i].remaining_this_timeslice, 3934 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELAXED); 3935 } 3936 } 3937 3938 spdk_bdev_for_each_channel(bdev, bdev_channel_submit_qos_io, qos, 3939 bdev_channel_submit_qos_io_done); 3940 3941 return SPDK_POLLER_BUSY; 3942 } 3943 3944 static void 3945 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3946 { 3947 struct spdk_bdev_shared_resource *shared_resource; 3948 struct lba_range *range; 3949 3950 bdev_free_io_stat(ch->stat); 3951 #ifdef SPDK_CONFIG_VTUNE 3952 bdev_free_io_stat(ch->prev_stat); 3953 #endif 3954 3955 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3956 range = TAILQ_FIRST(&ch->locked_ranges); 3957 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3958 free(range); 3959 } 3960 3961 spdk_put_io_channel(ch->channel); 3962 spdk_put_io_channel(ch->accel_channel); 3963 3964 shared_resource = ch->shared_resource; 3965 3966 assert(TAILQ_EMPTY(&ch->io_locked)); 3967 assert(TAILQ_EMPTY(&ch->io_submitted)); 3968 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 3969 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 3970 assert(ch->io_outstanding == 0); 3971 assert(shared_resource->ref > 0); 3972 shared_resource->ref--; 3973 if (shared_resource->ref == 0) { 3974 assert(shared_resource->io_outstanding == 0); 3975 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3976 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3977 spdk_poller_unregister(&shared_resource->nomem_poller); 3978 free(shared_resource); 3979 } 3980 } 3981 3982 static void 3983 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3984 { 3985 struct spdk_bdev_qos *qos = bdev->internal.qos; 3986 int i; 3987 3988 assert(spdk_spin_held(&bdev->internal.spinlock)); 3989 3990 /* Rate limiting on this bdev enabled */ 3991 if (qos) { 3992 if (qos->ch == NULL) { 3993 struct spdk_io_channel *io_ch; 3994 3995 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3996 bdev->name, spdk_get_thread()); 3997 3998 /* No qos channel has been selected, so set one up */ 3999 4000 /* Take another reference to ch */ 4001 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 4002 assert(io_ch != NULL); 4003 qos->ch = ch; 4004 4005 qos->thread = spdk_io_channel_get_thread(io_ch); 4006 4007 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4008 if (bdev_qos_is_iops_rate_limit(i) == true) { 4009 qos->rate_limits[i].min_per_timeslice = 4010 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 4011 } else { 4012 qos->rate_limits[i].min_per_timeslice = 4013 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 4014 } 4015 4016 if (qos->rate_limits[i].limit == 0) { 4017 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 4018 } 4019 } 4020 bdev_qos_update_max_quota_per_timeslice(qos); 4021 qos->timeslice_size = 4022 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 4023 qos->last_timeslice = spdk_get_ticks(); 4024 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 4025 bdev, 4026 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 4027 } 4028 4029 ch->flags |= BDEV_CH_QOS_ENABLED; 4030 } 4031 } 4032 4033 struct poll_timeout_ctx { 4034 struct spdk_bdev_desc 
*desc; 4035 uint64_t timeout_in_sec; 4036 spdk_bdev_io_timeout_cb cb_fn; 4037 void *cb_arg; 4038 }; 4039 4040 static void 4041 bdev_desc_free(struct spdk_bdev_desc *desc) 4042 { 4043 spdk_spin_destroy(&desc->spinlock); 4044 free(desc->media_events_buffer); 4045 free(desc); 4046 } 4047 4048 static void 4049 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 4050 { 4051 struct poll_timeout_ctx *ctx = _ctx; 4052 struct spdk_bdev_desc *desc = ctx->desc; 4053 4054 free(ctx); 4055 4056 spdk_spin_lock(&desc->spinlock); 4057 desc->refs--; 4058 if (desc->closed == true && desc->refs == 0) { 4059 spdk_spin_unlock(&desc->spinlock); 4060 bdev_desc_free(desc); 4061 return; 4062 } 4063 spdk_spin_unlock(&desc->spinlock); 4064 } 4065 4066 static void 4067 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4068 struct spdk_io_channel *io_ch, void *_ctx) 4069 { 4070 struct poll_timeout_ctx *ctx = _ctx; 4071 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4072 struct spdk_bdev_desc *desc = ctx->desc; 4073 struct spdk_bdev_io *bdev_io; 4074 uint64_t now; 4075 4076 spdk_spin_lock(&desc->spinlock); 4077 if (desc->closed == true) { 4078 spdk_spin_unlock(&desc->spinlock); 4079 spdk_bdev_for_each_channel_continue(i, -1); 4080 return; 4081 } 4082 spdk_spin_unlock(&desc->spinlock); 4083 4084 now = spdk_get_ticks(); 4085 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 4086 /* Exclude any I/O that are generated via splitting. */ 4087 if (bdev_io->internal.cb == bdev_io_split_done) { 4088 continue; 4089 } 4090 4091 /* Once we find an I/O that has not timed out, we can immediately 4092 * exit the loop. 4093 */ 4094 if (now < (bdev_io->internal.submit_tsc + 4095 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 4096 goto end; 4097 } 4098 4099 if (bdev_io->internal.desc == desc) { 4100 ctx->cb_fn(ctx->cb_arg, bdev_io); 4101 } 4102 } 4103 4104 end: 4105 spdk_bdev_for_each_channel_continue(i, 0); 4106 } 4107 4108 static int 4109 bdev_poll_timeout_io(void *arg) 4110 { 4111 struct spdk_bdev_desc *desc = arg; 4112 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4113 struct poll_timeout_ctx *ctx; 4114 4115 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 4116 if (!ctx) { 4117 SPDK_ERRLOG("failed to allocate memory\n"); 4118 return SPDK_POLLER_BUSY; 4119 } 4120 ctx->desc = desc; 4121 ctx->cb_arg = desc->cb_arg; 4122 ctx->cb_fn = desc->cb_fn; 4123 ctx->timeout_in_sec = desc->timeout_in_sec; 4124 4125 /* Take a ref on the descriptor in case it gets closed while we are checking 4126 * all of the channels. 
4127 */ 4128 spdk_spin_lock(&desc->spinlock); 4129 desc->refs++; 4130 spdk_spin_unlock(&desc->spinlock); 4131 4132 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 4133 bdev_channel_poll_timeout_io_done); 4134 4135 return SPDK_POLLER_BUSY; 4136 } 4137 4138 int 4139 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 4140 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 4141 { 4142 assert(desc->thread == spdk_get_thread()); 4143 4144 spdk_poller_unregister(&desc->io_timeout_poller); 4145 4146 if (timeout_in_sec) { 4147 assert(cb_fn != NULL); 4148 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 4149 desc, 4150 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 4151 1000); 4152 if (desc->io_timeout_poller == NULL) { 4153 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 4154 return -1; 4155 } 4156 } 4157 4158 desc->cb_fn = cb_fn; 4159 desc->cb_arg = cb_arg; 4160 desc->timeout_in_sec = timeout_in_sec; 4161 4162 return 0; 4163 } 4164 4165 static int 4166 bdev_channel_create(void *io_device, void *ctx_buf) 4167 { 4168 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 4169 struct spdk_bdev_channel *ch = ctx_buf; 4170 struct spdk_io_channel *mgmt_io_ch; 4171 struct spdk_bdev_mgmt_channel *mgmt_ch; 4172 struct spdk_bdev_shared_resource *shared_resource; 4173 struct lba_range *range; 4174 4175 ch->bdev = bdev; 4176 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 4177 if (!ch->channel) { 4178 return -1; 4179 } 4180 4181 ch->accel_channel = spdk_accel_get_io_channel(); 4182 if (!ch->accel_channel) { 4183 spdk_put_io_channel(ch->channel); 4184 return -1; 4185 } 4186 4187 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, bdev->internal.trace_id, 0, 0, 4188 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4189 4190 assert(ch->histogram == NULL); 4191 if (bdev->internal.histogram_enabled) { 4192 ch->histogram = spdk_histogram_data_alloc(); 4193 if (ch->histogram == NULL) { 4194 SPDK_ERRLOG("Could not allocate histogram\n"); 4195 } 4196 } 4197 4198 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 4199 if (!mgmt_io_ch) { 4200 spdk_put_io_channel(ch->channel); 4201 spdk_put_io_channel(ch->accel_channel); 4202 return -1; 4203 } 4204 4205 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 4206 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 4207 if (shared_resource->shared_ch == ch->channel) { 4208 spdk_put_io_channel(mgmt_io_ch); 4209 shared_resource->ref++; 4210 break; 4211 } 4212 } 4213 4214 if (shared_resource == NULL) { 4215 shared_resource = calloc(1, sizeof(*shared_resource)); 4216 if (shared_resource == NULL) { 4217 spdk_put_io_channel(ch->channel); 4218 spdk_put_io_channel(ch->accel_channel); 4219 spdk_put_io_channel(mgmt_io_ch); 4220 return -1; 4221 } 4222 4223 shared_resource->mgmt_ch = mgmt_ch; 4224 shared_resource->io_outstanding = 0; 4225 TAILQ_INIT(&shared_resource->nomem_io); 4226 shared_resource->nomem_threshold = 0; 4227 shared_resource->shared_ch = ch->channel; 4228 shared_resource->ref = 1; 4229 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 4230 } 4231 4232 ch->io_outstanding = 0; 4233 TAILQ_INIT(&ch->queued_resets); 4234 TAILQ_INIT(&ch->locked_ranges); 4235 TAILQ_INIT(&ch->qos_queued_io); 4236 ch->flags = 0; 4237 ch->trace_id = bdev->internal.trace_id; 4238 ch->shared_resource = shared_resource; 4239 4240 TAILQ_INIT(&ch->io_submitted); 4241 TAILQ_INIT(&ch->io_locked); 4242 TAILQ_INIT(&ch->io_accel_exec); 4243 TAILQ_INIT(&ch->io_memory_domain); 4244 4245 
ch->stat = bdev_alloc_io_stat(false); 4246 if (ch->stat == NULL) { 4247 bdev_channel_destroy_resource(ch); 4248 return -1; 4249 } 4250 4251 ch->stat->ticks_rate = spdk_get_ticks_hz(); 4252 4253 #ifdef SPDK_CONFIG_VTUNE 4254 { 4255 char *name; 4256 __itt_init_ittlib(NULL, 0); 4257 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 4258 if (!name) { 4259 bdev_channel_destroy_resource(ch); 4260 return -1; 4261 } 4262 ch->handle = __itt_string_handle_create(name); 4263 free(name); 4264 ch->start_tsc = spdk_get_ticks(); 4265 ch->interval_tsc = spdk_get_ticks_hz() / 100; 4266 ch->prev_stat = bdev_alloc_io_stat(false); 4267 if (ch->prev_stat == NULL) { 4268 bdev_channel_destroy_resource(ch); 4269 return -1; 4270 } 4271 } 4272 #endif 4273 4274 spdk_spin_lock(&bdev->internal.spinlock); 4275 bdev_enable_qos(bdev, ch); 4276 4277 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 4278 struct lba_range *new_range; 4279 4280 new_range = calloc(1, sizeof(*new_range)); 4281 if (new_range == NULL) { 4282 spdk_spin_unlock(&bdev->internal.spinlock); 4283 bdev_channel_destroy_resource(ch); 4284 return -1; 4285 } 4286 new_range->length = range->length; 4287 new_range->offset = range->offset; 4288 new_range->locked_ctx = range->locked_ctx; 4289 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 4290 } 4291 4292 spdk_spin_unlock(&bdev->internal.spinlock); 4293 4294 return 0; 4295 } 4296 4297 static int 4298 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 4299 void *cb_ctx) 4300 { 4301 struct spdk_bdev_channel *bdev_ch = cb_ctx; 4302 struct spdk_bdev_io *bdev_io; 4303 uint64_t buf_len; 4304 4305 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4306 if (bdev_io->internal.ch == bdev_ch) { 4307 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len); 4308 spdk_iobuf_entry_abort(ch, entry, buf_len); 4309 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4310 } 4311 4312 return 0; 4313 } 4314 4315 /* 4316 * Abort I/O that are waiting on a data buffer. 4317 */ 4318 static void 4319 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4320 { 4321 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4322 bdev_abort_all_buf_io_cb, ch); 4323 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4324 bdev_abort_all_buf_io_cb, ch); 4325 } 4326 4327 /* 4328 * Abort I/O that are queued waiting for submission. These types of I/O are 4329 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4330 */ 4331 static void 4332 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4333 { 4334 struct spdk_bdev_io *bdev_io, *tmp; 4335 4336 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4337 if (bdev_io->internal.ch == ch) { 4338 TAILQ_REMOVE(queue, bdev_io, internal.link); 4339 /* 4340 * spdk_bdev_io_complete() assumes that the completed I/O had 4341 * been submitted to the bdev module. Since in this case it 4342 * hadn't, bump io_outstanding to account for the decrement 4343 * that spdk_bdev_io_complete() will do. 
4344 */ 4345 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4346 bdev_io_increment_outstanding(ch, ch->shared_resource); 4347 } 4348 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4349 } 4350 } 4351 } 4352 4353 static bool 4354 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4355 { 4356 struct spdk_bdev_io *bdev_io; 4357 4358 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4359 if (bdev_io == bio_to_abort) { 4360 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4361 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4362 return true; 4363 } 4364 } 4365 4366 return false; 4367 } 4368 4369 static int 4370 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4371 { 4372 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4373 uint64_t buf_len; 4374 4375 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4376 if (bdev_io == bio_to_abort) { 4377 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len); 4378 spdk_iobuf_entry_abort(ch, entry, buf_len); 4379 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4380 return 1; 4381 } 4382 4383 return 0; 4384 } 4385 4386 static bool 4387 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4388 { 4389 int rc; 4390 4391 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4392 bdev_abort_buf_io_cb, bio_to_abort); 4393 if (rc == 1) { 4394 return true; 4395 } 4396 4397 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4398 bdev_abort_buf_io_cb, bio_to_abort); 4399 return rc == 1; 4400 } 4401 4402 static void 4403 bdev_qos_channel_destroy(void *cb_arg) 4404 { 4405 struct spdk_bdev_qos *qos = cb_arg; 4406 4407 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4408 spdk_poller_unregister(&qos->poller); 4409 4410 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4411 4412 free(qos); 4413 } 4414 4415 static int 4416 bdev_qos_destroy(struct spdk_bdev *bdev) 4417 { 4418 int i; 4419 4420 /* 4421 * Cleanly shutting down the QoS poller is tricky, because 4422 * during the asynchronous operation the user could open 4423 * a new descriptor and create a new channel, spawning 4424 * a new QoS poller. 4425 * 4426 * The strategy is to create a new QoS structure here and swap it 4427 * in. The shutdown path then continues to refer to the old one 4428 * until it completes and then releases it. 4429 */ 4430 struct spdk_bdev_qos *new_qos, *old_qos; 4431 4432 old_qos = bdev->internal.qos; 4433 4434 new_qos = calloc(1, sizeof(*new_qos)); 4435 if (!new_qos) { 4436 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4437 return -ENOMEM; 4438 } 4439 4440 /* Copy the old QoS data into the newly allocated structure */ 4441 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4442 4443 /* Zero out the key parts of the QoS structure */ 4444 new_qos->ch = NULL; 4445 new_qos->thread = NULL; 4446 new_qos->poller = NULL; 4447 /* 4448 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4449 * It will be used later for the new QoS structure. 
4450 */ 4451 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4452 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4453 new_qos->rate_limits[i].min_per_timeslice = 0; 4454 new_qos->rate_limits[i].max_per_timeslice = 0; 4455 } 4456 4457 bdev->internal.qos = new_qos; 4458 4459 if (old_qos->thread == NULL) { 4460 free(old_qos); 4461 } else { 4462 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4463 } 4464 4465 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4466 * been destroyed yet. The destruction path will end up waiting for the final 4467 * channel to be put before it releases resources. */ 4468 4469 return 0; 4470 } 4471 4472 void 4473 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4474 { 4475 total->bytes_read += add->bytes_read; 4476 total->num_read_ops += add->num_read_ops; 4477 total->bytes_written += add->bytes_written; 4478 total->num_write_ops += add->num_write_ops; 4479 total->bytes_unmapped += add->bytes_unmapped; 4480 total->num_unmap_ops += add->num_unmap_ops; 4481 total->bytes_copied += add->bytes_copied; 4482 total->num_copy_ops += add->num_copy_ops; 4483 total->read_latency_ticks += add->read_latency_ticks; 4484 total->write_latency_ticks += add->write_latency_ticks; 4485 total->unmap_latency_ticks += add->unmap_latency_ticks; 4486 total->copy_latency_ticks += add->copy_latency_ticks; 4487 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4488 total->max_read_latency_ticks = add->max_read_latency_ticks; 4489 } 4490 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4491 total->min_read_latency_ticks = add->min_read_latency_ticks; 4492 } 4493 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4494 total->max_write_latency_ticks = add->max_write_latency_ticks; 4495 } 4496 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4497 total->min_write_latency_ticks = add->min_write_latency_ticks; 4498 } 4499 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4500 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4501 } 4502 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4503 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4504 } 4505 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4506 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4507 } 4508 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4509 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4510 } 4511 } 4512 4513 static void 4514 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4515 { 4516 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4517 4518 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4519 memcpy(to_stat->io_error, from_stat->io_error, 4520 sizeof(struct spdk_bdev_io_error_stat)); 4521 } 4522 } 4523 4524 void 4525 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4526 { 4527 if (mode == SPDK_BDEV_RESET_STAT_NONE) { 4528 return; 4529 } 4530 4531 stat->max_read_latency_ticks = 0; 4532 stat->min_read_latency_ticks = UINT64_MAX; 4533 stat->max_write_latency_ticks = 0; 4534 stat->min_write_latency_ticks = UINT64_MAX; 4535 stat->max_unmap_latency_ticks = 0; 4536 stat->min_unmap_latency_ticks = UINT64_MAX; 4537 stat->max_copy_latency_ticks = 0; 4538 stat->min_copy_latency_ticks = UINT64_MAX; 
4539 4540 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4541 return; 4542 } 4543 4544 stat->bytes_read = 0; 4545 stat->num_read_ops = 0; 4546 stat->bytes_written = 0; 4547 stat->num_write_ops = 0; 4548 stat->bytes_unmapped = 0; 4549 stat->num_unmap_ops = 0; 4550 stat->bytes_copied = 0; 4551 stat->num_copy_ops = 0; 4552 stat->read_latency_ticks = 0; 4553 stat->write_latency_ticks = 0; 4554 stat->unmap_latency_ticks = 0; 4555 stat->copy_latency_ticks = 0; 4556 4557 if (stat->io_error != NULL) { 4558 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 4559 } 4560 } 4561 4562 struct spdk_bdev_io_stat * 4563 bdev_alloc_io_stat(bool io_error_stat) 4564 { 4565 struct spdk_bdev_io_stat *stat; 4566 4567 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4568 if (stat == NULL) { 4569 return NULL; 4570 } 4571 4572 if (io_error_stat) { 4573 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4574 if (stat->io_error == NULL) { 4575 free(stat); 4576 return NULL; 4577 } 4578 } else { 4579 stat->io_error = NULL; 4580 } 4581 4582 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4583 4584 return stat; 4585 } 4586 4587 void 4588 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4589 { 4590 if (stat != NULL) { 4591 free(stat->io_error); 4592 free(stat); 4593 } 4594 } 4595 4596 void 4597 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4598 { 4599 int i; 4600 4601 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4602 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4603 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4604 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4605 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4606 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4607 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4608 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4609 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4610 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4611 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4612 stat->min_read_latency_ticks != UINT64_MAX ? 4613 stat->min_read_latency_ticks : 0); 4614 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4615 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4616 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4617 stat->min_write_latency_ticks != UINT64_MAX ? 4618 stat->min_write_latency_ticks : 0); 4619 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4620 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4621 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4622 stat->min_unmap_latency_ticks != UINT64_MAX ? 4623 stat->min_unmap_latency_ticks : 0); 4624 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4625 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4626 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4627 stat->min_copy_latency_ticks != UINT64_MAX ? 
4628 stat->min_copy_latency_ticks : 0); 4629 4630 if (stat->io_error != NULL) { 4631 spdk_json_write_named_object_begin(w, "io_error"); 4632 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4633 if (stat->io_error->error_status[i] != 0) { 4634 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4635 stat->io_error->error_status[i]); 4636 } 4637 } 4638 spdk_json_write_object_end(w); 4639 } 4640 } 4641 4642 static void 4643 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4644 { 4645 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4646 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4647 4648 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4649 bdev_abort_all_buf_io(mgmt_ch, ch); 4650 } 4651 4652 static void 4653 bdev_channel_destroy(void *io_device, void *ctx_buf) 4654 { 4655 struct spdk_bdev_channel *ch = ctx_buf; 4656 4657 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4658 spdk_get_thread()); 4659 4660 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, ch->bdev->internal.trace_id, 0, 0, 4661 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4662 4663 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4664 spdk_spin_lock(&ch->bdev->internal.spinlock); 4665 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4666 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4667 4668 bdev_abort_all_queued_io(&ch->queued_resets, ch); 4669 4670 bdev_channel_abort_queued_ios(ch); 4671 4672 if (ch->histogram) { 4673 spdk_histogram_data_free(ch->histogram); 4674 } 4675 4676 bdev_channel_destroy_resource(ch); 4677 } 4678 4679 /* 4680 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4681 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
4682 */ 4683 static int 4684 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4685 { 4686 struct spdk_bdev_name *tmp; 4687 4688 bdev_name->name = strdup(name); 4689 if (bdev_name->name == NULL) { 4690 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4691 return -ENOMEM; 4692 } 4693 4694 bdev_name->bdev = bdev; 4695 4696 spdk_spin_lock(&g_bdev_mgr.spinlock); 4697 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4698 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4699 4700 if (tmp != NULL) { 4701 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4702 free(bdev_name->name); 4703 return -EEXIST; 4704 } 4705 4706 return 0; 4707 } 4708 4709 static void 4710 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4711 { 4712 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4713 free(bdev_name->name); 4714 } 4715 4716 static void 4717 bdev_name_del(struct spdk_bdev_name *bdev_name) 4718 { 4719 spdk_spin_lock(&g_bdev_mgr.spinlock); 4720 bdev_name_del_unsafe(bdev_name); 4721 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4722 } 4723 4724 int 4725 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4726 { 4727 struct spdk_bdev_alias *tmp; 4728 int ret; 4729 4730 if (alias == NULL) { 4731 SPDK_ERRLOG("Empty alias passed\n"); 4732 return -EINVAL; 4733 } 4734 4735 tmp = calloc(1, sizeof(*tmp)); 4736 if (tmp == NULL) { 4737 SPDK_ERRLOG("Unable to allocate alias\n"); 4738 return -ENOMEM; 4739 } 4740 4741 ret = bdev_name_add(&tmp->alias, bdev, alias); 4742 if (ret != 0) { 4743 free(tmp); 4744 return ret; 4745 } 4746 4747 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4748 4749 return 0; 4750 } 4751 4752 static int 4753 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4754 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4755 { 4756 struct spdk_bdev_alias *tmp; 4757 4758 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4759 if (strcmp(alias, tmp->alias.name) == 0) { 4760 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4761 alias_del_fn(&tmp->alias); 4762 free(tmp); 4763 return 0; 4764 } 4765 } 4766 4767 return -ENOENT; 4768 } 4769 4770 int 4771 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4772 { 4773 int rc; 4774 4775 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4776 if (rc == -ENOENT) { 4777 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4778 } 4779 4780 return rc; 4781 } 4782 4783 void 4784 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4785 { 4786 struct spdk_bdev_alias *p, *tmp; 4787 4788 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4789 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4790 bdev_name_del(&p->alias); 4791 free(p); 4792 } 4793 } 4794 4795 struct spdk_io_channel * 4796 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4797 { 4798 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4799 } 4800 4801 void * 4802 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4803 { 4804 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4805 void *ctx = NULL; 4806 4807 if (bdev->fn_table->get_module_ctx) { 4808 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4809 } 4810 4811 return ctx; 4812 } 4813 4814 const char * 4815 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4816 { 4817 return bdev->module->name; 4818 } 4819 4820 const char * 4821 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4822 { 4823 return bdev->name; 4824 } 4825 4826 const char * 4827 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4828 { 4829 return bdev->product_name; 4830 } 4831 4832 
const struct spdk_bdev_aliases_list * 4833 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4834 { 4835 return &bdev->aliases; 4836 } 4837 4838 uint32_t 4839 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4840 { 4841 return bdev->blocklen; 4842 } 4843 4844 uint32_t 4845 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4846 { 4847 return bdev->write_unit_size; 4848 } 4849 4850 uint64_t 4851 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4852 { 4853 return bdev->blockcnt; 4854 } 4855 4856 const char * 4857 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4858 { 4859 return qos_rpc_type[type]; 4860 } 4861 4862 void 4863 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4864 { 4865 int i; 4866 4867 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4868 4869 spdk_spin_lock(&bdev->internal.spinlock); 4870 if (bdev->internal.qos) { 4871 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4872 if (bdev->internal.qos->rate_limits[i].limit != 4873 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4874 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4875 if (bdev_qos_is_iops_rate_limit(i) == false) { 4876 /* Change from Byte to Megabyte which is user visible. */ 4877 limits[i] = limits[i] / 1024 / 1024; 4878 } 4879 } 4880 } 4881 } 4882 spdk_spin_unlock(&bdev->internal.spinlock); 4883 } 4884 4885 size_t 4886 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4887 { 4888 return 1 << bdev->required_alignment; 4889 } 4890 4891 uint32_t 4892 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4893 { 4894 return bdev->optimal_io_boundary; 4895 } 4896 4897 bool 4898 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4899 { 4900 return bdev->write_cache; 4901 } 4902 4903 const struct spdk_uuid * 4904 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4905 { 4906 return &bdev->uuid; 4907 } 4908 4909 uint16_t 4910 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4911 { 4912 return bdev->acwu; 4913 } 4914 4915 uint32_t 4916 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4917 { 4918 return bdev->md_len; 4919 } 4920 4921 bool 4922 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4923 { 4924 return (bdev->md_len != 0) && bdev->md_interleave; 4925 } 4926 4927 bool 4928 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4929 { 4930 return (bdev->md_len != 0) && !bdev->md_interleave; 4931 } 4932 4933 bool 4934 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4935 { 4936 return bdev->zoned; 4937 } 4938 4939 uint32_t 4940 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4941 { 4942 if (spdk_bdev_is_md_interleaved(bdev)) { 4943 return bdev->blocklen - bdev->md_len; 4944 } else { 4945 return bdev->blocklen; 4946 } 4947 } 4948 4949 uint32_t 4950 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4951 { 4952 return bdev->phys_blocklen; 4953 } 4954 4955 static uint32_t 4956 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4957 { 4958 if (!spdk_bdev_is_md_interleaved(bdev)) { 4959 return bdev->blocklen + bdev->md_len; 4960 } else { 4961 return bdev->blocklen; 4962 } 4963 } 4964 4965 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4966 typedef enum spdk_dif_type spdk_dif_type_t; 4967 typedef enum spdk_dif_pi_format spdk_dif_pi_format_t; 4968 4969 spdk_dif_type_t 4970 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4971 { 4972 if (bdev->md_len != 0) { 4973 return bdev->dif_type; 4974 } else { 4975 return SPDK_DIF_DISABLE; 4976 } 4977 } 4978 4979 spdk_dif_pi_format_t 4980 spdk_bdev_get_dif_pi_format(const struct spdk_bdev *bdev) 4981 { 4982 return bdev->dif_pi_format; 4983 } 4984 4985 bool 4986 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4987 { 4988 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4989 return bdev->dif_is_head_of_md; 4990 } else { 4991 return false; 4992 } 4993 } 4994 4995 bool 4996 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4997 enum spdk_dif_check_type check_type) 4998 { 4999 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 5000 return false; 5001 } 5002 5003 switch (check_type) { 5004 case SPDK_DIF_CHECK_TYPE_REFTAG: 5005 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 5006 case SPDK_DIF_CHECK_TYPE_APPTAG: 5007 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 5008 case SPDK_DIF_CHECK_TYPE_GUARD: 5009 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 5010 default: 5011 return false; 5012 } 5013 } 5014 5015 static uint32_t 5016 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 5017 { 5018 uint64_t aligned_length, max_write_blocks; 5019 5020 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 5021 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 5022 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 5023 5024 return max_write_blocks; 5025 } 5026 5027 uint32_t 5028 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 5029 { 5030 return bdev->max_copy; 5031 } 5032 5033 uint64_t 5034 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 5035 { 5036 return bdev->internal.measured_queue_depth; 5037 } 5038 5039 uint64_t 5040 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 5041 { 5042 return bdev->internal.period; 5043 } 5044 5045 uint64_t 5046 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 5047 { 5048 return bdev->internal.weighted_io_time; 5049 } 5050 5051 uint64_t 5052 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 5053 { 5054 return bdev->internal.io_time; 5055 } 5056 5057 union spdk_bdev_nvme_ctratt spdk_bdev_get_nvme_ctratt(struct spdk_bdev *bdev) 5058 { 5059 return bdev->ctratt; 5060 } 5061 5062 static void bdev_update_qd_sampling_period(void *ctx); 5063 5064 static void 5065 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 5066 { 5067 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 5068 5069 if (bdev->internal.measured_queue_depth) { 5070 bdev->internal.io_time += bdev->internal.period; 5071 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 5072 } 5073 5074 bdev->internal.qd_poll_in_progress = false; 5075 5076 bdev_update_qd_sampling_period(bdev); 5077 } 5078 5079 static void 5080 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5081 struct spdk_io_channel *io_ch, void *_ctx) 5082 { 5083 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 5084 5085 bdev->internal.temporary_queue_depth += ch->io_outstanding; 5086 spdk_bdev_for_each_channel_continue(i, 0); 5087 } 5088 5089 static int 5090 bdev_calculate_measured_queue_depth(void *ctx) 5091 { 5092 struct spdk_bdev *bdev 
= ctx; 5093 5094 bdev->internal.qd_poll_in_progress = true; 5095 bdev->internal.temporary_queue_depth = 0; 5096 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 5097 return SPDK_POLLER_BUSY; 5098 } 5099 5100 static void 5101 bdev_update_qd_sampling_period(void *ctx) 5102 { 5103 struct spdk_bdev *bdev = ctx; 5104 5105 if (bdev->internal.period == bdev->internal.new_period) { 5106 return; 5107 } 5108 5109 if (bdev->internal.qd_poll_in_progress) { 5110 return; 5111 } 5112 5113 bdev->internal.period = bdev->internal.new_period; 5114 5115 spdk_poller_unregister(&bdev->internal.qd_poller); 5116 if (bdev->internal.period != 0) { 5117 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5118 bdev, bdev->internal.period); 5119 } else { 5120 spdk_bdev_close(bdev->internal.qd_desc); 5121 bdev->internal.qd_desc = NULL; 5122 } 5123 } 5124 5125 static void 5126 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 5127 { 5128 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 5129 } 5130 5131 void 5132 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 5133 { 5134 int rc; 5135 5136 if (bdev->internal.new_period == period) { 5137 return; 5138 } 5139 5140 bdev->internal.new_period = period; 5141 5142 if (bdev->internal.qd_desc != NULL) { 5143 assert(bdev->internal.period != 0); 5144 5145 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 5146 bdev_update_qd_sampling_period, bdev); 5147 return; 5148 } 5149 5150 assert(bdev->internal.period == 0); 5151 5152 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 5153 NULL, &bdev->internal.qd_desc); 5154 if (rc != 0) { 5155 return; 5156 } 5157 5158 bdev->internal.period = period; 5159 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5160 bdev, period); 5161 } 5162 5163 struct bdev_get_current_qd_ctx { 5164 uint64_t current_qd; 5165 spdk_bdev_get_current_qd_cb cb_fn; 5166 void *cb_arg; 5167 }; 5168 5169 static void 5170 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 5171 { 5172 struct bdev_get_current_qd_ctx *ctx = _ctx; 5173 5174 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 5175 5176 free(ctx); 5177 } 5178 5179 static void 5180 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5181 struct spdk_io_channel *io_ch, void *_ctx) 5182 { 5183 struct bdev_get_current_qd_ctx *ctx = _ctx; 5184 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 5185 5186 ctx->current_qd += bdev_ch->io_outstanding; 5187 5188 spdk_bdev_for_each_channel_continue(i, 0); 5189 } 5190 5191 void 5192 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 5193 void *cb_arg) 5194 { 5195 struct bdev_get_current_qd_ctx *ctx; 5196 5197 assert(cb_fn != NULL); 5198 5199 ctx = calloc(1, sizeof(*ctx)); 5200 if (ctx == NULL) { 5201 cb_fn(bdev, 0, cb_arg, -ENOMEM); 5202 return; 5203 } 5204 5205 ctx->cb_fn = cb_fn; 5206 ctx->cb_arg = cb_arg; 5207 5208 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 5209 } 5210 5211 static void 5212 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 5213 { 5214 assert(desc->thread == spdk_get_thread()); 5215 5216 spdk_spin_lock(&desc->spinlock); 5217 desc->refs--; 5218 if (!desc->closed) { 5219 spdk_spin_unlock(&desc->spinlock); 5220 desc->callback.event_fn(type, 5221 desc->bdev, 5222 desc->callback.ctx); 5223 return; 5224 } 
else if (desc->refs == 0) { 5225 /* This descriptor was closed after this event_notify message was sent. 5226 * spdk_bdev_close() could not free the descriptor since this message was 5227 * in flight, so we free it now using bdev_desc_free(). 5228 */ 5229 spdk_spin_unlock(&desc->spinlock); 5230 bdev_desc_free(desc); 5231 return; 5232 } 5233 spdk_spin_unlock(&desc->spinlock); 5234 } 5235 5236 static void 5237 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 5238 { 5239 spdk_spin_lock(&desc->spinlock); 5240 desc->refs++; 5241 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 5242 spdk_spin_unlock(&desc->spinlock); 5243 } 5244 5245 static void 5246 _resize_notify(void *ctx) 5247 { 5248 struct spdk_bdev_desc *desc = ctx; 5249 5250 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 5251 } 5252 5253 int 5254 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 5255 { 5256 struct spdk_bdev_desc *desc; 5257 int ret; 5258 5259 if (size == bdev->blockcnt) { 5260 return 0; 5261 } 5262 5263 spdk_spin_lock(&bdev->internal.spinlock); 5264 5265 /* bdev has open descriptors */ 5266 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 5267 bdev->blockcnt > size) { 5268 ret = -EBUSY; 5269 } else { 5270 bdev->blockcnt = size; 5271 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 5272 event_notify(desc, _resize_notify); 5273 } 5274 ret = 0; 5275 } 5276 5277 spdk_spin_unlock(&bdev->internal.spinlock); 5278 5279 return ret; 5280 } 5281 5282 /* 5283 * Convert I/O offset and length from bytes to blocks. 5284 * 5285 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 5286 */ 5287 static uint64_t 5288 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 5289 uint64_t num_bytes, uint64_t *num_blocks) 5290 { 5291 uint32_t block_size = bdev->blocklen; 5292 uint8_t shift_cnt; 5293 5294 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
*/ 5295 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 5296 shift_cnt = spdk_u32log2(block_size); 5297 *offset_blocks = offset_bytes >> shift_cnt; 5298 *num_blocks = num_bytes >> shift_cnt; 5299 return (offset_bytes - (*offset_blocks << shift_cnt)) | 5300 (num_bytes - (*num_blocks << shift_cnt)); 5301 } else { 5302 *offset_blocks = offset_bytes / block_size; 5303 *num_blocks = num_bytes / block_size; 5304 return (offset_bytes % block_size) | (num_bytes % block_size); 5305 } 5306 } 5307 5308 static bool 5309 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 5310 { 5311 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 5312 * has been an overflow and hence the offset has been wrapped around */ 5313 if (offset_blocks + num_blocks < offset_blocks) { 5314 return false; 5315 } 5316 5317 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5318 if (offset_blocks + num_blocks > bdev->blockcnt) { 5319 return false; 5320 } 5321 5322 return true; 5323 } 5324 5325 static void 5326 bdev_seek_complete_cb(void *ctx) 5327 { 5328 struct spdk_bdev_io *bdev_io = ctx; 5329 5330 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5331 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5332 } 5333 5334 static int 5335 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5336 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5337 spdk_bdev_io_completion_cb cb, void *cb_arg) 5338 { 5339 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5340 struct spdk_bdev_io *bdev_io; 5341 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5342 5343 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5344 5345 /* Check if offset_blocks is valid looking at the validity of one block */ 5346 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5347 return -EINVAL; 5348 } 5349 5350 bdev_io = bdev_channel_get_io(channel); 5351 if (!bdev_io) { 5352 return -ENOMEM; 5353 } 5354 5355 bdev_io->internal.ch = channel; 5356 bdev_io->internal.desc = desc; 5357 bdev_io->type = io_type; 5358 bdev_io->u.bdev.offset_blocks = offset_blocks; 5359 bdev_io->u.bdev.memory_domain = NULL; 5360 bdev_io->u.bdev.memory_domain_ctx = NULL; 5361 bdev_io->u.bdev.accel_sequence = NULL; 5362 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5363 5364 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5365 /* In case bdev doesn't support seek to next data/hole offset, 5366 * it is assumed that only data and no holes are present */ 5367 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5368 bdev_io->u.bdev.seek.offset = offset_blocks; 5369 } else { 5370 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5371 } 5372 5373 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5374 return 0; 5375 } 5376 5377 bdev_io_submit(bdev_io); 5378 return 0; 5379 } 5380 5381 int 5382 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5383 uint64_t offset_blocks, 5384 spdk_bdev_io_completion_cb cb, void *cb_arg) 5385 { 5386 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5387 } 5388 5389 int 5390 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5391 uint64_t offset_blocks, 5392 spdk_bdev_io_completion_cb cb, void *cb_arg) 5393 { 5394 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5395 } 5396 5397 uint64_t 5398 spdk_bdev_io_get_seek_offset(const struct 
spdk_bdev_io *bdev_io) 5399 { 5400 return bdev_io->u.bdev.seek.offset; 5401 } 5402 5403 static int 5404 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5405 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5406 spdk_bdev_io_completion_cb cb, void *cb_arg) 5407 { 5408 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5409 struct spdk_bdev_io *bdev_io; 5410 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5411 5412 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5413 return -EINVAL; 5414 } 5415 5416 bdev_io = bdev_channel_get_io(channel); 5417 if (!bdev_io) { 5418 return -ENOMEM; 5419 } 5420 5421 bdev_io->internal.ch = channel; 5422 bdev_io->internal.desc = desc; 5423 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5424 bdev_io->u.bdev.iovs = &bdev_io->iov; 5425 bdev_io->u.bdev.iovs[0].iov_base = buf; 5426 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5427 bdev_io->u.bdev.iovcnt = 1; 5428 bdev_io->u.bdev.md_buf = md_buf; 5429 bdev_io->u.bdev.num_blocks = num_blocks; 5430 bdev_io->u.bdev.offset_blocks = offset_blocks; 5431 bdev_io->u.bdev.memory_domain = NULL; 5432 bdev_io->u.bdev.memory_domain_ctx = NULL; 5433 bdev_io->u.bdev.accel_sequence = NULL; 5434 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5435 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5436 5437 bdev_io_submit(bdev_io); 5438 return 0; 5439 } 5440 5441 int 5442 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5443 void *buf, uint64_t offset, uint64_t nbytes, 5444 spdk_bdev_io_completion_cb cb, void *cb_arg) 5445 { 5446 uint64_t offset_blocks, num_blocks; 5447 5448 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5449 nbytes, &num_blocks) != 0) { 5450 return -EINVAL; 5451 } 5452 5453 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5454 } 5455 5456 int 5457 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5458 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5459 spdk_bdev_io_completion_cb cb, void *cb_arg) 5460 { 5461 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5462 } 5463 5464 int 5465 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5466 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5467 spdk_bdev_io_completion_cb cb, void *cb_arg) 5468 { 5469 struct iovec iov = { 5470 .iov_base = buf, 5471 }; 5472 5473 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5474 return -EINVAL; 5475 } 5476 5477 if (md_buf && !_is_buf_allocated(&iov)) { 5478 return -EINVAL; 5479 } 5480 5481 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5482 cb, cb_arg); 5483 } 5484 5485 int 5486 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5487 struct iovec *iov, int iovcnt, 5488 uint64_t offset, uint64_t nbytes, 5489 spdk_bdev_io_completion_cb cb, void *cb_arg) 5490 { 5491 uint64_t offset_blocks, num_blocks; 5492 5493 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5494 nbytes, &num_blocks) != 0) { 5495 return -EINVAL; 5496 } 5497 5498 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5499 } 5500 5501 static int 5502 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5503 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 
5504 uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx, 5505 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 5506 spdk_bdev_io_completion_cb cb, void *cb_arg) 5507 { 5508 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5509 struct spdk_bdev_io *bdev_io; 5510 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5511 5512 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 5513 return -EINVAL; 5514 } 5515 5516 bdev_io = bdev_channel_get_io(channel); 5517 if (spdk_unlikely(!bdev_io)) { 5518 return -ENOMEM; 5519 } 5520 5521 bdev_io->internal.ch = channel; 5522 bdev_io->internal.desc = desc; 5523 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5524 bdev_io->u.bdev.iovs = iov; 5525 bdev_io->u.bdev.iovcnt = iovcnt; 5526 bdev_io->u.bdev.md_buf = md_buf; 5527 bdev_io->u.bdev.num_blocks = num_blocks; 5528 bdev_io->u.bdev.offset_blocks = offset_blocks; 5529 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5530 5531 if (seq != NULL) { 5532 bdev_io->internal.f.has_accel_sequence = true; 5533 bdev_io->internal.accel_sequence = seq; 5534 } 5535 5536 if (domain != NULL) { 5537 bdev_io->internal.f.has_memory_domain = true; 5538 bdev_io->internal.memory_domain = domain; 5539 bdev_io->internal.memory_domain_ctx = domain_ctx; 5540 } 5541 5542 bdev_io->u.bdev.memory_domain = domain; 5543 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5544 bdev_io->u.bdev.accel_sequence = seq; 5545 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 5546 5547 _bdev_io_submit_ext(desc, bdev_io); 5548 5549 return 0; 5550 } 5551 5552 int 5553 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5554 struct iovec *iov, int iovcnt, 5555 uint64_t offset_blocks, uint64_t num_blocks, 5556 spdk_bdev_io_completion_cb cb, void *cb_arg) 5557 { 5558 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5559 5560 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5561 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5562 } 5563 5564 int 5565 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5566 struct iovec *iov, int iovcnt, void *md_buf, 5567 uint64_t offset_blocks, uint64_t num_blocks, 5568 spdk_bdev_io_completion_cb cb, void *cb_arg) 5569 { 5570 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5571 5572 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 5573 return -EINVAL; 5574 } 5575 5576 if (md_buf && !_is_buf_allocated(iov)) { 5577 return -EINVAL; 5578 } 5579 5580 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5581 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5582 } 5583 5584 static inline bool 5585 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 5586 { 5587 /* 5588 * We check if opts size is at least of size when we first introduced 5589 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 5590 * are not checked internal. 
5591 */ 5592 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5593 sizeof(opts->metadata) && 5594 opts->size <= sizeof(*opts) && 5595 /* When memory domain is used, the user must provide data buffers */ 5596 (!opts->memory_domain || (iov && iov[0].iov_base)); 5597 } 5598 5599 int 5600 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5601 struct iovec *iov, int iovcnt, 5602 uint64_t offset_blocks, uint64_t num_blocks, 5603 spdk_bdev_io_completion_cb cb, void *cb_arg, 5604 struct spdk_bdev_ext_io_opts *opts) 5605 { 5606 struct spdk_memory_domain *domain = NULL; 5607 struct spdk_accel_sequence *seq = NULL; 5608 void *domain_ctx = NULL, *md = NULL; 5609 uint32_t dif_check_flags = 0; 5610 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5611 5612 if (opts) { 5613 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5614 return -EINVAL; 5615 } 5616 5617 md = opts->metadata; 5618 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5619 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5620 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5621 if (md) { 5622 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5623 return -EINVAL; 5624 } 5625 5626 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5627 return -EINVAL; 5628 } 5629 5630 if (spdk_unlikely(seq != NULL)) { 5631 return -EINVAL; 5632 } 5633 } 5634 } 5635 5636 dif_check_flags = bdev->dif_check_flags & 5637 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5638 5639 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5640 num_blocks, domain, domain_ctx, seq, dif_check_flags, cb, cb_arg); 5641 } 5642 5643 static int 5644 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5645 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5646 spdk_bdev_io_completion_cb cb, void *cb_arg) 5647 { 5648 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5649 struct spdk_bdev_io *bdev_io; 5650 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5651 5652 if (!desc->write) { 5653 return -EBADF; 5654 } 5655 5656 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5657 return -EINVAL; 5658 } 5659 5660 bdev_io = bdev_channel_get_io(channel); 5661 if (!bdev_io) { 5662 return -ENOMEM; 5663 } 5664 5665 bdev_io->internal.ch = channel; 5666 bdev_io->internal.desc = desc; 5667 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5668 bdev_io->u.bdev.iovs = &bdev_io->iov; 5669 bdev_io->u.bdev.iovs[0].iov_base = buf; 5670 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5671 bdev_io->u.bdev.iovcnt = 1; 5672 bdev_io->u.bdev.md_buf = md_buf; 5673 bdev_io->u.bdev.num_blocks = num_blocks; 5674 bdev_io->u.bdev.offset_blocks = offset_blocks; 5675 bdev_io->u.bdev.memory_domain = NULL; 5676 bdev_io->u.bdev.memory_domain_ctx = NULL; 5677 bdev_io->u.bdev.accel_sequence = NULL; 5678 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5679 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5680 5681 bdev_io_submit(bdev_io); 5682 return 0; 5683 } 5684 5685 int 5686 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5687 void *buf, uint64_t offset, uint64_t nbytes, 5688 spdk_bdev_io_completion_cb cb, void *cb_arg) 5689 { 5690 uint64_t offset_blocks, num_blocks; 5691 5692 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5693 nbytes, &num_blocks) != 0) { 5694 return -EINVAL; 5695 } 5696 5697 return spdk_bdev_write_blocks(desc, ch, buf, 
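/*
 * Illustrative aside: a hedged sketch of the extended read entry point above. The fields
 * set here (size, metadata, memory_domain, accel_sequence) are the ones consulted by
 * _bdev_io_check_opts() and spdk_bdev_readv_blocks_ext(); zero-initializing the struct
 * and setting .size = sizeof(opts) satisfies the size validation, and a zero field falls
 * back to its default. my_read_done is a hypothetical completion callback.
 *
 * struct spdk_bdev_ext_io_opts opts = {};
 * struct iovec iov = { .iov_base = data_buf, .iov_len = num_blocks * block_size };
 * int rc;
 *
 * opts.size = sizeof(opts);
 * opts.metadata = md_buf;  // only valid on separate-metadata bdevs, and not together
 *                          // with an accel sequence (both rejected above with -EINVAL)
 *
 * rc = spdk_bdev_readv_blocks_ext(desc, ch, &iov, 1, offset_blocks, num_blocks,
 *                                 my_read_done, NULL, &opts);
 */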
offset_blocks, num_blocks, cb, cb_arg); 5698 } 5699 5700 int 5701 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5702 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5703 spdk_bdev_io_completion_cb cb, void *cb_arg) 5704 { 5705 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5706 cb, cb_arg); 5707 } 5708 5709 int 5710 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5711 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5712 spdk_bdev_io_completion_cb cb, void *cb_arg) 5713 { 5714 struct iovec iov = { 5715 .iov_base = buf, 5716 }; 5717 5718 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5719 return -EINVAL; 5720 } 5721 5722 if (md_buf && !_is_buf_allocated(&iov)) { 5723 return -EINVAL; 5724 } 5725 5726 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5727 cb, cb_arg); 5728 } 5729 5730 static int 5731 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5732 struct iovec *iov, int iovcnt, void *md_buf, 5733 uint64_t offset_blocks, uint64_t num_blocks, 5734 struct spdk_memory_domain *domain, void *domain_ctx, 5735 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 5736 uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw, 5737 spdk_bdev_io_completion_cb cb, void *cb_arg) 5738 { 5739 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5740 struct spdk_bdev_io *bdev_io; 5741 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5742 5743 if (spdk_unlikely(!desc->write)) { 5744 return -EBADF; 5745 } 5746 5747 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 5748 return -EINVAL; 5749 } 5750 5751 bdev_io = bdev_channel_get_io(channel); 5752 if (spdk_unlikely(!bdev_io)) { 5753 return -ENOMEM; 5754 } 5755 5756 bdev_io->internal.ch = channel; 5757 bdev_io->internal.desc = desc; 5758 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5759 bdev_io->u.bdev.iovs = iov; 5760 bdev_io->u.bdev.iovcnt = iovcnt; 5761 bdev_io->u.bdev.md_buf = md_buf; 5762 bdev_io->u.bdev.num_blocks = num_blocks; 5763 bdev_io->u.bdev.offset_blocks = offset_blocks; 5764 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5765 if (seq != NULL) { 5766 bdev_io->internal.f.has_accel_sequence = true; 5767 bdev_io->internal.accel_sequence = seq; 5768 } 5769 5770 if (domain != NULL) { 5771 bdev_io->internal.f.has_memory_domain = true; 5772 bdev_io->internal.memory_domain = domain; 5773 bdev_io->internal.memory_domain_ctx = domain_ctx; 5774 } 5775 5776 bdev_io->u.bdev.memory_domain = domain; 5777 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5778 bdev_io->u.bdev.accel_sequence = seq; 5779 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 5780 bdev_io->u.bdev.nvme_cdw12.raw = nvme_cdw12_raw; 5781 bdev_io->u.bdev.nvme_cdw13.raw = nvme_cdw13_raw; 5782 5783 _bdev_io_submit_ext(desc, bdev_io); 5784 5785 return 0; 5786 } 5787 5788 int 5789 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5790 struct iovec *iov, int iovcnt, 5791 uint64_t offset, uint64_t len, 5792 spdk_bdev_io_completion_cb cb, void *cb_arg) 5793 { 5794 uint64_t offset_blocks, num_blocks; 5795 5796 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5797 len, &num_blocks) != 0) { 5798 return -EINVAL; 5799 } 5800 5801 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5802 } 5803 5804 int 5805 spdk_bdev_writev_blocks(struct spdk_bdev_desc 
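/*
 * Illustrative aside: the byte-based wrappers above (spdk_bdev_write(), and likewise
 * spdk_bdev_read() and spdk_bdev_writev()) only succeed when offset and length are exact
 * multiples of the block size, because bdev_bytes_to_blocks() rejects unaligned values
 * with -EINVAL. A hedged sketch of the equivalence, assuming a 512-byte block size:
 *
 * // 4096 bytes at byte offset 8192 == blocks 16..23 on a 512-byte-block bdev.
 * rc = spdk_bdev_write(desc, ch, buf, 8192, 4096, my_write_done, NULL);
 * // Block-based equivalent:
 * rc = spdk_bdev_write_blocks(desc, ch, buf, 16, 8, my_write_done, NULL);
 */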
*desc, struct spdk_io_channel *ch, 5806 struct iovec *iov, int iovcnt, 5807 uint64_t offset_blocks, uint64_t num_blocks, 5808 spdk_bdev_io_completion_cb cb, void *cb_arg) 5809 { 5810 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5811 5812 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5813 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 5814 cb, cb_arg); 5815 } 5816 5817 int 5818 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5819 struct iovec *iov, int iovcnt, void *md_buf, 5820 uint64_t offset_blocks, uint64_t num_blocks, 5821 spdk_bdev_io_completion_cb cb, void *cb_arg) 5822 { 5823 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5824 5825 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 5826 return -EINVAL; 5827 } 5828 5829 if (md_buf && !_is_buf_allocated(iov)) { 5830 return -EINVAL; 5831 } 5832 5833 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5834 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 5835 cb, cb_arg); 5836 } 5837 5838 int 5839 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5840 struct iovec *iov, int iovcnt, 5841 uint64_t offset_blocks, uint64_t num_blocks, 5842 spdk_bdev_io_completion_cb cb, void *cb_arg, 5843 struct spdk_bdev_ext_io_opts *opts) 5844 { 5845 struct spdk_memory_domain *domain = NULL; 5846 struct spdk_accel_sequence *seq = NULL; 5847 void *domain_ctx = NULL, *md = NULL; 5848 uint32_t dif_check_flags = 0; 5849 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5850 uint32_t nvme_cdw12_raw = 0; 5851 uint32_t nvme_cdw13_raw = 0; 5852 5853 if (opts) { 5854 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5855 return -EINVAL; 5856 } 5857 md = opts->metadata; 5858 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5859 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5860 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5861 nvme_cdw12_raw = bdev_get_ext_io_opt(opts, nvme_cdw12.raw, 0); 5862 nvme_cdw13_raw = bdev_get_ext_io_opt(opts, nvme_cdw13.raw, 0); 5863 if (md) { 5864 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5865 return -EINVAL; 5866 } 5867 5868 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5869 return -EINVAL; 5870 } 5871 5872 if (spdk_unlikely(seq != NULL)) { 5873 return -EINVAL; 5874 } 5875 } 5876 } 5877 5878 dif_check_flags = bdev->dif_check_flags & 5879 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5880 5881 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5882 domain, domain_ctx, seq, dif_check_flags, 5883 nvme_cdw12_raw, nvme_cdw13_raw, cb, cb_arg); 5884 } 5885 5886 static void 5887 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5888 { 5889 struct spdk_bdev_io *parent_io = cb_arg; 5890 struct spdk_bdev *bdev = parent_io->bdev; 5891 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5892 int i, rc = 0; 5893 5894 if (!success) { 5895 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5896 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5897 spdk_bdev_free_io(bdev_io); 5898 return; 5899 } 5900 5901 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5902 rc = memcmp(read_buf, 5903 parent_io->u.bdev.iovs[i].iov_base, 5904 parent_io->u.bdev.iovs[i].iov_len); 5905 if (rc) { 5906 break; 5907 } 5908 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5909 } 5910 5911 if (rc == 0 && parent_io->u.bdev.md_buf && 
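/*
 * Illustrative aside: spdk_bdev_writev_blocks_ext() above additionally forwards raw NVMe
 * CDW12/CDW13 values from the ext opts to the backing module; whether they are honored
 * depends on that module. A hedged sketch (field names taken from the
 * bdev_get_ext_io_opt() calls above):
 *
 * struct spdk_bdev_ext_io_opts opts = {};
 *
 * opts.size = sizeof(opts);
 * opts.nvme_cdw12.raw = 0;  // caller-chosen command dword 12 contents
 * opts.nvme_cdw13.raw = 0;  // caller-chosen command dword 13 contents
 * rc = spdk_bdev_writev_blocks_ext(desc, ch, iovs, iovcnt, offset_blocks, num_blocks,
 *                                  my_write_done, NULL, &opts);
 */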
spdk_bdev_is_md_separate(bdev)) { 5912 rc = memcmp(bdev_io->u.bdev.md_buf, 5913 parent_io->u.bdev.md_buf, 5914 spdk_bdev_get_md_size(bdev)); 5915 } 5916 5917 spdk_bdev_free_io(bdev_io); 5918 5919 if (rc == 0) { 5920 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5921 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5922 } else { 5923 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5924 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5925 } 5926 } 5927 5928 static void 5929 bdev_compare_do_read(void *_bdev_io) 5930 { 5931 struct spdk_bdev_io *bdev_io = _bdev_io; 5932 int rc; 5933 5934 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5935 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5936 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5937 bdev_compare_do_read_done, bdev_io); 5938 5939 if (rc == -ENOMEM) { 5940 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5941 } else if (rc != 0) { 5942 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5943 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5944 } 5945 } 5946 5947 static int 5948 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5949 struct iovec *iov, int iovcnt, void *md_buf, 5950 uint64_t offset_blocks, uint64_t num_blocks, 5951 spdk_bdev_io_completion_cb cb, void *cb_arg) 5952 { 5953 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5954 struct spdk_bdev_io *bdev_io; 5955 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5956 5957 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5958 return -EINVAL; 5959 } 5960 5961 bdev_io = bdev_channel_get_io(channel); 5962 if (!bdev_io) { 5963 return -ENOMEM; 5964 } 5965 5966 bdev_io->internal.ch = channel; 5967 bdev_io->internal.desc = desc; 5968 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5969 bdev_io->u.bdev.iovs = iov; 5970 bdev_io->u.bdev.iovcnt = iovcnt; 5971 bdev_io->u.bdev.md_buf = md_buf; 5972 bdev_io->u.bdev.num_blocks = num_blocks; 5973 bdev_io->u.bdev.offset_blocks = offset_blocks; 5974 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5975 bdev_io->u.bdev.memory_domain = NULL; 5976 bdev_io->u.bdev.memory_domain_ctx = NULL; 5977 bdev_io->u.bdev.accel_sequence = NULL; 5978 5979 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5980 bdev_io_submit(bdev_io); 5981 return 0; 5982 } 5983 5984 bdev_compare_do_read(bdev_io); 5985 5986 return 0; 5987 } 5988 5989 int 5990 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5991 struct iovec *iov, int iovcnt, 5992 uint64_t offset_blocks, uint64_t num_blocks, 5993 spdk_bdev_io_completion_cb cb, void *cb_arg) 5994 { 5995 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5996 num_blocks, cb, cb_arg); 5997 } 5998 5999 int 6000 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6001 struct iovec *iov, int iovcnt, void *md_buf, 6002 uint64_t offset_blocks, uint64_t num_blocks, 6003 spdk_bdev_io_completion_cb cb, void *cb_arg) 6004 { 6005 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 6006 return -EINVAL; 6007 } 6008 6009 if (md_buf && !_is_buf_allocated(iov)) { 6010 return -EINVAL; 6011 } 6012 6013 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 6014 num_blocks, cb, cb_arg); 6015 } 6016 6017 static int 6018 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6019 void 
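/*
 * Illustrative aside: when the backing module does not support SPDK_BDEV_IO_TYPE_COMPARE,
 * the code above emulates it by reading the range and memcmp()-ing against the caller's
 * buffers (including separate metadata). Either way a mismatch is reported as
 * SPDK_BDEV_IO_STATUS_MISCOMPARE. A hedged usage sketch (my_compare_done is hypothetical):
 *
 * struct iovec iov = { .iov_base = expected_buf, .iov_len = num_blocks * block_size };
 *
 * rc = spdk_bdev_comparev_blocks(desc, ch, &iov, 1, offset_blocks, num_blocks,
 *                                my_compare_done, NULL);
 * // In my_compare_done(), success == false covers both transport errors and a
 * // miscompare; the two can be told apart from the spdk_bdev_io status.
 */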
*buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 6020 spdk_bdev_io_completion_cb cb, void *cb_arg) 6021 { 6022 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6023 struct spdk_bdev_io *bdev_io; 6024 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6025 6026 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6027 return -EINVAL; 6028 } 6029 6030 bdev_io = bdev_channel_get_io(channel); 6031 if (!bdev_io) { 6032 return -ENOMEM; 6033 } 6034 6035 bdev_io->internal.ch = channel; 6036 bdev_io->internal.desc = desc; 6037 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 6038 bdev_io->u.bdev.iovs = &bdev_io->iov; 6039 bdev_io->u.bdev.iovs[0].iov_base = buf; 6040 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 6041 bdev_io->u.bdev.iovcnt = 1; 6042 bdev_io->u.bdev.md_buf = md_buf; 6043 bdev_io->u.bdev.num_blocks = num_blocks; 6044 bdev_io->u.bdev.offset_blocks = offset_blocks; 6045 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6046 bdev_io->u.bdev.memory_domain = NULL; 6047 bdev_io->u.bdev.memory_domain_ctx = NULL; 6048 bdev_io->u.bdev.accel_sequence = NULL; 6049 6050 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 6051 bdev_io_submit(bdev_io); 6052 return 0; 6053 } 6054 6055 bdev_compare_do_read(bdev_io); 6056 6057 return 0; 6058 } 6059 6060 int 6061 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6062 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 6063 spdk_bdev_io_completion_cb cb, void *cb_arg) 6064 { 6065 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 6066 cb, cb_arg); 6067 } 6068 6069 int 6070 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6071 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 6072 spdk_bdev_io_completion_cb cb, void *cb_arg) 6073 { 6074 struct iovec iov = { 6075 .iov_base = buf, 6076 }; 6077 6078 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 6079 return -EINVAL; 6080 } 6081 6082 if (md_buf && !_is_buf_allocated(&iov)) { 6083 return -EINVAL; 6084 } 6085 6086 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 6087 cb, cb_arg); 6088 } 6089 6090 static void 6091 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status) 6092 { 6093 struct spdk_bdev_io *bdev_io = ctx; 6094 6095 if (unlock_status) { 6096 SPDK_ERRLOG("LBA range unlock failed\n"); 6097 } 6098 6099 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 6100 false, bdev_io->internal.caller_ctx); 6101 } 6102 6103 static void 6104 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 6105 { 6106 bdev_io->internal.status = status; 6107 6108 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 6109 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6110 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 6111 } 6112 6113 static void 6114 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6115 { 6116 struct spdk_bdev_io *parent_io = cb_arg; 6117 6118 if (!success) { 6119 SPDK_ERRLOG("Compare and write operation failed\n"); 6120 } 6121 6122 spdk_bdev_free_io(bdev_io); 6123 6124 bdev_comparev_and_writev_blocks_unlock(parent_io, 6125 success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 6126 } 6127 6128 static void 6129 bdev_compare_and_write_do_write(void *_bdev_io) 6130 { 6131 struct spdk_bdev_io *bdev_io = _bdev_io; 6132 int rc; 6133 6134 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 6135 spdk_io_channel_from_ctx(bdev_io->internal.ch), 6136 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 6137 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6138 bdev_compare_and_write_do_write_done, bdev_io); 6139 6140 6141 if (rc == -ENOMEM) { 6142 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 6143 } else if (rc != 0) { 6144 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6145 } 6146 } 6147 6148 static void 6149 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6150 { 6151 struct spdk_bdev_io *parent_io = cb_arg; 6152 6153 spdk_bdev_free_io(bdev_io); 6154 6155 if (!success) { 6156 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 6157 return; 6158 } 6159 6160 bdev_compare_and_write_do_write(parent_io); 6161 } 6162 6163 static void 6164 bdev_compare_and_write_do_compare(void *_bdev_io) 6165 { 6166 struct spdk_bdev_io *bdev_io = _bdev_io; 6167 int rc; 6168 6169 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 6170 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 6171 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6172 bdev_compare_and_write_do_compare_done, bdev_io); 6173 6174 if (rc == -ENOMEM) { 6175 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 6176 } else if (rc != 0) { 6177 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 6178 } 6179 } 6180 6181 static void 6182 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status) 6183 { 6184 struct spdk_bdev_io *bdev_io = ctx; 6185 6186 if (status) { 6187 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 6188 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6189 return; 6190 } 6191 6192 bdev_compare_and_write_do_compare(bdev_io); 6193 } 6194 6195 int 6196 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6197 struct iovec *compare_iov, int compare_iovcnt, 6198 struct iovec *write_iov, int write_iovcnt, 6199 uint64_t offset_blocks, uint64_t num_blocks, 6200 spdk_bdev_io_completion_cb cb, void *cb_arg) 6201 { 6202 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6203 struct spdk_bdev_io *bdev_io; 6204 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6205 6206 if (!desc->write) { 6207 return -EBADF; 6208 } 6209 6210 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6211 return -EINVAL; 6212 } 6213 6214 if (num_blocks > bdev->acwu) { 6215 return -EINVAL; 6216 } 6217 6218 bdev_io = bdev_channel_get_io(channel); 6219 if (!bdev_io) { 6220 return -ENOMEM; 6221 } 6222 6223 bdev_io->internal.ch = channel; 6224 bdev_io->internal.desc = desc; 6225 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 6226 bdev_io->u.bdev.iovs = compare_iov; 6227 bdev_io->u.bdev.iovcnt = compare_iovcnt; 6228 bdev_io->u.bdev.fused_iovs = write_iov; 6229 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 6230 bdev_io->u.bdev.md_buf = NULL; 6231 bdev_io->u.bdev.num_blocks = num_blocks; 6232 bdev_io->u.bdev.offset_blocks = offset_blocks; 6233 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6234 
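/*
 * Illustrative aside: spdk_bdev_comparev_and_writev_blocks() rejects requests larger than
 * the bdev's atomic compare-and-write unit (acwu, in blocks), and when the module lacks
 * native COMPARE_AND_WRITE support it emulates the fused operation by locking the LBA
 * range, comparing, writing, then unlocking (the helpers above). A hedged sketch:
 *
 * struct iovec cmp = { .iov_base = expected_buf, .iov_len = block_size };
 * struct iovec wr  = { .iov_base = new_buf,      .iov_len = block_size };
 *
 * rc = spdk_bdev_comparev_and_writev_blocks(desc, ch, &cmp, 1, &wr, 1,
 *                                           offset_blocks, 1, my_caw_done, NULL);
 * // rc == -EINVAL if the range is invalid or num_blocks exceeds the bdev's acwu;
 * // a content mismatch completes with SPDK_BDEV_IO_STATUS_MISCOMPARE.
 */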
bdev_io->u.bdev.memory_domain = NULL; 6235 bdev_io->u.bdev.memory_domain_ctx = NULL; 6236 bdev_io->u.bdev.accel_sequence = NULL; 6237 6238 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 6239 bdev_io_submit(bdev_io); 6240 return 0; 6241 } 6242 6243 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 6244 bdev_comparev_and_writev_blocks_locked, bdev_io); 6245 } 6246 6247 int 6248 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6249 struct iovec *iov, int iovcnt, 6250 uint64_t offset_blocks, uint64_t num_blocks, 6251 bool populate, 6252 spdk_bdev_io_completion_cb cb, void *cb_arg) 6253 { 6254 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6255 struct spdk_bdev_io *bdev_io; 6256 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6257 6258 if (!desc->write) { 6259 return -EBADF; 6260 } 6261 6262 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6263 return -EINVAL; 6264 } 6265 6266 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 6267 return -ENOTSUP; 6268 } 6269 6270 bdev_io = bdev_channel_get_io(channel); 6271 if (!bdev_io) { 6272 return -ENOMEM; 6273 } 6274 6275 bdev_io->internal.ch = channel; 6276 bdev_io->internal.desc = desc; 6277 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 6278 bdev_io->u.bdev.num_blocks = num_blocks; 6279 bdev_io->u.bdev.offset_blocks = offset_blocks; 6280 bdev_io->u.bdev.iovs = iov; 6281 bdev_io->u.bdev.iovcnt = iovcnt; 6282 bdev_io->u.bdev.md_buf = NULL; 6283 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 6284 bdev_io->u.bdev.zcopy.commit = 0; 6285 bdev_io->u.bdev.zcopy.start = 1; 6286 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6287 bdev_io->u.bdev.memory_domain = NULL; 6288 bdev_io->u.bdev.memory_domain_ctx = NULL; 6289 bdev_io->u.bdev.accel_sequence = NULL; 6290 6291 bdev_io_submit(bdev_io); 6292 6293 return 0; 6294 } 6295 6296 int 6297 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 6298 spdk_bdev_io_completion_cb cb, void *cb_arg) 6299 { 6300 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 6301 return -EINVAL; 6302 } 6303 6304 bdev_io->u.bdev.zcopy.commit = commit ? 
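/*
 * Illustrative aside: the zero-copy API is a two-phase exchange. A hedged sketch of the
 * sequence (my_zcopy_* names are hypothetical; iov/iovcnt describe buffers the module is
 * allowed to fill in or replace):
 *
 * // Phase 1: borrow the bdev's own buffers for the range. populate=true additionally
 * // fills them with the on-media data (a zero-copy read).
 * rc = spdk_bdev_zcopy_start(desc, ch, iov, iovcnt, offset_blocks, num_blocks,
 *                            true, my_zcopy_start_done, NULL);
 *
 * // Phase 2, from my_zcopy_start_done() once the buffers have been inspected or
 * // modified: commit=true writes them back, commit=false just releases them.
 * rc = spdk_bdev_zcopy_end(bdev_io, true, my_zcopy_end_done, NULL);
 */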
1 : 0; 6305 bdev_io->u.bdev.zcopy.start = 0; 6306 bdev_io->internal.caller_ctx = cb_arg; 6307 bdev_io->internal.cb = cb; 6308 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 6309 6310 bdev_io_submit(bdev_io); 6311 6312 return 0; 6313 } 6314 6315 int 6316 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6317 uint64_t offset, uint64_t len, 6318 spdk_bdev_io_completion_cb cb, void *cb_arg) 6319 { 6320 uint64_t offset_blocks, num_blocks; 6321 6322 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6323 len, &num_blocks) != 0) { 6324 return -EINVAL; 6325 } 6326 6327 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6328 } 6329 6330 int 6331 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6332 uint64_t offset_blocks, uint64_t num_blocks, 6333 spdk_bdev_io_completion_cb cb, void *cb_arg) 6334 { 6335 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6336 struct spdk_bdev_io *bdev_io; 6337 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6338 6339 if (!desc->write) { 6340 return -EBADF; 6341 } 6342 6343 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6344 return -EINVAL; 6345 } 6346 6347 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 6348 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 6349 return -ENOTSUP; 6350 } 6351 6352 bdev_io = bdev_channel_get_io(channel); 6353 6354 if (!bdev_io) { 6355 return -ENOMEM; 6356 } 6357 6358 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 6359 bdev_io->internal.ch = channel; 6360 bdev_io->internal.desc = desc; 6361 bdev_io->u.bdev.offset_blocks = offset_blocks; 6362 bdev_io->u.bdev.num_blocks = num_blocks; 6363 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6364 bdev_io->u.bdev.memory_domain = NULL; 6365 bdev_io->u.bdev.memory_domain_ctx = NULL; 6366 bdev_io->u.bdev.accel_sequence = NULL; 6367 6368 /* If the write_zeroes size is large and should be split, use the generic split 6369 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported or not. 6370 * 6371 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported 6372 * or emulate it using a regular write request otherwise.
6373 */ 6374 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 6375 bdev_io->internal.f.split) { 6376 bdev_io_submit(bdev_io); 6377 return 0; 6378 } 6379 6380 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6381 6382 return bdev_write_zero_buffer(bdev_io); 6383 } 6384 6385 int 6386 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6387 uint64_t offset, uint64_t nbytes, 6388 spdk_bdev_io_completion_cb cb, void *cb_arg) 6389 { 6390 uint64_t offset_blocks, num_blocks; 6391 6392 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6393 nbytes, &num_blocks) != 0) { 6394 return -EINVAL; 6395 } 6396 6397 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6398 } 6399 6400 static void 6401 bdev_io_complete_cb(void *ctx) 6402 { 6403 struct spdk_bdev_io *bdev_io = ctx; 6404 6405 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6406 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 6407 } 6408 6409 int 6410 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6411 uint64_t offset_blocks, uint64_t num_blocks, 6412 spdk_bdev_io_completion_cb cb, void *cb_arg) 6413 { 6414 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6415 struct spdk_bdev_io *bdev_io; 6416 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6417 6418 if (!desc->write) { 6419 return -EBADF; 6420 } 6421 6422 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6423 return -EINVAL; 6424 } 6425 6426 bdev_io = bdev_channel_get_io(channel); 6427 if (!bdev_io) { 6428 return -ENOMEM; 6429 } 6430 6431 bdev_io->internal.ch = channel; 6432 bdev_io->internal.desc = desc; 6433 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6434 6435 bdev_io->u.bdev.iovs = &bdev_io->iov; 6436 bdev_io->u.bdev.iovs[0].iov_base = NULL; 6437 bdev_io->u.bdev.iovs[0].iov_len = 0; 6438 bdev_io->u.bdev.iovcnt = 1; 6439 6440 bdev_io->u.bdev.offset_blocks = offset_blocks; 6441 bdev_io->u.bdev.num_blocks = num_blocks; 6442 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6443 bdev_io->u.bdev.memory_domain = NULL; 6444 bdev_io->u.bdev.memory_domain_ctx = NULL; 6445 bdev_io->u.bdev.accel_sequence = NULL; 6446 6447 if (num_blocks == 0) { 6448 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 6449 return 0; 6450 } 6451 6452 bdev_io_submit(bdev_io); 6453 return 0; 6454 } 6455 6456 int 6457 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6458 uint64_t offset, uint64_t length, 6459 spdk_bdev_io_completion_cb cb, void *cb_arg) 6460 { 6461 uint64_t offset_blocks, num_blocks; 6462 6463 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6464 length, &num_blocks) != 0) { 6465 return -EINVAL; 6466 } 6467 6468 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6469 } 6470 6471 int 6472 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6473 uint64_t offset_blocks, uint64_t num_blocks, 6474 spdk_bdev_io_completion_cb cb, void *cb_arg) 6475 { 6476 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6477 struct spdk_bdev_io *bdev_io; 6478 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6479 6480 if (!desc->write) { 6481 return -EBADF; 6482 } 6483 6484 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6485 return -EINVAL; 6486 } 6487 6488 bdev_io = bdev_channel_get_io(channel); 6489 if (!bdev_io) { 6490 return -ENOMEM; 6491 } 6492 6493 bdev_io->internal.ch = 
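/*
 * Illustrative aside: unmap and flush follow the same submit/complete pattern as the data
 * path. Note that spdk_bdev_unmap_blocks() above treats num_blocks == 0 as an immediate
 * success, completed via bdev_io_complete_cb() on the current thread. A hedged sketch:
 *
 * // Trim 1 MiB worth of 512-byte blocks starting at block 0 (2048 * 512 = 1 MiB).
 * rc = spdk_bdev_unmap_blocks(desc, ch, 0, 2048, my_unmap_done, NULL);
 *
 * // Flush the whole device's volatile write cache.
 * rc = spdk_bdev_flush_blocks(desc, ch, 0, spdk_bdev_get_num_blocks(bdev),
 *                             my_flush_done, NULL);
 */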
channel; 6494 bdev_io->internal.desc = desc; 6495 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 6496 bdev_io->u.bdev.iovs = NULL; 6497 bdev_io->u.bdev.iovcnt = 0; 6498 bdev_io->u.bdev.offset_blocks = offset_blocks; 6499 bdev_io->u.bdev.num_blocks = num_blocks; 6500 bdev_io->u.bdev.memory_domain = NULL; 6501 bdev_io->u.bdev.memory_domain_ctx = NULL; 6502 bdev_io->u.bdev.accel_sequence = NULL; 6503 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6504 6505 bdev_io_submit(bdev_io); 6506 return 0; 6507 } 6508 6509 static int bdev_reset_poll_for_outstanding_io(void *ctx); 6510 6511 static void 6512 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 6513 { 6514 struct spdk_bdev_channel *ch = _ctx; 6515 struct spdk_bdev_io *bdev_io; 6516 6517 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6518 6519 if (status == -EBUSY) { 6520 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 6521 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 6522 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 6523 } else { 6524 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6525 6526 if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) { 6527 /* If outstanding IOs are still present and reset_io_drain_timeout 6528 * seconds passed, start the reset. */ 6529 bdev_io_submit_reset(bdev_io); 6530 } else { 6531 /* We still have in progress memory domain pull/push or we're 6532 * executing accel sequence. Since we cannot abort either of those 6533 * operations, fail the reset request. */ 6534 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6535 } 6536 } 6537 } else { 6538 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6539 SPDK_DEBUGLOG(bdev, 6540 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 6541 ch->bdev->name); 6542 /* Mark the completion status as a SUCCESS and complete the reset. */ 6543 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 6544 } 6545 } 6546 6547 static void 6548 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6549 struct spdk_io_channel *io_ch, void *_ctx) 6550 { 6551 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 6552 int status = 0; 6553 6554 if (cur_ch->io_outstanding > 0 || 6555 !TAILQ_EMPTY(&cur_ch->io_memory_domain) || 6556 !TAILQ_EMPTY(&cur_ch->io_accel_exec)) { 6557 /* If a channel has outstanding IO, set status to -EBUSY code. This will stop 6558 * further iteration over the rest of the channels and pass non-zero status 6559 * to the callback function. 
*/ 6560 status = -EBUSY; 6561 } 6562 spdk_bdev_for_each_channel_continue(i, status); 6563 } 6564 6565 static int 6566 bdev_reset_poll_for_outstanding_io(void *ctx) 6567 { 6568 struct spdk_bdev_channel *ch = ctx; 6569 struct spdk_bdev_io *bdev_io; 6570 6571 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6572 6573 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6574 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6575 bdev_reset_check_outstanding_io_done); 6576 6577 return SPDK_POLLER_BUSY; 6578 } 6579 6580 static void 6581 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6582 { 6583 struct spdk_bdev_channel *ch = _ctx; 6584 struct spdk_bdev_io *bdev_io; 6585 6586 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6587 6588 if (bdev->reset_io_drain_timeout == 0) { 6589 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6590 6591 bdev_io_submit_reset(bdev_io); 6592 return; 6593 } 6594 6595 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6596 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6597 6598 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6599 * submit the reset to the underlying module only if outstanding I/O 6600 * remain after reset_io_drain_timeout seconds have passed. */ 6601 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6602 bdev_reset_check_outstanding_io_done); 6603 } 6604 6605 static void 6606 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6607 struct spdk_io_channel *ch, void *_ctx) 6608 { 6609 struct spdk_bdev_channel *channel; 6610 struct spdk_bdev_mgmt_channel *mgmt_channel; 6611 struct spdk_bdev_shared_resource *shared_resource; 6612 bdev_io_tailq_t tmp_queued; 6613 6614 TAILQ_INIT(&tmp_queued); 6615 6616 channel = __io_ch_to_bdev_ch(ch); 6617 shared_resource = channel->shared_resource; 6618 mgmt_channel = shared_resource->mgmt_ch; 6619 6620 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6621 6622 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6623 TAILQ_SWAP(&channel->qos_queued_io, &tmp_queued, spdk_bdev_io, internal.link); 6624 } 6625 6626 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6627 bdev_abort_all_buf_io(mgmt_channel, channel); 6628 bdev_abort_all_queued_io(&tmp_queued, channel); 6629 6630 spdk_bdev_for_each_channel_continue(i, 0); 6631 } 6632 6633 static void 6634 bdev_start_reset(void *ctx) 6635 { 6636 struct spdk_bdev_channel *ch = ctx; 6637 6638 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, 6639 bdev_reset_freeze_channel_done); 6640 } 6641 6642 static void 6643 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 6644 { 6645 struct spdk_bdev *bdev = ch->bdev; 6646 6647 assert(!TAILQ_EMPTY(&ch->queued_resets)); 6648 6649 spdk_spin_lock(&bdev->internal.spinlock); 6650 if (bdev->internal.reset_in_progress == NULL) { 6651 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 6652 /* 6653 * Take a channel reference for the target bdev for the life of this 6654 * reset. This guards against the channel getting destroyed while 6655 * spdk_bdev_for_each_channel() calls related to this reset IO are in 6656 * progress. We will release the reference when this reset is 6657 * completed. 
6658 */ 6659 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6660 bdev_start_reset(ch); 6661 } 6662 spdk_spin_unlock(&bdev->internal.spinlock); 6663 } 6664 6665 int 6666 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6667 spdk_bdev_io_completion_cb cb, void *cb_arg) 6668 { 6669 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6670 struct spdk_bdev_io *bdev_io; 6671 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6672 6673 bdev_io = bdev_channel_get_io(channel); 6674 if (!bdev_io) { 6675 return -ENOMEM; 6676 } 6677 6678 bdev_io->internal.ch = channel; 6679 bdev_io->internal.desc = desc; 6680 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6681 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6682 bdev_io->u.reset.ch_ref = NULL; 6683 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6684 6685 spdk_spin_lock(&bdev->internal.spinlock); 6686 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 6687 spdk_spin_unlock(&bdev->internal.spinlock); 6688 6689 bdev_ch_add_to_io_submitted(bdev_io); 6690 6691 bdev_channel_start_reset(channel); 6692 6693 return 0; 6694 } 6695 6696 void 6697 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6698 struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode reset_mode) 6699 { 6700 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6701 6702 bdev_get_io_stat(stat, channel->stat); 6703 spdk_bdev_reset_io_stat(stat, reset_mode); 6704 } 6705 6706 static void 6707 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6708 { 6709 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6710 6711 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6712 bdev_iostat_ctx->cb_arg, 0); 6713 free(bdev_iostat_ctx); 6714 } 6715 6716 static void 6717 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6718 struct spdk_io_channel *ch, void *_ctx) 6719 { 6720 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6721 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6722 6723 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6724 spdk_bdev_reset_io_stat(channel->stat, bdev_iostat_ctx->reset_mode); 6725 spdk_bdev_for_each_channel_continue(i, 0); 6726 } 6727 6728 void 6729 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6730 enum spdk_bdev_reset_stat_mode reset_mode, spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6731 { 6732 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6733 6734 assert(bdev != NULL); 6735 assert(stat != NULL); 6736 assert(cb != NULL); 6737 6738 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6739 if (bdev_iostat_ctx == NULL) { 6740 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6741 cb(bdev, stat, cb_arg, -ENOMEM); 6742 return; 6743 } 6744 6745 bdev_iostat_ctx->stat = stat; 6746 bdev_iostat_ctx->cb = cb; 6747 bdev_iostat_ctx->cb_arg = cb_arg; 6748 bdev_iostat_ctx->reset_mode = reset_mode; 6749 6750 /* Start with the statistics from previously deleted channels. */ 6751 spdk_spin_lock(&bdev->internal.spinlock); 6752 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6753 spdk_bdev_reset_io_stat(bdev->internal.stat, reset_mode); 6754 spdk_spin_unlock(&bdev->internal.spinlock); 6755 6756 /* Then iterate and add the statistics from each existing channel. 
*/ 6757 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6758 bdev_get_device_stat_done); 6759 } 6760 6761 struct bdev_iostat_reset_ctx { 6762 enum spdk_bdev_reset_stat_mode mode; 6763 bdev_reset_device_stat_cb cb; 6764 void *cb_arg; 6765 }; 6766 6767 static void 6768 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6769 { 6770 struct bdev_iostat_reset_ctx *ctx = _ctx; 6771 6772 ctx->cb(bdev, ctx->cb_arg, 0); 6773 6774 free(ctx); 6775 } 6776 6777 static void 6778 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6779 struct spdk_io_channel *ch, void *_ctx) 6780 { 6781 struct bdev_iostat_reset_ctx *ctx = _ctx; 6782 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6783 6784 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6785 6786 spdk_bdev_for_each_channel_continue(i, 0); 6787 } 6788 6789 void 6790 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6791 bdev_reset_device_stat_cb cb, void *cb_arg) 6792 { 6793 struct bdev_iostat_reset_ctx *ctx; 6794 6795 assert(bdev != NULL); 6796 assert(cb != NULL); 6797 6798 ctx = calloc(1, sizeof(*ctx)); 6799 if (ctx == NULL) { 6800 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6801 cb(bdev, cb_arg, -ENOMEM); 6802 return; 6803 } 6804 6805 ctx->mode = mode; 6806 ctx->cb = cb; 6807 ctx->cb_arg = cb_arg; 6808 6809 spdk_spin_lock(&bdev->internal.spinlock); 6810 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6811 spdk_spin_unlock(&bdev->internal.spinlock); 6812 6813 spdk_bdev_for_each_channel(bdev, 6814 bdev_reset_each_channel_stat, 6815 ctx, 6816 bdev_reset_device_stat_done); 6817 } 6818 6819 int 6820 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6821 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6822 spdk_bdev_io_completion_cb cb, void *cb_arg) 6823 { 6824 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6825 struct spdk_bdev_io *bdev_io; 6826 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6827 6828 if (!desc->write) { 6829 return -EBADF; 6830 } 6831 6832 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6833 return -ENOTSUP; 6834 } 6835 6836 bdev_io = bdev_channel_get_io(channel); 6837 if (!bdev_io) { 6838 return -ENOMEM; 6839 } 6840 6841 bdev_io->internal.ch = channel; 6842 bdev_io->internal.desc = desc; 6843 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6844 bdev_io->u.nvme_passthru.cmd = *cmd; 6845 bdev_io->u.nvme_passthru.buf = buf; 6846 bdev_io->u.nvme_passthru.nbytes = nbytes; 6847 bdev_io->u.nvme_passthru.md_buf = NULL; 6848 bdev_io->u.nvme_passthru.md_len = 0; 6849 6850 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6851 6852 bdev_io_submit(bdev_io); 6853 return 0; 6854 } 6855 6856 int 6857 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6858 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6859 spdk_bdev_io_completion_cb cb, void *cb_arg) 6860 { 6861 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6862 struct spdk_bdev_io *bdev_io; 6863 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6864 6865 if (!desc->write) { 6866 /* 6867 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6868 * to easily determine if the command is a read or write, but for now just 6869 * do not allow io_passthru with a read-only descriptor. 
6870 */ 6871 return -EBADF; 6872 } 6873 6874 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6875 return -ENOTSUP; 6876 } 6877 6878 bdev_io = bdev_channel_get_io(channel); 6879 if (!bdev_io) { 6880 return -ENOMEM; 6881 } 6882 6883 bdev_io->internal.ch = channel; 6884 bdev_io->internal.desc = desc; 6885 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6886 bdev_io->u.nvme_passthru.cmd = *cmd; 6887 bdev_io->u.nvme_passthru.buf = buf; 6888 bdev_io->u.nvme_passthru.nbytes = nbytes; 6889 bdev_io->u.nvme_passthru.md_buf = NULL; 6890 bdev_io->u.nvme_passthru.md_len = 0; 6891 6892 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6893 6894 bdev_io_submit(bdev_io); 6895 return 0; 6896 } 6897 6898 int 6899 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6900 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6901 spdk_bdev_io_completion_cb cb, void *cb_arg) 6902 { 6903 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6904 struct spdk_bdev_io *bdev_io; 6905 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6906 6907 if (!desc->write) { 6908 /* 6909 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6910 * to easily determine if the command is a read or write, but for now just 6911 * do not allow io_passthru with a read-only descriptor. 6912 */ 6913 return -EBADF; 6914 } 6915 6916 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6917 return -ENOTSUP; 6918 } 6919 6920 bdev_io = bdev_channel_get_io(channel); 6921 if (!bdev_io) { 6922 return -ENOMEM; 6923 } 6924 6925 bdev_io->internal.ch = channel; 6926 bdev_io->internal.desc = desc; 6927 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6928 bdev_io->u.nvme_passthru.cmd = *cmd; 6929 bdev_io->u.nvme_passthru.buf = buf; 6930 bdev_io->u.nvme_passthru.nbytes = nbytes; 6931 bdev_io->u.nvme_passthru.md_buf = md_buf; 6932 bdev_io->u.nvme_passthru.md_len = md_len; 6933 6934 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6935 6936 bdev_io_submit(bdev_io); 6937 return 0; 6938 } 6939 6940 int 6941 spdk_bdev_nvme_iov_passthru_md(struct spdk_bdev_desc *desc, 6942 struct spdk_io_channel *ch, 6943 const struct spdk_nvme_cmd *cmd, 6944 struct iovec *iov, int iovcnt, size_t nbytes, 6945 void *md_buf, size_t md_len, 6946 spdk_bdev_io_completion_cb cb, void *cb_arg) 6947 { 6948 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6949 struct spdk_bdev_io *bdev_io; 6950 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6951 6952 if (!desc->write) { 6953 /* 6954 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6955 * to easily determine if the command is a read or write, but for now just 6956 * do not allow io_passthru with a read-only descriptor. 
6957 */ 6958 return -EBADF; 6959 } 6960 6961 if (md_buf && spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6962 return -ENOTSUP; 6963 } else if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6964 return -ENOTSUP; 6965 } 6966 6967 bdev_io = bdev_channel_get_io(channel); 6968 if (!bdev_io) { 6969 return -ENOMEM; 6970 } 6971 6972 bdev_io->internal.ch = channel; 6973 bdev_io->internal.desc = desc; 6974 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IOV_MD; 6975 bdev_io->u.nvme_passthru.cmd = *cmd; 6976 bdev_io->u.nvme_passthru.iovs = iov; 6977 bdev_io->u.nvme_passthru.iovcnt = iovcnt; 6978 bdev_io->u.nvme_passthru.nbytes = nbytes; 6979 bdev_io->u.nvme_passthru.md_buf = md_buf; 6980 bdev_io->u.nvme_passthru.md_len = md_len; 6981 6982 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6983 6984 bdev_io_submit(bdev_io); 6985 return 0; 6986 } 6987 6988 static void bdev_abort_retry(void *ctx); 6989 static void bdev_abort(struct spdk_bdev_io *parent_io); 6990 6991 static void 6992 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6993 { 6994 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6995 struct spdk_bdev_io *parent_io = cb_arg; 6996 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6997 6998 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6999 7000 spdk_bdev_free_io(bdev_io); 7001 7002 if (!success) { 7003 /* Check if the target I/O completed in the meantime. */ 7004 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 7005 if (tmp_io == bio_to_abort) { 7006 break; 7007 } 7008 } 7009 7010 /* If the target I/O still exists, set the parent to failed. */ 7011 if (tmp_io != NULL) { 7012 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7013 } 7014 } 7015 7016 assert(parent_io->internal.f.split); 7017 7018 parent_io->internal.split.outstanding--; 7019 if (parent_io->internal.split.outstanding == 0) { 7020 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7021 bdev_abort_retry(parent_io); 7022 } else { 7023 bdev_io_complete(parent_io); 7024 } 7025 } 7026 } 7027 7028 static int 7029 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 7030 struct spdk_bdev_io *bio_to_abort, 7031 spdk_bdev_io_completion_cb cb, void *cb_arg) 7032 { 7033 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7034 struct spdk_bdev_io *bdev_io; 7035 7036 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 7037 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 7038 /* TODO: Abort reset or abort request. */ 7039 return -ENOTSUP; 7040 } 7041 7042 bdev_io = bdev_channel_get_io(channel); 7043 if (bdev_io == NULL) { 7044 return -ENOMEM; 7045 } 7046 7047 bdev_io->internal.ch = channel; 7048 bdev_io->internal.desc = desc; 7049 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7050 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7051 7052 if (bio_to_abort->internal.f.split) { 7053 assert(bdev_io_should_split(bio_to_abort)); 7054 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 7055 7056 /* Parent abort request is not submitted directly, but to manage its 7057 * execution add it to the submitted list here. 7058 */ 7059 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7060 bdev_ch_add_to_io_submitted(bdev_io); 7061 7062 bdev_abort(bdev_io); 7063 7064 return 0; 7065 } 7066 7067 bdev_io->u.abort.bio_to_abort = bio_to_abort; 7068 7069 /* Submit the abort request to the underlying bdev module. 
*/ 7070 bdev_io_submit(bdev_io); 7071 7072 return 0; 7073 } 7074 7075 static bool 7076 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq) 7077 { 7078 struct spdk_bdev_io *iter; 7079 7080 TAILQ_FOREACH(iter, tailq, internal.link) { 7081 if (iter == bdev_io) { 7082 return true; 7083 } 7084 } 7085 7086 return false; 7087 } 7088 7089 static uint32_t 7090 _bdev_abort(struct spdk_bdev_io *parent_io) 7091 { 7092 struct spdk_bdev_desc *desc = parent_io->internal.desc; 7093 struct spdk_bdev_channel *channel = parent_io->internal.ch; 7094 void *bio_cb_arg; 7095 struct spdk_bdev_io *bio_to_abort; 7096 uint32_t matched_ios; 7097 int rc; 7098 7099 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 7100 7101 /* matched_ios is returned and will be kept by the caller. 7102 * 7103 * This function will be used for two cases: 1) the same cb_arg is used for 7104 * multiple I/Os, 2) a single large I/O is split into smaller ones. 7105 * Incrementing split_outstanding directly here may confuse readers, especially 7106 * for the 1st case. 7107 * 7108 * Completion of I/O abort is processed after stack unwinding. Hence this trick 7109 * works as expected. 7110 */ 7111 matched_ios = 0; 7112 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 7113 7114 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 7115 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 7116 continue; 7117 } 7118 7119 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 7120 /* Any I/O which was submitted after this abort command should be excluded. */ 7121 continue; 7122 } 7123 7124 /* We can't abort a request that's being pushed/pulled or executed by accel */ 7125 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) || 7126 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) { 7127 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7128 break; 7129 } 7130 7131 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 7132 if (rc != 0) { 7133 if (rc == -ENOMEM) { 7134 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 7135 } else { 7136 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7137 } 7138 break; 7139 } 7140 matched_ios++; 7141 } 7142 7143 return matched_ios; 7144 } 7145 7146 static void 7147 bdev_abort_retry(void *ctx) 7148 { 7149 struct spdk_bdev_io *parent_io = ctx; 7150 uint32_t matched_ios; 7151 7152 matched_ios = _bdev_abort(parent_io); 7153 7154 if (matched_ios == 0) { 7155 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7156 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 7157 } else { 7158 /* For retry, the case that no target I/O was found is success 7159 * because it means target I/Os completed in the meantime. 7160 */ 7161 bdev_io_complete(parent_io); 7162 } 7163 return; 7164 } 7165 7166 /* Use split_outstanding to manage the progress of aborting I/Os. */ 7167 parent_io->internal.f.split = true; 7168 parent_io->internal.split.outstanding = matched_ios; 7169 } 7170 7171 static void 7172 bdev_abort(struct spdk_bdev_io *parent_io) 7173 { 7174 uint32_t matched_ios; 7175 7176 matched_ios = _bdev_abort(parent_io); 7177 7178 if (matched_ios == 0) { 7179 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7180 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 7181 } else { 7182 /* The case where no target I/O was found is a failure.
*/ 7183 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7184 bdev_io_complete(parent_io); 7185 } 7186 return; 7187 } 7188 7189 /* Use split_outstanding to manage the progress of aborting I/Os. */ 7190 parent_io->internal.f.split = true; 7191 parent_io->internal.split.outstanding = matched_ios; 7192 } 7193 7194 int 7195 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 7196 void *bio_cb_arg, 7197 spdk_bdev_io_completion_cb cb, void *cb_arg) 7198 { 7199 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7200 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7201 struct spdk_bdev_io *bdev_io; 7202 7203 if (bio_cb_arg == NULL) { 7204 return -EINVAL; 7205 } 7206 7207 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 7208 return -ENOTSUP; 7209 } 7210 7211 bdev_io = bdev_channel_get_io(channel); 7212 if (bdev_io == NULL) { 7213 return -ENOMEM; 7214 } 7215 7216 bdev_io->internal.ch = channel; 7217 bdev_io->internal.desc = desc; 7218 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7219 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7220 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7221 7222 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 7223 7224 /* Parent abort request is not submitted directly, but to manage its execution, 7225 * add it to the submitted list here. 7226 */ 7227 bdev_ch_add_to_io_submitted(bdev_io); 7228 7229 bdev_abort(bdev_io); 7230 7231 return 0; 7232 } 7233 7234 int 7235 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 7236 struct spdk_bdev_io_wait_entry *entry) 7237 { 7238 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7239 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 7240 7241 if (bdev != entry->bdev) { 7242 SPDK_ERRLOG("bdevs do not match\n"); 7243 return -EINVAL; 7244 } 7245 7246 if (mgmt_ch->per_thread_cache_count > 0) { 7247 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 7248 return -EINVAL; 7249 } 7250 7251 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 7252 return 0; 7253 } 7254 7255 static inline void 7256 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 7257 { 7258 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 7259 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 7260 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 7261 uint32_t blocklen = bdev_io->bdev->blocklen; 7262 7263 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7264 switch (bdev_io->type) { 7265 case SPDK_BDEV_IO_TYPE_READ: 7266 io_stat->bytes_read += num_blocks * blocklen; 7267 io_stat->num_read_ops++; 7268 io_stat->read_latency_ticks += tsc_diff; 7269 if (io_stat->max_read_latency_ticks < tsc_diff) { 7270 io_stat->max_read_latency_ticks = tsc_diff; 7271 } 7272 if (io_stat->min_read_latency_ticks > tsc_diff) { 7273 io_stat->min_read_latency_ticks = tsc_diff; 7274 } 7275 break; 7276 case SPDK_BDEV_IO_TYPE_WRITE: 7277 io_stat->bytes_written += num_blocks * blocklen; 7278 io_stat->num_write_ops++; 7279 io_stat->write_latency_ticks += tsc_diff; 7280 if (io_stat->max_write_latency_ticks < tsc_diff) { 7281 io_stat->max_write_latency_ticks = tsc_diff; 7282 } 7283 if (io_stat->min_write_latency_ticks > tsc_diff) { 7284 io_stat->min_write_latency_ticks = tsc_diff; 7285 } 7286 break; 7287 case SPDK_BDEV_IO_TYPE_UNMAP: 7288 io_stat->bytes_unmapped += num_blocks * blocklen; 7289 io_stat->num_unmap_ops++; 7290 io_stat->unmap_latency_ticks += tsc_diff; 7291 if 
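/*
 * Illustrative aside: whenever a submit call in this file returns -ENOMEM (the channel's
 * spdk_bdev_io pool is exhausted), the caller can park an spdk_bdev_io_wait_entry with
 * spdk_bdev_queue_io_wait() above and retry once an spdk_bdev_io is freed. A hedged
 * sketch of that pattern; my_ctx and my_retry are hypothetical, and the entry must stay
 * valid until the callback fires:
 *
 * static void
 * my_retry(void *arg)
 * {
 *     struct my_ctx *ctx = arg;
 *     // Re-issue the original request from here.
 * }
 *
 * if (rc == -ENOMEM) {
 *     ctx->bdev_io_wait.bdev = bdev;
 *     ctx->bdev_io_wait.cb_fn = my_retry;
 *     ctx->bdev_io_wait.cb_arg = ctx;
 *     rc = spdk_bdev_queue_io_wait(bdev, ch, &ctx->bdev_io_wait);
 * }
 */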
(io_stat->max_unmap_latency_ticks < tsc_diff) { 7292 io_stat->max_unmap_latency_ticks = tsc_diff; 7293 } 7294 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 7295 io_stat->min_unmap_latency_ticks = tsc_diff; 7296 } 7297 break; 7298 case SPDK_BDEV_IO_TYPE_ZCOPY: 7299 /* Track the data in the start phase only */ 7300 if (bdev_io->u.bdev.zcopy.start) { 7301 if (bdev_io->u.bdev.zcopy.populate) { 7302 io_stat->bytes_read += num_blocks * blocklen; 7303 io_stat->num_read_ops++; 7304 io_stat->read_latency_ticks += tsc_diff; 7305 if (io_stat->max_read_latency_ticks < tsc_diff) { 7306 io_stat->max_read_latency_ticks = tsc_diff; 7307 } 7308 if (io_stat->min_read_latency_ticks > tsc_diff) { 7309 io_stat->min_read_latency_ticks = tsc_diff; 7310 } 7311 } else { 7312 io_stat->bytes_written += num_blocks * blocklen; 7313 io_stat->num_write_ops++; 7314 io_stat->write_latency_ticks += tsc_diff; 7315 if (io_stat->max_write_latency_ticks < tsc_diff) { 7316 io_stat->max_write_latency_ticks = tsc_diff; 7317 } 7318 if (io_stat->min_write_latency_ticks > tsc_diff) { 7319 io_stat->min_write_latency_ticks = tsc_diff; 7320 } 7321 } 7322 } 7323 break; 7324 case SPDK_BDEV_IO_TYPE_COPY: 7325 io_stat->bytes_copied += num_blocks * blocklen; 7326 io_stat->num_copy_ops++; 7327 bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; 7328 if (io_stat->max_copy_latency_ticks < tsc_diff) { 7329 io_stat->max_copy_latency_ticks = tsc_diff; 7330 } 7331 if (io_stat->min_copy_latency_ticks > tsc_diff) { 7332 io_stat->min_copy_latency_ticks = tsc_diff; 7333 } 7334 break; 7335 default: 7336 break; 7337 } 7338 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 7339 io_stat = bdev_io->bdev->internal.stat; 7340 assert(io_stat->io_error != NULL); 7341 7342 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 7343 io_stat->io_error->error_status[-io_status - 1]++; 7344 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 7345 } 7346 7347 #ifdef SPDK_CONFIG_VTUNE 7348 uint64_t now_tsc = spdk_get_ticks(); 7349 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 7350 uint64_t data[5]; 7351 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 7352 7353 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 7354 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 7355 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 7356 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 7357 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 
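/*
 * Illustrative aside: every latency counter accumulated above is in TSC ticks, not time
 * units. A hedged sketch of converting an average read latency to microseconds for
 * display (io_stat here stands for any spdk_bdev_io_stat the caller has fetched):
 *
 * uint64_t ticks_hz = spdk_get_ticks_hz();
 * double avg_read_us = 0.0;
 *
 * if (io_stat->num_read_ops > 0) {
 *     avg_read_us = (double)io_stat->read_latency_ticks * 1000000.0 /
 *                   (double)ticks_hz / (double)io_stat->num_read_ops;
 * }
 */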
7358 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 7359 7360 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 7361 __itt_metadata_u64, 5, data); 7362 7363 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 7364 bdev_io->internal.ch->start_tsc = now_tsc; 7365 } 7366 #endif 7367 } 7368 7369 static inline void 7370 _bdev_io_complete(void *ctx) 7371 { 7372 struct spdk_bdev_io *bdev_io = ctx; 7373 7374 if (spdk_unlikely(bdev_io_use_accel_sequence(bdev_io))) { 7375 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7376 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 7377 } 7378 7379 assert(bdev_io->internal.cb != NULL); 7380 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 7381 7382 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 7383 bdev_io->internal.caller_ctx); 7384 } 7385 7386 static inline void 7387 bdev_io_complete(void *ctx) 7388 { 7389 struct spdk_bdev_io *bdev_io = ctx; 7390 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7391 uint64_t tsc, tsc_diff; 7392 7393 if (spdk_unlikely(bdev_io->internal.f.in_submit_request)) { 7394 /* 7395 * Defer completion to avoid potential infinite recursion if the 7396 * user's completion callback issues a new I/O. 7397 */ 7398 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7399 bdev_io_complete, bdev_io); 7400 return; 7401 } 7402 7403 tsc = spdk_get_ticks(); 7404 tsc_diff = tsc - bdev_io->internal.submit_tsc; 7405 7406 bdev_ch_remove_from_io_submitted(bdev_io); 7407 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, bdev_ch->trace_id, 0, (uintptr_t)bdev_io, 7408 bdev_io->internal.caller_ctx, bdev_ch->queue_depth); 7409 7410 if (bdev_ch->histogram) { 7411 if (bdev_io->bdev->internal.histogram_io_type == 0 || 7412 bdev_io->bdev->internal.histogram_io_type == bdev_io->type) { 7413 /* 7414 * Tally all I/O types if the histogram_io_type is set to 0. 7415 */ 7416 spdk_histogram_data_tally(bdev_ch->histogram, tsc_diff); 7417 } 7418 } 7419 7420 bdev_io_update_io_stat(bdev_io, tsc_diff); 7421 _bdev_io_complete(bdev_io); 7422 } 7423 7424 /* The difference between this function and bdev_io_complete() is that this should be called to 7425 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 7426 * io_submitted list and don't have submit_tsc updated. 7427 */ 7428 static inline void 7429 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 7430 { 7431 /* Since the IO hasn't been submitted it's bound to be failed */ 7432 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7433 7434 /* At this point we don't know if the IO is completed from submission context or not, but, 7435 * since this is an error path, we can always do an spdk_thread_send_msg(). 
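	 * Bouncing through the message queue guarantees that the user's completion
	 * callback only runs after the current call stack has unwound, mirroring
	 * the in_submit_request deferral done in bdev_io_complete().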
*/ 7436 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7437 _bdev_io_complete, bdev_io); 7438 } 7439 7440 static void bdev_destroy_cb(void *io_device); 7441 7442 static void 7443 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7444 { 7445 struct spdk_bdev_io *bdev_io = _ctx; 7446 7447 if (bdev_io->u.reset.ch_ref != NULL) { 7448 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7449 bdev_io->u.reset.ch_ref = NULL; 7450 } 7451 7452 bdev_io_complete(bdev_io); 7453 7454 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7455 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7456 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7457 } 7458 } 7459 7460 static void 7461 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7462 struct spdk_io_channel *_ch, void *_ctx) 7463 { 7464 struct spdk_bdev_io *bdev_io = _ctx; 7465 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7466 struct spdk_bdev_io *queued_reset; 7467 7468 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7469 while (!TAILQ_EMPTY(&ch->queued_resets)) { 7470 queued_reset = TAILQ_FIRST(&ch->queued_resets); 7471 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 7472 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 7473 } 7474 7475 spdk_bdev_for_each_channel_continue(i, 0); 7476 } 7477 7478 static void 7479 bdev_io_complete_sequence_cb(void *ctx, int status) 7480 { 7481 struct spdk_bdev_io *bdev_io = ctx; 7482 7483 /* u.bdev.accel_sequence should have already been cleared at this point */ 7484 assert(bdev_io->u.bdev.accel_sequence == NULL); 7485 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7486 bdev_io->internal.f.has_accel_sequence = false; 7487 7488 if (spdk_unlikely(status != 0)) { 7489 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7490 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7491 } 7492 7493 bdev_io_complete(bdev_io); 7494 } 7495 7496 void 7497 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7498 { 7499 struct spdk_bdev *bdev = bdev_io->bdev; 7500 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7501 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7502 7503 if (spdk_unlikely(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING)) { 7504 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7505 spdk_bdev_get_module_name(bdev), 7506 bdev_io_status_get_string(bdev_io->internal.status)); 7507 assert(false); 7508 } 7509 bdev_io->internal.status = status; 7510 7511 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7512 bool unlock_channels = false; 7513 7514 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 7515 SPDK_ERRLOG("NOMEM returned for reset\n"); 7516 } 7517 spdk_spin_lock(&bdev->internal.spinlock); 7518 if (bdev_io == bdev->internal.reset_in_progress) { 7519 bdev->internal.reset_in_progress = NULL; 7520 unlock_channels = true; 7521 } 7522 spdk_spin_unlock(&bdev->internal.spinlock); 7523 7524 if (unlock_channels) { 7525 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 7526 bdev_reset_complete); 7527 return; 7528 } 7529 } else { 7530 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7531 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7532 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7533 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7534 return; 7535 } else if 
(spdk_unlikely(bdev_io->internal.f.has_bounce_buf && 7536 !bdev_io_use_accel_sequence(bdev_io))) { 7537 _bdev_io_push_bounce_data_buffer(bdev_io, 7538 _bdev_io_complete_push_bounce_done); 7539 /* bdev IO will be completed in the callback */ 7540 return; 7541 } 7542 } 7543 7544 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7545 return; 7546 } 7547 } 7548 7549 bdev_io_complete(bdev_io); 7550 } 7551 7552 void 7553 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 7554 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7555 { 7556 enum spdk_bdev_io_status status; 7557 7558 if (sc == SPDK_SCSI_STATUS_GOOD) { 7559 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7560 } else { 7561 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7562 bdev_io->internal.error.scsi.sc = sc; 7563 bdev_io->internal.error.scsi.sk = sk; 7564 bdev_io->internal.error.scsi.asc = asc; 7565 bdev_io->internal.error.scsi.ascq = ascq; 7566 } 7567 7568 spdk_bdev_io_complete(bdev_io, status); 7569 } 7570 7571 void 7572 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7573 int *sc, int *sk, int *asc, int *ascq) 7574 { 7575 assert(sc != NULL); 7576 assert(sk != NULL); 7577 assert(asc != NULL); 7578 assert(ascq != NULL); 7579 7580 switch (bdev_io->internal.status) { 7581 case SPDK_BDEV_IO_STATUS_SUCCESS: 7582 *sc = SPDK_SCSI_STATUS_GOOD; 7583 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7584 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7585 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7586 break; 7587 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7588 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7589 break; 7590 case SPDK_BDEV_IO_STATUS_MISCOMPARE: 7591 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7592 *sk = SPDK_SCSI_SENSE_MISCOMPARE; 7593 *asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION; 7594 *ascq = bdev_io->internal.error.scsi.ascq; 7595 break; 7596 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7597 *sc = bdev_io->internal.error.scsi.sc; 7598 *sk = bdev_io->internal.error.scsi.sk; 7599 *asc = bdev_io->internal.error.scsi.asc; 7600 *ascq = bdev_io->internal.error.scsi.ascq; 7601 break; 7602 default: 7603 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7604 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7605 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7606 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7607 break; 7608 } 7609 } 7610 7611 void 7612 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7613 { 7614 enum spdk_bdev_io_status status; 7615 7616 if (aio_result == 0) { 7617 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7618 } else { 7619 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7620 } 7621 7622 bdev_io->internal.error.aio_result = aio_result; 7623 7624 spdk_bdev_io_complete(bdev_io, status); 7625 } 7626 7627 void 7628 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7629 { 7630 assert(aio_result != NULL); 7631 7632 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7633 *aio_result = bdev_io->internal.error.aio_result; 7634 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7635 *aio_result = 0; 7636 } else { 7637 *aio_result = -EIO; 7638 } 7639 } 7640 7641 void 7642 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7643 { 7644 enum spdk_bdev_io_status status; 7645 7646 if (spdk_likely(sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS)) { 7647 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7648 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == 
SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7649 status = SPDK_BDEV_IO_STATUS_ABORTED; 7650 } else { 7651 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7652 } 7653 7654 bdev_io->internal.error.nvme.cdw0 = cdw0; 7655 bdev_io->internal.error.nvme.sct = sct; 7656 bdev_io->internal.error.nvme.sc = sc; 7657 7658 spdk_bdev_io_complete(bdev_io, status); 7659 } 7660 7661 void 7662 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7663 { 7664 assert(sct != NULL); 7665 assert(sc != NULL); 7666 assert(cdw0 != NULL); 7667 7668 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7669 *sct = SPDK_NVME_SCT_GENERIC; 7670 *sc = SPDK_NVME_SC_SUCCESS; 7671 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7672 *cdw0 = 0; 7673 } else { 7674 *cdw0 = 1U; 7675 } 7676 return; 7677 } 7678 7679 if (spdk_likely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7680 *sct = SPDK_NVME_SCT_GENERIC; 7681 *sc = SPDK_NVME_SC_SUCCESS; 7682 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7683 *sct = bdev_io->internal.error.nvme.sct; 7684 *sc = bdev_io->internal.error.nvme.sc; 7685 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7686 *sct = SPDK_NVME_SCT_GENERIC; 7687 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7688 } else { 7689 *sct = SPDK_NVME_SCT_GENERIC; 7690 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7691 } 7692 7693 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7694 } 7695 7696 void 7697 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7698 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7699 { 7700 assert(first_sct != NULL); 7701 assert(first_sc != NULL); 7702 assert(second_sct != NULL); 7703 assert(second_sc != NULL); 7704 assert(cdw0 != NULL); 7705 7706 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7707 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7708 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7709 *first_sct = bdev_io->internal.error.nvme.sct; 7710 *first_sc = bdev_io->internal.error.nvme.sc; 7711 *second_sct = SPDK_NVME_SCT_GENERIC; 7712 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7713 } else { 7714 *first_sct = SPDK_NVME_SCT_GENERIC; 7715 *first_sc = SPDK_NVME_SC_SUCCESS; 7716 *second_sct = bdev_io->internal.error.nvme.sct; 7717 *second_sc = bdev_io->internal.error.nvme.sc; 7718 } 7719 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7720 *first_sct = SPDK_NVME_SCT_GENERIC; 7721 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7722 *second_sct = SPDK_NVME_SCT_GENERIC; 7723 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7724 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7725 *first_sct = SPDK_NVME_SCT_GENERIC; 7726 *first_sc = SPDK_NVME_SC_SUCCESS; 7727 *second_sct = SPDK_NVME_SCT_GENERIC; 7728 *second_sc = SPDK_NVME_SC_SUCCESS; 7729 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7730 *first_sct = SPDK_NVME_SCT_GENERIC; 7731 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7732 *second_sct = SPDK_NVME_SCT_GENERIC; 7733 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7734 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7735 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7736 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 7737 *second_sct = SPDK_NVME_SCT_GENERIC; 7738 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7739 } else { 7740 *first_sct = SPDK_NVME_SCT_GENERIC; 7741 *first_sc = 
SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7742 *second_sct = SPDK_NVME_SCT_GENERIC; 7743 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7744 } 7745 7746 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7747 } 7748 7749 void 7750 spdk_bdev_io_complete_base_io_status(struct spdk_bdev_io *bdev_io, 7751 const struct spdk_bdev_io *base_io) 7752 { 7753 switch (base_io->internal.status) { 7754 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7755 spdk_bdev_io_complete_nvme_status(bdev_io, 7756 base_io->internal.error.nvme.cdw0, 7757 base_io->internal.error.nvme.sct, 7758 base_io->internal.error.nvme.sc); 7759 break; 7760 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7761 spdk_bdev_io_complete_scsi_status(bdev_io, 7762 base_io->internal.error.scsi.sc, 7763 base_io->internal.error.scsi.sk, 7764 base_io->internal.error.scsi.asc, 7765 base_io->internal.error.scsi.ascq); 7766 break; 7767 case SPDK_BDEV_IO_STATUS_AIO_ERROR: 7768 spdk_bdev_io_complete_aio_status(bdev_io, base_io->internal.error.aio_result); 7769 break; 7770 default: 7771 spdk_bdev_io_complete(bdev_io, base_io->internal.status); 7772 break; 7773 } 7774 } 7775 7776 struct spdk_thread * 7777 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7778 { 7779 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7780 } 7781 7782 struct spdk_io_channel * 7783 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7784 { 7785 return bdev_io->internal.ch->channel; 7786 } 7787 7788 static int 7789 bdev_register(struct spdk_bdev *bdev) 7790 { 7791 char *bdev_name; 7792 char uuid[SPDK_UUID_STRING_LEN]; 7793 struct spdk_iobuf_opts iobuf_opts; 7794 int ret; 7795 7796 assert(bdev->module != NULL); 7797 7798 if (!bdev->name) { 7799 SPDK_ERRLOG("Bdev name is NULL\n"); 7800 return -EINVAL; 7801 } 7802 7803 if (!strlen(bdev->name)) { 7804 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7805 return -EINVAL; 7806 } 7807 7808 /* Users often register their own I/O devices using the bdev name. In 7809 * order to avoid conflicts, prepend bdev_. */ 7810 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7811 if (!bdev_name) { 7812 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7813 return -ENOMEM; 7814 } 7815 7816 bdev->internal.stat = bdev_alloc_io_stat(true); 7817 if (!bdev->internal.stat) { 7818 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7819 free(bdev_name); 7820 return -ENOMEM; 7821 } 7822 7823 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7824 bdev->internal.measured_queue_depth = UINT64_MAX; 7825 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7826 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7827 bdev->internal.qd_poller = NULL; 7828 bdev->internal.qos = NULL; 7829 7830 TAILQ_INIT(&bdev->internal.open_descs); 7831 TAILQ_INIT(&bdev->internal.locked_ranges); 7832 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7833 TAILQ_INIT(&bdev->aliases); 7834 7835 /* UUID may be specified by the user or defined by bdev itself. 7836 * Otherwise it will be generated here, so this field will never be empty. 
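	 *
	 * A module that wants a stable identity can fill bdev->uuid before calling
	 * spdk_bdev_register(), for instance (illustrative sketch only;
	 * stored_uuid_str is a hypothetical persisted string):
	 *
	 *     if (spdk_uuid_parse(&bdev->uuid, stored_uuid_str) != 0) {
	 *             memset(&bdev->uuid, 0, sizeof(bdev->uuid));
	 *     }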
*/ 7837 if (spdk_uuid_is_null(&bdev->uuid)) { 7838 spdk_uuid_generate(&bdev->uuid); 7839 } 7840 7841 /* Add the UUID alias only if it's different than the name */ 7842 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7843 if (strcmp(bdev->name, uuid) != 0) { 7844 ret = spdk_bdev_alias_add(bdev, uuid); 7845 if (ret != 0) { 7846 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7847 bdev_free_io_stat(bdev->internal.stat); 7848 free(bdev_name); 7849 return ret; 7850 } 7851 } 7852 7853 spdk_iobuf_get_opts(&iobuf_opts, sizeof(iobuf_opts)); 7854 if (spdk_bdev_get_buf_align(bdev) > 1) { 7855 bdev->max_rw_size = spdk_min(bdev->max_rw_size ? bdev->max_rw_size : UINT32_MAX, 7856 iobuf_opts.large_bufsize / bdev->blocklen); 7857 } 7858 7859 /* If the user didn't specify a write unit size, set it to one. */ 7860 if (bdev->write_unit_size == 0) { 7861 bdev->write_unit_size = 1; 7862 } 7863 7864 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7865 if (bdev->acwu == 0) { 7866 bdev->acwu = bdev->write_unit_size; 7867 } 7868 7869 if (bdev->phys_blocklen == 0) { 7870 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7871 } 7872 7873 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 7874 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 7875 } 7876 7877 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 7878 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 7879 } 7880 7881 bdev->internal.reset_in_progress = NULL; 7882 bdev->internal.qd_poll_in_progress = false; 7883 bdev->internal.period = 0; 7884 bdev->internal.new_period = 0; 7885 bdev->internal.trace_id = spdk_trace_register_owner(OWNER_TYPE_BDEV, bdev_name); 7886 7887 /* 7888 * Initialize spinlock before registering IO device because spinlock is used in 7889 * bdev_channel_create 7890 */ 7891 spdk_spin_init(&bdev->internal.spinlock); 7892 7893 spdk_io_device_register(__bdev_to_io_dev(bdev), 7894 bdev_channel_create, bdev_channel_destroy, 7895 sizeof(struct spdk_bdev_channel), 7896 bdev_name); 7897 7898 /* 7899 * Register bdev name only after the bdev object is ready. 7900 * After bdev_name_add returns, it is possible for other threads to start using the bdev, 7901 * create IO channels... 
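	 * In other words, everything the bdev needs to service I/O must already be
	 * in place before this call.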
7902 */ 7903 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 7904 if (ret != 0) { 7905 spdk_io_device_unregister(__bdev_to_io_dev(bdev), NULL); 7906 bdev_free_io_stat(bdev->internal.stat); 7907 spdk_spin_destroy(&bdev->internal.spinlock); 7908 free(bdev_name); 7909 return ret; 7910 } 7911 7912 free(bdev_name); 7913 7914 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 7915 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 7916 7917 return 0; 7918 } 7919 7920 static void 7921 bdev_destroy_cb(void *io_device) 7922 { 7923 int rc; 7924 struct spdk_bdev *bdev; 7925 spdk_bdev_unregister_cb cb_fn; 7926 void *cb_arg; 7927 7928 bdev = __bdev_from_io_dev(io_device); 7929 7930 if (bdev->internal.unregister_td != spdk_get_thread()) { 7931 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 7932 return; 7933 } 7934 7935 cb_fn = bdev->internal.unregister_cb; 7936 cb_arg = bdev->internal.unregister_ctx; 7937 7938 spdk_spin_destroy(&bdev->internal.spinlock); 7939 free(bdev->internal.qos); 7940 bdev_free_io_stat(bdev->internal.stat); 7941 spdk_trace_unregister_owner(bdev->internal.trace_id); 7942 7943 rc = bdev->fn_table->destruct(bdev->ctxt); 7944 if (rc < 0) { 7945 SPDK_ERRLOG("destruct failed\n"); 7946 } 7947 if (rc <= 0 && cb_fn != NULL) { 7948 cb_fn(cb_arg, rc); 7949 } 7950 } 7951 7952 void 7953 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 7954 { 7955 if (bdev->internal.unregister_cb != NULL) { 7956 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 7957 } 7958 } 7959 7960 static void 7961 _remove_notify(void *arg) 7962 { 7963 struct spdk_bdev_desc *desc = arg; 7964 7965 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 7966 } 7967 7968 /* returns: 0 - bdev removed and ready to be destructed. 7969 * -EBUSY - bdev can't be destructed yet. */ 7970 static int 7971 bdev_unregister_unsafe(struct spdk_bdev *bdev) 7972 { 7973 struct spdk_bdev_desc *desc, *tmp; 7974 int rc = 0; 7975 char uuid[SPDK_UUID_STRING_LEN]; 7976 7977 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 7978 assert(spdk_spin_held(&bdev->internal.spinlock)); 7979 7980 /* Notify each descriptor about hotremoval */ 7981 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 7982 rc = -EBUSY; 7983 /* 7984 * Defer invocation of the event_cb to a separate message that will 7985 * run later on its thread. This ensures this context unwinds and 7986 * we don't recursively unregister this bdev again if the event_cb 7987 * immediately closes its descriptor. 7988 */ 7989 event_notify(desc, _remove_notify); 7990 } 7991 7992 /* If there are no descriptors, proceed removing the bdev */ 7993 if (rc == 0) { 7994 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 7995 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 7996 7997 /* Delete the name and the UUID alias */ 7998 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7999 bdev_name_del_unsafe(&bdev->internal.bdev_name); 8000 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 8001 8002 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 8003 8004 if (bdev->internal.reset_in_progress != NULL) { 8005 /* If reset is in progress, let the completion callback for reset 8006 * unregister the bdev. 
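			 * bdev_reset_complete() re-checks SPDK_BDEV_STATUS_REMOVING and the
			 * open_descs list and performs the deferred unregister once the
			 * reset finishes.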
			 */
			rc = -EBUSY;
		}
	}

	return rc;
}

static void
bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
			      struct spdk_io_channel *io_ch, void *_ctx)
{
	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);

	bdev_channel_abort_queued_ios(bdev_ch);
	spdk_bdev_for_each_channel_continue(i, 0);
}

static void
bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status)
{
	int rc;

	spdk_spin_lock(&g_bdev_mgr.spinlock);
	spdk_spin_lock(&bdev->internal.spinlock);
	/*
	 * Set the status to REMOVING only after all channels have been aborted. Otherwise,
	 * the last spdk_bdev_close() may call spdk_io_device_unregister() while
	 * spdk_bdev_for_each_channel() is still executing, and spdk_io_device_unregister()
	 * may fail.
	 */
	bdev->internal.status = SPDK_BDEV_STATUS_REMOVING;
	rc = bdev_unregister_unsafe(bdev);
	spdk_spin_unlock(&bdev->internal.spinlock);
	spdk_spin_unlock(&g_bdev_mgr.spinlock);

	if (rc == 0) {
		spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
	}
}

void
spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
{
	struct spdk_thread *thread;

	SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name);

	thread = spdk_get_thread();
	if (!thread) {
		/* The user called this from a non-SPDK thread. */
		if (cb_fn != NULL) {
			cb_fn(cb_arg, -ENOTSUP);
		}
		return;
	}

	spdk_spin_lock(&g_bdev_mgr.spinlock);
	if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING ||
	    bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) {
		spdk_spin_unlock(&g_bdev_mgr.spinlock);
		if (cb_fn) {
			cb_fn(cb_arg, -EBUSY);
		}
		return;
	}

	spdk_spin_lock(&bdev->internal.spinlock);
	bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING;
	bdev->internal.unregister_cb = cb_fn;
	bdev->internal.unregister_ctx = cb_arg;
	bdev->internal.unregister_td = thread;
	spdk_spin_unlock(&bdev->internal.spinlock);
	spdk_spin_unlock(&g_bdev_mgr.spinlock);

	spdk_bdev_set_qd_sampling_period(bdev, 0);

	spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev,
				   bdev_unregister);
}

int
spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module,
			     spdk_bdev_unregister_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev_desc *desc;
	struct spdk_bdev *bdev;
	int rc;

	rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name);
		return rc;
	}

	bdev = spdk_bdev_desc_get_bdev(desc);

	if (bdev->module != module) {
		spdk_bdev_close(desc);
		SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n",
			    bdev_name);
		return -ENODEV;
	}

	spdk_bdev_unregister(bdev, cb_fn, cb_arg);

	spdk_bdev_close(desc);

	return 0;
}

static int
bdev_start_qos(struct spdk_bdev *bdev)
{
	struct set_qos_limit_ctx *ctx;

	/* Enable QoS */
	if (bdev->internal.qos && bdev->internal.qos->thread == NULL) {
		ctx = calloc(1, sizeof(*ctx));
		if (ctx == NULL) {
			SPDK_ERRLOG("Failed to allocate memory for QoS context\n");
			return -ENOMEM;
		}
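		/*
		 * Walk every existing channel to enable QoS on it; the context is
		 * freed in bdev_enable_qos_done() once the iteration completes.
		 */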
ctx->bdev = bdev; 8131 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 8132 } 8133 8134 return 0; 8135 } 8136 8137 static void 8138 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 8139 struct spdk_bdev *bdev) 8140 { 8141 enum spdk_bdev_claim_type type; 8142 const char *typename, *modname; 8143 extern struct spdk_log_flag SPDK_LOG_bdev; 8144 8145 assert(spdk_spin_held(&bdev->internal.spinlock)); 8146 8147 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 8148 return; 8149 } 8150 8151 type = bdev->internal.claim_type; 8152 typename = spdk_bdev_claim_get_name(type); 8153 8154 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 8155 modname = bdev->internal.claim.v1.module->name; 8156 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 8157 bdev->name, detail, typename, modname); 8158 return; 8159 } 8160 8161 if (claim_type_is_v2(type)) { 8162 struct spdk_bdev_module_claim *claim; 8163 8164 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 8165 modname = claim->module->name; 8166 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 8167 bdev->name, detail, typename, modname); 8168 } 8169 return; 8170 } 8171 8172 assert(false); 8173 } 8174 8175 static int 8176 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 8177 { 8178 struct spdk_thread *thread; 8179 int rc = 0; 8180 8181 thread = spdk_get_thread(); 8182 if (!thread) { 8183 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 8184 return -ENOTSUP; 8185 } 8186 8187 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8188 spdk_get_thread()); 8189 8190 desc->bdev = bdev; 8191 desc->thread = thread; 8192 desc->write = write; 8193 8194 spdk_spin_lock(&bdev->internal.spinlock); 8195 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 8196 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8197 spdk_spin_unlock(&bdev->internal.spinlock); 8198 return -ENODEV; 8199 } 8200 8201 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8202 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8203 spdk_spin_unlock(&bdev->internal.spinlock); 8204 return -EPERM; 8205 } 8206 8207 rc = bdev_start_qos(bdev); 8208 if (rc != 0) { 8209 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 8210 spdk_spin_unlock(&bdev->internal.spinlock); 8211 return rc; 8212 } 8213 8214 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 8215 8216 spdk_spin_unlock(&bdev->internal.spinlock); 8217 8218 return 0; 8219 } 8220 8221 static int 8222 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 8223 struct spdk_bdev_desc **_desc) 8224 { 8225 struct spdk_bdev_desc *desc; 8226 unsigned int i; 8227 8228 desc = calloc(1, sizeof(*desc)); 8229 if (desc == NULL) { 8230 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 8231 return -ENOMEM; 8232 } 8233 8234 TAILQ_INIT(&desc->pending_media_events); 8235 TAILQ_INIT(&desc->free_media_events); 8236 8237 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 8238 desc->callback.event_fn = event_cb; 8239 desc->callback.ctx = event_ctx; 8240 spdk_spin_init(&desc->spinlock); 8241 8242 if (bdev->media_events) { 8243 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 8244 sizeof(*desc->media_events_buffer)); 8245 if (desc->media_events_buffer == NULL) { 8246 SPDK_ERRLOG("Failed to initialize media event pool\n"); 8247 
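			/* Undo the partial initialization above before failing. */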
bdev_desc_free(desc); 8248 return -ENOMEM; 8249 } 8250 8251 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 8252 TAILQ_INSERT_TAIL(&desc->free_media_events, 8253 &desc->media_events_buffer[i], tailq); 8254 } 8255 } 8256 8257 if (bdev->fn_table->accel_sequence_supported != NULL) { 8258 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 8259 desc->accel_sequence_supported[i] = 8260 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 8261 (enum spdk_bdev_io_type)i); 8262 } 8263 } 8264 8265 *_desc = desc; 8266 8267 return 0; 8268 } 8269 8270 static int 8271 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8272 void *event_ctx, struct spdk_bdev_desc **_desc) 8273 { 8274 struct spdk_bdev_desc *desc; 8275 struct spdk_bdev *bdev; 8276 int rc; 8277 8278 bdev = bdev_get_by_name(bdev_name); 8279 8280 if (bdev == NULL) { 8281 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 8282 return -ENODEV; 8283 } 8284 8285 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 8286 if (rc != 0) { 8287 return rc; 8288 } 8289 8290 rc = bdev_open(bdev, write, desc); 8291 if (rc != 0) { 8292 bdev_desc_free(desc); 8293 desc = NULL; 8294 } 8295 8296 *_desc = desc; 8297 8298 return rc; 8299 } 8300 8301 int 8302 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8303 void *event_ctx, struct spdk_bdev_desc **_desc) 8304 { 8305 int rc; 8306 8307 if (event_cb == NULL) { 8308 SPDK_ERRLOG("Missing event callback function\n"); 8309 return -EINVAL; 8310 } 8311 8312 spdk_spin_lock(&g_bdev_mgr.spinlock); 8313 rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, _desc); 8314 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8315 8316 return rc; 8317 } 8318 8319 struct spdk_bdev_open_async_ctx { 8320 char *bdev_name; 8321 spdk_bdev_event_cb_t event_cb; 8322 void *event_ctx; 8323 bool write; 8324 int rc; 8325 spdk_bdev_open_async_cb_t cb_fn; 8326 void *cb_arg; 8327 struct spdk_bdev_desc *desc; 8328 struct spdk_bdev_open_async_opts opts; 8329 uint64_t start_ticks; 8330 struct spdk_thread *orig_thread; 8331 struct spdk_poller *poller; 8332 TAILQ_ENTRY(spdk_bdev_open_async_ctx) tailq; 8333 }; 8334 8335 static void 8336 bdev_open_async_done(void *arg) 8337 { 8338 struct spdk_bdev_open_async_ctx *ctx = arg; 8339 8340 ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg); 8341 8342 free(ctx->bdev_name); 8343 free(ctx); 8344 } 8345 8346 static void 8347 bdev_open_async_cancel(void *arg) 8348 { 8349 struct spdk_bdev_open_async_ctx *ctx = arg; 8350 8351 assert(ctx->rc == -ESHUTDOWN); 8352 8353 spdk_poller_unregister(&ctx->poller); 8354 8355 bdev_open_async_done(ctx); 8356 } 8357 8358 /* This is called when the bdev library finishes at shutdown. */ 8359 static void 8360 bdev_open_async_fini(void) 8361 { 8362 struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx; 8363 8364 spdk_spin_lock(&g_bdev_mgr.spinlock); 8365 TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) { 8366 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8367 /* 8368 * We have to move to ctx->orig_thread to unregister ctx->poller. 8369 * However, there is a chance that ctx->poller is executed before 8370 * message is executed, which could result in bdev_open_async_done() 8371 * being called twice. To avoid such race condition, set ctx->rc to 8372 * -ESHUTDOWN. 
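		 * The poller callback backs off when it sees -ESHUTDOWN, leaving
		 * bdev_open_async_cancel() as the only path that completes the context.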
		 */
		ctx->rc = -ESHUTDOWN;
		spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx);
	}
	spdk_spin_unlock(&g_bdev_mgr.spinlock);
}

static int bdev_open_async(void *arg);

static void
_bdev_open_async(struct spdk_bdev_open_async_ctx *ctx)
{
	uint64_t timeout_ticks;

	if (ctx->rc == -ESHUTDOWN) {
		/* This context is being canceled. Do nothing. */
		return;
	}

	ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx,
				&ctx->desc);
	if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) {
		goto exit;
	}

	timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull;
	if (spdk_get_ticks() >= timeout_ticks) {
		SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name);
		ctx->rc = -ETIMEDOUT;
		goto exit;
	}

	return;

exit:
	spdk_poller_unregister(&ctx->poller);
	TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq);

	/* Completion callback is processed after stack unwinding. */
	spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx);
}

static int
bdev_open_async(void *arg)
{
	struct spdk_bdev_open_async_ctx *ctx = arg;

	spdk_spin_lock(&g_bdev_mgr.spinlock);

	_bdev_open_async(ctx);

	spdk_spin_unlock(&g_bdev_mgr.spinlock);

	return SPDK_POLLER_BUSY;
}

static void
bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts,
			  struct spdk_bdev_open_async_opts *opts_src,
			  size_t size)
{
	assert(opts);
	assert(opts_src);

	opts->size = size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \
		opts->field = opts_src->field; \
	} \

	SET_FIELD(timeout_ms);

	/* Do not remove this statement. You should always update this statement when you
	 * add a new field, and do not forget to add the SET_FIELD statement for your
	 * added field.
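	 * For example, if a hypothetical field "foo" were appended to
	 * spdk_bdev_open_async_opts, this function would gain "SET_FIELD(foo);" and
	 * the size checked by the SPDK_STATIC_ASSERT below would be bumped to match.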
*/ 8448 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size"); 8449 8450 #undef SET_FIELD 8451 } 8452 8453 static void 8454 bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size) 8455 { 8456 assert(opts); 8457 8458 opts->size = size; 8459 8460 #define SET_FIELD(field, value) \ 8461 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8462 opts->field = value; \ 8463 } \ 8464 8465 SET_FIELD(timeout_ms, 0); 8466 8467 #undef SET_FIELD 8468 } 8469 8470 int 8471 spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8472 void *event_ctx, struct spdk_bdev_open_async_opts *opts, 8473 spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg) 8474 { 8475 struct spdk_bdev_open_async_ctx *ctx; 8476 8477 if (event_cb == NULL) { 8478 SPDK_ERRLOG("Missing event callback function\n"); 8479 return -EINVAL; 8480 } 8481 8482 if (open_cb == NULL) { 8483 SPDK_ERRLOG("Missing open callback function\n"); 8484 return -EINVAL; 8485 } 8486 8487 if (opts != NULL && opts->size == 0) { 8488 SPDK_ERRLOG("size in the options structure should not be zero\n"); 8489 return -EINVAL; 8490 } 8491 8492 ctx = calloc(1, sizeof(*ctx)); 8493 if (ctx == NULL) { 8494 SPDK_ERRLOG("Failed to allocate open context\n"); 8495 return -ENOMEM; 8496 } 8497 8498 ctx->bdev_name = strdup(bdev_name); 8499 if (ctx->bdev_name == NULL) { 8500 SPDK_ERRLOG("Failed to duplicate bdev_name\n"); 8501 free(ctx); 8502 return -ENOMEM; 8503 } 8504 8505 ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000); 8506 if (ctx->poller == NULL) { 8507 SPDK_ERRLOG("Failed to register bdev_open_async poller\n"); 8508 free(ctx->bdev_name); 8509 free(ctx); 8510 return -ENOMEM; 8511 } 8512 8513 ctx->cb_fn = open_cb; 8514 ctx->cb_arg = open_cb_arg; 8515 ctx->write = write; 8516 ctx->event_cb = event_cb; 8517 ctx->event_ctx = event_ctx; 8518 ctx->orig_thread = spdk_get_thread(); 8519 ctx->start_ticks = spdk_get_ticks(); 8520 8521 bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts)); 8522 if (opts != NULL) { 8523 bdev_open_async_opts_copy(&ctx->opts, opts, opts->size); 8524 } 8525 8526 spdk_spin_lock(&g_bdev_mgr.spinlock); 8527 8528 TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8529 _bdev_open_async(ctx); 8530 8531 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8532 8533 return 0; 8534 } 8535 8536 static void 8537 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 8538 { 8539 int rc; 8540 8541 spdk_spin_lock(&bdev->internal.spinlock); 8542 spdk_spin_lock(&desc->spinlock); 8543 8544 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 8545 8546 desc->closed = true; 8547 8548 if (desc->claim != NULL) { 8549 bdev_desc_release_claims(desc); 8550 } 8551 8552 if (0 == desc->refs) { 8553 spdk_spin_unlock(&desc->spinlock); 8554 bdev_desc_free(desc); 8555 } else { 8556 spdk_spin_unlock(&desc->spinlock); 8557 } 8558 8559 /* If no more descriptors, kill QoS channel */ 8560 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8561 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 8562 bdev->name, spdk_get_thread()); 8563 8564 if (bdev_qos_destroy(bdev)) { 8565 /* There isn't anything we can do to recover here. Just let the 8566 * old QoS poller keep running. The QoS handling won't change 8567 * cores when the user allocates a new channel, but it won't break. */ 8568 SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); 8569 } 8570 } 8571 8572 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8573 rc = bdev_unregister_unsafe(bdev); 8574 spdk_spin_unlock(&bdev->internal.spinlock); 8575 8576 if (rc == 0) { 8577 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8578 } 8579 } else { 8580 spdk_spin_unlock(&bdev->internal.spinlock); 8581 } 8582 } 8583 8584 void 8585 spdk_bdev_close(struct spdk_bdev_desc *desc) 8586 { 8587 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8588 8589 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8590 spdk_get_thread()); 8591 8592 assert(desc->thread == spdk_get_thread()); 8593 8594 spdk_poller_unregister(&desc->io_timeout_poller); 8595 8596 spdk_spin_lock(&g_bdev_mgr.spinlock); 8597 8598 bdev_close(bdev, desc); 8599 8600 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8601 } 8602 8603 int32_t 8604 spdk_bdev_get_numa_id(struct spdk_bdev *bdev) 8605 { 8606 if (bdev->numa.id_valid) { 8607 return bdev->numa.id; 8608 } else { 8609 return SPDK_ENV_NUMA_ID_ANY; 8610 } 8611 } 8612 8613 static void 8614 bdev_register_finished(void *arg) 8615 { 8616 struct spdk_bdev_desc *desc = arg; 8617 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8618 8619 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 8620 8621 spdk_spin_lock(&g_bdev_mgr.spinlock); 8622 8623 bdev_close(bdev, desc); 8624 8625 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8626 } 8627 8628 int 8629 spdk_bdev_register(struct spdk_bdev *bdev) 8630 { 8631 struct spdk_bdev_desc *desc; 8632 struct spdk_thread *thread = spdk_get_thread(); 8633 int rc; 8634 8635 if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) { 8636 SPDK_ERRLOG("Cannot register bdev %s on thread %p (%s)\n", bdev->name, thread, 8637 thread ? 
spdk_thread_get_name(thread) : "null"); 8638 return -EINVAL; 8639 } 8640 8641 rc = bdev_register(bdev); 8642 if (rc != 0) { 8643 return rc; 8644 } 8645 8646 /* A descriptor is opened to prevent bdev deletion during examination */ 8647 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8648 if (rc != 0) { 8649 spdk_bdev_unregister(bdev, NULL, NULL); 8650 return rc; 8651 } 8652 8653 rc = bdev_open(bdev, false, desc); 8654 if (rc != 0) { 8655 bdev_desc_free(desc); 8656 spdk_bdev_unregister(bdev, NULL, NULL); 8657 return rc; 8658 } 8659 8660 /* Examine configuration before initializing I/O */ 8661 bdev_examine(bdev); 8662 8663 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 8664 if (rc != 0) { 8665 bdev_close(bdev, desc); 8666 spdk_bdev_unregister(bdev, NULL, NULL); 8667 } 8668 8669 return rc; 8670 } 8671 8672 int 8673 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 8674 struct spdk_bdev_module *module) 8675 { 8676 spdk_spin_lock(&bdev->internal.spinlock); 8677 8678 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8679 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8680 spdk_spin_unlock(&bdev->internal.spinlock); 8681 return -EPERM; 8682 } 8683 8684 if (desc && !desc->write) { 8685 desc->write = true; 8686 } 8687 8688 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 8689 bdev->internal.claim.v1.module = module; 8690 8691 spdk_spin_unlock(&bdev->internal.spinlock); 8692 return 0; 8693 } 8694 8695 void 8696 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 8697 { 8698 spdk_spin_lock(&bdev->internal.spinlock); 8699 8700 assert(bdev->internal.claim.v1.module != NULL); 8701 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 8702 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8703 bdev->internal.claim.v1.module = NULL; 8704 8705 spdk_spin_unlock(&bdev->internal.spinlock); 8706 } 8707 8708 /* 8709 * Start claims v2 8710 */ 8711 8712 const char * 8713 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 8714 { 8715 switch (type) { 8716 case SPDK_BDEV_CLAIM_NONE: 8717 return "not_claimed"; 8718 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8719 return "exclusive_write"; 8720 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8721 return "read_many_write_one"; 8722 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8723 return "read_many_write_none"; 8724 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8725 return "read_many_write_many"; 8726 default: 8727 break; 8728 } 8729 return "invalid_claim"; 8730 } 8731 8732 static bool 8733 claim_type_is_v2(enum spdk_bdev_claim_type type) 8734 { 8735 switch (type) { 8736 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8737 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8738 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8739 return true; 8740 default: 8741 break; 8742 } 8743 return false; 8744 } 8745 8746 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
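 * That is the case for claim types that grant their holder write access
 * (READ_MANY_WRITE_ONE and READ_MANY_WRITE_SHARED); READ_MANY_WRITE_NONE
 * leaves the descriptor read-only.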
*/ 8747 static bool 8748 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 8749 { 8750 switch (type) { 8751 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8752 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8753 return true; 8754 default: 8755 break; 8756 } 8757 return false; 8758 } 8759 8760 void 8761 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 8762 { 8763 if (opts == NULL) { 8764 SPDK_ERRLOG("opts should not be NULL\n"); 8765 assert(opts != NULL); 8766 return; 8767 } 8768 if (size == 0) { 8769 SPDK_ERRLOG("size should not be zero\n"); 8770 assert(size != 0); 8771 return; 8772 } 8773 8774 memset(opts, 0, size); 8775 opts->opts_size = size; 8776 8777 #define FIELD_OK(field) \ 8778 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 8779 8780 #define SET_FIELD(field, value) \ 8781 if (FIELD_OK(field)) { \ 8782 opts->field = value; \ 8783 } \ 8784 8785 SET_FIELD(shared_claim_key, 0); 8786 8787 #undef FIELD_OK 8788 #undef SET_FIELD 8789 } 8790 8791 static int 8792 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 8793 { 8794 if (src->opts_size == 0) { 8795 SPDK_ERRLOG("size should not be zero\n"); 8796 return -1; 8797 } 8798 8799 memset(dst, 0, sizeof(*dst)); 8800 dst->opts_size = src->opts_size; 8801 8802 #define FIELD_OK(field) \ 8803 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 8804 8805 #define SET_FIELD(field) \ 8806 if (FIELD_OK(field)) { \ 8807 dst->field = src->field; \ 8808 } \ 8809 8810 if (FIELD_OK(name)) { 8811 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 8812 } 8813 8814 SET_FIELD(shared_claim_key); 8815 8816 /* You should not remove this statement, but need to update the assert statement 8817 * if you add a new field, and also add a corresponding SET_FIELD statement */ 8818 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 8819 8820 #undef FIELD_OK 8821 #undef SET_FIELD 8822 return 0; 8823 } 8824 8825 /* Returns 0 if a read-write-once claim can be taken. */ 8826 static int 8827 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8828 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8829 { 8830 struct spdk_bdev *bdev = desc->bdev; 8831 struct spdk_bdev_desc *open_desc; 8832 8833 assert(spdk_spin_held(&bdev->internal.spinlock)); 8834 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 8835 8836 if (opts->shared_claim_key != 0) { 8837 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 8838 bdev->name); 8839 return -EINVAL; 8840 } 8841 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8842 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8843 return -EPERM; 8844 } 8845 if (desc->claim != NULL) { 8846 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 8847 bdev->name, desc->claim->module->name); 8848 return -EPERM; 8849 } 8850 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8851 if (desc != open_desc && open_desc->write) { 8852 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 8853 "another descriptor is open for writing\n", 8854 bdev->name); 8855 return -EPERM; 8856 } 8857 } 8858 8859 return 0; 8860 } 8861 8862 /* Returns 0 if a read-only-many claim can be taken. 
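 * The descriptor must be read-only, no shared_claim_key may be given and, if
 * the bdev is not claimed yet, no other descriptor may currently be open for
 * writing.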
 */
static int
claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_desc *open_desc;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE);
	assert(desc->claim == NULL);

	if (desc->write) {
		SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n",
			    bdev->name);
		return -EINVAL;
	}
	if (opts->shared_claim_key != 0) {
		SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name);
		return -EINVAL;
	}
	if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
			if (open_desc->write) {
				SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while "
					       "another descriptor is open for writing\n",
					       bdev->name);
				return -EPERM;
			}
		}
	}

	return 0;
}

/* Returns 0 if a read-write-many claim can be taken. */
static int
claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_desc *open_desc;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED);
	assert(desc->claim == NULL);

	if (opts->shared_claim_key == 0) {
		SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n",
			    bdev->name);
		return -EINVAL;
	}
	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
			if (open_desc == desc) {
				continue;
			}
			if (open_desc->write) {
				SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while "
					       "another descriptor is open for writing without a "
					       "claim\n", bdev->name);
				return -EPERM;
			}
		}
		break;
	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
		if (opts->shared_claim_key != bdev->internal.claim.v2.key) {
			LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev);
			return -EPERM;
		}
		break;
	default:
		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
		return -EBUSY;
	}

	return 0;
}

/* Updates desc and its bdev with a v2 claim.
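 *
 * Modules reach this through spdk_bdev_module_claim_bdev_desc(), e.g.
 * (illustrative sketch; "my_claim" and my_module are placeholders and error
 * handling is omitted):
 *
 *     struct spdk_bdev_claim_opts opts;
 *
 *     spdk_bdev_claim_opts_init(&opts, sizeof(opts));
 *     snprintf(opts.name, sizeof(opts.name), "my_claim");
 *     rc = spdk_bdev_module_claim_bdev_desc(desc, SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE,
 *                                           &opts, &my_module);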
*/ 8943 static int 8944 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8945 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8946 { 8947 struct spdk_bdev *bdev = desc->bdev; 8948 struct spdk_bdev_module_claim *claim; 8949 8950 assert(spdk_spin_held(&bdev->internal.spinlock)); 8951 assert(claim_type_is_v2(type)); 8952 assert(desc->claim == NULL); 8953 8954 claim = calloc(1, sizeof(*desc->claim)); 8955 if (claim == NULL) { 8956 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 8957 return -ENOMEM; 8958 } 8959 claim->module = module; 8960 claim->desc = desc; 8961 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 8962 memcpy(claim->name, opts->name, sizeof(claim->name)); 8963 desc->claim = claim; 8964 8965 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8966 bdev->internal.claim_type = type; 8967 TAILQ_INIT(&bdev->internal.claim.v2.claims); 8968 bdev->internal.claim.v2.key = opts->shared_claim_key; 8969 } 8970 assert(type == bdev->internal.claim_type); 8971 8972 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 8973 8974 if (!desc->write && claim_type_promotes_to_write(type)) { 8975 desc->write = true; 8976 } 8977 8978 return 0; 8979 } 8980 8981 int 8982 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8983 struct spdk_bdev_claim_opts *_opts, 8984 struct spdk_bdev_module *module) 8985 { 8986 struct spdk_bdev *bdev; 8987 struct spdk_bdev_claim_opts opts; 8988 int rc = 0; 8989 8990 if (desc == NULL) { 8991 SPDK_ERRLOG("descriptor must not be NULL\n"); 8992 return -EINVAL; 8993 } 8994 8995 bdev = desc->bdev; 8996 8997 if (_opts == NULL) { 8998 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 8999 } else if (claim_opts_copy(_opts, &opts) != 0) { 9000 return -EINVAL; 9001 } 9002 9003 spdk_spin_lock(&bdev->internal.spinlock); 9004 9005 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 9006 bdev->internal.claim_type != type) { 9007 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 9008 spdk_spin_unlock(&bdev->internal.spinlock); 9009 return -EPERM; 9010 } 9011 9012 if (claim_type_is_v2(type) && desc->claim != NULL) { 9013 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 9014 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 9015 spdk_spin_unlock(&bdev->internal.spinlock); 9016 return -EPERM; 9017 } 9018 9019 switch (type) { 9020 case SPDK_BDEV_CLAIM_EXCL_WRITE: 9021 spdk_spin_unlock(&bdev->internal.spinlock); 9022 return spdk_bdev_module_claim_bdev(bdev, desc, module); 9023 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 9024 rc = claim_verify_rwo(desc, type, &opts, module); 9025 break; 9026 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 9027 rc = claim_verify_rom(desc, type, &opts, module); 9028 break; 9029 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 9030 rc = claim_verify_rwm(desc, type, &opts, module); 9031 break; 9032 default: 9033 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 9034 rc = -ENOTSUP; 9035 } 9036 9037 if (rc == 0) { 9038 rc = claim_bdev(desc, type, &opts, module); 9039 } 9040 9041 spdk_spin_unlock(&bdev->internal.spinlock); 9042 return rc; 9043 } 9044 9045 static void 9046 claim_reset(struct spdk_bdev *bdev) 9047 { 9048 assert(spdk_spin_held(&bdev->internal.spinlock)); 9049 assert(claim_type_is_v2(bdev->internal.claim_type)); 9050 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 9051 9052 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 9053 
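	/* Every v2 claim has been released, so return the bdev to the unclaimed state. */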
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 9054 } 9055 9056 static void 9057 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 9058 { 9059 struct spdk_bdev *bdev = desc->bdev; 9060 9061 assert(spdk_spin_held(&bdev->internal.spinlock)); 9062 assert(claim_type_is_v2(bdev->internal.claim_type)); 9063 9064 if (bdev->internal.examine_in_progress == 0) { 9065 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 9066 free(desc->claim); 9067 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 9068 claim_reset(bdev); 9069 } 9070 } else { 9071 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 9072 desc->claim->module = NULL; 9073 desc->claim->desc = NULL; 9074 } 9075 desc->claim = NULL; 9076 } 9077 9078 /* 9079 * End claims v2 9080 */ 9081 9082 struct spdk_bdev * 9083 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 9084 { 9085 assert(desc != NULL); 9086 return desc->bdev; 9087 } 9088 9089 int 9090 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 9091 { 9092 struct spdk_bdev *bdev, *tmp; 9093 struct spdk_bdev_desc *desc; 9094 int rc = 0; 9095 9096 assert(fn != NULL); 9097 9098 spdk_spin_lock(&g_bdev_mgr.spinlock); 9099 bdev = spdk_bdev_first(); 9100 while (bdev != NULL) { 9101 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 9102 if (rc != 0) { 9103 break; 9104 } 9105 rc = bdev_open(bdev, false, desc); 9106 if (rc != 0) { 9107 bdev_desc_free(desc); 9108 if (rc == -ENODEV) { 9109 /* Ignore the error and move to the next bdev. */ 9110 rc = 0; 9111 bdev = spdk_bdev_next(bdev); 9112 continue; 9113 } 9114 break; 9115 } 9116 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9117 9118 rc = fn(ctx, bdev); 9119 9120 spdk_spin_lock(&g_bdev_mgr.spinlock); 9121 tmp = spdk_bdev_next(bdev); 9122 bdev_close(bdev, desc); 9123 if (rc != 0) { 9124 break; 9125 } 9126 bdev = tmp; 9127 } 9128 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9129 9130 return rc; 9131 } 9132 9133 int 9134 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 9135 { 9136 struct spdk_bdev *bdev, *tmp; 9137 struct spdk_bdev_desc *desc; 9138 int rc = 0; 9139 9140 assert(fn != NULL); 9141 9142 spdk_spin_lock(&g_bdev_mgr.spinlock); 9143 bdev = spdk_bdev_first_leaf(); 9144 while (bdev != NULL) { 9145 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 9146 if (rc != 0) { 9147 break; 9148 } 9149 rc = bdev_open(bdev, false, desc); 9150 if (rc != 0) { 9151 bdev_desc_free(desc); 9152 if (rc == -ENODEV) { 9153 /* Ignore the error and move to the next bdev. 
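				 * -ENODEV here just means the bdev is being
				 * unregistered while we iterate.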
*/ 9154 rc = 0; 9155 bdev = spdk_bdev_next_leaf(bdev); 9156 continue; 9157 } 9158 break; 9159 } 9160 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9161 9162 rc = fn(ctx, bdev); 9163 9164 spdk_spin_lock(&g_bdev_mgr.spinlock); 9165 tmp = spdk_bdev_next_leaf(bdev); 9166 bdev_close(bdev, desc); 9167 if (rc != 0) { 9168 break; 9169 } 9170 bdev = tmp; 9171 } 9172 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9173 9174 return rc; 9175 } 9176 9177 void 9178 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 9179 { 9180 struct iovec *iovs; 9181 int iovcnt; 9182 9183 if (bdev_io == NULL) { 9184 return; 9185 } 9186 9187 switch (bdev_io->type) { 9188 case SPDK_BDEV_IO_TYPE_READ: 9189 case SPDK_BDEV_IO_TYPE_WRITE: 9190 case SPDK_BDEV_IO_TYPE_ZCOPY: 9191 iovs = bdev_io->u.bdev.iovs; 9192 iovcnt = bdev_io->u.bdev.iovcnt; 9193 break; 9194 default: 9195 iovs = NULL; 9196 iovcnt = 0; 9197 break; 9198 } 9199 9200 if (iovp) { 9201 *iovp = iovs; 9202 } 9203 if (iovcntp) { 9204 *iovcntp = iovcnt; 9205 } 9206 } 9207 9208 void * 9209 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 9210 { 9211 if (bdev_io == NULL) { 9212 return NULL; 9213 } 9214 9215 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 9216 return NULL; 9217 } 9218 9219 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 9220 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 9221 return bdev_io->u.bdev.md_buf; 9222 } 9223 9224 return NULL; 9225 } 9226 9227 void * 9228 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 9229 { 9230 if (bdev_io == NULL) { 9231 assert(false); 9232 return NULL; 9233 } 9234 9235 return bdev_io->internal.caller_ctx; 9236 } 9237 9238 void 9239 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 9240 { 9241 9242 if (spdk_bdev_module_list_find(bdev_module->name)) { 9243 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 9244 assert(false); 9245 } 9246 9247 spdk_spin_init(&bdev_module->internal.spinlock); 9248 TAILQ_INIT(&bdev_module->internal.quiesced_ranges); 9249 9250 /* 9251 * Modules with examine callbacks must be initialized first, so they are 9252 * ready to handle examine callbacks from later modules that will 9253 * register physical bdevs. 
9254 */ 9255 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 9256 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9257 } else { 9258 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9259 } 9260 } 9261 9262 struct spdk_bdev_module * 9263 spdk_bdev_module_list_find(const char *name) 9264 { 9265 struct spdk_bdev_module *bdev_module; 9266 9267 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 9268 if (strcmp(name, bdev_module->name) == 0) { 9269 break; 9270 } 9271 } 9272 9273 return bdev_module; 9274 } 9275 9276 static int 9277 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 9278 { 9279 uint64_t num_blocks; 9280 void *md_buf = NULL; 9281 9282 num_blocks = bdev_io->u.bdev.num_blocks; 9283 9284 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 9285 md_buf = (char *)g_bdev_mgr.zero_buffer + 9286 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 9287 } 9288 9289 return bdev_write_blocks_with_md(bdev_io->internal.desc, 9290 spdk_io_channel_from_ctx(bdev_io->internal.ch), 9291 g_bdev_mgr.zero_buffer, md_buf, 9292 bdev_io->u.bdev.offset_blocks, num_blocks, 9293 bdev_write_zero_buffer_done, bdev_io); 9294 } 9295 9296 static void 9297 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9298 { 9299 struct spdk_bdev_io *parent_io = cb_arg; 9300 9301 spdk_bdev_free_io(bdev_io); 9302 9303 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 9304 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 9305 } 9306 9307 static void 9308 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 9309 { 9310 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9311 ctx->bdev->internal.qos_mod_in_progress = false; 9312 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9313 9314 if (ctx->cb_fn) { 9315 ctx->cb_fn(ctx->cb_arg, status); 9316 } 9317 free(ctx); 9318 } 9319 9320 static void 9321 bdev_disable_qos_done(void *cb_arg) 9322 { 9323 struct set_qos_limit_ctx *ctx = cb_arg; 9324 struct spdk_bdev *bdev = ctx->bdev; 9325 struct spdk_bdev_qos *qos; 9326 9327 spdk_spin_lock(&bdev->internal.spinlock); 9328 qos = bdev->internal.qos; 9329 bdev->internal.qos = NULL; 9330 spdk_spin_unlock(&bdev->internal.spinlock); 9331 9332 if (qos->thread != NULL) { 9333 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 9334 spdk_poller_unregister(&qos->poller); 9335 } 9336 9337 free(qos); 9338 9339 bdev_set_qos_limit_done(ctx, 0); 9340 } 9341 9342 static void 9343 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 9344 { 9345 struct set_qos_limit_ctx *ctx = _ctx; 9346 struct spdk_thread *thread; 9347 9348 spdk_spin_lock(&bdev->internal.spinlock); 9349 thread = bdev->internal.qos->thread; 9350 spdk_spin_unlock(&bdev->internal.spinlock); 9351 9352 if (thread != NULL) { 9353 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 9354 } else { 9355 bdev_disable_qos_done(ctx); 9356 } 9357 } 9358 9359 static void 9360 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9361 struct spdk_io_channel *ch, void *_ctx) 9362 { 9363 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9364 struct spdk_bdev_io *bdev_io; 9365 9366 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 9367 9368 while (!TAILQ_EMPTY(&bdev_ch->qos_queued_io)) { 9369 /* Re-submit the queued I/O. 
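 * BDEV_CH_QOS_ENABLED was cleared above, so _bdev_io_submit() now sends
 * these I/O directly to the underlying channel instead of queueing them
 * behind QoS again.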
*/ 9370 bdev_io = TAILQ_FIRST(&bdev_ch->qos_queued_io); 9371 TAILQ_REMOVE(&bdev_ch->qos_queued_io, bdev_io, internal.link); 9372 _bdev_io_submit(bdev_io); 9373 } 9374 9375 spdk_bdev_for_each_channel_continue(i, 0); 9376 } 9377 9378 static void 9379 bdev_update_qos_rate_limit_msg(void *cb_arg) 9380 { 9381 struct set_qos_limit_ctx *ctx = cb_arg; 9382 struct spdk_bdev *bdev = ctx->bdev; 9383 9384 spdk_spin_lock(&bdev->internal.spinlock); 9385 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 9386 spdk_spin_unlock(&bdev->internal.spinlock); 9387 9388 bdev_set_qos_limit_done(ctx, 0); 9389 } 9390 9391 static void 9392 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9393 struct spdk_io_channel *ch, void *_ctx) 9394 { 9395 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9396 9397 spdk_spin_lock(&bdev->internal.spinlock); 9398 bdev_enable_qos(bdev, bdev_ch); 9399 spdk_spin_unlock(&bdev->internal.spinlock); 9400 spdk_bdev_for_each_channel_continue(i, 0); 9401 } 9402 9403 static void 9404 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 9405 { 9406 struct set_qos_limit_ctx *ctx = _ctx; 9407 9408 bdev_set_qos_limit_done(ctx, status); 9409 } 9410 9411 static void 9412 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 9413 { 9414 int i; 9415 9416 assert(bdev->internal.qos != NULL); 9417 9418 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9419 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9420 bdev->internal.qos->rate_limits[i].limit = limits[i]; 9421 9422 if (limits[i] == 0) { 9423 bdev->internal.qos->rate_limits[i].limit = 9424 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 9425 } 9426 } 9427 } 9428 } 9429 9430 void 9431 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 9432 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 9433 { 9434 struct set_qos_limit_ctx *ctx; 9435 uint32_t limit_set_complement; 9436 uint64_t min_limit_per_sec; 9437 int i; 9438 bool disable_rate_limit = true; 9439 9440 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9441 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9442 continue; 9443 } 9444 9445 if (limits[i] > 0) { 9446 disable_rate_limit = false; 9447 } 9448 9449 if (bdev_qos_is_iops_rate_limit(i) == true) { 9450 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 9451 } else { 9452 if (limits[i] > SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC) { 9453 SPDK_WARNLOG("Requested rate limit %" PRIu64 " will result in uint64_t overflow, " 9454 "reset to %" PRIu64 "\n", limits[i], SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC); 9455 limits[i] = SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC; 9456 } 9457 /* Change from megabyte to byte rate limit */ 9458 limits[i] = limits[i] * 1024 * 1024; 9459 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 9460 } 9461 9462 limit_set_complement = limits[i] % min_limit_per_sec; 9463 if (limit_set_complement) { 9464 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 9465 limits[i], min_limit_per_sec); 9466 limits[i] += min_limit_per_sec - limit_set_complement; 9467 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 9468 } 9469 } 9470 9471 ctx = calloc(1, sizeof(*ctx)); 9472 if (ctx == NULL) { 9473 cb_fn(cb_arg, -ENOMEM); 9474 return; 9475 } 9476 9477 ctx->cb_fn = cb_fn; 9478 ctx->cb_arg = cb_arg; 9479 ctx->bdev = bdev; 9480 9481 spdk_spin_lock(&bdev->internal.spinlock); 9482 if (bdev->internal.qos_mod_in_progress) { 9483 spdk_spin_unlock(&bdev->internal.spinlock); 9484 free(ctx); 9485 cb_fn(cb_arg, 
-EAGAIN); 9486 return; 9487 } 9488 bdev->internal.qos_mod_in_progress = true; 9489 9490 if (disable_rate_limit == true && bdev->internal.qos) { 9491 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9492 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 9493 (bdev->internal.qos->rate_limits[i].limit > 0 && 9494 bdev->internal.qos->rate_limits[i].limit != 9495 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 9496 disable_rate_limit = false; 9497 break; 9498 } 9499 } 9500 } 9501 9502 if (disable_rate_limit == false) { 9503 if (bdev->internal.qos == NULL) { 9504 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 9505 if (!bdev->internal.qos) { 9506 spdk_spin_unlock(&bdev->internal.spinlock); 9507 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 9508 bdev_set_qos_limit_done(ctx, -ENOMEM); 9509 return; 9510 } 9511 } 9512 9513 if (bdev->internal.qos->thread == NULL) { 9514 /* Enabling */ 9515 bdev_set_qos_rate_limits(bdev, limits); 9516 9517 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 9518 bdev_enable_qos_done); 9519 } else { 9520 /* Updating */ 9521 bdev_set_qos_rate_limits(bdev, limits); 9522 9523 spdk_thread_send_msg(bdev->internal.qos->thread, 9524 bdev_update_qos_rate_limit_msg, ctx); 9525 } 9526 } else { 9527 if (bdev->internal.qos != NULL) { 9528 bdev_set_qos_rate_limits(bdev, limits); 9529 9530 /* Disabling */ 9531 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 9532 bdev_disable_qos_msg_done); 9533 } else { 9534 spdk_spin_unlock(&bdev->internal.spinlock); 9535 bdev_set_qos_limit_done(ctx, 0); 9536 return; 9537 } 9538 } 9539 9540 spdk_spin_unlock(&bdev->internal.spinlock); 9541 } 9542 9543 struct spdk_bdev_histogram_ctx { 9544 spdk_bdev_histogram_status_cb cb_fn; 9545 void *cb_arg; 9546 struct spdk_bdev *bdev; 9547 int status; 9548 }; 9549 9550 static void 9551 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9552 { 9553 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9554 9555 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9556 ctx->bdev->internal.histogram_in_progress = false; 9557 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9558 ctx->cb_fn(ctx->cb_arg, ctx->status); 9559 free(ctx); 9560 } 9561 9562 static void 9563 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9564 struct spdk_io_channel *_ch, void *_ctx) 9565 { 9566 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9567 9568 if (ch->histogram != NULL) { 9569 spdk_histogram_data_free(ch->histogram); 9570 ch->histogram = NULL; 9571 } 9572 spdk_bdev_for_each_channel_continue(i, 0); 9573 } 9574 9575 static void 9576 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9577 { 9578 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9579 9580 if (status != 0) { 9581 ctx->status = status; 9582 ctx->bdev->internal.histogram_enabled = false; 9583 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 9584 bdev_histogram_disable_channel_cb); 9585 } else { 9586 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9587 ctx->bdev->internal.histogram_in_progress = false; 9588 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9589 ctx->cb_fn(ctx->cb_arg, ctx->status); 9590 free(ctx); 9591 } 9592 } 9593 9594 static void 9595 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9596 struct spdk_io_channel *_ch, void *_ctx) 9597 { 9598 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9599 int status = 0; 9600 9601 if (ch->histogram == NULL) { 9602 
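		/* Allocate the per-channel histogram on first enable; a failure
		 * is reported back through the channel iterator as -ENOMEM.
		 */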
ch->histogram = spdk_histogram_data_alloc(); 9603 if (ch->histogram == NULL) { 9604 status = -ENOMEM; 9605 } 9606 } 9607 9608 spdk_bdev_for_each_channel_continue(i, status); 9609 } 9610 9611 void 9612 spdk_bdev_histogram_enable_ext(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9613 void *cb_arg, bool enable, struct spdk_bdev_enable_histogram_opts *opts) 9614 { 9615 struct spdk_bdev_histogram_ctx *ctx; 9616 9617 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 9618 if (ctx == NULL) { 9619 cb_fn(cb_arg, -ENOMEM); 9620 return; 9621 } 9622 9623 ctx->bdev = bdev; 9624 ctx->status = 0; 9625 ctx->cb_fn = cb_fn; 9626 ctx->cb_arg = cb_arg; 9627 9628 spdk_spin_lock(&bdev->internal.spinlock); 9629 if (bdev->internal.histogram_in_progress) { 9630 spdk_spin_unlock(&bdev->internal.spinlock); 9631 free(ctx); 9632 cb_fn(cb_arg, -EAGAIN); 9633 return; 9634 } 9635 9636 bdev->internal.histogram_in_progress = true; 9637 spdk_spin_unlock(&bdev->internal.spinlock); 9638 9639 bdev->internal.histogram_enabled = enable; 9640 bdev->internal.histogram_io_type = opts->io_type; 9641 9642 if (enable) { 9643 /* Allocate histogram for each channel */ 9644 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 9645 bdev_histogram_enable_channel_cb); 9646 } else { 9647 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 9648 bdev_histogram_disable_channel_cb); 9649 } 9650 } 9651 9652 void 9653 spdk_bdev_enable_histogram_opts_init(struct spdk_bdev_enable_histogram_opts *opts, size_t size) 9654 { 9655 if (opts == NULL) { 9656 SPDK_ERRLOG("opts should not be NULL\n"); 9657 assert(opts != NULL); 9658 return; 9659 } 9660 if (size == 0) { 9661 SPDK_ERRLOG("size should not be zero\n"); 9662 assert(size != 0); 9663 return; 9664 } 9665 9666 memset(opts, 0, size); 9667 opts->size = size; 9668 9669 #define FIELD_OK(field) \ 9670 offsetof(struct spdk_bdev_enable_histogram_opts, field) + sizeof(opts->field) <= size 9671 9672 #define SET_FIELD(field, value) \ 9673 if (FIELD_OK(field)) { \ 9674 opts->field = value; \ 9675 } \ 9676 9677 SET_FIELD(io_type, 0); 9678 9679 /* You should not remove this statement, but need to update the assert statement 9680 * if you add a new field, and also add a corresponding SET_FIELD statement */ 9681 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_enable_histogram_opts) == 9, "Incorrect size"); 9682 9683 #undef FIELD_OK 9684 #undef SET_FIELD 9685 } 9686 9687 void 9688 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9689 void *cb_arg, bool enable) 9690 { 9691 struct spdk_bdev_enable_histogram_opts opts; 9692 9693 spdk_bdev_enable_histogram_opts_init(&opts, sizeof(opts)); 9694 spdk_bdev_histogram_enable_ext(bdev, cb_fn, cb_arg, enable, &opts); 9695 } 9696 9697 struct spdk_bdev_histogram_data_ctx { 9698 spdk_bdev_histogram_data_cb cb_fn; 9699 void *cb_arg; 9700 struct spdk_bdev *bdev; 9701 /** merged histogram data from all channels */ 9702 struct spdk_histogram_data *histogram; 9703 }; 9704 9705 static void 9706 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9707 { 9708 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9709 9710 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 9711 free(ctx); 9712 } 9713 9714 static void 9715 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9716 struct spdk_io_channel *_ch, void *_ctx) 9717 { 9718 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9719 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9720 int 
status = 0; 9721 9722 if (ch->histogram == NULL) { 9723 status = -EFAULT; 9724 } else { 9725 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 9726 } 9727 9728 spdk_bdev_for_each_channel_continue(i, status); 9729 } 9730 9731 void 9732 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 9733 spdk_bdev_histogram_data_cb cb_fn, 9734 void *cb_arg) 9735 { 9736 struct spdk_bdev_histogram_data_ctx *ctx; 9737 9738 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 9739 if (ctx == NULL) { 9740 cb_fn(cb_arg, -ENOMEM, NULL); 9741 return; 9742 } 9743 9744 ctx->bdev = bdev; 9745 ctx->cb_fn = cb_fn; 9746 ctx->cb_arg = cb_arg; 9747 9748 ctx->histogram = histogram; 9749 9750 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 9751 bdev_histogram_get_channel_cb); 9752 } 9753 9754 void 9755 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 9756 void *cb_arg) 9757 { 9758 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9759 int status = 0; 9760 9761 assert(cb_fn != NULL); 9762 9763 if (bdev_ch->histogram == NULL) { 9764 status = -EFAULT; 9765 } 9766 cb_fn(cb_arg, status, bdev_ch->histogram); 9767 } 9768 9769 size_t 9770 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 9771 size_t max_events) 9772 { 9773 struct media_event_entry *entry; 9774 size_t num_events = 0; 9775 9776 for (; num_events < max_events; ++num_events) { 9777 entry = TAILQ_FIRST(&desc->pending_media_events); 9778 if (entry == NULL) { 9779 break; 9780 } 9781 9782 events[num_events] = entry->event; 9783 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 9784 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 9785 } 9786 9787 return num_events; 9788 } 9789 9790 int 9791 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 9792 size_t num_events) 9793 { 9794 struct spdk_bdev_desc *desc; 9795 struct media_event_entry *entry; 9796 size_t event_id; 9797 int rc = 0; 9798 9799 assert(bdev->media_events); 9800 9801 spdk_spin_lock(&bdev->internal.spinlock); 9802 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9803 if (desc->write) { 9804 break; 9805 } 9806 } 9807 9808 if (desc == NULL || desc->media_events_buffer == NULL) { 9809 rc = -ENODEV; 9810 goto out; 9811 } 9812 9813 for (event_id = 0; event_id < num_events; ++event_id) { 9814 entry = TAILQ_FIRST(&desc->free_media_events); 9815 if (entry == NULL) { 9816 break; 9817 } 9818 9819 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 9820 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 9821 entry->event = events[event_id]; 9822 } 9823 9824 rc = event_id; 9825 out: 9826 spdk_spin_unlock(&bdev->internal.spinlock); 9827 return rc; 9828 } 9829 9830 static void 9831 _media_management_notify(void *arg) 9832 { 9833 struct spdk_bdev_desc *desc = arg; 9834 9835 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 9836 } 9837 9838 void 9839 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 9840 { 9841 struct spdk_bdev_desc *desc; 9842 9843 spdk_spin_lock(&bdev->internal.spinlock); 9844 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9845 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 9846 event_notify(desc, _media_management_notify); 9847 } 9848 } 9849 spdk_spin_unlock(&bdev->internal.spinlock); 9850 } 9851 9852 struct locked_lba_range_ctx { 9853 struct lba_range range; 9854 struct lba_range *current_range; 9855 struct lba_range *owner_range; 9856 
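	/* Poller used while acquiring the lock to wait for outstanding I/O
	 * that overlaps the range (see bdev_lock_lba_range_check_io()).
	 */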
struct spdk_poller *poller; 9857 lock_range_cb cb_fn; 9858 void *cb_arg; 9859 }; 9860 9861 static void 9862 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9863 { 9864 struct locked_lba_range_ctx *ctx = _ctx; 9865 9866 ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM); 9867 free(ctx); 9868 } 9869 9870 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 9871 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 9872 9873 static void 9874 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9875 { 9876 struct locked_lba_range_ctx *ctx = _ctx; 9877 9878 if (status == -ENOMEM) { 9879 /* One of the channels could not allocate a range object. 9880 * So we have to go back and clean up any ranges that were 9881 * allocated successfully before we return error status to 9882 * the caller. We can reuse the unlock function to do that 9883 * clean up. 9884 */ 9885 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9886 bdev_lock_error_cleanup_cb); 9887 return; 9888 } 9889 9890 /* All channels have locked this range and no I/O overlapping the range 9891 * are outstanding! Set the owner_ch for the range object for the 9892 * locking channel, so that this channel will know that it is allowed 9893 * to write to this range. 9894 */ 9895 if (ctx->owner_range != NULL) { 9896 ctx->owner_range->owner_ch = ctx->range.owner_ch; 9897 } 9898 9899 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 9900 9901 /* Don't free the ctx here. Its range is in the bdev's global list of 9902 * locked ranges still, and will be removed and freed when this range 9903 * is later unlocked. 9904 */ 9905 } 9906 9907 static int 9908 bdev_lock_lba_range_check_io(void *_i) 9909 { 9910 struct spdk_bdev_channel_iter *i = _i; 9911 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 9912 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9913 struct locked_lba_range_ctx *ctx = i->ctx; 9914 struct lba_range *range = ctx->current_range; 9915 struct spdk_bdev_io *bdev_io; 9916 9917 spdk_poller_unregister(&ctx->poller); 9918 9919 /* The range is now in the locked_ranges, so no new IO can be submitted to this 9920 * range. But we need to wait until any outstanding IO overlapping with this range 9921 * are completed. 9922 */ 9923 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 9924 if (bdev_io_range_is_locked(bdev_io, range)) { 9925 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 9926 return SPDK_POLLER_BUSY; 9927 } 9928 } 9929 9930 spdk_bdev_for_each_channel_continue(i, 0); 9931 return SPDK_POLLER_BUSY; 9932 } 9933 9934 static void 9935 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9936 struct spdk_io_channel *_ch, void *_ctx) 9937 { 9938 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9939 struct locked_lba_range_ctx *ctx = _ctx; 9940 struct lba_range *range; 9941 9942 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9943 if (range->length == ctx->range.length && 9944 range->offset == ctx->range.offset && 9945 range->locked_ctx == ctx->range.locked_ctx) { 9946 /* This range already exists on this channel, so don't add 9947 * it again. This can happen when a new channel is created 9948 * while the for_each_channel operation is in progress. 9949 * Do not check for outstanding I/O in that case, since the 9950 * range was locked before any I/O could be submitted to the 9951 * new channel. 
9952 */ 9953 spdk_bdev_for_each_channel_continue(i, 0); 9954 return; 9955 } 9956 } 9957 9958 range = calloc(1, sizeof(*range)); 9959 if (range == NULL) { 9960 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 9961 return; 9962 } 9963 9964 range->length = ctx->range.length; 9965 range->offset = ctx->range.offset; 9966 range->locked_ctx = ctx->range.locked_ctx; 9967 range->quiesce = ctx->range.quiesce; 9968 ctx->current_range = range; 9969 if (ctx->range.owner_ch == ch) { 9970 /* This is the range object for the channel that will hold 9971 * the lock. Store it in the ctx object so that we can easily 9972 * set its owner_ch after the lock is finally acquired. 9973 */ 9974 ctx->owner_range = range; 9975 } 9976 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 9977 bdev_lock_lba_range_check_io(i); 9978 } 9979 9980 static void 9981 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 9982 { 9983 assert(spdk_get_thread() == ctx->range.owner_thread); 9984 assert(ctx->range.owner_ch == NULL || 9985 spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread); 9986 9987 /* We will add a copy of this range to each channel now. */ 9988 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 9989 bdev_lock_lba_range_cb); 9990 } 9991 9992 static bool 9993 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 9994 { 9995 struct lba_range *r; 9996 9997 TAILQ_FOREACH(r, tailq, tailq) { 9998 if (bdev_lba_range_overlapped(range, r)) { 9999 return true; 10000 } 10001 } 10002 return false; 10003 } 10004 10005 static void bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status); 10006 10007 static int 10008 _bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch, 10009 uint64_t offset, uint64_t length, 10010 lock_range_cb cb_fn, void *cb_arg) 10011 { 10012 struct locked_lba_range_ctx *ctx; 10013 10014 ctx = calloc(1, sizeof(*ctx)); 10015 if (ctx == NULL) { 10016 return -ENOMEM; 10017 } 10018 10019 ctx->range.offset = offset; 10020 ctx->range.length = length; 10021 ctx->range.owner_thread = spdk_get_thread(); 10022 ctx->range.owner_ch = ch; 10023 ctx->range.locked_ctx = cb_arg; 10024 ctx->range.bdev = bdev; 10025 ctx->range.quiesce = (cb_fn == bdev_quiesce_range_locked); 10026 ctx->cb_fn = cb_fn; 10027 ctx->cb_arg = cb_arg; 10028 10029 spdk_spin_lock(&bdev->internal.spinlock); 10030 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 10031 /* There is an active lock overlapping with this range. 10032 * Put it on the pending list until this range no 10033 * longer overlaps with another. 
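 * When an overlapping range is later unlocked, bdev_unlock_lba_range_cb()
 * re-checks this pending list and restarts the lock process for ranges
 * that no longer conflict.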
10034 */ 10035 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 10036 } else { 10037 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 10038 bdev_lock_lba_range_ctx(bdev, ctx); 10039 } 10040 spdk_spin_unlock(&bdev->internal.spinlock); 10041 return 0; 10042 } 10043 10044 static int 10045 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 10046 uint64_t offset, uint64_t length, 10047 lock_range_cb cb_fn, void *cb_arg) 10048 { 10049 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10050 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10051 10052 if (cb_arg == NULL) { 10053 SPDK_ERRLOG("cb_arg must not be NULL\n"); 10054 return -EINVAL; 10055 } 10056 10057 return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg); 10058 } 10059 10060 static void 10061 bdev_lock_lba_range_ctx_msg(void *_ctx) 10062 { 10063 struct locked_lba_range_ctx *ctx = _ctx; 10064 10065 bdev_lock_lba_range_ctx(ctx->range.bdev, ctx); 10066 } 10067 10068 static void 10069 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 10070 { 10071 struct locked_lba_range_ctx *ctx = _ctx; 10072 struct locked_lba_range_ctx *pending_ctx; 10073 struct lba_range *range, *tmp; 10074 10075 spdk_spin_lock(&bdev->internal.spinlock); 10076 /* Check if there are any pending locked ranges that overlap with this range 10077 * that was just unlocked. If there are, check that it doesn't overlap with any 10078 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 10079 * the lock process. 10080 */ 10081 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 10082 if (bdev_lba_range_overlapped(range, &ctx->range) && 10083 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 10084 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 10085 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 10086 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 10087 spdk_thread_send_msg(pending_ctx->range.owner_thread, 10088 bdev_lock_lba_range_ctx_msg, pending_ctx); 10089 } 10090 } 10091 spdk_spin_unlock(&bdev->internal.spinlock); 10092 10093 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 10094 free(ctx); 10095 } 10096 10097 static void 10098 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10099 struct spdk_io_channel *_ch, void *_ctx) 10100 { 10101 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10102 struct locked_lba_range_ctx *ctx = _ctx; 10103 TAILQ_HEAD(, spdk_bdev_io) io_locked; 10104 struct spdk_bdev_io *bdev_io; 10105 struct lba_range *range; 10106 10107 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10108 if (ctx->range.offset == range->offset && 10109 ctx->range.length == range->length && 10110 ctx->range.locked_ctx == range->locked_ctx) { 10111 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 10112 free(range); 10113 break; 10114 } 10115 } 10116 10117 /* Note: we should almost always be able to assert that the range specified 10118 * was found. But there are some very rare corner cases where a new channel 10119 * gets created simultaneously with a range unlock, where this function 10120 * would execute on that new channel and wouldn't have the range. 10121 * We also use this to clean up range allocations when a later allocation 10122 * fails in the locking path. 10123 * So we can't actually assert() here. 
10124 */ 10125 10126 /* Swap the locked IO into a temporary list, and then try to submit them again. 10127 * We could hyper-optimize this to only resubmit locked I/O that overlap 10128 * with the range that was just unlocked, but this isn't a performance path so 10129 * we go for simplicity here. 10130 */ 10131 TAILQ_INIT(&io_locked); 10132 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 10133 while (!TAILQ_EMPTY(&io_locked)) { 10134 bdev_io = TAILQ_FIRST(&io_locked); 10135 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 10136 bdev_io_submit(bdev_io); 10137 } 10138 10139 spdk_bdev_for_each_channel_continue(i, 0); 10140 } 10141 10142 static int 10143 _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length, 10144 lock_range_cb cb_fn, void *cb_arg) 10145 { 10146 struct locked_lba_range_ctx *ctx; 10147 struct lba_range *range; 10148 10149 spdk_spin_lock(&bdev->internal.spinlock); 10150 /* To start the unlock the process, we find the range in the bdev's locked_ranges 10151 * and remove it. This ensures new channels don't inherit the locked range. 10152 * Then we will send a message to each channel to remove the range from its 10153 * per-channel list. 10154 */ 10155 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 10156 if (range->offset == offset && range->length == length && 10157 (range->owner_ch == NULL || range->locked_ctx == cb_arg)) { 10158 break; 10159 } 10160 } 10161 if (range == NULL) { 10162 assert(false); 10163 spdk_spin_unlock(&bdev->internal.spinlock); 10164 return -EINVAL; 10165 } 10166 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 10167 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 10168 spdk_spin_unlock(&bdev->internal.spinlock); 10169 10170 ctx->cb_fn = cb_fn; 10171 ctx->cb_arg = cb_arg; 10172 10173 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 10174 bdev_unlock_lba_range_cb); 10175 return 0; 10176 } 10177 10178 static int 10179 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 10180 uint64_t offset, uint64_t length, 10181 lock_range_cb cb_fn, void *cb_arg) 10182 { 10183 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10184 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10185 struct lba_range *range; 10186 bool range_found = false; 10187 10188 /* Let's make sure the specified channel actually has a lock on 10189 * the specified range. Note that the range must match exactly. 
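 * A partially overlapping range is rejected with -EINVAL; only the exact
 * offset/length pair that this channel locked with this context can be
 * unlocked here.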
10190 */ 10191 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10192 if (range->offset == offset && range->length == length && 10193 range->owner_ch == ch && range->locked_ctx == cb_arg) { 10194 range_found = true; 10195 break; 10196 } 10197 } 10198 10199 if (!range_found) { 10200 return -EINVAL; 10201 } 10202 10203 return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg); 10204 } 10205 10206 struct bdev_quiesce_ctx { 10207 spdk_bdev_quiesce_cb cb_fn; 10208 void *cb_arg; 10209 }; 10210 10211 static void 10212 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status) 10213 { 10214 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 10215 10216 if (quiesce_ctx->cb_fn != NULL) { 10217 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 10218 } 10219 10220 free(quiesce_ctx); 10221 } 10222 10223 static void 10224 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status) 10225 { 10226 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 10227 struct spdk_bdev_module *module = range->bdev->module; 10228 10229 if (status != 0) { 10230 if (quiesce_ctx->cb_fn != NULL) { 10231 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 10232 } 10233 free(quiesce_ctx); 10234 return; 10235 } 10236 10237 spdk_spin_lock(&module->internal.spinlock); 10238 TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module); 10239 spdk_spin_unlock(&module->internal.spinlock); 10240 10241 if (quiesce_ctx->cb_fn != NULL) { 10242 /* copy the context in case the range is unlocked by the callback */ 10243 struct bdev_quiesce_ctx tmp = *quiesce_ctx; 10244 10245 quiesce_ctx->cb_fn = NULL; 10246 quiesce_ctx->cb_arg = NULL; 10247 10248 tmp.cb_fn(tmp.cb_arg, status); 10249 } 10250 /* quiesce_ctx will be freed on unquiesce */ 10251 } 10252 10253 static int 10254 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10255 uint64_t offset, uint64_t length, 10256 spdk_bdev_quiesce_cb cb_fn, void *cb_arg, 10257 bool unquiesce) 10258 { 10259 struct bdev_quiesce_ctx *quiesce_ctx; 10260 int rc; 10261 10262 if (module != bdev->module) { 10263 SPDK_ERRLOG("Bdev does not belong to specified module.\n"); 10264 return -EINVAL; 10265 } 10266 10267 if (!bdev_io_valid_blocks(bdev, offset, length)) { 10268 return -EINVAL; 10269 } 10270 10271 if (unquiesce) { 10272 struct lba_range *range; 10273 10274 /* Make sure the specified range is actually quiesced in the specified module and 10275 * then remove it from the list. Note that the range must match exactly. 
10276 */ 10277 spdk_spin_lock(&module->internal.spinlock); 10278 TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) { 10279 if (range->bdev == bdev && range->offset == offset && range->length == length) { 10280 TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module); 10281 break; 10282 } 10283 } 10284 spdk_spin_unlock(&module->internal.spinlock); 10285 10286 if (range == NULL) { 10287 SPDK_ERRLOG("The range to unquiesce was not found.\n"); 10288 return -EINVAL; 10289 } 10290 10291 quiesce_ctx = range->locked_ctx; 10292 quiesce_ctx->cb_fn = cb_fn; 10293 quiesce_ctx->cb_arg = cb_arg; 10294 10295 rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx); 10296 } else { 10297 quiesce_ctx = malloc(sizeof(*quiesce_ctx)); 10298 if (quiesce_ctx == NULL) { 10299 return -ENOMEM; 10300 } 10301 10302 quiesce_ctx->cb_fn = cb_fn; 10303 quiesce_ctx->cb_arg = cb_arg; 10304 10305 rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx); 10306 if (rc != 0) { 10307 free(quiesce_ctx); 10308 } 10309 } 10310 10311 return rc; 10312 } 10313 10314 int 10315 spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10316 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10317 { 10318 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false); 10319 } 10320 10321 int 10322 spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10323 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10324 { 10325 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true); 10326 } 10327 10328 int 10329 spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10330 uint64_t offset, uint64_t length, 10331 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10332 { 10333 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false); 10334 } 10335 10336 int 10337 spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10338 uint64_t offset, uint64_t length, 10339 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10340 { 10341 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true); 10342 } 10343 10344 int 10345 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 10346 int array_size) 10347 { 10348 if (!bdev) { 10349 return -EINVAL; 10350 } 10351 10352 if (bdev->fn_table->get_memory_domains) { 10353 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 10354 } 10355 10356 return 0; 10357 } 10358 10359 struct spdk_bdev_for_each_io_ctx { 10360 void *ctx; 10361 spdk_bdev_io_fn fn; 10362 spdk_bdev_for_each_io_cb cb; 10363 }; 10364 10365 static void 10366 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10367 struct spdk_io_channel *io_ch, void *_ctx) 10368 { 10369 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10370 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 10371 struct spdk_bdev_io *bdev_io; 10372 int rc = 0; 10373 10374 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 10375 rc = ctx->fn(ctx->ctx, bdev_io); 10376 if (rc != 0) { 10377 break; 10378 } 10379 } 10380 10381 spdk_bdev_for_each_channel_continue(i, rc); 10382 } 10383 10384 static void 10385 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 10386 { 10387 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10388 10389 ctx->cb(ctx->ctx, status); 10390 10391 free(ctx); 10392 } 10393 10394 void 10395 
spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 10396 spdk_bdev_for_each_io_cb cb) 10397 { 10398 struct spdk_bdev_for_each_io_ctx *ctx; 10399 10400 assert(fn != NULL && cb != NULL); 10401 10402 ctx = calloc(1, sizeof(*ctx)); 10403 if (ctx == NULL) { 10404 SPDK_ERRLOG("Failed to allocate context.\n"); 10405 cb(_ctx, -ENOMEM); 10406 return; 10407 } 10408 10409 ctx->ctx = _ctx; 10410 ctx->fn = fn; 10411 ctx->cb = cb; 10412 10413 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 10414 bdev_for_each_io_done); 10415 } 10416 10417 void 10418 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 10419 { 10420 spdk_for_each_channel_continue(iter->i, status); 10421 } 10422 10423 static struct spdk_bdev * 10424 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 10425 { 10426 void *io_device = spdk_io_channel_iter_get_io_device(i); 10427 10428 return __bdev_from_io_dev(io_device); 10429 } 10430 10431 static void 10432 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 10433 { 10434 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10435 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10436 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 10437 10438 iter->i = i; 10439 iter->fn(iter, bdev, ch, iter->ctx); 10440 } 10441 10442 static void 10443 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 10444 { 10445 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10446 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10447 10448 iter->i = i; 10449 iter->cpl(bdev, iter->ctx, status); 10450 10451 free(iter); 10452 } 10453 10454 void 10455 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 10456 void *ctx, spdk_bdev_for_each_channel_done cpl) 10457 { 10458 struct spdk_bdev_channel_iter *iter; 10459 10460 assert(bdev != NULL && fn != NULL && ctx != NULL); 10461 10462 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 10463 if (iter == NULL) { 10464 SPDK_ERRLOG("Unable to allocate iterator\n"); 10465 assert(false); 10466 return; 10467 } 10468 10469 iter->fn = fn; 10470 iter->cpl = cpl; 10471 iter->ctx = ctx; 10472 10473 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 10474 iter, bdev_each_channel_cpl); 10475 } 10476 10477 static void 10478 bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10479 { 10480 struct spdk_bdev_io *parent_io = cb_arg; 10481 10482 spdk_bdev_free_io(bdev_io); 10483 10484 /* Check return status of write */ 10485 parent_io->internal.status = success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 10486 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 10487 } 10488 10489 static void 10490 bdev_copy_do_write(void *_bdev_io) 10491 { 10492 struct spdk_bdev_io *bdev_io = _bdev_io; 10493 int rc; 10494 10495 /* Write blocks */ 10496 rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc, 10497 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10498 bdev_io->u.bdev.iovs[0].iov_base, 10499 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks, 10500 bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io); 10501 10502 if (rc == -ENOMEM) { 10503 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write); 10504 } else if (rc != 0) { 10505 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10506 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10507 } 10508 } 10509 10510 static void 10511 bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10512 { 10513 struct spdk_bdev_io *parent_io = cb_arg; 10514 10515 spdk_bdev_free_io(bdev_io); 10516 10517 /* Check return status of read */ 10518 if (!success) { 10519 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10520 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 10521 return; 10522 } 10523 10524 /* Do write */ 10525 bdev_copy_do_write(parent_io); 10526 } 10527 10528 static void 10529 bdev_copy_do_read(void *_bdev_io) 10530 { 10531 struct spdk_bdev_io *bdev_io = _bdev_io; 10532 int rc; 10533 10534 /* Read blocks */ 10535 rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc, 10536 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10537 bdev_io->u.bdev.iovs[0].iov_base, 10538 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks, 10539 bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io); 10540 10541 if (rc == -ENOMEM) { 10542 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read); 10543 } else if (rc != 0) { 10544 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10545 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10546 } 10547 } 10548 10549 static void 10550 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 10551 { 10552 if (!success) { 10553 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10554 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10555 return; 10556 } 10557 10558 bdev_copy_do_read(bdev_io); 10559 } 10560 10561 int 10562 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 10563 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 10564 spdk_bdev_io_completion_cb cb, void *cb_arg) 10565 { 10566 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10567 struct spdk_bdev_io *bdev_io; 10568 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 10569 10570 if (!desc->write) { 10571 return -EBADF; 10572 } 10573 10574 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 10575 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 10576 SPDK_DEBUGLOG(bdev, 10577 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 10578 dst_offset_blocks, src_offset_blocks, num_blocks); 10579 return -EINVAL; 10580 } 10581 10582 bdev_io = bdev_channel_get_io(channel); 10583 if (!bdev_io) { 10584 return -ENOMEM; 10585 } 10586 10587 bdev_io->internal.ch = channel; 10588 bdev_io->internal.desc = desc; 10589 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 10590 10591 
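	/* For a copy request, offset_blocks is the destination offset and
	 * copy.src_offset_blocks is the source offset.
	 */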
bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 10592 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 10593 bdev_io->u.bdev.num_blocks = num_blocks; 10594 bdev_io->u.bdev.memory_domain = NULL; 10595 bdev_io->u.bdev.memory_domain_ctx = NULL; 10596 bdev_io->u.bdev.iovs = NULL; 10597 bdev_io->u.bdev.iovcnt = 0; 10598 bdev_io->u.bdev.md_buf = NULL; 10599 bdev_io->u.bdev.accel_sequence = NULL; 10600 bdev_io_init(bdev_io, bdev, cb_arg, cb); 10601 10602 if (dst_offset_blocks == src_offset_blocks || num_blocks == 0) { 10603 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 10604 return 0; 10605 } 10606 10607 10608 /* If the copy size is large and should be split, use the generic split logic 10609 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not. 10610 * 10611 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or 10612 * emulate it using regular read and write requests otherwise. 10613 */ 10614 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) || 10615 bdev_io->internal.f.split) { 10616 bdev_io_submit(bdev_io); 10617 return 0; 10618 } 10619 10620 spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev)); 10621 10622 return 0; 10623 } 10624 10625 SPDK_LOG_REGISTER_COMPONENT(bdev) 10626 10627 static void 10628 bdev_trace(void) 10629 { 10630 struct spdk_trace_tpoint_opts opts[] = { 10631 { 10632 "BDEV_IO_START", TRACE_BDEV_IO_START, 10633 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 1, 10634 { 10635 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10636 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 10637 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10638 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 10639 } 10640 }, 10641 { 10642 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 10643 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 0, 10644 { 10645 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 10646 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 10647 } 10648 }, 10649 { 10650 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 10651 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 10652 { 10653 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 10654 } 10655 }, 10656 { 10657 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 10658 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 10659 { 10660 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 10661 } 10662 }, 10663 }; 10664 10665 10666 spdk_trace_register_owner_type(OWNER_TYPE_BDEV, 'b'); 10667 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 10668 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 10669 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 10670 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 10671 spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_START, OBJECT_BDEV_IO, 0); 10672 spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_COMPLETE, OBJECT_BDEV_IO, 0); 10673 spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_START, OBJECT_BDEV_IO, 0); 10674 spdk_trace_tpoint_register_relation(TRACE_BDEV_RAID_IO_DONE, OBJECT_BDEV_IO, 0); 10675 } 10676 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 10677
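
/*
 * Usage sketch for spdk_bdev_copy_blocks() (illustrative only; desc, ch and
 * copy_done are hypothetical and would come from the caller):
 *
 *   static void
 *   copy_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *   {
 *           spdk_bdev_free_io(bdev_io);
 *           printf("copy %s\n", success ? "succeeded" : "failed");
 *   }
 *
 *   ...
 *   int rc = spdk_bdev_copy_blocks(desc, ch, 1024, 0, 16, copy_done, NULL);
 *   if (rc == -ENOMEM) {
 *           ... queue a retry with spdk_bdev_queue_io_wait() ...
 *   }
 *
 * The descriptor must have been opened for writing, and both the source and
 * destination ranges must lie within the bdev.
 */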
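
/*
 * Usage sketch for the quiesce API (illustrative only; my_if is a
 * hypothetical bdev module that owns the bdev, i.e. bdev->module == &my_if,
 * and quiesce_done/unquiesce_done are placeholder callbacks):
 *
 *   static void
 *   quiesce_done(void *ctx, int status)
 *   {
 *           ... at this point all I/O overlapping the quiesced range has
 *           completed and new I/O is held until unquiesce ...
 *   }
 *
 *   spdk_bdev_quiesce(bdev, &my_if, quiesce_done, NULL);
 *   ...
 *   spdk_bdev_unquiesce(bdev, &my_if, unquiesce_done, NULL);
 *
 * spdk_bdev_quiesce_range()/spdk_bdev_unquiesce_range() work the same way
 * but only pause I/O that overlaps the given offset/length.
 */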