1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #include "spdk/bdev.h" 10 11 #include "spdk/accel.h" 12 #include "spdk/config.h" 13 #include "spdk/env.h" 14 #include "spdk/thread.h" 15 #include "spdk/likely.h" 16 #include "spdk/queue.h" 17 #include "spdk/nvme_spec.h" 18 #include "spdk/scsi_spec.h" 19 #include "spdk/notify.h" 20 #include "spdk/util.h" 21 #include "spdk/trace.h" 22 #include "spdk/dma.h" 23 24 #include "spdk/bdev_module.h" 25 #include "spdk/log.h" 26 #include "spdk/string.h" 27 28 #include "bdev_internal.h" 29 #include "spdk_internal/trace_defs.h" 30 #include "spdk_internal/assert.h" 31 32 #ifdef SPDK_CONFIG_VTUNE 33 #include "ittnotify.h" 34 #include "ittnotify_types.h" 35 int __itt_init_ittlib(const char *, __itt_group_id); 36 #endif 37 38 #define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1) 39 #define SPDK_BDEV_IO_CACHE_SIZE 256 40 #define SPDK_BDEV_AUTO_EXAMINE true 41 #define BUF_SMALL_CACHE_SIZE 128 42 #define BUF_LARGE_CACHE_SIZE 16 43 #define NOMEM_THRESHOLD_COUNT 8 44 45 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 46 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 47 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512 48 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000 49 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024) 50 #define SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC (UINT64_MAX / (1024 * 1024)) 51 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX 52 #define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000 53 54 /* The maximum number of children requests for a UNMAP or WRITE ZEROES command 55 * when splitting into children requests at a time. 56 */ 57 #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8) 58 #define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000 59 60 /* The maximum number of children requests for a COPY command 61 * when splitting into children requests at a time. 
62 */ 63 #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8) 64 65 #define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \ 66 log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev) 67 #ifdef DEBUG 68 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \ 69 log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev) 70 #else 71 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0) 72 #endif 73 74 static void log_already_claimed(enum spdk_log_level level, const int line, const char *func, 75 const char *detail, struct spdk_bdev *bdev); 76 77 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 78 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 79 }; 80 81 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 82 83 RB_HEAD(bdev_name_tree, spdk_bdev_name); 84 85 static int 86 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2) 87 { 88 return strcmp(name1->name, name2->name); 89 } 90 91 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp); 92 93 struct spdk_bdev_mgr { 94 struct spdk_mempool *bdev_io_pool; 95 96 void *zero_buffer; 97 98 TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 99 100 struct spdk_bdev_list bdevs; 101 struct bdev_name_tree bdev_names; 102 103 bool init_complete; 104 bool module_init_complete; 105 106 struct spdk_spinlock spinlock; 107 108 TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens; 109 110 #ifdef SPDK_CONFIG_VTUNE 111 __itt_domain *domain; 112 #endif 113 }; 114 115 static struct spdk_bdev_mgr g_bdev_mgr = { 116 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 117 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 118 .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names), 119 .init_complete = false, 120 .module_init_complete = false, 121 .async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens), 122 }; 123 124 static void 125 __attribute__((constructor)) 126 _bdev_init(void) 127 { 128 spdk_spin_init(&g_bdev_mgr.spinlock); 129 } 130 131 typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status); 132 133 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc); 134 135 struct lba_range { 136 struct spdk_bdev *bdev; 137 uint64_t offset; 138 uint64_t length; 139 bool quiesce; 140 void *locked_ctx; 141 struct spdk_thread *owner_thread; 142 struct spdk_bdev_channel *owner_ch; 143 TAILQ_ENTRY(lba_range) tailq; 144 TAILQ_ENTRY(lba_range) tailq_module; 145 }; 146 147 static struct spdk_bdev_opts g_bdev_opts = { 148 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 149 .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, 150 .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, 151 .iobuf_small_cache_size = BUF_SMALL_CACHE_SIZE, 152 .iobuf_large_cache_size = BUF_LARGE_CACHE_SIZE, 153 }; 154 155 static spdk_bdev_init_cb g_init_cb_fn = NULL; 156 static void *g_init_cb_arg = NULL; 157 158 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 159 static void *g_fini_cb_arg = NULL; 160 static struct spdk_thread *g_fini_thread = NULL; 161 162 struct spdk_bdev_qos_limit { 163 /** IOs or bytes allowed per second (i.e., 1s). */ 164 uint64_t limit; 165 166 /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). 167 * For remaining bytes, allowed to run negative if an I/O is submitted when 168 * some bytes are remaining, but the I/O is bigger than that amount. The 169 * excess will be deducted from the next timeslice. 170 */ 171 int64_t remaining_this_timeslice; 172 173 /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). 
*/ 174 uint32_t min_per_timeslice; 175 176 /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 177 uint32_t max_per_timeslice; 178 179 /** Function to check whether to queue the IO. 180 * If The IO is allowed to pass, the quota will be reduced correspondingly. 181 */ 182 bool (*queue_io)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 183 184 /** Function to rewind the quota once the IO was allowed to be sent by this 185 * limit but queued due to one of the further limits. 186 */ 187 void (*rewind_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 188 }; 189 190 struct spdk_bdev_qos { 191 /** Types of structure of rate limits. */ 192 struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 193 194 /** The channel that all I/O are funneled through. */ 195 struct spdk_bdev_channel *ch; 196 197 /** The thread on which the poller is running. */ 198 struct spdk_thread *thread; 199 200 /** Size of a timeslice in tsc ticks. */ 201 uint64_t timeslice_size; 202 203 /** Timestamp of start of last timeslice. */ 204 uint64_t last_timeslice; 205 206 /** Poller that processes queued I/O commands each time slice. */ 207 struct spdk_poller *poller; 208 }; 209 210 struct spdk_bdev_mgmt_channel { 211 /* 212 * Each thread keeps a cache of bdev_io - this allows 213 * bdev threads which are *not* DPDK threads to still 214 * benefit from a per-thread bdev_io cache. Without 215 * this, non-DPDK threads fetching from the mempool 216 * incur a cmpxchg on get and put. 217 */ 218 bdev_io_stailq_t per_thread_cache; 219 uint32_t per_thread_cache_count; 220 uint32_t bdev_io_cache_size; 221 222 struct spdk_iobuf_channel iobuf; 223 224 TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; 225 TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; 226 }; 227 228 /* 229 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device 230 * will queue here their IO that awaits retry. It makes it possible to retry sending 231 * IO to one bdev after IO from other bdev completes. 232 */ 233 struct spdk_bdev_shared_resource { 234 /* The bdev management channel */ 235 struct spdk_bdev_mgmt_channel *mgmt_ch; 236 237 /* 238 * Count of I/O submitted to bdev module and waiting for completion. 239 * Incremented before submit_request() is called on an spdk_bdev_io. 240 */ 241 uint64_t io_outstanding; 242 243 /* 244 * Queue of IO awaiting retry because of a previous NOMEM status returned 245 * on this channel. 246 */ 247 bdev_io_tailq_t nomem_io; 248 249 /* 250 * Threshold which io_outstanding must drop to before retrying nomem_io. 251 */ 252 uint64_t nomem_threshold; 253 254 /* I/O channel allocated by a bdev module */ 255 struct spdk_io_channel *shared_ch; 256 257 struct spdk_poller *nomem_poller; 258 259 /* Refcount of bdev channels using this resource */ 260 uint32_t ref; 261 262 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 263 }; 264 265 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 266 #define BDEV_CH_QOS_ENABLED (1 << 1) 267 268 struct spdk_bdev_channel { 269 struct spdk_bdev *bdev; 270 271 /* The channel for the underlying device */ 272 struct spdk_io_channel *channel; 273 274 /* Accel channel */ 275 struct spdk_io_channel *accel_channel; 276 277 /* Per io_device per thread data */ 278 struct spdk_bdev_shared_resource *shared_resource; 279 280 struct spdk_bdev_io_stat *stat; 281 282 /* 283 * Count of I/O submitted to the underlying dev module through this channel 284 * and waiting for completion. 
285 */ 286 uint64_t io_outstanding; 287 288 /* 289 * List of all submitted I/Os including I/O that are generated via splitting. 290 */ 291 bdev_io_tailq_t io_submitted; 292 293 /* 294 * List of spdk_bdev_io that are currently queued because they write to a locked 295 * LBA range. 296 */ 297 bdev_io_tailq_t io_locked; 298 299 /* List of I/Os with accel sequence being currently executed */ 300 bdev_io_tailq_t io_accel_exec; 301 302 /* List of I/Os doing memory domain pull/push */ 303 bdev_io_tailq_t io_memory_domain; 304 305 uint32_t flags; 306 307 /* Counts number of bdev_io in the io_submitted TAILQ */ 308 uint16_t queue_depth; 309 310 uint16_t trace_id; 311 312 struct spdk_histogram_data *histogram; 313 314 #ifdef SPDK_CONFIG_VTUNE 315 uint64_t start_tsc; 316 uint64_t interval_tsc; 317 __itt_string_handle *handle; 318 struct spdk_bdev_io_stat *prev_stat; 319 #endif 320 321 bdev_io_tailq_t queued_resets; 322 323 lba_range_tailq_t locked_ranges; 324 325 /** List of I/Os queued by QoS. */ 326 bdev_io_tailq_t qos_queued_io; 327 }; 328 329 struct media_event_entry { 330 struct spdk_bdev_media_event event; 331 TAILQ_ENTRY(media_event_entry) tailq; 332 }; 333 334 #define MEDIA_EVENT_POOL_SIZE 64 335 336 struct spdk_bdev_desc { 337 struct spdk_bdev *bdev; 338 struct spdk_thread *thread; 339 struct { 340 spdk_bdev_event_cb_t event_fn; 341 void *ctx; 342 } callback; 343 bool closed; 344 bool write; 345 bool memory_domains_supported; 346 bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES]; 347 struct spdk_spinlock spinlock; 348 uint32_t refs; 349 TAILQ_HEAD(, media_event_entry) pending_media_events; 350 TAILQ_HEAD(, media_event_entry) free_media_events; 351 struct media_event_entry *media_events_buffer; 352 TAILQ_ENTRY(spdk_bdev_desc) link; 353 354 uint64_t timeout_in_sec; 355 spdk_bdev_io_timeout_cb cb_fn; 356 void *cb_arg; 357 struct spdk_poller *io_timeout_poller; 358 struct spdk_bdev_module_claim *claim; 359 }; 360 361 struct spdk_bdev_iostat_ctx { 362 struct spdk_bdev_io_stat *stat; 363 spdk_bdev_get_device_stat_cb cb; 364 void *cb_arg; 365 }; 366 367 struct set_qos_limit_ctx { 368 void (*cb_fn)(void *cb_arg, int status); 369 void *cb_arg; 370 struct spdk_bdev *bdev; 371 }; 372 373 struct spdk_bdev_channel_iter { 374 spdk_bdev_for_each_channel_msg fn; 375 spdk_bdev_for_each_channel_done cpl; 376 struct spdk_io_channel_iter *i; 377 void *ctx; 378 }; 379 380 struct spdk_bdev_io_error_stat { 381 uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS]; 382 }; 383 384 enum bdev_io_retry_state { 385 BDEV_IO_RETRY_STATE_INVALID, 386 BDEV_IO_RETRY_STATE_PULL, 387 BDEV_IO_RETRY_STATE_PULL_MD, 388 BDEV_IO_RETRY_STATE_SUBMIT, 389 BDEV_IO_RETRY_STATE_PUSH, 390 BDEV_IO_RETRY_STATE_PUSH_MD, 391 }; 392 393 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 394 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 395 #define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch)) 396 #define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch)) 397 398 static inline void bdev_io_complete(void *ctx); 399 static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io); 400 static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io); 401 static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io); 402 403 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 404 static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io); 405 406 static void 
bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 407 struct spdk_io_channel *ch, void *_ctx); 408 static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status); 409 410 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 411 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 412 uint64_t num_blocks, 413 struct spdk_memory_domain *domain, void *domain_ctx, 414 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 415 spdk_bdev_io_completion_cb cb, void *cb_arg); 416 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 417 struct iovec *iov, int iovcnt, void *md_buf, 418 uint64_t offset_blocks, uint64_t num_blocks, 419 struct spdk_memory_domain *domain, void *domain_ctx, 420 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 421 uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw, 422 spdk_bdev_io_completion_cb cb, void *cb_arg); 423 424 static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 425 uint64_t offset, uint64_t length, 426 lock_range_cb cb_fn, void *cb_arg); 427 428 static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 429 uint64_t offset, uint64_t length, 430 lock_range_cb cb_fn, void *cb_arg); 431 432 static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort); 433 static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort); 434 435 static bool claim_type_is_v2(enum spdk_bdev_claim_type type); 436 static void bdev_desc_release_claims(struct spdk_bdev_desc *desc); 437 static void claim_reset(struct spdk_bdev *bdev); 438 439 static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch); 440 441 static bool bdev_io_should_split(struct spdk_bdev_io *bdev_io); 442 443 #define bdev_get_ext_io_opt(opts, field, defval) \ 444 ((opts) != NULL ? SPDK_GET_FIELD(opts, field, defval) : (defval)) 445 446 static inline void 447 bdev_ch_add_to_io_submitted(struct spdk_bdev_io *bdev_io) 448 { 449 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 450 bdev_io->internal.ch->queue_depth++; 451 } 452 453 static inline void 454 bdev_ch_remove_from_io_submitted(struct spdk_bdev_io *bdev_io) 455 { 456 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 457 bdev_io->internal.ch->queue_depth--; 458 } 459 460 void 461 spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size) 462 { 463 if (!opts) { 464 SPDK_ERRLOG("opts should not be NULL\n"); 465 return; 466 } 467 468 if (!opts_size) { 469 SPDK_ERRLOG("opts_size should not be zero value\n"); 470 return; 471 } 472 473 opts->opts_size = opts_size; 474 475 #define SET_FIELD(field) \ 476 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \ 477 opts->field = g_bdev_opts.field; \ 478 } \ 479 480 SET_FIELD(bdev_io_pool_size); 481 SET_FIELD(bdev_io_cache_size); 482 SET_FIELD(bdev_auto_examine); 483 SET_FIELD(iobuf_small_cache_size); 484 SET_FIELD(iobuf_large_cache_size); 485 486 /* Do not remove this statement, you should always update this statement when you adding a new field, 487 * and do not forget to add the SET_FIELD statement for your added field. 
*/ 488 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size"); 489 490 #undef SET_FIELD 491 } 492 493 int 494 spdk_bdev_set_opts(struct spdk_bdev_opts *opts) 495 { 496 uint32_t min_pool_size; 497 498 if (!opts) { 499 SPDK_ERRLOG("opts cannot be NULL\n"); 500 return -1; 501 } 502 503 if (!opts->opts_size) { 504 SPDK_ERRLOG("opts_size inside opts cannot be zero value\n"); 505 return -1; 506 } 507 508 /* 509 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem 510 * initialization. A second mgmt_ch will be created on the same thread when the application starts 511 * but before the deferred put_io_channel event is executed for the first mgmt_ch. 512 */ 513 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 514 if (opts->bdev_io_pool_size < min_pool_size) { 515 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 516 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 517 spdk_thread_get_count()); 518 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 519 return -1; 520 } 521 522 #define SET_FIELD(field) \ 523 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 524 g_bdev_opts.field = opts->field; \ 525 } \ 526 527 SET_FIELD(bdev_io_pool_size); 528 SET_FIELD(bdev_io_cache_size); 529 SET_FIELD(bdev_auto_examine); 530 SET_FIELD(iobuf_small_cache_size); 531 SET_FIELD(iobuf_large_cache_size); 532 533 g_bdev_opts.opts_size = opts->opts_size; 534 535 #undef SET_FIELD 536 537 return 0; 538 } 539 540 static struct spdk_bdev * 541 bdev_get_by_name(const char *bdev_name) 542 { 543 struct spdk_bdev_name find; 544 struct spdk_bdev_name *res; 545 546 find.name = (char *)bdev_name; 547 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 548 if (res != NULL) { 549 return res->bdev; 550 } 551 552 return NULL; 553 } 554 555 struct spdk_bdev * 556 spdk_bdev_get_by_name(const char *bdev_name) 557 { 558 struct spdk_bdev *bdev; 559 560 spdk_spin_lock(&g_bdev_mgr.spinlock); 561 bdev = bdev_get_by_name(bdev_name); 562 spdk_spin_unlock(&g_bdev_mgr.spinlock); 563 564 return bdev; 565 } 566 567 struct bdev_io_status_string { 568 enum spdk_bdev_io_status status; 569 const char *str; 570 }; 571 572 static const struct bdev_io_status_string bdev_io_status_strings[] = { 573 { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" }, 574 { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" }, 575 { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" }, 576 { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" }, 577 { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" }, 578 { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" }, 579 { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" }, 580 { SPDK_BDEV_IO_STATUS_FAILED, "failed" }, 581 { SPDK_BDEV_IO_STATUS_PENDING, "pending" }, 582 { SPDK_BDEV_IO_STATUS_SUCCESS, "success" }, 583 }; 584 585 static const char * 586 bdev_io_status_get_string(enum spdk_bdev_io_status status) 587 { 588 uint32_t i; 589 590 for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) { 591 if (bdev_io_status_strings[i].status == status) { 592 return bdev_io_status_strings[i].str; 593 } 594 } 595 596 return "reserved"; 597 } 598 599 struct spdk_bdev_wait_for_examine_ctx { 600 struct spdk_poller *poller; 601 spdk_bdev_wait_for_examine_cb cb_fn; 602 void *cb_arg; 603 }; 604 605 static bool bdev_module_all_actions_completed(void); 606 607 static int 608 bdev_wait_for_examine_cb(void *arg) 609 { 610 struct 
spdk_bdev_wait_for_examine_ctx *ctx = arg; 611 612 if (!bdev_module_all_actions_completed()) { 613 return SPDK_POLLER_IDLE; 614 } 615 616 spdk_poller_unregister(&ctx->poller); 617 ctx->cb_fn(ctx->cb_arg); 618 free(ctx); 619 620 return SPDK_POLLER_BUSY; 621 } 622 623 int 624 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 625 { 626 struct spdk_bdev_wait_for_examine_ctx *ctx; 627 628 ctx = calloc(1, sizeof(*ctx)); 629 if (ctx == NULL) { 630 return -ENOMEM; 631 } 632 ctx->cb_fn = cb_fn; 633 ctx->cb_arg = cb_arg; 634 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 635 636 return 0; 637 } 638 639 struct spdk_bdev_examine_item { 640 char *name; 641 TAILQ_ENTRY(spdk_bdev_examine_item) link; 642 }; 643 644 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); 645 646 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 647 g_bdev_examine_allowlist); 648 649 static inline bool 650 bdev_examine_allowlist_check(const char *name) 651 { 652 struct spdk_bdev_examine_item *item; 653 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 654 if (strcmp(name, item->name) == 0) { 655 return true; 656 } 657 } 658 return false; 659 } 660 661 static inline void 662 bdev_examine_allowlist_free(void) 663 { 664 struct spdk_bdev_examine_item *item; 665 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 666 item = TAILQ_FIRST(&g_bdev_examine_allowlist); 667 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 668 free(item->name); 669 free(item); 670 } 671 } 672 673 static inline bool 674 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 675 { 676 struct spdk_bdev_alias *tmp; 677 if (bdev_examine_allowlist_check(bdev->name)) { 678 return true; 679 } 680 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 681 if (bdev_examine_allowlist_check(tmp->alias.name)) { 682 return true; 683 } 684 } 685 return false; 686 } 687 688 static inline bool 689 bdev_ok_to_examine(struct spdk_bdev *bdev) 690 { 691 /* Some bdevs may not support the READ command. 692 * Do not try to examine them. 
693 */ 694 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_READ)) { 695 return false; 696 } 697 698 if (g_bdev_opts.bdev_auto_examine) { 699 return true; 700 } else { 701 return bdev_in_examine_allowlist(bdev); 702 } 703 } 704 705 static void 706 bdev_examine(struct spdk_bdev *bdev) 707 { 708 struct spdk_bdev_module *module; 709 struct spdk_bdev_module_claim *claim, *tmpclaim; 710 uint32_t action; 711 712 if (!bdev_ok_to_examine(bdev)) { 713 return; 714 } 715 716 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 717 if (module->examine_config) { 718 spdk_spin_lock(&module->internal.spinlock); 719 action = module->internal.action_in_progress; 720 module->internal.action_in_progress++; 721 spdk_spin_unlock(&module->internal.spinlock); 722 module->examine_config(bdev); 723 if (action != module->internal.action_in_progress) { 724 SPDK_ERRLOG("examine_config for module %s did not call " 725 "spdk_bdev_module_examine_done()\n", module->name); 726 } 727 } 728 } 729 730 spdk_spin_lock(&bdev->internal.spinlock); 731 732 switch (bdev->internal.claim_type) { 733 case SPDK_BDEV_CLAIM_NONE: 734 /* Examine by all bdev modules */ 735 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 736 if (module->examine_disk) { 737 spdk_spin_lock(&module->internal.spinlock); 738 module->internal.action_in_progress++; 739 spdk_spin_unlock(&module->internal.spinlock); 740 spdk_spin_unlock(&bdev->internal.spinlock); 741 module->examine_disk(bdev); 742 spdk_spin_lock(&bdev->internal.spinlock); 743 } 744 } 745 break; 746 case SPDK_BDEV_CLAIM_EXCL_WRITE: 747 /* Examine by the one bdev module with a v1 claim */ 748 module = bdev->internal.claim.v1.module; 749 if (module->examine_disk) { 750 spdk_spin_lock(&module->internal.spinlock); 751 module->internal.action_in_progress++; 752 spdk_spin_unlock(&module->internal.spinlock); 753 spdk_spin_unlock(&bdev->internal.spinlock); 754 module->examine_disk(bdev); 755 return; 756 } 757 break; 758 default: 759 /* Examine by all bdev modules with a v2 claim */ 760 assert(claim_type_is_v2(bdev->internal.claim_type)); 761 /* 762 * Removal of tailq nodes while iterating can cause the iteration to jump out of the 763 * list, perhaps accessing freed memory. Without protection, this could happen 764 * while the lock is dropped during the examine callback. 765 */ 766 bdev->internal.examine_in_progress++; 767 768 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 769 module = claim->module; 770 771 if (module == NULL) { 772 /* This is a vestigial claim, held by examine_count */ 773 continue; 774 } 775 776 if (module->examine_disk == NULL) { 777 continue; 778 } 779 780 spdk_spin_lock(&module->internal.spinlock); 781 module->internal.action_in_progress++; 782 spdk_spin_unlock(&module->internal.spinlock); 783 784 /* Call examine_disk without holding internal.spinlock. 
*/ 785 spdk_spin_unlock(&bdev->internal.spinlock); 786 module->examine_disk(bdev); 787 spdk_spin_lock(&bdev->internal.spinlock); 788 } 789 790 assert(bdev->internal.examine_in_progress > 0); 791 bdev->internal.examine_in_progress--; 792 if (bdev->internal.examine_in_progress == 0) { 793 /* Remove any claims that were released during examine_disk */ 794 TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) { 795 if (claim->desc != NULL) { 796 continue; 797 } 798 799 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link); 800 free(claim); 801 } 802 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 803 claim_reset(bdev); 804 } 805 } 806 } 807 808 spdk_spin_unlock(&bdev->internal.spinlock); 809 } 810 811 int 812 spdk_bdev_examine(const char *name) 813 { 814 struct spdk_bdev *bdev; 815 struct spdk_bdev_examine_item *item; 816 struct spdk_thread *thread = spdk_get_thread(); 817 818 if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) { 819 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread, 820 thread ? spdk_thread_get_name(thread) : "null"); 821 return -EINVAL; 822 } 823 824 if (g_bdev_opts.bdev_auto_examine) { 825 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n"); 826 return -EINVAL; 827 } 828 829 if (bdev_examine_allowlist_check(name)) { 830 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 831 return -EEXIST; 832 } 833 834 item = calloc(1, sizeof(*item)); 835 if (!item) { 836 return -ENOMEM; 837 } 838 item->name = strdup(name); 839 if (!item->name) { 840 free(item); 841 return -ENOMEM; 842 } 843 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 844 845 bdev = spdk_bdev_get_by_name(name); 846 if (bdev) { 847 bdev_examine(bdev); 848 } 849 return 0; 850 } 851 852 static inline void 853 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 854 { 855 struct spdk_bdev_examine_item *item; 856 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 857 spdk_json_write_object_begin(w); 858 spdk_json_write_named_string(w, "method", "bdev_examine"); 859 spdk_json_write_named_object_begin(w, "params"); 860 spdk_json_write_named_string(w, "name", item->name); 861 spdk_json_write_object_end(w); 862 spdk_json_write_object_end(w); 863 } 864 } 865 866 struct spdk_bdev * 867 spdk_bdev_first(void) 868 { 869 struct spdk_bdev *bdev; 870 871 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 872 if (bdev) { 873 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 874 } 875 876 return bdev; 877 } 878 879 struct spdk_bdev * 880 spdk_bdev_next(struct spdk_bdev *prev) 881 { 882 struct spdk_bdev *bdev; 883 884 bdev = TAILQ_NEXT(prev, internal.link); 885 if (bdev) { 886 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 887 } 888 889 return bdev; 890 } 891 892 static struct spdk_bdev * 893 _bdev_next_leaf(struct spdk_bdev *bdev) 894 { 895 while (bdev != NULL) { 896 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 897 return bdev; 898 } else { 899 bdev = TAILQ_NEXT(bdev, internal.link); 900 } 901 } 902 903 return bdev; 904 } 905 906 struct spdk_bdev * 907 spdk_bdev_first_leaf(void) 908 { 909 struct spdk_bdev *bdev; 910 911 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 912 913 if (bdev) { 914 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 915 } 916 917 return bdev; 918 } 919 920 struct spdk_bdev * 921 spdk_bdev_next_leaf(struct spdk_bdev *prev) 922 { 923 struct spdk_bdev *bdev; 924 925 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 926 927 if 
(bdev) { 928 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 929 } 930 931 return bdev; 932 } 933 934 static inline bool 935 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 936 { 937 return bdev_io->internal.f.has_memory_domain; 938 } 939 940 static inline bool 941 bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io) 942 { 943 return bdev_io->internal.f.has_accel_sequence; 944 } 945 946 static inline void 947 bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource, 948 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 949 { 950 /* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io. 951 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth 952 * channels we will instead wait for half to complete. 953 */ 954 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 955 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 956 957 assert(state != BDEV_IO_RETRY_STATE_INVALID); 958 bdev_io->internal.retry_state = state; 959 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 960 } 961 962 static inline void 963 bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource, 964 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 965 { 966 /* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while 967 * the queue isn't empty, so we don't need to update the nomem_threshold here */ 968 assert(!TAILQ_EMPTY(&shared_resource->nomem_io)); 969 970 assert(state != BDEV_IO_RETRY_STATE_INVALID); 971 bdev_io->internal.retry_state = state; 972 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 973 } 974 975 void 976 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 977 { 978 struct iovec *iovs; 979 980 if (bdev_io->u.bdev.iovs == NULL) { 981 bdev_io->u.bdev.iovs = &bdev_io->iov; 982 bdev_io->u.bdev.iovcnt = 1; 983 } 984 985 iovs = bdev_io->u.bdev.iovs; 986 987 assert(iovs != NULL); 988 assert(bdev_io->u.bdev.iovcnt >= 1); 989 990 iovs[0].iov_base = buf; 991 iovs[0].iov_len = len; 992 } 993 994 void 995 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 996 { 997 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 998 bdev_io->u.bdev.md_buf = md_buf; 999 } 1000 1001 static bool 1002 _is_buf_allocated(const struct iovec *iovs) 1003 { 1004 if (iovs == NULL) { 1005 return false; 1006 } 1007 1008 return iovs[0].iov_base != NULL; 1009 } 1010 1011 static bool 1012 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 1013 { 1014 int i; 1015 uintptr_t iov_base; 1016 1017 if (spdk_likely(alignment == 1)) { 1018 return true; 1019 } 1020 1021 for (i = 0; i < iovcnt; i++) { 1022 iov_base = (uintptr_t)iovs[i].iov_base; 1023 if ((iov_base & (alignment - 1)) != 0) { 1024 return false; 1025 } 1026 } 1027 1028 return true; 1029 } 1030 1031 static inline bool 1032 bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 1033 { 1034 if (!bdev_io_use_accel_sequence(bdev_io)) { 1035 return false; 1036 } 1037 1038 /* For now, we don't allow splitting IOs with an accel sequence and will treat them as if 1039 * bdev module didn't support accel sequences */ 1040 return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.f.split; 1041 } 1042 1043 static inline void 1044 bdev_io_increment_outstanding(struct 
spdk_bdev_channel *bdev_ch, 1045 struct spdk_bdev_shared_resource *shared_resource) 1046 { 1047 bdev_ch->io_outstanding++; 1048 shared_resource->io_outstanding++; 1049 } 1050 1051 static inline void 1052 bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1053 struct spdk_bdev_shared_resource *shared_resource) 1054 { 1055 assert(bdev_ch->io_outstanding > 0); 1056 assert(shared_resource->io_outstanding > 0); 1057 bdev_ch->io_outstanding--; 1058 shared_resource->io_outstanding--; 1059 } 1060 1061 static void 1062 bdev_io_submit_sequence_cb(void *ctx, int status) 1063 { 1064 struct spdk_bdev_io *bdev_io = ctx; 1065 1066 assert(bdev_io_use_accel_sequence(bdev_io)); 1067 1068 bdev_io->u.bdev.accel_sequence = NULL; 1069 bdev_io->internal.f.has_accel_sequence = false; 1070 1071 if (spdk_unlikely(status != 0)) { 1072 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 1073 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1074 bdev_io_complete_unsubmitted(bdev_io); 1075 return; 1076 } 1077 1078 bdev_io_submit(bdev_io); 1079 } 1080 1081 static void 1082 bdev_io_exec_sequence_cb(void *ctx, int status) 1083 { 1084 struct spdk_bdev_io *bdev_io = ctx; 1085 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1086 1087 TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1088 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1089 1090 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1091 bdev_ch_retry_io(ch); 1092 } 1093 1094 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1095 } 1096 1097 static void 1098 bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status)) 1099 { 1100 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1101 1102 assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1103 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1104 assert(bdev_io_use_accel_sequence(bdev_io)); 1105 1106 /* Since the operations are appended during submission, they're in the opposite order than 1107 * how we want to execute them for reads (i.e. we need to execute the most recently added 1108 * operation first), so reverse the sequence before executing it. 
1109 */ 1110 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1111 spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence); 1112 } 1113 1114 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1115 bdev_io_increment_outstanding(ch, ch->shared_resource); 1116 bdev_io->internal.data_transfer_cpl = cb_fn; 1117 1118 spdk_accel_sequence_finish(bdev_io->internal.accel_sequence, 1119 bdev_io_exec_sequence_cb, bdev_io); 1120 } 1121 1122 static void 1123 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 1124 { 1125 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 1126 void *buf; 1127 1128 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1129 buf = bdev_io->internal.buf.ptr; 1130 bdev_io->internal.buf.ptr = NULL; 1131 bdev_io->internal.f.has_buf = false; 1132 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 1133 bdev_io->internal.get_aux_buf_cb = NULL; 1134 } else { 1135 assert(bdev_io->internal.get_buf_cb != NULL); 1136 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 1137 bdev_io->internal.get_buf_cb = NULL; 1138 } 1139 } 1140 1141 static void 1142 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 1143 { 1144 struct spdk_bdev_io *bdev_io = ctx; 1145 1146 if (rc) { 1147 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 1148 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1149 } 1150 bdev_io_get_buf_complete(bdev_io, !rc); 1151 } 1152 1153 static void 1154 bdev_io_pull_md_buf_done(void *ctx, int status) 1155 { 1156 struct spdk_bdev_io *bdev_io = ctx; 1157 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1158 1159 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1160 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1161 1162 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1163 bdev_ch_retry_io(ch); 1164 } 1165 1166 assert(bdev_io->internal.data_transfer_cpl); 1167 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1168 } 1169 1170 static void 1171 bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io) 1172 { 1173 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1174 int rc = 0; 1175 1176 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1177 if (bdev_io_use_memory_domain(bdev_io)) { 1178 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1179 bdev_io_increment_outstanding(ch, ch->shared_resource); 1180 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1181 bdev_io->internal.memory_domain_ctx, 1182 &bdev_io->internal.orig_md_iov, 1, 1183 &bdev_io->internal.bounce_md_iov, 1, 1184 bdev_io_pull_md_buf_done, bdev_io); 1185 if (rc == 0) { 1186 /* Continue to submit IO in completion callback */ 1187 return; 1188 } 1189 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1190 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1191 if (rc != -ENOMEM) { 1192 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 1193 spdk_memory_domain_get_dma_device_id( 1194 bdev_io->internal.memory_domain), rc); 1195 } 1196 } else { 1197 memcpy(bdev_io->internal.bounce_md_iov.iov_base, 1198 bdev_io->internal.orig_md_iov.iov_base, 1199 bdev_io->internal.orig_md_iov.iov_len); 1200 } 1201 } 1202 1203 if (spdk_unlikely(rc == -ENOMEM)) { 1204 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD); 1205 } else { 1206 assert(bdev_io->internal.data_transfer_cpl); 1207 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1208 } 1209 } 1210 1211 static void 1212 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io 
*bdev_io, void *md_buf, size_t len) 1213 { 1214 /* save original md_buf */ 1215 bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 1216 bdev_io->internal.orig_md_iov.iov_len = len; 1217 bdev_io->internal.bounce_md_iov.iov_base = md_buf; 1218 bdev_io->internal.bounce_md_iov.iov_len = len; 1219 /* set bounce md_buf */ 1220 bdev_io->u.bdev.md_buf = md_buf; 1221 1222 bdev_io_pull_md_buf(bdev_io); 1223 } 1224 1225 static void 1226 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 1227 { 1228 struct spdk_bdev *bdev = bdev_io->bdev; 1229 uint64_t md_len; 1230 void *buf; 1231 1232 if (spdk_bdev_is_md_separate(bdev)) { 1233 assert(!bdev_io_use_accel_sequence(bdev_io)); 1234 1235 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 1236 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 1237 1238 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 1239 1240 if (bdev_io->u.bdev.md_buf != NULL) { 1241 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 1242 return; 1243 } else { 1244 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 1245 } 1246 } 1247 1248 bdev_io_get_buf_complete(bdev_io, true); 1249 } 1250 1251 static inline void 1252 bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc) 1253 { 1254 if (rc) { 1255 SPDK_ERRLOG("Failed to get data buffer\n"); 1256 assert(bdev_io->internal.data_transfer_cpl); 1257 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1258 return; 1259 } 1260 1261 _bdev_io_set_md_buf(bdev_io); 1262 } 1263 1264 static void 1265 bdev_io_pull_data_done_and_track(void *ctx, int status) 1266 { 1267 struct spdk_bdev_io *bdev_io = ctx; 1268 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1269 1270 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1271 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1272 1273 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1274 bdev_ch_retry_io(ch); 1275 } 1276 1277 bdev_io_pull_data_done(bdev_io, status); 1278 } 1279 1280 static void 1281 bdev_io_pull_data(struct spdk_bdev_io *bdev_io) 1282 { 1283 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1284 int rc = 0; 1285 1286 /* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a 1287 * sequence, append a copy operation making accel change the src/dst buffers of the previous 1288 * operation */ 1289 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io) || 1290 (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) { 1291 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1292 assert(bdev_io_use_accel_sequence(bdev_io)); 1293 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1294 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1295 NULL, NULL, 1296 bdev_io->internal.orig_iovs, 1297 bdev_io->internal.orig_iovcnt, 1298 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 1299 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL, 1300 NULL, NULL); 1301 } else { 1302 /* We need to reverse the src/dst for reads */ 1303 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1304 assert(bdev_io_use_accel_sequence(bdev_io)); 1305 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1306 bdev_io->internal.orig_iovs, 1307 bdev_io->internal.orig_iovcnt, 1308 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 1309 bdev_io_use_memory_domain(bdev_io) ? 
bdev_io->internal.memory_domain_ctx : NULL, 1310 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1311 NULL, NULL, NULL, NULL); 1312 } 1313 1314 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 1315 SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n", 1316 bdev_io->internal.accel_sequence); 1317 } 1318 } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1319 /* if this is write path, copy data from original buffer to bounce buffer */ 1320 if (bdev_io_use_memory_domain(bdev_io)) { 1321 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1322 bdev_io_increment_outstanding(ch, ch->shared_resource); 1323 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1324 bdev_io->internal.memory_domain_ctx, 1325 bdev_io->internal.orig_iovs, 1326 (uint32_t) bdev_io->internal.orig_iovcnt, 1327 bdev_io->u.bdev.iovs, 1, 1328 bdev_io_pull_data_done_and_track, 1329 bdev_io); 1330 if (rc == 0) { 1331 /* Continue to submit IO in completion callback */ 1332 return; 1333 } 1334 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1335 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1336 if (rc != -ENOMEM) { 1337 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 1338 spdk_memory_domain_get_dma_device_id( 1339 bdev_io->internal.memory_domain)); 1340 } 1341 } else { 1342 assert(bdev_io->u.bdev.iovcnt == 1); 1343 spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base, 1344 bdev_io->u.bdev.iovs[0].iov_len, 1345 bdev_io->internal.orig_iovs, 1346 bdev_io->internal.orig_iovcnt); 1347 } 1348 } 1349 1350 if (spdk_unlikely(rc == -ENOMEM)) { 1351 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1352 } else { 1353 bdev_io_pull_data_done(bdev_io, rc); 1354 } 1355 } 1356 1357 static void 1358 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 1359 bdev_copy_bounce_buffer_cpl cpl_cb) 1360 { 1361 struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource; 1362 1363 bdev_io->internal.data_transfer_cpl = cpl_cb; 1364 /* save original iovec */ 1365 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 1366 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 1367 /* set bounce iov */ 1368 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 1369 bdev_io->u.bdev.iovcnt = 1; 1370 /* set bounce buffer for this operation */ 1371 bdev_io->u.bdev.iovs[0].iov_base = buf; 1372 bdev_io->u.bdev.iovs[0].iov_len = len; 1373 /* Now we use 1 iov, the split condition could have been changed */ 1374 bdev_io->internal.f.split = bdev_io_should_split(bdev_io); 1375 1376 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1377 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1378 } else { 1379 bdev_io_pull_data(bdev_io); 1380 } 1381 } 1382 1383 static void 1384 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) 1385 { 1386 struct spdk_bdev *bdev = bdev_io->bdev; 1387 bool buf_allocated; 1388 uint64_t alignment; 1389 void *aligned_buf; 1390 1391 bdev_io->internal.buf.ptr = buf; 1392 bdev_io->internal.f.has_buf = true; 1393 1394 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1395 bdev_io_get_buf_complete(bdev_io, true); 1396 return; 1397 } 1398 1399 alignment = spdk_bdev_get_buf_align(bdev); 1400 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 1401 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 1402 1403 if (buf_allocated) { 1404 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, 
len, _bdev_io_pull_buffer_cpl); 1405 /* Continue in completion callback */ 1406 return; 1407 } else { 1408 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 1409 } 1410 1411 _bdev_io_set_md_buf(bdev_io); 1412 } 1413 1414 static inline uint64_t 1415 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len) 1416 { 1417 struct spdk_bdev *bdev = bdev_io->bdev; 1418 uint64_t md_len, alignment; 1419 1420 md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1421 1422 /* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */ 1423 alignment = spdk_bdev_get_buf_align(bdev) - 1; 1424 1425 return len + alignment + md_len; 1426 } 1427 1428 static void 1429 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 1430 { 1431 struct spdk_bdev_mgmt_channel *ch; 1432 1433 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1434 spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len)); 1435 } 1436 1437 static void 1438 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 1439 { 1440 assert(bdev_io->internal.f.has_buf); 1441 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf.ptr, bdev_io->internal.buf.len); 1442 bdev_io->internal.buf.ptr = NULL; 1443 bdev_io->internal.f.has_buf = false; 1444 } 1445 1446 void 1447 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1448 { 1449 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1450 1451 assert(buf != NULL); 1452 _bdev_io_put_buf(bdev_io, buf, len); 1453 } 1454 1455 static inline void 1456 bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch, 1457 struct spdk_bdev_io *bdev_io) 1458 { 1459 /* After a request is submitted to a bdev module, the ownership of an accel sequence 1460 * associated with that bdev_io is transferred to the bdev module. So, clear the internal 1461 * sequence pointer to make sure we won't touch it anymore. */ 1462 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || 1463 bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) { 1464 assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1465 bdev_io->internal.f.has_accel_sequence = false; 1466 } 1467 1468 bdev->fn_table->submit_request(ioch, bdev_io); 1469 } 1470 1471 static inline void 1472 bdev_ch_resubmit_io(struct spdk_bdev_shared_resource *shared_resource, struct spdk_bdev_io *bdev_io) 1473 { 1474 struct spdk_bdev *bdev = bdev_io->bdev; 1475 1476 bdev_io_increment_outstanding(bdev_io->internal.ch, shared_resource); 1477 bdev_io->internal.error.nvme.cdw0 = 0; 1478 bdev_io->num_retries++; 1479 bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 1480 } 1481 1482 static void 1483 bdev_shared_ch_retry_io(struct spdk_bdev_shared_resource *shared_resource) 1484 { 1485 struct spdk_bdev_io *bdev_io; 1486 1487 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 1488 /* 1489 * Allow some more I/O to complete before retrying the nomem_io queue. 1490 * Some drivers (such as nvme) cannot immediately take a new I/O in 1491 * the context of a completion, because the resources for the I/O are 1492 * not released until control returns to the bdev poller. Also, we 1493 * may require several small I/O to complete before a larger I/O 1494 * (that requires splitting) can be submitted. 
1495 */ 1496 return; 1497 } 1498 1499 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1500 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 1501 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 1502 1503 switch (bdev_io->internal.retry_state) { 1504 case BDEV_IO_RETRY_STATE_SUBMIT: 1505 bdev_ch_resubmit_io(shared_resource, bdev_io); 1506 break; 1507 case BDEV_IO_RETRY_STATE_PULL: 1508 bdev_io_pull_data(bdev_io); 1509 break; 1510 case BDEV_IO_RETRY_STATE_PULL_MD: 1511 bdev_io_pull_md_buf(bdev_io); 1512 break; 1513 case BDEV_IO_RETRY_STATE_PUSH: 1514 bdev_io_push_bounce_data(bdev_io); 1515 break; 1516 case BDEV_IO_RETRY_STATE_PUSH_MD: 1517 bdev_io_push_bounce_md_buf(bdev_io); 1518 break; 1519 default: 1520 assert(0 && "invalid retry state"); 1521 break; 1522 } 1523 1524 if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) { 1525 /* This IO completed again with NOMEM status, so break the loop and 1526 * don't try anymore. Note that a bdev_io that fails with NOMEM 1527 * always gets requeued at the front of the list, to maintain 1528 * ordering. 1529 */ 1530 break; 1531 } 1532 } 1533 } 1534 1535 static void 1536 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 1537 { 1538 bdev_shared_ch_retry_io(bdev_ch->shared_resource); 1539 } 1540 1541 static int 1542 bdev_no_mem_poller(void *ctx) 1543 { 1544 struct spdk_bdev_shared_resource *shared_resource = ctx; 1545 1546 spdk_poller_unregister(&shared_resource->nomem_poller); 1547 1548 if (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1549 bdev_shared_ch_retry_io(shared_resource); 1550 } 1551 /* the retry cb may re-register the poller so double check */ 1552 if (!TAILQ_EMPTY(&shared_resource->nomem_io) && 1553 shared_resource->io_outstanding == 0 && shared_resource->nomem_poller == NULL) { 1554 /* No IOs were submitted, try again */ 1555 shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource, 1556 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10); 1557 } 1558 1559 return SPDK_POLLER_BUSY; 1560 } 1561 1562 static inline bool 1563 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 1564 { 1565 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1566 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1567 1568 if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) { 1569 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1570 bdev_queue_nomem_io_head(shared_resource, bdev_io, state); 1571 1572 if (shared_resource->io_outstanding == 0 && !shared_resource->nomem_poller) { 1573 /* Special case when we have nomem IOs and no outstanding IOs which completions 1574 * could trigger retry of queued IOs 1575 * Any IOs submitted may trigger retry of queued IOs. This poller handles a case when no 1576 * new IOs submitted, e.g. qd==1 */ 1577 shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource, 1578 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10); 1579 } 1580 /* If bdev module completed an I/O that has an accel sequence with NOMEM status, the 1581 * ownership of that sequence is transferred back to the bdev layer, so we need to 1582 * restore internal.accel_sequence to make sure that the sequence is handled 1583 * correctly in case the I/O is later aborted. 
*/ 1584 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 1585 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) { 1586 assert(!bdev_io_use_accel_sequence(bdev_io)); 1587 bdev_io->internal.f.has_accel_sequence = true; 1588 bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence; 1589 } 1590 1591 return true; 1592 } 1593 1594 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1595 bdev_ch_retry_io(bdev_ch); 1596 } 1597 1598 return false; 1599 } 1600 1601 static void 1602 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1603 { 1604 struct spdk_bdev_io *bdev_io = ctx; 1605 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1606 1607 if (rc) { 1608 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1609 } 1610 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1611 * to waiting for the conditional free of internal.buf.ptr in spdk_bdev_free_io()). 1612 */ 1613 bdev_io_put_buf(bdev_io); 1614 1615 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1616 bdev_ch_retry_io(ch); 1617 } 1618 1619 /* Continue with IO completion flow */ 1620 bdev_io_complete(bdev_io); 1621 } 1622 1623 static void 1624 bdev_io_push_bounce_md_buf_done(void *ctx, int rc) 1625 { 1626 struct spdk_bdev_io *bdev_io = ctx; 1627 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1628 1629 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1630 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1631 1632 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1633 bdev_ch_retry_io(ch); 1634 } 1635 1636 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1637 } 1638 1639 static inline void 1640 bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io) 1641 { 1642 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1643 int rc = 0; 1644 1645 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1646 /* do the same for metadata buffer */ 1647 if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) { 1648 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1649 1650 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1651 if (bdev_io_use_memory_domain(bdev_io)) { 1652 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1653 bdev_io_increment_outstanding(ch, ch->shared_resource); 1654 /* If memory domain is used then we need to call async push function */ 1655 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1656 bdev_io->internal.memory_domain_ctx, 1657 &bdev_io->internal.orig_md_iov, 1658 (uint32_t)bdev_io->internal.orig_iovcnt, 1659 &bdev_io->internal.bounce_md_iov, 1, 1660 bdev_io_push_bounce_md_buf_done, 1661 bdev_io); 1662 if (rc == 0) { 1663 /* Continue IO completion in async callback */ 1664 return; 1665 } 1666 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1667 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1668 if (rc != -ENOMEM) { 1669 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1670 spdk_memory_domain_get_dma_device_id( 1671 bdev_io->internal.memory_domain)); 1672 } 1673 } else { 1674 memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1675 bdev_io->internal.orig_md_iov.iov_len); 1676 } 1677 } 1678 } 1679 1680 if (spdk_unlikely(rc == -ENOMEM)) { 1681 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD); 1682 } else { 1683 assert(bdev_io->internal.data_transfer_cpl); 1684 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1685 } 1686 } 1687 1688 static inline void 
1689 bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc) 1690 { 1691 assert(bdev_io->internal.data_transfer_cpl); 1692 if (rc) { 1693 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1694 return; 1695 } 1696 1697 /* set original buffer for this io */ 1698 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 1699 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 1700 /* disable bouncing buffer for this io */ 1701 bdev_io->internal.orig_iovcnt = 0; 1702 bdev_io->internal.orig_iovs = NULL; 1703 1704 bdev_io_push_bounce_md_buf(bdev_io); 1705 } 1706 1707 static void 1708 bdev_io_push_bounce_data_done_and_track(void *ctx, int status) 1709 { 1710 struct spdk_bdev_io *bdev_io = ctx; 1711 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1712 1713 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1714 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1715 1716 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1717 bdev_ch_retry_io(ch); 1718 } 1719 1720 bdev_io_push_bounce_data_done(bdev_io, status); 1721 } 1722 1723 static inline void 1724 bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io) 1725 { 1726 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1727 int rc = 0; 1728 1729 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1730 assert(!bdev_io_use_accel_sequence(bdev_io)); 1731 1732 /* if this is read path, copy data from bounce buffer to original buffer */ 1733 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1734 if (bdev_io_use_memory_domain(bdev_io)) { 1735 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1736 bdev_io_increment_outstanding(ch, ch->shared_resource); 1737 /* If memory domain is used then we need to call async push function */ 1738 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1739 bdev_io->internal.memory_domain_ctx, 1740 bdev_io->internal.orig_iovs, 1741 (uint32_t)bdev_io->internal.orig_iovcnt, 1742 &bdev_io->internal.bounce_iov, 1, 1743 bdev_io_push_bounce_data_done_and_track, 1744 bdev_io); 1745 if (rc == 0) { 1746 /* Continue IO completion in async callback */ 1747 return; 1748 } 1749 1750 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1751 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1752 if (rc != -ENOMEM) { 1753 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1754 spdk_memory_domain_get_dma_device_id( 1755 bdev_io->internal.memory_domain)); 1756 } 1757 } else { 1758 spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs, 1759 bdev_io->internal.orig_iovcnt, 1760 bdev_io->internal.bounce_iov.iov_base, 1761 bdev_io->internal.bounce_iov.iov_len); 1762 } 1763 } 1764 1765 if (spdk_unlikely(rc == -ENOMEM)) { 1766 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH); 1767 } else { 1768 bdev_io_push_bounce_data_done(bdev_io, rc); 1769 } 1770 } 1771 1772 static inline void 1773 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1774 { 1775 bdev_io->internal.data_transfer_cpl = cpl_cb; 1776 bdev_io_push_bounce_data(bdev_io); 1777 } 1778 1779 static void 1780 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1781 { 1782 struct spdk_bdev_io *bdev_io; 1783 1784 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1785 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf.len); 1786 } 1787 1788 static void 1789 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1790 { 1791 struct spdk_bdev_mgmt_channel *mgmt_ch; 1792 uint64_t 
max_len; 1793 void *buf; 1794 1795 assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1796 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1797 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1798 1799 if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) { 1800 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1801 bdev_io_get_buf_complete(bdev_io, false); 1802 return; 1803 } 1804 1805 bdev_io->internal.buf.len = len; 1806 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1807 bdev_io_get_iobuf_cb); 1808 if (buf != NULL) { 1809 _bdev_io_set_buf(bdev_io, buf, len); 1810 } 1811 } 1812 1813 void 1814 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1815 { 1816 struct spdk_bdev *bdev = bdev_io->bdev; 1817 uint64_t alignment; 1818 1819 assert(cb != NULL); 1820 bdev_io->internal.get_buf_cb = cb; 1821 1822 alignment = spdk_bdev_get_buf_align(bdev); 1823 1824 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1825 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1826 /* Buffer already present and aligned */ 1827 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1828 return; 1829 } 1830 1831 bdev_io_get_buf(bdev_io, len); 1832 } 1833 1834 static void 1835 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1836 bool success) 1837 { 1838 if (!success) { 1839 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1840 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1841 bdev_io_complete_unsubmitted(bdev_io); 1842 return; 1843 } 1844 1845 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 1846 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1847 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 1848 return; 1849 } 1850 /* For reads we'll execute the sequence after the data is read, so, for now, only 1851 * clear out accel_sequence pointer and submit the IO */ 1852 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1853 bdev_io->u.bdev.accel_sequence = NULL; 1854 } 1855 1856 bdev_io_submit(bdev_io); 1857 } 1858 1859 static void 1860 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1861 uint64_t len) 1862 { 1863 assert(cb != NULL); 1864 bdev_io->internal.get_buf_cb = cb; 1865 1866 bdev_io_get_buf(bdev_io, len); 1867 } 1868 1869 void 1870 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1871 { 1872 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1873 1874 assert(cb != NULL); 1875 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1876 bdev_io->internal.get_aux_buf_cb = cb; 1877 bdev_io_get_buf(bdev_io, len); 1878 } 1879 1880 static int 1881 bdev_module_get_max_ctx_size(void) 1882 { 1883 struct spdk_bdev_module *bdev_module; 1884 int max_bdev_module_size = 0; 1885 1886 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1887 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1888 max_bdev_module_size = bdev_module->get_ctx_size(); 1889 } 1890 } 1891 1892 return max_bdev_module_size; 1893 } 1894 1895 static void 1896 bdev_enable_histogram_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1897 { 1898 if (!bdev->internal.histogram_enabled) { 1899 return; 1900 } 1901 1902 spdk_json_write_object_begin(w); 1903 spdk_json_write_named_string(w, "method", "bdev_enable_histogram"); 1904 1905 spdk_json_write_named_object_begin(w, 
"params"); 1906 spdk_json_write_named_string(w, "name", bdev->name); 1907 1908 spdk_json_write_named_bool(w, "enable", bdev->internal.histogram_enabled); 1909 1910 if (bdev->internal.histogram_io_type) { 1911 spdk_json_write_named_string(w, "opc", 1912 spdk_bdev_get_io_type_name(bdev->internal.histogram_io_type)); 1913 } 1914 1915 spdk_json_write_object_end(w); 1916 1917 spdk_json_write_object_end(w); 1918 } 1919 1920 static void 1921 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1922 { 1923 int i; 1924 struct spdk_bdev_qos *qos = bdev->internal.qos; 1925 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1926 1927 if (!qos) { 1928 return; 1929 } 1930 1931 spdk_bdev_get_qos_rate_limits(bdev, limits); 1932 1933 spdk_json_write_object_begin(w); 1934 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1935 1936 spdk_json_write_named_object_begin(w, "params"); 1937 spdk_json_write_named_string(w, "name", bdev->name); 1938 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1939 if (limits[i] > 0) { 1940 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1941 } 1942 } 1943 spdk_json_write_object_end(w); 1944 1945 spdk_json_write_object_end(w); 1946 } 1947 1948 void 1949 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1950 { 1951 struct spdk_bdev_module *bdev_module; 1952 struct spdk_bdev *bdev; 1953 1954 assert(w != NULL); 1955 1956 spdk_json_write_array_begin(w); 1957 1958 spdk_json_write_object_begin(w); 1959 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1960 spdk_json_write_named_object_begin(w, "params"); 1961 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1962 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1963 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1964 spdk_json_write_named_uint32(w, "iobuf_small_cache_size", g_bdev_opts.iobuf_small_cache_size); 1965 spdk_json_write_named_uint32(w, "iobuf_large_cache_size", g_bdev_opts.iobuf_large_cache_size); 1966 spdk_json_write_object_end(w); 1967 spdk_json_write_object_end(w); 1968 1969 bdev_examine_allowlist_config_json(w); 1970 1971 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1972 if (bdev_module->config_json) { 1973 bdev_module->config_json(w); 1974 } 1975 } 1976 1977 spdk_spin_lock(&g_bdev_mgr.spinlock); 1978 1979 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1980 if (bdev->fn_table->write_config_json) { 1981 bdev->fn_table->write_config_json(bdev, w); 1982 } 1983 1984 bdev_qos_config_json(bdev, w); 1985 bdev_enable_histogram_config_json(bdev, w); 1986 } 1987 1988 spdk_spin_unlock(&g_bdev_mgr.spinlock); 1989 1990 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1991 spdk_json_write_object_begin(w); 1992 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1993 spdk_json_write_object_end(w); 1994 1995 spdk_json_write_array_end(w); 1996 } 1997 1998 static void 1999 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 2000 { 2001 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 2002 struct spdk_bdev_io *bdev_io; 2003 2004 spdk_iobuf_channel_fini(&ch->iobuf); 2005 2006 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 2007 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2008 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2009 ch->per_thread_cache_count--; 2010 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2011 } 2012 2013 
assert(ch->per_thread_cache_count == 0); 2014 } 2015 2016 static int 2017 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 2018 { 2019 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 2020 struct spdk_bdev_io *bdev_io; 2021 uint32_t i; 2022 int rc; 2023 2024 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", 2025 g_bdev_opts.iobuf_small_cache_size, 2026 g_bdev_opts.iobuf_large_cache_size); 2027 if (rc != 0) { 2028 SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 2029 return -1; 2030 } 2031 2032 STAILQ_INIT(&ch->per_thread_cache); 2033 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 2034 2035 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 2036 ch->per_thread_cache_count = 0; 2037 for (i = 0; i < ch->bdev_io_cache_size; i++) { 2038 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2039 if (bdev_io == NULL) { 2040 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 2041 assert(false); 2042 bdev_mgmt_channel_destroy(io_device, ctx_buf); 2043 return -1; 2044 } 2045 ch->per_thread_cache_count++; 2046 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2047 } 2048 2049 TAILQ_INIT(&ch->shared_resources); 2050 TAILQ_INIT(&ch->io_wait_queue); 2051 2052 return 0; 2053 } 2054 2055 static void 2056 bdev_init_complete(int rc) 2057 { 2058 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 2059 void *cb_arg = g_init_cb_arg; 2060 struct spdk_bdev_module *m; 2061 2062 g_bdev_mgr.init_complete = true; 2063 g_init_cb_fn = NULL; 2064 g_init_cb_arg = NULL; 2065 2066 /* 2067 * For modules that need to know when subsystem init is complete, 2068 * inform them now. 2069 */ 2070 if (rc == 0) { 2071 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2072 if (m->init_complete) { 2073 m->init_complete(); 2074 } 2075 } 2076 } 2077 2078 cb_fn(cb_arg, rc); 2079 } 2080 2081 static bool 2082 bdev_module_all_actions_completed(void) 2083 { 2084 struct spdk_bdev_module *m; 2085 2086 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2087 if (m->internal.action_in_progress > 0) { 2088 return false; 2089 } 2090 } 2091 return true; 2092 } 2093 2094 static void 2095 bdev_module_action_complete(void) 2096 { 2097 /* 2098 * Don't finish bdev subsystem initialization if 2099 * module pre-initialization is still in progress, or 2100 * the subsystem has already been initialized. 2101 */ 2102 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 2103 return; 2104 } 2105 2106 /* 2107 * Check all bdev modules for inits/examinations in progress. If any 2108 * exist, return immediately since we cannot finish bdev subsystem 2109 * initialization until all are completed. 2110 */ 2111 if (!bdev_module_all_actions_completed()) { 2112 return; 2113 } 2114 2115 /* 2116 * Modules already finished initialization - now that all 2117 * the bdev modules have finished their asynchronous I/O 2118 * processing, the entire bdev layer can be marked as complete.
2119 */ 2120 bdev_init_complete(0); 2121 } 2122 2123 static void 2124 bdev_module_action_done(struct spdk_bdev_module *module) 2125 { 2126 spdk_spin_lock(&module->internal.spinlock); 2127 assert(module->internal.action_in_progress > 0); 2128 module->internal.action_in_progress--; 2129 spdk_spin_unlock(&module->internal.spinlock); 2130 bdev_module_action_complete(); 2131 } 2132 2133 void 2134 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 2135 { 2136 assert(module->async_init); 2137 bdev_module_action_done(module); 2138 } 2139 2140 void 2141 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 2142 { 2143 bdev_module_action_done(module); 2144 } 2145 2146 /** The last initialized bdev module */ 2147 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 2148 2149 static void 2150 bdev_init_failed(void *cb_arg) 2151 { 2152 struct spdk_bdev_module *module = cb_arg; 2153 2154 spdk_spin_lock(&module->internal.spinlock); 2155 assert(module->internal.action_in_progress > 0); 2156 module->internal.action_in_progress--; 2157 spdk_spin_unlock(&module->internal.spinlock); 2158 bdev_init_complete(-1); 2159 } 2160 2161 static int 2162 bdev_modules_init(void) 2163 { 2164 struct spdk_bdev_module *module; 2165 int rc = 0; 2166 2167 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2168 g_resume_bdev_module = module; 2169 if (module->async_init) { 2170 spdk_spin_lock(&module->internal.spinlock); 2171 module->internal.action_in_progress = 1; 2172 spdk_spin_unlock(&module->internal.spinlock); 2173 } 2174 rc = module->module_init(); 2175 if (rc != 0) { 2176 /* Bump action_in_progress to prevent other modules from completion of modules_init 2177 * Send message to defer application shutdown until resources are cleaned up */ 2178 spdk_spin_lock(&module->internal.spinlock); 2179 module->internal.action_in_progress = 1; 2180 spdk_spin_unlock(&module->internal.spinlock); 2181 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 2182 return rc; 2183 } 2184 } 2185 2186 g_resume_bdev_module = NULL; 2187 return 0; 2188 } 2189 2190 void 2191 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 2192 { 2193 int rc = 0; 2194 char mempool_name[32]; 2195 2196 assert(cb_fn != NULL); 2197 2198 g_init_cb_fn = cb_fn; 2199 g_init_cb_arg = cb_arg; 2200 2201 spdk_notify_type_register("bdev_register"); 2202 spdk_notify_type_register("bdev_unregister"); 2203 2204 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2205 2206 rc = spdk_iobuf_register_module("bdev"); 2207 if (rc != 0) { 2208 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2209 bdev_init_complete(-1); 2210 return; 2211 } 2212 2213 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2214 g_bdev_opts.bdev_io_pool_size, 2215 sizeof(struct spdk_bdev_io) + 2216 bdev_module_get_max_ctx_size(), 2217 0, 2218 SPDK_ENV_NUMA_ID_ANY); 2219 2220 if (g_bdev_mgr.bdev_io_pool == NULL) { 2221 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2222 bdev_init_complete(-1); 2223 return; 2224 } 2225 2226 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2227 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2228 if (!g_bdev_mgr.zero_buffer) { 2229 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2230 bdev_init_complete(-1); 2231 return; 2232 } 2233 2234 #ifdef SPDK_CONFIG_VTUNE 2235 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2236 #endif 2237 2238 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 2239 
bdev_mgmt_channel_destroy, 2240 sizeof(struct spdk_bdev_mgmt_channel), 2241 "bdev_mgr"); 2242 2243 rc = bdev_modules_init(); 2244 g_bdev_mgr.module_init_complete = true; 2245 if (rc != 0) { 2246 SPDK_ERRLOG("bdev modules init failed\n"); 2247 return; 2248 } 2249 2250 bdev_module_action_complete(); 2251 } 2252 2253 static void 2254 bdev_mgr_unregister_cb(void *io_device) 2255 { 2256 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2257 2258 if (g_bdev_mgr.bdev_io_pool) { 2259 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2260 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2261 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2262 g_bdev_opts.bdev_io_pool_size); 2263 } 2264 2265 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2266 } 2267 2268 spdk_free(g_bdev_mgr.zero_buffer); 2269 2270 bdev_examine_allowlist_free(); 2271 2272 cb_fn(g_fini_cb_arg); 2273 g_fini_cb_fn = NULL; 2274 g_fini_cb_arg = NULL; 2275 g_bdev_mgr.init_complete = false; 2276 g_bdev_mgr.module_init_complete = false; 2277 } 2278 2279 static void 2280 bdev_module_fini_iter(void *arg) 2281 { 2282 struct spdk_bdev_module *bdev_module; 2283 2284 /* FIXME: Handling initialization failures is broken now, 2285 * so we won't even try cleaning up after successfully 2286 * initialized modules. if module_init_complete is false, 2287 * just call spdk_bdev_mgr_unregister_cb 2288 */ 2289 if (!g_bdev_mgr.module_init_complete) { 2290 bdev_mgr_unregister_cb(NULL); 2291 return; 2292 } 2293 2294 /* Start iterating from the last touched module */ 2295 if (!g_resume_bdev_module) { 2296 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2297 } else { 2298 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2299 internal.tailq); 2300 } 2301 2302 while (bdev_module) { 2303 if (bdev_module->async_fini) { 2304 /* Save our place so we can resume later. We must 2305 * save the variable here, before calling module_fini() 2306 * below, because in some cases the module may immediately 2307 * call spdk_bdev_module_fini_done() and re-enter 2308 * this function to continue iterating. */ 2309 g_resume_bdev_module = bdev_module; 2310 } 2311 2312 if (bdev_module->module_fini) { 2313 bdev_module->module_fini(); 2314 } 2315 2316 if (bdev_module->async_fini) { 2317 return; 2318 } 2319 2320 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2321 internal.tailq); 2322 } 2323 2324 g_resume_bdev_module = NULL; 2325 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2326 } 2327 2328 void 2329 spdk_bdev_module_fini_done(void) 2330 { 2331 if (spdk_get_thread() != g_fini_thread) { 2332 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2333 } else { 2334 bdev_module_fini_iter(NULL); 2335 } 2336 } 2337 2338 static void 2339 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2340 { 2341 struct spdk_bdev *bdev = cb_arg; 2342 2343 if (bdeverrno && bdev) { 2344 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2345 bdev->name); 2346 2347 /* 2348 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2349 * bdev; try to continue by manually removing this bdev from the list and continue 2350 * with the next bdev in the list. 
2351 */ 2352 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2353 } 2354 2355 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2356 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2357 /* 2358 * Bdev module finish need to be deferred as we might be in the middle of some context 2359 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2360 * after returning. 2361 */ 2362 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2363 return; 2364 } 2365 2366 /* 2367 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 2368 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2369 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2370 * base bdevs. 2371 * 2372 * Also, walk the list in the reverse order. 2373 */ 2374 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2375 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2376 spdk_spin_lock(&bdev->internal.spinlock); 2377 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2378 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2379 spdk_spin_unlock(&bdev->internal.spinlock); 2380 continue; 2381 } 2382 spdk_spin_unlock(&bdev->internal.spinlock); 2383 2384 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2385 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2386 return; 2387 } 2388 2389 /* 2390 * If any bdev fails to unclaim underlying bdev properly, we may face the 2391 * case of bdev list consisting of claimed bdevs only (if claims are managed 2392 * correctly, this would mean there's a loop in the claims graph which is 2393 * clearly impossible). Warn and unregister last bdev on the list then. 2394 */ 2395 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2396 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2397 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2398 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2399 return; 2400 } 2401 } 2402 2403 static void 2404 bdev_module_fini_start_iter(void *arg) 2405 { 2406 struct spdk_bdev_module *bdev_module; 2407 2408 if (!g_resume_bdev_module) { 2409 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2410 } else { 2411 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2412 } 2413 2414 while (bdev_module) { 2415 if (bdev_module->async_fini_start) { 2416 /* Save our place so we can resume later. We must 2417 * save the variable here, before calling fini_start() 2418 * below, because in some cases the module may immediately 2419 * call spdk_bdev_module_fini_start_done() and re-enter 2420 * this function to continue iterating. 
*/ 2421 g_resume_bdev_module = bdev_module; 2422 } 2423 2424 if (bdev_module->fini_start) { 2425 bdev_module->fini_start(); 2426 } 2427 2428 if (bdev_module->async_fini_start) { 2429 return; 2430 } 2431 2432 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2433 } 2434 2435 g_resume_bdev_module = NULL; 2436 2437 bdev_finish_unregister_bdevs_iter(NULL, 0); 2438 } 2439 2440 void 2441 spdk_bdev_module_fini_start_done(void) 2442 { 2443 if (spdk_get_thread() != g_fini_thread) { 2444 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2445 } else { 2446 bdev_module_fini_start_iter(NULL); 2447 } 2448 } 2449 2450 static void 2451 bdev_finish_wait_for_examine_done(void *cb_arg) 2452 { 2453 bdev_module_fini_start_iter(NULL); 2454 } 2455 2456 static void bdev_open_async_fini(void); 2457 2458 void 2459 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2460 { 2461 int rc; 2462 2463 assert(cb_fn != NULL); 2464 2465 g_fini_thread = spdk_get_thread(); 2466 2467 g_fini_cb_fn = cb_fn; 2468 g_fini_cb_arg = cb_arg; 2469 2470 bdev_open_async_fini(); 2471 2472 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2473 if (rc != 0) { 2474 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2475 bdev_finish_wait_for_examine_done(NULL); 2476 } 2477 } 2478 2479 struct spdk_bdev_io * 2480 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2481 { 2482 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2483 struct spdk_bdev_io *bdev_io; 2484 2485 if (ch->per_thread_cache_count > 0) { 2486 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2487 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2488 ch->per_thread_cache_count--; 2489 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2490 /* 2491 * Don't try to look for bdev_ios in the global pool if there are 2492 * waiters on bdev_ios - we don't want this caller to jump the line. 2493 */ 2494 bdev_io = NULL; 2495 } else { 2496 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2497 } 2498 2499 return bdev_io; 2500 } 2501 2502 void 2503 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2504 { 2505 struct spdk_bdev_mgmt_channel *ch; 2506 2507 assert(bdev_io != NULL); 2508 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2509 2510 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2511 2512 if (bdev_io->internal.f.has_buf) { 2513 bdev_io_put_buf(bdev_io); 2514 } 2515 2516 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2517 ch->per_thread_cache_count++; 2518 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2519 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2520 struct spdk_bdev_io_wait_entry *entry; 2521 2522 entry = TAILQ_FIRST(&ch->io_wait_queue); 2523 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2524 entry->cb_fn(entry->cb_arg); 2525 } 2526 } else { 2527 /* We should never have a full cache with entries on the io wait queue. 
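Waiters are only ever queued while the per-thread cache is empty and the global pool is exhausted, and every free into a not-yet-full cache immediately services the wait queue above, so the wait queue should already be drained by the time the cache fills up.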
*/ 2528 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2529 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2530 } 2531 } 2532 2533 static bool 2534 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2535 { 2536 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2537 2538 switch (limit) { 2539 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2540 return true; 2541 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2542 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2543 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2544 return false; 2545 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2546 default: 2547 return false; 2548 } 2549 } 2550 2551 static bool 2552 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2553 { 2554 switch (bdev_io->type) { 2555 case SPDK_BDEV_IO_TYPE_NVME_IO: 2556 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2557 case SPDK_BDEV_IO_TYPE_READ: 2558 case SPDK_BDEV_IO_TYPE_WRITE: 2559 return true; 2560 case SPDK_BDEV_IO_TYPE_ZCOPY: 2561 if (bdev_io->u.bdev.zcopy.start) { 2562 return true; 2563 } else { 2564 return false; 2565 } 2566 default: 2567 return false; 2568 } 2569 } 2570 2571 static bool 2572 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2573 { 2574 switch (bdev_io->type) { 2575 case SPDK_BDEV_IO_TYPE_NVME_IO: 2576 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2577 /* Bit 1 (0x2) set for read operation */ 2578 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2579 return true; 2580 } else { 2581 return false; 2582 } 2583 case SPDK_BDEV_IO_TYPE_READ: 2584 return true; 2585 case SPDK_BDEV_IO_TYPE_ZCOPY: 2586 /* Populate to read from disk */ 2587 if (bdev_io->u.bdev.zcopy.populate) { 2588 return true; 2589 } else { 2590 return false; 2591 } 2592 default: 2593 return false; 2594 } 2595 } 2596 2597 static uint64_t 2598 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2599 { 2600 struct spdk_bdev *bdev = bdev_io->bdev; 2601 2602 switch (bdev_io->type) { 2603 case SPDK_BDEV_IO_TYPE_NVME_IO: 2604 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2605 return bdev_io->u.nvme_passthru.nbytes; 2606 case SPDK_BDEV_IO_TYPE_READ: 2607 case SPDK_BDEV_IO_TYPE_WRITE: 2608 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2609 case SPDK_BDEV_IO_TYPE_ZCOPY: 2610 /* Track the data in the start phase only */ 2611 if (bdev_io->u.bdev.zcopy.start) { 2612 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2613 } else { 2614 return 0; 2615 } 2616 default: 2617 return 0; 2618 } 2619 } 2620 2621 static inline bool 2622 bdev_qos_rw_queue_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2623 { 2624 int64_t remaining_this_timeslice; 2625 2626 if (!limit->max_per_timeslice) { 2627 /* The QoS is disabled */ 2628 return false; 2629 } 2630 2631 remaining_this_timeslice = __atomic_sub_fetch(&limit->remaining_this_timeslice, delta, 2632 __ATOMIC_RELAXED); 2633 if (remaining_this_timeslice + (int64_t)delta > 0) { 2634 /* There was still a quota for this delta -> the IO shouldn't be queued 2635 * 2636 * We allow a slight quota overrun here so an IO bigger than the per-timeslice 2637 * quota can be allowed once in a while. Such an overrun is then taken into account in 2638 * the QoS poller, where the next timeslice quota is calculated. 2639 */ 2640 return false; 2641 } 2642 2643 /* There was no quota for this delta -> the IO should be queued 2644 * The remaining_this_timeslice must be rewound so it reflects the real 2645 * amount of IOs or bytes allowed.
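* For example, with 100 bytes left in the timeslice a 120-byte I/O drives the counter to -20 but is still admitted (the 20-byte overrun is charged to the next timeslice), whereas with 0 bytes left the same I/O drives it to -120, the check above fails, and the add below restores the counter before the I/O is queued.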
2646 */ 2647 __atomic_add_fetch( 2648 &limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2649 return true; 2650 } 2651 2652 static inline void 2653 bdev_qos_rw_rewind_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2654 { 2655 __atomic_add_fetch(&limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2656 } 2657 2658 static bool 2659 bdev_qos_rw_iops_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2660 { 2661 return bdev_qos_rw_queue_io(limit, io, 1); 2662 } 2663 2664 static void 2665 bdev_qos_rw_iops_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2666 { 2667 bdev_qos_rw_rewind_io(limit, io, 1); 2668 } 2669 2670 static bool 2671 bdev_qos_rw_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2672 { 2673 return bdev_qos_rw_queue_io(limit, io, bdev_get_io_size_in_byte(io)); 2674 } 2675 2676 static void 2677 bdev_qos_rw_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2678 { 2679 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2680 } 2681 2682 static bool 2683 bdev_qos_r_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2684 { 2685 if (bdev_is_read_io(io) == false) { 2686 return false; 2687 } 2688 2689 return bdev_qos_rw_bps_queue(limit, io); 2690 } 2691 2692 static void 2693 bdev_qos_r_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2694 { 2695 if (bdev_is_read_io(io) != false) { 2696 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2697 } 2698 } 2699 2700 static bool 2701 bdev_qos_w_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2702 { 2703 if (bdev_is_read_io(io) == true) { 2704 return false; 2705 } 2706 2707 return bdev_qos_rw_bps_queue(limit, io); 2708 } 2709 2710 static void 2711 bdev_qos_w_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2712 { 2713 if (bdev_is_read_io(io) != true) { 2714 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2715 } 2716 } 2717 2718 static void 2719 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2720 { 2721 int i; 2722 2723 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2724 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2725 qos->rate_limits[i].queue_io = NULL; 2726 continue; 2727 } 2728 2729 switch (i) { 2730 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2731 qos->rate_limits[i].queue_io = bdev_qos_rw_iops_queue; 2732 qos->rate_limits[i].rewind_quota = bdev_qos_rw_iops_rewind_quota; 2733 break; 2734 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2735 qos->rate_limits[i].queue_io = bdev_qos_rw_bps_queue; 2736 qos->rate_limits[i].rewind_quota = bdev_qos_rw_bps_rewind_quota; 2737 break; 2738 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2739 qos->rate_limits[i].queue_io = bdev_qos_r_bps_queue; 2740 qos->rate_limits[i].rewind_quota = bdev_qos_r_bps_rewind_quota; 2741 break; 2742 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2743 qos->rate_limits[i].queue_io = bdev_qos_w_bps_queue; 2744 qos->rate_limits[i].rewind_quota = bdev_qos_w_bps_rewind_quota; 2745 break; 2746 default: 2747 break; 2748 } 2749 } 2750 } 2751 2752 static void 2753 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2754 struct spdk_bdev_io *bdev_io, 2755 enum spdk_bdev_io_status status) 2756 { 2757 bdev_io->internal.in_submit_request = true; 2758 bdev_io_increment_outstanding(bdev_ch, bdev_ch->shared_resource); 2759 spdk_bdev_io_complete(bdev_io, status); 2760 bdev_io->internal.in_submit_request = false; 2761 
} 2762 2763 static inline void 2764 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2765 { 2766 struct spdk_bdev *bdev = bdev_io->bdev; 2767 struct spdk_io_channel *ch = bdev_ch->channel; 2768 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2769 2770 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2771 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2772 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2773 2774 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2775 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2776 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2777 SPDK_BDEV_IO_STATUS_SUCCESS); 2778 return; 2779 } 2780 } 2781 2782 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2783 bdev_io->bdev->split_on_write_unit && 2784 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2785 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2786 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2787 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2788 return; 2789 } 2790 2791 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2792 bdev_io_increment_outstanding(bdev_ch, shared_resource); 2793 bdev_io->internal.in_submit_request = true; 2794 bdev_submit_request(bdev, ch, bdev_io); 2795 bdev_io->internal.in_submit_request = false; 2796 } else { 2797 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT); 2798 if (shared_resource->nomem_threshold == 0 && shared_resource->io_outstanding == 0) { 2799 /* Special case when we have nomem IOs and no outstanding IOs which completions 2800 * could trigger retry of queued IOs */ 2801 bdev_shared_ch_retry_io(shared_resource); 2802 } 2803 } 2804 } 2805 2806 static bool 2807 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2808 { 2809 int i; 2810 2811 if (bdev_qos_io_to_limit(bdev_io) == true) { 2812 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2813 if (!qos->rate_limits[i].queue_io) { 2814 continue; 2815 } 2816 2817 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2818 bdev_io) == true) { 2819 for (i -= 1; i >= 0 ; i--) { 2820 if (!qos->rate_limits[i].queue_io) { 2821 continue; 2822 } 2823 2824 qos->rate_limits[i].rewind_quota(&qos->rate_limits[i], bdev_io); 2825 } 2826 return true; 2827 } 2828 } 2829 } 2830 2831 return false; 2832 } 2833 2834 static int 2835 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2836 { 2837 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2838 int submitted_ios = 0; 2839 2840 TAILQ_FOREACH_SAFE(bdev_io, &ch->qos_queued_io, internal.link, tmp) { 2841 if (!bdev_qos_queue_io(qos, bdev_io)) { 2842 TAILQ_REMOVE(&ch->qos_queued_io, bdev_io, internal.link); 2843 bdev_io_do_submit(ch, bdev_io); 2844 2845 submitted_ios++; 2846 } 2847 } 2848 2849 return submitted_ios; 2850 } 2851 2852 static void 2853 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2854 { 2855 int rc; 2856 2857 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2858 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2859 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2860 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2861 &bdev_io->internal.waitq_entry); 2862 if (rc != 0) { 2863 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2864 bdev_io->internal.status = 
SPDK_BDEV_IO_STATUS_FAILED; 2865 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2866 } 2867 } 2868 2869 static bool 2870 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2871 { 2872 uint32_t io_boundary; 2873 struct spdk_bdev *bdev = bdev_io->bdev; 2874 uint32_t max_segment_size = bdev->max_segment_size; 2875 uint32_t max_size = bdev->max_rw_size; 2876 int max_segs = bdev->max_num_segments; 2877 2878 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2879 io_boundary = bdev->write_unit_size; 2880 } else if (bdev->split_on_optimal_io_boundary) { 2881 io_boundary = bdev->optimal_io_boundary; 2882 } else { 2883 io_boundary = 0; 2884 } 2885 2886 if (spdk_likely(!io_boundary && !max_segs && !max_segment_size && !max_size)) { 2887 return false; 2888 } 2889 2890 if (io_boundary) { 2891 uint64_t start_stripe, end_stripe; 2892 2893 start_stripe = bdev_io->u.bdev.offset_blocks; 2894 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2895 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 2896 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2897 start_stripe >>= spdk_u32log2(io_boundary); 2898 end_stripe >>= spdk_u32log2(io_boundary); 2899 } else { 2900 start_stripe /= io_boundary; 2901 end_stripe /= io_boundary; 2902 } 2903 2904 if (start_stripe != end_stripe) { 2905 return true; 2906 } 2907 } 2908 2909 if (max_segs) { 2910 if (bdev_io->u.bdev.iovcnt > max_segs) { 2911 return true; 2912 } 2913 } 2914 2915 if (max_segment_size) { 2916 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2917 if (bdev_io->u.bdev.iovs[i].iov_len > max_segment_size) { 2918 return true; 2919 } 2920 } 2921 } 2922 2923 if (max_size) { 2924 if (bdev_io->u.bdev.num_blocks > max_size) { 2925 return true; 2926 } 2927 } 2928 2929 return false; 2930 } 2931 2932 static bool 2933 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2934 { 2935 uint32_t num_unmap_segments; 2936 2937 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2938 return false; 2939 } 2940 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2941 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2942 return true; 2943 } 2944 2945 return false; 2946 } 2947 2948 static bool 2949 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2950 { 2951 if (!bdev_io->bdev->max_write_zeroes) { 2952 return false; 2953 } 2954 2955 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2956 return true; 2957 } 2958 2959 return false; 2960 } 2961 2962 static bool 2963 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2964 { 2965 if (bdev_io->bdev->max_copy != 0 && 2966 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2967 return true; 2968 } 2969 2970 return false; 2971 } 2972 2973 static bool 2974 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2975 { 2976 switch (bdev_io->type) { 2977 case SPDK_BDEV_IO_TYPE_READ: 2978 case SPDK_BDEV_IO_TYPE_WRITE: 2979 return bdev_rw_should_split(bdev_io); 2980 case SPDK_BDEV_IO_TYPE_UNMAP: 2981 return bdev_unmap_should_split(bdev_io); 2982 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2983 return bdev_write_zeroes_should_split(bdev_io); 2984 case SPDK_BDEV_IO_TYPE_COPY: 2985 return bdev_copy_should_split(bdev_io); 2986 default: 2987 return false; 2988 } 2989 } 2990 2991 static uint32_t 2992 _to_next_boundary(uint64_t offset, uint32_t boundary) 2993 { 2994 return (boundary - (offset % boundary)); 2995 } 2996 2997 static void 
bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2998 2999 static void _bdev_rw_split(void *_bdev_io); 3000 3001 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 3002 3003 static void 3004 _bdev_unmap_split(void *_bdev_io) 3005 { 3006 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 3007 } 3008 3009 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 3010 3011 static void 3012 _bdev_write_zeroes_split(void *_bdev_io) 3013 { 3014 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 3015 } 3016 3017 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 3018 3019 static void 3020 _bdev_copy_split(void *_bdev_io) 3021 { 3022 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 3023 } 3024 3025 static int 3026 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 3027 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 3028 { 3029 int rc; 3030 uint64_t current_offset, current_remaining, current_src_offset; 3031 spdk_bdev_io_wait_cb io_wait_fn; 3032 3033 current_offset = *offset; 3034 current_remaining = *remaining; 3035 3036 assert(bdev_io->internal.f.split); 3037 3038 bdev_io->internal.split.outstanding++; 3039 3040 io_wait_fn = _bdev_rw_split; 3041 switch (bdev_io->type) { 3042 case SPDK_BDEV_IO_TYPE_READ: 3043 assert(bdev_io->u.bdev.accel_sequence == NULL); 3044 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 3045 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3046 iov, iovcnt, md_buf, current_offset, 3047 num_blocks, 3048 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 3049 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL, 3050 NULL, 3051 bdev_io->u.bdev.dif_check_flags, 3052 bdev_io_split_done, bdev_io); 3053 break; 3054 case SPDK_BDEV_IO_TYPE_WRITE: 3055 assert(bdev_io->u.bdev.accel_sequence == NULL); 3056 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 3057 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3058 iov, iovcnt, md_buf, current_offset, 3059 num_blocks, 3060 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 3061 bdev_io_use_memory_domain(bdev_io) ? 
bdev_io->internal.memory_domain_ctx : NULL, 3062 NULL, 3063 bdev_io->u.bdev.dif_check_flags, 3064 bdev_io->u.bdev.nvme_cdw12.raw, 3065 bdev_io->u.bdev.nvme_cdw13.raw, 3066 bdev_io_split_done, bdev_io); 3067 break; 3068 case SPDK_BDEV_IO_TYPE_UNMAP: 3069 io_wait_fn = _bdev_unmap_split; 3070 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 3071 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3072 current_offset, num_blocks, 3073 bdev_io_split_done, bdev_io); 3074 break; 3075 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3076 io_wait_fn = _bdev_write_zeroes_split; 3077 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 3078 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3079 current_offset, num_blocks, 3080 bdev_io_split_done, bdev_io); 3081 break; 3082 case SPDK_BDEV_IO_TYPE_COPY: 3083 io_wait_fn = _bdev_copy_split; 3084 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 3085 (current_offset - bdev_io->u.bdev.offset_blocks); 3086 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 3087 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3088 current_offset, current_src_offset, num_blocks, 3089 bdev_io_split_done, bdev_io); 3090 break; 3091 default: 3092 assert(false); 3093 rc = -EINVAL; 3094 break; 3095 } 3096 3097 if (rc == 0) { 3098 current_offset += num_blocks; 3099 current_remaining -= num_blocks; 3100 bdev_io->internal.split.current_offset_blocks = current_offset; 3101 bdev_io->internal.split.remaining_num_blocks = current_remaining; 3102 *offset = current_offset; 3103 *remaining = current_remaining; 3104 } else { 3105 bdev_io->internal.split.outstanding--; 3106 if (rc == -ENOMEM) { 3107 if (bdev_io->internal.split.outstanding == 0) { 3108 /* No I/O is outstanding. Hence we should wait here. */ 3109 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 3110 } 3111 } else { 3112 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3113 if (bdev_io->internal.split.outstanding == 0) { 3114 bdev_ch_remove_from_io_submitted(bdev_io); 3115 spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id, 3116 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx, 3117 bdev_io->internal.ch->queue_depth); 3118 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3119 } 3120 } 3121 } 3122 3123 return rc; 3124 } 3125 3126 static void 3127 _bdev_rw_split(void *_bdev_io) 3128 { 3129 struct iovec *parent_iov, *iov; 3130 struct spdk_bdev_io *bdev_io = _bdev_io; 3131 struct spdk_bdev *bdev = bdev_io->bdev; 3132 uint64_t parent_offset, current_offset, remaining; 3133 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 3134 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 3135 uint32_t iovcnt, iov_len, child_iovsize; 3136 uint32_t blocklen = bdev->blocklen; 3137 uint32_t io_boundary; 3138 uint32_t max_segment_size = bdev->max_segment_size; 3139 uint32_t max_child_iovcnt = bdev->max_num_segments; 3140 uint32_t max_size = bdev->max_rw_size; 3141 void *md_buf = NULL; 3142 int rc; 3143 3144 max_size = max_size ? max_size : UINT32_MAX; 3145 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 3146 max_child_iovcnt = max_child_iovcnt ? 
spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 3147 SPDK_BDEV_IO_NUM_CHILD_IOV; 3148 3149 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 3150 io_boundary = bdev->write_unit_size; 3151 } else if (bdev->split_on_optimal_io_boundary) { 3152 io_boundary = bdev->optimal_io_boundary; 3153 } else { 3154 io_boundary = UINT32_MAX; 3155 } 3156 3157 assert(bdev_io->internal.f.split); 3158 3159 remaining = bdev_io->internal.split.remaining_num_blocks; 3160 current_offset = bdev_io->internal.split.current_offset_blocks; 3161 parent_offset = bdev_io->u.bdev.offset_blocks; 3162 parent_iov_offset = (current_offset - parent_offset) * blocklen; 3163 parent_iovcnt = bdev_io->u.bdev.iovcnt; 3164 3165 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 3166 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3167 if (parent_iov_offset < parent_iov->iov_len) { 3168 break; 3169 } 3170 parent_iov_offset -= parent_iov->iov_len; 3171 } 3172 3173 child_iovcnt = 0; 3174 while (remaining > 0 && parent_iovpos < parent_iovcnt && 3175 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 3176 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3177 to_next_boundary = spdk_min(remaining, to_next_boundary); 3178 to_next_boundary = spdk_min(max_size, to_next_boundary); 3179 to_next_boundary_bytes = to_next_boundary * blocklen; 3180 3181 iov = &bdev_io->child_iov[child_iovcnt]; 3182 iovcnt = 0; 3183 3184 if (bdev_io->u.bdev.md_buf) { 3185 md_buf = (char *)bdev_io->u.bdev.md_buf + 3186 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3187 } 3188 3189 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3190 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3191 iovcnt < child_iovsize) { 3192 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3193 iov_len = parent_iov->iov_len - parent_iov_offset; 3194 3195 iov_len = spdk_min(iov_len, max_segment_size); 3196 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 3197 to_next_boundary_bytes -= iov_len; 3198 3199 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 3200 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 3201 3202 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 3203 parent_iov_offset += iov_len; 3204 } else { 3205 parent_iovpos++; 3206 parent_iov_offset = 0; 3207 } 3208 child_iovcnt++; 3209 iovcnt++; 3210 } 3211 3212 if (to_next_boundary_bytes > 0) { 3213 /* We had to stop this child I/O early because we ran out of 3214 * child_iov space or were limited by max_num_segments. 3215 * Ensure the iovs are aligned with the block size and 3216 * then adjust to_next_boundary before starting the 3217 * child I/O.
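* For example, with a 512-byte block size, if the child iovs could only cover three full blocks plus 100 bytes of a fourth block, the trailing 100 bytes are trimmed from the last iov(s) below and to_next_boundary is reduced so this child I/O covers exactly three blocks.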
3218 */ 3219 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || 3220 iovcnt == child_iovsize); 3221 to_last_block_bytes = to_next_boundary_bytes % blocklen; 3222 if (to_last_block_bytes != 0) { 3223 uint32_t child_iovpos = child_iovcnt - 1; 3224 /* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV 3225 * so the loop will naturally end 3226 */ 3227 3228 to_last_block_bytes = blocklen - to_last_block_bytes; 3229 to_next_boundary_bytes += to_last_block_bytes; 3230 while (to_last_block_bytes > 0 && iovcnt > 0) { 3231 iov_len = spdk_min(to_last_block_bytes, 3232 bdev_io->child_iov[child_iovpos].iov_len); 3233 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 3234 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 3235 child_iovpos--; 3236 if (--iovcnt == 0) { 3237 /* If the child IO is less than a block size just return. 3238 * If the first child IO of any split round is less than 3239 * a block size, exit with an error. 3240 */ 3241 if (bdev_io->internal.split.outstanding == 0) { 3242 SPDK_ERRLOG("The first child io was less than a block size\n"); 3243 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3244 bdev_ch_remove_from_io_submitted(bdev_io); 3245 spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id, 3246 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx, 3247 bdev_io->internal.ch->queue_depth); 3248 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3249 } 3250 3251 return; 3252 } 3253 } 3254 3255 to_last_block_bytes -= iov_len; 3256 3257 if (parent_iov_offset == 0) { 3258 parent_iovpos--; 3259 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 3260 } 3261 parent_iov_offset -= iov_len; 3262 } 3263 3264 assert(to_last_block_bytes == 0); 3265 } 3266 to_next_boundary -= to_next_boundary_bytes / blocklen; 3267 } 3268 3269 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 3270 &current_offset, &remaining); 3271 if (spdk_unlikely(rc)) { 3272 return; 3273 } 3274 } 3275 } 3276 3277 static void 3278 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 3279 { 3280 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 3281 uint32_t num_children_reqs = 0; 3282 int rc; 3283 3284 assert(bdev_io->internal.f.split); 3285 3286 offset = bdev_io->internal.split.current_offset_blocks; 3287 remaining = bdev_io->internal.split.remaining_num_blocks; 3288 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 3289 3290 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3291 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 3292 3293 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 3294 &offset, &remaining); 3295 if (spdk_likely(rc == 0)) { 3296 num_children_reqs++; 3297 } else { 3298 return; 3299 } 3300 } 3301 } 3302 3303 static void 3304 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 3305 { 3306 uint64_t offset, write_zeroes_blocks, remaining; 3307 uint32_t num_children_reqs = 0; 3308 int rc; 3309 3310 assert(bdev_io->internal.f.split); 3311 3312 offset = bdev_io->internal.split.current_offset_blocks; 3313 remaining = bdev_io->internal.split.remaining_num_blocks; 3314 3315 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3316 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 3317 3318 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 3319 &offset, &remaining); 3320 if (spdk_likely(rc == 0)) { 3321 num_children_reqs++; 3322 } else {
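/* Child submission failed. bdev_io_split_submit() has already arranged for a retry or for the parent I/O to complete with an error, so do not generate further children here. */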
3323 return; 3324 } 3325 } 3326 } 3327 3328 static void 3329 bdev_copy_split(struct spdk_bdev_io *bdev_io) 3330 { 3331 uint64_t offset, copy_blocks, remaining; 3332 uint32_t num_children_reqs = 0; 3333 int rc; 3334 3335 assert(bdev_io->internal.f.split); 3336 3337 offset = bdev_io->internal.split.current_offset_blocks; 3338 remaining = bdev_io->internal.split.remaining_num_blocks; 3339 3340 assert(bdev_io->bdev->max_copy != 0); 3341 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3342 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3343 3344 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3345 &offset, &remaining); 3346 if (spdk_likely(rc == 0)) { 3347 num_children_reqs++; 3348 } else { 3349 return; 3350 } 3351 } 3352 } 3353 3354 static void 3355 parent_bdev_io_complete(void *ctx, int rc) 3356 { 3357 struct spdk_bdev_io *parent_io = ctx; 3358 3359 if (rc) { 3360 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3361 } 3362 3363 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3364 parent_io->internal.caller_ctx); 3365 } 3366 3367 static void 3368 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3369 { 3370 struct spdk_bdev_io *bdev_io = ctx; 3371 3372 /* u.bdev.accel_sequence should have already been cleared at this point */ 3373 assert(bdev_io->u.bdev.accel_sequence == NULL); 3374 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3375 bdev_io->internal.f.has_accel_sequence = false; 3376 3377 if (spdk_unlikely(status != 0)) { 3378 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3379 } 3380 3381 parent_bdev_io_complete(bdev_io, status); 3382 } 3383 3384 static void 3385 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3386 { 3387 struct spdk_bdev_io *parent_io = cb_arg; 3388 3389 spdk_bdev_free_io(bdev_io); 3390 3391 assert(parent_io->internal.f.split); 3392 3393 if (!success) { 3394 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3395 /* If any child I/O failed, stop further splitting process. */ 3396 parent_io->internal.split.current_offset_blocks += parent_io->internal.split.remaining_num_blocks; 3397 parent_io->internal.split.remaining_num_blocks = 0; 3398 } 3399 parent_io->internal.split.outstanding--; 3400 if (parent_io->internal.split.outstanding != 0) { 3401 return; 3402 } 3403 3404 /* 3405 * Parent I/O finishes when all blocks are consumed. 3406 */ 3407 if (parent_io->internal.split.remaining_num_blocks == 0) { 3408 assert(parent_io->internal.cb != bdev_io_split_done); 3409 bdev_ch_remove_from_io_submitted(parent_io); 3410 spdk_trace_record(TRACE_BDEV_IO_DONE, parent_io->internal.ch->trace_id, 3411 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx, 3412 parent_io->internal.ch->queue_depth); 3413 3414 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3415 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3416 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3417 return; 3418 } else if (parent_io->internal.orig_iovcnt != 0 && 3419 !bdev_io_use_accel_sequence(bdev_io)) { 3420 /* bdev IO will be completed in the callback */ 3421 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3422 return; 3423 } 3424 } 3425 3426 parent_bdev_io_complete(parent_io, 0); 3427 return; 3428 } 3429 3430 /* 3431 * Continue with the splitting process. This function will complete the parent I/O if the 3432 * splitting is done. 
3433 */ 3434 switch (parent_io->type) { 3435 case SPDK_BDEV_IO_TYPE_READ: 3436 case SPDK_BDEV_IO_TYPE_WRITE: 3437 _bdev_rw_split(parent_io); 3438 break; 3439 case SPDK_BDEV_IO_TYPE_UNMAP: 3440 bdev_unmap_split(parent_io); 3441 break; 3442 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3443 bdev_write_zeroes_split(parent_io); 3444 break; 3445 case SPDK_BDEV_IO_TYPE_COPY: 3446 bdev_copy_split(parent_io); 3447 break; 3448 default: 3449 assert(false); 3450 break; 3451 } 3452 } 3453 3454 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3455 bool success); 3456 3457 static void 3458 bdev_io_split(struct spdk_bdev_io *bdev_io) 3459 { 3460 assert(bdev_io_should_split(bdev_io)); 3461 assert(bdev_io->internal.f.split); 3462 3463 bdev_io->internal.split.current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3464 bdev_io->internal.split.remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3465 bdev_io->internal.split.outstanding = 0; 3466 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3467 3468 switch (bdev_io->type) { 3469 case SPDK_BDEV_IO_TYPE_READ: 3470 case SPDK_BDEV_IO_TYPE_WRITE: 3471 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3472 _bdev_rw_split(bdev_io); 3473 } else { 3474 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3475 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3476 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3477 } 3478 break; 3479 case SPDK_BDEV_IO_TYPE_UNMAP: 3480 bdev_unmap_split(bdev_io); 3481 break; 3482 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3483 bdev_write_zeroes_split(bdev_io); 3484 break; 3485 case SPDK_BDEV_IO_TYPE_COPY: 3486 bdev_copy_split(bdev_io); 3487 break; 3488 default: 3489 assert(false); 3490 break; 3491 } 3492 } 3493 3494 static void 3495 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3496 { 3497 if (!success) { 3498 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3499 return; 3500 } 3501 3502 _bdev_rw_split(bdev_io); 3503 } 3504 3505 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 3506 * be inlined, at least on some compilers. 
3507 */ 3508 static inline void 3509 _bdev_io_submit(void *ctx) 3510 { 3511 struct spdk_bdev_io *bdev_io = ctx; 3512 struct spdk_bdev *bdev = bdev_io->bdev; 3513 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3514 3515 if (spdk_likely(bdev_ch->flags == 0)) { 3516 bdev_io_do_submit(bdev_ch, bdev_io); 3517 return; 3518 } 3519 3520 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3521 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3522 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3523 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3524 bdev_abort_queued_io(&bdev_ch->qos_queued_io, bdev_io->u.abort.bio_to_abort)) { 3525 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3526 } else { 3527 TAILQ_INSERT_TAIL(&bdev_ch->qos_queued_io, bdev_io, internal.link); 3528 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3529 } 3530 } else { 3531 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3532 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3533 } 3534 } 3535 3536 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3537 3538 bool 3539 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3540 { 3541 if (range1->length == 0 || range2->length == 0) { 3542 return false; 3543 } 3544 3545 if (range1->offset + range1->length <= range2->offset) { 3546 return false; 3547 } 3548 3549 if (range2->offset + range2->length <= range1->offset) { 3550 return false; 3551 } 3552 3553 return true; 3554 } 3555 3556 static bool 3557 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3558 { 3559 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3560 struct lba_range r; 3561 3562 switch (bdev_io->type) { 3563 case SPDK_BDEV_IO_TYPE_NVME_IO: 3564 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3565 /* Don't try to decode the NVMe command - just assume worst-case and that 3566 * it overlaps a locked range. 3567 */ 3568 return true; 3569 case SPDK_BDEV_IO_TYPE_READ: 3570 if (!range->quiesce) { 3571 return false; 3572 } 3573 /* fallthrough */ 3574 case SPDK_BDEV_IO_TYPE_WRITE: 3575 case SPDK_BDEV_IO_TYPE_UNMAP: 3576 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3577 case SPDK_BDEV_IO_TYPE_ZCOPY: 3578 case SPDK_BDEV_IO_TYPE_COPY: 3579 r.offset = bdev_io->u.bdev.offset_blocks; 3580 r.length = bdev_io->u.bdev.num_blocks; 3581 if (!bdev_lba_range_overlapped(range, &r)) { 3582 /* This I/O doesn't overlap the specified LBA range. */ 3583 return false; 3584 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3585 /* This I/O overlaps, but the I/O is on the same channel that locked this 3586 * range, and the caller_ctx is the same as the locked_ctx. This means 3587 * that this I/O is associated with the lock, and is allowed to execute. 
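* In practice this is what lets the holder of a range lock keep submitting I/O to the locked region while other overlapping I/O is parked on the channel's io_locked queue in bdev_io_submit() below.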
3588 	 */
3589 			return false;
3590 		} else {
3591 			return true;
3592 		}
3593 	default:
3594 		return false;
3595 	}
3596 }
3597 
3598 void
3599 bdev_io_submit(struct spdk_bdev_io *bdev_io)
3600 {
3601 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
3602 
3603 	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING);
3604 
3605 	if (!TAILQ_EMPTY(&ch->locked_ranges)) {
3606 		struct lba_range *range;
3607 
3608 		TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
3609 			if (bdev_io_range_is_locked(bdev_io, range)) {
3610 				TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link);
3611 				return;
3612 			}
3613 		}
3614 	}
3615 
3616 	bdev_ch_add_to_io_submitted(bdev_io);
3617 
3618 	bdev_io->internal.submit_tsc = spdk_get_ticks();
3619 	spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START,
3620 			      ch->trace_id, bdev_io->u.bdev.num_blocks,
3621 			      (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx,
3622 			      bdev_io->u.bdev.offset_blocks, ch->queue_depth);
3623 
3624 	if (bdev_io->internal.f.split) {
3625 		bdev_io_split(bdev_io);
3626 		return;
3627 	}
3628 
3629 	_bdev_io_submit(bdev_io);
3630 }
3631 
3632 static inline void
3633 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io)
3634 {
3635 	/* The bdev doesn't support memory domains, so the buffers in this I/O request can't be
3636 	 * accessed directly and we need to allocate bounce buffers before issuing the I/O operation.
3637 	 * For a write operation, data is pulled from the memory domain before the I/O is submitted.
3638 	 * Once a read operation completes, the memory domain push functionality is used to copy
3639 	 * the data back into the original memory domain I/O buffer.
3640 	 * The request then goes through the regular I/O flow, so clear the memory domain pointers. */
3641 	assert(bdev_io->internal.f.has_memory_domain);
3642 	bdev_io->u.bdev.memory_domain = NULL;
3643 	bdev_io->u.bdev.memory_domain_ctx = NULL;
3644 	_bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb,
3645 				       bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
3646 }
3647 
3648 static inline void
3649 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
3650 {
3651 	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
3652 	bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io);
3653 
3654 	if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) {
3655 		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED;
3656 		bdev_io_complete_unsubmitted(bdev_io);
3657 		return;
3658 	}
3659 
3660 	/* We need to allocate a bounce buffer if the bdev doesn't support memory domains, or if it
3661 	 * does support them but we need to execute an accel sequence and the data buffer is from the
3662 	 * accel memory domain (to avoid doing a push/pull from that domain).
3663 */ 3664 if (bdev_io_use_memory_domain(bdev_io)) { 3665 if (!desc->memory_domains_supported || 3666 (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3667 _bdev_io_ext_use_bounce_buffer(bdev_io); 3668 return; 3669 } 3670 } 3671 3672 if (needs_exec) { 3673 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3674 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3675 return; 3676 } 3677 /* For reads we'll execute the sequence after the data is read, so, for now, only 3678 * clear out accel_sequence pointer and submit the IO */ 3679 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3680 bdev_io->u.bdev.accel_sequence = NULL; 3681 } 3682 3683 bdev_io_submit(bdev_io); 3684 } 3685 3686 static void 3687 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3688 { 3689 struct spdk_bdev *bdev = bdev_io->bdev; 3690 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3691 struct spdk_io_channel *ch = bdev_ch->channel; 3692 3693 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3694 3695 bdev_io->internal.in_submit_request = true; 3696 bdev_submit_request(bdev, ch, bdev_io); 3697 bdev_io->internal.in_submit_request = false; 3698 } 3699 3700 void 3701 bdev_io_init(struct spdk_bdev_io *bdev_io, 3702 struct spdk_bdev *bdev, void *cb_arg, 3703 spdk_bdev_io_completion_cb cb) 3704 { 3705 bdev_io->bdev = bdev; 3706 bdev_io->internal.f.raw = 0; 3707 bdev_io->internal.caller_ctx = cb_arg; 3708 bdev_io->internal.cb = cb; 3709 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3710 bdev_io->internal.in_submit_request = false; 3711 bdev_io->internal.orig_iovs = NULL; 3712 bdev_io->internal.orig_iovcnt = 0; 3713 bdev_io->internal.orig_md_iov.iov_base = NULL; 3714 bdev_io->internal.error.nvme.cdw0 = 0; 3715 bdev_io->num_retries = 0; 3716 bdev_io->internal.get_buf_cb = NULL; 3717 bdev_io->internal.get_aux_buf_cb = NULL; 3718 bdev_io->internal.data_transfer_cpl = NULL; 3719 bdev_io->internal.f.split = bdev_io_should_split(bdev_io); 3720 } 3721 3722 static bool 3723 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3724 { 3725 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3726 } 3727 3728 bool 3729 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3730 { 3731 bool supported; 3732 3733 supported = bdev_io_type_supported(bdev, io_type); 3734 3735 if (!supported) { 3736 switch (io_type) { 3737 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3738 /* The bdev layer will emulate write zeroes as long as write is supported. 
*/
3739 		supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE);
3740 			break;
3741 		default:
3742 			break;
3743 		}
3744 	}
3745 
3746 	return supported;
3747 }
3748 
3749 static const char *g_io_type_strings[] = {
3750 	[SPDK_BDEV_IO_TYPE_READ] = "read",
3751 	[SPDK_BDEV_IO_TYPE_WRITE] = "write",
3752 	[SPDK_BDEV_IO_TYPE_UNMAP] = "unmap",
3753 	[SPDK_BDEV_IO_TYPE_FLUSH] = "flush",
3754 	[SPDK_BDEV_IO_TYPE_RESET] = "reset",
3755 	[SPDK_BDEV_IO_TYPE_NVME_ADMIN] = "nvme_admin",
3756 	[SPDK_BDEV_IO_TYPE_NVME_IO] = "nvme_io",
3757 	[SPDK_BDEV_IO_TYPE_NVME_IO_MD] = "nvme_io_md",
3758 	[SPDK_BDEV_IO_TYPE_WRITE_ZEROES] = "write_zeroes",
3759 	[SPDK_BDEV_IO_TYPE_ZCOPY] = "zcopy",
3760 	[SPDK_BDEV_IO_TYPE_GET_ZONE_INFO] = "get_zone_info",
3761 	[SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT] = "zone_management",
3762 	[SPDK_BDEV_IO_TYPE_ZONE_APPEND] = "zone_append",
3763 	[SPDK_BDEV_IO_TYPE_COMPARE] = "compare",
3764 	[SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE] = "compare_and_write",
3765 	[SPDK_BDEV_IO_TYPE_ABORT] = "abort",
3766 	[SPDK_BDEV_IO_TYPE_SEEK_HOLE] = "seek_hole",
3767 	[SPDK_BDEV_IO_TYPE_SEEK_DATA] = "seek_data",
3768 	[SPDK_BDEV_IO_TYPE_COPY] = "copy",
3769 	[SPDK_BDEV_IO_TYPE_NVME_IOV_MD] = "nvme_iov_md",
3770 };
3771 
3772 const char *
3773 spdk_bdev_get_io_type_name(enum spdk_bdev_io_type io_type)
3774 {
3775 	if (io_type <= SPDK_BDEV_IO_TYPE_INVALID || io_type >= SPDK_BDEV_NUM_IO_TYPES) {
3776 		return NULL;
3777 	}
3778 
3779 	return g_io_type_strings[io_type];
3780 }
3781 
3782 int
3783 spdk_bdev_get_io_type(const char *io_type_string)
3784 {
3785 	int i;
3786 
3787 	for (i = SPDK_BDEV_IO_TYPE_READ; i < SPDK_BDEV_NUM_IO_TYPES; ++i) {
3788 		if (!strcmp(io_type_string, g_io_type_strings[i])) {
3789 			return i;
3790 		}
3791 	}
3792 
3793 	return -1;
3794 }
3795 
3796 uint64_t
3797 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io)
3798 {
3799 	return bdev_io->internal.submit_tsc;
3800 }
3801 
3802 int
3803 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
3804 {
3805 	if (bdev->fn_table->dump_info_json) {
3806 		return bdev->fn_table->dump_info_json(bdev->ctxt, w);
3807 	}
3808 
3809 	return 0;
3810 }
3811 
3812 static void
3813 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos)
3814 {
3815 	uint32_t max_per_timeslice = 0;
3816 	int i;
3817 
3818 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3819 		if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
3820 			qos->rate_limits[i].max_per_timeslice = 0;
3821 			continue;
3822 		}
3823 
3824 		max_per_timeslice = qos->rate_limits[i].limit *
3825 				    SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC;
3826 
3827 		qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice,
3828 							qos->rate_limits[i].min_per_timeslice);
3829 
3830 		__atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice,
3831 				 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELEASE);
3832 	}
3833 
3834 	bdev_qos_set_ops(qos);
3835 }
3836 
3837 static void
3838 bdev_channel_submit_qos_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
3839 			   struct spdk_io_channel *io_ch, void *ctx)
3840 {
3841 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
3842 	int status;
3843 
3844 	bdev_qos_io_submit(bdev_ch, bdev->internal.qos);
3845 
3846 	/* If all I/Os were sent then continue the iteration, otherwise stop it. */
3847 	/* TODO: round-robin across channels */
3848 	status = TAILQ_EMPTY(&bdev_ch->qos_queued_io) ? 0 : 1;
3849 
3850 	spdk_bdev_for_each_channel_continue(i, status);
3851 }
3852 
3853 
3854 static void
3855 bdev_channel_submit_qos_io_done(struct spdk_bdev *bdev, void *ctx, int status)
3856 {
3857 
3858 }
3859 
3860 static int
3861 bdev_channel_poll_qos(void *arg)
3862 {
3863 	struct spdk_bdev *bdev = arg;
3864 	struct spdk_bdev_qos *qos = bdev->internal.qos;
3865 	uint64_t now = spdk_get_ticks();
3866 	int i;
3867 	int64_t remaining_last_timeslice;
3868 
3869 	if (spdk_unlikely(qos->thread == NULL)) {
3870 		/* The old QoS was unbound for removal and the new QoS is not enabled yet. */
3871 		return SPDK_POLLER_IDLE;
3872 	}
3873 
3874 	if (now < (qos->last_timeslice + qos->timeslice_size)) {
3875 		/* We received our callback earlier than expected - return
3876 		 * immediately and wait to do accounting until at least one
3877 		 * timeslice has actually expired. This should never happen
3878 		 * with a well-behaved timer implementation.
3879 		 */
3880 		return SPDK_POLLER_IDLE;
3881 	}
3882 
3883 	/* Reset for the next round of rate limiting */
3884 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3885 		/* We may have allowed the IOs or bytes to slightly overrun in the last
3886 		 * timeslice. remaining_this_timeslice is signed, so if it's negative
3887 		 * here, we'll account for the overrun so that the next timeslice will
3888 		 * be appropriately reduced.
3889 		 */
3890 		remaining_last_timeslice = __atomic_exchange_n(&qos->rate_limits[i].remaining_this_timeslice,
3891 					   0, __ATOMIC_RELAXED);
3892 		if (remaining_last_timeslice < 0) {
3893 			/* There could be a race condition here as both bdev_qos_rw_queue_io() and bdev_channel_poll_qos()
3894 			 * potentially use 2 atomic ops each, so they can intertwine.
3895 			 * This race can potentially cause the limits to be a little fuzzy but won't cause any real damage.
3896 */ 3897 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 3898 remaining_last_timeslice, __ATOMIC_RELAXED); 3899 } 3900 } 3901 3902 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3903 qos->last_timeslice += qos->timeslice_size; 3904 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3905 __atomic_add_fetch(&qos->rate_limits[i].remaining_this_timeslice, 3906 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELAXED); 3907 } 3908 } 3909 3910 spdk_bdev_for_each_channel(bdev, bdev_channel_submit_qos_io, qos, 3911 bdev_channel_submit_qos_io_done); 3912 3913 return SPDK_POLLER_BUSY; 3914 } 3915 3916 static void 3917 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3918 { 3919 struct spdk_bdev_shared_resource *shared_resource; 3920 struct lba_range *range; 3921 3922 bdev_free_io_stat(ch->stat); 3923 #ifdef SPDK_CONFIG_VTUNE 3924 bdev_free_io_stat(ch->prev_stat); 3925 #endif 3926 3927 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3928 range = TAILQ_FIRST(&ch->locked_ranges); 3929 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3930 free(range); 3931 } 3932 3933 spdk_put_io_channel(ch->channel); 3934 spdk_put_io_channel(ch->accel_channel); 3935 3936 shared_resource = ch->shared_resource; 3937 3938 assert(TAILQ_EMPTY(&ch->io_locked)); 3939 assert(TAILQ_EMPTY(&ch->io_submitted)); 3940 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 3941 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 3942 assert(ch->io_outstanding == 0); 3943 assert(shared_resource->ref > 0); 3944 shared_resource->ref--; 3945 if (shared_resource->ref == 0) { 3946 assert(shared_resource->io_outstanding == 0); 3947 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3948 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3949 spdk_poller_unregister(&shared_resource->nomem_poller); 3950 free(shared_resource); 3951 } 3952 } 3953 3954 static void 3955 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3956 { 3957 struct spdk_bdev_qos *qos = bdev->internal.qos; 3958 int i; 3959 3960 assert(spdk_spin_held(&bdev->internal.spinlock)); 3961 3962 /* Rate limiting on this bdev enabled */ 3963 if (qos) { 3964 if (qos->ch == NULL) { 3965 struct spdk_io_channel *io_ch; 3966 3967 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3968 bdev->name, spdk_get_thread()); 3969 3970 /* No qos channel has been selected, so set one up */ 3971 3972 /* Take another reference to ch */ 3973 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3974 assert(io_ch != NULL); 3975 qos->ch = ch; 3976 3977 qos->thread = spdk_io_channel_get_thread(io_ch); 3978 3979 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3980 if (bdev_qos_is_iops_rate_limit(i) == true) { 3981 qos->rate_limits[i].min_per_timeslice = 3982 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3983 } else { 3984 qos->rate_limits[i].min_per_timeslice = 3985 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3986 } 3987 3988 if (qos->rate_limits[i].limit == 0) { 3989 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3990 } 3991 } 3992 bdev_qos_update_max_quota_per_timeslice(qos); 3993 qos->timeslice_size = 3994 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3995 qos->last_timeslice = spdk_get_ticks(); 3996 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3997 bdev, 3998 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3999 } 4000 4001 ch->flags |= BDEV_CH_QOS_ENABLED; 4002 } 4003 } 4004 4005 struct poll_timeout_ctx { 4006 struct spdk_bdev_desc 
*desc; 4007 uint64_t timeout_in_sec; 4008 spdk_bdev_io_timeout_cb cb_fn; 4009 void *cb_arg; 4010 }; 4011 4012 static void 4013 bdev_desc_free(struct spdk_bdev_desc *desc) 4014 { 4015 spdk_spin_destroy(&desc->spinlock); 4016 free(desc->media_events_buffer); 4017 free(desc); 4018 } 4019 4020 static void 4021 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 4022 { 4023 struct poll_timeout_ctx *ctx = _ctx; 4024 struct spdk_bdev_desc *desc = ctx->desc; 4025 4026 free(ctx); 4027 4028 spdk_spin_lock(&desc->spinlock); 4029 desc->refs--; 4030 if (desc->closed == true && desc->refs == 0) { 4031 spdk_spin_unlock(&desc->spinlock); 4032 bdev_desc_free(desc); 4033 return; 4034 } 4035 spdk_spin_unlock(&desc->spinlock); 4036 } 4037 4038 static void 4039 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4040 struct spdk_io_channel *io_ch, void *_ctx) 4041 { 4042 struct poll_timeout_ctx *ctx = _ctx; 4043 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4044 struct spdk_bdev_desc *desc = ctx->desc; 4045 struct spdk_bdev_io *bdev_io; 4046 uint64_t now; 4047 4048 spdk_spin_lock(&desc->spinlock); 4049 if (desc->closed == true) { 4050 spdk_spin_unlock(&desc->spinlock); 4051 spdk_bdev_for_each_channel_continue(i, -1); 4052 return; 4053 } 4054 spdk_spin_unlock(&desc->spinlock); 4055 4056 now = spdk_get_ticks(); 4057 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 4058 /* Exclude any I/O that are generated via splitting. */ 4059 if (bdev_io->internal.cb == bdev_io_split_done) { 4060 continue; 4061 } 4062 4063 /* Once we find an I/O that has not timed out, we can immediately 4064 * exit the loop. 4065 */ 4066 if (now < (bdev_io->internal.submit_tsc + 4067 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 4068 goto end; 4069 } 4070 4071 if (bdev_io->internal.desc == desc) { 4072 ctx->cb_fn(ctx->cb_arg, bdev_io); 4073 } 4074 } 4075 4076 end: 4077 spdk_bdev_for_each_channel_continue(i, 0); 4078 } 4079 4080 static int 4081 bdev_poll_timeout_io(void *arg) 4082 { 4083 struct spdk_bdev_desc *desc = arg; 4084 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4085 struct poll_timeout_ctx *ctx; 4086 4087 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 4088 if (!ctx) { 4089 SPDK_ERRLOG("failed to allocate memory\n"); 4090 return SPDK_POLLER_BUSY; 4091 } 4092 ctx->desc = desc; 4093 ctx->cb_arg = desc->cb_arg; 4094 ctx->cb_fn = desc->cb_fn; 4095 ctx->timeout_in_sec = desc->timeout_in_sec; 4096 4097 /* Take a ref on the descriptor in case it gets closed while we are checking 4098 * all of the channels. 
4099 */ 4100 spdk_spin_lock(&desc->spinlock); 4101 desc->refs++; 4102 spdk_spin_unlock(&desc->spinlock); 4103 4104 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 4105 bdev_channel_poll_timeout_io_done); 4106 4107 return SPDK_POLLER_BUSY; 4108 } 4109 4110 int 4111 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 4112 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 4113 { 4114 assert(desc->thread == spdk_get_thread()); 4115 4116 spdk_poller_unregister(&desc->io_timeout_poller); 4117 4118 if (timeout_in_sec) { 4119 assert(cb_fn != NULL); 4120 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 4121 desc, 4122 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 4123 1000); 4124 if (desc->io_timeout_poller == NULL) { 4125 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 4126 return -1; 4127 } 4128 } 4129 4130 desc->cb_fn = cb_fn; 4131 desc->cb_arg = cb_arg; 4132 desc->timeout_in_sec = timeout_in_sec; 4133 4134 return 0; 4135 } 4136 4137 static int 4138 bdev_channel_create(void *io_device, void *ctx_buf) 4139 { 4140 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 4141 struct spdk_bdev_channel *ch = ctx_buf; 4142 struct spdk_io_channel *mgmt_io_ch; 4143 struct spdk_bdev_mgmt_channel *mgmt_ch; 4144 struct spdk_bdev_shared_resource *shared_resource; 4145 struct lba_range *range; 4146 4147 ch->bdev = bdev; 4148 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 4149 if (!ch->channel) { 4150 return -1; 4151 } 4152 4153 ch->accel_channel = spdk_accel_get_io_channel(); 4154 if (!ch->accel_channel) { 4155 spdk_put_io_channel(ch->channel); 4156 return -1; 4157 } 4158 4159 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, bdev->internal.trace_id, 0, 0, 4160 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4161 4162 assert(ch->histogram == NULL); 4163 if (bdev->internal.histogram_enabled) { 4164 ch->histogram = spdk_histogram_data_alloc(); 4165 if (ch->histogram == NULL) { 4166 SPDK_ERRLOG("Could not allocate histogram\n"); 4167 } 4168 } 4169 4170 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 4171 if (!mgmt_io_ch) { 4172 spdk_put_io_channel(ch->channel); 4173 spdk_put_io_channel(ch->accel_channel); 4174 return -1; 4175 } 4176 4177 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 4178 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 4179 if (shared_resource->shared_ch == ch->channel) { 4180 spdk_put_io_channel(mgmt_io_ch); 4181 shared_resource->ref++; 4182 break; 4183 } 4184 } 4185 4186 if (shared_resource == NULL) { 4187 shared_resource = calloc(1, sizeof(*shared_resource)); 4188 if (shared_resource == NULL) { 4189 spdk_put_io_channel(ch->channel); 4190 spdk_put_io_channel(ch->accel_channel); 4191 spdk_put_io_channel(mgmt_io_ch); 4192 return -1; 4193 } 4194 4195 shared_resource->mgmt_ch = mgmt_ch; 4196 shared_resource->io_outstanding = 0; 4197 TAILQ_INIT(&shared_resource->nomem_io); 4198 shared_resource->nomem_threshold = 0; 4199 shared_resource->shared_ch = ch->channel; 4200 shared_resource->ref = 1; 4201 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 4202 } 4203 4204 ch->io_outstanding = 0; 4205 TAILQ_INIT(&ch->queued_resets); 4206 TAILQ_INIT(&ch->locked_ranges); 4207 TAILQ_INIT(&ch->qos_queued_io); 4208 ch->flags = 0; 4209 ch->trace_id = bdev->internal.trace_id; 4210 ch->shared_resource = shared_resource; 4211 4212 TAILQ_INIT(&ch->io_submitted); 4213 TAILQ_INIT(&ch->io_locked); 4214 TAILQ_INIT(&ch->io_accel_exec); 4215 TAILQ_INIT(&ch->io_memory_domain); 4216 4217 
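	/* The remainder of channel setup: allocate the per-channel I/O statistics (plus a
	 * previous-interval snapshot when SPDK_CONFIG_VTUNE is enabled), then, under the bdev
	 * spinlock, enable QoS for this channel if configured and mirror any LBA ranges already
	 * locked on the bdev into this channel's locked_ranges list. Any failure below unwinds
	 * through bdev_channel_destroy_resource(). */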
ch->stat = bdev_alloc_io_stat(false); 4218 if (ch->stat == NULL) { 4219 bdev_channel_destroy_resource(ch); 4220 return -1; 4221 } 4222 4223 ch->stat->ticks_rate = spdk_get_ticks_hz(); 4224 4225 #ifdef SPDK_CONFIG_VTUNE 4226 { 4227 char *name; 4228 __itt_init_ittlib(NULL, 0); 4229 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 4230 if (!name) { 4231 bdev_channel_destroy_resource(ch); 4232 return -1; 4233 } 4234 ch->handle = __itt_string_handle_create(name); 4235 free(name); 4236 ch->start_tsc = spdk_get_ticks(); 4237 ch->interval_tsc = spdk_get_ticks_hz() / 100; 4238 ch->prev_stat = bdev_alloc_io_stat(false); 4239 if (ch->prev_stat == NULL) { 4240 bdev_channel_destroy_resource(ch); 4241 return -1; 4242 } 4243 } 4244 #endif 4245 4246 spdk_spin_lock(&bdev->internal.spinlock); 4247 bdev_enable_qos(bdev, ch); 4248 4249 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 4250 struct lba_range *new_range; 4251 4252 new_range = calloc(1, sizeof(*new_range)); 4253 if (new_range == NULL) { 4254 spdk_spin_unlock(&bdev->internal.spinlock); 4255 bdev_channel_destroy_resource(ch); 4256 return -1; 4257 } 4258 new_range->length = range->length; 4259 new_range->offset = range->offset; 4260 new_range->locked_ctx = range->locked_ctx; 4261 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 4262 } 4263 4264 spdk_spin_unlock(&bdev->internal.spinlock); 4265 4266 return 0; 4267 } 4268 4269 static int 4270 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 4271 void *cb_ctx) 4272 { 4273 struct spdk_bdev_channel *bdev_ch = cb_ctx; 4274 struct spdk_bdev_io *bdev_io; 4275 uint64_t buf_len; 4276 4277 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4278 if (bdev_io->internal.ch == bdev_ch) { 4279 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len); 4280 spdk_iobuf_entry_abort(ch, entry, buf_len); 4281 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4282 } 4283 4284 return 0; 4285 } 4286 4287 /* 4288 * Abort I/O that are waiting on a data buffer. 4289 */ 4290 static void 4291 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4292 { 4293 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4294 bdev_abort_all_buf_io_cb, ch); 4295 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4296 bdev_abort_all_buf_io_cb, ch); 4297 } 4298 4299 /* 4300 * Abort I/O that are queued waiting for submission. These types of I/O are 4301 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4302 */ 4303 static void 4304 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4305 { 4306 struct spdk_bdev_io *bdev_io, *tmp; 4307 4308 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4309 if (bdev_io->internal.ch == ch) { 4310 TAILQ_REMOVE(queue, bdev_io, internal.link); 4311 /* 4312 * spdk_bdev_io_complete() assumes that the completed I/O had 4313 * been submitted to the bdev module. Since in this case it 4314 * hadn't, bump io_outstanding to account for the decrement 4315 * that spdk_bdev_io_complete() will do. 
4316 */ 4317 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4318 bdev_io_increment_outstanding(ch, ch->shared_resource); 4319 } 4320 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4321 } 4322 } 4323 } 4324 4325 static bool 4326 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4327 { 4328 struct spdk_bdev_io *bdev_io; 4329 4330 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4331 if (bdev_io == bio_to_abort) { 4332 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4333 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4334 return true; 4335 } 4336 } 4337 4338 return false; 4339 } 4340 4341 static int 4342 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4343 { 4344 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4345 uint64_t buf_len; 4346 4347 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4348 if (bdev_io == bio_to_abort) { 4349 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len); 4350 spdk_iobuf_entry_abort(ch, entry, buf_len); 4351 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4352 return 1; 4353 } 4354 4355 return 0; 4356 } 4357 4358 static bool 4359 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4360 { 4361 int rc; 4362 4363 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4364 bdev_abort_buf_io_cb, bio_to_abort); 4365 if (rc == 1) { 4366 return true; 4367 } 4368 4369 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4370 bdev_abort_buf_io_cb, bio_to_abort); 4371 return rc == 1; 4372 } 4373 4374 static void 4375 bdev_qos_channel_destroy(void *cb_arg) 4376 { 4377 struct spdk_bdev_qos *qos = cb_arg; 4378 4379 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4380 spdk_poller_unregister(&qos->poller); 4381 4382 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4383 4384 free(qos); 4385 } 4386 4387 static int 4388 bdev_qos_destroy(struct spdk_bdev *bdev) 4389 { 4390 int i; 4391 4392 /* 4393 * Cleanly shutting down the QoS poller is tricky, because 4394 * during the asynchronous operation the user could open 4395 * a new descriptor and create a new channel, spawning 4396 * a new QoS poller. 4397 * 4398 * The strategy is to create a new QoS structure here and swap it 4399 * in. The shutdown path then continues to refer to the old one 4400 * until it completes and then releases it. 4401 */ 4402 struct spdk_bdev_qos *new_qos, *old_qos; 4403 4404 old_qos = bdev->internal.qos; 4405 4406 new_qos = calloc(1, sizeof(*new_qos)); 4407 if (!new_qos) { 4408 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4409 return -ENOMEM; 4410 } 4411 4412 /* Copy the old QoS data into the newly allocated structure */ 4413 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4414 4415 /* Zero out the key parts of the QoS structure */ 4416 new_qos->ch = NULL; 4417 new_qos->thread = NULL; 4418 new_qos->poller = NULL; 4419 /* 4420 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4421 * It will be used later for the new QoS structure. 
4422 */ 4423 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4424 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4425 new_qos->rate_limits[i].min_per_timeslice = 0; 4426 new_qos->rate_limits[i].max_per_timeslice = 0; 4427 } 4428 4429 bdev->internal.qos = new_qos; 4430 4431 if (old_qos->thread == NULL) { 4432 free(old_qos); 4433 } else { 4434 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4435 } 4436 4437 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4438 * been destroyed yet. The destruction path will end up waiting for the final 4439 * channel to be put before it releases resources. */ 4440 4441 return 0; 4442 } 4443 4444 void 4445 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4446 { 4447 total->bytes_read += add->bytes_read; 4448 total->num_read_ops += add->num_read_ops; 4449 total->bytes_written += add->bytes_written; 4450 total->num_write_ops += add->num_write_ops; 4451 total->bytes_unmapped += add->bytes_unmapped; 4452 total->num_unmap_ops += add->num_unmap_ops; 4453 total->bytes_copied += add->bytes_copied; 4454 total->num_copy_ops += add->num_copy_ops; 4455 total->read_latency_ticks += add->read_latency_ticks; 4456 total->write_latency_ticks += add->write_latency_ticks; 4457 total->unmap_latency_ticks += add->unmap_latency_ticks; 4458 total->copy_latency_ticks += add->copy_latency_ticks; 4459 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4460 total->max_read_latency_ticks = add->max_read_latency_ticks; 4461 } 4462 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4463 total->min_read_latency_ticks = add->min_read_latency_ticks; 4464 } 4465 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4466 total->max_write_latency_ticks = add->max_write_latency_ticks; 4467 } 4468 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4469 total->min_write_latency_ticks = add->min_write_latency_ticks; 4470 } 4471 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4472 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4473 } 4474 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4475 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4476 } 4477 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4478 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4479 } 4480 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4481 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4482 } 4483 } 4484 4485 static void 4486 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4487 { 4488 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4489 4490 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4491 memcpy(to_stat->io_error, from_stat->io_error, 4492 sizeof(struct spdk_bdev_io_error_stat)); 4493 } 4494 } 4495 4496 void 4497 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4498 { 4499 stat->max_read_latency_ticks = 0; 4500 stat->min_read_latency_ticks = UINT64_MAX; 4501 stat->max_write_latency_ticks = 0; 4502 stat->min_write_latency_ticks = UINT64_MAX; 4503 stat->max_unmap_latency_ticks = 0; 4504 stat->min_unmap_latency_ticks = UINT64_MAX; 4505 stat->max_copy_latency_ticks = 0; 4506 stat->min_copy_latency_ticks = UINT64_MAX; 4507 4508 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4509 return; 4510 } 
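	/* Only SPDK_BDEV_RESET_STAT_ALL reaches this point: the min/max latencies above are always
	 * reset, while the cumulative byte/op/latency counters and the per-status error counts
	 * below are cleared only when a full reset is requested. */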
4511 4512 stat->bytes_read = 0; 4513 stat->num_read_ops = 0; 4514 stat->bytes_written = 0; 4515 stat->num_write_ops = 0; 4516 stat->bytes_unmapped = 0; 4517 stat->num_unmap_ops = 0; 4518 stat->bytes_copied = 0; 4519 stat->num_copy_ops = 0; 4520 stat->read_latency_ticks = 0; 4521 stat->write_latency_ticks = 0; 4522 stat->unmap_latency_ticks = 0; 4523 stat->copy_latency_ticks = 0; 4524 4525 if (stat->io_error != NULL) { 4526 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 4527 } 4528 } 4529 4530 struct spdk_bdev_io_stat * 4531 bdev_alloc_io_stat(bool io_error_stat) 4532 { 4533 struct spdk_bdev_io_stat *stat; 4534 4535 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4536 if (stat == NULL) { 4537 return NULL; 4538 } 4539 4540 if (io_error_stat) { 4541 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4542 if (stat->io_error == NULL) { 4543 free(stat); 4544 return NULL; 4545 } 4546 } else { 4547 stat->io_error = NULL; 4548 } 4549 4550 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4551 4552 return stat; 4553 } 4554 4555 void 4556 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4557 { 4558 if (stat != NULL) { 4559 free(stat->io_error); 4560 free(stat); 4561 } 4562 } 4563 4564 void 4565 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4566 { 4567 int i; 4568 4569 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4570 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4571 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4572 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4573 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4574 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4575 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4576 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4577 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4578 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4579 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4580 stat->min_read_latency_ticks != UINT64_MAX ? 4581 stat->min_read_latency_ticks : 0); 4582 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4583 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4584 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4585 stat->min_write_latency_ticks != UINT64_MAX ? 4586 stat->min_write_latency_ticks : 0); 4587 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4588 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4589 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4590 stat->min_unmap_latency_ticks != UINT64_MAX ? 4591 stat->min_unmap_latency_ticks : 0); 4592 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4593 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4594 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4595 stat->min_copy_latency_ticks != UINT64_MAX ? 
4596 stat->min_copy_latency_ticks : 0); 4597 4598 if (stat->io_error != NULL) { 4599 spdk_json_write_named_object_begin(w, "io_error"); 4600 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4601 if (stat->io_error->error_status[i] != 0) { 4602 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4603 stat->io_error->error_status[i]); 4604 } 4605 } 4606 spdk_json_write_object_end(w); 4607 } 4608 } 4609 4610 static void 4611 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4612 { 4613 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4614 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4615 4616 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4617 bdev_abort_all_buf_io(mgmt_ch, ch); 4618 } 4619 4620 static void 4621 bdev_channel_destroy(void *io_device, void *ctx_buf) 4622 { 4623 struct spdk_bdev_channel *ch = ctx_buf; 4624 4625 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4626 spdk_get_thread()); 4627 4628 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, ch->bdev->internal.trace_id, 0, 0, 4629 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4630 4631 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4632 spdk_spin_lock(&ch->bdev->internal.spinlock); 4633 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4634 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4635 4636 bdev_abort_all_queued_io(&ch->queued_resets, ch); 4637 4638 bdev_channel_abort_queued_ios(ch); 4639 4640 if (ch->histogram) { 4641 spdk_histogram_data_free(ch->histogram); 4642 } 4643 4644 bdev_channel_destroy_resource(ch); 4645 } 4646 4647 /* 4648 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4649 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
4650 */ 4651 static int 4652 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4653 { 4654 struct spdk_bdev_name *tmp; 4655 4656 bdev_name->name = strdup(name); 4657 if (bdev_name->name == NULL) { 4658 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4659 return -ENOMEM; 4660 } 4661 4662 bdev_name->bdev = bdev; 4663 4664 spdk_spin_lock(&g_bdev_mgr.spinlock); 4665 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4666 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4667 4668 if (tmp != NULL) { 4669 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4670 free(bdev_name->name); 4671 return -EEXIST; 4672 } 4673 4674 return 0; 4675 } 4676 4677 static void 4678 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4679 { 4680 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4681 free(bdev_name->name); 4682 } 4683 4684 static void 4685 bdev_name_del(struct spdk_bdev_name *bdev_name) 4686 { 4687 spdk_spin_lock(&g_bdev_mgr.spinlock); 4688 bdev_name_del_unsafe(bdev_name); 4689 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4690 } 4691 4692 int 4693 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4694 { 4695 struct spdk_bdev_alias *tmp; 4696 int ret; 4697 4698 if (alias == NULL) { 4699 SPDK_ERRLOG("Empty alias passed\n"); 4700 return -EINVAL; 4701 } 4702 4703 tmp = calloc(1, sizeof(*tmp)); 4704 if (tmp == NULL) { 4705 SPDK_ERRLOG("Unable to allocate alias\n"); 4706 return -ENOMEM; 4707 } 4708 4709 ret = bdev_name_add(&tmp->alias, bdev, alias); 4710 if (ret != 0) { 4711 free(tmp); 4712 return ret; 4713 } 4714 4715 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4716 4717 return 0; 4718 } 4719 4720 static int 4721 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4722 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4723 { 4724 struct spdk_bdev_alias *tmp; 4725 4726 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4727 if (strcmp(alias, tmp->alias.name) == 0) { 4728 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4729 alias_del_fn(&tmp->alias); 4730 free(tmp); 4731 return 0; 4732 } 4733 } 4734 4735 return -ENOENT; 4736 } 4737 4738 int 4739 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4740 { 4741 int rc; 4742 4743 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4744 if (rc == -ENOENT) { 4745 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4746 } 4747 4748 return rc; 4749 } 4750 4751 void 4752 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4753 { 4754 struct spdk_bdev_alias *p, *tmp; 4755 4756 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4757 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4758 bdev_name_del(&p->alias); 4759 free(p); 4760 } 4761 } 4762 4763 struct spdk_io_channel * 4764 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4765 { 4766 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4767 } 4768 4769 void * 4770 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4771 { 4772 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4773 void *ctx = NULL; 4774 4775 if (bdev->fn_table->get_module_ctx) { 4776 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4777 } 4778 4779 return ctx; 4780 } 4781 4782 const char * 4783 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4784 { 4785 return bdev->module->name; 4786 } 4787 4788 const char * 4789 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4790 { 4791 return bdev->name; 4792 } 4793 4794 const char * 4795 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4796 { 4797 return bdev->product_name; 4798 } 4799 4800 
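/*
 * Illustrative usage sketch (not part of the upstream code): the small getters defined around
 * here are typically combined by callers to report a bdev's geometry. The helper name below is
 * hypothetical.
 *
 *	static void
 *	print_bdev_geometry(struct spdk_bdev_desc *desc)
 *	{
 *		struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
 *
 *		SPDK_NOTICELOG("%s: %" PRIu32 "-byte blocks, %" PRIu64 " blocks, buf align %zu\n",
 *			       spdk_bdev_get_name(bdev), spdk_bdev_get_block_size(bdev),
 *			       spdk_bdev_get_num_blocks(bdev), spdk_bdev_get_buf_align(bdev));
 *	}
 */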
const struct spdk_bdev_aliases_list * 4801 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4802 { 4803 return &bdev->aliases; 4804 } 4805 4806 uint32_t 4807 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4808 { 4809 return bdev->blocklen; 4810 } 4811 4812 uint32_t 4813 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4814 { 4815 return bdev->write_unit_size; 4816 } 4817 4818 uint64_t 4819 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4820 { 4821 return bdev->blockcnt; 4822 } 4823 4824 const char * 4825 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4826 { 4827 return qos_rpc_type[type]; 4828 } 4829 4830 void 4831 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4832 { 4833 int i; 4834 4835 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4836 4837 spdk_spin_lock(&bdev->internal.spinlock); 4838 if (bdev->internal.qos) { 4839 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4840 if (bdev->internal.qos->rate_limits[i].limit != 4841 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4842 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4843 if (bdev_qos_is_iops_rate_limit(i) == false) { 4844 /* Change from Byte to Megabyte which is user visible. */ 4845 limits[i] = limits[i] / 1024 / 1024; 4846 } 4847 } 4848 } 4849 } 4850 spdk_spin_unlock(&bdev->internal.spinlock); 4851 } 4852 4853 size_t 4854 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4855 { 4856 return 1 << bdev->required_alignment; 4857 } 4858 4859 uint32_t 4860 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4861 { 4862 return bdev->optimal_io_boundary; 4863 } 4864 4865 bool 4866 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4867 { 4868 return bdev->write_cache; 4869 } 4870 4871 const struct spdk_uuid * 4872 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4873 { 4874 return &bdev->uuid; 4875 } 4876 4877 uint16_t 4878 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4879 { 4880 return bdev->acwu; 4881 } 4882 4883 uint32_t 4884 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4885 { 4886 return bdev->md_len; 4887 } 4888 4889 bool 4890 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4891 { 4892 return (bdev->md_len != 0) && bdev->md_interleave; 4893 } 4894 4895 bool 4896 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4897 { 4898 return (bdev->md_len != 0) && !bdev->md_interleave; 4899 } 4900 4901 bool 4902 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4903 { 4904 return bdev->zoned; 4905 } 4906 4907 uint32_t 4908 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4909 { 4910 if (spdk_bdev_is_md_interleaved(bdev)) { 4911 return bdev->blocklen - bdev->md_len; 4912 } else { 4913 return bdev->blocklen; 4914 } 4915 } 4916 4917 uint32_t 4918 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4919 { 4920 return bdev->phys_blocklen; 4921 } 4922 4923 static uint32_t 4924 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4925 { 4926 if (!spdk_bdev_is_md_interleaved(bdev)) { 4927 return bdev->blocklen + bdev->md_len; 4928 } else { 4929 return bdev->blocklen; 4930 } 4931 } 4932 4933 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4934 typedef enum spdk_dif_type spdk_dif_type_t; 4935 typedef enum spdk_dif_pi_format spdk_dif_pi_format_t; 4936 4937 spdk_dif_type_t 4938 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4939 { 4940 if (bdev->md_len != 0) { 4941 return bdev->dif_type; 4942 } else { 4943 return SPDK_DIF_DISABLE; 4944 } 4945 } 4946 4947 spdk_dif_pi_format_t 4948 spdk_bdev_get_dif_pi_format(const struct spdk_bdev *bdev) 4949 { 4950 return bdev->dif_pi_format; 4951 } 4952 4953 bool 4954 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4955 { 4956 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4957 return bdev->dif_is_head_of_md; 4958 } else { 4959 return false; 4960 } 4961 } 4962 4963 bool 4964 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4965 enum spdk_dif_check_type check_type) 4966 { 4967 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4968 return false; 4969 } 4970 4971 switch (check_type) { 4972 case SPDK_DIF_CHECK_TYPE_REFTAG: 4973 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4974 case SPDK_DIF_CHECK_TYPE_APPTAG: 4975 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4976 case SPDK_DIF_CHECK_TYPE_GUARD: 4977 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4978 default: 4979 return false; 4980 } 4981 } 4982 4983 static uint32_t 4984 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 4985 { 4986 uint64_t aligned_length, max_write_blocks; 4987 4988 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 4989 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 4990 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 4991 4992 return max_write_blocks; 4993 } 4994 4995 uint32_t 4996 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4997 { 4998 return bdev->max_copy; 4999 } 5000 5001 uint64_t 5002 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 5003 { 5004 return bdev->internal.measured_queue_depth; 5005 } 5006 5007 uint64_t 5008 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 5009 { 5010 return bdev->internal.period; 5011 } 5012 5013 uint64_t 5014 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 5015 { 5016 return bdev->internal.weighted_io_time; 5017 } 5018 5019 uint64_t 5020 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 5021 { 5022 return bdev->internal.io_time; 5023 } 5024 5025 union spdk_bdev_nvme_ctratt spdk_bdev_get_nvme_ctratt(struct spdk_bdev *bdev) 5026 { 5027 return bdev->ctratt; 5028 } 5029 5030 static void bdev_update_qd_sampling_period(void *ctx); 5031 5032 static void 5033 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 5034 { 5035 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 5036 5037 if (bdev->internal.measured_queue_depth) { 5038 bdev->internal.io_time += bdev->internal.period; 5039 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 5040 } 5041 5042 bdev->internal.qd_poll_in_progress = false; 5043 5044 bdev_update_qd_sampling_period(bdev); 5045 } 5046 5047 static void 5048 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5049 struct spdk_io_channel *io_ch, void *_ctx) 5050 { 5051 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 5052 5053 bdev->internal.temporary_queue_depth += ch->io_outstanding; 5054 spdk_bdev_for_each_channel_continue(i, 0); 5055 } 5056 5057 static int 5058 bdev_calculate_measured_queue_depth(void *ctx) 5059 { 5060 struct spdk_bdev *bdev 
= ctx; 5061 5062 bdev->internal.qd_poll_in_progress = true; 5063 bdev->internal.temporary_queue_depth = 0; 5064 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 5065 return SPDK_POLLER_BUSY; 5066 } 5067 5068 static void 5069 bdev_update_qd_sampling_period(void *ctx) 5070 { 5071 struct spdk_bdev *bdev = ctx; 5072 5073 if (bdev->internal.period == bdev->internal.new_period) { 5074 return; 5075 } 5076 5077 if (bdev->internal.qd_poll_in_progress) { 5078 return; 5079 } 5080 5081 bdev->internal.period = bdev->internal.new_period; 5082 5083 spdk_poller_unregister(&bdev->internal.qd_poller); 5084 if (bdev->internal.period != 0) { 5085 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5086 bdev, bdev->internal.period); 5087 } else { 5088 spdk_bdev_close(bdev->internal.qd_desc); 5089 bdev->internal.qd_desc = NULL; 5090 } 5091 } 5092 5093 static void 5094 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 5095 { 5096 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 5097 } 5098 5099 void 5100 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 5101 { 5102 int rc; 5103 5104 if (bdev->internal.new_period == period) { 5105 return; 5106 } 5107 5108 bdev->internal.new_period = period; 5109 5110 if (bdev->internal.qd_desc != NULL) { 5111 assert(bdev->internal.period != 0); 5112 5113 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 5114 bdev_update_qd_sampling_period, bdev); 5115 return; 5116 } 5117 5118 assert(bdev->internal.period == 0); 5119 5120 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 5121 NULL, &bdev->internal.qd_desc); 5122 if (rc != 0) { 5123 return; 5124 } 5125 5126 bdev->internal.period = period; 5127 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5128 bdev, period); 5129 } 5130 5131 struct bdev_get_current_qd_ctx { 5132 uint64_t current_qd; 5133 spdk_bdev_get_current_qd_cb cb_fn; 5134 void *cb_arg; 5135 }; 5136 5137 static void 5138 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 5139 { 5140 struct bdev_get_current_qd_ctx *ctx = _ctx; 5141 5142 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 5143 5144 free(ctx); 5145 } 5146 5147 static void 5148 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5149 struct spdk_io_channel *io_ch, void *_ctx) 5150 { 5151 struct bdev_get_current_qd_ctx *ctx = _ctx; 5152 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 5153 5154 ctx->current_qd += bdev_ch->io_outstanding; 5155 5156 spdk_bdev_for_each_channel_continue(i, 0); 5157 } 5158 5159 void 5160 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 5161 void *cb_arg) 5162 { 5163 struct bdev_get_current_qd_ctx *ctx; 5164 5165 assert(cb_fn != NULL); 5166 5167 ctx = calloc(1, sizeof(*ctx)); 5168 if (ctx == NULL) { 5169 cb_fn(bdev, 0, cb_arg, -ENOMEM); 5170 return; 5171 } 5172 5173 ctx->cb_fn = cb_fn; 5174 ctx->cb_arg = cb_arg; 5175 5176 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 5177 } 5178 5179 static void 5180 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 5181 { 5182 assert(desc->thread == spdk_get_thread()); 5183 5184 spdk_spin_lock(&desc->spinlock); 5185 desc->refs--; 5186 if (!desc->closed) { 5187 spdk_spin_unlock(&desc->spinlock); 5188 desc->callback.event_fn(type, 5189 desc->bdev, 5190 desc->callback.ctx); 5191 return; 5192 } 
else if (desc->refs == 0) { 5193 /* This descriptor was closed after this event_notify message was sent. 5194 * spdk_bdev_close() could not free the descriptor since this message was 5195 * in flight, so we free it now using bdev_desc_free(). 5196 */ 5197 spdk_spin_unlock(&desc->spinlock); 5198 bdev_desc_free(desc); 5199 return; 5200 } 5201 spdk_spin_unlock(&desc->spinlock); 5202 } 5203 5204 static void 5205 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 5206 { 5207 spdk_spin_lock(&desc->spinlock); 5208 desc->refs++; 5209 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 5210 spdk_spin_unlock(&desc->spinlock); 5211 } 5212 5213 static void 5214 _resize_notify(void *ctx) 5215 { 5216 struct spdk_bdev_desc *desc = ctx; 5217 5218 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 5219 } 5220 5221 int 5222 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 5223 { 5224 struct spdk_bdev_desc *desc; 5225 int ret; 5226 5227 if (size == bdev->blockcnt) { 5228 return 0; 5229 } 5230 5231 spdk_spin_lock(&bdev->internal.spinlock); 5232 5233 /* bdev has open descriptors */ 5234 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 5235 bdev->blockcnt > size) { 5236 ret = -EBUSY; 5237 } else { 5238 bdev->blockcnt = size; 5239 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 5240 event_notify(desc, _resize_notify); 5241 } 5242 ret = 0; 5243 } 5244 5245 spdk_spin_unlock(&bdev->internal.spinlock); 5246 5247 return ret; 5248 } 5249 5250 /* 5251 * Convert I/O offset and length from bytes to blocks. 5252 * 5253 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 5254 */ 5255 static uint64_t 5256 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 5257 uint64_t num_bytes, uint64_t *num_blocks) 5258 { 5259 uint32_t block_size = bdev->blocklen; 5260 uint8_t shift_cnt; 5261 5262 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
*/ 5263 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 5264 shift_cnt = spdk_u32log2(block_size); 5265 *offset_blocks = offset_bytes >> shift_cnt; 5266 *num_blocks = num_bytes >> shift_cnt; 5267 return (offset_bytes - (*offset_blocks << shift_cnt)) | 5268 (num_bytes - (*num_blocks << shift_cnt)); 5269 } else { 5270 *offset_blocks = offset_bytes / block_size; 5271 *num_blocks = num_bytes / block_size; 5272 return (offset_bytes % block_size) | (num_bytes % block_size); 5273 } 5274 } 5275 5276 static bool 5277 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 5278 { 5279 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 5280 * has been an overflow and hence the offset has been wrapped around */ 5281 if (offset_blocks + num_blocks < offset_blocks) { 5282 return false; 5283 } 5284 5285 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5286 if (offset_blocks + num_blocks > bdev->blockcnt) { 5287 return false; 5288 } 5289 5290 return true; 5291 } 5292 5293 static void 5294 bdev_seek_complete_cb(void *ctx) 5295 { 5296 struct spdk_bdev_io *bdev_io = ctx; 5297 5298 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5299 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5300 } 5301 5302 static int 5303 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5304 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5305 spdk_bdev_io_completion_cb cb, void *cb_arg) 5306 { 5307 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5308 struct spdk_bdev_io *bdev_io; 5309 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5310 5311 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5312 5313 /* Check if offset_blocks is valid looking at the validity of one block */ 5314 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5315 return -EINVAL; 5316 } 5317 5318 bdev_io = bdev_channel_get_io(channel); 5319 if (!bdev_io) { 5320 return -ENOMEM; 5321 } 5322 5323 bdev_io->internal.ch = channel; 5324 bdev_io->internal.desc = desc; 5325 bdev_io->type = io_type; 5326 bdev_io->u.bdev.offset_blocks = offset_blocks; 5327 bdev_io->u.bdev.memory_domain = NULL; 5328 bdev_io->u.bdev.memory_domain_ctx = NULL; 5329 bdev_io->u.bdev.accel_sequence = NULL; 5330 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5331 5332 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5333 /* In case bdev doesn't support seek to next data/hole offset, 5334 * it is assumed that only data and no holes are present */ 5335 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5336 bdev_io->u.bdev.seek.offset = offset_blocks; 5337 } else { 5338 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5339 } 5340 5341 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5342 return 0; 5343 } 5344 5345 bdev_io_submit(bdev_io); 5346 return 0; 5347 } 5348 5349 int 5350 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5351 uint64_t offset_blocks, 5352 spdk_bdev_io_completion_cb cb, void *cb_arg) 5353 { 5354 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5355 } 5356 5357 int 5358 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5359 uint64_t offset_blocks, 5360 spdk_bdev_io_completion_cb cb, void *cb_arg) 5361 { 5362 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5363 } 5364 5365 uint64_t 5366 spdk_bdev_io_get_seek_offset(const struct 
spdk_bdev_io *bdev_io) 5367 { 5368 return bdev_io->u.bdev.seek.offset; 5369 } 5370 5371 static int 5372 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5373 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5374 spdk_bdev_io_completion_cb cb, void *cb_arg) 5375 { 5376 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5377 struct spdk_bdev_io *bdev_io; 5378 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5379 5380 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5381 return -EINVAL; 5382 } 5383 5384 bdev_io = bdev_channel_get_io(channel); 5385 if (!bdev_io) { 5386 return -ENOMEM; 5387 } 5388 5389 bdev_io->internal.ch = channel; 5390 bdev_io->internal.desc = desc; 5391 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5392 bdev_io->u.bdev.iovs = &bdev_io->iov; 5393 bdev_io->u.bdev.iovs[0].iov_base = buf; 5394 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5395 bdev_io->u.bdev.iovcnt = 1; 5396 bdev_io->u.bdev.md_buf = md_buf; 5397 bdev_io->u.bdev.num_blocks = num_blocks; 5398 bdev_io->u.bdev.offset_blocks = offset_blocks; 5399 bdev_io->u.bdev.memory_domain = NULL; 5400 bdev_io->u.bdev.memory_domain_ctx = NULL; 5401 bdev_io->u.bdev.accel_sequence = NULL; 5402 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5403 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5404 5405 bdev_io_submit(bdev_io); 5406 return 0; 5407 } 5408 5409 int 5410 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5411 void *buf, uint64_t offset, uint64_t nbytes, 5412 spdk_bdev_io_completion_cb cb, void *cb_arg) 5413 { 5414 uint64_t offset_blocks, num_blocks; 5415 5416 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5417 nbytes, &num_blocks) != 0) { 5418 return -EINVAL; 5419 } 5420 5421 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5422 } 5423 5424 int 5425 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5426 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5427 spdk_bdev_io_completion_cb cb, void *cb_arg) 5428 { 5429 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5430 } 5431 5432 int 5433 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5434 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5435 spdk_bdev_io_completion_cb cb, void *cb_arg) 5436 { 5437 struct iovec iov = { 5438 .iov_base = buf, 5439 }; 5440 5441 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5442 return -EINVAL; 5443 } 5444 5445 if (md_buf && !_is_buf_allocated(&iov)) { 5446 return -EINVAL; 5447 } 5448 5449 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5450 cb, cb_arg); 5451 } 5452 5453 int 5454 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5455 struct iovec *iov, int iovcnt, 5456 uint64_t offset, uint64_t nbytes, 5457 spdk_bdev_io_completion_cb cb, void *cb_arg) 5458 { 5459 uint64_t offset_blocks, num_blocks; 5460 5461 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5462 nbytes, &num_blocks) != 0) { 5463 return -EINVAL; 5464 } 5465 5466 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5467 } 5468 5469 static int 5470 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5471 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 
5472 uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx,
5473 struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
5474 spdk_bdev_io_completion_cb cb, void *cb_arg)
5475 {
5476 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5477 struct spdk_bdev_io *bdev_io;
5478 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5479
5480 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) {
5481 return -EINVAL;
5482 }
5483
5484 bdev_io = bdev_channel_get_io(channel);
5485 if (spdk_unlikely(!bdev_io)) {
5486 return -ENOMEM;
5487 }
5488
5489 bdev_io->internal.ch = channel;
5490 bdev_io->internal.desc = desc;
5491 bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
5492 bdev_io->u.bdev.iovs = iov;
5493 bdev_io->u.bdev.iovcnt = iovcnt;
5494 bdev_io->u.bdev.md_buf = md_buf;
5495 bdev_io->u.bdev.num_blocks = num_blocks;
5496 bdev_io->u.bdev.offset_blocks = offset_blocks;
5497 bdev_io_init(bdev_io, bdev, cb_arg, cb);
5498
5499 if (seq != NULL) {
5500 bdev_io->internal.f.has_accel_sequence = true;
5501 bdev_io->internal.accel_sequence = seq;
5502 }
5503
5504 if (domain != NULL) {
5505 bdev_io->internal.f.has_memory_domain = true;
5506 bdev_io->internal.memory_domain = domain;
5507 bdev_io->internal.memory_domain_ctx = domain_ctx;
5508 }
5509
5510 bdev_io->u.bdev.memory_domain = domain;
5511 bdev_io->u.bdev.memory_domain_ctx = domain_ctx;
5512 bdev_io->u.bdev.accel_sequence = seq;
5513 bdev_io->u.bdev.dif_check_flags = dif_check_flags;
5514
5515 _bdev_io_submit_ext(desc, bdev_io);
5516
5517 return 0;
5518 }
5519
5520 int
5521 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5522 struct iovec *iov, int iovcnt,
5523 uint64_t offset_blocks, uint64_t num_blocks,
5524 spdk_bdev_io_completion_cb cb, void *cb_arg)
5525 {
5526 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5527
5528 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
5529 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg);
5530 }
5531
5532 int
5533 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5534 struct iovec *iov, int iovcnt, void *md_buf,
5535 uint64_t offset_blocks, uint64_t num_blocks,
5536 spdk_bdev_io_completion_cb cb, void *cb_arg)
5537 {
5538 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5539
5540 if (md_buf && !spdk_bdev_is_md_separate(bdev)) {
5541 return -EINVAL;
5542 }
5543
5544 if (md_buf && !_is_buf_allocated(iov)) {
5545 return -EINVAL;
5546 }
5547
5548 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
5549 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg);
5550 }
5551
5552 static inline bool
5553 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov)
5554 {
5555 /*
5556 * We check that the opts size is at least the size it had when we first
5557 * introduced spdk_bdev_ext_io_opts (ac6f2bdd8d), since access to those members
5558 * is not checked internally.
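*
* Hedged caller-side sketch (illustration only, not part of this check;
* my_domain and read_done_cb are hypothetical names): users of the *_ext()
* APIs are expected to zero the options struct and set opts.size before
* submitting, e.g.
*
*     struct spdk_bdev_ext_io_opts opts = {};
*
*     opts.size = sizeof(opts);
*     opts.memory_domain = my_domain;
*     rc = spdk_bdev_readv_blocks_ext(desc, ch, iov, iovcnt, offset_blocks,
*                                     num_blocks, read_done_cb, NULL, &opts);
*
* If the size check below fails, the *_ext() entry points return -EINVAL
* before any of the newer members is read.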
5559 */ 5560 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5561 sizeof(opts->metadata) && 5562 opts->size <= sizeof(*opts) && 5563 /* When memory domain is used, the user must provide data buffers */ 5564 (!opts->memory_domain || (iov && iov[0].iov_base)); 5565 } 5566 5567 int 5568 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5569 struct iovec *iov, int iovcnt, 5570 uint64_t offset_blocks, uint64_t num_blocks, 5571 spdk_bdev_io_completion_cb cb, void *cb_arg, 5572 struct spdk_bdev_ext_io_opts *opts) 5573 { 5574 struct spdk_memory_domain *domain = NULL; 5575 struct spdk_accel_sequence *seq = NULL; 5576 void *domain_ctx = NULL, *md = NULL; 5577 uint32_t dif_check_flags = 0; 5578 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5579 5580 if (opts) { 5581 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5582 return -EINVAL; 5583 } 5584 5585 md = opts->metadata; 5586 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5587 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5588 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5589 if (md) { 5590 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5591 return -EINVAL; 5592 } 5593 5594 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5595 return -EINVAL; 5596 } 5597 5598 if (spdk_unlikely(seq != NULL)) { 5599 return -EINVAL; 5600 } 5601 } 5602 } 5603 5604 dif_check_flags = bdev->dif_check_flags & 5605 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5606 5607 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5608 num_blocks, domain, domain_ctx, seq, dif_check_flags, cb, cb_arg); 5609 } 5610 5611 static int 5612 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5613 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5614 spdk_bdev_io_completion_cb cb, void *cb_arg) 5615 { 5616 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5617 struct spdk_bdev_io *bdev_io; 5618 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5619 5620 if (!desc->write) { 5621 return -EBADF; 5622 } 5623 5624 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5625 return -EINVAL; 5626 } 5627 5628 bdev_io = bdev_channel_get_io(channel); 5629 if (!bdev_io) { 5630 return -ENOMEM; 5631 } 5632 5633 bdev_io->internal.ch = channel; 5634 bdev_io->internal.desc = desc; 5635 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5636 bdev_io->u.bdev.iovs = &bdev_io->iov; 5637 bdev_io->u.bdev.iovs[0].iov_base = buf; 5638 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5639 bdev_io->u.bdev.iovcnt = 1; 5640 bdev_io->u.bdev.md_buf = md_buf; 5641 bdev_io->u.bdev.num_blocks = num_blocks; 5642 bdev_io->u.bdev.offset_blocks = offset_blocks; 5643 bdev_io->u.bdev.memory_domain = NULL; 5644 bdev_io->u.bdev.memory_domain_ctx = NULL; 5645 bdev_io->u.bdev.accel_sequence = NULL; 5646 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5647 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5648 5649 bdev_io_submit(bdev_io); 5650 return 0; 5651 } 5652 5653 int 5654 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5655 void *buf, uint64_t offset, uint64_t nbytes, 5656 spdk_bdev_io_completion_cb cb, void *cb_arg) 5657 { 5658 uint64_t offset_blocks, num_blocks; 5659 5660 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5661 nbytes, &num_blocks) != 0) { 5662 return -EINVAL; 5663 } 5664 5665 return spdk_bdev_write_blocks(desc, ch, buf, 
offset_blocks, num_blocks, cb, cb_arg); 5666 } 5667 5668 int 5669 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5670 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5671 spdk_bdev_io_completion_cb cb, void *cb_arg) 5672 { 5673 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5674 cb, cb_arg); 5675 } 5676 5677 int 5678 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5679 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5680 spdk_bdev_io_completion_cb cb, void *cb_arg) 5681 { 5682 struct iovec iov = { 5683 .iov_base = buf, 5684 }; 5685 5686 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5687 return -EINVAL; 5688 } 5689 5690 if (md_buf && !_is_buf_allocated(&iov)) { 5691 return -EINVAL; 5692 } 5693 5694 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5695 cb, cb_arg); 5696 } 5697 5698 static int 5699 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5700 struct iovec *iov, int iovcnt, void *md_buf, 5701 uint64_t offset_blocks, uint64_t num_blocks, 5702 struct spdk_memory_domain *domain, void *domain_ctx, 5703 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 5704 uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw, 5705 spdk_bdev_io_completion_cb cb, void *cb_arg) 5706 { 5707 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5708 struct spdk_bdev_io *bdev_io; 5709 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5710 5711 if (spdk_unlikely(!desc->write)) { 5712 return -EBADF; 5713 } 5714 5715 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 5716 return -EINVAL; 5717 } 5718 5719 bdev_io = bdev_channel_get_io(channel); 5720 if (spdk_unlikely(!bdev_io)) { 5721 return -ENOMEM; 5722 } 5723 5724 bdev_io->internal.ch = channel; 5725 bdev_io->internal.desc = desc; 5726 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5727 bdev_io->u.bdev.iovs = iov; 5728 bdev_io->u.bdev.iovcnt = iovcnt; 5729 bdev_io->u.bdev.md_buf = md_buf; 5730 bdev_io->u.bdev.num_blocks = num_blocks; 5731 bdev_io->u.bdev.offset_blocks = offset_blocks; 5732 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5733 if (seq != NULL) { 5734 bdev_io->internal.f.has_accel_sequence = true; 5735 bdev_io->internal.accel_sequence = seq; 5736 } 5737 5738 if (domain != NULL) { 5739 bdev_io->internal.f.has_memory_domain = true; 5740 bdev_io->internal.memory_domain = domain; 5741 bdev_io->internal.memory_domain_ctx = domain_ctx; 5742 } 5743 5744 bdev_io->u.bdev.memory_domain = domain; 5745 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5746 bdev_io->u.bdev.accel_sequence = seq; 5747 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 5748 bdev_io->u.bdev.nvme_cdw12.raw = nvme_cdw12_raw; 5749 bdev_io->u.bdev.nvme_cdw13.raw = nvme_cdw13_raw; 5750 5751 _bdev_io_submit_ext(desc, bdev_io); 5752 5753 return 0; 5754 } 5755 5756 int 5757 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5758 struct iovec *iov, int iovcnt, 5759 uint64_t offset, uint64_t len, 5760 spdk_bdev_io_completion_cb cb, void *cb_arg) 5761 { 5762 uint64_t offset_blocks, num_blocks; 5763 5764 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5765 len, &num_blocks) != 0) { 5766 return -EINVAL; 5767 } 5768 5769 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5770 } 5771 5772 int 5773 spdk_bdev_writev_blocks(struct spdk_bdev_desc 
*desc, struct spdk_io_channel *ch, 5774 struct iovec *iov, int iovcnt, 5775 uint64_t offset_blocks, uint64_t num_blocks, 5776 spdk_bdev_io_completion_cb cb, void *cb_arg) 5777 { 5778 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5779 5780 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5781 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 5782 cb, cb_arg); 5783 } 5784 5785 int 5786 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5787 struct iovec *iov, int iovcnt, void *md_buf, 5788 uint64_t offset_blocks, uint64_t num_blocks, 5789 spdk_bdev_io_completion_cb cb, void *cb_arg) 5790 { 5791 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5792 5793 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 5794 return -EINVAL; 5795 } 5796 5797 if (md_buf && !_is_buf_allocated(iov)) { 5798 return -EINVAL; 5799 } 5800 5801 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5802 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 5803 cb, cb_arg); 5804 } 5805 5806 int 5807 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5808 struct iovec *iov, int iovcnt, 5809 uint64_t offset_blocks, uint64_t num_blocks, 5810 spdk_bdev_io_completion_cb cb, void *cb_arg, 5811 struct spdk_bdev_ext_io_opts *opts) 5812 { 5813 struct spdk_memory_domain *domain = NULL; 5814 struct spdk_accel_sequence *seq = NULL; 5815 void *domain_ctx = NULL, *md = NULL; 5816 uint32_t dif_check_flags = 0; 5817 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5818 uint32_t nvme_cdw12_raw = 0; 5819 uint32_t nvme_cdw13_raw = 0; 5820 5821 if (opts) { 5822 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5823 return -EINVAL; 5824 } 5825 md = opts->metadata; 5826 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5827 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5828 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5829 nvme_cdw12_raw = bdev_get_ext_io_opt(opts, nvme_cdw12.raw, 0); 5830 nvme_cdw13_raw = bdev_get_ext_io_opt(opts, nvme_cdw13.raw, 0); 5831 if (md) { 5832 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5833 return -EINVAL; 5834 } 5835 5836 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5837 return -EINVAL; 5838 } 5839 5840 if (spdk_unlikely(seq != NULL)) { 5841 return -EINVAL; 5842 } 5843 } 5844 } 5845 5846 dif_check_flags = bdev->dif_check_flags & 5847 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5848 5849 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5850 domain, domain_ctx, seq, dif_check_flags, 5851 nvme_cdw12_raw, nvme_cdw13_raw, cb, cb_arg); 5852 } 5853 5854 static void 5855 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5856 { 5857 struct spdk_bdev_io *parent_io = cb_arg; 5858 struct spdk_bdev *bdev = parent_io->bdev; 5859 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5860 int i, rc = 0; 5861 5862 if (!success) { 5863 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5864 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5865 spdk_bdev_free_io(bdev_io); 5866 return; 5867 } 5868 5869 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5870 rc = memcmp(read_buf, 5871 parent_io->u.bdev.iovs[i].iov_base, 5872 parent_io->u.bdev.iovs[i].iov_len); 5873 if (rc) { 5874 break; 5875 } 5876 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5877 } 5878 5879 if (rc == 0 && parent_io->u.bdev.md_buf && 
spdk_bdev_is_md_separate(bdev)) { 5880 rc = memcmp(bdev_io->u.bdev.md_buf, 5881 parent_io->u.bdev.md_buf, 5882 spdk_bdev_get_md_size(bdev)); 5883 } 5884 5885 spdk_bdev_free_io(bdev_io); 5886 5887 if (rc == 0) { 5888 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5889 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5890 } else { 5891 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5892 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5893 } 5894 } 5895 5896 static void 5897 bdev_compare_do_read(void *_bdev_io) 5898 { 5899 struct spdk_bdev_io *bdev_io = _bdev_io; 5900 int rc; 5901 5902 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5903 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5904 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5905 bdev_compare_do_read_done, bdev_io); 5906 5907 if (rc == -ENOMEM) { 5908 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5909 } else if (rc != 0) { 5910 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5911 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5912 } 5913 } 5914 5915 static int 5916 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5917 struct iovec *iov, int iovcnt, void *md_buf, 5918 uint64_t offset_blocks, uint64_t num_blocks, 5919 spdk_bdev_io_completion_cb cb, void *cb_arg) 5920 { 5921 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5922 struct spdk_bdev_io *bdev_io; 5923 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5924 5925 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5926 return -EINVAL; 5927 } 5928 5929 bdev_io = bdev_channel_get_io(channel); 5930 if (!bdev_io) { 5931 return -ENOMEM; 5932 } 5933 5934 bdev_io->internal.ch = channel; 5935 bdev_io->internal.desc = desc; 5936 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5937 bdev_io->u.bdev.iovs = iov; 5938 bdev_io->u.bdev.iovcnt = iovcnt; 5939 bdev_io->u.bdev.md_buf = md_buf; 5940 bdev_io->u.bdev.num_blocks = num_blocks; 5941 bdev_io->u.bdev.offset_blocks = offset_blocks; 5942 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5943 bdev_io->u.bdev.memory_domain = NULL; 5944 bdev_io->u.bdev.memory_domain_ctx = NULL; 5945 bdev_io->u.bdev.accel_sequence = NULL; 5946 5947 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5948 bdev_io_submit(bdev_io); 5949 return 0; 5950 } 5951 5952 bdev_compare_do_read(bdev_io); 5953 5954 return 0; 5955 } 5956 5957 int 5958 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5959 struct iovec *iov, int iovcnt, 5960 uint64_t offset_blocks, uint64_t num_blocks, 5961 spdk_bdev_io_completion_cb cb, void *cb_arg) 5962 { 5963 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5964 num_blocks, cb, cb_arg); 5965 } 5966 5967 int 5968 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5969 struct iovec *iov, int iovcnt, void *md_buf, 5970 uint64_t offset_blocks, uint64_t num_blocks, 5971 spdk_bdev_io_completion_cb cb, void *cb_arg) 5972 { 5973 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5974 return -EINVAL; 5975 } 5976 5977 if (md_buf && !_is_buf_allocated(iov)) { 5978 return -EINVAL; 5979 } 5980 5981 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5982 num_blocks, cb, cb_arg); 5983 } 5984 5985 static int 5986 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5987 void 
*buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5988 spdk_bdev_io_completion_cb cb, void *cb_arg) 5989 { 5990 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5991 struct spdk_bdev_io *bdev_io; 5992 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5993 5994 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5995 return -EINVAL; 5996 } 5997 5998 bdev_io = bdev_channel_get_io(channel); 5999 if (!bdev_io) { 6000 return -ENOMEM; 6001 } 6002 6003 bdev_io->internal.ch = channel; 6004 bdev_io->internal.desc = desc; 6005 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 6006 bdev_io->u.bdev.iovs = &bdev_io->iov; 6007 bdev_io->u.bdev.iovs[0].iov_base = buf; 6008 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 6009 bdev_io->u.bdev.iovcnt = 1; 6010 bdev_io->u.bdev.md_buf = md_buf; 6011 bdev_io->u.bdev.num_blocks = num_blocks; 6012 bdev_io->u.bdev.offset_blocks = offset_blocks; 6013 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6014 bdev_io->u.bdev.memory_domain = NULL; 6015 bdev_io->u.bdev.memory_domain_ctx = NULL; 6016 bdev_io->u.bdev.accel_sequence = NULL; 6017 6018 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 6019 bdev_io_submit(bdev_io); 6020 return 0; 6021 } 6022 6023 bdev_compare_do_read(bdev_io); 6024 6025 return 0; 6026 } 6027 6028 int 6029 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6030 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 6031 spdk_bdev_io_completion_cb cb, void *cb_arg) 6032 { 6033 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 6034 cb, cb_arg); 6035 } 6036 6037 int 6038 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6039 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 6040 spdk_bdev_io_completion_cb cb, void *cb_arg) 6041 { 6042 struct iovec iov = { 6043 .iov_base = buf, 6044 }; 6045 6046 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 6047 return -EINVAL; 6048 } 6049 6050 if (md_buf && !_is_buf_allocated(&iov)) { 6051 return -EINVAL; 6052 } 6053 6054 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 6055 cb, cb_arg); 6056 } 6057 6058 static void 6059 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status) 6060 { 6061 struct spdk_bdev_io *bdev_io = ctx; 6062 6063 if (unlock_status) { 6064 SPDK_ERRLOG("LBA range unlock failed\n"); 6065 } 6066 6067 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 6068 false, bdev_io->internal.caller_ctx); 6069 } 6070 6071 static void 6072 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 6073 { 6074 bdev_io->internal.status = status; 6075 6076 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 6077 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6078 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 6079 } 6080 6081 static void 6082 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6083 { 6084 struct spdk_bdev_io *parent_io = cb_arg; 6085 6086 if (!success) { 6087 SPDK_ERRLOG("Compare and write operation failed\n"); 6088 } 6089 6090 spdk_bdev_free_io(bdev_io); 6091 6092 bdev_comparev_and_writev_blocks_unlock(parent_io, 6093 success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 6094 } 6095 6096 static void 6097 bdev_compare_and_write_do_write(void *_bdev_io) 6098 { 6099 struct spdk_bdev_io *bdev_io = _bdev_io; 6100 int rc; 6101 6102 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 6103 spdk_io_channel_from_ctx(bdev_io->internal.ch), 6104 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 6105 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6106 bdev_compare_and_write_do_write_done, bdev_io); 6107 6108 6109 if (rc == -ENOMEM) { 6110 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 6111 } else if (rc != 0) { 6112 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6113 } 6114 } 6115 6116 static void 6117 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6118 { 6119 struct spdk_bdev_io *parent_io = cb_arg; 6120 6121 spdk_bdev_free_io(bdev_io); 6122 6123 if (!success) { 6124 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 6125 return; 6126 } 6127 6128 bdev_compare_and_write_do_write(parent_io); 6129 } 6130 6131 static void 6132 bdev_compare_and_write_do_compare(void *_bdev_io) 6133 { 6134 struct spdk_bdev_io *bdev_io = _bdev_io; 6135 int rc; 6136 6137 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 6138 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 6139 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6140 bdev_compare_and_write_do_compare_done, bdev_io); 6141 6142 if (rc == -ENOMEM) { 6143 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 6144 } else if (rc != 0) { 6145 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 6146 } 6147 } 6148 6149 static void 6150 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status) 6151 { 6152 struct spdk_bdev_io *bdev_io = ctx; 6153 6154 if (status) { 6155 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 6156 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6157 return; 6158 } 6159 6160 bdev_compare_and_write_do_compare(bdev_io); 6161 } 6162 6163 int 6164 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6165 struct iovec *compare_iov, int compare_iovcnt, 6166 struct iovec *write_iov, int write_iovcnt, 6167 uint64_t offset_blocks, uint64_t num_blocks, 6168 spdk_bdev_io_completion_cb cb, void *cb_arg) 6169 { 6170 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6171 struct spdk_bdev_io *bdev_io; 6172 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6173 6174 if (!desc->write) { 6175 return -EBADF; 6176 } 6177 6178 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6179 return -EINVAL; 6180 } 6181 6182 if (num_blocks > bdev->acwu) { 6183 return -EINVAL; 6184 } 6185 6186 bdev_io = bdev_channel_get_io(channel); 6187 if (!bdev_io) { 6188 return -ENOMEM; 6189 } 6190 6191 bdev_io->internal.ch = channel; 6192 bdev_io->internal.desc = desc; 6193 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 6194 bdev_io->u.bdev.iovs = compare_iov; 6195 bdev_io->u.bdev.iovcnt = compare_iovcnt; 6196 bdev_io->u.bdev.fused_iovs = write_iov; 6197 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 6198 bdev_io->u.bdev.md_buf = NULL; 6199 bdev_io->u.bdev.num_blocks = num_blocks; 6200 bdev_io->u.bdev.offset_blocks = offset_blocks; 6201 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6202 
bdev_io->u.bdev.memory_domain = NULL; 6203 bdev_io->u.bdev.memory_domain_ctx = NULL; 6204 bdev_io->u.bdev.accel_sequence = NULL; 6205 6206 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 6207 bdev_io_submit(bdev_io); 6208 return 0; 6209 } 6210 6211 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 6212 bdev_comparev_and_writev_blocks_locked, bdev_io); 6213 } 6214 6215 int 6216 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6217 struct iovec *iov, int iovcnt, 6218 uint64_t offset_blocks, uint64_t num_blocks, 6219 bool populate, 6220 spdk_bdev_io_completion_cb cb, void *cb_arg) 6221 { 6222 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6223 struct spdk_bdev_io *bdev_io; 6224 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6225 6226 if (!desc->write) { 6227 return -EBADF; 6228 } 6229 6230 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6231 return -EINVAL; 6232 } 6233 6234 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 6235 return -ENOTSUP; 6236 } 6237 6238 bdev_io = bdev_channel_get_io(channel); 6239 if (!bdev_io) { 6240 return -ENOMEM; 6241 } 6242 6243 bdev_io->internal.ch = channel; 6244 bdev_io->internal.desc = desc; 6245 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 6246 bdev_io->u.bdev.num_blocks = num_blocks; 6247 bdev_io->u.bdev.offset_blocks = offset_blocks; 6248 bdev_io->u.bdev.iovs = iov; 6249 bdev_io->u.bdev.iovcnt = iovcnt; 6250 bdev_io->u.bdev.md_buf = NULL; 6251 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 6252 bdev_io->u.bdev.zcopy.commit = 0; 6253 bdev_io->u.bdev.zcopy.start = 1; 6254 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6255 bdev_io->u.bdev.memory_domain = NULL; 6256 bdev_io->u.bdev.memory_domain_ctx = NULL; 6257 bdev_io->u.bdev.accel_sequence = NULL; 6258 6259 bdev_io_submit(bdev_io); 6260 6261 return 0; 6262 } 6263 6264 int 6265 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 6266 spdk_bdev_io_completion_cb cb, void *cb_arg) 6267 { 6268 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 6269 return -EINVAL; 6270 } 6271 6272 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0;
6273 bdev_io->u.bdev.zcopy.start = 0;
6274 bdev_io->internal.caller_ctx = cb_arg;
6275 bdev_io->internal.cb = cb;
6276 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
6277
6278 bdev_io_submit(bdev_io);
6279
6280 return 0;
6281 }
6282
6283 int
6284 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6285 uint64_t offset, uint64_t len,
6286 spdk_bdev_io_completion_cb cb, void *cb_arg)
6287 {
6288 uint64_t offset_blocks, num_blocks;
6289
6290 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
6291 len, &num_blocks) != 0) {
6292 return -EINVAL;
6293 }
6294
6295 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
6296 }
6297
6298 int
6299 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6300 uint64_t offset_blocks, uint64_t num_blocks,
6301 spdk_bdev_io_completion_cb cb, void *cb_arg)
6302 {
6303 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6304 struct spdk_bdev_io *bdev_io;
6305 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6306
6307 if (!desc->write) {
6308 return -EBADF;
6309 }
6310
6311 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6312 return -EINVAL;
6313 }
6314
6315 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) &&
6316 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) {
6317 return -ENOTSUP;
6318 }
6319
6320 bdev_io = bdev_channel_get_io(channel);
6321
6322 if (!bdev_io) {
6323 return -ENOMEM;
6324 }
6325
6326 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
6327 bdev_io->internal.ch = channel;
6328 bdev_io->internal.desc = desc;
6329 bdev_io->u.bdev.offset_blocks = offset_blocks;
6330 bdev_io->u.bdev.num_blocks = num_blocks;
6331 bdev_io_init(bdev_io, bdev, cb_arg, cb);
6332 bdev_io->u.bdev.memory_domain = NULL;
6333 bdev_io->u.bdev.memory_domain_ctx = NULL;
6334 bdev_io->u.bdev.accel_sequence = NULL;
6335
6336 /* If the write_zeroes size is large and should be split, use the generic split
6337 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported or not.
6338 *
6339 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported
6340 * or emulate it using a regular write request otherwise.
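*
* Hedged usage sketch (zero_done is a hypothetical spdk_bdev_io_completion_cb
* provided by the caller): zeroing an entire bdev in block units looks like
*
*     rc = spdk_bdev_write_zeroes_blocks(desc, ch, 0,
*                                        spdk_bdev_get_num_blocks(bdev),
*                                        zero_done, NULL);
*
* and a -ENOMEM return may be retried later via spdk_bdev_queue_io_wait().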
6341 */ 6342 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 6343 bdev_io->internal.f.split) { 6344 bdev_io_submit(bdev_io); 6345 return 0; 6346 } 6347 6348 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6349 6350 return bdev_write_zero_buffer(bdev_io); 6351 } 6352 6353 int 6354 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6355 uint64_t offset, uint64_t nbytes, 6356 spdk_bdev_io_completion_cb cb, void *cb_arg) 6357 { 6358 uint64_t offset_blocks, num_blocks; 6359 6360 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6361 nbytes, &num_blocks) != 0) { 6362 return -EINVAL; 6363 } 6364 6365 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6366 } 6367 6368 static void 6369 bdev_io_complete_cb(void *ctx) 6370 { 6371 struct spdk_bdev_io *bdev_io = ctx; 6372 6373 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6374 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 6375 } 6376 6377 int 6378 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6379 uint64_t offset_blocks, uint64_t num_blocks, 6380 spdk_bdev_io_completion_cb cb, void *cb_arg) 6381 { 6382 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6383 struct spdk_bdev_io *bdev_io; 6384 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6385 6386 if (!desc->write) { 6387 return -EBADF; 6388 } 6389 6390 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6391 return -EINVAL; 6392 } 6393 6394 bdev_io = bdev_channel_get_io(channel); 6395 if (!bdev_io) { 6396 return -ENOMEM; 6397 } 6398 6399 bdev_io->internal.ch = channel; 6400 bdev_io->internal.desc = desc; 6401 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6402 6403 bdev_io->u.bdev.iovs = &bdev_io->iov; 6404 bdev_io->u.bdev.iovs[0].iov_base = NULL; 6405 bdev_io->u.bdev.iovs[0].iov_len = 0; 6406 bdev_io->u.bdev.iovcnt = 1; 6407 6408 bdev_io->u.bdev.offset_blocks = offset_blocks; 6409 bdev_io->u.bdev.num_blocks = num_blocks; 6410 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6411 bdev_io->u.bdev.memory_domain = NULL; 6412 bdev_io->u.bdev.memory_domain_ctx = NULL; 6413 bdev_io->u.bdev.accel_sequence = NULL; 6414 6415 if (num_blocks == 0) { 6416 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 6417 return 0; 6418 } 6419 6420 bdev_io_submit(bdev_io); 6421 return 0; 6422 } 6423 6424 int 6425 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6426 uint64_t offset, uint64_t length, 6427 spdk_bdev_io_completion_cb cb, void *cb_arg) 6428 { 6429 uint64_t offset_blocks, num_blocks; 6430 6431 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6432 length, &num_blocks) != 0) { 6433 return -EINVAL; 6434 } 6435 6436 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6437 } 6438 6439 int 6440 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6441 uint64_t offset_blocks, uint64_t num_blocks, 6442 spdk_bdev_io_completion_cb cb, void *cb_arg) 6443 { 6444 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6445 struct spdk_bdev_io *bdev_io; 6446 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6447 6448 if (!desc->write) { 6449 return -EBADF; 6450 } 6451 6452 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6453 return -EINVAL; 6454 } 6455 6456 bdev_io = bdev_channel_get_io(channel); 6457 if (!bdev_io) { 6458 return -ENOMEM; 6459 } 6460 6461 bdev_io->internal.ch = 
channel; 6462 bdev_io->internal.desc = desc; 6463 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 6464 bdev_io->u.bdev.iovs = NULL; 6465 bdev_io->u.bdev.iovcnt = 0; 6466 bdev_io->u.bdev.offset_blocks = offset_blocks; 6467 bdev_io->u.bdev.num_blocks = num_blocks; 6468 bdev_io->u.bdev.memory_domain = NULL; 6469 bdev_io->u.bdev.memory_domain_ctx = NULL; 6470 bdev_io->u.bdev.accel_sequence = NULL; 6471 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6472 6473 bdev_io_submit(bdev_io); 6474 return 0; 6475 } 6476 6477 static int bdev_reset_poll_for_outstanding_io(void *ctx); 6478 6479 static void 6480 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 6481 { 6482 struct spdk_bdev_channel *ch = _ctx; 6483 struct spdk_bdev_io *bdev_io; 6484 6485 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6486 6487 if (status == -EBUSY) { 6488 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 6489 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 6490 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 6491 } else { 6492 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6493 6494 if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) { 6495 /* If outstanding IOs are still present and reset_io_drain_timeout 6496 * seconds passed, start the reset. */ 6497 bdev_io_submit_reset(bdev_io); 6498 } else { 6499 /* We still have in progress memory domain pull/push or we're 6500 * executing accel sequence. Since we cannot abort either of those 6501 * operations, fail the reset request. */ 6502 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6503 } 6504 } 6505 } else { 6506 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6507 SPDK_DEBUGLOG(bdev, 6508 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 6509 ch->bdev->name); 6510 /* Mark the completion status as a SUCCESS and complete the reset. */ 6511 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 6512 } 6513 } 6514 6515 static void 6516 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6517 struct spdk_io_channel *io_ch, void *_ctx) 6518 { 6519 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 6520 int status = 0; 6521 6522 if (cur_ch->io_outstanding > 0 || 6523 !TAILQ_EMPTY(&cur_ch->io_memory_domain) || 6524 !TAILQ_EMPTY(&cur_ch->io_accel_exec)) { 6525 /* If a channel has outstanding IO, set status to -EBUSY code. This will stop 6526 * further iteration over the rest of the channels and pass non-zero status 6527 * to the callback function. 
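*
* (Summary of the iterator contract this relies on, added as an aid: each
* per-channel callback must finish with
*
*     spdk_bdev_for_each_channel_continue(i, status);
*
* and the first non-zero status ends the walk early and becomes the 'status'
* argument of bdev_reset_check_outstanding_io_done().)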
*/ 6528 status = -EBUSY; 6529 } 6530 spdk_bdev_for_each_channel_continue(i, status); 6531 } 6532 6533 static int 6534 bdev_reset_poll_for_outstanding_io(void *ctx) 6535 { 6536 struct spdk_bdev_channel *ch = ctx; 6537 struct spdk_bdev_io *bdev_io; 6538 6539 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6540 6541 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6542 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6543 bdev_reset_check_outstanding_io_done); 6544 6545 return SPDK_POLLER_BUSY; 6546 } 6547 6548 static void 6549 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6550 { 6551 struct spdk_bdev_channel *ch = _ctx; 6552 struct spdk_bdev_io *bdev_io; 6553 6554 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6555 6556 if (bdev->reset_io_drain_timeout == 0) { 6557 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6558 6559 bdev_io_submit_reset(bdev_io); 6560 return; 6561 } 6562 6563 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6564 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6565 6566 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6567 * submit the reset to the underlying module only if outstanding I/O 6568 * remain after reset_io_drain_timeout seconds have passed. */ 6569 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6570 bdev_reset_check_outstanding_io_done); 6571 } 6572 6573 static void 6574 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6575 struct spdk_io_channel *ch, void *_ctx) 6576 { 6577 struct spdk_bdev_channel *channel; 6578 struct spdk_bdev_mgmt_channel *mgmt_channel; 6579 struct spdk_bdev_shared_resource *shared_resource; 6580 bdev_io_tailq_t tmp_queued; 6581 6582 TAILQ_INIT(&tmp_queued); 6583 6584 channel = __io_ch_to_bdev_ch(ch); 6585 shared_resource = channel->shared_resource; 6586 mgmt_channel = shared_resource->mgmt_ch; 6587 6588 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6589 6590 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6591 TAILQ_SWAP(&channel->qos_queued_io, &tmp_queued, spdk_bdev_io, internal.link); 6592 } 6593 6594 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6595 bdev_abort_all_buf_io(mgmt_channel, channel); 6596 bdev_abort_all_queued_io(&tmp_queued, channel); 6597 6598 spdk_bdev_for_each_channel_continue(i, 0); 6599 } 6600 6601 static void 6602 bdev_start_reset(void *ctx) 6603 { 6604 struct spdk_bdev_channel *ch = ctx; 6605 6606 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, 6607 bdev_reset_freeze_channel_done); 6608 } 6609 6610 static void 6611 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 6612 { 6613 struct spdk_bdev *bdev = ch->bdev; 6614 6615 assert(!TAILQ_EMPTY(&ch->queued_resets)); 6616 6617 spdk_spin_lock(&bdev->internal.spinlock); 6618 if (bdev->internal.reset_in_progress == NULL) { 6619 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 6620 /* 6621 * Take a channel reference for the target bdev for the life of this 6622 * reset. This guards against the channel getting destroyed while 6623 * spdk_bdev_for_each_channel() calls related to this reset IO are in 6624 * progress. We will release the reference when this reset is 6625 * completed. 
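*
* For context, the public entry point that reaches this path is
* spdk_bdev_reset(); a hedged caller-side sketch, with reset_done being a
* hypothetical completion callback:
*
*     rc = spdk_bdev_reset(desc, io_ch, reset_done, NULL);
*
* Every channel of the bdev is frozen and its queued I/O aborted before the
* reset is handed to the module, so the reference taken below must stay valid
* for the whole spdk_bdev_for_each_channel() sequence.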
6626 */ 6627 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6628 bdev_start_reset(ch); 6629 } 6630 spdk_spin_unlock(&bdev->internal.spinlock); 6631 } 6632 6633 int 6634 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6635 spdk_bdev_io_completion_cb cb, void *cb_arg) 6636 { 6637 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6638 struct spdk_bdev_io *bdev_io; 6639 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6640 6641 bdev_io = bdev_channel_get_io(channel); 6642 if (!bdev_io) { 6643 return -ENOMEM; 6644 } 6645 6646 bdev_io->internal.ch = channel; 6647 bdev_io->internal.desc = desc; 6648 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6649 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6650 bdev_io->u.reset.ch_ref = NULL; 6651 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6652 6653 spdk_spin_lock(&bdev->internal.spinlock); 6654 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 6655 spdk_spin_unlock(&bdev->internal.spinlock); 6656 6657 bdev_ch_add_to_io_submitted(bdev_io); 6658 6659 bdev_channel_start_reset(channel); 6660 6661 return 0; 6662 } 6663 6664 void 6665 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6666 struct spdk_bdev_io_stat *stat) 6667 { 6668 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6669 6670 bdev_get_io_stat(stat, channel->stat); 6671 } 6672 6673 static void 6674 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6675 { 6676 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6677 6678 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6679 bdev_iostat_ctx->cb_arg, 0); 6680 free(bdev_iostat_ctx); 6681 } 6682 6683 static void 6684 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6685 struct spdk_io_channel *ch, void *_ctx) 6686 { 6687 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6688 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6689 6690 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6691 spdk_bdev_for_each_channel_continue(i, 0); 6692 } 6693 6694 void 6695 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6696 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6697 { 6698 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6699 6700 assert(bdev != NULL); 6701 assert(stat != NULL); 6702 assert(cb != NULL); 6703 6704 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6705 if (bdev_iostat_ctx == NULL) { 6706 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6707 cb(bdev, stat, cb_arg, -ENOMEM); 6708 return; 6709 } 6710 6711 bdev_iostat_ctx->stat = stat; 6712 bdev_iostat_ctx->cb = cb; 6713 bdev_iostat_ctx->cb_arg = cb_arg; 6714 6715 /* Start with the statistics from previously deleted channels. */ 6716 spdk_spin_lock(&bdev->internal.spinlock); 6717 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6718 spdk_spin_unlock(&bdev->internal.spinlock); 6719 6720 /* Then iterate and add the statistics from each existing channel. 
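*
* Hedged usage sketch (device_stat_done and the calloc()'d stat buffer are
* assumptions made only for this example):
*
*     static void
*     device_stat_done(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
*                      void *cb_arg, int rc)
*     {
*             if (rc == 0) {
*                     printf("%s: read %ju bytes\n", spdk_bdev_get_name(bdev),
*                            (uintmax_t)stat->bytes_read);
*             }
*             free(stat);
*     }
*
*     stat = calloc(1, sizeof(*stat));
*     spdk_bdev_get_device_stat(bdev, stat, device_stat_done, NULL);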
*/ 6721 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6722 bdev_get_device_stat_done); 6723 } 6724 6725 struct bdev_iostat_reset_ctx { 6726 enum spdk_bdev_reset_stat_mode mode; 6727 bdev_reset_device_stat_cb cb; 6728 void *cb_arg; 6729 }; 6730 6731 static void 6732 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6733 { 6734 struct bdev_iostat_reset_ctx *ctx = _ctx; 6735 6736 ctx->cb(bdev, ctx->cb_arg, 0); 6737 6738 free(ctx); 6739 } 6740 6741 static void 6742 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6743 struct spdk_io_channel *ch, void *_ctx) 6744 { 6745 struct bdev_iostat_reset_ctx *ctx = _ctx; 6746 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6747 6748 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6749 6750 spdk_bdev_for_each_channel_continue(i, 0); 6751 } 6752 6753 void 6754 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6755 bdev_reset_device_stat_cb cb, void *cb_arg) 6756 { 6757 struct bdev_iostat_reset_ctx *ctx; 6758 6759 assert(bdev != NULL); 6760 assert(cb != NULL); 6761 6762 ctx = calloc(1, sizeof(*ctx)); 6763 if (ctx == NULL) { 6764 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6765 cb(bdev, cb_arg, -ENOMEM); 6766 return; 6767 } 6768 6769 ctx->mode = mode; 6770 ctx->cb = cb; 6771 ctx->cb_arg = cb_arg; 6772 6773 spdk_spin_lock(&bdev->internal.spinlock); 6774 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6775 spdk_spin_unlock(&bdev->internal.spinlock); 6776 6777 spdk_bdev_for_each_channel(bdev, 6778 bdev_reset_each_channel_stat, 6779 ctx, 6780 bdev_reset_device_stat_done); 6781 } 6782 6783 int 6784 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6785 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6786 spdk_bdev_io_completion_cb cb, void *cb_arg) 6787 { 6788 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6789 struct spdk_bdev_io *bdev_io; 6790 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6791 6792 if (!desc->write) { 6793 return -EBADF; 6794 } 6795 6796 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6797 return -ENOTSUP; 6798 } 6799 6800 bdev_io = bdev_channel_get_io(channel); 6801 if (!bdev_io) { 6802 return -ENOMEM; 6803 } 6804 6805 bdev_io->internal.ch = channel; 6806 bdev_io->internal.desc = desc; 6807 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6808 bdev_io->u.nvme_passthru.cmd = *cmd; 6809 bdev_io->u.nvme_passthru.buf = buf; 6810 bdev_io->u.nvme_passthru.nbytes = nbytes; 6811 bdev_io->u.nvme_passthru.md_buf = NULL; 6812 bdev_io->u.nvme_passthru.md_len = 0; 6813 6814 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6815 6816 bdev_io_submit(bdev_io); 6817 return 0; 6818 } 6819 6820 int 6821 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6822 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6823 spdk_bdev_io_completion_cb cb, void *cb_arg) 6824 { 6825 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6826 struct spdk_bdev_io *bdev_io; 6827 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6828 6829 if (!desc->write) { 6830 /* 6831 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6832 * to easily determine if the command is a read or write, but for now just 6833 * do not allow io_passthru with a read-only descriptor. 
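*
* Hedged caller-side example (the opcode, nsid, buffer and pt_done callback
* are shown only for illustration and are entirely the caller's
* responsibility):
*
*     struct spdk_nvme_cmd cmd = {};
*
*     cmd.opc = SPDK_NVME_OPC_FLUSH;
*     cmd.nsid = 1;
*     rc = spdk_bdev_nvme_io_passthru(desc, ch, &cmd, NULL, 0, pt_done, NULL);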
6834 */ 6835 return -EBADF; 6836 } 6837 6838 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6839 return -ENOTSUP; 6840 } 6841 6842 bdev_io = bdev_channel_get_io(channel); 6843 if (!bdev_io) { 6844 return -ENOMEM; 6845 } 6846 6847 bdev_io->internal.ch = channel; 6848 bdev_io->internal.desc = desc; 6849 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6850 bdev_io->u.nvme_passthru.cmd = *cmd; 6851 bdev_io->u.nvme_passthru.buf = buf; 6852 bdev_io->u.nvme_passthru.nbytes = nbytes; 6853 bdev_io->u.nvme_passthru.md_buf = NULL; 6854 bdev_io->u.nvme_passthru.md_len = 0; 6855 6856 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6857 6858 bdev_io_submit(bdev_io); 6859 return 0; 6860 } 6861 6862 int 6863 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6864 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6865 spdk_bdev_io_completion_cb cb, void *cb_arg) 6866 { 6867 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6868 struct spdk_bdev_io *bdev_io; 6869 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6870 6871 if (!desc->write) { 6872 /* 6873 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6874 * to easily determine if the command is a read or write, but for now just 6875 * do not allow io_passthru with a read-only descriptor. 6876 */ 6877 return -EBADF; 6878 } 6879 6880 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6881 return -ENOTSUP; 6882 } 6883 6884 bdev_io = bdev_channel_get_io(channel); 6885 if (!bdev_io) { 6886 return -ENOMEM; 6887 } 6888 6889 bdev_io->internal.ch = channel; 6890 bdev_io->internal.desc = desc; 6891 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6892 bdev_io->u.nvme_passthru.cmd = *cmd; 6893 bdev_io->u.nvme_passthru.buf = buf; 6894 bdev_io->u.nvme_passthru.nbytes = nbytes; 6895 bdev_io->u.nvme_passthru.md_buf = md_buf; 6896 bdev_io->u.nvme_passthru.md_len = md_len; 6897 6898 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6899 6900 bdev_io_submit(bdev_io); 6901 return 0; 6902 } 6903 6904 int 6905 spdk_bdev_nvme_iov_passthru_md(struct spdk_bdev_desc *desc, 6906 struct spdk_io_channel *ch, 6907 const struct spdk_nvme_cmd *cmd, 6908 struct iovec *iov, int iovcnt, size_t nbytes, 6909 void *md_buf, size_t md_len, 6910 spdk_bdev_io_completion_cb cb, void *cb_arg) 6911 { 6912 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6913 struct spdk_bdev_io *bdev_io; 6914 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6915 6916 if (!desc->write) { 6917 /* 6918 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6919 * to easily determine if the command is a read or write, but for now just 6920 * do not allow io_passthru with a read-only descriptor. 
6921 */ 6922 return -EBADF; 6923 } 6924 6925 if (md_buf && spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6926 return -ENOTSUP; 6927 } else if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6928 return -ENOTSUP; 6929 } 6930 6931 bdev_io = bdev_channel_get_io(channel); 6932 if (!bdev_io) { 6933 return -ENOMEM; 6934 } 6935 6936 bdev_io->internal.ch = channel; 6937 bdev_io->internal.desc = desc; 6938 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IOV_MD; 6939 bdev_io->u.nvme_passthru.cmd = *cmd; 6940 bdev_io->u.nvme_passthru.iovs = iov; 6941 bdev_io->u.nvme_passthru.iovcnt = iovcnt; 6942 bdev_io->u.nvme_passthru.nbytes = nbytes; 6943 bdev_io->u.nvme_passthru.md_buf = md_buf; 6944 bdev_io->u.nvme_passthru.md_len = md_len; 6945 6946 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6947 6948 bdev_io_submit(bdev_io); 6949 return 0; 6950 } 6951 6952 static void bdev_abort_retry(void *ctx); 6953 static void bdev_abort(struct spdk_bdev_io *parent_io); 6954 6955 static void 6956 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6957 { 6958 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6959 struct spdk_bdev_io *parent_io = cb_arg; 6960 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6961 6962 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6963 6964 spdk_bdev_free_io(bdev_io); 6965 6966 if (!success) { 6967 /* Check if the target I/O completed in the meantime. */ 6968 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6969 if (tmp_io == bio_to_abort) { 6970 break; 6971 } 6972 } 6973 6974 /* If the target I/O still exists, set the parent to failed. */ 6975 if (tmp_io != NULL) { 6976 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6977 } 6978 } 6979 6980 assert(parent_io->internal.f.split); 6981 6982 parent_io->internal.split.outstanding--; 6983 if (parent_io->internal.split.outstanding == 0) { 6984 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6985 bdev_abort_retry(parent_io); 6986 } else { 6987 bdev_io_complete(parent_io); 6988 } 6989 } 6990 } 6991 6992 static int 6993 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6994 struct spdk_bdev_io *bio_to_abort, 6995 spdk_bdev_io_completion_cb cb, void *cb_arg) 6996 { 6997 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6998 struct spdk_bdev_io *bdev_io; 6999 7000 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 7001 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 7002 /* TODO: Abort reset or abort request. */ 7003 return -ENOTSUP; 7004 } 7005 7006 bdev_io = bdev_channel_get_io(channel); 7007 if (bdev_io == NULL) { 7008 return -ENOMEM; 7009 } 7010 7011 bdev_io->internal.ch = channel; 7012 bdev_io->internal.desc = desc; 7013 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7014 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7015 7016 if (bio_to_abort->internal.f.split) { 7017 assert(bdev_io_should_split(bio_to_abort)); 7018 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 7019 7020 /* Parent abort request is not submitted directly, but to manage its 7021 * execution add it to the submitted list here. 7022 */ 7023 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7024 bdev_ch_add_to_io_submitted(bdev_io); 7025 7026 bdev_abort(bdev_io); 7027 7028 return 0; 7029 } 7030 7031 bdev_io->u.abort.bio_to_abort = bio_to_abort; 7032 7033 /* Submit the abort request to the underlying bdev module. 
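*
* (Added note: the module receives an SPDK_BDEV_IO_TYPE_ABORT whose
* u.abort.bio_to_abort field points at the target I/O; bdev_abort_io_done()
* above then treats a failed abort as fatal for the parent request only if
* the target I/O is still on the channel's io_submitted list when the abort
* completes.)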
*/
7034 bdev_io_submit(bdev_io);
7035
7036 return 0;
7037 }
7038
7039 static bool
7040 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq)
7041 {
7042 struct spdk_bdev_io *iter;
7043
7044 TAILQ_FOREACH(iter, tailq, internal.link) {
7045 if (iter == bdev_io) {
7046 return true;
7047 }
7048 }
7049
7050 return false;
7051 }
7052
7053 static uint32_t
7054 _bdev_abort(struct spdk_bdev_io *parent_io)
7055 {
7056 struct spdk_bdev_desc *desc = parent_io->internal.desc;
7057 struct spdk_bdev_channel *channel = parent_io->internal.ch;
7058 void *bio_cb_arg;
7059 struct spdk_bdev_io *bio_to_abort;
7060 uint32_t matched_ios;
7061 int rc;
7062
7063 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg;
7064
7065 /* matched_ios is returned and will be kept by the caller.
7066 *
7067 * This function is used for two cases: 1) the same cb_arg is used for
7068 * multiple I/Os, 2) a single large I/O is split into smaller ones.
7069 * Incrementing split_outstanding directly here may confuse readers especially
7070 * for the 1st case.
7071 *
7072 * Completion of I/O abort is processed after stack unwinding. Hence this trick
7073 * works as expected.
7074 */
7075 matched_ios = 0;
7076 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
7077
7078 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) {
7079 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) {
7080 continue;
7081 }
7082
7083 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) {
7084 /* Any I/O which was submitted after this abort command should be excluded. */
7085 continue;
7086 }
7087
7088 /* We can't abort a request that's being pushed/pulled or executed by accel */
7089 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) ||
7090 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) {
7091 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7092 break;
7093 }
7094
7095 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io);
7096 if (rc != 0) {
7097 if (rc == -ENOMEM) {
7098 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM;
7099 } else {
7100 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7101 }
7102 break;
7103 }
7104 matched_ios++;
7105 }
7106
7107 return matched_ios;
7108 }
7109
7110 static void
7111 bdev_abort_retry(void *ctx)
7112 {
7113 struct spdk_bdev_io *parent_io = ctx;
7114 uint32_t matched_ios;
7115
7116 matched_ios = _bdev_abort(parent_io);
7117
7118 if (matched_ios == 0) {
7119 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
7120 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
7121 } else {
7122 /* For a retry, the case where no target I/O was found is a success,
7123 * because it means the target I/Os completed in the meantime.
7124 */
7125 bdev_io_complete(parent_io);
7126 }
7127 return;
7128 }
7129
7130 /* Use split_outstanding to manage the progress of aborting I/Os. */
7131 parent_io->internal.f.split = true;
7132 parent_io->internal.split.outstanding = matched_ios;
7133 }
7134
7135 static void
7136 bdev_abort(struct spdk_bdev_io *parent_io)
7137 {
7138 uint32_t matched_ios;
7139
7140 matched_ios = _bdev_abort(parent_io);
7141
7142 if (matched_ios == 0) {
7143 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
7144 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
7145 } else {
7146 /* The case where no target I/O was found is a failure.
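*
* For the caller-facing picture, a hedged sketch (io_ctx and abort_done are
* hypothetical): all I/Os that were submitted with cb_arg == io_ctx can be
* aborted as a group through the public wrapper below:
*
*     rc = spdk_bdev_abort(desc, io_ch, io_ctx, abort_done, NULL);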
*/ 7147 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7148 bdev_io_complete(parent_io); 7149 } 7150 return; 7151 } 7152 7153 /* Use split_outstanding to manage the progress of aborting I/Os. */ 7154 parent_io->internal.f.split = true; 7155 parent_io->internal.split.outstanding = matched_ios; 7156 } 7157 7158 int 7159 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 7160 void *bio_cb_arg, 7161 spdk_bdev_io_completion_cb cb, void *cb_arg) 7162 { 7163 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7164 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7165 struct spdk_bdev_io *bdev_io; 7166 7167 if (bio_cb_arg == NULL) { 7168 return -EINVAL; 7169 } 7170 7171 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 7172 return -ENOTSUP; 7173 } 7174 7175 bdev_io = bdev_channel_get_io(channel); 7176 if (bdev_io == NULL) { 7177 return -ENOMEM; 7178 } 7179 7180 bdev_io->internal.ch = channel; 7181 bdev_io->internal.desc = desc; 7182 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7183 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7184 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7185 7186 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 7187 7188 /* Parent abort request is not submitted directly, but to manage its execution, 7189 * add it to the submitted list here. 7190 */ 7191 bdev_ch_add_to_io_submitted(bdev_io); 7192 7193 bdev_abort(bdev_io); 7194 7195 return 0; 7196 } 7197 7198 int 7199 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 7200 struct spdk_bdev_io_wait_entry *entry) 7201 { 7202 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7203 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 7204 7205 if (bdev != entry->bdev) { 7206 SPDK_ERRLOG("bdevs do not match\n"); 7207 return -EINVAL; 7208 } 7209 7210 if (mgmt_ch->per_thread_cache_count > 0) { 7211 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 7212 return -EINVAL; 7213 } 7214 7215 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 7216 return 0; 7217 } 7218 7219 static inline void 7220 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 7221 { 7222 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 7223 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 7224 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 7225 uint32_t blocklen = bdev_io->bdev->blocklen; 7226 7227 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7228 switch (bdev_io->type) { 7229 case SPDK_BDEV_IO_TYPE_READ: 7230 io_stat->bytes_read += num_blocks * blocklen; 7231 io_stat->num_read_ops++; 7232 io_stat->read_latency_ticks += tsc_diff; 7233 if (io_stat->max_read_latency_ticks < tsc_diff) { 7234 io_stat->max_read_latency_ticks = tsc_diff; 7235 } 7236 if (io_stat->min_read_latency_ticks > tsc_diff) { 7237 io_stat->min_read_latency_ticks = tsc_diff; 7238 } 7239 break; 7240 case SPDK_BDEV_IO_TYPE_WRITE: 7241 io_stat->bytes_written += num_blocks * blocklen; 7242 io_stat->num_write_ops++; 7243 io_stat->write_latency_ticks += tsc_diff; 7244 if (io_stat->max_write_latency_ticks < tsc_diff) { 7245 io_stat->max_write_latency_ticks = tsc_diff; 7246 } 7247 if (io_stat->min_write_latency_ticks > tsc_diff) { 7248 io_stat->min_write_latency_ticks = tsc_diff; 7249 } 7250 break; 7251 case SPDK_BDEV_IO_TYPE_UNMAP: 7252 io_stat->bytes_unmapped += num_blocks * blocklen; 7253 io_stat->num_unmap_ops++; 7254 io_stat->unmap_latency_ticks += tsc_diff; 7255 if 
(io_stat->max_unmap_latency_ticks < tsc_diff) { 7256 io_stat->max_unmap_latency_ticks = tsc_diff; 7257 } 7258 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 7259 io_stat->min_unmap_latency_ticks = tsc_diff; 7260 } 7261 break; 7262 case SPDK_BDEV_IO_TYPE_ZCOPY: 7263 /* Track the data in the start phase only */ 7264 if (bdev_io->u.bdev.zcopy.start) { 7265 if (bdev_io->u.bdev.zcopy.populate) { 7266 io_stat->bytes_read += num_blocks * blocklen; 7267 io_stat->num_read_ops++; 7268 io_stat->read_latency_ticks += tsc_diff; 7269 if (io_stat->max_read_latency_ticks < tsc_diff) { 7270 io_stat->max_read_latency_ticks = tsc_diff; 7271 } 7272 if (io_stat->min_read_latency_ticks > tsc_diff) { 7273 io_stat->min_read_latency_ticks = tsc_diff; 7274 } 7275 } else { 7276 io_stat->bytes_written += num_blocks * blocklen; 7277 io_stat->num_write_ops++; 7278 io_stat->write_latency_ticks += tsc_diff; 7279 if (io_stat->max_write_latency_ticks < tsc_diff) { 7280 io_stat->max_write_latency_ticks = tsc_diff; 7281 } 7282 if (io_stat->min_write_latency_ticks > tsc_diff) { 7283 io_stat->min_write_latency_ticks = tsc_diff; 7284 } 7285 } 7286 } 7287 break; 7288 case SPDK_BDEV_IO_TYPE_COPY: 7289 io_stat->bytes_copied += num_blocks * blocklen; 7290 io_stat->num_copy_ops++; 7291 bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; 7292 if (io_stat->max_copy_latency_ticks < tsc_diff) { 7293 io_stat->max_copy_latency_ticks = tsc_diff; 7294 } 7295 if (io_stat->min_copy_latency_ticks > tsc_diff) { 7296 io_stat->min_copy_latency_ticks = tsc_diff; 7297 } 7298 break; 7299 default: 7300 break; 7301 } 7302 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 7303 io_stat = bdev_io->bdev->internal.stat; 7304 assert(io_stat->io_error != NULL); 7305 7306 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 7307 io_stat->io_error->error_status[-io_status - 1]++; 7308 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 7309 } 7310 7311 #ifdef SPDK_CONFIG_VTUNE 7312 uint64_t now_tsc = spdk_get_ticks(); 7313 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 7314 uint64_t data[5]; 7315 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 7316 7317 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 7318 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 7319 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 7320 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 7321 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 
7322 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 7323 7324 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 7325 __itt_metadata_u64, 5, data); 7326 7327 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 7328 bdev_io->internal.ch->start_tsc = now_tsc; 7329 } 7330 #endif 7331 } 7332 7333 static inline void 7334 _bdev_io_complete(void *ctx) 7335 { 7336 struct spdk_bdev_io *bdev_io = ctx; 7337 7338 if (spdk_unlikely(bdev_io_use_accel_sequence(bdev_io))) { 7339 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7340 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 7341 } 7342 7343 assert(bdev_io->internal.cb != NULL); 7344 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 7345 7346 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 7347 bdev_io->internal.caller_ctx); 7348 } 7349 7350 static inline void 7351 bdev_io_complete(void *ctx) 7352 { 7353 struct spdk_bdev_io *bdev_io = ctx; 7354 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7355 uint64_t tsc, tsc_diff; 7356 7357 if (spdk_unlikely(bdev_io->internal.in_submit_request)) { 7358 /* 7359 * Defer completion to avoid potential infinite recursion if the 7360 * user's completion callback issues a new I/O. 7361 */ 7362 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7363 bdev_io_complete, bdev_io); 7364 return; 7365 } 7366 7367 tsc = spdk_get_ticks(); 7368 tsc_diff = tsc - bdev_io->internal.submit_tsc; 7369 7370 bdev_ch_remove_from_io_submitted(bdev_io); 7371 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, bdev_ch->trace_id, 0, (uintptr_t)bdev_io, 7372 bdev_io->internal.caller_ctx, bdev_ch->queue_depth); 7373 7374 if (bdev_ch->histogram) { 7375 if (bdev_io->bdev->internal.histogram_io_type == 0 || 7376 bdev_io->bdev->internal.histogram_io_type == bdev_io->type) { 7377 /* 7378 * Tally all I/O types if the histogram_io_type is set to 0. 7379 */ 7380 spdk_histogram_data_tally(bdev_ch->histogram, tsc_diff); 7381 } 7382 } 7383 7384 bdev_io_update_io_stat(bdev_io, tsc_diff); 7385 _bdev_io_complete(bdev_io); 7386 } 7387 7388 /* The difference between this function and bdev_io_complete() is that this should be called to 7389 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 7390 * io_submitted list and don't have submit_tsc updated. 7391 */ 7392 static inline void 7393 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 7394 { 7395 /* Since the IO hasn't been submitted it's bound to be failed */ 7396 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7397 7398 /* At this point we don't know if the IO is completed from submission context or not, but, 7399 * since this is an error path, we can always do an spdk_thread_send_msg(). 
*/ 7400 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7401 _bdev_io_complete, bdev_io); 7402 } 7403 7404 static void bdev_destroy_cb(void *io_device); 7405 7406 static void 7407 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7408 { 7409 struct spdk_bdev_io *bdev_io = _ctx; 7410 7411 if (bdev_io->u.reset.ch_ref != NULL) { 7412 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7413 bdev_io->u.reset.ch_ref = NULL; 7414 } 7415 7416 bdev_io_complete(bdev_io); 7417 7418 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7419 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7420 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7421 } 7422 } 7423 7424 static void 7425 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7426 struct spdk_io_channel *_ch, void *_ctx) 7427 { 7428 struct spdk_bdev_io *bdev_io = _ctx; 7429 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7430 struct spdk_bdev_io *queued_reset; 7431 7432 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7433 while (!TAILQ_EMPTY(&ch->queued_resets)) { 7434 queued_reset = TAILQ_FIRST(&ch->queued_resets); 7435 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 7436 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 7437 } 7438 7439 spdk_bdev_for_each_channel_continue(i, 0); 7440 } 7441 7442 static void 7443 bdev_io_complete_sequence_cb(void *ctx, int status) 7444 { 7445 struct spdk_bdev_io *bdev_io = ctx; 7446 7447 /* u.bdev.accel_sequence should have already been cleared at this point */ 7448 assert(bdev_io->u.bdev.accel_sequence == NULL); 7449 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7450 bdev_io->internal.f.has_accel_sequence = false; 7451 7452 if (spdk_unlikely(status != 0)) { 7453 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7454 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7455 } 7456 7457 bdev_io_complete(bdev_io); 7458 } 7459 7460 void 7461 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7462 { 7463 struct spdk_bdev *bdev = bdev_io->bdev; 7464 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7465 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7466 7467 if (spdk_unlikely(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING)) { 7468 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7469 spdk_bdev_get_module_name(bdev), 7470 bdev_io_status_get_string(bdev_io->internal.status)); 7471 assert(false); 7472 } 7473 bdev_io->internal.status = status; 7474 7475 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7476 bool unlock_channels = false; 7477 7478 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 7479 SPDK_ERRLOG("NOMEM returned for reset\n"); 7480 } 7481 spdk_spin_lock(&bdev->internal.spinlock); 7482 if (bdev_io == bdev->internal.reset_in_progress) { 7483 bdev->internal.reset_in_progress = NULL; 7484 unlock_channels = true; 7485 } 7486 spdk_spin_unlock(&bdev->internal.spinlock); 7487 7488 if (unlock_channels) { 7489 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 7490 bdev_reset_complete); 7491 return; 7492 } 7493 } else { 7494 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7495 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7496 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7497 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7498 return; 7499 } else if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 
0 && 7500 !bdev_io_use_accel_sequence(bdev_io))) { 7501 _bdev_io_push_bounce_data_buffer(bdev_io, 7502 _bdev_io_complete_push_bounce_done); 7503 /* bdev IO will be completed in the callback */ 7504 return; 7505 } 7506 } 7507 7508 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7509 return; 7510 } 7511 } 7512 7513 bdev_io_complete(bdev_io); 7514 } 7515 7516 void 7517 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 7518 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7519 { 7520 enum spdk_bdev_io_status status; 7521 7522 if (sc == SPDK_SCSI_STATUS_GOOD) { 7523 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7524 } else { 7525 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7526 bdev_io->internal.error.scsi.sc = sc; 7527 bdev_io->internal.error.scsi.sk = sk; 7528 bdev_io->internal.error.scsi.asc = asc; 7529 bdev_io->internal.error.scsi.ascq = ascq; 7530 } 7531 7532 spdk_bdev_io_complete(bdev_io, status); 7533 } 7534 7535 void 7536 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7537 int *sc, int *sk, int *asc, int *ascq) 7538 { 7539 assert(sc != NULL); 7540 assert(sk != NULL); 7541 assert(asc != NULL); 7542 assert(ascq != NULL); 7543 7544 switch (bdev_io->internal.status) { 7545 case SPDK_BDEV_IO_STATUS_SUCCESS: 7546 *sc = SPDK_SCSI_STATUS_GOOD; 7547 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7548 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7549 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7550 break; 7551 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7552 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7553 break; 7554 case SPDK_BDEV_IO_STATUS_MISCOMPARE: 7555 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7556 *sk = SPDK_SCSI_SENSE_MISCOMPARE; 7557 *asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION; 7558 *ascq = bdev_io->internal.error.scsi.ascq; 7559 break; 7560 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7561 *sc = bdev_io->internal.error.scsi.sc; 7562 *sk = bdev_io->internal.error.scsi.sk; 7563 *asc = bdev_io->internal.error.scsi.asc; 7564 *ascq = bdev_io->internal.error.scsi.ascq; 7565 break; 7566 default: 7567 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7568 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7569 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7570 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7571 break; 7572 } 7573 } 7574 7575 void 7576 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7577 { 7578 enum spdk_bdev_io_status status; 7579 7580 if (aio_result == 0) { 7581 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7582 } else { 7583 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7584 } 7585 7586 bdev_io->internal.error.aio_result = aio_result; 7587 7588 spdk_bdev_io_complete(bdev_io, status); 7589 } 7590 7591 void 7592 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7593 { 7594 assert(aio_result != NULL); 7595 7596 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7597 *aio_result = bdev_io->internal.error.aio_result; 7598 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7599 *aio_result = 0; 7600 } else { 7601 *aio_result = -EIO; 7602 } 7603 } 7604 7605 void 7606 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7607 { 7608 enum spdk_bdev_io_status status; 7609 7610 if (spdk_likely(sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS)) { 7611 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7612 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7613 status = 
SPDK_BDEV_IO_STATUS_ABORTED; 7614 } else { 7615 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7616 } 7617 7618 bdev_io->internal.error.nvme.cdw0 = cdw0; 7619 bdev_io->internal.error.nvme.sct = sct; 7620 bdev_io->internal.error.nvme.sc = sc; 7621 7622 spdk_bdev_io_complete(bdev_io, status); 7623 } 7624 7625 void 7626 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7627 { 7628 assert(sct != NULL); 7629 assert(sc != NULL); 7630 assert(cdw0 != NULL); 7631 7632 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7633 *sct = SPDK_NVME_SCT_GENERIC; 7634 *sc = SPDK_NVME_SC_SUCCESS; 7635 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7636 *cdw0 = 0; 7637 } else { 7638 *cdw0 = 1U; 7639 } 7640 return; 7641 } 7642 7643 if (spdk_likely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7644 *sct = SPDK_NVME_SCT_GENERIC; 7645 *sc = SPDK_NVME_SC_SUCCESS; 7646 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7647 *sct = bdev_io->internal.error.nvme.sct; 7648 *sc = bdev_io->internal.error.nvme.sc; 7649 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7650 *sct = SPDK_NVME_SCT_GENERIC; 7651 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7652 } else { 7653 *sct = SPDK_NVME_SCT_GENERIC; 7654 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7655 } 7656 7657 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7658 } 7659 7660 void 7661 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7662 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7663 { 7664 assert(first_sct != NULL); 7665 assert(first_sc != NULL); 7666 assert(second_sct != NULL); 7667 assert(second_sc != NULL); 7668 assert(cdw0 != NULL); 7669 7670 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7671 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7672 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7673 *first_sct = bdev_io->internal.error.nvme.sct; 7674 *first_sc = bdev_io->internal.error.nvme.sc; 7675 *second_sct = SPDK_NVME_SCT_GENERIC; 7676 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7677 } else { 7678 *first_sct = SPDK_NVME_SCT_GENERIC; 7679 *first_sc = SPDK_NVME_SC_SUCCESS; 7680 *second_sct = bdev_io->internal.error.nvme.sct; 7681 *second_sc = bdev_io->internal.error.nvme.sc; 7682 } 7683 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7684 *first_sct = SPDK_NVME_SCT_GENERIC; 7685 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7686 *second_sct = SPDK_NVME_SCT_GENERIC; 7687 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7688 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7689 *first_sct = SPDK_NVME_SCT_GENERIC; 7690 *first_sc = SPDK_NVME_SC_SUCCESS; 7691 *second_sct = SPDK_NVME_SCT_GENERIC; 7692 *second_sc = SPDK_NVME_SC_SUCCESS; 7693 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7694 *first_sct = SPDK_NVME_SCT_GENERIC; 7695 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7696 *second_sct = SPDK_NVME_SCT_GENERIC; 7697 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7698 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7699 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7700 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 7701 *second_sct = SPDK_NVME_SCT_GENERIC; 7702 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7703 } else { 7704 *first_sct = SPDK_NVME_SCT_GENERIC; 7705 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7706 *second_sct = 
SPDK_NVME_SCT_GENERIC; 7707 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7708 } 7709 7710 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7711 } 7712 7713 void 7714 spdk_bdev_io_complete_base_io_status(struct spdk_bdev_io *bdev_io, 7715 const struct spdk_bdev_io *base_io) 7716 { 7717 switch (base_io->internal.status) { 7718 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7719 spdk_bdev_io_complete_nvme_status(bdev_io, 7720 base_io->internal.error.nvme.cdw0, 7721 base_io->internal.error.nvme.sct, 7722 base_io->internal.error.nvme.sc); 7723 break; 7724 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7725 spdk_bdev_io_complete_scsi_status(bdev_io, 7726 base_io->internal.error.scsi.sc, 7727 base_io->internal.error.scsi.sk, 7728 base_io->internal.error.scsi.asc, 7729 base_io->internal.error.scsi.ascq); 7730 break; 7731 case SPDK_BDEV_IO_STATUS_AIO_ERROR: 7732 spdk_bdev_io_complete_aio_status(bdev_io, base_io->internal.error.aio_result); 7733 break; 7734 default: 7735 spdk_bdev_io_complete(bdev_io, base_io->internal.status); 7736 break; 7737 } 7738 } 7739 7740 struct spdk_thread * 7741 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7742 { 7743 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7744 } 7745 7746 struct spdk_io_channel * 7747 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7748 { 7749 return bdev_io->internal.ch->channel; 7750 } 7751 7752 static int 7753 bdev_register(struct spdk_bdev *bdev) 7754 { 7755 char *bdev_name; 7756 char uuid[SPDK_UUID_STRING_LEN]; 7757 struct spdk_iobuf_opts iobuf_opts; 7758 int ret; 7759 7760 assert(bdev->module != NULL); 7761 7762 if (!bdev->name) { 7763 SPDK_ERRLOG("Bdev name is NULL\n"); 7764 return -EINVAL; 7765 } 7766 7767 if (!strlen(bdev->name)) { 7768 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7769 return -EINVAL; 7770 } 7771 7772 /* Users often register their own I/O devices using the bdev name. In 7773 * order to avoid conflicts, prepend bdev_. */ 7774 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7775 if (!bdev_name) { 7776 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7777 return -ENOMEM; 7778 } 7779 7780 bdev->internal.stat = bdev_alloc_io_stat(true); 7781 if (!bdev->internal.stat) { 7782 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7783 free(bdev_name); 7784 return -ENOMEM; 7785 } 7786 7787 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7788 bdev->internal.measured_queue_depth = UINT64_MAX; 7789 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7790 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7791 bdev->internal.qd_poller = NULL; 7792 bdev->internal.qos = NULL; 7793 7794 TAILQ_INIT(&bdev->internal.open_descs); 7795 TAILQ_INIT(&bdev->internal.locked_ranges); 7796 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7797 TAILQ_INIT(&bdev->aliases); 7798 7799 /* UUID may be specified by the user or defined by bdev itself. 7800 * Otherwise it will be generated here, so this field will never be empty. 
*/ 7801 if (spdk_uuid_is_null(&bdev->uuid)) { 7802 spdk_uuid_generate(&bdev->uuid); 7803 } 7804 7805 /* Add the UUID alias only if it's different than the name */ 7806 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7807 if (strcmp(bdev->name, uuid) != 0) { 7808 ret = spdk_bdev_alias_add(bdev, uuid); 7809 if (ret != 0) { 7810 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7811 bdev_free_io_stat(bdev->internal.stat); 7812 free(bdev_name); 7813 return ret; 7814 } 7815 } 7816 7817 spdk_iobuf_get_opts(&iobuf_opts, sizeof(iobuf_opts)); 7818 if (spdk_bdev_get_buf_align(bdev) > 1) { 7819 bdev->max_rw_size = spdk_min(bdev->max_rw_size ? bdev->max_rw_size : UINT32_MAX, 7820 iobuf_opts.large_bufsize / bdev->blocklen); 7821 } 7822 7823 /* If the user didn't specify a write unit size, set it to one. */ 7824 if (bdev->write_unit_size == 0) { 7825 bdev->write_unit_size = 1; 7826 } 7827 7828 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7829 if (bdev->acwu == 0) { 7830 bdev->acwu = bdev->write_unit_size; 7831 } 7832 7833 if (bdev->phys_blocklen == 0) { 7834 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7835 } 7836 7837 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 7838 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 7839 } 7840 7841 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 7842 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 7843 } 7844 7845 bdev->internal.reset_in_progress = NULL; 7846 bdev->internal.qd_poll_in_progress = false; 7847 bdev->internal.period = 0; 7848 bdev->internal.new_period = 0; 7849 bdev->internal.trace_id = spdk_trace_register_owner(OWNER_TYPE_BDEV, bdev_name); 7850 7851 /* 7852 * Initialize spinlock before registering IO device because spinlock is used in 7853 * bdev_channel_create 7854 */ 7855 spdk_spin_init(&bdev->internal.spinlock); 7856 7857 spdk_io_device_register(__bdev_to_io_dev(bdev), 7858 bdev_channel_create, bdev_channel_destroy, 7859 sizeof(struct spdk_bdev_channel), 7860 bdev_name); 7861 7862 /* 7863 * Register bdev name only after the bdev object is ready. 7864 * After bdev_name_add returns, it is possible for other threads to start using the bdev, 7865 * create IO channels... 
7866 */ 7867 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 7868 if (ret != 0) { 7869 spdk_io_device_unregister(__bdev_to_io_dev(bdev), NULL); 7870 bdev_free_io_stat(bdev->internal.stat); 7871 spdk_spin_destroy(&bdev->internal.spinlock); 7872 free(bdev_name); 7873 return ret; 7874 } 7875 7876 free(bdev_name); 7877 7878 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 7879 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 7880 7881 return 0; 7882 } 7883 7884 static void 7885 bdev_destroy_cb(void *io_device) 7886 { 7887 int rc; 7888 struct spdk_bdev *bdev; 7889 spdk_bdev_unregister_cb cb_fn; 7890 void *cb_arg; 7891 7892 bdev = __bdev_from_io_dev(io_device); 7893 7894 if (bdev->internal.unregister_td != spdk_get_thread()) { 7895 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 7896 return; 7897 } 7898 7899 cb_fn = bdev->internal.unregister_cb; 7900 cb_arg = bdev->internal.unregister_ctx; 7901 7902 spdk_spin_destroy(&bdev->internal.spinlock); 7903 free(bdev->internal.qos); 7904 bdev_free_io_stat(bdev->internal.stat); 7905 spdk_trace_unregister_owner(bdev->internal.trace_id); 7906 7907 rc = bdev->fn_table->destruct(bdev->ctxt); 7908 if (rc < 0) { 7909 SPDK_ERRLOG("destruct failed\n"); 7910 } 7911 if (rc <= 0 && cb_fn != NULL) { 7912 cb_fn(cb_arg, rc); 7913 } 7914 } 7915 7916 void 7917 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 7918 { 7919 if (bdev->internal.unregister_cb != NULL) { 7920 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 7921 } 7922 } 7923 7924 static void 7925 _remove_notify(void *arg) 7926 { 7927 struct spdk_bdev_desc *desc = arg; 7928 7929 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 7930 } 7931 7932 /* returns: 0 - bdev removed and ready to be destructed. 7933 * -EBUSY - bdev can't be destructed yet. */ 7934 static int 7935 bdev_unregister_unsafe(struct spdk_bdev *bdev) 7936 { 7937 struct spdk_bdev_desc *desc, *tmp; 7938 int rc = 0; 7939 char uuid[SPDK_UUID_STRING_LEN]; 7940 7941 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 7942 assert(spdk_spin_held(&bdev->internal.spinlock)); 7943 7944 /* Notify each descriptor about hotremoval */ 7945 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 7946 rc = -EBUSY; 7947 /* 7948 * Defer invocation of the event_cb to a separate message that will 7949 * run later on its thread. This ensures this context unwinds and 7950 * we don't recursively unregister this bdev again if the event_cb 7951 * immediately closes its descriptor. 7952 */ 7953 event_notify(desc, _remove_notify); 7954 } 7955 7956 /* If there are no descriptors, proceed removing the bdev */ 7957 if (rc == 0) { 7958 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 7959 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 7960 7961 /* Delete the name and the UUID alias */ 7962 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7963 bdev_name_del_unsafe(&bdev->internal.bdev_name); 7964 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 7965 7966 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 7967 7968 if (bdev->internal.reset_in_progress != NULL) { 7969 /* If reset is in progress, let the completion callback for reset 7970 * unregister the bdev. 
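 * bdev_reset_complete() performs the deferred unregister in that case.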
7971 */ 7972 rc = -EBUSY; 7973 } 7974 } 7975 7976 return rc; 7977 } 7978 7979 static void 7980 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7981 struct spdk_io_channel *io_ch, void *_ctx) 7982 { 7983 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 7984 7985 bdev_channel_abort_queued_ios(bdev_ch); 7986 spdk_bdev_for_each_channel_continue(i, 0); 7987 } 7988 7989 static void 7990 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 7991 { 7992 int rc; 7993 7994 spdk_spin_lock(&g_bdev_mgr.spinlock); 7995 spdk_spin_lock(&bdev->internal.spinlock); 7996 /* 7997 * Set the status to REMOVING after completing to abort channels. Otherwise, 7998 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 7999 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 8000 * may fail. 8001 */ 8002 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 8003 rc = bdev_unregister_unsafe(bdev); 8004 spdk_spin_unlock(&bdev->internal.spinlock); 8005 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8006 8007 if (rc == 0) { 8008 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8009 } 8010 } 8011 8012 void 8013 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 8014 { 8015 struct spdk_thread *thread; 8016 8017 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 8018 8019 thread = spdk_get_thread(); 8020 if (!thread) { 8021 /* The user called this from a non-SPDK thread. */ 8022 if (cb_fn != NULL) { 8023 cb_fn(cb_arg, -ENOTSUP); 8024 } 8025 return; 8026 } 8027 8028 spdk_spin_lock(&g_bdev_mgr.spinlock); 8029 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 8030 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8031 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8032 if (cb_fn) { 8033 cb_fn(cb_arg, -EBUSY); 8034 } 8035 return; 8036 } 8037 8038 spdk_spin_lock(&bdev->internal.spinlock); 8039 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 8040 bdev->internal.unregister_cb = cb_fn; 8041 bdev->internal.unregister_ctx = cb_arg; 8042 bdev->internal.unregister_td = thread; 8043 spdk_spin_unlock(&bdev->internal.spinlock); 8044 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8045 8046 spdk_bdev_set_qd_sampling_period(bdev, 0); 8047 8048 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 8049 bdev_unregister); 8050 } 8051 8052 int 8053 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 8054 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 8055 { 8056 struct spdk_bdev_desc *desc; 8057 struct spdk_bdev *bdev; 8058 int rc; 8059 8060 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 8061 if (rc != 0) { 8062 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 8063 return rc; 8064 } 8065 8066 bdev = spdk_bdev_desc_get_bdev(desc); 8067 8068 if (bdev->module != module) { 8069 spdk_bdev_close(desc); 8070 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 8071 bdev_name); 8072 return -ENODEV; 8073 } 8074 8075 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 8076 8077 spdk_bdev_close(desc); 8078 8079 return 0; 8080 } 8081 8082 static int 8083 bdev_start_qos(struct spdk_bdev *bdev) 8084 { 8085 struct set_qos_limit_ctx *ctx; 8086 8087 /* Enable QoS */ 8088 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 8089 ctx = calloc(1, sizeof(*ctx)); 8090 if (ctx == NULL) { 8091 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 8092 return -ENOMEM; 8093 } 8094 
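/* Fan the QoS enablement out to every existing channel of this bdev; the completion callback runs after all channels have been updated. */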
ctx->bdev = bdev; 8095 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 8096 } 8097 8098 return 0; 8099 } 8100 8101 static void 8102 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 8103 struct spdk_bdev *bdev) 8104 { 8105 enum spdk_bdev_claim_type type; 8106 const char *typename, *modname; 8107 extern struct spdk_log_flag SPDK_LOG_bdev; 8108 8109 assert(spdk_spin_held(&bdev->internal.spinlock)); 8110 8111 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 8112 return; 8113 } 8114 8115 type = bdev->internal.claim_type; 8116 typename = spdk_bdev_claim_get_name(type); 8117 8118 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 8119 modname = bdev->internal.claim.v1.module->name; 8120 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 8121 bdev->name, detail, typename, modname); 8122 return; 8123 } 8124 8125 if (claim_type_is_v2(type)) { 8126 struct spdk_bdev_module_claim *claim; 8127 8128 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 8129 modname = claim->module->name; 8130 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 8131 bdev->name, detail, typename, modname); 8132 } 8133 return; 8134 } 8135 8136 assert(false); 8137 } 8138 8139 static int 8140 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 8141 { 8142 struct spdk_thread *thread; 8143 int rc = 0; 8144 8145 thread = spdk_get_thread(); 8146 if (!thread) { 8147 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 8148 return -ENOTSUP; 8149 } 8150 8151 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8152 spdk_get_thread()); 8153 8154 desc->bdev = bdev; 8155 desc->thread = thread; 8156 desc->write = write; 8157 8158 spdk_spin_lock(&bdev->internal.spinlock); 8159 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 8160 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8161 spdk_spin_unlock(&bdev->internal.spinlock); 8162 return -ENODEV; 8163 } 8164 8165 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8166 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8167 spdk_spin_unlock(&bdev->internal.spinlock); 8168 return -EPERM; 8169 } 8170 8171 rc = bdev_start_qos(bdev); 8172 if (rc != 0) { 8173 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 8174 spdk_spin_unlock(&bdev->internal.spinlock); 8175 return rc; 8176 } 8177 8178 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 8179 8180 spdk_spin_unlock(&bdev->internal.spinlock); 8181 8182 return 0; 8183 } 8184 8185 static int 8186 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 8187 struct spdk_bdev_desc **_desc) 8188 { 8189 struct spdk_bdev_desc *desc; 8190 unsigned int i; 8191 8192 desc = calloc(1, sizeof(*desc)); 8193 if (desc == NULL) { 8194 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 8195 return -ENOMEM; 8196 } 8197 8198 TAILQ_INIT(&desc->pending_media_events); 8199 TAILQ_INIT(&desc->free_media_events); 8200 8201 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 8202 desc->callback.event_fn = event_cb; 8203 desc->callback.ctx = event_ctx; 8204 spdk_spin_init(&desc->spinlock); 8205 8206 if (bdev->media_events) { 8207 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 8208 sizeof(*desc->media_events_buffer)); 8209 if (desc->media_events_buffer == NULL) { 8210 SPDK_ERRLOG("Failed to initialize media event pool\n"); 8211 
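/* Release the partially initialized descriptor before reporting the allocation failure. */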
bdev_desc_free(desc); 8212 return -ENOMEM; 8213 } 8214 8215 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 8216 TAILQ_INSERT_TAIL(&desc->free_media_events, 8217 &desc->media_events_buffer[i], tailq); 8218 } 8219 } 8220 8221 if (bdev->fn_table->accel_sequence_supported != NULL) { 8222 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 8223 desc->accel_sequence_supported[i] = 8224 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 8225 (enum spdk_bdev_io_type)i); 8226 } 8227 } 8228 8229 *_desc = desc; 8230 8231 return 0; 8232 } 8233 8234 static int 8235 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8236 void *event_ctx, struct spdk_bdev_desc **_desc) 8237 { 8238 struct spdk_bdev_desc *desc; 8239 struct spdk_bdev *bdev; 8240 int rc; 8241 8242 bdev = bdev_get_by_name(bdev_name); 8243 8244 if (bdev == NULL) { 8245 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 8246 return -ENODEV; 8247 } 8248 8249 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 8250 if (rc != 0) { 8251 return rc; 8252 } 8253 8254 rc = bdev_open(bdev, write, desc); 8255 if (rc != 0) { 8256 bdev_desc_free(desc); 8257 desc = NULL; 8258 } 8259 8260 *_desc = desc; 8261 8262 return rc; 8263 } 8264 8265 int 8266 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8267 void *event_ctx, struct spdk_bdev_desc **_desc) 8268 { 8269 int rc; 8270 8271 if (event_cb == NULL) { 8272 SPDK_ERRLOG("Missing event callback function\n"); 8273 return -EINVAL; 8274 } 8275 8276 spdk_spin_lock(&g_bdev_mgr.spinlock); 8277 rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, _desc); 8278 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8279 8280 return rc; 8281 } 8282 8283 struct spdk_bdev_open_async_ctx { 8284 char *bdev_name; 8285 spdk_bdev_event_cb_t event_cb; 8286 void *event_ctx; 8287 bool write; 8288 int rc; 8289 spdk_bdev_open_async_cb_t cb_fn; 8290 void *cb_arg; 8291 struct spdk_bdev_desc *desc; 8292 struct spdk_bdev_open_async_opts opts; 8293 uint64_t start_ticks; 8294 struct spdk_thread *orig_thread; 8295 struct spdk_poller *poller; 8296 TAILQ_ENTRY(spdk_bdev_open_async_ctx) tailq; 8297 }; 8298 8299 static void 8300 bdev_open_async_done(void *arg) 8301 { 8302 struct spdk_bdev_open_async_ctx *ctx = arg; 8303 8304 ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg); 8305 8306 free(ctx->bdev_name); 8307 free(ctx); 8308 } 8309 8310 static void 8311 bdev_open_async_cancel(void *arg) 8312 { 8313 struct spdk_bdev_open_async_ctx *ctx = arg; 8314 8315 assert(ctx->rc == -ESHUTDOWN); 8316 8317 spdk_poller_unregister(&ctx->poller); 8318 8319 bdev_open_async_done(ctx); 8320 } 8321 8322 /* This is called when the bdev library finishes at shutdown. */ 8323 static void 8324 bdev_open_async_fini(void) 8325 { 8326 struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx; 8327 8328 spdk_spin_lock(&g_bdev_mgr.spinlock); 8329 TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) { 8330 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8331 /* 8332 * We have to move to ctx->orig_thread to unregister ctx->poller. 8333 * However, there is a chance that ctx->poller is executed before 8334 * message is executed, which could result in bdev_open_async_done() 8335 * being called twice. To avoid such race condition, set ctx->rc to 8336 * -ESHUTDOWN. 
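 * _bdev_open_async() checks this sentinel and leaves a canceled context alone.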
8337 */ 8338 ctx->rc = -ESHUTDOWN; 8339 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx); 8340 } 8341 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8342 } 8343 8344 static int bdev_open_async(void *arg); 8345 8346 static void 8347 _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx) 8348 { 8349 uint64_t timeout_ticks; 8350 8351 if (ctx->rc == -ESHUTDOWN) { 8352 /* This context is being canceled. Do nothing. */ 8353 return; 8354 } 8355 8356 ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx, 8357 &ctx->desc); 8358 if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) { 8359 goto exit; 8360 } 8361 8362 timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull; 8363 if (spdk_get_ticks() >= timeout_ticks) { 8364 SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name); 8365 ctx->rc = -ETIMEDOUT; 8366 goto exit; 8367 } 8368 8369 return; 8370 8371 exit: 8372 spdk_poller_unregister(&ctx->poller); 8373 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8374 8375 /* Completion callback is processed after stack unwinding. */ 8376 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx); 8377 } 8378 8379 static int 8380 bdev_open_async(void *arg) 8381 { 8382 struct spdk_bdev_open_async_ctx *ctx = arg; 8383 8384 spdk_spin_lock(&g_bdev_mgr.spinlock); 8385 8386 _bdev_open_async(ctx); 8387 8388 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8389 8390 return SPDK_POLLER_BUSY; 8391 } 8392 8393 static void 8394 bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts, 8395 struct spdk_bdev_open_async_opts *opts_src, 8396 size_t size) 8397 { 8398 assert(opts); 8399 assert(opts_src); 8400 8401 opts->size = size; 8402 8403 #define SET_FIELD(field) \ 8404 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8405 opts->field = opts_src->field; \ 8406 } \ 8407 8408 SET_FIELD(timeout_ms); 8409 8410 /* Do not remove this statement. Always update it when you add a new field, 8411 * and do not forget to add the SET_FIELD statement for your added field.
*/ 8412 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size"); 8413 8414 #undef SET_FIELD 8415 } 8416 8417 static void 8418 bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size) 8419 { 8420 assert(opts); 8421 8422 opts->size = size; 8423 8424 #define SET_FIELD(field, value) \ 8425 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8426 opts->field = value; \ 8427 } \ 8428 8429 SET_FIELD(timeout_ms, 0); 8430 8431 #undef SET_FIELD 8432 } 8433 8434 int 8435 spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8436 void *event_ctx, struct spdk_bdev_open_async_opts *opts, 8437 spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg) 8438 { 8439 struct spdk_bdev_open_async_ctx *ctx; 8440 8441 if (event_cb == NULL) { 8442 SPDK_ERRLOG("Missing event callback function\n"); 8443 return -EINVAL; 8444 } 8445 8446 if (open_cb == NULL) { 8447 SPDK_ERRLOG("Missing open callback function\n"); 8448 return -EINVAL; 8449 } 8450 8451 if (opts != NULL && opts->size == 0) { 8452 SPDK_ERRLOG("size in the options structure should not be zero\n"); 8453 return -EINVAL; 8454 } 8455 8456 ctx = calloc(1, sizeof(*ctx)); 8457 if (ctx == NULL) { 8458 SPDK_ERRLOG("Failed to allocate open context\n"); 8459 return -ENOMEM; 8460 } 8461 8462 ctx->bdev_name = strdup(bdev_name); 8463 if (ctx->bdev_name == NULL) { 8464 SPDK_ERRLOG("Failed to duplicate bdev_name\n"); 8465 free(ctx); 8466 return -ENOMEM; 8467 } 8468 8469 ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000); 8470 if (ctx->poller == NULL) { 8471 SPDK_ERRLOG("Failed to register bdev_open_async poller\n"); 8472 free(ctx->bdev_name); 8473 free(ctx); 8474 return -ENOMEM; 8475 } 8476 8477 ctx->cb_fn = open_cb; 8478 ctx->cb_arg = open_cb_arg; 8479 ctx->write = write; 8480 ctx->event_cb = event_cb; 8481 ctx->event_ctx = event_ctx; 8482 ctx->orig_thread = spdk_get_thread(); 8483 ctx->start_ticks = spdk_get_ticks(); 8484 8485 bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts)); 8486 if (opts != NULL) { 8487 bdev_open_async_opts_copy(&ctx->opts, opts, opts->size); 8488 } 8489 8490 spdk_spin_lock(&g_bdev_mgr.spinlock); 8491 8492 TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8493 _bdev_open_async(ctx); 8494 8495 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8496 8497 return 0; 8498 } 8499 8500 static void 8501 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 8502 { 8503 int rc; 8504 8505 spdk_spin_lock(&bdev->internal.spinlock); 8506 spdk_spin_lock(&desc->spinlock); 8507 8508 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 8509 8510 desc->closed = true; 8511 8512 if (desc->claim != NULL) { 8513 bdev_desc_release_claims(desc); 8514 } 8515 8516 if (0 == desc->refs) { 8517 spdk_spin_unlock(&desc->spinlock); 8518 bdev_desc_free(desc); 8519 } else { 8520 spdk_spin_unlock(&desc->spinlock); 8521 } 8522 8523 /* If no more descriptors, kill QoS channel */ 8524 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8525 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 8526 bdev->name, spdk_get_thread()); 8527 8528 if (bdev_qos_destroy(bdev)) { 8529 /* There isn't anything we can do to recover here. Just let the 8530 * old QoS poller keep running. The QoS handling won't change 8531 * cores when the user allocates a new channel, but it won't break. */ 8532 SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); 8533 } 8534 } 8535 8536 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8537 rc = bdev_unregister_unsafe(bdev); 8538 spdk_spin_unlock(&bdev->internal.spinlock); 8539 8540 if (rc == 0) { 8541 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8542 } 8543 } else { 8544 spdk_spin_unlock(&bdev->internal.spinlock); 8545 } 8546 } 8547 8548 void 8549 spdk_bdev_close(struct spdk_bdev_desc *desc) 8550 { 8551 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8552 8553 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8554 spdk_get_thread()); 8555 8556 assert(desc->thread == spdk_get_thread()); 8557 8558 spdk_poller_unregister(&desc->io_timeout_poller); 8559 8560 spdk_spin_lock(&g_bdev_mgr.spinlock); 8561 8562 bdev_close(bdev, desc); 8563 8564 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8565 } 8566 8567 int32_t 8568 spdk_bdev_get_numa_id(struct spdk_bdev *bdev) 8569 { 8570 if (bdev->numa.id_valid) { 8571 return bdev->numa.id; 8572 } else { 8573 return SPDK_ENV_NUMA_ID_ANY; 8574 } 8575 } 8576 8577 static void 8578 bdev_register_finished(void *arg) 8579 { 8580 struct spdk_bdev_desc *desc = arg; 8581 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8582 8583 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 8584 8585 spdk_spin_lock(&g_bdev_mgr.spinlock); 8586 8587 bdev_close(bdev, desc); 8588 8589 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8590 } 8591 8592 int 8593 spdk_bdev_register(struct spdk_bdev *bdev) 8594 { 8595 struct spdk_bdev_desc *desc; 8596 struct spdk_thread *thread = spdk_get_thread(); 8597 int rc; 8598 8599 if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) { 8600 SPDK_ERRLOG("Cannot register bdev %s on thread %p (%s)\n", bdev->name, thread, 8601 thread ? 
spdk_thread_get_name(thread) : "null"); 8602 return -EINVAL; 8603 } 8604 8605 rc = bdev_register(bdev); 8606 if (rc != 0) { 8607 return rc; 8608 } 8609 8610 /* A descriptor is opened to prevent bdev deletion during examination */ 8611 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8612 if (rc != 0) { 8613 spdk_bdev_unregister(bdev, NULL, NULL); 8614 return rc; 8615 } 8616 8617 rc = bdev_open(bdev, false, desc); 8618 if (rc != 0) { 8619 bdev_desc_free(desc); 8620 spdk_bdev_unregister(bdev, NULL, NULL); 8621 return rc; 8622 } 8623 8624 /* Examine configuration before initializing I/O */ 8625 bdev_examine(bdev); 8626 8627 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 8628 if (rc != 0) { 8629 bdev_close(bdev, desc); 8630 spdk_bdev_unregister(bdev, NULL, NULL); 8631 } 8632 8633 return rc; 8634 } 8635 8636 int 8637 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 8638 struct spdk_bdev_module *module) 8639 { 8640 spdk_spin_lock(&bdev->internal.spinlock); 8641 8642 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8643 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8644 spdk_spin_unlock(&bdev->internal.spinlock); 8645 return -EPERM; 8646 } 8647 8648 if (desc && !desc->write) { 8649 desc->write = true; 8650 } 8651 8652 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 8653 bdev->internal.claim.v1.module = module; 8654 8655 spdk_spin_unlock(&bdev->internal.spinlock); 8656 return 0; 8657 } 8658 8659 void 8660 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 8661 { 8662 spdk_spin_lock(&bdev->internal.spinlock); 8663 8664 assert(bdev->internal.claim.v1.module != NULL); 8665 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 8666 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8667 bdev->internal.claim.v1.module = NULL; 8668 8669 spdk_spin_unlock(&bdev->internal.spinlock); 8670 } 8671 8672 /* 8673 * Start claims v2 8674 */ 8675 8676 const char * 8677 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 8678 { 8679 switch (type) { 8680 case SPDK_BDEV_CLAIM_NONE: 8681 return "not_claimed"; 8682 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8683 return "exclusive_write"; 8684 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8685 return "read_many_write_one"; 8686 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8687 return "read_many_write_none"; 8688 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8689 return "read_many_write_many"; 8690 default: 8691 break; 8692 } 8693 return "invalid_claim"; 8694 } 8695 8696 static bool 8697 claim_type_is_v2(enum spdk_bdev_claim_type type) 8698 { 8699 switch (type) { 8700 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8701 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8702 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8703 return true; 8704 default: 8705 break; 8706 } 8707 return false; 8708 } 8709 8710 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
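 * Write-one and write-shared claims imply that the holder intends to write, so those descriptors are promoted.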
*/ 8711 static bool 8712 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 8713 { 8714 switch (type) { 8715 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8716 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8717 return true; 8718 default: 8719 break; 8720 } 8721 return false; 8722 } 8723 8724 void 8725 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 8726 { 8727 if (opts == NULL) { 8728 SPDK_ERRLOG("opts should not be NULL\n"); 8729 assert(opts != NULL); 8730 return; 8731 } 8732 if (size == 0) { 8733 SPDK_ERRLOG("size should not be zero\n"); 8734 assert(size != 0); 8735 return; 8736 } 8737 8738 memset(opts, 0, size); 8739 opts->opts_size = size; 8740 8741 #define FIELD_OK(field) \ 8742 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 8743 8744 #define SET_FIELD(field, value) \ 8745 if (FIELD_OK(field)) { \ 8746 opts->field = value; \ 8747 } \ 8748 8749 SET_FIELD(shared_claim_key, 0); 8750 8751 #undef FIELD_OK 8752 #undef SET_FIELD 8753 } 8754 8755 static int 8756 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 8757 { 8758 if (src->opts_size == 0) { 8759 SPDK_ERRLOG("size should not be zero\n"); 8760 return -1; 8761 } 8762 8763 memset(dst, 0, sizeof(*dst)); 8764 dst->opts_size = src->opts_size; 8765 8766 #define FIELD_OK(field) \ 8767 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 8768 8769 #define SET_FIELD(field) \ 8770 if (FIELD_OK(field)) { \ 8771 dst->field = src->field; \ 8772 } \ 8773 8774 if (FIELD_OK(name)) { 8775 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 8776 } 8777 8778 SET_FIELD(shared_claim_key); 8779 8780 /* You should not remove this statement, but need to update the assert statement 8781 * if you add a new field, and also add a corresponding SET_FIELD statement */ 8782 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 8783 8784 #undef FIELD_OK 8785 #undef SET_FIELD 8786 return 0; 8787 } 8788 8789 /* Returns 0 if a read-write-once claim can be taken. */ 8790 static int 8791 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8792 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8793 { 8794 struct spdk_bdev *bdev = desc->bdev; 8795 struct spdk_bdev_desc *open_desc; 8796 8797 assert(spdk_spin_held(&bdev->internal.spinlock)); 8798 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 8799 8800 if (opts->shared_claim_key != 0) { 8801 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 8802 bdev->name); 8803 return -EINVAL; 8804 } 8805 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8806 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8807 return -EPERM; 8808 } 8809 if (desc->claim != NULL) { 8810 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 8811 bdev->name, desc->claim->module->name); 8812 return -EPERM; 8813 } 8814 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8815 if (desc != open_desc && open_desc->write) { 8816 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 8817 "another descriptor is open for writing\n", 8818 bdev->name); 8819 return -EPERM; 8820 } 8821 } 8822 8823 return 0; 8824 } 8825 8826 /* Returns 0 if a read-only-many claim can be taken. 
*/ 8827 static int 8828 claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8829 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8830 { 8831 struct spdk_bdev *bdev = desc->bdev; 8832 struct spdk_bdev_desc *open_desc; 8833 8834 assert(spdk_spin_held(&bdev->internal.spinlock)); 8835 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE); 8836 assert(desc->claim == NULL); 8837 8838 if (desc->write) { 8839 SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n", 8840 bdev->name); 8841 return -EINVAL; 8842 } 8843 if (opts->shared_claim_key != 0) { 8844 SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name); 8845 return -EINVAL; 8846 } 8847 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8848 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8849 if (open_desc->write) { 8850 SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while " 8851 "another descriptor is open for writing\n", 8852 bdev->name); 8853 return -EPERM; 8854 } 8855 } 8856 } 8857 8858 return 0; 8859 } 8860 8861 /* Returns 0 if a read-write-many claim can be taken. */ 8862 static int 8863 claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8864 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8865 { 8866 struct spdk_bdev *bdev = desc->bdev; 8867 struct spdk_bdev_desc *open_desc; 8868 8869 assert(spdk_spin_held(&bdev->internal.spinlock)); 8870 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED); 8871 assert(desc->claim == NULL); 8872 8873 if (opts->shared_claim_key == 0) { 8874 SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n", 8875 bdev->name); 8876 return -EINVAL; 8877 } 8878 switch (bdev->internal.claim_type) { 8879 case SPDK_BDEV_CLAIM_NONE: 8880 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8881 if (open_desc == desc) { 8882 continue; 8883 } 8884 if (open_desc->write) { 8885 SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while " 8886 "another descriptor is open for writing without a " 8887 "claim\n", bdev->name); 8888 return -EPERM; 8889 } 8890 } 8891 break; 8892 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8893 if (opts->shared_claim_key != bdev->internal.claim.v2.key) { 8894 LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev); 8895 return -EPERM; 8896 } 8897 break; 8898 default: 8899 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8900 return -EBUSY; 8901 } 8902 8903 return 0; 8904 } 8905 8906 /* Updates desc and its bdev with a v2 claim.
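 * The caller must hold bdev->internal.spinlock and must have already verified that the claim can be granted.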
*/ 8907 static int 8908 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8909 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8910 { 8911 struct spdk_bdev *bdev = desc->bdev; 8912 struct spdk_bdev_module_claim *claim; 8913 8914 assert(spdk_spin_held(&bdev->internal.spinlock)); 8915 assert(claim_type_is_v2(type)); 8916 assert(desc->claim == NULL); 8917 8918 claim = calloc(1, sizeof(*desc->claim)); 8919 if (claim == NULL) { 8920 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 8921 return -ENOMEM; 8922 } 8923 claim->module = module; 8924 claim->desc = desc; 8925 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 8926 memcpy(claim->name, opts->name, sizeof(claim->name)); 8927 desc->claim = claim; 8928 8929 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8930 bdev->internal.claim_type = type; 8931 TAILQ_INIT(&bdev->internal.claim.v2.claims); 8932 bdev->internal.claim.v2.key = opts->shared_claim_key; 8933 } 8934 assert(type == bdev->internal.claim_type); 8935 8936 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 8937 8938 if (!desc->write && claim_type_promotes_to_write(type)) { 8939 desc->write = true; 8940 } 8941 8942 return 0; 8943 } 8944 8945 int 8946 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8947 struct spdk_bdev_claim_opts *_opts, 8948 struct spdk_bdev_module *module) 8949 { 8950 struct spdk_bdev *bdev; 8951 struct spdk_bdev_claim_opts opts; 8952 int rc = 0; 8953 8954 if (desc == NULL) { 8955 SPDK_ERRLOG("descriptor must not be NULL\n"); 8956 return -EINVAL; 8957 } 8958 8959 bdev = desc->bdev; 8960 8961 if (_opts == NULL) { 8962 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 8963 } else if (claim_opts_copy(_opts, &opts) != 0) { 8964 return -EINVAL; 8965 } 8966 8967 spdk_spin_lock(&bdev->internal.spinlock); 8968 8969 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 8970 bdev->internal.claim_type != type) { 8971 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8972 spdk_spin_unlock(&bdev->internal.spinlock); 8973 return -EPERM; 8974 } 8975 8976 if (claim_type_is_v2(type) && desc->claim != NULL) { 8977 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 8978 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 8979 spdk_spin_unlock(&bdev->internal.spinlock); 8980 return -EPERM; 8981 } 8982 8983 switch (type) { 8984 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8985 spdk_spin_unlock(&bdev->internal.spinlock); 8986 return spdk_bdev_module_claim_bdev(bdev, desc, module); 8987 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8988 rc = claim_verify_rwo(desc, type, &opts, module); 8989 break; 8990 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8991 rc = claim_verify_rom(desc, type, &opts, module); 8992 break; 8993 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8994 rc = claim_verify_rwm(desc, type, &opts, module); 8995 break; 8996 default: 8997 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 8998 rc = -ENOTSUP; 8999 } 9000 9001 if (rc == 0) { 9002 rc = claim_bdev(desc, type, &opts, module); 9003 } 9004 9005 spdk_spin_unlock(&bdev->internal.spinlock); 9006 return rc; 9007 } 9008 9009 static void 9010 claim_reset(struct spdk_bdev *bdev) 9011 { 9012 assert(spdk_spin_held(&bdev->internal.spinlock)); 9013 assert(claim_type_is_v2(bdev->internal.claim_type)); 9014 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 9015 9016 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 9017 
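/* No v2 claims remain, so return the bdev to the unclaimed state. */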
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 9018 } 9019 9020 static void 9021 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 9022 { 9023 struct spdk_bdev *bdev = desc->bdev; 9024 9025 assert(spdk_spin_held(&bdev->internal.spinlock)); 9026 assert(claim_type_is_v2(bdev->internal.claim_type)); 9027 9028 if (bdev->internal.examine_in_progress == 0) { 9029 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 9030 free(desc->claim); 9031 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 9032 claim_reset(bdev); 9033 } 9034 } else { 9035 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 9036 desc->claim->module = NULL; 9037 desc->claim->desc = NULL; 9038 } 9039 desc->claim = NULL; 9040 } 9041 9042 /* 9043 * End claims v2 9044 */ 9045 9046 struct spdk_bdev * 9047 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 9048 { 9049 assert(desc != NULL); 9050 return desc->bdev; 9051 } 9052 9053 int 9054 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 9055 { 9056 struct spdk_bdev *bdev, *tmp; 9057 struct spdk_bdev_desc *desc; 9058 int rc = 0; 9059 9060 assert(fn != NULL); 9061 9062 spdk_spin_lock(&g_bdev_mgr.spinlock); 9063 bdev = spdk_bdev_first(); 9064 while (bdev != NULL) { 9065 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 9066 if (rc != 0) { 9067 break; 9068 } 9069 rc = bdev_open(bdev, false, desc); 9070 if (rc != 0) { 9071 bdev_desc_free(desc); 9072 if (rc == -ENODEV) { 9073 /* Ignore the error and move to the next bdev. */ 9074 rc = 0; 9075 bdev = spdk_bdev_next(bdev); 9076 continue; 9077 } 9078 break; 9079 } 9080 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9081 9082 rc = fn(ctx, bdev); 9083 9084 spdk_spin_lock(&g_bdev_mgr.spinlock); 9085 tmp = spdk_bdev_next(bdev); 9086 bdev_close(bdev, desc); 9087 if (rc != 0) { 9088 break; 9089 } 9090 bdev = tmp; 9091 } 9092 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9093 9094 return rc; 9095 } 9096 9097 int 9098 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 9099 { 9100 struct spdk_bdev *bdev, *tmp; 9101 struct spdk_bdev_desc *desc; 9102 int rc = 0; 9103 9104 assert(fn != NULL); 9105 9106 spdk_spin_lock(&g_bdev_mgr.spinlock); 9107 bdev = spdk_bdev_first_leaf(); 9108 while (bdev != NULL) { 9109 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 9110 if (rc != 0) { 9111 break; 9112 } 9113 rc = bdev_open(bdev, false, desc); 9114 if (rc != 0) { 9115 bdev_desc_free(desc); 9116 if (rc == -ENODEV) { 9117 /* Ignore the error and move to the next bdev. 
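 * Here -ENODEV means the bdev is already being unregistered, so skip it.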
*/ 9118 rc = 0; 9119 bdev = spdk_bdev_next_leaf(bdev); 9120 continue; 9121 } 9122 break; 9123 } 9124 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9125 9126 rc = fn(ctx, bdev); 9127 9128 spdk_spin_lock(&g_bdev_mgr.spinlock); 9129 tmp = spdk_bdev_next_leaf(bdev); 9130 bdev_close(bdev, desc); 9131 if (rc != 0) { 9132 break; 9133 } 9134 bdev = tmp; 9135 } 9136 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9137 9138 return rc; 9139 } 9140 9141 void 9142 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 9143 { 9144 struct iovec *iovs; 9145 int iovcnt; 9146 9147 if (bdev_io == NULL) { 9148 return; 9149 } 9150 9151 switch (bdev_io->type) { 9152 case SPDK_BDEV_IO_TYPE_READ: 9153 case SPDK_BDEV_IO_TYPE_WRITE: 9154 case SPDK_BDEV_IO_TYPE_ZCOPY: 9155 iovs = bdev_io->u.bdev.iovs; 9156 iovcnt = bdev_io->u.bdev.iovcnt; 9157 break; 9158 default: 9159 iovs = NULL; 9160 iovcnt = 0; 9161 break; 9162 } 9163 9164 if (iovp) { 9165 *iovp = iovs; 9166 } 9167 if (iovcntp) { 9168 *iovcntp = iovcnt; 9169 } 9170 } 9171 9172 void * 9173 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 9174 { 9175 if (bdev_io == NULL) { 9176 return NULL; 9177 } 9178 9179 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 9180 return NULL; 9181 } 9182 9183 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 9184 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 9185 return bdev_io->u.bdev.md_buf; 9186 } 9187 9188 return NULL; 9189 } 9190 9191 void * 9192 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 9193 { 9194 if (bdev_io == NULL) { 9195 assert(false); 9196 return NULL; 9197 } 9198 9199 return bdev_io->internal.caller_ctx; 9200 } 9201 9202 void 9203 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 9204 { 9205 9206 if (spdk_bdev_module_list_find(bdev_module->name)) { 9207 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 9208 assert(false); 9209 } 9210 9211 spdk_spin_init(&bdev_module->internal.spinlock); 9212 TAILQ_INIT(&bdev_module->internal.quiesced_ranges); 9213 9214 /* 9215 * Modules with examine callbacks must be initialized first, so they are 9216 * ready to handle examine callbacks from later modules that will 9217 * register physical bdevs. 
9218 */ 9219 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 9220 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9221 } else { 9222 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9223 } 9224 } 9225 9226 struct spdk_bdev_module * 9227 spdk_bdev_module_list_find(const char *name) 9228 { 9229 struct spdk_bdev_module *bdev_module; 9230 9231 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 9232 if (strcmp(name, bdev_module->name) == 0) { 9233 break; 9234 } 9235 } 9236 9237 return bdev_module; 9238 } 9239 9240 static int 9241 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 9242 { 9243 uint64_t num_blocks; 9244 void *md_buf = NULL; 9245 9246 num_blocks = bdev_io->u.bdev.num_blocks; 9247 9248 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 9249 md_buf = (char *)g_bdev_mgr.zero_buffer + 9250 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 9251 } 9252 9253 return bdev_write_blocks_with_md(bdev_io->internal.desc, 9254 spdk_io_channel_from_ctx(bdev_io->internal.ch), 9255 g_bdev_mgr.zero_buffer, md_buf, 9256 bdev_io->u.bdev.offset_blocks, num_blocks, 9257 bdev_write_zero_buffer_done, bdev_io); 9258 } 9259 9260 static void 9261 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9262 { 9263 struct spdk_bdev_io *parent_io = cb_arg; 9264 9265 spdk_bdev_free_io(bdev_io); 9266 9267 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 9268 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 9269 } 9270 9271 static void 9272 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 9273 { 9274 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9275 ctx->bdev->internal.qos_mod_in_progress = false; 9276 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9277 9278 if (ctx->cb_fn) { 9279 ctx->cb_fn(ctx->cb_arg, status); 9280 } 9281 free(ctx); 9282 } 9283 9284 static void 9285 bdev_disable_qos_done(void *cb_arg) 9286 { 9287 struct set_qos_limit_ctx *ctx = cb_arg; 9288 struct spdk_bdev *bdev = ctx->bdev; 9289 struct spdk_bdev_qos *qos; 9290 9291 spdk_spin_lock(&bdev->internal.spinlock); 9292 qos = bdev->internal.qos; 9293 bdev->internal.qos = NULL; 9294 spdk_spin_unlock(&bdev->internal.spinlock); 9295 9296 if (qos->thread != NULL) { 9297 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 9298 spdk_poller_unregister(&qos->poller); 9299 } 9300 9301 free(qos); 9302 9303 bdev_set_qos_limit_done(ctx, 0); 9304 } 9305 9306 static void 9307 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 9308 { 9309 struct set_qos_limit_ctx *ctx = _ctx; 9310 struct spdk_thread *thread; 9311 9312 spdk_spin_lock(&bdev->internal.spinlock); 9313 thread = bdev->internal.qos->thread; 9314 spdk_spin_unlock(&bdev->internal.spinlock); 9315 9316 if (thread != NULL) { 9317 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 9318 } else { 9319 bdev_disable_qos_done(ctx); 9320 } 9321 } 9322 9323 static void 9324 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9325 struct spdk_io_channel *ch, void *_ctx) 9326 { 9327 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9328 struct spdk_bdev_io *bdev_io; 9329 9330 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 9331 9332 while (!TAILQ_EMPTY(&bdev_ch->qos_queued_io)) { 9333 /* Re-submit the queued I/O. 
*/ 9334 bdev_io = TAILQ_FIRST(&bdev_ch->qos_queued_io); 9335 TAILQ_REMOVE(&bdev_ch->qos_queued_io, bdev_io, internal.link); 9336 _bdev_io_submit(bdev_io); 9337 } 9338 9339 spdk_bdev_for_each_channel_continue(i, 0); 9340 } 9341 9342 static void 9343 bdev_update_qos_rate_limit_msg(void *cb_arg) 9344 { 9345 struct set_qos_limit_ctx *ctx = cb_arg; 9346 struct spdk_bdev *bdev = ctx->bdev; 9347 9348 spdk_spin_lock(&bdev->internal.spinlock); 9349 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 9350 spdk_spin_unlock(&bdev->internal.spinlock); 9351 9352 bdev_set_qos_limit_done(ctx, 0); 9353 } 9354 9355 static void 9356 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9357 struct spdk_io_channel *ch, void *_ctx) 9358 { 9359 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9360 9361 spdk_spin_lock(&bdev->internal.spinlock); 9362 bdev_enable_qos(bdev, bdev_ch); 9363 spdk_spin_unlock(&bdev->internal.spinlock); 9364 spdk_bdev_for_each_channel_continue(i, 0); 9365 } 9366 9367 static void 9368 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 9369 { 9370 struct set_qos_limit_ctx *ctx = _ctx; 9371 9372 bdev_set_qos_limit_done(ctx, status); 9373 } 9374 9375 static void 9376 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 9377 { 9378 int i; 9379 9380 assert(bdev->internal.qos != NULL); 9381 9382 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9383 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9384 bdev->internal.qos->rate_limits[i].limit = limits[i]; 9385 9386 if (limits[i] == 0) { 9387 bdev->internal.qos->rate_limits[i].limit = 9388 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 9389 } 9390 } 9391 } 9392 } 9393 9394 void 9395 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 9396 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 9397 { 9398 struct set_qos_limit_ctx *ctx; 9399 uint32_t limit_set_complement; 9400 uint64_t min_limit_per_sec; 9401 int i; 9402 bool disable_rate_limit = true; 9403 9404 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9405 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9406 continue; 9407 } 9408 9409 if (limits[i] > 0) { 9410 disable_rate_limit = false; 9411 } 9412 9413 if (bdev_qos_is_iops_rate_limit(i) == true) { 9414 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 9415 } else { 9416 if (limits[i] > SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC) { 9417 SPDK_WARNLOG("Requested rate limit %" PRIu64 " will result in uint64_t overflow, " 9418 "reset to %" PRIu64 "\n", limits[i], SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC); 9419 limits[i] = SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC; 9420 } 9421 /* Change from megabyte to byte rate limit */ 9422 limits[i] = limits[i] * 1024 * 1024; 9423 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 9424 } 9425 9426 limit_set_complement = limits[i] % min_limit_per_sec; 9427 if (limit_set_complement) { 9428 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 9429 limits[i], min_limit_per_sec); 9430 limits[i] += min_limit_per_sec - limit_set_complement; 9431 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 9432 } 9433 } 9434 9435 ctx = calloc(1, sizeof(*ctx)); 9436 if (ctx == NULL) { 9437 cb_fn(cb_arg, -ENOMEM); 9438 return; 9439 } 9440 9441 ctx->cb_fn = cb_fn; 9442 ctx->cb_arg = cb_arg; 9443 ctx->bdev = bdev; 9444 9445 spdk_spin_lock(&bdev->internal.spinlock); 9446 if (bdev->internal.qos_mod_in_progress) { 9447 spdk_spin_unlock(&bdev->internal.spinlock); 9448 free(ctx); 9449 cb_fn(cb_arg, 
-EAGAIN); 9450 return; 9451 } 9452 bdev->internal.qos_mod_in_progress = true; 9453 9454 if (disable_rate_limit == true && bdev->internal.qos) { 9455 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9456 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 9457 (bdev->internal.qos->rate_limits[i].limit > 0 && 9458 bdev->internal.qos->rate_limits[i].limit != 9459 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 9460 disable_rate_limit = false; 9461 break; 9462 } 9463 } 9464 } 9465 9466 if (disable_rate_limit == false) { 9467 if (bdev->internal.qos == NULL) { 9468 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 9469 if (!bdev->internal.qos) { 9470 spdk_spin_unlock(&bdev->internal.spinlock); 9471 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 9472 bdev_set_qos_limit_done(ctx, -ENOMEM); 9473 return; 9474 } 9475 } 9476 9477 if (bdev->internal.qos->thread == NULL) { 9478 /* Enabling */ 9479 bdev_set_qos_rate_limits(bdev, limits); 9480 9481 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 9482 bdev_enable_qos_done); 9483 } else { 9484 /* Updating */ 9485 bdev_set_qos_rate_limits(bdev, limits); 9486 9487 spdk_thread_send_msg(bdev->internal.qos->thread, 9488 bdev_update_qos_rate_limit_msg, ctx); 9489 } 9490 } else { 9491 if (bdev->internal.qos != NULL) { 9492 bdev_set_qos_rate_limits(bdev, limits); 9493 9494 /* Disabling */ 9495 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 9496 bdev_disable_qos_msg_done); 9497 } else { 9498 spdk_spin_unlock(&bdev->internal.spinlock); 9499 bdev_set_qos_limit_done(ctx, 0); 9500 return; 9501 } 9502 } 9503 9504 spdk_spin_unlock(&bdev->internal.spinlock); 9505 } 9506 9507 struct spdk_bdev_histogram_ctx { 9508 spdk_bdev_histogram_status_cb cb_fn; 9509 void *cb_arg; 9510 struct spdk_bdev *bdev; 9511 int status; 9512 }; 9513 9514 static void 9515 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9516 { 9517 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9518 9519 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9520 ctx->bdev->internal.histogram_in_progress = false; 9521 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9522 ctx->cb_fn(ctx->cb_arg, ctx->status); 9523 free(ctx); 9524 } 9525 9526 static void 9527 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9528 struct spdk_io_channel *_ch, void *_ctx) 9529 { 9530 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9531 9532 if (ch->histogram != NULL) { 9533 spdk_histogram_data_free(ch->histogram); 9534 ch->histogram = NULL; 9535 } 9536 spdk_bdev_for_each_channel_continue(i, 0); 9537 } 9538 9539 static void 9540 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9541 { 9542 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9543 9544 if (status != 0) { 9545 ctx->status = status; 9546 ctx->bdev->internal.histogram_enabled = false; 9547 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 9548 bdev_histogram_disable_channel_cb); 9549 } else { 9550 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9551 ctx->bdev->internal.histogram_in_progress = false; 9552 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9553 ctx->cb_fn(ctx->cb_arg, ctx->status); 9554 free(ctx); 9555 } 9556 } 9557 9558 static void 9559 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9560 struct spdk_io_channel *_ch, void *_ctx) 9561 { 9562 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9563 int status = 0; 9564 9565 if (ch->histogram == NULL) { 9566 
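/* This channel does not have per-channel histogram data yet; allocate it now. */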
ch->histogram = spdk_histogram_data_alloc(); 9567 if (ch->histogram == NULL) { 9568 status = -ENOMEM; 9569 } 9570 } 9571 9572 spdk_bdev_for_each_channel_continue(i, status); 9573 } 9574 9575 void 9576 spdk_bdev_histogram_enable_ext(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9577 void *cb_arg, bool enable, struct spdk_bdev_enable_histogram_opts *opts) 9578 { 9579 struct spdk_bdev_histogram_ctx *ctx; 9580 9581 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 9582 if (ctx == NULL) { 9583 cb_fn(cb_arg, -ENOMEM); 9584 return; 9585 } 9586 9587 ctx->bdev = bdev; 9588 ctx->status = 0; 9589 ctx->cb_fn = cb_fn; 9590 ctx->cb_arg = cb_arg; 9591 9592 spdk_spin_lock(&bdev->internal.spinlock); 9593 if (bdev->internal.histogram_in_progress) { 9594 spdk_spin_unlock(&bdev->internal.spinlock); 9595 free(ctx); 9596 cb_fn(cb_arg, -EAGAIN); 9597 return; 9598 } 9599 9600 bdev->internal.histogram_in_progress = true; 9601 spdk_spin_unlock(&bdev->internal.spinlock); 9602 9603 bdev->internal.histogram_enabled = enable; 9604 bdev->internal.histogram_io_type = opts->io_type; 9605 9606 if (enable) { 9607 /* Allocate histogram for each channel */ 9608 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 9609 bdev_histogram_enable_channel_cb); 9610 } else { 9611 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 9612 bdev_histogram_disable_channel_cb); 9613 } 9614 } 9615 9616 void 9617 spdk_bdev_enable_histogram_opts_init(struct spdk_bdev_enable_histogram_opts *opts, size_t size) 9618 { 9619 if (opts == NULL) { 9620 SPDK_ERRLOG("opts should not be NULL\n"); 9621 assert(opts != NULL); 9622 return; 9623 } 9624 if (size == 0) { 9625 SPDK_ERRLOG("size should not be zero\n"); 9626 assert(size != 0); 9627 return; 9628 } 9629 9630 memset(opts, 0, size); 9631 opts->size = size; 9632 9633 #define FIELD_OK(field) \ 9634 offsetof(struct spdk_bdev_enable_histogram_opts, field) + sizeof(opts->field) <= size 9635 9636 #define SET_FIELD(field, value) \ 9637 if (FIELD_OK(field)) { \ 9638 opts->field = value; \ 9639 } \ 9640 9641 SET_FIELD(io_type, 0); 9642 9643 /* You should not remove this statement, but need to update the assert statement 9644 * if you add a new field, and also add a corresponding SET_FIELD statement */ 9645 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_enable_histogram_opts) == 9, "Incorrect size"); 9646 9647 #undef FIELD_OK 9648 #undef SET_FIELD 9649 } 9650 9651 void 9652 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9653 void *cb_arg, bool enable) 9654 { 9655 struct spdk_bdev_enable_histogram_opts opts; 9656 9657 spdk_bdev_enable_histogram_opts_init(&opts, sizeof(opts)); 9658 spdk_bdev_histogram_enable_ext(bdev, cb_fn, cb_arg, enable, &opts); 9659 } 9660 9661 struct spdk_bdev_histogram_data_ctx { 9662 spdk_bdev_histogram_data_cb cb_fn; 9663 void *cb_arg; 9664 struct spdk_bdev *bdev; 9665 /** merged histogram data from all channels */ 9666 struct spdk_histogram_data *histogram; 9667 }; 9668 9669 static void 9670 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9671 { 9672 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9673 9674 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 9675 free(ctx); 9676 } 9677 9678 static void 9679 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9680 struct spdk_io_channel *_ch, void *_ctx) 9681 { 9682 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9683 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9684 int 
status = 0; 9685 9686 if (ch->histogram == NULL) { 9687 status = -EFAULT; 9688 } else { 9689 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 9690 } 9691 9692 spdk_bdev_for_each_channel_continue(i, status); 9693 } 9694 9695 void 9696 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 9697 spdk_bdev_histogram_data_cb cb_fn, 9698 void *cb_arg) 9699 { 9700 struct spdk_bdev_histogram_data_ctx *ctx; 9701 9702 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 9703 if (ctx == NULL) { 9704 cb_fn(cb_arg, -ENOMEM, NULL); 9705 return; 9706 } 9707 9708 ctx->bdev = bdev; 9709 ctx->cb_fn = cb_fn; 9710 ctx->cb_arg = cb_arg; 9711 9712 ctx->histogram = histogram; 9713 9714 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 9715 bdev_histogram_get_channel_cb); 9716 } 9717 9718 void 9719 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 9720 void *cb_arg) 9721 { 9722 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9723 int status = 0; 9724 9725 assert(cb_fn != NULL); 9726 9727 if (bdev_ch->histogram == NULL) { 9728 status = -EFAULT; 9729 } 9730 cb_fn(cb_arg, status, bdev_ch->histogram); 9731 } 9732 9733 size_t 9734 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 9735 size_t max_events) 9736 { 9737 struct media_event_entry *entry; 9738 size_t num_events = 0; 9739 9740 for (; num_events < max_events; ++num_events) { 9741 entry = TAILQ_FIRST(&desc->pending_media_events); 9742 if (entry == NULL) { 9743 break; 9744 } 9745 9746 events[num_events] = entry->event; 9747 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 9748 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 9749 } 9750 9751 return num_events; 9752 } 9753 9754 int 9755 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 9756 size_t num_events) 9757 { 9758 struct spdk_bdev_desc *desc; 9759 struct media_event_entry *entry; 9760 size_t event_id; 9761 int rc = 0; 9762 9763 assert(bdev->media_events); 9764 9765 spdk_spin_lock(&bdev->internal.spinlock); 9766 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9767 if (desc->write) { 9768 break; 9769 } 9770 } 9771 9772 if (desc == NULL || desc->media_events_buffer == NULL) { 9773 rc = -ENODEV; 9774 goto out; 9775 } 9776 9777 for (event_id = 0; event_id < num_events; ++event_id) { 9778 entry = TAILQ_FIRST(&desc->free_media_events); 9779 if (entry == NULL) { 9780 break; 9781 } 9782 9783 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 9784 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 9785 entry->event = events[event_id]; 9786 } 9787 9788 rc = event_id; 9789 out: 9790 spdk_spin_unlock(&bdev->internal.spinlock); 9791 return rc; 9792 } 9793 9794 static void 9795 _media_management_notify(void *arg) 9796 { 9797 struct spdk_bdev_desc *desc = arg; 9798 9799 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 9800 } 9801 9802 void 9803 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 9804 { 9805 struct spdk_bdev_desc *desc; 9806 9807 spdk_spin_lock(&bdev->internal.spinlock); 9808 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9809 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 9810 event_notify(desc, _media_management_notify); 9811 } 9812 } 9813 spdk_spin_unlock(&bdev->internal.spinlock); 9814 } 9815 9816 struct locked_lba_range_ctx { 9817 struct lba_range range; 9818 struct lba_range *current_range; 9819 struct lba_range *owner_range; 9820 
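/* Poller used by bdev_lock_lba_range_check_io() to periodically re-check a channel
 * for outstanding I/O that overlaps the range being locked.
 */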
struct spdk_poller *poller; 9821 lock_range_cb cb_fn; 9822 void *cb_arg; 9823 }; 9824 9825 static void 9826 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9827 { 9828 struct locked_lba_range_ctx *ctx = _ctx; 9829 9830 ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM); 9831 free(ctx); 9832 } 9833 9834 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 9835 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 9836 9837 static void 9838 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9839 { 9840 struct locked_lba_range_ctx *ctx = _ctx; 9841 9842 if (status == -ENOMEM) { 9843 /* One of the channels could not allocate a range object. 9844 * So we have to go back and clean up any ranges that were 9845 * allocated successfully before we return error status to 9846 * the caller. We can reuse the unlock function to do that 9847 * clean up. 9848 */ 9849 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9850 bdev_lock_error_cleanup_cb); 9851 return; 9852 } 9853 9854 /* All channels have locked this range and no I/O overlapping the range 9855 * are outstanding! Set the owner_ch for the range object for the 9856 * locking channel, so that this channel will know that it is allowed 9857 * to write to this range. 9858 */ 9859 if (ctx->owner_range != NULL) { 9860 ctx->owner_range->owner_ch = ctx->range.owner_ch; 9861 } 9862 9863 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 9864 9865 /* Don't free the ctx here. Its range is in the bdev's global list of 9866 * locked ranges still, and will be removed and freed when this range 9867 * is later unlocked. 9868 */ 9869 } 9870 9871 static int 9872 bdev_lock_lba_range_check_io(void *_i) 9873 { 9874 struct spdk_bdev_channel_iter *i = _i; 9875 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 9876 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9877 struct locked_lba_range_ctx *ctx = i->ctx; 9878 struct lba_range *range = ctx->current_range; 9879 struct spdk_bdev_io *bdev_io; 9880 9881 spdk_poller_unregister(&ctx->poller); 9882 9883 /* The range is now in the locked_ranges, so no new IO can be submitted to this 9884 * range. But we need to wait until any outstanding IO overlapping with this range 9885 * are completed. 9886 */ 9887 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 9888 if (bdev_io_range_is_locked(bdev_io, range)) { 9889 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 9890 return SPDK_POLLER_BUSY; 9891 } 9892 } 9893 9894 spdk_bdev_for_each_channel_continue(i, 0); 9895 return SPDK_POLLER_BUSY; 9896 } 9897 9898 static void 9899 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9900 struct spdk_io_channel *_ch, void *_ctx) 9901 { 9902 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9903 struct locked_lba_range_ctx *ctx = _ctx; 9904 struct lba_range *range; 9905 9906 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9907 if (range->length == ctx->range.length && 9908 range->offset == ctx->range.offset && 9909 range->locked_ctx == ctx->range.locked_ctx) { 9910 /* This range already exists on this channel, so don't add 9911 * it again. This can happen when a new channel is created 9912 * while the for_each_channel operation is in progress. 9913 * Do not check for outstanding I/O in that case, since the 9914 * range was locked before any I/O could be submitted to the 9915 * new channel. 
9916 */ 9917 spdk_bdev_for_each_channel_continue(i, 0); 9918 return; 9919 } 9920 } 9921 9922 range = calloc(1, sizeof(*range)); 9923 if (range == NULL) { 9924 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 9925 return; 9926 } 9927 9928 range->length = ctx->range.length; 9929 range->offset = ctx->range.offset; 9930 range->locked_ctx = ctx->range.locked_ctx; 9931 range->quiesce = ctx->range.quiesce; 9932 ctx->current_range = range; 9933 if (ctx->range.owner_ch == ch) { 9934 /* This is the range object for the channel that will hold 9935 * the lock. Store it in the ctx object so that we can easily 9936 * set its owner_ch after the lock is finally acquired. 9937 */ 9938 ctx->owner_range = range; 9939 } 9940 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 9941 bdev_lock_lba_range_check_io(i); 9942 } 9943 9944 static void 9945 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 9946 { 9947 assert(spdk_get_thread() == ctx->range.owner_thread); 9948 assert(ctx->range.owner_ch == NULL || 9949 spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread); 9950 9951 /* We will add a copy of this range to each channel now. */ 9952 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 9953 bdev_lock_lba_range_cb); 9954 } 9955 9956 static bool 9957 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 9958 { 9959 struct lba_range *r; 9960 9961 TAILQ_FOREACH(r, tailq, tailq) { 9962 if (bdev_lba_range_overlapped(range, r)) { 9963 return true; 9964 } 9965 } 9966 return false; 9967 } 9968 9969 static void bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status); 9970 9971 static int 9972 _bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch, 9973 uint64_t offset, uint64_t length, 9974 lock_range_cb cb_fn, void *cb_arg) 9975 { 9976 struct locked_lba_range_ctx *ctx; 9977 9978 ctx = calloc(1, sizeof(*ctx)); 9979 if (ctx == NULL) { 9980 return -ENOMEM; 9981 } 9982 9983 ctx->range.offset = offset; 9984 ctx->range.length = length; 9985 ctx->range.owner_thread = spdk_get_thread(); 9986 ctx->range.owner_ch = ch; 9987 ctx->range.locked_ctx = cb_arg; 9988 ctx->range.bdev = bdev; 9989 ctx->range.quiesce = (cb_fn == bdev_quiesce_range_locked); 9990 ctx->cb_fn = cb_fn; 9991 ctx->cb_arg = cb_arg; 9992 9993 spdk_spin_lock(&bdev->internal.spinlock); 9994 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 9995 /* There is an active lock overlapping with this range. 9996 * Put it on the pending list until this range no 9997 * longer overlaps with another. 
9998 */ 9999 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 10000 } else { 10001 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 10002 bdev_lock_lba_range_ctx(bdev, ctx); 10003 } 10004 spdk_spin_unlock(&bdev->internal.spinlock); 10005 return 0; 10006 } 10007 10008 static int 10009 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 10010 uint64_t offset, uint64_t length, 10011 lock_range_cb cb_fn, void *cb_arg) 10012 { 10013 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10014 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10015 10016 if (cb_arg == NULL) { 10017 SPDK_ERRLOG("cb_arg must not be NULL\n"); 10018 return -EINVAL; 10019 } 10020 10021 return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg); 10022 } 10023 10024 static void 10025 bdev_lock_lba_range_ctx_msg(void *_ctx) 10026 { 10027 struct locked_lba_range_ctx *ctx = _ctx; 10028 10029 bdev_lock_lba_range_ctx(ctx->range.bdev, ctx); 10030 } 10031 10032 static void 10033 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 10034 { 10035 struct locked_lba_range_ctx *ctx = _ctx; 10036 struct locked_lba_range_ctx *pending_ctx; 10037 struct lba_range *range, *tmp; 10038 10039 spdk_spin_lock(&bdev->internal.spinlock); 10040 /* Check if there are any pending locked ranges that overlap with this range 10041 * that was just unlocked. If there are, check that it doesn't overlap with any 10042 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 10043 * the lock process. 10044 */ 10045 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 10046 if (bdev_lba_range_overlapped(range, &ctx->range) && 10047 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 10048 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 10049 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 10050 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 10051 spdk_thread_send_msg(pending_ctx->range.owner_thread, 10052 bdev_lock_lba_range_ctx_msg, pending_ctx); 10053 } 10054 } 10055 spdk_spin_unlock(&bdev->internal.spinlock); 10056 10057 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 10058 free(ctx); 10059 } 10060 10061 static void 10062 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10063 struct spdk_io_channel *_ch, void *_ctx) 10064 { 10065 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10066 struct locked_lba_range_ctx *ctx = _ctx; 10067 TAILQ_HEAD(, spdk_bdev_io) io_locked; 10068 struct spdk_bdev_io *bdev_io; 10069 struct lba_range *range; 10070 10071 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10072 if (ctx->range.offset == range->offset && 10073 ctx->range.length == range->length && 10074 ctx->range.locked_ctx == range->locked_ctx) { 10075 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 10076 free(range); 10077 break; 10078 } 10079 } 10080 10081 /* Note: we should almost always be able to assert that the range specified 10082 * was found. But there are some very rare corner cases where a new channel 10083 * gets created simultaneously with a range unlock, where this function 10084 * would execute on that new channel and wouldn't have the range. 10085 * We also use this to clean up range allocations when a later allocation 10086 * fails in the locking path. 10087 * So we can't actually assert() here. 
10088 */ 10089 10090 /* Swap the locked IO into a temporary list, and then try to submit them again. 10091 * We could hyper-optimize this to only resubmit locked I/O that overlap 10092 * with the range that was just unlocked, but this isn't a performance path so 10093 * we go for simplicity here. 10094 */ 10095 TAILQ_INIT(&io_locked); 10096 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 10097 while (!TAILQ_EMPTY(&io_locked)) { 10098 bdev_io = TAILQ_FIRST(&io_locked); 10099 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 10100 bdev_io_submit(bdev_io); 10101 } 10102 10103 spdk_bdev_for_each_channel_continue(i, 0); 10104 } 10105 10106 static int 10107 _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length, 10108 lock_range_cb cb_fn, void *cb_arg) 10109 { 10110 struct locked_lba_range_ctx *ctx; 10111 struct lba_range *range; 10112 10113 spdk_spin_lock(&bdev->internal.spinlock); 10114 /* To start the unlock process, we find the range in the bdev's locked_ranges 10115 * and remove it. This ensures new channels don't inherit the locked range. 10116 * Then we will send a message to each channel to remove the range from its 10117 * per-channel list. 10118 */ 10119 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 10120 if (range->offset == offset && range->length == length && 10121 (range->owner_ch == NULL || range->locked_ctx == cb_arg)) { 10122 break; 10123 } 10124 } 10125 if (range == NULL) { 10126 assert(false); 10127 spdk_spin_unlock(&bdev->internal.spinlock); 10128 return -EINVAL; 10129 } 10130 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 10131 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 10132 spdk_spin_unlock(&bdev->internal.spinlock); 10133 10134 ctx->cb_fn = cb_fn; 10135 ctx->cb_arg = cb_arg; 10136 10137 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 10138 bdev_unlock_lba_range_cb); 10139 return 0; 10140 } 10141 10142 static int 10143 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 10144 uint64_t offset, uint64_t length, 10145 lock_range_cb cb_fn, void *cb_arg) 10146 { 10147 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10148 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10149 struct lba_range *range; 10150 bool range_found = false; 10151 10152 /* Let's make sure the specified channel actually has a lock on 10153 * the specified range. Note that the range must match exactly.
10154 */ 10155 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10156 if (range->offset == offset && range->length == length && 10157 range->owner_ch == ch && range->locked_ctx == cb_arg) { 10158 range_found = true; 10159 break; 10160 } 10161 } 10162 10163 if (!range_found) { 10164 return -EINVAL; 10165 } 10166 10167 return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg); 10168 } 10169 10170 struct bdev_quiesce_ctx { 10171 spdk_bdev_quiesce_cb cb_fn; 10172 void *cb_arg; 10173 }; 10174 10175 static void 10176 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status) 10177 { 10178 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 10179 10180 if (quiesce_ctx->cb_fn != NULL) { 10181 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 10182 } 10183 10184 free(quiesce_ctx); 10185 } 10186 10187 static void 10188 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status) 10189 { 10190 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 10191 struct spdk_bdev_module *module = range->bdev->module; 10192 10193 if (status != 0) { 10194 if (quiesce_ctx->cb_fn != NULL) { 10195 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 10196 } 10197 free(quiesce_ctx); 10198 return; 10199 } 10200 10201 spdk_spin_lock(&module->internal.spinlock); 10202 TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module); 10203 spdk_spin_unlock(&module->internal.spinlock); 10204 10205 if (quiesce_ctx->cb_fn != NULL) { 10206 /* copy the context in case the range is unlocked by the callback */ 10207 struct bdev_quiesce_ctx tmp = *quiesce_ctx; 10208 10209 quiesce_ctx->cb_fn = NULL; 10210 quiesce_ctx->cb_arg = NULL; 10211 10212 tmp.cb_fn(tmp.cb_arg, status); 10213 } 10214 /* quiesce_ctx will be freed on unquiesce */ 10215 } 10216 10217 static int 10218 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10219 uint64_t offset, uint64_t length, 10220 spdk_bdev_quiesce_cb cb_fn, void *cb_arg, 10221 bool unquiesce) 10222 { 10223 struct bdev_quiesce_ctx *quiesce_ctx; 10224 int rc; 10225 10226 if (module != bdev->module) { 10227 SPDK_ERRLOG("Bdev does not belong to specified module.\n"); 10228 return -EINVAL; 10229 } 10230 10231 if (!bdev_io_valid_blocks(bdev, offset, length)) { 10232 return -EINVAL; 10233 } 10234 10235 if (unquiesce) { 10236 struct lba_range *range; 10237 10238 /* Make sure the specified range is actually quiesced in the specified module and 10239 * then remove it from the list. Note that the range must match exactly. 
10240 */ 10241 spdk_spin_lock(&module->internal.spinlock); 10242 TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) { 10243 if (range->bdev == bdev && range->offset == offset && range->length == length) { 10244 TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module); 10245 break; 10246 } 10247 } 10248 spdk_spin_unlock(&module->internal.spinlock); 10249 10250 if (range == NULL) { 10251 SPDK_ERRLOG("The range to unquiesce was not found.\n"); 10252 return -EINVAL; 10253 } 10254 10255 quiesce_ctx = range->locked_ctx; 10256 quiesce_ctx->cb_fn = cb_fn; 10257 quiesce_ctx->cb_arg = cb_arg; 10258 10259 rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx); 10260 } else { 10261 quiesce_ctx = malloc(sizeof(*quiesce_ctx)); 10262 if (quiesce_ctx == NULL) { 10263 return -ENOMEM; 10264 } 10265 10266 quiesce_ctx->cb_fn = cb_fn; 10267 quiesce_ctx->cb_arg = cb_arg; 10268 10269 rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx); 10270 if (rc != 0) { 10271 free(quiesce_ctx); 10272 } 10273 } 10274 10275 return rc; 10276 } 10277 10278 int 10279 spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10280 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10281 { 10282 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false); 10283 } 10284 10285 int 10286 spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10287 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10288 { 10289 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true); 10290 } 10291 10292 int 10293 spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10294 uint64_t offset, uint64_t length, 10295 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10296 { 10297 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false); 10298 } 10299 10300 int 10301 spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10302 uint64_t offset, uint64_t length, 10303 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10304 { 10305 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true); 10306 } 10307 10308 int 10309 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 10310 int array_size) 10311 { 10312 if (!bdev) { 10313 return -EINVAL; 10314 } 10315 10316 if (bdev->fn_table->get_memory_domains) { 10317 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 10318 } 10319 10320 return 0; 10321 } 10322 10323 struct spdk_bdev_for_each_io_ctx { 10324 void *ctx; 10325 spdk_bdev_io_fn fn; 10326 spdk_bdev_for_each_io_cb cb; 10327 }; 10328 10329 static void 10330 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10331 struct spdk_io_channel *io_ch, void *_ctx) 10332 { 10333 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10334 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 10335 struct spdk_bdev_io *bdev_io; 10336 int rc = 0; 10337 10338 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 10339 rc = ctx->fn(ctx->ctx, bdev_io); 10340 if (rc != 0) { 10341 break; 10342 } 10343 } 10344 10345 spdk_bdev_for_each_channel_continue(i, rc); 10346 } 10347 10348 static void 10349 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 10350 { 10351 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10352 10353 ctx->cb(ctx->ctx, status); 10354 10355 free(ctx); 10356 } 10357 10358 void 10359 
spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 10360 spdk_bdev_for_each_io_cb cb) 10361 { 10362 struct spdk_bdev_for_each_io_ctx *ctx; 10363 10364 assert(fn != NULL && cb != NULL); 10365 10366 ctx = calloc(1, sizeof(*ctx)); 10367 if (ctx == NULL) { 10368 SPDK_ERRLOG("Failed to allocate context.\n"); 10369 cb(_ctx, -ENOMEM); 10370 return; 10371 } 10372 10373 ctx->ctx = _ctx; 10374 ctx->fn = fn; 10375 ctx->cb = cb; 10376 10377 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 10378 bdev_for_each_io_done); 10379 } 10380 10381 void 10382 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 10383 { 10384 spdk_for_each_channel_continue(iter->i, status); 10385 } 10386 10387 static struct spdk_bdev * 10388 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 10389 { 10390 void *io_device = spdk_io_channel_iter_get_io_device(i); 10391 10392 return __bdev_from_io_dev(io_device); 10393 } 10394 10395 static void 10396 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 10397 { 10398 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10399 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10400 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 10401 10402 iter->i = i; 10403 iter->fn(iter, bdev, ch, iter->ctx); 10404 } 10405 10406 static void 10407 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 10408 { 10409 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10410 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10411 10412 iter->i = i; 10413 iter->cpl(bdev, iter->ctx, status); 10414 10415 free(iter); 10416 } 10417 10418 void 10419 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 10420 void *ctx, spdk_bdev_for_each_channel_done cpl) 10421 { 10422 struct spdk_bdev_channel_iter *iter; 10423 10424 assert(bdev != NULL && fn != NULL && ctx != NULL); 10425 10426 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 10427 if (iter == NULL) { 10428 SPDK_ERRLOG("Unable to allocate iterator\n"); 10429 assert(false); 10430 return; 10431 } 10432 10433 iter->fn = fn; 10434 iter->cpl = cpl; 10435 iter->ctx = ctx; 10436 10437 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 10438 iter, bdev_each_channel_cpl); 10439 } 10440 10441 static void 10442 bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10443 { 10444 struct spdk_bdev_io *parent_io = cb_arg; 10445 10446 spdk_bdev_free_io(bdev_io); 10447 10448 /* Check return status of write */ 10449 parent_io->internal.status = success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 10450 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 10451 } 10452 10453 static void 10454 bdev_copy_do_write(void *_bdev_io) 10455 { 10456 struct spdk_bdev_io *bdev_io = _bdev_io; 10457 int rc; 10458 10459 /* Write blocks */ 10460 rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc, 10461 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10462 bdev_io->u.bdev.iovs[0].iov_base, 10463 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks, 10464 bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io); 10465 10466 if (rc == -ENOMEM) { 10467 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write); 10468 } else if (rc != 0) { 10469 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10470 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10471 } 10472 } 10473 10474 static void 10475 bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10476 { 10477 struct spdk_bdev_io *parent_io = cb_arg; 10478 10479 spdk_bdev_free_io(bdev_io); 10480 10481 /* Check return status of read */ 10482 if (!success) { 10483 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10484 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 10485 return; 10486 } 10487 10488 /* Do write */ 10489 bdev_copy_do_write(parent_io); 10490 } 10491 10492 static void 10493 bdev_copy_do_read(void *_bdev_io) 10494 { 10495 struct spdk_bdev_io *bdev_io = _bdev_io; 10496 int rc; 10497 10498 /* Read blocks */ 10499 rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc, 10500 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10501 bdev_io->u.bdev.iovs[0].iov_base, 10502 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks, 10503 bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io); 10504 10505 if (rc == -ENOMEM) { 10506 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read); 10507 } else if (rc != 0) { 10508 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10509 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10510 } 10511 } 10512 10513 static void 10514 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 10515 { 10516 if (!success) { 10517 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10518 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10519 return; 10520 } 10521 10522 bdev_copy_do_read(bdev_io); 10523 } 10524 10525 int 10526 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 10527 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 10528 spdk_bdev_io_completion_cb cb, void *cb_arg) 10529 { 10530 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10531 struct spdk_bdev_io *bdev_io; 10532 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 10533 10534 if (!desc->write) { 10535 return -EBADF; 10536 } 10537 10538 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 10539 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 10540 SPDK_DEBUGLOG(bdev, 10541 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 10542 dst_offset_blocks, src_offset_blocks, num_blocks); 10543 return -EINVAL; 10544 } 10545 10546 bdev_io = bdev_channel_get_io(channel); 10547 if (!bdev_io) { 10548 return -ENOMEM; 10549 } 10550 10551 bdev_io->internal.ch = channel; 10552 bdev_io->internal.desc = desc; 10553 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 10554 10555 
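/* For a copy request, u.bdev.offset_blocks holds the destination offset; the source
 * offset is carried separately in u.bdev.copy.src_offset_blocks.
 */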
bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 10556 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 10557 bdev_io->u.bdev.num_blocks = num_blocks; 10558 bdev_io->u.bdev.memory_domain = NULL; 10559 bdev_io->u.bdev.memory_domain_ctx = NULL; 10560 bdev_io->u.bdev.iovs = NULL; 10561 bdev_io->u.bdev.iovcnt = 0; 10562 bdev_io->u.bdev.md_buf = NULL; 10563 bdev_io->u.bdev.accel_sequence = NULL; 10564 bdev_io_init(bdev_io, bdev, cb_arg, cb); 10565 10566 if (dst_offset_blocks == src_offset_blocks || num_blocks == 0) { 10567 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 10568 return 0; 10569 } 10570 10571 10572 /* If the copy size is large and should be split, use the generic split logic 10573 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not. 10574 * 10575 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or 10576 * emulate it using regular read and write requests otherwise. 10577 */ 10578 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) || 10579 bdev_io->internal.f.split) { 10580 bdev_io_submit(bdev_io); 10581 return 0; 10582 } 10583 10584 spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev)); 10585 10586 return 0; 10587 } 10588 10589 SPDK_LOG_REGISTER_COMPONENT(bdev) 10590 10591 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 10592 { 10593 struct spdk_trace_tpoint_opts opts[] = { 10594 { 10595 "BDEV_IO_START", TRACE_BDEV_IO_START, 10596 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 1, 10597 { 10598 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10599 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 10600 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10601 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 10602 } 10603 }, 10604 { 10605 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 10606 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 0, 10607 { 10608 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 10609 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 10610 } 10611 }, 10612 { 10613 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 10614 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 10615 { 10616 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 10617 } 10618 }, 10619 { 10620 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 10621 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 10622 { 10623 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 10624 } 10625 }, 10626 }; 10627 10628 10629 spdk_trace_register_owner_type(OWNER_TYPE_BDEV, 'b'); 10630 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 10631 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 10632 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 10633 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 10634 spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_START, OBJECT_BDEV_IO, 0); 10635 spdk_trace_tpoint_register_relation(TRACE_BLOB_REQ_SET_COMPLETE, OBJECT_BDEV_IO, 0); 10636 } 10637
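/*
 * Illustrative caller-side sketch (comment only, not compiled as part of this file):
 * a minimal example of driving spdk_bdev_set_qos_rate_limits() as implemented above.
 * The limits array is indexed by the SPDK_BDEV_QOS_*_RATE_LIMIT values from
 * spdk/bdev.h; entries left at UINT64_MAX (SPDK_BDEV_QOS_LIMIT_NOT_DEFINED within
 * this file) are left unchanged, a value of 0 disables that limit, and byte-based
 * limits are given in MB/s. The helper names my_qos_done() and example_set_qos()
 * are hypothetical.
 *
 *	static void
 *	my_qos_done(void *cb_arg, int status)
 *	{
 *		SPDK_NOTICELOG("QoS update finished with status %d\n", status);
 *	}
 *
 *	static void
 *	example_set_qos(struct spdk_bdev *bdev)
 *	{
 *		uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
 *		int i;
 *
 *		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
 *			limits[i] = UINT64_MAX;
 *		}
 *		limits[SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT] = 10000;  // 10k IO/s
 *		limits[SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT] = 100;     // 100 MB/s
 *
 *		spdk_bdev_set_qos_rate_limits(bdev, limits, my_qos_done, NULL);
 *	}
 */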