/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"
#include "spdk/dma.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"
#include "spdk_internal/assert.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define SPDK_BDEV_AUTO_EXAMINE			true
#define BUF_SMALL_CACHE_SIZE			128
#define BUF_LARGE_CACHE_SIZE			16
#define NOMEM_THRESHOLD_COUNT			8

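/* QoS granularity note: with the values below, each QoS timeslice is 1 ms, so a configured
 * limit is replenished at roughly limit / 1000 IOs or bytes per timeslice, and never less
 * than the per-timeslice minimums defined below.
 */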
#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC	(UINT64_MAX / (1024 * 1024))
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000

/* The maximum number of children requests for a UNMAP or WRITE ZEROES command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000

/* The maximum number of children requests for a COPY command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8)

#define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \
	log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev)
#ifdef DEBUG
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \
	log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev)
#else
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0)
#endif

static void log_already_claimed(enum spdk_log_level level, const int line, const char *func,
				const char *detail, struct spdk_bdev *bdev);

static const char *qos_rpc_type[] = {"rw_ios_per_sec",
				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
				    };

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

RB_HEAD(bdev_name_tree, spdk_bdev_name);

static int
bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
{
	return strcmp(name1->name, name2->name);
}

RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	void *zero_buffer;

	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

	struct spdk_bdev_list bdevs;
	struct bdev_name_tree bdev_names;

	bool init_complete;
	bool module_init_complete;

	struct spdk_spinlock spinlock;

	TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain *domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
	.init_complete = false,
	.module_init_complete = false,
	.async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens),
};

static void
__attribute__((constructor))
_bdev_init(void)
{
	spdk_spin_init(&g_bdev_mgr.spinlock);
}

typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status);

typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);

struct lba_range {
	struct spdk_bdev	*bdev;
	uint64_t		offset;
	uint64_t		length;
	bool			quiesce;
	void			*locked_ctx;
	struct spdk_thread	*owner_thread;
	struct spdk_bdev_channel *owner_ch;
	TAILQ_ENTRY(lba_range)	tailq;
	TAILQ_ENTRY(lba_range)	tailq_module;
};

static struct spdk_bdev_opts g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
	.iobuf_small_cache_size = BUF_SMALL_CACHE_SIZE,
	.iobuf_large_cache_size = BUF_LARGE_CACHE_SIZE,
};

static spdk_bdev_init_cb	g_init_cb_fn = NULL;
static void			*g_init_cb_arg = NULL;

static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
static void			*g_fini_cb_arg = NULL;
static struct spdk_thread	*g_fini_thread = NULL;

struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
	 * For remaining bytes, allowed to run negative if an I/O is submitted when
	 * some bytes are remaining, but the I/O is bigger than that amount. The
	 * excess will be deducted from the next timeslice.
	 */
	int64_t remaining_this_timeslice;
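	/* Illustrative example: with a 10 MB/s byte limit (roughly 10 KiB replenished per
	 * 1 ms timeslice), a 64 KiB write submitted while 4 KiB of quota remains is still
	 * sent and leaves the counter at about -60 KiB; several timeslices must then pass
	 * before it climbs back above zero and further I/O is allowed through.
	 */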

	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO.
	 * If the IO is allowed to pass, the quota will be reduced accordingly.
	 */
	bool (*queue_io)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to rewind the quota once the IO was allowed to be sent by this
	 * limit but queued due to one of the further limits.
	 */
	void (*rewind_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

struct spdk_bdev_qos {
	/** Rate limits, one for each type of rate limit. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache.  Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t	per_thread_cache_count;
	uint32_t	bdev_io_cache_size;

	struct spdk_iobuf_channel iobuf;

	TAILQ_HEAD(, spdk_bdev_shared_resource)	shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry)	io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * queue their IO that awaits retry here, which makes it possible to retry sending
 * IO to one bdev after IO from another bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel *shared_ch;

	struct spdk_poller *nomem_poller;

	/* Refcount of bdev channels using this resource */
	uint32_t ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev	*bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel	*channel;

	/* Accel channel */
	struct spdk_io_channel	*accel_channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat *stat;

	/*
	 * Count of I/O submitted to the underlying dev module through this channel
	 * and waiting for completion.
	 */
	uint64_t		io_outstanding;
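	/*
	 * Note: this per-channel counter is kept in step with shared_resource->io_outstanding;
	 * both are updated together by bdev_io_increment_outstanding() and
	 * bdev_io_decrement_outstanding().
	 */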

	/*
	 * List of all submitted I/Os including I/O that are generated via splitting.
	 */
	bdev_io_tailq_t		io_submitted;

	/*
	 * List of spdk_bdev_io that are currently queued because they write to a locked
	 * LBA range.
	 */
	bdev_io_tailq_t		io_locked;

	/* List of I/Os with accel sequence being currently executed */
	bdev_io_tailq_t		io_accel_exec;

	/* List of I/Os doing memory domain pull/push */
	bdev_io_tailq_t		io_memory_domain;

	uint32_t		flags;

	/* Counts number of bdev_io in the io_submitted TAILQ */
	uint16_t		queue_depth;

	uint16_t		trace_id;

	struct spdk_histogram_data *histogram;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t		start_tsc;
	uint64_t		interval_tsc;
	__itt_string_handle	*handle;
	struct spdk_bdev_io_stat *prev_stat;
#endif

	bdev_io_tailq_t		queued_resets;

	lba_range_tailq_t	locked_ranges;

	/** List of I/Os queued by QoS. */
	bdev_io_tailq_t		qos_queued_io;
};

struct media_event_entry {
	struct spdk_bdev_media_event	event;
	TAILQ_ENTRY(media_event_entry)	tailq;
};

#define MEDIA_EVENT_POOL_SIZE 64

struct spdk_bdev_desc {
	struct spdk_bdev		*bdev;
	struct spdk_thread		*thread;
	struct {
		spdk_bdev_event_cb_t event_fn;
		void *ctx;
	}				callback;
	bool				closed;
	bool				write;
	bool				memory_domains_supported;
	bool				accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES];
	struct spdk_spinlock		spinlock;
	uint32_t			refs;
	TAILQ_HEAD(, media_event_entry)	pending_media_events;
	TAILQ_HEAD(, media_event_entry)	free_media_events;
	struct media_event_entry	*media_events_buffer;
	TAILQ_ENTRY(spdk_bdev_desc)	link;

	uint64_t		timeout_in_sec;
	spdk_bdev_io_timeout_cb	cb_fn;
	void			*cb_arg;
	struct spdk_poller	*io_timeout_poller;
	struct spdk_bdev_module_claim	*claim;
};

struct spdk_bdev_iostat_ctx {
	struct spdk_bdev_io_stat *stat;
	spdk_bdev_get_device_stat_cb cb;
	void *cb_arg;
};

struct set_qos_limit_ctx {
	void (*cb_fn)(void *cb_arg, int status);
	void *cb_arg;
	struct spdk_bdev *bdev;
};

struct spdk_bdev_channel_iter {
	spdk_bdev_for_each_channel_msg fn;
	spdk_bdev_for_each_channel_done cpl;
	struct spdk_io_channel_iter *i;
	void *ctx;
};

struct spdk_bdev_io_error_stat {
	uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS];
};

enum bdev_io_retry_state {
	BDEV_IO_RETRY_STATE_INVALID,
	BDEV_IO_RETRY_STATE_PULL,
	BDEV_IO_RETRY_STATE_PULL_MD,
	BDEV_IO_RETRY_STATE_SUBMIT,
	BDEV_IO_RETRY_STATE_PUSH,
	BDEV_IO_RETRY_STATE_PUSH_MD,
};

#define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))
#define __io_ch_to_bdev_ch(io_ch)	((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch))
#define __io_ch_to_bdev_mgmt_ch(io_ch)	((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch))

static inline void bdev_io_complete(void *ctx);
static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io);
static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io);
static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io);

static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io);

static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				struct spdk_io_channel *ch, void *_ctx);
static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status);

static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				     struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
				     uint64_t num_blocks,
				     struct spdk_memory_domain *domain, void *domain_ctx,
				     struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
				     spdk_bdev_io_completion_cb cb, void *cb_arg);
static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				      struct iovec *iov, int iovcnt, void *md_buf,
				      uint64_t offset_blocks, uint64_t num_blocks,
				      struct spdk_memory_domain *domain, void *domain_ctx,
				      struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
				      uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw,
				      spdk_bdev_io_completion_cb cb, void *cb_arg);

static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
			       uint64_t offset, uint64_t length,
			       lock_range_cb cb_fn, void *cb_arg);

static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
				 uint64_t offset, uint64_t length,
				 lock_range_cb cb_fn, void *cb_arg);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);

static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
static void claim_reset(struct spdk_bdev *bdev);

static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch);

#define bdev_get_ext_io_opt(opts, field, defval) \
	((opts) != NULL ? SPDK_GET_FIELD(opts, field, defval) : (defval))

static inline void
bdev_ch_add_to_io_submitted(struct spdk_bdev_io *bdev_io)
{
	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
	bdev_io->internal.ch->queue_depth++;
}

static inline void
bdev_ch_remove_from_io_submitted(struct spdk_bdev_io *bdev_io)
{
	TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
	bdev_io->internal.ch->queue_depth--;
}

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	opts->opts_size = opts_size;

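	/* Copy each field only if it lies entirely within the caller-provided opts_size, so
	 * callers compiled against an older (smaller) struct spdk_bdev_opts still get only the
	 * fields they know about.
	 */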
#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(iobuf_small_cache_size);
	SET_FIELD(iobuf_large_cache_size);

	/* Do not remove this statement; always update it when adding a new field,
	 * and do not forget to add the SET_FIELD statement for the new field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}

int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	uint32_t min_pool_size;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
	 */
	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
	if (opts->bdev_io_pool_size < min_pool_size) {
		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
			    spdk_thread_get_count());
		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
		return -1;
	}

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
		g_bdev_opts.field = opts->field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(iobuf_small_cache_size);
	SET_FIELD(iobuf_large_cache_size);

	g_bdev_opts.opts_size = opts->opts_size;

#undef SET_FIELD

	return 0;
}

static struct spdk_bdev *
bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev_name find;
	struct spdk_bdev_name *res;

	find.name = (char *)bdev_name;
	res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find);
	if (res != NULL) {
		return res->bdev;
	}

	return NULL;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev *bdev;

	spdk_spin_lock(&g_bdev_mgr.spinlock);
	bdev = bdev_get_by_name(bdev_name);
	spdk_spin_unlock(&g_bdev_mgr.spinlock);

	return bdev;
}

struct bdev_io_status_string {
	enum spdk_bdev_io_status status;
	const char *str;
};

static const struct bdev_io_status_string bdev_io_status_strings[] = {
	{ SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" },
	{ SPDK_BDEV_IO_STATUS_ABORTED, "aborted" },
	{ SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" },
	{ SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" },
	{ SPDK_BDEV_IO_STATUS_NOMEM, "nomem" },
	{ SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" },
	{ SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" },
	{ SPDK_BDEV_IO_STATUS_FAILED, "failed" },
	{ SPDK_BDEV_IO_STATUS_PENDING, "pending" },
	{ SPDK_BDEV_IO_STATUS_SUCCESS, "success" },
};

static const char *
bdev_io_status_get_string(enum spdk_bdev_io_status status)
{
	uint32_t i;

	for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) {
		if (bdev_io_status_strings[i].status == status) {
			return bdev_io_status_strings[i].str;
		}
	}

	return "reserved";
}

struct spdk_bdev_wait_for_examine_ctx {
	struct spdk_poller		*poller;
	spdk_bdev_wait_for_examine_cb	cb_fn;
	void				*cb_arg;
};

static bool bdev_module_all_actions_completed(void);

static int
bdev_wait_for_examine_cb(void *arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx = arg;

	if (!bdev_module_all_actions_completed()) {
		return SPDK_POLLER_IDLE;
	}

	spdk_poller_unregister(&ctx->poller);
	ctx->cb_fn(ctx->cb_arg);
	free(ctx);

	return SPDK_POLLER_BUSY;
}

int
spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0);

	return 0;
}

struct spdk_bdev_examine_item {
	char *name;
	TAILQ_ENTRY(spdk_bdev_examine_item) link;
};

TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);

struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
			g_bdev_examine_allowlist);

static inline bool
bdev_examine_allowlist_check(const char *name)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		if (strcmp(name, item->name) == 0) {
			return true;
		}
	}
	return false;
}

static inline void
bdev_examine_allowlist_free(void)
{
	struct spdk_bdev_examine_item *item;
	while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
		item = TAILQ_FIRST(&g_bdev_examine_allowlist);
		TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
		free(item->name);
		free(item);
	}
}

static inline bool
bdev_in_examine_allowlist(struct spdk_bdev *bdev)
{
	struct spdk_bdev_alias *tmp;
	if (bdev_examine_allowlist_check(bdev->name)) {
		return true;
	}
	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
		if (bdev_examine_allowlist_check(tmp->alias.name)) {
			return true;
		}
	}
	return false;
}

static inline bool
bdev_ok_to_examine(struct spdk_bdev *bdev)
{
	if (g_bdev_opts.bdev_auto_examine) {
		return true;
	} else {
		return bdev_in_examine_allowlist(bdev);
	}
}

static void
bdev_examine(struct spdk_bdev *bdev)
{
	struct spdk_bdev_module *module;
	struct spdk_bdev_module_claim *claim, *tmpclaim;
	uint32_t action;

	if (!bdev_ok_to_examine(bdev)) {
		return;
	}

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (module->examine_config) {
			spdk_spin_lock(&module->internal.spinlock);
			action = module->internal.action_in_progress;
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			module->examine_config(bdev);
			if (action != module->internal.action_in_progress) {
				SPDK_ERRLOG("examine_config for module %s did not call "
					    "spdk_bdev_module_examine_done()\n", module->name);
			}
		}
	}

	spdk_spin_lock(&bdev->internal.spinlock);

	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		/* Examine by all bdev modules */
		TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (module->examine_disk) {
				spdk_spin_lock(&module->internal.spinlock);
				module->internal.action_in_progress++;
				spdk_spin_unlock(&module->internal.spinlock);
				spdk_spin_unlock(&bdev->internal.spinlock);
				module->examine_disk(bdev);
				spdk_spin_lock(&bdev->internal.spinlock);
			}
		}
		break;
	case SPDK_BDEV_CLAIM_EXCL_WRITE:
		/* Examine by the one bdev module with a v1 claim */
		module = bdev->internal.claim.v1.module;
		if (module->examine_disk) {
			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			return;
		}
		break;
	default:
		/* Examine by all bdev modules with a v2 claim */
		assert(claim_type_is_v2(bdev->internal.claim_type));
		/*
		 * Removal of tailq nodes while iterating can cause the iteration to jump out of the
		 * list, perhaps accessing freed memory. Without protection, this could happen
		 * while the lock is dropped during the examine callback.
		 */
		bdev->internal.examine_in_progress++;

		TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
			module = claim->module;

			if (module == NULL) {
				/* This is a vestigial claim, held by examine_count */
				continue;
			}

			if (module->examine_disk == NULL) {
				continue;
			}

			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);

			/* Call examine_disk without holding internal.spinlock. */
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			spdk_spin_lock(&bdev->internal.spinlock);
		}

		assert(bdev->internal.examine_in_progress > 0);
		bdev->internal.examine_in_progress--;
		if (bdev->internal.examine_in_progress == 0) {
			/* Remove any claims that were released during examine_disk */
			TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) {
				if (claim->desc != NULL) {
					continue;
				}

				TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link);
				free(claim);
			}
			if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
				claim_reset(bdev);
			}
		}
	}

	spdk_spin_unlock(&bdev->internal.spinlock);
}

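/* Illustrative usage: with auto-examine disabled via spdk_bdev_set_opts(), an application
 * calls spdk_bdev_examine("<name>") from the app thread to opt a single bdev into
 * examination. The name is kept on an allowlist, so a bdev registered later under that
 * name (or one of its aliases) is examined as well.
 */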
int
spdk_bdev_examine(const char *name)
{
	struct spdk_bdev *bdev;
	struct spdk_bdev_examine_item *item;
	struct spdk_thread *thread = spdk_get_thread();

	if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) {
		SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread,
			    thread ? spdk_thread_get_name(thread) : "null");
		return -EINVAL;
	}

	if (g_bdev_opts.bdev_auto_examine) {
		SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n");
		return -EINVAL;
	}

	if (bdev_examine_allowlist_check(name)) {
		SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
		return -EEXIST;
	}

	item = calloc(1, sizeof(*item));
	if (!item) {
		return -ENOMEM;
	}
	item->name = strdup(name);
	if (!item->name) {
		free(item);
		return -ENOMEM;
	}
	TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);

	bdev = spdk_bdev_get_by_name(name);
	if (bdev) {
		bdev_examine(bdev);
	}
	return 0;
}

static inline void
bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "method", "bdev_examine");
		spdk_json_write_named_object_begin(w, "params");
		spdk_json_write_named_string(w, "name", item->name);
		spdk_json_write_object_end(w);
		spdk_json_write_object_end(w);
	}
}

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, internal.link);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, internal.link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static inline bool
bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.f.has_memory_domain;
}

static inline bool
bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.f.has_accel_sequence;
}

static inline void
bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource,
			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	/* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io.
	 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth
	 * channels we will instead wait for half to complete.
	 */
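	/* Worked example: with 100 I/O outstanding the threshold becomes spdk_max(50, 92) = 92,
	 * i.e. wait for NOMEM_THRESHOLD_COUNT completions; with only 6 outstanding it becomes
	 * spdk_max(3, -2) = 3, i.e. wait until half of them have completed.
	 */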
	shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
					   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);

	assert(state != BDEV_IO_RETRY_STATE_INVALID);
	bdev_io->internal.retry_state = state;
	TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
}

static inline void
bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource,
			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	/* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while
	 * the queue isn't empty, so we don't need to update the nomem_threshold here */
	assert(!TAILQ_EMPTY(&shared_resource->nomem_io));

	assert(state != BDEV_IO_RETRY_STATE_INVALID);
	bdev_io->internal.retry_state = state;
	TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
}

void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	struct iovec *iovs;

	if (bdev_io->u.bdev.iovs == NULL) {
		bdev_io->u.bdev.iovs = &bdev_io->iov;
		bdev_io->u.bdev.iovcnt = 1;
	}

	iovs = bdev_io->u.bdev.iovs;

	assert(iovs != NULL);
	assert(bdev_io->u.bdev.iovcnt >= 1);

	iovs[0].iov_base = buf;
	iovs[0].iov_len = len;
}

void
spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
	bdev_io->u.bdev.md_buf = md_buf;
}

static bool
_is_buf_allocated(const struct iovec *iovs)
{
	if (iovs == NULL) {
		return false;
	}

	return iovs[0].iov_base != NULL;
}

static bool
_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
{
	int i;
	uintptr_t iov_base;

	if (spdk_likely(alignment == 1)) {
		return true;
	}

	for (i = 0; i < iovcnt; i++) {
		iov_base = (uintptr_t)iovs[i].iov_base;
		if ((iov_base & (alignment - 1)) != 0) {
			return false;
		}
	}

	return true;
}

static inline bool
bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
{
	if (!bdev_io_use_accel_sequence(bdev_io)) {
		return false;
	}

	/* For now, we don't allow splitting IOs with an accel sequence and will treat them as if
	 * the bdev module didn't support accel sequences */
	return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.f.split;
}

static inline void
bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	bdev_ch->io_outstanding++;
	shared_resource->io_outstanding++;
}

static inline void
bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	assert(bdev_ch->io_outstanding > 0);
	assert(shared_resource->io_outstanding > 0);
	bdev_ch->io_outstanding--;
	shared_resource->io_outstanding--;
}

static void
bdev_io_submit_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;

	assert(bdev_io_use_accel_sequence(bdev_io));

	bdev_io->u.bdev.accel_sequence = NULL;
	bdev_io->internal.f.has_accel_sequence = false;

	if (spdk_unlikely(status != 0)) {
		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io_complete_unsubmitted(bdev_io);
		return;
	}

	bdev_io_submit(bdev_io);
}

static void
bdev_io_exec_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io->internal.data_transfer_cpl(bdev_io, status);
}

static void
bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status))
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
	assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
	assert(bdev_io_use_accel_sequence(bdev_io));

	/* Since the operations are appended during submission, they're in the opposite order than
	 * how we want to execute them for reads (i.e. we need to execute the most recently added
	 * operation first), so reverse the sequence before executing it.
	 */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence);
	}

	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	bdev_io_increment_outstanding(ch, ch->shared_resource);
	bdev_io->internal.data_transfer_cpl = cb_fn;

	spdk_accel_sequence_finish(bdev_io->internal.accel_sequence,
				   bdev_io_exec_sequence_cb, bdev_io);
}

static void
bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status)
{
	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
	void *buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		buf = bdev_io->internal.buf.ptr;
		bdev_io->internal.buf.ptr = NULL;
		bdev_io->internal.f.has_buf = false;
		bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
		bdev_io->internal.get_aux_buf_cb = NULL;
	} else {
		assert(bdev_io->internal.get_buf_cb != NULL);
		bdev_io->internal.get_buf_cb(ch, bdev_io, status);
		bdev_io->internal.get_buf_cb = NULL;
	}
}

static void
_bdev_io_pull_buffer_cpl(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;

	if (rc) {
		SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	bdev_io_get_buf_complete(bdev_io, !rc);
}

static void
bdev_io_pull_md_buf_done(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	assert(bdev_io->internal.data_transfer_cpl);
	bdev_io->internal.data_transfer_cpl(bdev_io, status);
}

static void
bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

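	/* Only writes need the metadata copied into the bounce buffer before submission; for
	 * reads the data flows the other way and is pushed back to the original buffer on
	 * completion (see bdev_io_push_bounce_md_buf()).
	 */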
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  &bdev_io->internal.orig_md_iov, 1,
							  &bdev_io->internal.bounce_md_iov, 1,
							  bdev_io_pull_md_buf_done, bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain), rc);
			}
		} else {
			memcpy(bdev_io->internal.bounce_md_iov.iov_base,
			       bdev_io->internal.orig_md_iov.iov_base,
			       bdev_io->internal.orig_md_iov.iov_len);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD);
	} else {
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
	}
}

static void
_bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	/* save original md_buf */
	bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf;
	bdev_io->internal.orig_md_iov.iov_len = len;
	bdev_io->internal.bounce_md_iov.iov_base = md_buf;
	bdev_io->internal.bounce_md_iov.iov_len = len;
	/* set bounce md_buf */
	bdev_io->u.bdev.md_buf = md_buf;

	bdev_io_pull_md_buf(bdev_io);
}

static void
_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len;
	void *buf;

	if (spdk_bdev_is_md_separate(bdev)) {
		assert(!bdev_io_use_accel_sequence(bdev_io));

		buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len;
		md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;

		assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0);

		if (bdev_io->u.bdev.md_buf != NULL) {
			_bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len);
			return;
		} else {
			spdk_bdev_io_set_md_buf(bdev_io, buf, md_len);
		}
	}

	bdev_io_get_buf_complete(bdev_io, true);
}

static inline void
bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc)
{
	if (rc) {
		SPDK_ERRLOG("Failed to get data buffer\n");
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
		return;
	}

	_bdev_io_set_md_buf(bdev_io);
}

static void
bdev_io_pull_data_done_and_track(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io_pull_data_done(bdev_io, status);
}

static void
bdev_io_pull_data(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	/* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a
	 * sequence, append a copy operation, letting accel change the src/dst buffers of the
	 * previous operation */
	if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io) ||
	    (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) {
		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
			assert(bdev_io_use_accel_sequence(bdev_io));
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL,
						    bdev_io->internal.orig_iovs,
						    bdev_io->internal.orig_iovcnt,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
						    NULL, NULL);
		} else {
			/* We need to reverse the src/dst for reads */
			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
			assert(bdev_io_use_accel_sequence(bdev_io));
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->internal.orig_iovs,
						    bdev_io->internal.orig_iovcnt,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL,
						    bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL, NULL, NULL);
		}

		if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
			SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n",
				    bdev_io->internal.accel_sequence);
		}
	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		/* if this is write path, copy data from original buffer to bounce buffer */
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  bdev_io->internal.orig_iovs,
							  (uint32_t) bdev_io->internal.orig_iovcnt,
							  bdev_io->u.bdev.iovs, 1,
							  bdev_io_pull_data_done_and_track,
							  bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to pull data from memory domain %s\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain));
			}
		} else {
			assert(bdev_io->u.bdev.iovcnt == 1);
			spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base,
					      bdev_io->u.bdev.iovs[0].iov_len,
					      bdev_io->internal.orig_iovs,
					      bdev_io->internal.orig_iovcnt);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
	} else {
		bdev_io_pull_data_done(bdev_io, rc);
	}
}

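/* Swap the caller's iovec for a single bounce-buffer iovec, remembering the original so it
 * can be restored once the request completes. The data transfer itself starts immediately
 * unless NOMEM retries are already queued, in which case it is deferred behind them.
 */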
static void
_bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len,
			      bdev_copy_bounce_buffer_cpl cpl_cb)
{
	struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource;

	bdev_io->internal.data_transfer_cpl = cpl_cb;
	/* save original iovec */
	bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs;
	bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt;
	/* set bounce iov */
	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov;
	bdev_io->u.bdev.iovcnt = 1;
	/* set bounce buffer for this operation */
	bdev_io->u.bdev.iovs[0].iov_base = buf;
	bdev_io->u.bdev.iovs[0].iov_len = len;

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
	} else {
		bdev_io_pull_data(bdev_io);
	}
}

static void
_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	bool buf_allocated;
	uint64_t alignment;
	void *aligned_buf;

	bdev_io->internal.buf.ptr = buf;
	bdev_io->internal.f.has_buf = true;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		bdev_io_get_buf_complete(bdev_io, true);
		return;
	}

	alignment = spdk_bdev_get_buf_align(bdev);
	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
	aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));

	if (buf_allocated) {
		_bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl);
		/* Continue in completion callback */
		return;
	} else {
		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
	}

	_bdev_io_set_md_buf(bdev_io);
}

static inline uint64_t
bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len, alignment;

	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;

	/* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */
	alignment = spdk_bdev_get_buf_align(bdev) - 1;

	return len + alignment + md_len;
}

static void
_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
{
	struct spdk_bdev_mgmt_channel *ch;

	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
	spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len));
}

static void
bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	assert(bdev_io->internal.f.has_buf);
	_bdev_io_put_buf(bdev_io, bdev_io->internal.buf.ptr, bdev_io->internal.buf.len);
	bdev_io->internal.buf.ptr = NULL;
	bdev_io->internal.f.has_buf = false;
}

void
spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(buf != NULL);
	_bdev_io_put_buf(bdev_io, buf, len);
}

static inline void
bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch,
		    struct spdk_bdev_io *bdev_io)
{
	/* After a request is submitted to a bdev module, the ownership of an accel sequence
	 * associated with that bdev_io is transferred to the bdev module. So, clear the internal
	 * sequence pointer to make sure we won't touch it anymore.
	 */
	if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE ||
	     bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) {
		assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
		bdev_io->internal.f.has_accel_sequence = false;
	}

	bdev->fn_table->submit_request(ioch, bdev_io);
}

static inline void
bdev_ch_resubmit_io(struct spdk_bdev_shared_resource *shared_resource, struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;

	bdev_io_increment_outstanding(bdev_io->internal.ch, shared_resource);
	bdev_io->internal.error.nvme.cdw0 = 0;
	bdev_io->num_retries++;
	bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
}

static void
bdev_shared_ch_retry_io(struct spdk_bdev_shared_resource *shared_resource)
{
	struct spdk_bdev_io *bdev_io;

	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 * Some drivers (such as nvme) cannot immediately take a new I/O in
		 * the context of a completion, because the resources for the I/O are
		 * not released until control returns to the bdev poller.  Also, we
		 * may require several small I/O to complete before a larger I/O
		 * (that requires splitting) can be submitted.
		 */
		return;
	}

	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);

		switch (bdev_io->internal.retry_state) {
		case BDEV_IO_RETRY_STATE_SUBMIT:
			bdev_ch_resubmit_io(shared_resource, bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL:
			bdev_io_pull_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL_MD:
			bdev_io_pull_md_buf(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH:
			bdev_io_push_bounce_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH_MD:
			bdev_io_push_bounce_md_buf(bdev_io);
			break;
		default:
			assert(0 && "invalid retry state");
			break;
		}

		if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) {
			/* This IO completed again with NOMEM status, so break the loop and
			 * don't try anymore.  Note that a bdev_io that fails with NOMEM
			 * always gets requeued at the front of the list, to maintain
			 * ordering.
			 */
			break;
		}
	}
}

static void
bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	bdev_shared_ch_retry_io(bdev_ch->shared_resource);
}

static int
bdev_no_mem_poller(void *ctx)
{
	struct spdk_bdev_shared_resource *shared_resource = ctx;

	spdk_poller_unregister(&shared_resource->nomem_poller);

	if (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_shared_ch_retry_io(shared_resource);
	}
	/* the retry cb may re-register the poller so double check */
	if (!TAILQ_EMPTY(&shared_resource->nomem_io) &&
	    shared_resource->io_outstanding == 0 && shared_resource->nomem_poller == NULL) {
		/* No IOs were submitted, try again */
		shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
						SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
	}

	return SPDK_POLLER_BUSY;
}

static inline bool
_bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
		bdev_queue_nomem_io_head(shared_resource, bdev_io, state);

		if (shared_resource->io_outstanding == 0 && !shared_resource->nomem_poller) {
			/* Special case: there are nomem IOs but no outstanding IOs whose
			 * completions could trigger a retry of the queued IOs. Normally a
			 * completion of any submitted IO triggers that retry; this poller
			 * covers the case where no new IOs are submitted, e.g. qd == 1. */
			shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
							SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
		}
		/* If the bdev module completed an I/O that has an accel sequence with NOMEM status, the
		 * ownership of that sequence is transferred back to the bdev layer, so we need to
		 * restore internal.accel_sequence to make sure that the sequence is handled
		 * correctly in case the I/O is later aborted. */
		if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
		     bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) {
			assert(!bdev_io_use_accel_sequence(bdev_io));
			bdev_io->internal.f.has_accel_sequence = true;
			bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence;
		}

		return true;
	}

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_ch_retry_io(bdev_ch);
	}

	return false;
}

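/* A bdev module reports back-pressure by completing an I/O with SPDK_BDEV_IO_STATUS_NOMEM
 * (i.e. spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM)); the helpers above then
 * park the I/O on nomem_io and resubmit it once enough outstanding I/O has drained, or once
 * the fallback poller fires.
 */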
static void
_bdev_io_complete_push_bounce_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	if (rc) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	/* We want to free the bounce buffer here since we know we're done with it (as opposed
	 * to waiting for the conditional free of internal.buf.ptr in spdk_bdev_free_io()).
	 */
	bdev_io_put_buf(bdev_io);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	/* Continue with IO completion flow */
	bdev_io_complete(bdev_io);
}

static void
bdev_io_push_bounce_md_buf_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io->internal.data_transfer_cpl(bdev_io, rc);
}

static inline void
bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
	/* do the same for metadata buffer */
	if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) {
		assert(spdk_bdev_is_md_separate(bdev_io->bdev));

		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
			if (bdev_io_use_memory_domain(bdev_io)) {
				TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
				bdev_io_increment_outstanding(ch, ch->shared_resource);
				/* If memory domain is used then we need to call async push function */
				rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
								  bdev_io->internal.memory_domain_ctx,
								  &bdev_io->internal.orig_md_iov,
								  (uint32_t)bdev_io->internal.orig_iovcnt,
								  &bdev_io->internal.bounce_md_iov, 1,
								  bdev_io_push_bounce_md_buf_done,
								  bdev_io);
				if (rc == 0) {
					/* Continue IO completion in async callback */
					return;
				}
				TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
				bdev_io_decrement_outstanding(ch, ch->shared_resource);
				if (rc != -ENOMEM) {
					SPDK_ERRLOG("Failed to push md to memory domain %s\n",
						    spdk_memory_domain_get_dma_device_id(
							    bdev_io->internal.memory_domain));
				}
			} else {
				memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf,
				       bdev_io->internal.orig_md_iov.iov_len);
			}
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD);
	} else {
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
	}
}

static inline void
bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc)
{
	assert(bdev_io->internal.data_transfer_cpl);
	if (rc) {
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
		return;
	}

	/* set original buffer for this io */
	bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt;
	bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs;
	/* disable bouncing buffer for this io */
	bdev_io->internal.orig_iovcnt = 0;
	bdev_io->internal.orig_iovs = NULL;

	bdev_io_push_bounce_md_buf(bdev_io);
}

static void
bdev_io_push_bounce_data_done_and_track(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io_push_bounce_data_done(bdev_io, status);
}

static inline void
bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
	assert(!bdev_io_use_accel_sequence(bdev_io));

	/* if this is read path, copy data from bounce buffer to original buffer */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			/* If memory domain is used then we need to call async push function */
			rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  bdev_io->internal.orig_iovs,
							  (uint32_t)bdev_io->internal.orig_iovcnt,
							  &bdev_io->internal.bounce_iov, 1,
							  bdev_io_push_bounce_data_done_and_track,
							  bdev_io);
			if (rc == 0) {
				/* Continue IO completion in async callback */
				return;
			}

			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to push data to memory domain %s\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain));
			}
		} else {
			spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs,
					      bdev_io->internal.orig_iovcnt,
					      bdev_io->internal.bounce_iov.iov_base,
					      bdev_io->internal.bounce_iov.iov_len);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH);
	} else {
		bdev_io_push_bounce_data_done(bdev_io, rc);
	}
}

static inline void
_bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb)
{
	bdev_io->internal.data_transfer_cpl = cpl_cb;
	bdev_io_push_bounce_data(bdev_io);
}

static void
bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf)
{
	struct spdk_bdev_io *bdev_io;

	bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf);
	_bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf.len);
}

static void
bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev_mgmt_channel *mgmt_ch;
	uint64_t max_len;
	void *buf;

	assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread());
	mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
	max_len = bdev_io_get_max_buf_len(bdev_io, len);

	if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) {
		SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len);
		bdev_io_get_buf_complete(bdev_io, false);
		return;
	}

	bdev_io->internal.buf.len = len;
	buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf,
			     bdev_io_get_iobuf_cb);
	if (buf != NULL) {
		_bdev_io_set_buf(bdev_io, buf, len);
	}
}

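/* Illustrative sketch (hypothetical my_* names): a bdev module that needs a data buffer for
 * a read typically calls this from its submit_request() callback:
 *
 *	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
 *		spdk_bdev_io_get_buf(bdev_io, my_read_with_buf_cb,
 *				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
 *		return;
 *	}
 *
 * The callback is invoked with success == false if no buffer could be provided.
 */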
= cb; 1810 1811 alignment = spdk_bdev_get_buf_align(bdev); 1812 1813 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1814 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1815 /* Buffer already present and aligned */ 1816 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1817 return; 1818 } 1819 1820 bdev_io_get_buf(bdev_io, len); 1821 } 1822 1823 static void 1824 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1825 bool success) 1826 { 1827 if (!success) { 1828 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1829 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1830 bdev_io_complete_unsubmitted(bdev_io); 1831 return; 1832 } 1833 1834 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 1835 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1836 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 1837 return; 1838 } 1839 /* For reads we'll execute the sequence after the data is read, so, for now, only 1840 * clear out accel_sequence pointer and submit the IO */ 1841 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1842 bdev_io->u.bdev.accel_sequence = NULL; 1843 } 1844 1845 bdev_io_submit(bdev_io); 1846 } 1847 1848 static void 1849 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1850 uint64_t len) 1851 { 1852 assert(cb != NULL); 1853 bdev_io->internal.get_buf_cb = cb; 1854 1855 bdev_io_get_buf(bdev_io, len); 1856 } 1857 1858 void 1859 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1860 { 1861 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1862 1863 assert(cb != NULL); 1864 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1865 bdev_io->internal.get_aux_buf_cb = cb; 1866 bdev_io_get_buf(bdev_io, len); 1867 } 1868 1869 static int 1870 bdev_module_get_max_ctx_size(void) 1871 { 1872 struct spdk_bdev_module *bdev_module; 1873 int max_bdev_module_size = 0; 1874 1875 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1876 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1877 max_bdev_module_size = bdev_module->get_ctx_size(); 1878 } 1879 } 1880 1881 return max_bdev_module_size; 1882 } 1883 1884 static void 1885 bdev_enable_histogram_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1886 { 1887 if (!bdev->internal.histogram_enabled) { 1888 return; 1889 } 1890 1891 spdk_json_write_object_begin(w); 1892 spdk_json_write_named_string(w, "method", "bdev_enable_histogram"); 1893 1894 spdk_json_write_named_object_begin(w, "params"); 1895 spdk_json_write_named_string(w, "name", bdev->name); 1896 1897 spdk_json_write_named_bool(w, "enable", bdev->internal.histogram_enabled); 1898 1899 if (bdev->internal.histogram_io_type) { 1900 spdk_json_write_named_string(w, "opc", 1901 spdk_bdev_get_io_type_name(bdev->internal.histogram_io_type)); 1902 } 1903 1904 spdk_json_write_object_end(w); 1905 1906 spdk_json_write_object_end(w); 1907 } 1908 1909 static void 1910 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1911 { 1912 int i; 1913 struct spdk_bdev_qos *qos = bdev->internal.qos; 1914 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1915 1916 if (!qos) { 1917 return; 1918 } 1919 1920 spdk_bdev_get_qos_rate_limits(bdev, limits); 1921 1922 spdk_json_write_object_begin(w); 1923 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1924 1925 spdk_json_write_named_object_begin(w, "params"); 
1926 spdk_json_write_named_string(w, "name", bdev->name); 1927 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1928 if (limits[i] > 0) { 1929 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1930 } 1931 } 1932 spdk_json_write_object_end(w); 1933 1934 spdk_json_write_object_end(w); 1935 } 1936 1937 void 1938 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1939 { 1940 struct spdk_bdev_module *bdev_module; 1941 struct spdk_bdev *bdev; 1942 1943 assert(w != NULL); 1944 1945 spdk_json_write_array_begin(w); 1946 1947 spdk_json_write_object_begin(w); 1948 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1949 spdk_json_write_named_object_begin(w, "params"); 1950 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1951 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1952 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1953 spdk_json_write_named_uint32(w, "iobuf_small_cache_size", g_bdev_opts.iobuf_small_cache_size); 1954 spdk_json_write_named_uint32(w, "iobuf_large_cache_size", g_bdev_opts.iobuf_large_cache_size); 1955 spdk_json_write_object_end(w); 1956 spdk_json_write_object_end(w); 1957 1958 bdev_examine_allowlist_config_json(w); 1959 1960 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1961 if (bdev_module->config_json) { 1962 bdev_module->config_json(w); 1963 } 1964 } 1965 1966 spdk_spin_lock(&g_bdev_mgr.spinlock); 1967 1968 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1969 if (bdev->fn_table->write_config_json) { 1970 bdev->fn_table->write_config_json(bdev, w); 1971 } 1972 1973 bdev_qos_config_json(bdev, w); 1974 bdev_enable_histogram_config_json(bdev, w); 1975 } 1976 1977 spdk_spin_unlock(&g_bdev_mgr.spinlock); 1978 1979 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1980 spdk_json_write_object_begin(w); 1981 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1982 spdk_json_write_object_end(w); 1983 1984 spdk_json_write_array_end(w); 1985 } 1986 1987 static void 1988 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1989 { 1990 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1991 struct spdk_bdev_io *bdev_io; 1992 1993 spdk_iobuf_channel_fini(&ch->iobuf); 1994 1995 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1996 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1997 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1998 ch->per_thread_cache_count--; 1999 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2000 } 2001 2002 assert(ch->per_thread_cache_count == 0); 2003 } 2004 2005 static int 2006 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 2007 { 2008 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 2009 struct spdk_bdev_io *bdev_io; 2010 uint32_t i; 2011 int rc; 2012 2013 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", 2014 g_bdev_opts.iobuf_small_cache_size, 2015 g_bdev_opts.iobuf_large_cache_size); 2016 if (rc != 0) { 2017 SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 2018 return -1; 2019 } 2020 2021 STAILQ_INIT(&ch->per_thread_cache); 2022 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 2023 2024 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. 
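 *
 * The cache is refilled from the global bdev_io_pool, whose size comes from
 * spdk_bdev_opts. As an illustrative sketch (not part of this file), an
 * application expecting deep queues could enlarge both the pool and the
 * per-thread cache before the bdev subsystem is initialized; the field names
 * are the ones used by g_bdev_opts above:
 *
 *	struct spdk_bdev_opts opts;
 *
 *	spdk_bdev_get_opts(&opts, sizeof(opts));
 *	opts.bdev_io_pool_size = 128 * 1024 - 1;
 *	opts.bdev_io_cache_size = 512;
 *	spdk_bdev_set_opts(&opts);
 *
 * The same knobs are exposed through the bdev_set_options RPC mentioned in
 * the error message below.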
*/ 2025 ch->per_thread_cache_count = 0; 2026 for (i = 0; i < ch->bdev_io_cache_size; i++) { 2027 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2028 if (bdev_io == NULL) { 2029 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 2030 assert(false); 2031 bdev_mgmt_channel_destroy(io_device, ctx_buf); 2032 return -1; 2033 } 2034 ch->per_thread_cache_count++; 2035 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2036 } 2037 2038 TAILQ_INIT(&ch->shared_resources); 2039 TAILQ_INIT(&ch->io_wait_queue); 2040 2041 return 0; 2042 } 2043 2044 static void 2045 bdev_init_complete(int rc) 2046 { 2047 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 2048 void *cb_arg = g_init_cb_arg; 2049 struct spdk_bdev_module *m; 2050 2051 g_bdev_mgr.init_complete = true; 2052 g_init_cb_fn = NULL; 2053 g_init_cb_arg = NULL; 2054 2055 /* 2056 * For modules that need to know when subsystem init is complete, 2057 * inform them now. 2058 */ 2059 if (rc == 0) { 2060 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2061 if (m->init_complete) { 2062 m->init_complete(); 2063 } 2064 } 2065 } 2066 2067 cb_fn(cb_arg, rc); 2068 } 2069 2070 static bool 2071 bdev_module_all_actions_completed(void) 2072 { 2073 struct spdk_bdev_module *m; 2074 2075 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2076 if (m->internal.action_in_progress > 0) { 2077 return false; 2078 } 2079 } 2080 return true; 2081 } 2082 2083 static void 2084 bdev_module_action_complete(void) 2085 { 2086 /* 2087 * Don't finish bdev subsystem initialization if 2088 * module pre-initialization is still in progress, or 2089 * the subsystem been already initialized. 2090 */ 2091 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 2092 return; 2093 } 2094 2095 /* 2096 * Check all bdev modules for inits/examinations in progress. If any 2097 * exist, return immediately since we cannot finish bdev subsystem 2098 * initialization until all are completed. 2099 */ 2100 if (!bdev_module_all_actions_completed()) { 2101 return; 2102 } 2103 2104 /* 2105 * Modules already finished initialization - now that all 2106 * the bdev modules have finished their asynchronous I/O 2107 * processing, the entire bdev layer can be marked as complete. 
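 *
 * For reference, a module takes part in this handshake by setting async_init
 * and signalling completion from its own context once its asynchronous work
 * finishes. A hypothetical sketch (my_module and start_async_probe are
 * illustrative names; the module is registered elsewhere with
 * SPDK_BDEV_MODULE_REGISTER()):
 *
 *	static int
 *	my_module_init(void)
 *	{
 *		start_async_probe();
 *		return 0;
 *	}
 *
 *	static struct spdk_bdev_module my_module = {
 *		.name = "my_module",
 *		.module_init = my_module_init,
 *		.async_init = true,
 *	};
 *
 * When the hypothetical probe completes, its callback calls
 * spdk_bdev_module_init_done(&my_module), which drives the accounting below.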
2108 */ 2109 bdev_init_complete(0); 2110 } 2111 2112 static void 2113 bdev_module_action_done(struct spdk_bdev_module *module) 2114 { 2115 spdk_spin_lock(&module->internal.spinlock); 2116 assert(module->internal.action_in_progress > 0); 2117 module->internal.action_in_progress--; 2118 spdk_spin_unlock(&module->internal.spinlock); 2119 bdev_module_action_complete(); 2120 } 2121 2122 void 2123 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 2124 { 2125 assert(module->async_init); 2126 bdev_module_action_done(module); 2127 } 2128 2129 void 2130 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 2131 { 2132 bdev_module_action_done(module); 2133 } 2134 2135 /** The last initialized bdev module */ 2136 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 2137 2138 static void 2139 bdev_init_failed(void *cb_arg) 2140 { 2141 struct spdk_bdev_module *module = cb_arg; 2142 2143 spdk_spin_lock(&module->internal.spinlock); 2144 assert(module->internal.action_in_progress > 0); 2145 module->internal.action_in_progress--; 2146 spdk_spin_unlock(&module->internal.spinlock); 2147 bdev_init_complete(-1); 2148 } 2149 2150 static int 2151 bdev_modules_init(void) 2152 { 2153 struct spdk_bdev_module *module; 2154 int rc = 0; 2155 2156 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2157 g_resume_bdev_module = module; 2158 if (module->async_init) { 2159 spdk_spin_lock(&module->internal.spinlock); 2160 module->internal.action_in_progress = 1; 2161 spdk_spin_unlock(&module->internal.spinlock); 2162 } 2163 rc = module->module_init(); 2164 if (rc != 0) { 2165 /* Bump action_in_progress to prevent other modules from completion of modules_init 2166 * Send message to defer application shutdown until resources are cleaned up */ 2167 spdk_spin_lock(&module->internal.spinlock); 2168 module->internal.action_in_progress = 1; 2169 spdk_spin_unlock(&module->internal.spinlock); 2170 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 2171 return rc; 2172 } 2173 } 2174 2175 g_resume_bdev_module = NULL; 2176 return 0; 2177 } 2178 2179 void 2180 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 2181 { 2182 int rc = 0; 2183 char mempool_name[32]; 2184 2185 assert(cb_fn != NULL); 2186 2187 g_init_cb_fn = cb_fn; 2188 g_init_cb_arg = cb_arg; 2189 2190 spdk_notify_type_register("bdev_register"); 2191 spdk_notify_type_register("bdev_unregister"); 2192 2193 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2194 2195 rc = spdk_iobuf_register_module("bdev"); 2196 if (rc != 0) { 2197 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2198 bdev_init_complete(-1); 2199 return; 2200 } 2201 2202 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2203 g_bdev_opts.bdev_io_pool_size, 2204 sizeof(struct spdk_bdev_io) + 2205 bdev_module_get_max_ctx_size(), 2206 0, 2207 SPDK_ENV_SOCKET_ID_ANY); 2208 2209 if (g_bdev_mgr.bdev_io_pool == NULL) { 2210 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2211 bdev_init_complete(-1); 2212 return; 2213 } 2214 2215 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2216 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2217 if (!g_bdev_mgr.zero_buffer) { 2218 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2219 bdev_init_complete(-1); 2220 return; 2221 } 2222 2223 #ifdef SPDK_CONFIG_VTUNE 2224 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2225 #endif 2226 2227 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 2228 
bdev_mgmt_channel_destroy, 2229 sizeof(struct spdk_bdev_mgmt_channel), 2230 "bdev_mgr"); 2231 2232 rc = bdev_modules_init(); 2233 g_bdev_mgr.module_init_complete = true; 2234 if (rc != 0) { 2235 SPDK_ERRLOG("bdev modules init failed\n"); 2236 return; 2237 } 2238 2239 bdev_module_action_complete(); 2240 } 2241 2242 static void 2243 bdev_mgr_unregister_cb(void *io_device) 2244 { 2245 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2246 2247 if (g_bdev_mgr.bdev_io_pool) { 2248 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2249 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2250 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2251 g_bdev_opts.bdev_io_pool_size); 2252 } 2253 2254 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2255 } 2256 2257 spdk_free(g_bdev_mgr.zero_buffer); 2258 2259 bdev_examine_allowlist_free(); 2260 2261 cb_fn(g_fini_cb_arg); 2262 g_fini_cb_fn = NULL; 2263 g_fini_cb_arg = NULL; 2264 g_bdev_mgr.init_complete = false; 2265 g_bdev_mgr.module_init_complete = false; 2266 } 2267 2268 static void 2269 bdev_module_fini_iter(void *arg) 2270 { 2271 struct spdk_bdev_module *bdev_module; 2272 2273 /* FIXME: Handling initialization failures is broken now, 2274 * so we won't even try cleaning up after successfully 2275 * initialized modules. if module_init_complete is false, 2276 * just call spdk_bdev_mgr_unregister_cb 2277 */ 2278 if (!g_bdev_mgr.module_init_complete) { 2279 bdev_mgr_unregister_cb(NULL); 2280 return; 2281 } 2282 2283 /* Start iterating from the last touched module */ 2284 if (!g_resume_bdev_module) { 2285 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2286 } else { 2287 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2288 internal.tailq); 2289 } 2290 2291 while (bdev_module) { 2292 if (bdev_module->async_fini) { 2293 /* Save our place so we can resume later. We must 2294 * save the variable here, before calling module_fini() 2295 * below, because in some cases the module may immediately 2296 * call spdk_bdev_module_fini_done() and re-enter 2297 * this function to continue iterating. */ 2298 g_resume_bdev_module = bdev_module; 2299 } 2300 2301 if (bdev_module->module_fini) { 2302 bdev_module->module_fini(); 2303 } 2304 2305 if (bdev_module->async_fini) { 2306 return; 2307 } 2308 2309 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2310 internal.tailq); 2311 } 2312 2313 g_resume_bdev_module = NULL; 2314 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2315 } 2316 2317 void 2318 spdk_bdev_module_fini_done(void) 2319 { 2320 if (spdk_get_thread() != g_fini_thread) { 2321 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2322 } else { 2323 bdev_module_fini_iter(NULL); 2324 } 2325 } 2326 2327 static void 2328 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2329 { 2330 struct spdk_bdev *bdev = cb_arg; 2331 2332 if (bdeverrno && bdev) { 2333 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2334 bdev->name); 2335 2336 /* 2337 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2338 * bdev; try to continue by manually removing this bdev from the list and continue 2339 * with the next bdev in the list. 
2340 */ 2341 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2342 } 2343 2344 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2345 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2346 /* 2347 * Bdev module finish need to be deferred as we might be in the middle of some context 2348 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2349 * after returning. 2350 */ 2351 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2352 return; 2353 } 2354 2355 /* 2356 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 2357 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2358 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2359 * base bdevs. 2360 * 2361 * Also, walk the list in the reverse order. 2362 */ 2363 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2364 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2365 spdk_spin_lock(&bdev->internal.spinlock); 2366 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2367 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2368 spdk_spin_unlock(&bdev->internal.spinlock); 2369 continue; 2370 } 2371 spdk_spin_unlock(&bdev->internal.spinlock); 2372 2373 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2374 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2375 return; 2376 } 2377 2378 /* 2379 * If any bdev fails to unclaim underlying bdev properly, we may face the 2380 * case of bdev list consisting of claimed bdevs only (if claims are managed 2381 * correctly, this would mean there's a loop in the claims graph which is 2382 * clearly impossible). Warn and unregister last bdev on the list then. 2383 */ 2384 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2385 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2386 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2387 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2388 return; 2389 } 2390 } 2391 2392 static void 2393 bdev_module_fini_start_iter(void *arg) 2394 { 2395 struct spdk_bdev_module *bdev_module; 2396 2397 if (!g_resume_bdev_module) { 2398 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2399 } else { 2400 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2401 } 2402 2403 while (bdev_module) { 2404 if (bdev_module->async_fini_start) { 2405 /* Save our place so we can resume later. We must 2406 * save the variable here, before calling fini_start() 2407 * below, because in some cases the module may immediately 2408 * call spdk_bdev_module_fini_start_done() and re-enter 2409 * this function to continue iterating. 
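 *
 * A module opts into this two-phase teardown by providing fini_start and, if
 * that work is asynchronous, setting async_fini_start and calling
 * spdk_bdev_module_fini_start_done() when it finishes. A hypothetical sketch
 * (my_module and begin_draining_io are illustrative names only):
 *
 *	static void
 *	my_module_fini_start(void)
 *	{
 *		begin_draining_io();
 *	}
 *
 *	static struct spdk_bdev_module my_module = {
 *		.name = "my_module",
 *		.fini_start = my_module_fini_start,
 *		.async_fini_start = true,
 *	};
 *
 * Once the draining completes, the module calls
 * spdk_bdev_module_fini_start_done() and iteration resumes here.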
*/ 2410 g_resume_bdev_module = bdev_module; 2411 } 2412 2413 if (bdev_module->fini_start) { 2414 bdev_module->fini_start(); 2415 } 2416 2417 if (bdev_module->async_fini_start) { 2418 return; 2419 } 2420 2421 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2422 } 2423 2424 g_resume_bdev_module = NULL; 2425 2426 bdev_finish_unregister_bdevs_iter(NULL, 0); 2427 } 2428 2429 void 2430 spdk_bdev_module_fini_start_done(void) 2431 { 2432 if (spdk_get_thread() != g_fini_thread) { 2433 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2434 } else { 2435 bdev_module_fini_start_iter(NULL); 2436 } 2437 } 2438 2439 static void 2440 bdev_finish_wait_for_examine_done(void *cb_arg) 2441 { 2442 bdev_module_fini_start_iter(NULL); 2443 } 2444 2445 static void bdev_open_async_fini(void); 2446 2447 void 2448 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2449 { 2450 int rc; 2451 2452 assert(cb_fn != NULL); 2453 2454 g_fini_thread = spdk_get_thread(); 2455 2456 g_fini_cb_fn = cb_fn; 2457 g_fini_cb_arg = cb_arg; 2458 2459 bdev_open_async_fini(); 2460 2461 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2462 if (rc != 0) { 2463 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2464 bdev_finish_wait_for_examine_done(NULL); 2465 } 2466 } 2467 2468 struct spdk_bdev_io * 2469 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2470 { 2471 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2472 struct spdk_bdev_io *bdev_io; 2473 2474 if (ch->per_thread_cache_count > 0) { 2475 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2476 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2477 ch->per_thread_cache_count--; 2478 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2479 /* 2480 * Don't try to look for bdev_ios in the global pool if there are 2481 * waiters on bdev_ios - we don't want this caller to jump the line. 2482 */ 2483 bdev_io = NULL; 2484 } else { 2485 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2486 } 2487 2488 return bdev_io; 2489 } 2490 2491 void 2492 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2493 { 2494 struct spdk_bdev_mgmt_channel *ch; 2495 2496 assert(bdev_io != NULL); 2497 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2498 2499 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2500 2501 if (bdev_io->internal.f.has_buf) { 2502 bdev_io_put_buf(bdev_io); 2503 } 2504 2505 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2506 ch->per_thread_cache_count++; 2507 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2508 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2509 struct spdk_bdev_io_wait_entry *entry; 2510 2511 entry = TAILQ_FIRST(&ch->io_wait_queue); 2512 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2513 entry->cb_fn(entry->cb_arg); 2514 } 2515 } else { 2516 /* We should never have a full cache with entries on the io wait queue. 
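 *
 * Entries land on io_wait_queue when a submitter runs out of bdev_io objects
 * and registers for a retry. A typical caller-side pattern, sketched here for
 * illustration (read_done, retry_read and ctx are hypothetical):
 *
 *	static struct spdk_bdev_io_wait_entry wait_entry;
 *
 *	rc = spdk_bdev_read_blocks(desc, io_ch, buf, offset_blocks, num_blocks,
 *				   read_done, ctx);
 *	if (rc == -ENOMEM) {
 *		wait_entry.bdev = spdk_bdev_desc_get_bdev(desc);
 *		wait_entry.cb_fn = retry_read;
 *		wait_entry.cb_arg = ctx;
 *		spdk_bdev_queue_io_wait(wait_entry.bdev, io_ch, &wait_entry);
 *	}
 *
 * Freed bdev_ios are handed to waiters in the branch above, which is why a
 * full per-thread cache can never coexist with a non-empty wait queue.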
 */
2517 		assert(TAILQ_EMPTY(&ch->io_wait_queue));
2518 		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
2519 	}
2520 }
2521 
2522 static bool
2523 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit)
2524 {
2525 	assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);
2526 
2527 	switch (limit) {
2528 	case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
2529 		return true;
2530 	case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
2531 	case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
2532 	case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
2533 		return false;
2534 	case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES:
2535 	default:
2536 		return false;
2537 	}
2538 }
2539 
2540 static bool
2541 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io)
2542 {
2543 	switch (bdev_io->type) {
2544 	case SPDK_BDEV_IO_TYPE_NVME_IO:
2545 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2546 	case SPDK_BDEV_IO_TYPE_READ:
2547 	case SPDK_BDEV_IO_TYPE_WRITE:
2548 		return true;
2549 	case SPDK_BDEV_IO_TYPE_ZCOPY:
2550 		if (bdev_io->u.bdev.zcopy.start) {
2551 			return true;
2552 		} else {
2553 			return false;
2554 		}
2555 	default:
2556 		return false;
2557 	}
2558 }
2559 
2560 static bool
2561 bdev_is_read_io(struct spdk_bdev_io *bdev_io)
2562 {
2563 	switch (bdev_io->type) {
2564 	case SPDK_BDEV_IO_TYPE_NVME_IO:
2565 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2566 		/* Bit 1 (0x2) set for read operation */
2567 		if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) {
2568 			return true;
2569 		} else {
2570 			return false;
2571 		}
2572 	case SPDK_BDEV_IO_TYPE_READ:
2573 		return true;
2574 	case SPDK_BDEV_IO_TYPE_ZCOPY:
2575 		/* Populate to read from disk */
2576 		if (bdev_io->u.bdev.zcopy.populate) {
2577 			return true;
2578 		} else {
2579 			return false;
2580 		}
2581 	default:
2582 		return false;
2583 	}
2584 }
2585 
2586 static uint64_t
2587 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io)
2588 {
2589 	struct spdk_bdev *bdev = bdev_io->bdev;
2590 
2591 	switch (bdev_io->type) {
2592 	case SPDK_BDEV_IO_TYPE_NVME_IO:
2593 	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2594 		return bdev_io->u.nvme_passthru.nbytes;
2595 	case SPDK_BDEV_IO_TYPE_READ:
2596 	case SPDK_BDEV_IO_TYPE_WRITE:
2597 		return bdev_io->u.bdev.num_blocks * bdev->blocklen;
2598 	case SPDK_BDEV_IO_TYPE_ZCOPY:
2599 		/* Track the data in the start phase only */
2600 		if (bdev_io->u.bdev.zcopy.start) {
2601 			return bdev_io->u.bdev.num_blocks * bdev->blocklen;
2602 		} else {
2603 			return 0;
2604 		}
2605 	default:
2606 		return 0;
2607 	}
2608 }
2609 
2610 static inline bool
2611 bdev_qos_rw_queue_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta)
2612 {
2613 	int64_t remaining_this_timeslice;
2614 
2615 	if (!limit->max_per_timeslice) {
2616 		/* The QoS is disabled */
2617 		return false;
2618 	}
2619 
2620 	remaining_this_timeslice = __atomic_sub_fetch(&limit->remaining_this_timeslice, delta,
2621 			__ATOMIC_RELAXED);
2622 	if (remaining_this_timeslice + (int64_t)delta > 0) {
2623 		/* There was still quota for this delta -> the IO shouldn't be queued.
2624 		 *
2625 		 * We allow a slight quota overrun here so that an IO bigger than the per-timeslice
2626 		 * quota can be allowed once in a while. Such an overrun is then taken into account
2627 		 * in the QoS poller, where the next timeslice quota is calculated.
2628 		 */
2629 		return false;
2630 	}
2631 
2632 	/* There was no quota for this delta -> the IO should be queued.
2633 	 * The remaining_this_timeslice must be rewound so that it reflects the real
2634 	 * amount of IOs or bytes allowed.
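 *
 * Worked example (illustrative numbers): with a 100 MB/s rw_mbytes_per_sec
 * limit, each 1 ms timeslice starts with roughly 100,000 bytes of quota. If
 * 4,000 bytes remain and a 131,072-byte I/O arrives, the subtraction drives
 * the counter to -127,072, but the I/O is still admitted because the
 * pre-subtraction value was positive; the deficit is paid back when the QoS
 * poller refills the next timeslice. If the counter had already been at or
 * below zero, the subtraction is undone here and the I/O is queued instead.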
2635 */ 2636 __atomic_add_fetch( 2637 &limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2638 return true; 2639 } 2640 2641 static inline void 2642 bdev_qos_rw_rewind_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2643 { 2644 __atomic_add_fetch(&limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2645 } 2646 2647 static bool 2648 bdev_qos_rw_iops_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2649 { 2650 return bdev_qos_rw_queue_io(limit, io, 1); 2651 } 2652 2653 static void 2654 bdev_qos_rw_iops_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2655 { 2656 bdev_qos_rw_rewind_io(limit, io, 1); 2657 } 2658 2659 static bool 2660 bdev_qos_rw_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2661 { 2662 return bdev_qos_rw_queue_io(limit, io, bdev_get_io_size_in_byte(io)); 2663 } 2664 2665 static void 2666 bdev_qos_rw_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2667 { 2668 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2669 } 2670 2671 static bool 2672 bdev_qos_r_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2673 { 2674 if (bdev_is_read_io(io) == false) { 2675 return false; 2676 } 2677 2678 return bdev_qos_rw_bps_queue(limit, io); 2679 } 2680 2681 static void 2682 bdev_qos_r_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2683 { 2684 if (bdev_is_read_io(io) != false) { 2685 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2686 } 2687 } 2688 2689 static bool 2690 bdev_qos_w_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2691 { 2692 if (bdev_is_read_io(io) == true) { 2693 return false; 2694 } 2695 2696 return bdev_qos_rw_bps_queue(limit, io); 2697 } 2698 2699 static void 2700 bdev_qos_w_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2701 { 2702 if (bdev_is_read_io(io) != true) { 2703 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2704 } 2705 } 2706 2707 static void 2708 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2709 { 2710 int i; 2711 2712 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2713 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2714 qos->rate_limits[i].queue_io = NULL; 2715 continue; 2716 } 2717 2718 switch (i) { 2719 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2720 qos->rate_limits[i].queue_io = bdev_qos_rw_iops_queue; 2721 qos->rate_limits[i].rewind_quota = bdev_qos_rw_iops_rewind_quota; 2722 break; 2723 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2724 qos->rate_limits[i].queue_io = bdev_qos_rw_bps_queue; 2725 qos->rate_limits[i].rewind_quota = bdev_qos_rw_bps_rewind_quota; 2726 break; 2727 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2728 qos->rate_limits[i].queue_io = bdev_qos_r_bps_queue; 2729 qos->rate_limits[i].rewind_quota = bdev_qos_r_bps_rewind_quota; 2730 break; 2731 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2732 qos->rate_limits[i].queue_io = bdev_qos_w_bps_queue; 2733 qos->rate_limits[i].rewind_quota = bdev_qos_w_bps_rewind_quota; 2734 break; 2735 default: 2736 break; 2737 } 2738 } 2739 } 2740 2741 static void 2742 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2743 struct spdk_bdev_io *bdev_io, 2744 enum spdk_bdev_io_status status) 2745 { 2746 bdev_io->internal.in_submit_request = true; 2747 bdev_io_increment_outstanding(bdev_ch, bdev_ch->shared_resource); 2748 spdk_bdev_io_complete(bdev_io, status); 2749 bdev_io->internal.in_submit_request = false; 2750 
} 2751 2752 static inline void 2753 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2754 { 2755 struct spdk_bdev *bdev = bdev_io->bdev; 2756 struct spdk_io_channel *ch = bdev_ch->channel; 2757 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2758 2759 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2760 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2761 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2762 2763 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2764 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2765 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2766 SPDK_BDEV_IO_STATUS_SUCCESS); 2767 return; 2768 } 2769 } 2770 2771 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2772 bdev_io->bdev->split_on_write_unit && 2773 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2774 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2775 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2776 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2777 return; 2778 } 2779 2780 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2781 bdev_io_increment_outstanding(bdev_ch, shared_resource); 2782 bdev_io->internal.in_submit_request = true; 2783 bdev_submit_request(bdev, ch, bdev_io); 2784 bdev_io->internal.in_submit_request = false; 2785 } else { 2786 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT); 2787 if (shared_resource->nomem_threshold == 0 && shared_resource->io_outstanding == 0) { 2788 /* Special case when we have nomem IOs and no outstanding IOs which completions 2789 * could trigger retry of queued IOs */ 2790 bdev_shared_ch_retry_io(shared_resource); 2791 } 2792 } 2793 } 2794 2795 static bool 2796 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2797 { 2798 int i; 2799 2800 if (bdev_qos_io_to_limit(bdev_io) == true) { 2801 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2802 if (!qos->rate_limits[i].queue_io) { 2803 continue; 2804 } 2805 2806 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2807 bdev_io) == true) { 2808 for (i -= 1; i >= 0 ; i--) { 2809 if (!qos->rate_limits[i].queue_io) { 2810 continue; 2811 } 2812 2813 qos->rate_limits[i].rewind_quota(&qos->rate_limits[i], bdev_io); 2814 } 2815 return true; 2816 } 2817 } 2818 } 2819 2820 return false; 2821 } 2822 2823 static int 2824 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2825 { 2826 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2827 int submitted_ios = 0; 2828 2829 TAILQ_FOREACH_SAFE(bdev_io, &ch->qos_queued_io, internal.link, tmp) { 2830 if (!bdev_qos_queue_io(qos, bdev_io)) { 2831 TAILQ_REMOVE(&ch->qos_queued_io, bdev_io, internal.link); 2832 bdev_io_do_submit(ch, bdev_io); 2833 2834 submitted_ios++; 2835 } 2836 } 2837 2838 return submitted_ios; 2839 } 2840 2841 static void 2842 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2843 { 2844 int rc; 2845 2846 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2847 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2848 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2849 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2850 &bdev_io->internal.waitq_entry); 2851 if (rc != 0) { 2852 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2853 bdev_io->internal.status = 
SPDK_BDEV_IO_STATUS_FAILED; 2854 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2855 } 2856 } 2857 2858 static bool 2859 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2860 { 2861 uint32_t io_boundary; 2862 struct spdk_bdev *bdev = bdev_io->bdev; 2863 uint32_t max_segment_size = bdev->max_segment_size; 2864 uint32_t max_size = bdev->max_rw_size; 2865 int max_segs = bdev->max_num_segments; 2866 2867 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2868 io_boundary = bdev->write_unit_size; 2869 } else if (bdev->split_on_optimal_io_boundary) { 2870 io_boundary = bdev->optimal_io_boundary; 2871 } else { 2872 io_boundary = 0; 2873 } 2874 2875 if (spdk_likely(!io_boundary && !max_segs && !max_segment_size && !max_size)) { 2876 return false; 2877 } 2878 2879 if (io_boundary) { 2880 uint64_t start_stripe, end_stripe; 2881 2882 start_stripe = bdev_io->u.bdev.offset_blocks; 2883 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2884 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 2885 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2886 start_stripe >>= spdk_u32log2(io_boundary); 2887 end_stripe >>= spdk_u32log2(io_boundary); 2888 } else { 2889 start_stripe /= io_boundary; 2890 end_stripe /= io_boundary; 2891 } 2892 2893 if (start_stripe != end_stripe) { 2894 return true; 2895 } 2896 } 2897 2898 if (max_segs) { 2899 if (bdev_io->u.bdev.iovcnt > max_segs) { 2900 return true; 2901 } 2902 } 2903 2904 if (max_segment_size) { 2905 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2906 if (bdev_io->u.bdev.iovs[i].iov_len > max_segment_size) { 2907 return true; 2908 } 2909 } 2910 } 2911 2912 if (max_size) { 2913 if (bdev_io->u.bdev.num_blocks > max_size) { 2914 return true; 2915 } 2916 } 2917 2918 return false; 2919 } 2920 2921 static bool 2922 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2923 { 2924 uint32_t num_unmap_segments; 2925 2926 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2927 return false; 2928 } 2929 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2930 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2931 return true; 2932 } 2933 2934 return false; 2935 } 2936 2937 static bool 2938 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2939 { 2940 if (!bdev_io->bdev->max_write_zeroes) { 2941 return false; 2942 } 2943 2944 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2945 return true; 2946 } 2947 2948 return false; 2949 } 2950 2951 static bool 2952 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2953 { 2954 if (bdev_io->bdev->max_copy != 0 && 2955 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2956 return true; 2957 } 2958 2959 return false; 2960 } 2961 2962 static bool 2963 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2964 { 2965 switch (bdev_io->type) { 2966 case SPDK_BDEV_IO_TYPE_READ: 2967 case SPDK_BDEV_IO_TYPE_WRITE: 2968 return bdev_rw_should_split(bdev_io); 2969 case SPDK_BDEV_IO_TYPE_UNMAP: 2970 return bdev_unmap_should_split(bdev_io); 2971 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2972 return bdev_write_zeroes_should_split(bdev_io); 2973 case SPDK_BDEV_IO_TYPE_COPY: 2974 return bdev_copy_should_split(bdev_io); 2975 default: 2976 return false; 2977 } 2978 } 2979 2980 static uint32_t 2981 _to_next_boundary(uint64_t offset, uint32_t boundary) 2982 { 2983 return (boundary - (offset % boundary)); 2984 } 2985 2986 static void 
bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2987 2988 static void _bdev_rw_split(void *_bdev_io); 2989 2990 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2991 2992 static void 2993 _bdev_unmap_split(void *_bdev_io) 2994 { 2995 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2996 } 2997 2998 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2999 3000 static void 3001 _bdev_write_zeroes_split(void *_bdev_io) 3002 { 3003 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 3004 } 3005 3006 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 3007 3008 static void 3009 _bdev_copy_split(void *_bdev_io) 3010 { 3011 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 3012 } 3013 3014 static int 3015 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 3016 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 3017 { 3018 int rc; 3019 uint64_t current_offset, current_remaining, current_src_offset; 3020 spdk_bdev_io_wait_cb io_wait_fn; 3021 3022 current_offset = *offset; 3023 current_remaining = *remaining; 3024 3025 assert(bdev_io->internal.f.split); 3026 3027 bdev_io->internal.split.outstanding++; 3028 3029 io_wait_fn = _bdev_rw_split; 3030 switch (bdev_io->type) { 3031 case SPDK_BDEV_IO_TYPE_READ: 3032 assert(bdev_io->u.bdev.accel_sequence == NULL); 3033 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 3034 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3035 iov, iovcnt, md_buf, current_offset, 3036 num_blocks, 3037 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 3038 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain_ctx : NULL, 3039 NULL, 3040 bdev_io->u.bdev.dif_check_flags, 3041 bdev_io_split_done, bdev_io); 3042 break; 3043 case SPDK_BDEV_IO_TYPE_WRITE: 3044 assert(bdev_io->u.bdev.accel_sequence == NULL); 3045 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 3046 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3047 iov, iovcnt, md_buf, current_offset, 3048 num_blocks, 3049 bdev_io_use_memory_domain(bdev_io) ? bdev_io->internal.memory_domain : NULL, 3050 bdev_io_use_memory_domain(bdev_io) ? 
bdev_io->internal.memory_domain_ctx : NULL, 3051 NULL, 3052 bdev_io->u.bdev.dif_check_flags, 3053 bdev_io->u.bdev.nvme_cdw12.raw, 3054 bdev_io->u.bdev.nvme_cdw13.raw, 3055 bdev_io_split_done, bdev_io); 3056 break; 3057 case SPDK_BDEV_IO_TYPE_UNMAP: 3058 io_wait_fn = _bdev_unmap_split; 3059 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 3060 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3061 current_offset, num_blocks, 3062 bdev_io_split_done, bdev_io); 3063 break; 3064 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3065 io_wait_fn = _bdev_write_zeroes_split; 3066 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 3067 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3068 current_offset, num_blocks, 3069 bdev_io_split_done, bdev_io); 3070 break; 3071 case SPDK_BDEV_IO_TYPE_COPY: 3072 io_wait_fn = _bdev_copy_split; 3073 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 3074 (current_offset - bdev_io->u.bdev.offset_blocks); 3075 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 3076 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3077 current_offset, current_src_offset, num_blocks, 3078 bdev_io_split_done, bdev_io); 3079 break; 3080 default: 3081 assert(false); 3082 rc = -EINVAL; 3083 break; 3084 } 3085 3086 if (rc == 0) { 3087 current_offset += num_blocks; 3088 current_remaining -= num_blocks; 3089 bdev_io->internal.split.current_offset_blocks = current_offset; 3090 bdev_io->internal.split.remaining_num_blocks = current_remaining; 3091 *offset = current_offset; 3092 *remaining = current_remaining; 3093 } else { 3094 bdev_io->internal.split.outstanding--; 3095 if (rc == -ENOMEM) { 3096 if (bdev_io->internal.split.outstanding == 0) { 3097 /* No I/O is outstanding. Hence we should wait here. */ 3098 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 3099 } 3100 } else { 3101 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3102 if (bdev_io->internal.split.outstanding == 0) { 3103 bdev_ch_remove_from_io_submitted(bdev_io); 3104 spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id, 3105 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx, 3106 bdev_io->internal.ch->queue_depth); 3107 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3108 } 3109 } 3110 } 3111 3112 return rc; 3113 } 3114 3115 static void 3116 _bdev_rw_split(void *_bdev_io) 3117 { 3118 struct iovec *parent_iov, *iov; 3119 struct spdk_bdev_io *bdev_io = _bdev_io; 3120 struct spdk_bdev *bdev = bdev_io->bdev; 3121 uint64_t parent_offset, current_offset, remaining; 3122 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 3123 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 3124 uint32_t iovcnt, iov_len, child_iovsize; 3125 uint32_t blocklen = bdev->blocklen; 3126 uint32_t io_boundary; 3127 uint32_t max_segment_size = bdev->max_segment_size; 3128 uint32_t max_child_iovcnt = bdev->max_num_segments; 3129 uint32_t max_size = bdev->max_rw_size; 3130 void *md_buf = NULL; 3131 int rc; 3132 3133 max_size = max_size ? max_size : UINT32_MAX; 3134 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 3135 max_child_iovcnt = max_child_iovcnt ? 
spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 3136 SPDK_BDEV_IO_NUM_CHILD_IOV; 3137 3138 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 3139 io_boundary = bdev->write_unit_size; 3140 } else if (bdev->split_on_optimal_io_boundary) { 3141 io_boundary = bdev->optimal_io_boundary; 3142 } else { 3143 io_boundary = UINT32_MAX; 3144 } 3145 3146 assert(bdev_io->internal.f.split); 3147 3148 remaining = bdev_io->internal.split.remaining_num_blocks; 3149 current_offset = bdev_io->internal.split.current_offset_blocks; 3150 parent_offset = bdev_io->u.bdev.offset_blocks; 3151 parent_iov_offset = (current_offset - parent_offset) * blocklen; 3152 parent_iovcnt = bdev_io->u.bdev.iovcnt; 3153 3154 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 3155 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3156 if (parent_iov_offset < parent_iov->iov_len) { 3157 break; 3158 } 3159 parent_iov_offset -= parent_iov->iov_len; 3160 } 3161 3162 child_iovcnt = 0; 3163 while (remaining > 0 && parent_iovpos < parent_iovcnt && 3164 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 3165 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3166 to_next_boundary = spdk_min(remaining, to_next_boundary); 3167 to_next_boundary = spdk_min(max_size, to_next_boundary); 3168 to_next_boundary_bytes = to_next_boundary * blocklen; 3169 3170 iov = &bdev_io->child_iov[child_iovcnt]; 3171 iovcnt = 0; 3172 3173 if (bdev_io->u.bdev.md_buf) { 3174 md_buf = (char *)bdev_io->u.bdev.md_buf + 3175 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3176 } 3177 3178 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3179 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3180 iovcnt < child_iovsize) { 3181 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3182 iov_len = parent_iov->iov_len - parent_iov_offset; 3183 3184 iov_len = spdk_min(iov_len, max_segment_size); 3185 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 3186 to_next_boundary_bytes -= iov_len; 3187 3188 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 3189 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 3190 3191 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 3192 parent_iov_offset += iov_len; 3193 } else { 3194 parent_iovpos++; 3195 parent_iov_offset = 0; 3196 } 3197 child_iovcnt++; 3198 iovcnt++; 3199 } 3200 3201 if (to_next_boundary_bytes > 0) { 3202 /* We had to stop this child I/O early because we ran out of 3203 * child_iov space or were limited by max_num_segments. 3204 * Ensure the iovs to be aligned with block size and 3205 * then adjust to_next_boundary before starting the 3206 * child I/O. 
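 *
 * Worked example (illustrative numbers): with a 512-byte blocklen, suppose
 * this child was meant to cover 8 blocks (4096 bytes) but the child iovs
 * could only hold 3900 bytes, leaving to_next_boundary_bytes == 196. The
 * code below trims 512 - 196 = 316 bytes off the tail iov so the child
 * covers exactly 3584 bytes (7 blocks), bumps to_next_boundary_bytes to 512,
 * and finally reduces to_next_boundary by 512 / 512 = 1 block. The trimmed
 * bytes are picked up again by the next child I/O.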
3207 			 */
3208 			assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV ||
3209 			       iovcnt == child_iovsize);
3210 			to_last_block_bytes = to_next_boundary_bytes % blocklen;
3211 			if (to_last_block_bytes != 0) {
3212 				uint32_t child_iovpos = child_iovcnt - 1;
3213 				/* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV,
3214 				 * so the loop will naturally end
3215 				 */
3216 
3217 				to_last_block_bytes = blocklen - to_last_block_bytes;
3218 				to_next_boundary_bytes += to_last_block_bytes;
3219 				while (to_last_block_bytes > 0 && iovcnt > 0) {
3220 					iov_len = spdk_min(to_last_block_bytes,
3221 							   bdev_io->child_iov[child_iovpos].iov_len);
3222 					bdev_io->child_iov[child_iovpos].iov_len -= iov_len;
3223 					if (bdev_io->child_iov[child_iovpos].iov_len == 0) {
3224 						child_iovpos--;
3225 						if (--iovcnt == 0) {
3226 							/* If the child IO is less than a block size, just return.
3227 							 * If the first child IO of any split round is less than
3228 							 * a block size, fail the parent I/O with an error.
3229 							 */
3230 							if (bdev_io->internal.split.outstanding == 0) {
3231 								SPDK_ERRLOG("The first child io was less than a block size\n");
3232 								bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3233 								bdev_ch_remove_from_io_submitted(bdev_io);
3234 								spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id,
3235 										  0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx,
3236 										  bdev_io->internal.ch->queue_depth);
3237 								bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
3238 							}
3239 
3240 							return;
3241 						}
3242 					}
3243 
3244 					to_last_block_bytes -= iov_len;
3245 
3246 					if (parent_iov_offset == 0) {
3247 						parent_iovpos--;
3248 						parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len;
3249 					}
3250 					parent_iov_offset -= iov_len;
3251 				}
3252 
3253 				assert(to_last_block_bytes == 0);
3254 			}
3255 			to_next_boundary -= to_next_boundary_bytes / blocklen;
3256 		}
3257 
3258 		rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary,
3259 					  &current_offset, &remaining);
3260 		if (spdk_unlikely(rc)) {
3261 			return;
3262 		}
3263 	}
3264 }
3265 
3266 static void
3267 bdev_unmap_split(struct spdk_bdev_io *bdev_io)
3268 {
3269 	uint64_t offset, unmap_blocks, remaining, max_unmap_blocks;
3270 	uint32_t num_children_reqs = 0;
3271 	int rc;
3272 
3273 	assert(bdev_io->internal.f.split);
3274 
3275 	offset = bdev_io->internal.split.current_offset_blocks;
3276 	remaining = bdev_io->internal.split.remaining_num_blocks;
3277 	max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments;
3278 
3279 	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
3280 		unmap_blocks = spdk_min(remaining, max_unmap_blocks);
3281 
3282 		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks,
3283 					  &offset, &remaining);
3284 		if (spdk_likely(rc == 0)) {
3285 			num_children_reqs++;
3286 		} else {
3287 			return;
3288 		}
3289 	}
3290 }
3291 
3292 static void
3293 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io)
3294 {
3295 	uint64_t offset, write_zeroes_blocks, remaining;
3296 	uint32_t num_children_reqs = 0;
3297 	int rc;
3298 
3299 	assert(bdev_io->internal.f.split);
3300 
3301 	offset = bdev_io->internal.split.current_offset_blocks;
3302 	remaining = bdev_io->internal.split.remaining_num_blocks;
3303 
3304 	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
3305 		write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes);
3306 
3307 		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks,
3308 					  &offset, &remaining);
3309 		if (spdk_likely(rc == 0)) {
3310 			num_children_reqs++;
3311 		} else {
3312 return; 3313 } 3314 } 3315 } 3316 3317 static void 3318 bdev_copy_split(struct spdk_bdev_io *bdev_io) 3319 { 3320 uint64_t offset, copy_blocks, remaining; 3321 uint32_t num_children_reqs = 0; 3322 int rc; 3323 3324 assert(bdev_io->internal.f.split); 3325 3326 offset = bdev_io->internal.split.current_offset_blocks; 3327 remaining = bdev_io->internal.split.remaining_num_blocks; 3328 3329 assert(bdev_io->bdev->max_copy != 0); 3330 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3331 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3332 3333 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3334 &offset, &remaining); 3335 if (spdk_likely(rc == 0)) { 3336 num_children_reqs++; 3337 } else { 3338 return; 3339 } 3340 } 3341 } 3342 3343 static void 3344 parent_bdev_io_complete(void *ctx, int rc) 3345 { 3346 struct spdk_bdev_io *parent_io = ctx; 3347 3348 if (rc) { 3349 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3350 } 3351 3352 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3353 parent_io->internal.caller_ctx); 3354 } 3355 3356 static void 3357 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3358 { 3359 struct spdk_bdev_io *bdev_io = ctx; 3360 3361 /* u.bdev.accel_sequence should have already been cleared at this point */ 3362 assert(bdev_io->u.bdev.accel_sequence == NULL); 3363 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3364 bdev_io->internal.f.has_accel_sequence = false; 3365 3366 if (spdk_unlikely(status != 0)) { 3367 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3368 } 3369 3370 parent_bdev_io_complete(bdev_io, status); 3371 } 3372 3373 static void 3374 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3375 { 3376 struct spdk_bdev_io *parent_io = cb_arg; 3377 3378 spdk_bdev_free_io(bdev_io); 3379 3380 assert(parent_io->internal.f.split); 3381 3382 if (!success) { 3383 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3384 /* If any child I/O failed, stop further splitting process. */ 3385 parent_io->internal.split.current_offset_blocks += parent_io->internal.split.remaining_num_blocks; 3386 parent_io->internal.split.remaining_num_blocks = 0; 3387 } 3388 parent_io->internal.split.outstanding--; 3389 if (parent_io->internal.split.outstanding != 0) { 3390 return; 3391 } 3392 3393 /* 3394 * Parent I/O finishes when all blocks are consumed. 3395 */ 3396 if (parent_io->internal.split.remaining_num_blocks == 0) { 3397 assert(parent_io->internal.cb != bdev_io_split_done); 3398 bdev_ch_remove_from_io_submitted(parent_io); 3399 spdk_trace_record(TRACE_BDEV_IO_DONE, parent_io->internal.ch->trace_id, 3400 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx, 3401 parent_io->internal.ch->queue_depth); 3402 3403 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3404 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3405 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3406 return; 3407 } else if (parent_io->internal.orig_iovcnt != 0 && 3408 !bdev_io_use_accel_sequence(bdev_io)) { 3409 /* bdev IO will be completed in the callback */ 3410 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3411 return; 3412 } 3413 } 3414 3415 parent_bdev_io_complete(parent_io, 0); 3416 return; 3417 } 3418 3419 /* 3420 * Continue with the splitting process. This function will complete the parent I/O if the 3421 * splitting is done. 
3422 */ 3423 switch (parent_io->type) { 3424 case SPDK_BDEV_IO_TYPE_READ: 3425 case SPDK_BDEV_IO_TYPE_WRITE: 3426 _bdev_rw_split(parent_io); 3427 break; 3428 case SPDK_BDEV_IO_TYPE_UNMAP: 3429 bdev_unmap_split(parent_io); 3430 break; 3431 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3432 bdev_write_zeroes_split(parent_io); 3433 break; 3434 case SPDK_BDEV_IO_TYPE_COPY: 3435 bdev_copy_split(parent_io); 3436 break; 3437 default: 3438 assert(false); 3439 break; 3440 } 3441 } 3442 3443 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3444 bool success); 3445 3446 static void 3447 bdev_io_split(struct spdk_bdev_io *bdev_io) 3448 { 3449 assert(bdev_io_should_split(bdev_io)); 3450 assert(bdev_io->internal.f.split); 3451 3452 bdev_io->internal.split.current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3453 bdev_io->internal.split.remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3454 bdev_io->internal.split.outstanding = 0; 3455 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3456 3457 switch (bdev_io->type) { 3458 case SPDK_BDEV_IO_TYPE_READ: 3459 case SPDK_BDEV_IO_TYPE_WRITE: 3460 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3461 _bdev_rw_split(bdev_io); 3462 } else { 3463 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3464 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3465 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3466 } 3467 break; 3468 case SPDK_BDEV_IO_TYPE_UNMAP: 3469 bdev_unmap_split(bdev_io); 3470 break; 3471 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3472 bdev_write_zeroes_split(bdev_io); 3473 break; 3474 case SPDK_BDEV_IO_TYPE_COPY: 3475 bdev_copy_split(bdev_io); 3476 break; 3477 default: 3478 assert(false); 3479 break; 3480 } 3481 } 3482 3483 static void 3484 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3485 { 3486 if (!success) { 3487 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3488 return; 3489 } 3490 3491 _bdev_rw_split(bdev_io); 3492 } 3493 3494 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 3495 * be inlined, at least on some compilers. 
3496 */ 3497 static inline void 3498 _bdev_io_submit(void *ctx) 3499 { 3500 struct spdk_bdev_io *bdev_io = ctx; 3501 struct spdk_bdev *bdev = bdev_io->bdev; 3502 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3503 3504 if (spdk_likely(bdev_ch->flags == 0)) { 3505 bdev_io_do_submit(bdev_ch, bdev_io); 3506 return; 3507 } 3508 3509 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3510 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3511 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3512 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3513 bdev_abort_queued_io(&bdev_ch->qos_queued_io, bdev_io->u.abort.bio_to_abort)) { 3514 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3515 } else { 3516 TAILQ_INSERT_TAIL(&bdev_ch->qos_queued_io, bdev_io, internal.link); 3517 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3518 } 3519 } else { 3520 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3521 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3522 } 3523 } 3524 3525 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3526 3527 bool 3528 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3529 { 3530 if (range1->length == 0 || range2->length == 0) { 3531 return false; 3532 } 3533 3534 if (range1->offset + range1->length <= range2->offset) { 3535 return false; 3536 } 3537 3538 if (range2->offset + range2->length <= range1->offset) { 3539 return false; 3540 } 3541 3542 return true; 3543 } 3544 3545 static bool 3546 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3547 { 3548 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3549 struct lba_range r; 3550 3551 switch (bdev_io->type) { 3552 case SPDK_BDEV_IO_TYPE_NVME_IO: 3553 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3554 /* Don't try to decode the NVMe command - just assume worst-case and that 3555 * it overlaps a locked range. 3556 */ 3557 return true; 3558 case SPDK_BDEV_IO_TYPE_READ: 3559 if (!range->quiesce) { 3560 return false; 3561 } 3562 /* fallthrough */ 3563 case SPDK_BDEV_IO_TYPE_WRITE: 3564 case SPDK_BDEV_IO_TYPE_UNMAP: 3565 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3566 case SPDK_BDEV_IO_TYPE_ZCOPY: 3567 case SPDK_BDEV_IO_TYPE_COPY: 3568 r.offset = bdev_io->u.bdev.offset_blocks; 3569 r.length = bdev_io->u.bdev.num_blocks; 3570 if (!bdev_lba_range_overlapped(range, &r)) { 3571 /* This I/O doesn't overlap the specified LBA range. */ 3572 return false; 3573 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3574 /* This I/O overlaps, but the I/O is on the same channel that locked this 3575 * range, and the caller_ctx is the same as the locked_ctx. This means 3576 * that this I/O is associated with the lock, and is allowed to execute. 
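 *
 * For example, if a caller locks a 100-block range starting at block 100
 * with context A on channel X, a write to blocks 120-129 arriving on another
 * channel (or with a different context) is parked on ch->io_locked until the
 * range is unlocked, while the same write submitted on channel X with
 * caller_ctx A falls through here and is submitted normally.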
3577 */ 3578 return false; 3579 } else { 3580 return true; 3581 } 3582 default: 3583 return false; 3584 } 3585 } 3586 3587 void 3588 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3589 { 3590 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3591 3592 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3593 3594 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3595 struct lba_range *range; 3596 3597 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3598 if (bdev_io_range_is_locked(bdev_io, range)) { 3599 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3600 return; 3601 } 3602 } 3603 } 3604 3605 bdev_ch_add_to_io_submitted(bdev_io); 3606 3607 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3608 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 3609 ch->trace_id, bdev_io->u.bdev.num_blocks, 3610 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3611 bdev_io->u.bdev.offset_blocks, ch->queue_depth); 3612 3613 if (bdev_io->internal.f.split) { 3614 bdev_io_split(bdev_io); 3615 return; 3616 } 3617 3618 _bdev_io_submit(bdev_io); 3619 } 3620 3621 static inline void 3622 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3623 { 3624 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 3625 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3626 * For write operation we need to pull buffers from memory domain before submitting IO. 3627 * Once read operation completes, we need to use memory_domain push functionality to 3628 * update data in original memory domain IO buffer 3629 * This IO request will go through a regular IO flow, so clear memory domains pointers */ 3630 assert(bdev_io->internal.f.has_memory_domain); 3631 bdev_io->u.bdev.memory_domain = NULL; 3632 bdev_io->u.bdev.memory_domain_ctx = NULL; 3633 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3634 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3635 } 3636 3637 static inline void 3638 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3639 { 3640 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3641 bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io); 3642 3643 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3644 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3645 bdev_io_complete_unsubmitted(bdev_io); 3646 return; 3647 } 3648 3649 /* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does 3650 * support them, but we need to execute an accel sequence and the data buffer is from accel 3651 * memory domain (to avoid doing a push/pull from that domain). 
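 *
 * For context, memory domains reach this path through the *_ext submission
 * APIs. A hedged sketch of a caller (field names from spdk_bdev_ext_io_opts
 * in spdk/bdev.h; my_domain, my_domain_ctx, read_done and ctx are
 * hypothetical):
 *
 *	struct spdk_bdev_ext_io_opts opts = {
 *		.size = sizeof(opts),
 *		.memory_domain = my_domain,
 *		.memory_domain_ctx = my_domain_ctx,
 *	};
 *
 *	spdk_bdev_readv_blocks_ext(desc, io_ch, iov, iovcnt, offset_blocks,
 *				   num_blocks, read_done, ctx, &opts);
 *
 * If the underlying bdev reports no memory-domain support, the branch below
 * allocates a bounce buffer and pulls or pushes the data through it instead.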
3652 */ 3653 if (bdev_io_use_memory_domain(bdev_io)) { 3654 if (!desc->memory_domains_supported || 3655 (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3656 _bdev_io_ext_use_bounce_buffer(bdev_io); 3657 return; 3658 } 3659 } 3660 3661 if (needs_exec) { 3662 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3663 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3664 return; 3665 } 3666 /* For reads we'll execute the sequence after the data is read, so, for now, only 3667 * clear out accel_sequence pointer and submit the IO */ 3668 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3669 bdev_io->u.bdev.accel_sequence = NULL; 3670 } 3671 3672 bdev_io_submit(bdev_io); 3673 } 3674 3675 static void 3676 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3677 { 3678 struct spdk_bdev *bdev = bdev_io->bdev; 3679 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3680 struct spdk_io_channel *ch = bdev_ch->channel; 3681 3682 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3683 3684 bdev_io->internal.in_submit_request = true; 3685 bdev_submit_request(bdev, ch, bdev_io); 3686 bdev_io->internal.in_submit_request = false; 3687 } 3688 3689 void 3690 bdev_io_init(struct spdk_bdev_io *bdev_io, 3691 struct spdk_bdev *bdev, void *cb_arg, 3692 spdk_bdev_io_completion_cb cb) 3693 { 3694 bdev_io->bdev = bdev; 3695 bdev_io->internal.f.raw = 0; 3696 bdev_io->internal.caller_ctx = cb_arg; 3697 bdev_io->internal.cb = cb; 3698 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3699 bdev_io->internal.in_submit_request = false; 3700 bdev_io->internal.orig_iovs = NULL; 3701 bdev_io->internal.orig_iovcnt = 0; 3702 bdev_io->internal.orig_md_iov.iov_base = NULL; 3703 bdev_io->internal.error.nvme.cdw0 = 0; 3704 bdev_io->num_retries = 0; 3705 bdev_io->internal.get_buf_cb = NULL; 3706 bdev_io->internal.get_aux_buf_cb = NULL; 3707 bdev_io->internal.data_transfer_cpl = NULL; 3708 bdev_io->internal.f.split = bdev_io_should_split(bdev_io); 3709 } 3710 3711 static bool 3712 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3713 { 3714 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3715 } 3716 3717 bool 3718 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3719 { 3720 bool supported; 3721 3722 supported = bdev_io_type_supported(bdev, io_type); 3723 3724 if (!supported) { 3725 switch (io_type) { 3726 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3727 /* The bdev layer will emulate write zeroes as long as write is supported. 
*/ 3728 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3729 break; 3730 default: 3731 break; 3732 } 3733 } 3734 3735 return supported; 3736 } 3737 3738 static const char *g_io_type_strings[] = { 3739 [SPDK_BDEV_IO_TYPE_READ] = "read", 3740 [SPDK_BDEV_IO_TYPE_WRITE] = "write", 3741 [SPDK_BDEV_IO_TYPE_UNMAP] = "unmap", 3742 [SPDK_BDEV_IO_TYPE_FLUSH] = "flush", 3743 [SPDK_BDEV_IO_TYPE_RESET] = "reset", 3744 [SPDK_BDEV_IO_TYPE_NVME_ADMIN] = "nvme_admin", 3745 [SPDK_BDEV_IO_TYPE_NVME_IO] = "nvme_io", 3746 [SPDK_BDEV_IO_TYPE_NVME_IO_MD] = "nvme_io_md", 3747 [SPDK_BDEV_IO_TYPE_WRITE_ZEROES] = "write_zeroes", 3748 [SPDK_BDEV_IO_TYPE_ZCOPY] = "zcopy", 3749 [SPDK_BDEV_IO_TYPE_GET_ZONE_INFO] = "get_zone_info", 3750 [SPDK_BDEV_IO_TYPE_ZONE_MANAGEMENT] = "zone_management", 3751 [SPDK_BDEV_IO_TYPE_ZONE_APPEND] = "zone_append", 3752 [SPDK_BDEV_IO_TYPE_COMPARE] = "compare", 3753 [SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE] = "compare_and_write", 3754 [SPDK_BDEV_IO_TYPE_ABORT] = "abort", 3755 [SPDK_BDEV_IO_TYPE_SEEK_HOLE] = "seek_hole", 3756 [SPDK_BDEV_IO_TYPE_SEEK_DATA] = "seek_data", 3757 [SPDK_BDEV_IO_TYPE_COPY] = "copy", 3758 [SPDK_BDEV_IO_TYPE_NVME_IOV_MD] = "nvme_iov_md", 3759 }; 3760 3761 const char * 3762 spdk_bdev_get_io_type_name(enum spdk_bdev_io_type io_type) 3763 { 3764 if (io_type <= SPDK_BDEV_IO_TYPE_INVALID || io_type >= SPDK_BDEV_NUM_IO_TYPES) { 3765 return NULL; 3766 } 3767 3768 return g_io_type_strings[io_type]; 3769 } 3770 3771 int 3772 spdk_bdev_get_io_type(const char *io_type_string) 3773 { 3774 int i; 3775 3776 for (i = SPDK_BDEV_IO_TYPE_READ; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 3777 if (!strcmp(io_type_string, g_io_type_strings[i])) { 3778 return i; 3779 } 3780 } 3781 3782 return -1; 3783 } 3784 3785 uint64_t 3786 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3787 { 3788 return bdev_io->internal.submit_tsc; 3789 } 3790 3791 int 3792 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3793 { 3794 if (bdev->fn_table->dump_info_json) { 3795 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3796 } 3797 3798 return 0; 3799 } 3800 3801 static void 3802 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3803 { 3804 uint32_t max_per_timeslice = 0; 3805 int i; 3806 3807 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3808 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3809 qos->rate_limits[i].max_per_timeslice = 0; 3810 continue; 3811 } 3812 3813 max_per_timeslice = qos->rate_limits[i].limit * 3814 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3815 3816 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3817 qos->rate_limits[i].min_per_timeslice); 3818 3819 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 3820 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELEASE); 3821 } 3822 3823 bdev_qos_set_ops(qos); 3824 } 3825 3826 static void 3827 bdev_channel_submit_qos_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3828 struct spdk_io_channel *io_ch, void *ctx) 3829 { 3830 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3831 int status; 3832 3833 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3834 3835 /* if all IOs were sent then continue the iteration, otherwise - stop it */ 3836 /* TODO: channels round robing */ 3837 status = TAILQ_EMPTY(&bdev_ch->qos_queued_io) ? 
0 : 1; 3838 3839 spdk_bdev_for_each_channel_continue(i, status); 3840 } 3841 3842 3843 static void 3844 bdev_channel_submit_qos_io_done(struct spdk_bdev *bdev, void *ctx, int status) 3845 { 3846 3847 } 3848 3849 static int 3850 bdev_channel_poll_qos(void *arg) 3851 { 3852 struct spdk_bdev *bdev = arg; 3853 struct spdk_bdev_qos *qos = bdev->internal.qos; 3854 uint64_t now = spdk_get_ticks(); 3855 int i; 3856 int64_t remaining_last_timeslice; 3857 3858 if (spdk_unlikely(qos->thread == NULL)) { 3859 /* Old QoS was unbound to remove and new QoS is not enabled yet. */ 3860 return SPDK_POLLER_IDLE; 3861 } 3862 3863 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3864 /* We received our callback earlier than expected - return 3865 * immediately and wait to do accounting until at least one 3866 * timeslice has actually expired. This should never happen 3867 * with a well-behaved timer implementation. 3868 */ 3869 return SPDK_POLLER_IDLE; 3870 } 3871 3872 /* Reset for next round of rate limiting */ 3873 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3874 /* We may have allowed the IOs or bytes to slightly overrun in the last 3875 * timeslice. remaining_this_timeslice is signed, so if it's negative 3876 * here, we'll account for the overrun so that the next timeslice will 3877 * be appropriately reduced. 3878 */ 3879 remaining_last_timeslice = __atomic_exchange_n(&qos->rate_limits[i].remaining_this_timeslice, 3880 0, __ATOMIC_RELAXED); 3881 if (remaining_last_timeslice < 0) { 3882 /* There could be a race condition here as both bdev_qos_rw_queue_io() and bdev_channel_poll_qos() 3883 * potentially use 2 atomic ops each, so they can intertwine. 3884 * This race can potentially cause the limits to be a little fuzzy but won't cause any real damage. 
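 * For example, between the exchange above and the store below a submission may
 * atomically subtract its cost from remaining_this_timeslice; the store then
 * overwrites that decrement with the carried-over deficit, so the cost of that
 * one I/O is dropped from the next timeslice's accounting.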
3885 */ 3886 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 3887 remaining_last_timeslice, __ATOMIC_RELAXED); 3888 } 3889 } 3890 3891 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3892 qos->last_timeslice += qos->timeslice_size; 3893 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3894 __atomic_add_fetch(&qos->rate_limits[i].remaining_this_timeslice, 3895 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELAXED); 3896 } 3897 } 3898 3899 spdk_bdev_for_each_channel(bdev, bdev_channel_submit_qos_io, qos, 3900 bdev_channel_submit_qos_io_done); 3901 3902 return SPDK_POLLER_BUSY; 3903 } 3904 3905 static void 3906 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3907 { 3908 struct spdk_bdev_shared_resource *shared_resource; 3909 struct lba_range *range; 3910 3911 bdev_free_io_stat(ch->stat); 3912 #ifdef SPDK_CONFIG_VTUNE 3913 bdev_free_io_stat(ch->prev_stat); 3914 #endif 3915 3916 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3917 range = TAILQ_FIRST(&ch->locked_ranges); 3918 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3919 free(range); 3920 } 3921 3922 spdk_put_io_channel(ch->channel); 3923 spdk_put_io_channel(ch->accel_channel); 3924 3925 shared_resource = ch->shared_resource; 3926 3927 assert(TAILQ_EMPTY(&ch->io_locked)); 3928 assert(TAILQ_EMPTY(&ch->io_submitted)); 3929 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 3930 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 3931 assert(ch->io_outstanding == 0); 3932 assert(shared_resource->ref > 0); 3933 shared_resource->ref--; 3934 if (shared_resource->ref == 0) { 3935 assert(shared_resource->io_outstanding == 0); 3936 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3937 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3938 spdk_poller_unregister(&shared_resource->nomem_poller); 3939 free(shared_resource); 3940 } 3941 } 3942 3943 static void 3944 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3945 { 3946 struct spdk_bdev_qos *qos = bdev->internal.qos; 3947 int i; 3948 3949 assert(spdk_spin_held(&bdev->internal.spinlock)); 3950 3951 /* Rate limiting on this bdev enabled */ 3952 if (qos) { 3953 if (qos->ch == NULL) { 3954 struct spdk_io_channel *io_ch; 3955 3956 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3957 bdev->name, spdk_get_thread()); 3958 3959 /* No qos channel has been selected, so set one up */ 3960 3961 /* Take another reference to ch */ 3962 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3963 assert(io_ch != NULL); 3964 qos->ch = ch; 3965 3966 qos->thread = spdk_io_channel_get_thread(io_ch); 3967 3968 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3969 if (bdev_qos_is_iops_rate_limit(i) == true) { 3970 qos->rate_limits[i].min_per_timeslice = 3971 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3972 } else { 3973 qos->rate_limits[i].min_per_timeslice = 3974 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3975 } 3976 3977 if (qos->rate_limits[i].limit == 0) { 3978 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3979 } 3980 } 3981 bdev_qos_update_max_quota_per_timeslice(qos); 3982 qos->timeslice_size = 3983 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3984 qos->last_timeslice = spdk_get_ticks(); 3985 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3986 bdev, 3987 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3988 } 3989 3990 ch->flags |= BDEV_CH_QOS_ENABLED; 3991 } 3992 } 3993 3994 struct poll_timeout_ctx { 3995 struct spdk_bdev_desc 
*desc; 3996 uint64_t timeout_in_sec; 3997 spdk_bdev_io_timeout_cb cb_fn; 3998 void *cb_arg; 3999 }; 4000 4001 static void 4002 bdev_desc_free(struct spdk_bdev_desc *desc) 4003 { 4004 spdk_spin_destroy(&desc->spinlock); 4005 free(desc->media_events_buffer); 4006 free(desc); 4007 } 4008 4009 static void 4010 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 4011 { 4012 struct poll_timeout_ctx *ctx = _ctx; 4013 struct spdk_bdev_desc *desc = ctx->desc; 4014 4015 free(ctx); 4016 4017 spdk_spin_lock(&desc->spinlock); 4018 desc->refs--; 4019 if (desc->closed == true && desc->refs == 0) { 4020 spdk_spin_unlock(&desc->spinlock); 4021 bdev_desc_free(desc); 4022 return; 4023 } 4024 spdk_spin_unlock(&desc->spinlock); 4025 } 4026 4027 static void 4028 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4029 struct spdk_io_channel *io_ch, void *_ctx) 4030 { 4031 struct poll_timeout_ctx *ctx = _ctx; 4032 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4033 struct spdk_bdev_desc *desc = ctx->desc; 4034 struct spdk_bdev_io *bdev_io; 4035 uint64_t now; 4036 4037 spdk_spin_lock(&desc->spinlock); 4038 if (desc->closed == true) { 4039 spdk_spin_unlock(&desc->spinlock); 4040 spdk_bdev_for_each_channel_continue(i, -1); 4041 return; 4042 } 4043 spdk_spin_unlock(&desc->spinlock); 4044 4045 now = spdk_get_ticks(); 4046 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 4047 /* Exclude any I/O that are generated via splitting. */ 4048 if (bdev_io->internal.cb == bdev_io_split_done) { 4049 continue; 4050 } 4051 4052 /* Once we find an I/O that has not timed out, we can immediately 4053 * exit the loop. 4054 */ 4055 if (now < (bdev_io->internal.submit_tsc + 4056 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 4057 goto end; 4058 } 4059 4060 if (bdev_io->internal.desc == desc) { 4061 ctx->cb_fn(ctx->cb_arg, bdev_io); 4062 } 4063 } 4064 4065 end: 4066 spdk_bdev_for_each_channel_continue(i, 0); 4067 } 4068 4069 static int 4070 bdev_poll_timeout_io(void *arg) 4071 { 4072 struct spdk_bdev_desc *desc = arg; 4073 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4074 struct poll_timeout_ctx *ctx; 4075 4076 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 4077 if (!ctx) { 4078 SPDK_ERRLOG("failed to allocate memory\n"); 4079 return SPDK_POLLER_BUSY; 4080 } 4081 ctx->desc = desc; 4082 ctx->cb_arg = desc->cb_arg; 4083 ctx->cb_fn = desc->cb_fn; 4084 ctx->timeout_in_sec = desc->timeout_in_sec; 4085 4086 /* Take a ref on the descriptor in case it gets closed while we are checking 4087 * all of the channels. 
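 * The reference is dropped in bdev_channel_poll_timeout_io_done(); if the
 * descriptor was closed in the meantime and this was the last reference, the
 * descriptor is freed there via bdev_desc_free().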
4088 */ 4089 spdk_spin_lock(&desc->spinlock); 4090 desc->refs++; 4091 spdk_spin_unlock(&desc->spinlock); 4092 4093 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 4094 bdev_channel_poll_timeout_io_done); 4095 4096 return SPDK_POLLER_BUSY; 4097 } 4098 4099 int 4100 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 4101 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 4102 { 4103 assert(desc->thread == spdk_get_thread()); 4104 4105 spdk_poller_unregister(&desc->io_timeout_poller); 4106 4107 if (timeout_in_sec) { 4108 assert(cb_fn != NULL); 4109 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 4110 desc, 4111 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 4112 1000); 4113 if (desc->io_timeout_poller == NULL) { 4114 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 4115 return -1; 4116 } 4117 } 4118 4119 desc->cb_fn = cb_fn; 4120 desc->cb_arg = cb_arg; 4121 desc->timeout_in_sec = timeout_in_sec; 4122 4123 return 0; 4124 } 4125 4126 static int 4127 bdev_channel_create(void *io_device, void *ctx_buf) 4128 { 4129 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 4130 struct spdk_bdev_channel *ch = ctx_buf; 4131 struct spdk_io_channel *mgmt_io_ch; 4132 struct spdk_bdev_mgmt_channel *mgmt_ch; 4133 struct spdk_bdev_shared_resource *shared_resource; 4134 struct lba_range *range; 4135 4136 ch->bdev = bdev; 4137 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 4138 if (!ch->channel) { 4139 return -1; 4140 } 4141 4142 ch->accel_channel = spdk_accel_get_io_channel(); 4143 if (!ch->accel_channel) { 4144 spdk_put_io_channel(ch->channel); 4145 return -1; 4146 } 4147 4148 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, bdev->internal.trace_id, 0, 0, 4149 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4150 4151 assert(ch->histogram == NULL); 4152 if (bdev->internal.histogram_enabled) { 4153 ch->histogram = spdk_histogram_data_alloc(); 4154 if (ch->histogram == NULL) { 4155 SPDK_ERRLOG("Could not allocate histogram\n"); 4156 } 4157 } 4158 4159 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 4160 if (!mgmt_io_ch) { 4161 spdk_put_io_channel(ch->channel); 4162 spdk_put_io_channel(ch->accel_channel); 4163 return -1; 4164 } 4165 4166 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 4167 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 4168 if (shared_resource->shared_ch == ch->channel) { 4169 spdk_put_io_channel(mgmt_io_ch); 4170 shared_resource->ref++; 4171 break; 4172 } 4173 } 4174 4175 if (shared_resource == NULL) { 4176 shared_resource = calloc(1, sizeof(*shared_resource)); 4177 if (shared_resource == NULL) { 4178 spdk_put_io_channel(ch->channel); 4179 spdk_put_io_channel(ch->accel_channel); 4180 spdk_put_io_channel(mgmt_io_ch); 4181 return -1; 4182 } 4183 4184 shared_resource->mgmt_ch = mgmt_ch; 4185 shared_resource->io_outstanding = 0; 4186 TAILQ_INIT(&shared_resource->nomem_io); 4187 shared_resource->nomem_threshold = 0; 4188 shared_resource->shared_ch = ch->channel; 4189 shared_resource->ref = 1; 4190 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 4191 } 4192 4193 ch->io_outstanding = 0; 4194 TAILQ_INIT(&ch->queued_resets); 4195 TAILQ_INIT(&ch->locked_ranges); 4196 TAILQ_INIT(&ch->qos_queued_io); 4197 ch->flags = 0; 4198 ch->trace_id = bdev->internal.trace_id; 4199 ch->shared_resource = shared_resource; 4200 4201 TAILQ_INIT(&ch->io_submitted); 4202 TAILQ_INIT(&ch->io_locked); 4203 TAILQ_INIT(&ch->io_accel_exec); 4204 TAILQ_INIT(&ch->io_memory_domain); 4205 4206 
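/* Allocate per-channel I/O statistics. Per-error-code counters are not kept at
 * channel granularity (io_error_stat == false), so only the byte/op and latency
 * counters are tracked here.
 */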
ch->stat = bdev_alloc_io_stat(false); 4207 if (ch->stat == NULL) { 4208 bdev_channel_destroy_resource(ch); 4209 return -1; 4210 } 4211 4212 ch->stat->ticks_rate = spdk_get_ticks_hz(); 4213 4214 #ifdef SPDK_CONFIG_VTUNE 4215 { 4216 char *name; 4217 __itt_init_ittlib(NULL, 0); 4218 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 4219 if (!name) { 4220 bdev_channel_destroy_resource(ch); 4221 return -1; 4222 } 4223 ch->handle = __itt_string_handle_create(name); 4224 free(name); 4225 ch->start_tsc = spdk_get_ticks(); 4226 ch->interval_tsc = spdk_get_ticks_hz() / 100; 4227 ch->prev_stat = bdev_alloc_io_stat(false); 4228 if (ch->prev_stat == NULL) { 4229 bdev_channel_destroy_resource(ch); 4230 return -1; 4231 } 4232 } 4233 #endif 4234 4235 spdk_spin_lock(&bdev->internal.spinlock); 4236 bdev_enable_qos(bdev, ch); 4237 4238 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 4239 struct lba_range *new_range; 4240 4241 new_range = calloc(1, sizeof(*new_range)); 4242 if (new_range == NULL) { 4243 spdk_spin_unlock(&bdev->internal.spinlock); 4244 bdev_channel_destroy_resource(ch); 4245 return -1; 4246 } 4247 new_range->length = range->length; 4248 new_range->offset = range->offset; 4249 new_range->locked_ctx = range->locked_ctx; 4250 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 4251 } 4252 4253 spdk_spin_unlock(&bdev->internal.spinlock); 4254 4255 return 0; 4256 } 4257 4258 static int 4259 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 4260 void *cb_ctx) 4261 { 4262 struct spdk_bdev_channel *bdev_ch = cb_ctx; 4263 struct spdk_bdev_io *bdev_io; 4264 uint64_t buf_len; 4265 4266 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4267 if (bdev_io->internal.ch == bdev_ch) { 4268 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len); 4269 spdk_iobuf_entry_abort(ch, entry, buf_len); 4270 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4271 } 4272 4273 return 0; 4274 } 4275 4276 /* 4277 * Abort I/O that are waiting on a data buffer. 4278 */ 4279 static void 4280 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4281 { 4282 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4283 bdev_abort_all_buf_io_cb, ch); 4284 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4285 bdev_abort_all_buf_io_cb, ch); 4286 } 4287 4288 /* 4289 * Abort I/O that are queued waiting for submission. These types of I/O are 4290 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4291 */ 4292 static void 4293 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4294 { 4295 struct spdk_bdev_io *bdev_io, *tmp; 4296 4297 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4298 if (bdev_io->internal.ch == ch) { 4299 TAILQ_REMOVE(queue, bdev_io, internal.link); 4300 /* 4301 * spdk_bdev_io_complete() assumes that the completed I/O had 4302 * been submitted to the bdev module. Since in this case it 4303 * hadn't, bump io_outstanding to account for the decrement 4304 * that spdk_bdev_io_complete() will do. 
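 * Reset I/O are excluded below because they do not go through the normal
 * accounting path (see bdev_io_submit_reset(), which bypasses the outstanding
 * counter), so there is nothing to compensate for.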
4305 */ 4306 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4307 bdev_io_increment_outstanding(ch, ch->shared_resource); 4308 } 4309 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4310 } 4311 } 4312 } 4313 4314 static bool 4315 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4316 { 4317 struct spdk_bdev_io *bdev_io; 4318 4319 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4320 if (bdev_io == bio_to_abort) { 4321 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4322 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4323 return true; 4324 } 4325 } 4326 4327 return false; 4328 } 4329 4330 static int 4331 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4332 { 4333 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4334 uint64_t buf_len; 4335 4336 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4337 if (bdev_io == bio_to_abort) { 4338 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf.len); 4339 spdk_iobuf_entry_abort(ch, entry, buf_len); 4340 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4341 return 1; 4342 } 4343 4344 return 0; 4345 } 4346 4347 static bool 4348 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4349 { 4350 int rc; 4351 4352 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4353 bdev_abort_buf_io_cb, bio_to_abort); 4354 if (rc == 1) { 4355 return true; 4356 } 4357 4358 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4359 bdev_abort_buf_io_cb, bio_to_abort); 4360 return rc == 1; 4361 } 4362 4363 static void 4364 bdev_qos_channel_destroy(void *cb_arg) 4365 { 4366 struct spdk_bdev_qos *qos = cb_arg; 4367 4368 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4369 spdk_poller_unregister(&qos->poller); 4370 4371 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4372 4373 free(qos); 4374 } 4375 4376 static int 4377 bdev_qos_destroy(struct spdk_bdev *bdev) 4378 { 4379 int i; 4380 4381 /* 4382 * Cleanly shutting down the QoS poller is tricky, because 4383 * during the asynchronous operation the user could open 4384 * a new descriptor and create a new channel, spawning 4385 * a new QoS poller. 4386 * 4387 * The strategy is to create a new QoS structure here and swap it 4388 * in. The shutdown path then continues to refer to the old one 4389 * until it completes and then releases it. 4390 */ 4391 struct spdk_bdev_qos *new_qos, *old_qos; 4392 4393 old_qos = bdev->internal.qos; 4394 4395 new_qos = calloc(1, sizeof(*new_qos)); 4396 if (!new_qos) { 4397 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4398 return -ENOMEM; 4399 } 4400 4401 /* Copy the old QoS data into the newly allocated structure */ 4402 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4403 4404 /* Zero out the key parts of the QoS structure */ 4405 new_qos->ch = NULL; 4406 new_qos->thread = NULL; 4407 new_qos->poller = NULL; 4408 /* 4409 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4410 * It will be used later for the new QoS structure. 
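 * Only the per-timeslice bookkeeping below (remaining/min/max per timeslice) is
 * reset; the configured rate limits themselves carry over unchanged into the
 * replacement QoS structure.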
4411 */ 4412 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4413 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4414 new_qos->rate_limits[i].min_per_timeslice = 0; 4415 new_qos->rate_limits[i].max_per_timeslice = 0; 4416 } 4417 4418 bdev->internal.qos = new_qos; 4419 4420 if (old_qos->thread == NULL) { 4421 free(old_qos); 4422 } else { 4423 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4424 } 4425 4426 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4427 * been destroyed yet. The destruction path will end up waiting for the final 4428 * channel to be put before it releases resources. */ 4429 4430 return 0; 4431 } 4432 4433 void 4434 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4435 { 4436 total->bytes_read += add->bytes_read; 4437 total->num_read_ops += add->num_read_ops; 4438 total->bytes_written += add->bytes_written; 4439 total->num_write_ops += add->num_write_ops; 4440 total->bytes_unmapped += add->bytes_unmapped; 4441 total->num_unmap_ops += add->num_unmap_ops; 4442 total->bytes_copied += add->bytes_copied; 4443 total->num_copy_ops += add->num_copy_ops; 4444 total->read_latency_ticks += add->read_latency_ticks; 4445 total->write_latency_ticks += add->write_latency_ticks; 4446 total->unmap_latency_ticks += add->unmap_latency_ticks; 4447 total->copy_latency_ticks += add->copy_latency_ticks; 4448 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4449 total->max_read_latency_ticks = add->max_read_latency_ticks; 4450 } 4451 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4452 total->min_read_latency_ticks = add->min_read_latency_ticks; 4453 } 4454 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4455 total->max_write_latency_ticks = add->max_write_latency_ticks; 4456 } 4457 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4458 total->min_write_latency_ticks = add->min_write_latency_ticks; 4459 } 4460 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4461 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4462 } 4463 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4464 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4465 } 4466 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4467 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4468 } 4469 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4470 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4471 } 4472 } 4473 4474 static void 4475 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4476 { 4477 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4478 4479 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4480 memcpy(to_stat->io_error, from_stat->io_error, 4481 sizeof(struct spdk_bdev_io_error_stat)); 4482 } 4483 } 4484 4485 void 4486 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4487 { 4488 stat->max_read_latency_ticks = 0; 4489 stat->min_read_latency_ticks = UINT64_MAX; 4490 stat->max_write_latency_ticks = 0; 4491 stat->min_write_latency_ticks = UINT64_MAX; 4492 stat->max_unmap_latency_ticks = 0; 4493 stat->min_unmap_latency_ticks = UINT64_MAX; 4494 stat->max_copy_latency_ticks = 0; 4495 stat->min_copy_latency_ticks = UINT64_MAX; 4496 4497 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4498 return; 4499 } 
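/* SPDK_BDEV_RESET_STAT_ALL: additionally clear the cumulative byte/op counters,
 * the total latencies and, if allocated, the per-error-code counters.
 */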
4500 4501 stat->bytes_read = 0; 4502 stat->num_read_ops = 0; 4503 stat->bytes_written = 0; 4504 stat->num_write_ops = 0; 4505 stat->bytes_unmapped = 0; 4506 stat->num_unmap_ops = 0; 4507 stat->bytes_copied = 0; 4508 stat->num_copy_ops = 0; 4509 stat->read_latency_ticks = 0; 4510 stat->write_latency_ticks = 0; 4511 stat->unmap_latency_ticks = 0; 4512 stat->copy_latency_ticks = 0; 4513 4514 if (stat->io_error != NULL) { 4515 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 4516 } 4517 } 4518 4519 struct spdk_bdev_io_stat * 4520 bdev_alloc_io_stat(bool io_error_stat) 4521 { 4522 struct spdk_bdev_io_stat *stat; 4523 4524 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4525 if (stat == NULL) { 4526 return NULL; 4527 } 4528 4529 if (io_error_stat) { 4530 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4531 if (stat->io_error == NULL) { 4532 free(stat); 4533 return NULL; 4534 } 4535 } else { 4536 stat->io_error = NULL; 4537 } 4538 4539 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4540 4541 return stat; 4542 } 4543 4544 void 4545 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4546 { 4547 if (stat != NULL) { 4548 free(stat->io_error); 4549 free(stat); 4550 } 4551 } 4552 4553 void 4554 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4555 { 4556 int i; 4557 4558 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4559 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4560 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4561 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4562 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4563 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4564 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4565 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4566 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4567 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4568 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4569 stat->min_read_latency_ticks != UINT64_MAX ? 4570 stat->min_read_latency_ticks : 0); 4571 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4572 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4573 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4574 stat->min_write_latency_ticks != UINT64_MAX ? 4575 stat->min_write_latency_ticks : 0); 4576 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4577 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4578 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4579 stat->min_unmap_latency_ticks != UINT64_MAX ? 4580 stat->min_unmap_latency_ticks : 0); 4581 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4582 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4583 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4584 stat->min_copy_latency_ticks != UINT64_MAX ? 
4585 stat->min_copy_latency_ticks : 0); 4586 4587 if (stat->io_error != NULL) { 4588 spdk_json_write_named_object_begin(w, "io_error"); 4589 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4590 if (stat->io_error->error_status[i] != 0) { 4591 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4592 stat->io_error->error_status[i]); 4593 } 4594 } 4595 spdk_json_write_object_end(w); 4596 } 4597 } 4598 4599 static void 4600 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4601 { 4602 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4603 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4604 4605 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4606 bdev_abort_all_buf_io(mgmt_ch, ch); 4607 } 4608 4609 static void 4610 bdev_channel_destroy(void *io_device, void *ctx_buf) 4611 { 4612 struct spdk_bdev_channel *ch = ctx_buf; 4613 4614 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4615 spdk_get_thread()); 4616 4617 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, ch->bdev->internal.trace_id, 0, 0, 4618 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4619 4620 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4621 spdk_spin_lock(&ch->bdev->internal.spinlock); 4622 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4623 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4624 4625 bdev_abort_all_queued_io(&ch->queued_resets, ch); 4626 4627 bdev_channel_abort_queued_ios(ch); 4628 4629 if (ch->histogram) { 4630 spdk_histogram_data_free(ch->histogram); 4631 } 4632 4633 bdev_channel_destroy_resource(ch); 4634 } 4635 4636 /* 4637 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4638 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
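 * On collision the duplicated name string is freed and -EEXIST is returned to
 * the caller, which is expected to roll back its own allocation.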
4639 */ 4640 static int 4641 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4642 { 4643 struct spdk_bdev_name *tmp; 4644 4645 bdev_name->name = strdup(name); 4646 if (bdev_name->name == NULL) { 4647 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4648 return -ENOMEM; 4649 } 4650 4651 bdev_name->bdev = bdev; 4652 4653 spdk_spin_lock(&g_bdev_mgr.spinlock); 4654 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4655 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4656 4657 if (tmp != NULL) { 4658 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4659 free(bdev_name->name); 4660 return -EEXIST; 4661 } 4662 4663 return 0; 4664 } 4665 4666 static void 4667 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4668 { 4669 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4670 free(bdev_name->name); 4671 } 4672 4673 static void 4674 bdev_name_del(struct spdk_bdev_name *bdev_name) 4675 { 4676 spdk_spin_lock(&g_bdev_mgr.spinlock); 4677 bdev_name_del_unsafe(bdev_name); 4678 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4679 } 4680 4681 int 4682 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4683 { 4684 struct spdk_bdev_alias *tmp; 4685 int ret; 4686 4687 if (alias == NULL) { 4688 SPDK_ERRLOG("Empty alias passed\n"); 4689 return -EINVAL; 4690 } 4691 4692 tmp = calloc(1, sizeof(*tmp)); 4693 if (tmp == NULL) { 4694 SPDK_ERRLOG("Unable to allocate alias\n"); 4695 return -ENOMEM; 4696 } 4697 4698 ret = bdev_name_add(&tmp->alias, bdev, alias); 4699 if (ret != 0) { 4700 free(tmp); 4701 return ret; 4702 } 4703 4704 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4705 4706 return 0; 4707 } 4708 4709 static int 4710 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4711 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4712 { 4713 struct spdk_bdev_alias *tmp; 4714 4715 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4716 if (strcmp(alias, tmp->alias.name) == 0) { 4717 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4718 alias_del_fn(&tmp->alias); 4719 free(tmp); 4720 return 0; 4721 } 4722 } 4723 4724 return -ENOENT; 4725 } 4726 4727 int 4728 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4729 { 4730 int rc; 4731 4732 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4733 if (rc == -ENOENT) { 4734 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4735 } 4736 4737 return rc; 4738 } 4739 4740 void 4741 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4742 { 4743 struct spdk_bdev_alias *p, *tmp; 4744 4745 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4746 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4747 bdev_name_del(&p->alias); 4748 free(p); 4749 } 4750 } 4751 4752 struct spdk_io_channel * 4753 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4754 { 4755 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4756 } 4757 4758 void * 4759 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4760 { 4761 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4762 void *ctx = NULL; 4763 4764 if (bdev->fn_table->get_module_ctx) { 4765 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4766 } 4767 4768 return ctx; 4769 } 4770 4771 const char * 4772 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4773 { 4774 return bdev->module->name; 4775 } 4776 4777 const char * 4778 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4779 { 4780 return bdev->name; 4781 } 4782 4783 const char * 4784 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4785 { 4786 return bdev->product_name; 4787 } 4788 4789 
const struct spdk_bdev_aliases_list * 4790 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4791 { 4792 return &bdev->aliases; 4793 } 4794 4795 uint32_t 4796 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4797 { 4798 return bdev->blocklen; 4799 } 4800 4801 uint32_t 4802 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4803 { 4804 return bdev->write_unit_size; 4805 } 4806 4807 uint64_t 4808 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4809 { 4810 return bdev->blockcnt; 4811 } 4812 4813 const char * 4814 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4815 { 4816 return qos_rpc_type[type]; 4817 } 4818 4819 void 4820 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4821 { 4822 int i; 4823 4824 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4825 4826 spdk_spin_lock(&bdev->internal.spinlock); 4827 if (bdev->internal.qos) { 4828 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4829 if (bdev->internal.qos->rate_limits[i].limit != 4830 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4831 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4832 if (bdev_qos_is_iops_rate_limit(i) == false) { 4833 /* Change from Byte to Megabyte which is user visible. */ 4834 limits[i] = limits[i] / 1024 / 1024; 4835 } 4836 } 4837 } 4838 } 4839 spdk_spin_unlock(&bdev->internal.spinlock); 4840 } 4841 4842 size_t 4843 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4844 { 4845 return 1 << bdev->required_alignment; 4846 } 4847 4848 uint32_t 4849 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4850 { 4851 return bdev->optimal_io_boundary; 4852 } 4853 4854 bool 4855 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4856 { 4857 return bdev->write_cache; 4858 } 4859 4860 const struct spdk_uuid * 4861 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4862 { 4863 return &bdev->uuid; 4864 } 4865 4866 uint16_t 4867 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4868 { 4869 return bdev->acwu; 4870 } 4871 4872 uint32_t 4873 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4874 { 4875 return bdev->md_len; 4876 } 4877 4878 bool 4879 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4880 { 4881 return (bdev->md_len != 0) && bdev->md_interleave; 4882 } 4883 4884 bool 4885 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4886 { 4887 return (bdev->md_len != 0) && !bdev->md_interleave; 4888 } 4889 4890 bool 4891 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4892 { 4893 return bdev->zoned; 4894 } 4895 4896 uint32_t 4897 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4898 { 4899 if (spdk_bdev_is_md_interleaved(bdev)) { 4900 return bdev->blocklen - bdev->md_len; 4901 } else { 4902 return bdev->blocklen; 4903 } 4904 } 4905 4906 uint32_t 4907 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4908 { 4909 return bdev->phys_blocklen; 4910 } 4911 4912 static uint32_t 4913 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4914 { 4915 if (!spdk_bdev_is_md_interleaved(bdev)) { 4916 return bdev->blocklen + bdev->md_len; 4917 } else { 4918 return bdev->blocklen; 4919 } 4920 } 4921 4922 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4923 typedef enum spdk_dif_type spdk_dif_type_t; 4924 typedef enum spdk_dif_pi_format spdk_dif_pi_format_t; 4925 4926 spdk_dif_type_t 4927 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4928 { 4929 if (bdev->md_len != 0) { 4930 return bdev->dif_type; 4931 } else { 4932 return SPDK_DIF_DISABLE; 4933 } 4934 } 4935 4936 spdk_dif_pi_format_t 4937 spdk_bdev_get_dif_pi_format(const struct spdk_bdev *bdev) 4938 { 4939 return bdev->dif_pi_format; 4940 } 4941 4942 bool 4943 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4944 { 4945 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4946 return bdev->dif_is_head_of_md; 4947 } else { 4948 return false; 4949 } 4950 } 4951 4952 bool 4953 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4954 enum spdk_dif_check_type check_type) 4955 { 4956 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4957 return false; 4958 } 4959 4960 switch (check_type) { 4961 case SPDK_DIF_CHECK_TYPE_REFTAG: 4962 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4963 case SPDK_DIF_CHECK_TYPE_APPTAG: 4964 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4965 case SPDK_DIF_CHECK_TYPE_GUARD: 4966 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4967 default: 4968 return false; 4969 } 4970 } 4971 4972 static uint32_t 4973 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 4974 { 4975 uint64_t aligned_length, max_write_blocks; 4976 4977 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 4978 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 4979 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 4980 4981 return max_write_blocks; 4982 } 4983 4984 uint32_t 4985 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4986 { 4987 return bdev->max_copy; 4988 } 4989 4990 uint64_t 4991 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4992 { 4993 return bdev->internal.measured_queue_depth; 4994 } 4995 4996 uint64_t 4997 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 4998 { 4999 return bdev->internal.period; 5000 } 5001 5002 uint64_t 5003 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 5004 { 5005 return bdev->internal.weighted_io_time; 5006 } 5007 5008 uint64_t 5009 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 5010 { 5011 return bdev->internal.io_time; 5012 } 5013 5014 union spdk_bdev_nvme_ctratt spdk_bdev_get_nvme_ctratt(struct spdk_bdev *bdev) 5015 { 5016 return bdev->ctratt; 5017 } 5018 5019 static void bdev_update_qd_sampling_period(void *ctx); 5020 5021 static void 5022 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 5023 { 5024 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 5025 5026 if (bdev->internal.measured_queue_depth) { 5027 bdev->internal.io_time += bdev->internal.period; 5028 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 5029 } 5030 5031 bdev->internal.qd_poll_in_progress = false; 5032 5033 bdev_update_qd_sampling_period(bdev); 5034 } 5035 5036 static void 5037 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5038 struct spdk_io_channel *io_ch, void *_ctx) 5039 { 5040 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 5041 5042 bdev->internal.temporary_queue_depth += ch->io_outstanding; 5043 spdk_bdev_for_each_channel_continue(i, 0); 5044 } 5045 5046 static int 5047 bdev_calculate_measured_queue_depth(void *ctx) 5048 { 5049 struct spdk_bdev *bdev 
= ctx; 5050 5051 bdev->internal.qd_poll_in_progress = true; 5052 bdev->internal.temporary_queue_depth = 0; 5053 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 5054 return SPDK_POLLER_BUSY; 5055 } 5056 5057 static void 5058 bdev_update_qd_sampling_period(void *ctx) 5059 { 5060 struct spdk_bdev *bdev = ctx; 5061 5062 if (bdev->internal.period == bdev->internal.new_period) { 5063 return; 5064 } 5065 5066 if (bdev->internal.qd_poll_in_progress) { 5067 return; 5068 } 5069 5070 bdev->internal.period = bdev->internal.new_period; 5071 5072 spdk_poller_unregister(&bdev->internal.qd_poller); 5073 if (bdev->internal.period != 0) { 5074 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5075 bdev, bdev->internal.period); 5076 } else { 5077 spdk_bdev_close(bdev->internal.qd_desc); 5078 bdev->internal.qd_desc = NULL; 5079 } 5080 } 5081 5082 static void 5083 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 5084 { 5085 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 5086 } 5087 5088 void 5089 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 5090 { 5091 int rc; 5092 5093 if (bdev->internal.new_period == period) { 5094 return; 5095 } 5096 5097 bdev->internal.new_period = period; 5098 5099 if (bdev->internal.qd_desc != NULL) { 5100 assert(bdev->internal.period != 0); 5101 5102 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 5103 bdev_update_qd_sampling_period, bdev); 5104 return; 5105 } 5106 5107 assert(bdev->internal.period == 0); 5108 5109 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 5110 NULL, &bdev->internal.qd_desc); 5111 if (rc != 0) { 5112 return; 5113 } 5114 5115 bdev->internal.period = period; 5116 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5117 bdev, period); 5118 } 5119 5120 struct bdev_get_current_qd_ctx { 5121 uint64_t current_qd; 5122 spdk_bdev_get_current_qd_cb cb_fn; 5123 void *cb_arg; 5124 }; 5125 5126 static void 5127 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 5128 { 5129 struct bdev_get_current_qd_ctx *ctx = _ctx; 5130 5131 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 5132 5133 free(ctx); 5134 } 5135 5136 static void 5137 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5138 struct spdk_io_channel *io_ch, void *_ctx) 5139 { 5140 struct bdev_get_current_qd_ctx *ctx = _ctx; 5141 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 5142 5143 ctx->current_qd += bdev_ch->io_outstanding; 5144 5145 spdk_bdev_for_each_channel_continue(i, 0); 5146 } 5147 5148 void 5149 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 5150 void *cb_arg) 5151 { 5152 struct bdev_get_current_qd_ctx *ctx; 5153 5154 assert(cb_fn != NULL); 5155 5156 ctx = calloc(1, sizeof(*ctx)); 5157 if (ctx == NULL) { 5158 cb_fn(bdev, 0, cb_arg, -ENOMEM); 5159 return; 5160 } 5161 5162 ctx->cb_fn = cb_fn; 5163 ctx->cb_arg = cb_arg; 5164 5165 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 5166 } 5167 5168 static void 5169 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 5170 { 5171 assert(desc->thread == spdk_get_thread()); 5172 5173 spdk_spin_lock(&desc->spinlock); 5174 desc->refs--; 5175 if (!desc->closed) { 5176 spdk_spin_unlock(&desc->spinlock); 5177 desc->callback.event_fn(type, 5178 desc->bdev, 5179 desc->callback.ctx); 5180 return; 5181 } 
else if (desc->refs == 0) { 5182 /* This descriptor was closed after this event_notify message was sent. 5183 * spdk_bdev_close() could not free the descriptor since this message was 5184 * in flight, so we free it now using bdev_desc_free(). 5185 */ 5186 spdk_spin_unlock(&desc->spinlock); 5187 bdev_desc_free(desc); 5188 return; 5189 } 5190 spdk_spin_unlock(&desc->spinlock); 5191 } 5192 5193 static void 5194 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 5195 { 5196 spdk_spin_lock(&desc->spinlock); 5197 desc->refs++; 5198 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 5199 spdk_spin_unlock(&desc->spinlock); 5200 } 5201 5202 static void 5203 _resize_notify(void *ctx) 5204 { 5205 struct spdk_bdev_desc *desc = ctx; 5206 5207 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 5208 } 5209 5210 int 5211 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 5212 { 5213 struct spdk_bdev_desc *desc; 5214 int ret; 5215 5216 if (size == bdev->blockcnt) { 5217 return 0; 5218 } 5219 5220 spdk_spin_lock(&bdev->internal.spinlock); 5221 5222 /* bdev has open descriptors */ 5223 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 5224 bdev->blockcnt > size) { 5225 ret = -EBUSY; 5226 } else { 5227 bdev->blockcnt = size; 5228 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 5229 event_notify(desc, _resize_notify); 5230 } 5231 ret = 0; 5232 } 5233 5234 spdk_spin_unlock(&bdev->internal.spinlock); 5235 5236 return ret; 5237 } 5238 5239 /* 5240 * Convert I/O offset and length from bytes to blocks. 5241 * 5242 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 5243 */ 5244 static uint64_t 5245 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 5246 uint64_t num_bytes, uint64_t *num_blocks) 5247 { 5248 uint32_t block_size = bdev->blocklen; 5249 uint8_t shift_cnt; 5250 5251 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
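 * Illustrative example with blocklen = 4096 (shift_cnt = 12):
 *   offset_bytes = 1052672 -> offset_blocks = 257, masked-off low bits = 0
 *   num_bytes    = 4097    -> num_blocks    = 1,   masked-off low bits = 1
 * The OR of the two remainders is returned, so any misalignment yields a
 * non-zero value.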
*/ 5252 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 5253 shift_cnt = spdk_u32log2(block_size); 5254 *offset_blocks = offset_bytes >> shift_cnt; 5255 *num_blocks = num_bytes >> shift_cnt; 5256 return (offset_bytes - (*offset_blocks << shift_cnt)) | 5257 (num_bytes - (*num_blocks << shift_cnt)); 5258 } else { 5259 *offset_blocks = offset_bytes / block_size; 5260 *num_blocks = num_bytes / block_size; 5261 return (offset_bytes % block_size) | (num_bytes % block_size); 5262 } 5263 } 5264 5265 static bool 5266 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 5267 { 5268 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 5269 * has been an overflow and hence the offset has been wrapped around */ 5270 if (offset_blocks + num_blocks < offset_blocks) { 5271 return false; 5272 } 5273 5274 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5275 if (offset_blocks + num_blocks > bdev->blockcnt) { 5276 return false; 5277 } 5278 5279 return true; 5280 } 5281 5282 static void 5283 bdev_seek_complete_cb(void *ctx) 5284 { 5285 struct spdk_bdev_io *bdev_io = ctx; 5286 5287 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5288 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5289 } 5290 5291 static int 5292 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5293 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5294 spdk_bdev_io_completion_cb cb, void *cb_arg) 5295 { 5296 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5297 struct spdk_bdev_io *bdev_io; 5298 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5299 5300 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5301 5302 /* Check if offset_blocks is valid looking at the validity of one block */ 5303 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5304 return -EINVAL; 5305 } 5306 5307 bdev_io = bdev_channel_get_io(channel); 5308 if (!bdev_io) { 5309 return -ENOMEM; 5310 } 5311 5312 bdev_io->internal.ch = channel; 5313 bdev_io->internal.desc = desc; 5314 bdev_io->type = io_type; 5315 bdev_io->u.bdev.offset_blocks = offset_blocks; 5316 bdev_io->u.bdev.memory_domain = NULL; 5317 bdev_io->u.bdev.memory_domain_ctx = NULL; 5318 bdev_io->u.bdev.accel_sequence = NULL; 5319 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5320 5321 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5322 /* In case bdev doesn't support seek to next data/hole offset, 5323 * it is assumed that only data and no holes are present */ 5324 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5325 bdev_io->u.bdev.seek.offset = offset_blocks; 5326 } else { 5327 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5328 } 5329 5330 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5331 return 0; 5332 } 5333 5334 bdev_io_submit(bdev_io); 5335 return 0; 5336 } 5337 5338 int 5339 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5340 uint64_t offset_blocks, 5341 spdk_bdev_io_completion_cb cb, void *cb_arg) 5342 { 5343 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5344 } 5345 5346 int 5347 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5348 uint64_t offset_blocks, 5349 spdk_bdev_io_completion_cb cb, void *cb_arg) 5350 { 5351 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5352 } 5353 5354 uint64_t 5355 spdk_bdev_io_get_seek_offset(const struct 
spdk_bdev_io *bdev_io) 5356 { 5357 return bdev_io->u.bdev.seek.offset; 5358 } 5359 5360 static int 5361 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5362 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5363 spdk_bdev_io_completion_cb cb, void *cb_arg) 5364 { 5365 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5366 struct spdk_bdev_io *bdev_io; 5367 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5368 5369 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5370 return -EINVAL; 5371 } 5372 5373 bdev_io = bdev_channel_get_io(channel); 5374 if (!bdev_io) { 5375 return -ENOMEM; 5376 } 5377 5378 bdev_io->internal.ch = channel; 5379 bdev_io->internal.desc = desc; 5380 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5381 bdev_io->u.bdev.iovs = &bdev_io->iov; 5382 bdev_io->u.bdev.iovs[0].iov_base = buf; 5383 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5384 bdev_io->u.bdev.iovcnt = 1; 5385 bdev_io->u.bdev.md_buf = md_buf; 5386 bdev_io->u.bdev.num_blocks = num_blocks; 5387 bdev_io->u.bdev.offset_blocks = offset_blocks; 5388 bdev_io->u.bdev.memory_domain = NULL; 5389 bdev_io->u.bdev.memory_domain_ctx = NULL; 5390 bdev_io->u.bdev.accel_sequence = NULL; 5391 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5392 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5393 5394 bdev_io_submit(bdev_io); 5395 return 0; 5396 } 5397 5398 int 5399 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5400 void *buf, uint64_t offset, uint64_t nbytes, 5401 spdk_bdev_io_completion_cb cb, void *cb_arg) 5402 { 5403 uint64_t offset_blocks, num_blocks; 5404 5405 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5406 nbytes, &num_blocks) != 0) { 5407 return -EINVAL; 5408 } 5409 5410 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5411 } 5412 5413 int 5414 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5415 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5416 spdk_bdev_io_completion_cb cb, void *cb_arg) 5417 { 5418 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5419 } 5420 5421 int 5422 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5423 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5424 spdk_bdev_io_completion_cb cb, void *cb_arg) 5425 { 5426 struct iovec iov = { 5427 .iov_base = buf, 5428 }; 5429 5430 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5431 return -EINVAL; 5432 } 5433 5434 if (md_buf && !_is_buf_allocated(&iov)) { 5435 return -EINVAL; 5436 } 5437 5438 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5439 cb, cb_arg); 5440 } 5441 5442 int 5443 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5444 struct iovec *iov, int iovcnt, 5445 uint64_t offset, uint64_t nbytes, 5446 spdk_bdev_io_completion_cb cb, void *cb_arg) 5447 { 5448 uint64_t offset_blocks, num_blocks; 5449 5450 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5451 nbytes, &num_blocks) != 0) { 5452 return -EINVAL; 5453 } 5454 5455 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5456 } 5457 5458 static int 5459 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5460 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 
5461 uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx, 5462 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 5463 spdk_bdev_io_completion_cb cb, void *cb_arg) 5464 { 5465 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5466 struct spdk_bdev_io *bdev_io; 5467 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5468 5469 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 5470 return -EINVAL; 5471 } 5472 5473 bdev_io = bdev_channel_get_io(channel); 5474 if (spdk_unlikely(!bdev_io)) { 5475 return -ENOMEM; 5476 } 5477 5478 bdev_io->internal.ch = channel; 5479 bdev_io->internal.desc = desc; 5480 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5481 bdev_io->u.bdev.iovs = iov; 5482 bdev_io->u.bdev.iovcnt = iovcnt; 5483 bdev_io->u.bdev.md_buf = md_buf; 5484 bdev_io->u.bdev.num_blocks = num_blocks; 5485 bdev_io->u.bdev.offset_blocks = offset_blocks; 5486 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5487 5488 if (seq != NULL) { 5489 bdev_io->internal.f.has_accel_sequence = true; 5490 bdev_io->internal.accel_sequence = seq; 5491 } 5492 5493 if (domain != NULL) { 5494 bdev_io->internal.f.has_memory_domain = true; 5495 bdev_io->internal.memory_domain = domain; 5496 bdev_io->internal.memory_domain_ctx = domain_ctx; 5497 } 5498 5499 bdev_io->u.bdev.memory_domain = domain; 5500 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5501 bdev_io->u.bdev.accel_sequence = seq; 5502 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 5503 5504 _bdev_io_submit_ext(desc, bdev_io); 5505 5506 return 0; 5507 } 5508 5509 int 5510 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5511 struct iovec *iov, int iovcnt, 5512 uint64_t offset_blocks, uint64_t num_blocks, 5513 spdk_bdev_io_completion_cb cb, void *cb_arg) 5514 { 5515 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5516 5517 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5518 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5519 } 5520 5521 int 5522 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5523 struct iovec *iov, int iovcnt, void *md_buf, 5524 uint64_t offset_blocks, uint64_t num_blocks, 5525 spdk_bdev_io_completion_cb cb, void *cb_arg) 5526 { 5527 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5528 5529 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 5530 return -EINVAL; 5531 } 5532 5533 if (md_buf && !_is_buf_allocated(iov)) { 5534 return -EINVAL; 5535 } 5536 5537 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5538 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5539 } 5540 5541 static inline bool 5542 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 5543 { 5544 /* 5545 * We check if opts size is at least of size when we first introduced 5546 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 5547 * are not checked internal. 
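 * In practice this means opts->size must cover at least the metadata member and
 * must not exceed sizeof(struct spdk_bdev_ext_io_opts). Callers built against an
 * older definition simply do not provide the newer fields, which readers access
 * through bdev_get_ext_io_opt() with a default fallback.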
5548 */ 5549 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5550 sizeof(opts->metadata) && 5551 opts->size <= sizeof(*opts) && 5552 /* When memory domain is used, the user must provide data buffers */ 5553 (!opts->memory_domain || (iov && iov[0].iov_base)); 5554 } 5555 5556 int 5557 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5558 struct iovec *iov, int iovcnt, 5559 uint64_t offset_blocks, uint64_t num_blocks, 5560 spdk_bdev_io_completion_cb cb, void *cb_arg, 5561 struct spdk_bdev_ext_io_opts *opts) 5562 { 5563 struct spdk_memory_domain *domain = NULL; 5564 struct spdk_accel_sequence *seq = NULL; 5565 void *domain_ctx = NULL, *md = NULL; 5566 uint32_t dif_check_flags = 0; 5567 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5568 5569 if (opts) { 5570 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5571 return -EINVAL; 5572 } 5573 5574 md = opts->metadata; 5575 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5576 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5577 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5578 if (md) { 5579 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5580 return -EINVAL; 5581 } 5582 5583 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5584 return -EINVAL; 5585 } 5586 5587 if (spdk_unlikely(seq != NULL)) { 5588 return -EINVAL; 5589 } 5590 } 5591 } 5592 5593 dif_check_flags = bdev->dif_check_flags & 5594 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5595 5596 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5597 num_blocks, domain, domain_ctx, seq, dif_check_flags, cb, cb_arg); 5598 } 5599 5600 static int 5601 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5602 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5603 spdk_bdev_io_completion_cb cb, void *cb_arg) 5604 { 5605 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5606 struct spdk_bdev_io *bdev_io; 5607 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5608 5609 if (!desc->write) { 5610 return -EBADF; 5611 } 5612 5613 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5614 return -EINVAL; 5615 } 5616 5617 bdev_io = bdev_channel_get_io(channel); 5618 if (!bdev_io) { 5619 return -ENOMEM; 5620 } 5621 5622 bdev_io->internal.ch = channel; 5623 bdev_io->internal.desc = desc; 5624 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5625 bdev_io->u.bdev.iovs = &bdev_io->iov; 5626 bdev_io->u.bdev.iovs[0].iov_base = buf; 5627 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5628 bdev_io->u.bdev.iovcnt = 1; 5629 bdev_io->u.bdev.md_buf = md_buf; 5630 bdev_io->u.bdev.num_blocks = num_blocks; 5631 bdev_io->u.bdev.offset_blocks = offset_blocks; 5632 bdev_io->u.bdev.memory_domain = NULL; 5633 bdev_io->u.bdev.memory_domain_ctx = NULL; 5634 bdev_io->u.bdev.accel_sequence = NULL; 5635 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5636 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5637 5638 bdev_io_submit(bdev_io); 5639 return 0; 5640 } 5641 5642 int 5643 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5644 void *buf, uint64_t offset, uint64_t nbytes, 5645 spdk_bdev_io_completion_cb cb, void *cb_arg) 5646 { 5647 uint64_t offset_blocks, num_blocks; 5648 5649 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5650 nbytes, &num_blocks) != 0) { 5651 return -EINVAL; 5652 } 5653 5654 return spdk_bdev_write_blocks(desc, ch, buf, 
offset_blocks, num_blocks, cb, cb_arg); 5655 } 5656 5657 int 5658 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5659 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5660 spdk_bdev_io_completion_cb cb, void *cb_arg) 5661 { 5662 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5663 cb, cb_arg); 5664 } 5665 5666 int 5667 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5668 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5669 spdk_bdev_io_completion_cb cb, void *cb_arg) 5670 { 5671 struct iovec iov = { 5672 .iov_base = buf, 5673 }; 5674 5675 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5676 return -EINVAL; 5677 } 5678 5679 if (md_buf && !_is_buf_allocated(&iov)) { 5680 return -EINVAL; 5681 } 5682 5683 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5684 cb, cb_arg); 5685 } 5686 5687 static int 5688 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5689 struct iovec *iov, int iovcnt, void *md_buf, 5690 uint64_t offset_blocks, uint64_t num_blocks, 5691 struct spdk_memory_domain *domain, void *domain_ctx, 5692 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 5693 uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw, 5694 spdk_bdev_io_completion_cb cb, void *cb_arg) 5695 { 5696 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5697 struct spdk_bdev_io *bdev_io; 5698 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5699 5700 if (spdk_unlikely(!desc->write)) { 5701 return -EBADF; 5702 } 5703 5704 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 5705 return -EINVAL; 5706 } 5707 5708 bdev_io = bdev_channel_get_io(channel); 5709 if (spdk_unlikely(!bdev_io)) { 5710 return -ENOMEM; 5711 } 5712 5713 bdev_io->internal.ch = channel; 5714 bdev_io->internal.desc = desc; 5715 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5716 bdev_io->u.bdev.iovs = iov; 5717 bdev_io->u.bdev.iovcnt = iovcnt; 5718 bdev_io->u.bdev.md_buf = md_buf; 5719 bdev_io->u.bdev.num_blocks = num_blocks; 5720 bdev_io->u.bdev.offset_blocks = offset_blocks; 5721 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5722 if (seq != NULL) { 5723 bdev_io->internal.f.has_accel_sequence = true; 5724 bdev_io->internal.accel_sequence = seq; 5725 } 5726 5727 if (domain != NULL) { 5728 bdev_io->internal.f.has_memory_domain = true; 5729 bdev_io->internal.memory_domain = domain; 5730 bdev_io->internal.memory_domain_ctx = domain_ctx; 5731 } 5732 5733 bdev_io->u.bdev.memory_domain = domain; 5734 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5735 bdev_io->u.bdev.accel_sequence = seq; 5736 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 5737 bdev_io->u.bdev.nvme_cdw12.raw = nvme_cdw12_raw; 5738 bdev_io->u.bdev.nvme_cdw13.raw = nvme_cdw13_raw; 5739 5740 _bdev_io_submit_ext(desc, bdev_io); 5741 5742 return 0; 5743 } 5744 5745 int 5746 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5747 struct iovec *iov, int iovcnt, 5748 uint64_t offset, uint64_t len, 5749 spdk_bdev_io_completion_cb cb, void *cb_arg) 5750 { 5751 uint64_t offset_blocks, num_blocks; 5752 5753 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5754 len, &num_blocks) != 0) { 5755 return -EINVAL; 5756 } 5757 5758 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5759 } 5760 5761 int 5762 spdk_bdev_writev_blocks(struct spdk_bdev_desc 
*desc, struct spdk_io_channel *ch, 5763 struct iovec *iov, int iovcnt, 5764 uint64_t offset_blocks, uint64_t num_blocks, 5765 spdk_bdev_io_completion_cb cb, void *cb_arg) 5766 { 5767 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5768 5769 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5770 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 5771 cb, cb_arg); 5772 } 5773 5774 int 5775 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5776 struct iovec *iov, int iovcnt, void *md_buf, 5777 uint64_t offset_blocks, uint64_t num_blocks, 5778 spdk_bdev_io_completion_cb cb, void *cb_arg) 5779 { 5780 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5781 5782 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 5783 return -EINVAL; 5784 } 5785 5786 if (md_buf && !_is_buf_allocated(iov)) { 5787 return -EINVAL; 5788 } 5789 5790 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5791 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 5792 cb, cb_arg); 5793 } 5794 5795 int 5796 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5797 struct iovec *iov, int iovcnt, 5798 uint64_t offset_blocks, uint64_t num_blocks, 5799 spdk_bdev_io_completion_cb cb, void *cb_arg, 5800 struct spdk_bdev_ext_io_opts *opts) 5801 { 5802 struct spdk_memory_domain *domain = NULL; 5803 struct spdk_accel_sequence *seq = NULL; 5804 void *domain_ctx = NULL, *md = NULL; 5805 uint32_t dif_check_flags = 0; 5806 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5807 uint32_t nvme_cdw12_raw = 0; 5808 uint32_t nvme_cdw13_raw = 0; 5809 5810 if (opts) { 5811 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5812 return -EINVAL; 5813 } 5814 md = opts->metadata; 5815 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5816 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5817 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5818 nvme_cdw12_raw = bdev_get_ext_io_opt(opts, nvme_cdw12.raw, 0); 5819 nvme_cdw13_raw = bdev_get_ext_io_opt(opts, nvme_cdw13.raw, 0); 5820 if (md) { 5821 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5822 return -EINVAL; 5823 } 5824 5825 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5826 return -EINVAL; 5827 } 5828 5829 if (spdk_unlikely(seq != NULL)) { 5830 return -EINVAL; 5831 } 5832 } 5833 } 5834 5835 dif_check_flags = bdev->dif_check_flags & 5836 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5837 5838 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5839 domain, domain_ctx, seq, dif_check_flags, 5840 nvme_cdw12_raw, nvme_cdw13_raw, cb, cb_arg); 5841 } 5842 5843 static void 5844 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5845 { 5846 struct spdk_bdev_io *parent_io = cb_arg; 5847 struct spdk_bdev *bdev = parent_io->bdev; 5848 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5849 int i, rc = 0; 5850 5851 if (!success) { 5852 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5853 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5854 spdk_bdev_free_io(bdev_io); 5855 return; 5856 } 5857 5858 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5859 rc = memcmp(read_buf, 5860 parent_io->u.bdev.iovs[i].iov_base, 5861 parent_io->u.bdev.iovs[i].iov_len); 5862 if (rc) { 5863 break; 5864 } 5865 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5866 } 5867 5868 if (rc == 0 && parent_io->u.bdev.md_buf && 
spdk_bdev_is_md_separate(bdev)) { 5869 rc = memcmp(bdev_io->u.bdev.md_buf, 5870 parent_io->u.bdev.md_buf, 5871 spdk_bdev_get_md_size(bdev)); 5872 } 5873 5874 spdk_bdev_free_io(bdev_io); 5875 5876 if (rc == 0) { 5877 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5878 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5879 } else { 5880 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5881 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5882 } 5883 } 5884 5885 static void 5886 bdev_compare_do_read(void *_bdev_io) 5887 { 5888 struct spdk_bdev_io *bdev_io = _bdev_io; 5889 int rc; 5890 5891 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5892 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5893 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5894 bdev_compare_do_read_done, bdev_io); 5895 5896 if (rc == -ENOMEM) { 5897 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5898 } else if (rc != 0) { 5899 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5900 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5901 } 5902 } 5903 5904 static int 5905 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5906 struct iovec *iov, int iovcnt, void *md_buf, 5907 uint64_t offset_blocks, uint64_t num_blocks, 5908 spdk_bdev_io_completion_cb cb, void *cb_arg) 5909 { 5910 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5911 struct spdk_bdev_io *bdev_io; 5912 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5913 5914 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5915 return -EINVAL; 5916 } 5917 5918 bdev_io = bdev_channel_get_io(channel); 5919 if (!bdev_io) { 5920 return -ENOMEM; 5921 } 5922 5923 bdev_io->internal.ch = channel; 5924 bdev_io->internal.desc = desc; 5925 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5926 bdev_io->u.bdev.iovs = iov; 5927 bdev_io->u.bdev.iovcnt = iovcnt; 5928 bdev_io->u.bdev.md_buf = md_buf; 5929 bdev_io->u.bdev.num_blocks = num_blocks; 5930 bdev_io->u.bdev.offset_blocks = offset_blocks; 5931 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5932 bdev_io->u.bdev.memory_domain = NULL; 5933 bdev_io->u.bdev.memory_domain_ctx = NULL; 5934 bdev_io->u.bdev.accel_sequence = NULL; 5935 5936 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5937 bdev_io_submit(bdev_io); 5938 return 0; 5939 } 5940 5941 bdev_compare_do_read(bdev_io); 5942 5943 return 0; 5944 } 5945 5946 int 5947 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5948 struct iovec *iov, int iovcnt, 5949 uint64_t offset_blocks, uint64_t num_blocks, 5950 spdk_bdev_io_completion_cb cb, void *cb_arg) 5951 { 5952 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5953 num_blocks, cb, cb_arg); 5954 } 5955 5956 int 5957 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5958 struct iovec *iov, int iovcnt, void *md_buf, 5959 uint64_t offset_blocks, uint64_t num_blocks, 5960 spdk_bdev_io_completion_cb cb, void *cb_arg) 5961 { 5962 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5963 return -EINVAL; 5964 } 5965 5966 if (md_buf && !_is_buf_allocated(iov)) { 5967 return -EINVAL; 5968 } 5969 5970 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5971 num_blocks, cb, cb_arg); 5972 } 5973 5974 static int 5975 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5976 void 
*buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5977 spdk_bdev_io_completion_cb cb, void *cb_arg) 5978 { 5979 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5980 struct spdk_bdev_io *bdev_io; 5981 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5982 5983 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5984 return -EINVAL; 5985 } 5986 5987 bdev_io = bdev_channel_get_io(channel); 5988 if (!bdev_io) { 5989 return -ENOMEM; 5990 } 5991 5992 bdev_io->internal.ch = channel; 5993 bdev_io->internal.desc = desc; 5994 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5995 bdev_io->u.bdev.iovs = &bdev_io->iov; 5996 bdev_io->u.bdev.iovs[0].iov_base = buf; 5997 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5998 bdev_io->u.bdev.iovcnt = 1; 5999 bdev_io->u.bdev.md_buf = md_buf; 6000 bdev_io->u.bdev.num_blocks = num_blocks; 6001 bdev_io->u.bdev.offset_blocks = offset_blocks; 6002 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6003 bdev_io->u.bdev.memory_domain = NULL; 6004 bdev_io->u.bdev.memory_domain_ctx = NULL; 6005 bdev_io->u.bdev.accel_sequence = NULL; 6006 6007 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 6008 bdev_io_submit(bdev_io); 6009 return 0; 6010 } 6011 6012 bdev_compare_do_read(bdev_io); 6013 6014 return 0; 6015 } 6016 6017 int 6018 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6019 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 6020 spdk_bdev_io_completion_cb cb, void *cb_arg) 6021 { 6022 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 6023 cb, cb_arg); 6024 } 6025 6026 int 6027 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6028 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 6029 spdk_bdev_io_completion_cb cb, void *cb_arg) 6030 { 6031 struct iovec iov = { 6032 .iov_base = buf, 6033 }; 6034 6035 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 6036 return -EINVAL; 6037 } 6038 6039 if (md_buf && !_is_buf_allocated(&iov)) { 6040 return -EINVAL; 6041 } 6042 6043 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 6044 cb, cb_arg); 6045 } 6046 6047 static void 6048 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status) 6049 { 6050 struct spdk_bdev_io *bdev_io = ctx; 6051 6052 if (unlock_status) { 6053 SPDK_ERRLOG("LBA range unlock failed\n"); 6054 } 6055 6056 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 6057 false, bdev_io->internal.caller_ctx); 6058 } 6059 6060 static void 6061 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 6062 { 6063 bdev_io->internal.status = status; 6064 6065 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 6066 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6067 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 6068 } 6069 6070 static void 6071 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6072 { 6073 struct spdk_bdev_io *parent_io = cb_arg; 6074 6075 if (!success) { 6076 SPDK_ERRLOG("Compare and write operation failed\n"); 6077 } 6078 6079 spdk_bdev_free_io(bdev_io); 6080 6081 bdev_comparev_and_writev_blocks_unlock(parent_io, 6082 success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 6083 } 6084 6085 static void 6086 bdev_compare_and_write_do_write(void *_bdev_io) 6087 { 6088 struct spdk_bdev_io *bdev_io = _bdev_io; 6089 int rc; 6090 6091 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 6092 spdk_io_channel_from_ctx(bdev_io->internal.ch), 6093 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 6094 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6095 bdev_compare_and_write_do_write_done, bdev_io); 6096 6097 6098 if (rc == -ENOMEM) { 6099 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 6100 } else if (rc != 0) { 6101 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6102 } 6103 } 6104 6105 static void 6106 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6107 { 6108 struct spdk_bdev_io *parent_io = cb_arg; 6109 6110 spdk_bdev_free_io(bdev_io); 6111 6112 if (!success) { 6113 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 6114 return; 6115 } 6116 6117 bdev_compare_and_write_do_write(parent_io); 6118 } 6119 6120 static void 6121 bdev_compare_and_write_do_compare(void *_bdev_io) 6122 { 6123 struct spdk_bdev_io *bdev_io = _bdev_io; 6124 int rc; 6125 6126 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 6127 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 6128 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6129 bdev_compare_and_write_do_compare_done, bdev_io); 6130 6131 if (rc == -ENOMEM) { 6132 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 6133 } else if (rc != 0) { 6134 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 6135 } 6136 } 6137 6138 static void 6139 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status) 6140 { 6141 struct spdk_bdev_io *bdev_io = ctx; 6142 6143 if (status) { 6144 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 6145 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6146 return; 6147 } 6148 6149 bdev_compare_and_write_do_compare(bdev_io); 6150 } 6151 6152 int 6153 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6154 struct iovec *compare_iov, int compare_iovcnt, 6155 struct iovec *write_iov, int write_iovcnt, 6156 uint64_t offset_blocks, uint64_t num_blocks, 6157 spdk_bdev_io_completion_cb cb, void *cb_arg) 6158 { 6159 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6160 struct spdk_bdev_io *bdev_io; 6161 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6162 6163 if (!desc->write) { 6164 return -EBADF; 6165 } 6166 6167 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6168 return -EINVAL; 6169 } 6170 6171 if (num_blocks > bdev->acwu) { 6172 return -EINVAL; 6173 } 6174 6175 bdev_io = bdev_channel_get_io(channel); 6176 if (!bdev_io) { 6177 return -ENOMEM; 6178 } 6179 6180 bdev_io->internal.ch = channel; 6181 bdev_io->internal.desc = desc; 6182 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 6183 bdev_io->u.bdev.iovs = compare_iov; 6184 bdev_io->u.bdev.iovcnt = compare_iovcnt; 6185 bdev_io->u.bdev.fused_iovs = write_iov; 6186 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 6187 bdev_io->u.bdev.md_buf = NULL; 6188 bdev_io->u.bdev.num_blocks = num_blocks; 6189 bdev_io->u.bdev.offset_blocks = offset_blocks; 6190 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6191 
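/* If the backing module supports SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE, the request is
 * submitted to it directly below. Otherwise the LBA range is locked and the operation
 * is emulated as a compare followed by a write, see bdev_comparev_and_writev_blocks_locked().
 */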
bdev_io->u.bdev.memory_domain = NULL; 6192 bdev_io->u.bdev.memory_domain_ctx = NULL; 6193 bdev_io->u.bdev.accel_sequence = NULL; 6194 6195 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 6196 bdev_io_submit(bdev_io); 6197 return 0; 6198 } 6199 6200 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 6201 bdev_comparev_and_writev_blocks_locked, bdev_io); 6202 } 6203 6204 int 6205 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6206 struct iovec *iov, int iovcnt, 6207 uint64_t offset_blocks, uint64_t num_blocks, 6208 bool populate, 6209 spdk_bdev_io_completion_cb cb, void *cb_arg) 6210 { 6211 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6212 struct spdk_bdev_io *bdev_io; 6213 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6214 6215 if (!desc->write) { 6216 return -EBADF; 6217 } 6218 6219 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6220 return -EINVAL; 6221 } 6222 6223 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 6224 return -ENOTSUP; 6225 } 6226 6227 bdev_io = bdev_channel_get_io(channel); 6228 if (!bdev_io) { 6229 return -ENOMEM; 6230 } 6231 6232 bdev_io->internal.ch = channel; 6233 bdev_io->internal.desc = desc; 6234 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 6235 bdev_io->u.bdev.num_blocks = num_blocks; 6236 bdev_io->u.bdev.offset_blocks = offset_blocks; 6237 bdev_io->u.bdev.iovs = iov; 6238 bdev_io->u.bdev.iovcnt = iovcnt; 6239 bdev_io->u.bdev.md_buf = NULL; 6240 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 6241 bdev_io->u.bdev.zcopy.commit = 0; 6242 bdev_io->u.bdev.zcopy.start = 1; 6243 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6244 bdev_io->u.bdev.memory_domain = NULL; 6245 bdev_io->u.bdev.memory_domain_ctx = NULL; 6246 bdev_io->u.bdev.accel_sequence = NULL; 6247 6248 bdev_io_submit(bdev_io); 6249 6250 return 0; 6251 } 6252 6253 int 6254 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 6255 spdk_bdev_io_completion_cb cb, void *cb_arg) 6256 { 6257 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 6258 return -EINVAL; 6259 } 6260 6261 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0;
6262 bdev_io->u.bdev.zcopy.start = 0;
6263 bdev_io->internal.caller_ctx = cb_arg;
6264 bdev_io->internal.cb = cb;
6265 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
6266
6267 bdev_io_submit(bdev_io);
6268
6269 return 0;
6270 }
6271
6272 int
6273 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6274 uint64_t offset, uint64_t len,
6275 spdk_bdev_io_completion_cb cb, void *cb_arg)
6276 {
6277 uint64_t offset_blocks, num_blocks;
6278
6279 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
6280 len, &num_blocks) != 0) {
6281 return -EINVAL;
6282 }
6283
6284 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
6285 }
6286
6287 int
6288 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6289 uint64_t offset_blocks, uint64_t num_blocks,
6290 spdk_bdev_io_completion_cb cb, void *cb_arg)
6291 {
6292 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6293 struct spdk_bdev_io *bdev_io;
6294 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6295
6296 if (!desc->write) {
6297 return -EBADF;
6298 }
6299
6300 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6301 return -EINVAL;
6302 }
6303
6304 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) &&
6305 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) {
6306 return -ENOTSUP;
6307 }
6308
6309 bdev_io = bdev_channel_get_io(channel);
6310
6311 if (!bdev_io) {
6312 return -ENOMEM;
6313 }
6314
6315 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
6316 bdev_io->internal.ch = channel;
6317 bdev_io->internal.desc = desc;
6318 bdev_io->u.bdev.offset_blocks = offset_blocks;
6319 bdev_io->u.bdev.num_blocks = num_blocks;
6320 bdev_io_init(bdev_io, bdev, cb_arg, cb);
6321 bdev_io->u.bdev.memory_domain = NULL;
6322 bdev_io->u.bdev.memory_domain_ctx = NULL;
6323 bdev_io->u.bdev.accel_sequence = NULL;
6324
6325 /* If the write_zeroes size is large and should be split, use the generic split
6326 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported or not.
6327 *
6328 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported
6329 * or emulate it using a regular write request otherwise.
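 *
 * The emulation path below, bdev_write_zero_buffer(), issues regular writes sourced
 * from the bdev layer's preallocated zero buffer; the assert only guarantees that a
 * single block (including metadata) fits into ZERO_BUFFER_SIZE.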
6330 */ 6331 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 6332 bdev_io->internal.f.split) { 6333 bdev_io_submit(bdev_io); 6334 return 0; 6335 } 6336 6337 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6338 6339 return bdev_write_zero_buffer(bdev_io); 6340 } 6341 6342 int 6343 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6344 uint64_t offset, uint64_t nbytes, 6345 spdk_bdev_io_completion_cb cb, void *cb_arg) 6346 { 6347 uint64_t offset_blocks, num_blocks; 6348 6349 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6350 nbytes, &num_blocks) != 0) { 6351 return -EINVAL; 6352 } 6353 6354 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6355 } 6356 6357 static void 6358 bdev_io_complete_cb(void *ctx) 6359 { 6360 struct spdk_bdev_io *bdev_io = ctx; 6361 6362 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6363 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 6364 } 6365 6366 int 6367 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6368 uint64_t offset_blocks, uint64_t num_blocks, 6369 spdk_bdev_io_completion_cb cb, void *cb_arg) 6370 { 6371 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6372 struct spdk_bdev_io *bdev_io; 6373 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6374 6375 if (!desc->write) { 6376 return -EBADF; 6377 } 6378 6379 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6380 return -EINVAL; 6381 } 6382 6383 bdev_io = bdev_channel_get_io(channel); 6384 if (!bdev_io) { 6385 return -ENOMEM; 6386 } 6387 6388 bdev_io->internal.ch = channel; 6389 bdev_io->internal.desc = desc; 6390 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6391 6392 bdev_io->u.bdev.iovs = &bdev_io->iov; 6393 bdev_io->u.bdev.iovs[0].iov_base = NULL; 6394 bdev_io->u.bdev.iovs[0].iov_len = 0; 6395 bdev_io->u.bdev.iovcnt = 1; 6396 6397 bdev_io->u.bdev.offset_blocks = offset_blocks; 6398 bdev_io->u.bdev.num_blocks = num_blocks; 6399 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6400 bdev_io->u.bdev.memory_domain = NULL; 6401 bdev_io->u.bdev.memory_domain_ctx = NULL; 6402 bdev_io->u.bdev.accel_sequence = NULL; 6403 6404 if (num_blocks == 0) { 6405 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 6406 return 0; 6407 } 6408 6409 bdev_io_submit(bdev_io); 6410 return 0; 6411 } 6412 6413 int 6414 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6415 uint64_t offset, uint64_t length, 6416 spdk_bdev_io_completion_cb cb, void *cb_arg) 6417 { 6418 uint64_t offset_blocks, num_blocks; 6419 6420 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6421 length, &num_blocks) != 0) { 6422 return -EINVAL; 6423 } 6424 6425 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6426 } 6427 6428 int 6429 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6430 uint64_t offset_blocks, uint64_t num_blocks, 6431 spdk_bdev_io_completion_cb cb, void *cb_arg) 6432 { 6433 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6434 struct spdk_bdev_io *bdev_io; 6435 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6436 6437 if (!desc->write) { 6438 return -EBADF; 6439 } 6440 6441 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6442 return -EINVAL; 6443 } 6444 6445 bdev_io = bdev_channel_get_io(channel); 6446 if (!bdev_io) { 6447 return -ENOMEM; 6448 } 6449 6450 bdev_io->internal.ch = 
channel; 6451 bdev_io->internal.desc = desc; 6452 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 6453 bdev_io->u.bdev.iovs = NULL; 6454 bdev_io->u.bdev.iovcnt = 0; 6455 bdev_io->u.bdev.offset_blocks = offset_blocks; 6456 bdev_io->u.bdev.num_blocks = num_blocks; 6457 bdev_io->u.bdev.memory_domain = NULL; 6458 bdev_io->u.bdev.memory_domain_ctx = NULL; 6459 bdev_io->u.bdev.accel_sequence = NULL; 6460 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6461 6462 bdev_io_submit(bdev_io); 6463 return 0; 6464 } 6465 6466 static int bdev_reset_poll_for_outstanding_io(void *ctx); 6467 6468 static void 6469 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 6470 { 6471 struct spdk_bdev_channel *ch = _ctx; 6472 struct spdk_bdev_io *bdev_io; 6473 6474 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6475 6476 if (status == -EBUSY) { 6477 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 6478 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 6479 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 6480 } else { 6481 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6482 6483 if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) { 6484 /* If outstanding IOs are still present and reset_io_drain_timeout 6485 * seconds passed, start the reset. */ 6486 bdev_io_submit_reset(bdev_io); 6487 } else { 6488 /* We still have in progress memory domain pull/push or we're 6489 * executing accel sequence. Since we cannot abort either of those 6490 * operations, fail the reset request. */ 6491 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6492 } 6493 } 6494 } else { 6495 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6496 SPDK_DEBUGLOG(bdev, 6497 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 6498 ch->bdev->name); 6499 /* Mark the completion status as a SUCCESS and complete the reset. */ 6500 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 6501 } 6502 } 6503 6504 static void 6505 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6506 struct spdk_io_channel *io_ch, void *_ctx) 6507 { 6508 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 6509 int status = 0; 6510 6511 if (cur_ch->io_outstanding > 0 || 6512 !TAILQ_EMPTY(&cur_ch->io_memory_domain) || 6513 !TAILQ_EMPTY(&cur_ch->io_accel_exec)) { 6514 /* If a channel has outstanding IO, set status to -EBUSY code. This will stop 6515 * further iteration over the rest of the channels and pass non-zero status 6516 * to the callback function. 
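 * The -EBUSY status is consumed by bdev_reset_check_outstanding_io_done(), which either
 * re-arms the poller until wait_poller.stop_time_tsc expires, or submits (or fails) the
 * reset once the drain timeout has elapsed.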
*/ 6517 status = -EBUSY; 6518 } 6519 spdk_bdev_for_each_channel_continue(i, status); 6520 } 6521 6522 static int 6523 bdev_reset_poll_for_outstanding_io(void *ctx) 6524 { 6525 struct spdk_bdev_channel *ch = ctx; 6526 struct spdk_bdev_io *bdev_io; 6527 6528 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6529 6530 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6531 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6532 bdev_reset_check_outstanding_io_done); 6533 6534 return SPDK_POLLER_BUSY; 6535 } 6536 6537 static void 6538 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6539 { 6540 struct spdk_bdev_channel *ch = _ctx; 6541 struct spdk_bdev_io *bdev_io; 6542 6543 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6544 6545 if (bdev->reset_io_drain_timeout == 0) { 6546 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6547 6548 bdev_io_submit_reset(bdev_io); 6549 return; 6550 } 6551 6552 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6553 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6554 6555 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6556 * submit the reset to the underlying module only if outstanding I/O 6557 * remain after reset_io_drain_timeout seconds have passed. */ 6558 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6559 bdev_reset_check_outstanding_io_done); 6560 } 6561 6562 static void 6563 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6564 struct spdk_io_channel *ch, void *_ctx) 6565 { 6566 struct spdk_bdev_channel *channel; 6567 struct spdk_bdev_mgmt_channel *mgmt_channel; 6568 struct spdk_bdev_shared_resource *shared_resource; 6569 bdev_io_tailq_t tmp_queued; 6570 6571 TAILQ_INIT(&tmp_queued); 6572 6573 channel = __io_ch_to_bdev_ch(ch); 6574 shared_resource = channel->shared_resource; 6575 mgmt_channel = shared_resource->mgmt_ch; 6576 6577 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6578 6579 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6580 TAILQ_SWAP(&channel->qos_queued_io, &tmp_queued, spdk_bdev_io, internal.link); 6581 } 6582 6583 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6584 bdev_abort_all_buf_io(mgmt_channel, channel); 6585 bdev_abort_all_queued_io(&tmp_queued, channel); 6586 6587 spdk_bdev_for_each_channel_continue(i, 0); 6588 } 6589 6590 static void 6591 bdev_start_reset(void *ctx) 6592 { 6593 struct spdk_bdev_channel *ch = ctx; 6594 6595 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, 6596 bdev_reset_freeze_channel_done); 6597 } 6598 6599 static void 6600 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 6601 { 6602 struct spdk_bdev *bdev = ch->bdev; 6603 6604 assert(!TAILQ_EMPTY(&ch->queued_resets)); 6605 6606 spdk_spin_lock(&bdev->internal.spinlock); 6607 if (bdev->internal.reset_in_progress == NULL) { 6608 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 6609 /* 6610 * Take a channel reference for the target bdev for the life of this 6611 * reset. This guards against the channel getting destroyed while 6612 * spdk_bdev_for_each_channel() calls related to this reset IO are in 6613 * progress. We will release the reference when this reset is 6614 * completed. 
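 * The reference is dropped again in bdev_reset_complete() once the reset I/O finishes.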
6615 */ 6616 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6617 bdev_start_reset(ch); 6618 } 6619 spdk_spin_unlock(&bdev->internal.spinlock); 6620 } 6621 6622 int 6623 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6624 spdk_bdev_io_completion_cb cb, void *cb_arg) 6625 { 6626 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6627 struct spdk_bdev_io *bdev_io; 6628 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6629 6630 bdev_io = bdev_channel_get_io(channel); 6631 if (!bdev_io) { 6632 return -ENOMEM; 6633 } 6634 6635 bdev_io->internal.ch = channel; 6636 bdev_io->internal.desc = desc; 6637 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6638 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6639 bdev_io->u.reset.ch_ref = NULL; 6640 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6641 6642 spdk_spin_lock(&bdev->internal.spinlock); 6643 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 6644 spdk_spin_unlock(&bdev->internal.spinlock); 6645 6646 bdev_ch_add_to_io_submitted(bdev_io); 6647 6648 bdev_channel_start_reset(channel); 6649 6650 return 0; 6651 } 6652 6653 void 6654 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6655 struct spdk_bdev_io_stat *stat) 6656 { 6657 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6658 6659 bdev_get_io_stat(stat, channel->stat); 6660 } 6661 6662 static void 6663 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6664 { 6665 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6666 6667 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6668 bdev_iostat_ctx->cb_arg, 0); 6669 free(bdev_iostat_ctx); 6670 } 6671 6672 static void 6673 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6674 struct spdk_io_channel *ch, void *_ctx) 6675 { 6676 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6677 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6678 6679 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6680 spdk_bdev_for_each_channel_continue(i, 0); 6681 } 6682 6683 void 6684 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6685 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6686 { 6687 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6688 6689 assert(bdev != NULL); 6690 assert(stat != NULL); 6691 assert(cb != NULL); 6692 6693 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6694 if (bdev_iostat_ctx == NULL) { 6695 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6696 cb(bdev, stat, cb_arg, -ENOMEM); 6697 return; 6698 } 6699 6700 bdev_iostat_ctx->stat = stat; 6701 bdev_iostat_ctx->cb = cb; 6702 bdev_iostat_ctx->cb_arg = cb_arg; 6703 6704 /* Start with the statistics from previously deleted channels. */ 6705 spdk_spin_lock(&bdev->internal.spinlock); 6706 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6707 spdk_spin_unlock(&bdev->internal.spinlock); 6708 6709 /* Then iterate and add the statistics from each existing channel. 
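 * Each channel's counters are read on the thread that owns that channel via
 * spdk_bdev_for_each_channel(); the aggregated result is handed to the caller's
 * callback in bdev_get_device_stat_done().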
*/ 6710 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6711 bdev_get_device_stat_done); 6712 } 6713 6714 struct bdev_iostat_reset_ctx { 6715 enum spdk_bdev_reset_stat_mode mode; 6716 bdev_reset_device_stat_cb cb; 6717 void *cb_arg; 6718 }; 6719 6720 static void 6721 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6722 { 6723 struct bdev_iostat_reset_ctx *ctx = _ctx; 6724 6725 ctx->cb(bdev, ctx->cb_arg, 0); 6726 6727 free(ctx); 6728 } 6729 6730 static void 6731 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6732 struct spdk_io_channel *ch, void *_ctx) 6733 { 6734 struct bdev_iostat_reset_ctx *ctx = _ctx; 6735 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6736 6737 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6738 6739 spdk_bdev_for_each_channel_continue(i, 0); 6740 } 6741 6742 void 6743 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6744 bdev_reset_device_stat_cb cb, void *cb_arg) 6745 { 6746 struct bdev_iostat_reset_ctx *ctx; 6747 6748 assert(bdev != NULL); 6749 assert(cb != NULL); 6750 6751 ctx = calloc(1, sizeof(*ctx)); 6752 if (ctx == NULL) { 6753 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6754 cb(bdev, cb_arg, -ENOMEM); 6755 return; 6756 } 6757 6758 ctx->mode = mode; 6759 ctx->cb = cb; 6760 ctx->cb_arg = cb_arg; 6761 6762 spdk_spin_lock(&bdev->internal.spinlock); 6763 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6764 spdk_spin_unlock(&bdev->internal.spinlock); 6765 6766 spdk_bdev_for_each_channel(bdev, 6767 bdev_reset_each_channel_stat, 6768 ctx, 6769 bdev_reset_device_stat_done); 6770 } 6771 6772 int 6773 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6774 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6775 spdk_bdev_io_completion_cb cb, void *cb_arg) 6776 { 6777 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6778 struct spdk_bdev_io *bdev_io; 6779 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6780 6781 if (!desc->write) { 6782 return -EBADF; 6783 } 6784 6785 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6786 return -ENOTSUP; 6787 } 6788 6789 bdev_io = bdev_channel_get_io(channel); 6790 if (!bdev_io) { 6791 return -ENOMEM; 6792 } 6793 6794 bdev_io->internal.ch = channel; 6795 bdev_io->internal.desc = desc; 6796 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6797 bdev_io->u.nvme_passthru.cmd = *cmd; 6798 bdev_io->u.nvme_passthru.buf = buf; 6799 bdev_io->u.nvme_passthru.nbytes = nbytes; 6800 bdev_io->u.nvme_passthru.md_buf = NULL; 6801 bdev_io->u.nvme_passthru.md_len = 0; 6802 6803 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6804 6805 bdev_io_submit(bdev_io); 6806 return 0; 6807 } 6808 6809 int 6810 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6811 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6812 spdk_bdev_io_completion_cb cb, void *cb_arg) 6813 { 6814 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6815 struct spdk_bdev_io *bdev_io; 6816 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6817 6818 if (!desc->write) { 6819 /* 6820 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6821 * to easily determine if the command is a read or write, but for now just 6822 * do not allow io_passthru with a read-only descriptor. 
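 * Callers that need NVMe passthrough therefore have to open the bdev with write
 * access (i.e. pass write = true to spdk_bdev_open_ext()).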
6823 */ 6824 return -EBADF; 6825 } 6826 6827 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6828 return -ENOTSUP; 6829 } 6830 6831 bdev_io = bdev_channel_get_io(channel); 6832 if (!bdev_io) { 6833 return -ENOMEM; 6834 } 6835 6836 bdev_io->internal.ch = channel; 6837 bdev_io->internal.desc = desc; 6838 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6839 bdev_io->u.nvme_passthru.cmd = *cmd; 6840 bdev_io->u.nvme_passthru.buf = buf; 6841 bdev_io->u.nvme_passthru.nbytes = nbytes; 6842 bdev_io->u.nvme_passthru.md_buf = NULL; 6843 bdev_io->u.nvme_passthru.md_len = 0; 6844 6845 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6846 6847 bdev_io_submit(bdev_io); 6848 return 0; 6849 } 6850 6851 int 6852 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6853 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6854 spdk_bdev_io_completion_cb cb, void *cb_arg) 6855 { 6856 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6857 struct spdk_bdev_io *bdev_io; 6858 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6859 6860 if (!desc->write) { 6861 /* 6862 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6863 * to easily determine if the command is a read or write, but for now just 6864 * do not allow io_passthru with a read-only descriptor. 6865 */ 6866 return -EBADF; 6867 } 6868 6869 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6870 return -ENOTSUP; 6871 } 6872 6873 bdev_io = bdev_channel_get_io(channel); 6874 if (!bdev_io) { 6875 return -ENOMEM; 6876 } 6877 6878 bdev_io->internal.ch = channel; 6879 bdev_io->internal.desc = desc; 6880 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6881 bdev_io->u.nvme_passthru.cmd = *cmd; 6882 bdev_io->u.nvme_passthru.buf = buf; 6883 bdev_io->u.nvme_passthru.nbytes = nbytes; 6884 bdev_io->u.nvme_passthru.md_buf = md_buf; 6885 bdev_io->u.nvme_passthru.md_len = md_len; 6886 6887 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6888 6889 bdev_io_submit(bdev_io); 6890 return 0; 6891 } 6892 6893 int 6894 spdk_bdev_nvme_iov_passthru_md(struct spdk_bdev_desc *desc, 6895 struct spdk_io_channel *ch, 6896 const struct spdk_nvme_cmd *cmd, 6897 struct iovec *iov, int iovcnt, size_t nbytes, 6898 void *md_buf, size_t md_len, 6899 spdk_bdev_io_completion_cb cb, void *cb_arg) 6900 { 6901 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6902 struct spdk_bdev_io *bdev_io; 6903 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6904 6905 if (!desc->write) { 6906 /* 6907 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6908 * to easily determine if the command is a read or write, but for now just 6909 * do not allow io_passthru with a read-only descriptor. 
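 * In addition, when md_buf is supplied the bdev must support
 * SPDK_BDEV_IO_TYPE_NVME_IO_MD; otherwise plain SPDK_BDEV_IO_TYPE_NVME_IO support
 * is sufficient (both are checked below).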
6910 */ 6911 return -EBADF; 6912 } 6913 6914 if (md_buf && spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6915 return -ENOTSUP; 6916 } else if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6917 return -ENOTSUP; 6918 } 6919 6920 bdev_io = bdev_channel_get_io(channel); 6921 if (!bdev_io) { 6922 return -ENOMEM; 6923 } 6924 6925 bdev_io->internal.ch = channel; 6926 bdev_io->internal.desc = desc; 6927 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IOV_MD; 6928 bdev_io->u.nvme_passthru.cmd = *cmd; 6929 bdev_io->u.nvme_passthru.iovs = iov; 6930 bdev_io->u.nvme_passthru.iovcnt = iovcnt; 6931 bdev_io->u.nvme_passthru.nbytes = nbytes; 6932 bdev_io->u.nvme_passthru.md_buf = md_buf; 6933 bdev_io->u.nvme_passthru.md_len = md_len; 6934 6935 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6936 6937 bdev_io_submit(bdev_io); 6938 return 0; 6939 } 6940 6941 static void bdev_abort_retry(void *ctx); 6942 static void bdev_abort(struct spdk_bdev_io *parent_io); 6943 6944 static void 6945 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6946 { 6947 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6948 struct spdk_bdev_io *parent_io = cb_arg; 6949 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6950 6951 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6952 6953 spdk_bdev_free_io(bdev_io); 6954 6955 if (!success) { 6956 /* Check if the target I/O completed in the meantime. */ 6957 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6958 if (tmp_io == bio_to_abort) { 6959 break; 6960 } 6961 } 6962 6963 /* If the target I/O still exists, set the parent to failed. */ 6964 if (tmp_io != NULL) { 6965 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6966 } 6967 } 6968 6969 assert(parent_io->internal.f.split); 6970 6971 parent_io->internal.split.outstanding--; 6972 if (parent_io->internal.split.outstanding == 0) { 6973 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6974 bdev_abort_retry(parent_io); 6975 } else { 6976 bdev_io_complete(parent_io); 6977 } 6978 } 6979 } 6980 6981 static int 6982 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6983 struct spdk_bdev_io *bio_to_abort, 6984 spdk_bdev_io_completion_cb cb, void *cb_arg) 6985 { 6986 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6987 struct spdk_bdev_io *bdev_io; 6988 6989 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 6990 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 6991 /* TODO: Abort reset or abort request. */ 6992 return -ENOTSUP; 6993 } 6994 6995 bdev_io = bdev_channel_get_io(channel); 6996 if (bdev_io == NULL) { 6997 return -ENOMEM; 6998 } 6999 7000 bdev_io->internal.ch = channel; 7001 bdev_io->internal.desc = desc; 7002 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7003 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7004 7005 if (bdev->split_on_optimal_io_boundary && bio_to_abort->internal.f.split) { 7006 assert(bdev_io_should_split(bio_to_abort)); 7007 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 7008 7009 /* Parent abort request is not submitted directly, but to manage its 7010 * execution add it to the submitted list here. 7011 */ 7012 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7013 bdev_ch_add_to_io_submitted(bdev_io); 7014 7015 bdev_abort(bdev_io); 7016 7017 return 0; 7018 } 7019 7020 bdev_io->u.abort.bio_to_abort = bio_to_abort; 7021 7022 /* Submit the abort request to the underlying bdev module. 
*/
7023 bdev_io_submit(bdev_io);
7024
7025 return 0;
7026 }
7027
7028 static bool
7029 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq)
7030 {
7031 struct spdk_bdev_io *iter;
7032
7033 TAILQ_FOREACH(iter, tailq, internal.link) {
7034 if (iter == bdev_io) {
7035 return true;
7036 }
7037 }
7038
7039 return false;
7040 }
7041
7042 static uint32_t
7043 _bdev_abort(struct spdk_bdev_io *parent_io)
7044 {
7045 struct spdk_bdev_desc *desc = parent_io->internal.desc;
7046 struct spdk_bdev_channel *channel = parent_io->internal.ch;
7047 void *bio_cb_arg;
7048 struct spdk_bdev_io *bio_to_abort;
7049 uint32_t matched_ios;
7050 int rc;
7051
7052 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg;
7053
7054 /* matched_ios is returned and will be kept by the caller.
7055 *
7056 * This function is used for two cases: 1) the same cb_arg is used for
7057 * multiple I/Os, and 2) a single large I/O is split into smaller ones.
7058 * Incrementing split_outstanding directly here may confuse readers, especially
7059 * in the 1st case.
7060 *
7061 * Completion of an I/O abort is processed after stack unwinding, hence this
7062 * trick works as expected.
7063 */
7064 matched_ios = 0;
7065 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
7066
7067 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) {
7068 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) {
7069 continue;
7070 }
7071
7072 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) {
7073 /* Any I/O which was submitted after this abort command should be excluded. */
7074 continue;
7075 }
7076
7077 /* We can't abort a request that's being pushed/pulled or executed by accel */
7078 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) ||
7079 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) {
7080 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7081 break;
7082 }
7083
7084 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io);
7085 if (rc != 0) {
7086 if (rc == -ENOMEM) {
7087 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM;
7088 } else {
7089 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
7090 }
7091 break;
7092 }
7093 matched_ios++;
7094 }
7095
7096 return matched_ios;
7097 }
7098
7099 static void
7100 bdev_abort_retry(void *ctx)
7101 {
7102 struct spdk_bdev_io *parent_io = ctx;
7103 uint32_t matched_ios;
7104
7105 matched_ios = _bdev_abort(parent_io);
7106
7107 if (matched_ios == 0) {
7108 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
7109 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
7110 } else {
7111 /* For retry, the case that no target I/O was found is a success
7112 * because it means the target I/Os completed in the meantime.
7113 */
7114 bdev_io_complete(parent_io);
7115 }
7116 return;
7117 }
7118
7119 /* Use split_outstanding to manage the progress of aborting I/Os. */
7120 parent_io->internal.f.split = true;
7121 parent_io->internal.split.outstanding = matched_ios;
7122 }
7123
7124 static void
7125 bdev_abort(struct spdk_bdev_io *parent_io)
7126 {
7127 uint32_t matched_ios;
7128
7129 matched_ios = _bdev_abort(parent_io);
7130
7131 if (matched_ios == 0) {
7132 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
7133 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
7134 } else {
7135 /* The case where no target I/O was found is a failure.
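 * For this initial attempt it means the supplied bio_cb_arg did not match any I/O
 * that can currently be aborted on this channel, so the parent abort completes
 * with SPDK_BDEV_IO_STATUS_FAILED below.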
*/ 7136 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7137 bdev_io_complete(parent_io); 7138 } 7139 return; 7140 } 7141 7142 /* Use split_outstanding to manage the progress of aborting I/Os. */ 7143 parent_io->internal.f.split = true; 7144 parent_io->internal.split.outstanding = matched_ios; 7145 } 7146 7147 int 7148 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 7149 void *bio_cb_arg, 7150 spdk_bdev_io_completion_cb cb, void *cb_arg) 7151 { 7152 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7153 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7154 struct spdk_bdev_io *bdev_io; 7155 7156 if (bio_cb_arg == NULL) { 7157 return -EINVAL; 7158 } 7159 7160 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 7161 return -ENOTSUP; 7162 } 7163 7164 bdev_io = bdev_channel_get_io(channel); 7165 if (bdev_io == NULL) { 7166 return -ENOMEM; 7167 } 7168 7169 bdev_io->internal.ch = channel; 7170 bdev_io->internal.desc = desc; 7171 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7172 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7173 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7174 7175 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 7176 7177 /* Parent abort request is not submitted directly, but to manage its execution, 7178 * add it to the submitted list here. 7179 */ 7180 bdev_ch_add_to_io_submitted(bdev_io); 7181 7182 bdev_abort(bdev_io); 7183 7184 return 0; 7185 } 7186 7187 int 7188 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 7189 struct spdk_bdev_io_wait_entry *entry) 7190 { 7191 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7192 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 7193 7194 if (bdev != entry->bdev) { 7195 SPDK_ERRLOG("bdevs do not match\n"); 7196 return -EINVAL; 7197 } 7198 7199 if (mgmt_ch->per_thread_cache_count > 0) { 7200 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 7201 return -EINVAL; 7202 } 7203 7204 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 7205 return 0; 7206 } 7207 7208 static inline void 7209 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 7210 { 7211 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 7212 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 7213 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 7214 uint32_t blocklen = bdev_io->bdev->blocklen; 7215 7216 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7217 switch (bdev_io->type) { 7218 case SPDK_BDEV_IO_TYPE_READ: 7219 io_stat->bytes_read += num_blocks * blocklen; 7220 io_stat->num_read_ops++; 7221 io_stat->read_latency_ticks += tsc_diff; 7222 if (io_stat->max_read_latency_ticks < tsc_diff) { 7223 io_stat->max_read_latency_ticks = tsc_diff; 7224 } 7225 if (io_stat->min_read_latency_ticks > tsc_diff) { 7226 io_stat->min_read_latency_ticks = tsc_diff; 7227 } 7228 break; 7229 case SPDK_BDEV_IO_TYPE_WRITE: 7230 io_stat->bytes_written += num_blocks * blocklen; 7231 io_stat->num_write_ops++; 7232 io_stat->write_latency_ticks += tsc_diff; 7233 if (io_stat->max_write_latency_ticks < tsc_diff) { 7234 io_stat->max_write_latency_ticks = tsc_diff; 7235 } 7236 if (io_stat->min_write_latency_ticks > tsc_diff) { 7237 io_stat->min_write_latency_ticks = tsc_diff; 7238 } 7239 break; 7240 case SPDK_BDEV_IO_TYPE_UNMAP: 7241 io_stat->bytes_unmapped += num_blocks * blocklen; 7242 io_stat->num_unmap_ops++; 7243 io_stat->unmap_latency_ticks += tsc_diff; 7244 if 
(io_stat->max_unmap_latency_ticks < tsc_diff) { 7245 io_stat->max_unmap_latency_ticks = tsc_diff; 7246 } 7247 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 7248 io_stat->min_unmap_latency_ticks = tsc_diff; 7249 } 7250 break; 7251 case SPDK_BDEV_IO_TYPE_ZCOPY: 7252 /* Track the data in the start phase only */ 7253 if (bdev_io->u.bdev.zcopy.start) { 7254 if (bdev_io->u.bdev.zcopy.populate) { 7255 io_stat->bytes_read += num_blocks * blocklen; 7256 io_stat->num_read_ops++; 7257 io_stat->read_latency_ticks += tsc_diff; 7258 if (io_stat->max_read_latency_ticks < tsc_diff) { 7259 io_stat->max_read_latency_ticks = tsc_diff; 7260 } 7261 if (io_stat->min_read_latency_ticks > tsc_diff) { 7262 io_stat->min_read_latency_ticks = tsc_diff; 7263 } 7264 } else { 7265 io_stat->bytes_written += num_blocks * blocklen; 7266 io_stat->num_write_ops++; 7267 io_stat->write_latency_ticks += tsc_diff; 7268 if (io_stat->max_write_latency_ticks < tsc_diff) { 7269 io_stat->max_write_latency_ticks = tsc_diff; 7270 } 7271 if (io_stat->min_write_latency_ticks > tsc_diff) { 7272 io_stat->min_write_latency_ticks = tsc_diff; 7273 } 7274 } 7275 } 7276 break; 7277 case SPDK_BDEV_IO_TYPE_COPY: 7278 io_stat->bytes_copied += num_blocks * blocklen; 7279 io_stat->num_copy_ops++; 7280 bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; 7281 if (io_stat->max_copy_latency_ticks < tsc_diff) { 7282 io_stat->max_copy_latency_ticks = tsc_diff; 7283 } 7284 if (io_stat->min_copy_latency_ticks > tsc_diff) { 7285 io_stat->min_copy_latency_ticks = tsc_diff; 7286 } 7287 break; 7288 default: 7289 break; 7290 } 7291 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 7292 io_stat = bdev_io->bdev->internal.stat; 7293 assert(io_stat->io_error != NULL); 7294 7295 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 7296 io_stat->io_error->error_status[-io_status - 1]++; 7297 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 7298 } 7299 7300 #ifdef SPDK_CONFIG_VTUNE 7301 uint64_t now_tsc = spdk_get_ticks(); 7302 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 7303 uint64_t data[5]; 7304 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 7305 7306 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 7307 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 7308 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 7309 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 7310 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 
7311 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 7312 7313 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 7314 __itt_metadata_u64, 5, data); 7315 7316 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 7317 bdev_io->internal.ch->start_tsc = now_tsc; 7318 } 7319 #endif 7320 } 7321 7322 static inline void 7323 _bdev_io_complete(void *ctx) 7324 { 7325 struct spdk_bdev_io *bdev_io = ctx; 7326 7327 if (spdk_unlikely(bdev_io_use_accel_sequence(bdev_io))) { 7328 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7329 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 7330 } 7331 7332 assert(bdev_io->internal.cb != NULL); 7333 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 7334 7335 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 7336 bdev_io->internal.caller_ctx); 7337 } 7338 7339 static inline void 7340 bdev_io_complete(void *ctx) 7341 { 7342 struct spdk_bdev_io *bdev_io = ctx; 7343 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7344 uint64_t tsc, tsc_diff; 7345 7346 if (spdk_unlikely(bdev_io->internal.in_submit_request)) { 7347 /* 7348 * Defer completion to avoid potential infinite recursion if the 7349 * user's completion callback issues a new I/O. 7350 */ 7351 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7352 bdev_io_complete, bdev_io); 7353 return; 7354 } 7355 7356 tsc = spdk_get_ticks(); 7357 tsc_diff = tsc - bdev_io->internal.submit_tsc; 7358 7359 bdev_ch_remove_from_io_submitted(bdev_io); 7360 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, bdev_ch->trace_id, 0, (uintptr_t)bdev_io, 7361 bdev_io->internal.caller_ctx, bdev_ch->queue_depth); 7362 7363 if (bdev_ch->histogram) { 7364 if (bdev_io->bdev->internal.histogram_io_type == 0 || 7365 bdev_io->bdev->internal.histogram_io_type == bdev_io->type) { 7366 /* 7367 * Tally all I/O types if the histogram_io_type is set to 0. 7368 */ 7369 spdk_histogram_data_tally(bdev_ch->histogram, tsc_diff); 7370 } 7371 } 7372 7373 bdev_io_update_io_stat(bdev_io, tsc_diff); 7374 _bdev_io_complete(bdev_io); 7375 } 7376 7377 /* The difference between this function and bdev_io_complete() is that this should be called to 7378 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 7379 * io_submitted list and don't have submit_tsc updated. 7380 */ 7381 static inline void 7382 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 7383 { 7384 /* Since the IO hasn't been submitted it's bound to be failed */ 7385 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7386 7387 /* At this point we don't know if the IO is completed from submission context or not, but, 7388 * since this is an error path, we can always do an spdk_thread_send_msg(). 
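	 * Deferring through a message also guarantees the completion callback does not
	 * run until this submission path has fully unwound.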
*/ 7389 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7390 _bdev_io_complete, bdev_io); 7391 } 7392 7393 static void bdev_destroy_cb(void *io_device); 7394 7395 static void 7396 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7397 { 7398 struct spdk_bdev_io *bdev_io = _ctx; 7399 7400 if (bdev_io->u.reset.ch_ref != NULL) { 7401 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7402 bdev_io->u.reset.ch_ref = NULL; 7403 } 7404 7405 bdev_io_complete(bdev_io); 7406 7407 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7408 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7409 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7410 } 7411 } 7412 7413 static void 7414 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7415 struct spdk_io_channel *_ch, void *_ctx) 7416 { 7417 struct spdk_bdev_io *bdev_io = _ctx; 7418 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7419 struct spdk_bdev_io *queued_reset; 7420 7421 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7422 while (!TAILQ_EMPTY(&ch->queued_resets)) { 7423 queued_reset = TAILQ_FIRST(&ch->queued_resets); 7424 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 7425 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 7426 } 7427 7428 spdk_bdev_for_each_channel_continue(i, 0); 7429 } 7430 7431 static void 7432 bdev_io_complete_sequence_cb(void *ctx, int status) 7433 { 7434 struct spdk_bdev_io *bdev_io = ctx; 7435 7436 /* u.bdev.accel_sequence should have already been cleared at this point */ 7437 assert(bdev_io->u.bdev.accel_sequence == NULL); 7438 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7439 bdev_io->internal.f.has_accel_sequence = false; 7440 7441 if (spdk_unlikely(status != 0)) { 7442 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7443 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7444 } 7445 7446 bdev_io_complete(bdev_io); 7447 } 7448 7449 void 7450 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7451 { 7452 struct spdk_bdev *bdev = bdev_io->bdev; 7453 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7454 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7455 7456 if (spdk_unlikely(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING)) { 7457 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7458 spdk_bdev_get_module_name(bdev), 7459 bdev_io_status_get_string(bdev_io->internal.status)); 7460 assert(false); 7461 } 7462 bdev_io->internal.status = status; 7463 7464 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7465 bool unlock_channels = false; 7466 7467 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 7468 SPDK_ERRLOG("NOMEM returned for reset\n"); 7469 } 7470 spdk_spin_lock(&bdev->internal.spinlock); 7471 if (bdev_io == bdev->internal.reset_in_progress) { 7472 bdev->internal.reset_in_progress = NULL; 7473 unlock_channels = true; 7474 } 7475 spdk_spin_unlock(&bdev->internal.spinlock); 7476 7477 if (unlock_channels) { 7478 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 7479 bdev_reset_complete); 7480 return; 7481 } 7482 } else { 7483 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7484 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7485 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7486 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7487 return; 7488 } else if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 
0 && 7489 !bdev_io_use_accel_sequence(bdev_io))) { 7490 _bdev_io_push_bounce_data_buffer(bdev_io, 7491 _bdev_io_complete_push_bounce_done); 7492 /* bdev IO will be completed in the callback */ 7493 return; 7494 } 7495 } 7496 7497 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7498 return; 7499 } 7500 } 7501 7502 bdev_io_complete(bdev_io); 7503 } 7504 7505 void 7506 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 7507 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7508 { 7509 enum spdk_bdev_io_status status; 7510 7511 if (sc == SPDK_SCSI_STATUS_GOOD) { 7512 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7513 } else { 7514 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7515 bdev_io->internal.error.scsi.sc = sc; 7516 bdev_io->internal.error.scsi.sk = sk; 7517 bdev_io->internal.error.scsi.asc = asc; 7518 bdev_io->internal.error.scsi.ascq = ascq; 7519 } 7520 7521 spdk_bdev_io_complete(bdev_io, status); 7522 } 7523 7524 void 7525 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7526 int *sc, int *sk, int *asc, int *ascq) 7527 { 7528 assert(sc != NULL); 7529 assert(sk != NULL); 7530 assert(asc != NULL); 7531 assert(ascq != NULL); 7532 7533 switch (bdev_io->internal.status) { 7534 case SPDK_BDEV_IO_STATUS_SUCCESS: 7535 *sc = SPDK_SCSI_STATUS_GOOD; 7536 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7537 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7538 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7539 break; 7540 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7541 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7542 break; 7543 case SPDK_BDEV_IO_STATUS_MISCOMPARE: 7544 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7545 *sk = SPDK_SCSI_SENSE_MISCOMPARE; 7546 *asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION; 7547 *ascq = bdev_io->internal.error.scsi.ascq; 7548 break; 7549 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7550 *sc = bdev_io->internal.error.scsi.sc; 7551 *sk = bdev_io->internal.error.scsi.sk; 7552 *asc = bdev_io->internal.error.scsi.asc; 7553 *ascq = bdev_io->internal.error.scsi.ascq; 7554 break; 7555 default: 7556 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7557 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7558 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7559 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7560 break; 7561 } 7562 } 7563 7564 void 7565 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7566 { 7567 enum spdk_bdev_io_status status; 7568 7569 if (aio_result == 0) { 7570 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7571 } else { 7572 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7573 } 7574 7575 bdev_io->internal.error.aio_result = aio_result; 7576 7577 spdk_bdev_io_complete(bdev_io, status); 7578 } 7579 7580 void 7581 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7582 { 7583 assert(aio_result != NULL); 7584 7585 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7586 *aio_result = bdev_io->internal.error.aio_result; 7587 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7588 *aio_result = 0; 7589 } else { 7590 *aio_result = -EIO; 7591 } 7592 } 7593 7594 void 7595 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7596 { 7597 enum spdk_bdev_io_status status; 7598 7599 if (spdk_likely(sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS)) { 7600 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7601 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7602 status = 
SPDK_BDEV_IO_STATUS_ABORTED; 7603 } else { 7604 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7605 } 7606 7607 bdev_io->internal.error.nvme.cdw0 = cdw0; 7608 bdev_io->internal.error.nvme.sct = sct; 7609 bdev_io->internal.error.nvme.sc = sc; 7610 7611 spdk_bdev_io_complete(bdev_io, status); 7612 } 7613 7614 void 7615 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7616 { 7617 assert(sct != NULL); 7618 assert(sc != NULL); 7619 assert(cdw0 != NULL); 7620 7621 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7622 *sct = SPDK_NVME_SCT_GENERIC; 7623 *sc = SPDK_NVME_SC_SUCCESS; 7624 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7625 *cdw0 = 0; 7626 } else { 7627 *cdw0 = 1U; 7628 } 7629 return; 7630 } 7631 7632 if (spdk_likely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7633 *sct = SPDK_NVME_SCT_GENERIC; 7634 *sc = SPDK_NVME_SC_SUCCESS; 7635 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7636 *sct = bdev_io->internal.error.nvme.sct; 7637 *sc = bdev_io->internal.error.nvme.sc; 7638 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7639 *sct = SPDK_NVME_SCT_GENERIC; 7640 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7641 } else { 7642 *sct = SPDK_NVME_SCT_GENERIC; 7643 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7644 } 7645 7646 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7647 } 7648 7649 void 7650 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7651 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7652 { 7653 assert(first_sct != NULL); 7654 assert(first_sc != NULL); 7655 assert(second_sct != NULL); 7656 assert(second_sc != NULL); 7657 assert(cdw0 != NULL); 7658 7659 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7660 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7661 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7662 *first_sct = bdev_io->internal.error.nvme.sct; 7663 *first_sc = bdev_io->internal.error.nvme.sc; 7664 *second_sct = SPDK_NVME_SCT_GENERIC; 7665 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7666 } else { 7667 *first_sct = SPDK_NVME_SCT_GENERIC; 7668 *first_sc = SPDK_NVME_SC_SUCCESS; 7669 *second_sct = bdev_io->internal.error.nvme.sct; 7670 *second_sc = bdev_io->internal.error.nvme.sc; 7671 } 7672 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7673 *first_sct = SPDK_NVME_SCT_GENERIC; 7674 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7675 *second_sct = SPDK_NVME_SCT_GENERIC; 7676 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7677 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7678 *first_sct = SPDK_NVME_SCT_GENERIC; 7679 *first_sc = SPDK_NVME_SC_SUCCESS; 7680 *second_sct = SPDK_NVME_SCT_GENERIC; 7681 *second_sc = SPDK_NVME_SC_SUCCESS; 7682 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7683 *first_sct = SPDK_NVME_SCT_GENERIC; 7684 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7685 *second_sct = SPDK_NVME_SCT_GENERIC; 7686 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7687 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7688 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7689 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 7690 *second_sct = SPDK_NVME_SCT_GENERIC; 7691 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7692 } else { 7693 *first_sct = SPDK_NVME_SCT_GENERIC; 7694 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7695 *second_sct = 
SPDK_NVME_SCT_GENERIC; 7696 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7697 } 7698 7699 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7700 } 7701 7702 void 7703 spdk_bdev_io_complete_base_io_status(struct spdk_bdev_io *bdev_io, 7704 const struct spdk_bdev_io *base_io) 7705 { 7706 switch (base_io->internal.status) { 7707 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7708 spdk_bdev_io_complete_nvme_status(bdev_io, 7709 base_io->internal.error.nvme.cdw0, 7710 base_io->internal.error.nvme.sct, 7711 base_io->internal.error.nvme.sc); 7712 break; 7713 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7714 spdk_bdev_io_complete_scsi_status(bdev_io, 7715 base_io->internal.error.scsi.sc, 7716 base_io->internal.error.scsi.sk, 7717 base_io->internal.error.scsi.asc, 7718 base_io->internal.error.scsi.ascq); 7719 break; 7720 case SPDK_BDEV_IO_STATUS_AIO_ERROR: 7721 spdk_bdev_io_complete_aio_status(bdev_io, base_io->internal.error.aio_result); 7722 break; 7723 default: 7724 spdk_bdev_io_complete(bdev_io, base_io->internal.status); 7725 break; 7726 } 7727 } 7728 7729 struct spdk_thread * 7730 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7731 { 7732 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7733 } 7734 7735 struct spdk_io_channel * 7736 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7737 { 7738 return bdev_io->internal.ch->channel; 7739 } 7740 7741 static int 7742 bdev_register(struct spdk_bdev *bdev) 7743 { 7744 char *bdev_name; 7745 char uuid[SPDK_UUID_STRING_LEN]; 7746 struct spdk_iobuf_opts iobuf_opts; 7747 int ret; 7748 7749 assert(bdev->module != NULL); 7750 7751 if (!bdev->name) { 7752 SPDK_ERRLOG("Bdev name is NULL\n"); 7753 return -EINVAL; 7754 } 7755 7756 if (!strlen(bdev->name)) { 7757 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7758 return -EINVAL; 7759 } 7760 7761 /* Users often register their own I/O devices using the bdev name. In 7762 * order to avoid conflicts, prepend bdev_. */ 7763 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7764 if (!bdev_name) { 7765 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7766 return -ENOMEM; 7767 } 7768 7769 bdev->internal.stat = bdev_alloc_io_stat(true); 7770 if (!bdev->internal.stat) { 7771 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7772 free(bdev_name); 7773 return -ENOMEM; 7774 } 7775 7776 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7777 bdev->internal.measured_queue_depth = UINT64_MAX; 7778 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7779 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7780 bdev->internal.qd_poller = NULL; 7781 bdev->internal.qos = NULL; 7782 7783 TAILQ_INIT(&bdev->internal.open_descs); 7784 TAILQ_INIT(&bdev->internal.locked_ranges); 7785 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7786 TAILQ_INIT(&bdev->aliases); 7787 7788 /* UUID may be specified by the user or defined by bdev itself. 7789 * Otherwise it will be generated here, so this field will never be empty. 
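	 * As a result, every registered bdev can also be looked up by its UUID string,
	 * which is added as an alias below whenever it differs from the name.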
*/ 7790 if (spdk_uuid_is_null(&bdev->uuid)) { 7791 spdk_uuid_generate(&bdev->uuid); 7792 } 7793 7794 /* Add the UUID alias only if it's different than the name */ 7795 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7796 if (strcmp(bdev->name, uuid) != 0) { 7797 ret = spdk_bdev_alias_add(bdev, uuid); 7798 if (ret != 0) { 7799 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7800 bdev_free_io_stat(bdev->internal.stat); 7801 free(bdev_name); 7802 return ret; 7803 } 7804 } 7805 7806 spdk_iobuf_get_opts(&iobuf_opts, sizeof(iobuf_opts)); 7807 if (spdk_bdev_get_buf_align(bdev) > 1) { 7808 bdev->max_rw_size = spdk_min(bdev->max_rw_size ? bdev->max_rw_size : UINT32_MAX, 7809 iobuf_opts.large_bufsize / bdev->blocklen); 7810 } 7811 7812 /* If the user didn't specify a write unit size, set it to one. */ 7813 if (bdev->write_unit_size == 0) { 7814 bdev->write_unit_size = 1; 7815 } 7816 7817 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7818 if (bdev->acwu == 0) { 7819 bdev->acwu = bdev->write_unit_size; 7820 } 7821 7822 if (bdev->phys_blocklen == 0) { 7823 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7824 } 7825 7826 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 7827 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 7828 } 7829 7830 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 7831 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 7832 } 7833 7834 bdev->internal.reset_in_progress = NULL; 7835 bdev->internal.qd_poll_in_progress = false; 7836 bdev->internal.period = 0; 7837 bdev->internal.new_period = 0; 7838 bdev->internal.trace_id = spdk_trace_register_owner(OWNER_TYPE_BDEV, bdev_name); 7839 7840 /* 7841 * Initialize spinlock before registering IO device because spinlock is used in 7842 * bdev_channel_create 7843 */ 7844 spdk_spin_init(&bdev->internal.spinlock); 7845 7846 spdk_io_device_register(__bdev_to_io_dev(bdev), 7847 bdev_channel_create, bdev_channel_destroy, 7848 sizeof(struct spdk_bdev_channel), 7849 bdev_name); 7850 7851 /* 7852 * Register bdev name only after the bdev object is ready. 7853 * After bdev_name_add returns, it is possible for other threads to start using the bdev, 7854 * create IO channels... 
7855 */ 7856 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 7857 if (ret != 0) { 7858 spdk_io_device_unregister(__bdev_to_io_dev(bdev), NULL); 7859 bdev_free_io_stat(bdev->internal.stat); 7860 spdk_spin_destroy(&bdev->internal.spinlock); 7861 free(bdev_name); 7862 return ret; 7863 } 7864 7865 free(bdev_name); 7866 7867 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 7868 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 7869 7870 return 0; 7871 } 7872 7873 static void 7874 bdev_destroy_cb(void *io_device) 7875 { 7876 int rc; 7877 struct spdk_bdev *bdev; 7878 spdk_bdev_unregister_cb cb_fn; 7879 void *cb_arg; 7880 7881 bdev = __bdev_from_io_dev(io_device); 7882 7883 if (bdev->internal.unregister_td != spdk_get_thread()) { 7884 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 7885 return; 7886 } 7887 7888 cb_fn = bdev->internal.unregister_cb; 7889 cb_arg = bdev->internal.unregister_ctx; 7890 7891 spdk_spin_destroy(&bdev->internal.spinlock); 7892 free(bdev->internal.qos); 7893 bdev_free_io_stat(bdev->internal.stat); 7894 spdk_trace_unregister_owner(bdev->internal.trace_id); 7895 7896 rc = bdev->fn_table->destruct(bdev->ctxt); 7897 if (rc < 0) { 7898 SPDK_ERRLOG("destruct failed\n"); 7899 } 7900 if (rc <= 0 && cb_fn != NULL) { 7901 cb_fn(cb_arg, rc); 7902 } 7903 } 7904 7905 void 7906 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 7907 { 7908 if (bdev->internal.unregister_cb != NULL) { 7909 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 7910 } 7911 } 7912 7913 static void 7914 _remove_notify(void *arg) 7915 { 7916 struct spdk_bdev_desc *desc = arg; 7917 7918 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 7919 } 7920 7921 /* returns: 0 - bdev removed and ready to be destructed. 7922 * -EBUSY - bdev can't be destructed yet. */ 7923 static int 7924 bdev_unregister_unsafe(struct spdk_bdev *bdev) 7925 { 7926 struct spdk_bdev_desc *desc, *tmp; 7927 int rc = 0; 7928 char uuid[SPDK_UUID_STRING_LEN]; 7929 7930 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 7931 assert(spdk_spin_held(&bdev->internal.spinlock)); 7932 7933 /* Notify each descriptor about hotremoval */ 7934 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 7935 rc = -EBUSY; 7936 /* 7937 * Defer invocation of the event_cb to a separate message that will 7938 * run later on its thread. This ensures this context unwinds and 7939 * we don't recursively unregister this bdev again if the event_cb 7940 * immediately closes its descriptor. 7941 */ 7942 event_notify(desc, _remove_notify); 7943 } 7944 7945 /* If there are no descriptors, proceed removing the bdev */ 7946 if (rc == 0) { 7947 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 7948 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 7949 7950 /* Delete the name and the UUID alias */ 7951 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7952 bdev_name_del_unsafe(&bdev->internal.bdev_name); 7953 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 7954 7955 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 7956 7957 if (bdev->internal.reset_in_progress != NULL) { 7958 /* If reset is in progress, let the completion callback for reset 7959 * unregister the bdev. 
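			 * bdev_reset_complete() re-checks the REMOVING status and performs the
			 * deferred spdk_io_device_unregister() once the reset finishes.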
7960 */ 7961 rc = -EBUSY; 7962 } 7963 } 7964 7965 return rc; 7966 } 7967 7968 static void 7969 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7970 struct spdk_io_channel *io_ch, void *_ctx) 7971 { 7972 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 7973 7974 bdev_channel_abort_queued_ios(bdev_ch); 7975 spdk_bdev_for_each_channel_continue(i, 0); 7976 } 7977 7978 static void 7979 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 7980 { 7981 int rc; 7982 7983 spdk_spin_lock(&g_bdev_mgr.spinlock); 7984 spdk_spin_lock(&bdev->internal.spinlock); 7985 /* 7986 * Set the status to REMOVING after completing to abort channels. Otherwise, 7987 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 7988 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 7989 * may fail. 7990 */ 7991 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 7992 rc = bdev_unregister_unsafe(bdev); 7993 spdk_spin_unlock(&bdev->internal.spinlock); 7994 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7995 7996 if (rc == 0) { 7997 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7998 } 7999 } 8000 8001 void 8002 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 8003 { 8004 struct spdk_thread *thread; 8005 8006 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 8007 8008 thread = spdk_get_thread(); 8009 if (!thread) { 8010 /* The user called this from a non-SPDK thread. */ 8011 if (cb_fn != NULL) { 8012 cb_fn(cb_arg, -ENOTSUP); 8013 } 8014 return; 8015 } 8016 8017 spdk_spin_lock(&g_bdev_mgr.spinlock); 8018 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 8019 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8020 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8021 if (cb_fn) { 8022 cb_fn(cb_arg, -EBUSY); 8023 } 8024 return; 8025 } 8026 8027 spdk_spin_lock(&bdev->internal.spinlock); 8028 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 8029 bdev->internal.unregister_cb = cb_fn; 8030 bdev->internal.unregister_ctx = cb_arg; 8031 bdev->internal.unregister_td = thread; 8032 spdk_spin_unlock(&bdev->internal.spinlock); 8033 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8034 8035 spdk_bdev_set_qd_sampling_period(bdev, 0); 8036 8037 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 8038 bdev_unregister); 8039 } 8040 8041 int 8042 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 8043 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 8044 { 8045 struct spdk_bdev_desc *desc; 8046 struct spdk_bdev *bdev; 8047 int rc; 8048 8049 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 8050 if (rc != 0) { 8051 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 8052 return rc; 8053 } 8054 8055 bdev = spdk_bdev_desc_get_bdev(desc); 8056 8057 if (bdev->module != module) { 8058 spdk_bdev_close(desc); 8059 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 8060 bdev_name); 8061 return -ENODEV; 8062 } 8063 8064 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 8065 8066 spdk_bdev_close(desc); 8067 8068 return 0; 8069 } 8070 8071 static int 8072 bdev_start_qos(struct spdk_bdev *bdev) 8073 { 8074 struct set_qos_limit_ctx *ctx; 8075 8076 /* Enable QoS */ 8077 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 8078 ctx = calloc(1, sizeof(*ctx)); 8079 if (ctx == NULL) { 8080 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 8081 return -ENOMEM; 8082 } 8083 
ctx->bdev = bdev; 8084 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 8085 } 8086 8087 return 0; 8088 } 8089 8090 static void 8091 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 8092 struct spdk_bdev *bdev) 8093 { 8094 enum spdk_bdev_claim_type type; 8095 const char *typename, *modname; 8096 extern struct spdk_log_flag SPDK_LOG_bdev; 8097 8098 assert(spdk_spin_held(&bdev->internal.spinlock)); 8099 8100 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 8101 return; 8102 } 8103 8104 type = bdev->internal.claim_type; 8105 typename = spdk_bdev_claim_get_name(type); 8106 8107 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 8108 modname = bdev->internal.claim.v1.module->name; 8109 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 8110 bdev->name, detail, typename, modname); 8111 return; 8112 } 8113 8114 if (claim_type_is_v2(type)) { 8115 struct spdk_bdev_module_claim *claim; 8116 8117 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 8118 modname = claim->module->name; 8119 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 8120 bdev->name, detail, typename, modname); 8121 } 8122 return; 8123 } 8124 8125 assert(false); 8126 } 8127 8128 static int 8129 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 8130 { 8131 struct spdk_thread *thread; 8132 int rc = 0; 8133 8134 thread = spdk_get_thread(); 8135 if (!thread) { 8136 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 8137 return -ENOTSUP; 8138 } 8139 8140 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8141 spdk_get_thread()); 8142 8143 desc->bdev = bdev; 8144 desc->thread = thread; 8145 desc->write = write; 8146 8147 spdk_spin_lock(&bdev->internal.spinlock); 8148 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 8149 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8150 spdk_spin_unlock(&bdev->internal.spinlock); 8151 return -ENODEV; 8152 } 8153 8154 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8155 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8156 spdk_spin_unlock(&bdev->internal.spinlock); 8157 return -EPERM; 8158 } 8159 8160 rc = bdev_start_qos(bdev); 8161 if (rc != 0) { 8162 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 8163 spdk_spin_unlock(&bdev->internal.spinlock); 8164 return rc; 8165 } 8166 8167 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 8168 8169 spdk_spin_unlock(&bdev->internal.spinlock); 8170 8171 return 0; 8172 } 8173 8174 static int 8175 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 8176 struct spdk_bdev_desc **_desc) 8177 { 8178 struct spdk_bdev_desc *desc; 8179 unsigned int i; 8180 8181 desc = calloc(1, sizeof(*desc)); 8182 if (desc == NULL) { 8183 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 8184 return -ENOMEM; 8185 } 8186 8187 TAILQ_INIT(&desc->pending_media_events); 8188 TAILQ_INIT(&desc->free_media_events); 8189 8190 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 8191 desc->callback.event_fn = event_cb; 8192 desc->callback.ctx = event_ctx; 8193 spdk_spin_init(&desc->spinlock); 8194 8195 if (bdev->media_events) { 8196 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 8197 sizeof(*desc->media_events_buffer)); 8198 if (desc->media_events_buffer == NULL) { 8199 SPDK_ERRLOG("Failed to initialize media event pool\n"); 8200 
bdev_desc_free(desc); 8201 return -ENOMEM; 8202 } 8203 8204 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 8205 TAILQ_INSERT_TAIL(&desc->free_media_events, 8206 &desc->media_events_buffer[i], tailq); 8207 } 8208 } 8209 8210 if (bdev->fn_table->accel_sequence_supported != NULL) { 8211 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 8212 desc->accel_sequence_supported[i] = 8213 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 8214 (enum spdk_bdev_io_type)i); 8215 } 8216 } 8217 8218 *_desc = desc; 8219 8220 return 0; 8221 } 8222 8223 static int 8224 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8225 void *event_ctx, struct spdk_bdev_desc **_desc) 8226 { 8227 struct spdk_bdev_desc *desc; 8228 struct spdk_bdev *bdev; 8229 int rc; 8230 8231 bdev = bdev_get_by_name(bdev_name); 8232 8233 if (bdev == NULL) { 8234 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 8235 return -ENODEV; 8236 } 8237 8238 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 8239 if (rc != 0) { 8240 return rc; 8241 } 8242 8243 rc = bdev_open(bdev, write, desc); 8244 if (rc != 0) { 8245 bdev_desc_free(desc); 8246 desc = NULL; 8247 } 8248 8249 *_desc = desc; 8250 8251 return rc; 8252 } 8253 8254 int 8255 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8256 void *event_ctx, struct spdk_bdev_desc **_desc) 8257 { 8258 int rc; 8259 8260 if (event_cb == NULL) { 8261 SPDK_ERRLOG("Missing event callback function\n"); 8262 return -EINVAL; 8263 } 8264 8265 spdk_spin_lock(&g_bdev_mgr.spinlock); 8266 rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, _desc); 8267 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8268 8269 return rc; 8270 } 8271 8272 struct spdk_bdev_open_async_ctx { 8273 char *bdev_name; 8274 spdk_bdev_event_cb_t event_cb; 8275 void *event_ctx; 8276 bool write; 8277 int rc; 8278 spdk_bdev_open_async_cb_t cb_fn; 8279 void *cb_arg; 8280 struct spdk_bdev_desc *desc; 8281 struct spdk_bdev_open_async_opts opts; 8282 uint64_t start_ticks; 8283 struct spdk_thread *orig_thread; 8284 struct spdk_poller *poller; 8285 TAILQ_ENTRY(spdk_bdev_open_async_ctx) tailq; 8286 }; 8287 8288 static void 8289 bdev_open_async_done(void *arg) 8290 { 8291 struct spdk_bdev_open_async_ctx *ctx = arg; 8292 8293 ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg); 8294 8295 free(ctx->bdev_name); 8296 free(ctx); 8297 } 8298 8299 static void 8300 bdev_open_async_cancel(void *arg) 8301 { 8302 struct spdk_bdev_open_async_ctx *ctx = arg; 8303 8304 assert(ctx->rc == -ESHUTDOWN); 8305 8306 spdk_poller_unregister(&ctx->poller); 8307 8308 bdev_open_async_done(ctx); 8309 } 8310 8311 /* This is called when the bdev library finishes at shutdown. */ 8312 static void 8313 bdev_open_async_fini(void) 8314 { 8315 struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx; 8316 8317 spdk_spin_lock(&g_bdev_mgr.spinlock); 8318 TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) { 8319 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8320 /* 8321 * We have to move to ctx->orig_thread to unregister ctx->poller. 8322 * However, there is a chance that ctx->poller is executed before 8323 * message is executed, which could result in bdev_open_async_done() 8324 * being called twice. To avoid such race condition, set ctx->rc to 8325 * -ESHUTDOWN. 
8326 */ 8327 ctx->rc = -ESHUTDOWN; 8328 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx); 8329 } 8330 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8331 } 8332 8333 static int bdev_open_async(void *arg); 8334 8335 static void 8336 _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx) 8337 { 8338 uint64_t timeout_ticks; 8339 8340 if (ctx->rc == -ESHUTDOWN) { 8341 /* This context is being canceled. Do nothing. */ 8342 return; 8343 } 8344 8345 ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx, 8346 &ctx->desc); 8347 if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) { 8348 goto exit; 8349 } 8350 8351 timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull; 8352 if (spdk_get_ticks() >= timeout_ticks) { 8353 SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name); 8354 ctx->rc = -ETIMEDOUT; 8355 goto exit; 8356 } 8357 8358 return; 8359 8360 exit: 8361 spdk_poller_unregister(&ctx->poller); 8362 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8363 8364 /* Completion callback is processed after stack unwinding. */ 8365 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx); 8366 } 8367 8368 static int 8369 bdev_open_async(void *arg) 8370 { 8371 struct spdk_bdev_open_async_ctx *ctx = arg; 8372 8373 spdk_spin_lock(&g_bdev_mgr.spinlock); 8374 8375 _bdev_open_async(ctx); 8376 8377 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8378 8379 return SPDK_POLLER_BUSY; 8380 } 8381 8382 static void 8383 bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts, 8384 struct spdk_bdev_open_async_opts *opts_src, 8385 size_t size) 8386 { 8387 assert(opts); 8388 assert(opts_src); 8389 8390 opts->size = size; 8391 8392 #define SET_FIELD(field) \ 8393 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8394 opts->field = opts_src->field; \ 8395 } \ 8396 8397 SET_FIELD(timeout_ms); 8398 8399 /* Do not remove this statement, you should always update this statement when you adding a new field, 8400 * and do not forget to add the SET_FIELD statement for your added field. 
*/ 8401 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size"); 8402 8403 #undef SET_FIELD 8404 } 8405 8406 static void 8407 bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size) 8408 { 8409 assert(opts); 8410 8411 opts->size = size; 8412 8413 #define SET_FIELD(field, value) \ 8414 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8415 opts->field = value; \ 8416 } \ 8417 8418 SET_FIELD(timeout_ms, 0); 8419 8420 #undef SET_FIELD 8421 } 8422 8423 int 8424 spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8425 void *event_ctx, struct spdk_bdev_open_async_opts *opts, 8426 spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg) 8427 { 8428 struct spdk_bdev_open_async_ctx *ctx; 8429 8430 if (event_cb == NULL) { 8431 SPDK_ERRLOG("Missing event callback function\n"); 8432 return -EINVAL; 8433 } 8434 8435 if (open_cb == NULL) { 8436 SPDK_ERRLOG("Missing open callback function\n"); 8437 return -EINVAL; 8438 } 8439 8440 if (opts != NULL && opts->size == 0) { 8441 SPDK_ERRLOG("size in the options structure should not be zero\n"); 8442 return -EINVAL; 8443 } 8444 8445 ctx = calloc(1, sizeof(*ctx)); 8446 if (ctx == NULL) { 8447 SPDK_ERRLOG("Failed to allocate open context\n"); 8448 return -ENOMEM; 8449 } 8450 8451 ctx->bdev_name = strdup(bdev_name); 8452 if (ctx->bdev_name == NULL) { 8453 SPDK_ERRLOG("Failed to duplicate bdev_name\n"); 8454 free(ctx); 8455 return -ENOMEM; 8456 } 8457 8458 ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000); 8459 if (ctx->poller == NULL) { 8460 SPDK_ERRLOG("Failed to register bdev_open_async poller\n"); 8461 free(ctx->bdev_name); 8462 free(ctx); 8463 return -ENOMEM; 8464 } 8465 8466 ctx->cb_fn = open_cb; 8467 ctx->cb_arg = open_cb_arg; 8468 ctx->write = write; 8469 ctx->event_cb = event_cb; 8470 ctx->event_ctx = event_ctx; 8471 ctx->orig_thread = spdk_get_thread(); 8472 ctx->start_ticks = spdk_get_ticks(); 8473 8474 bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts)); 8475 if (opts != NULL) { 8476 bdev_open_async_opts_copy(&ctx->opts, opts, opts->size); 8477 } 8478 8479 spdk_spin_lock(&g_bdev_mgr.spinlock); 8480 8481 TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8482 _bdev_open_async(ctx); 8483 8484 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8485 8486 return 0; 8487 } 8488 8489 static void 8490 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 8491 { 8492 int rc; 8493 8494 spdk_spin_lock(&bdev->internal.spinlock); 8495 spdk_spin_lock(&desc->spinlock); 8496 8497 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 8498 8499 desc->closed = true; 8500 8501 if (desc->claim != NULL) { 8502 bdev_desc_release_claims(desc); 8503 } 8504 8505 if (0 == desc->refs) { 8506 spdk_spin_unlock(&desc->spinlock); 8507 bdev_desc_free(desc); 8508 } else { 8509 spdk_spin_unlock(&desc->spinlock); 8510 } 8511 8512 /* If no more descriptors, kill QoS channel */ 8513 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8514 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 8515 bdev->name, spdk_get_thread()); 8516 8517 if (bdev_qos_destroy(bdev)) { 8518 /* There isn't anything we can do to recover here. Just let the 8519 * old QoS poller keep running. The QoS handling won't change 8520 * cores when the user allocates a new channel, but it won't break. */ 8521 SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); 8522 } 8523 } 8524 8525 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8526 rc = bdev_unregister_unsafe(bdev); 8527 spdk_spin_unlock(&bdev->internal.spinlock); 8528 8529 if (rc == 0) { 8530 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8531 } 8532 } else { 8533 spdk_spin_unlock(&bdev->internal.spinlock); 8534 } 8535 } 8536 8537 void 8538 spdk_bdev_close(struct spdk_bdev_desc *desc) 8539 { 8540 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8541 8542 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8543 spdk_get_thread()); 8544 8545 assert(desc->thread == spdk_get_thread()); 8546 8547 spdk_poller_unregister(&desc->io_timeout_poller); 8548 8549 spdk_spin_lock(&g_bdev_mgr.spinlock); 8550 8551 bdev_close(bdev, desc); 8552 8553 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8554 } 8555 8556 static void 8557 bdev_register_finished(void *arg) 8558 { 8559 struct spdk_bdev_desc *desc = arg; 8560 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8561 8562 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 8563 8564 spdk_spin_lock(&g_bdev_mgr.spinlock); 8565 8566 bdev_close(bdev, desc); 8567 8568 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8569 } 8570 8571 int 8572 spdk_bdev_register(struct spdk_bdev *bdev) 8573 { 8574 struct spdk_bdev_desc *desc; 8575 struct spdk_thread *thread = spdk_get_thread(); 8576 int rc; 8577 8578 if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) { 8579 SPDK_ERRLOG("Cannot register bdev %s on thread %p (%s)\n", bdev->name, thread, 8580 thread ? spdk_thread_get_name(thread) : "null"); 8581 return -EINVAL; 8582 } 8583 8584 rc = bdev_register(bdev); 8585 if (rc != 0) { 8586 return rc; 8587 } 8588 8589 /* A descriptor is opened to prevent bdev deletion during examination */ 8590 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8591 if (rc != 0) { 8592 spdk_bdev_unregister(bdev, NULL, NULL); 8593 return rc; 8594 } 8595 8596 rc = bdev_open(bdev, false, desc); 8597 if (rc != 0) { 8598 bdev_desc_free(desc); 8599 spdk_bdev_unregister(bdev, NULL, NULL); 8600 return rc; 8601 } 8602 8603 /* Examine configuration before initializing I/O */ 8604 bdev_examine(bdev); 8605 8606 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 8607 if (rc != 0) { 8608 bdev_close(bdev, desc); 8609 spdk_bdev_unregister(bdev, NULL, NULL); 8610 } 8611 8612 return rc; 8613 } 8614 8615 int 8616 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 8617 struct spdk_bdev_module *module) 8618 { 8619 spdk_spin_lock(&bdev->internal.spinlock); 8620 8621 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8622 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8623 spdk_spin_unlock(&bdev->internal.spinlock); 8624 return -EPERM; 8625 } 8626 8627 if (desc && !desc->write) { 8628 desc->write = true; 8629 } 8630 8631 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 8632 bdev->internal.claim.v1.module = module; 8633 8634 spdk_spin_unlock(&bdev->internal.spinlock); 8635 return 0; 8636 } 8637 8638 void 8639 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 8640 { 8641 spdk_spin_lock(&bdev->internal.spinlock); 8642 8643 assert(bdev->internal.claim.v1.module != NULL); 8644 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 8645 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8646 bdev->internal.claim.v1.module = NULL; 8647 8648 
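	/* The v1 exclusive-write claim is now fully released; the bdev may be
	 * claimed again or opened for writing. */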
spdk_spin_unlock(&bdev->internal.spinlock); 8649 } 8650 8651 /* 8652 * Start claims v2 8653 */ 8654 8655 const char * 8656 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 8657 { 8658 switch (type) { 8659 case SPDK_BDEV_CLAIM_NONE: 8660 return "not_claimed"; 8661 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8662 return "exclusive_write"; 8663 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8664 return "read_many_write_one"; 8665 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8666 return "read_many_write_none"; 8667 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8668 return "read_many_write_many"; 8669 default: 8670 break; 8671 } 8672 return "invalid_claim"; 8673 } 8674 8675 static bool 8676 claim_type_is_v2(enum spdk_bdev_claim_type type) 8677 { 8678 switch (type) { 8679 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8680 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8681 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8682 return true; 8683 default: 8684 break; 8685 } 8686 return false; 8687 } 8688 8689 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. */ 8690 static bool 8691 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 8692 { 8693 switch (type) { 8694 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8695 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8696 return true; 8697 default: 8698 break; 8699 } 8700 return false; 8701 } 8702 8703 void 8704 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 8705 { 8706 if (opts == NULL) { 8707 SPDK_ERRLOG("opts should not be NULL\n"); 8708 assert(opts != NULL); 8709 return; 8710 } 8711 if (size == 0) { 8712 SPDK_ERRLOG("size should not be zero\n"); 8713 assert(size != 0); 8714 return; 8715 } 8716 8717 memset(opts, 0, size); 8718 opts->opts_size = size; 8719 8720 #define FIELD_OK(field) \ 8721 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 8722 8723 #define SET_FIELD(field, value) \ 8724 if (FIELD_OK(field)) { \ 8725 opts->field = value; \ 8726 } \ 8727 8728 SET_FIELD(shared_claim_key, 0); 8729 8730 #undef FIELD_OK 8731 #undef SET_FIELD 8732 } 8733 8734 static int 8735 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 8736 { 8737 if (src->opts_size == 0) { 8738 SPDK_ERRLOG("size should not be zero\n"); 8739 return -1; 8740 } 8741 8742 memset(dst, 0, sizeof(*dst)); 8743 dst->opts_size = src->opts_size; 8744 8745 #define FIELD_OK(field) \ 8746 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 8747 8748 #define SET_FIELD(field) \ 8749 if (FIELD_OK(field)) { \ 8750 dst->field = src->field; \ 8751 } \ 8752 8753 if (FIELD_OK(name)) { 8754 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 8755 } 8756 8757 SET_FIELD(shared_claim_key); 8758 8759 /* You should not remove this statement, but need to update the assert statement 8760 * if you add a new field, and also add a corresponding SET_FIELD statement */ 8761 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 8762 8763 #undef FIELD_OK 8764 #undef SET_FIELD 8765 return 0; 8766 } 8767 8768 /* Returns 0 if a read-write-once claim can be taken. 
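 *
 * With this claim type the claiming descriptor becomes the only writable
 * descriptor; all other open descriptors must be read-only.
 *
 * Illustrative sketch only: the open descriptor "desc" and the module
 * "my_module" are hypothetical and not defined in this file. A module could
 * request such a claim roughly as follows:
 *
 *	struct spdk_bdev_claim_opts opts;
 *	int rc;
 *
 *	spdk_bdev_claim_opts_init(&opts, sizeof(opts));
 *	snprintf(opts.name, sizeof(opts.name), "my_claim");
 *	rc = spdk_bdev_module_claim_bdev_desc(desc, SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE,
 *					      &opts, &my_module);
 *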
 */
static int
claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_desc *open_desc;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE);

	if (opts->shared_claim_key != 0) {
		SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n",
			    bdev->name);
		return -EINVAL;
	}
	if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
		return -EPERM;
	}
	if (desc->claim != NULL) {
		SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n",
			       bdev->name, desc->claim->module->name);
		return -EPERM;
	}
	TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
		if (desc != open_desc && open_desc->write) {
			SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while "
				       "another descriptor is open for writing\n",
				       bdev->name);
			return -EPERM;
		}
	}

	return 0;
}

/* Returns 0 if a read-only-many claim can be taken. */
static int
claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_desc *open_desc;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE);
	assert(desc->claim == NULL);

	if (desc->write) {
		SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n",
			    bdev->name);
		return -EINVAL;
	}
	if (opts->shared_claim_key != 0) {
		SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name);
		return -EINVAL;
	}
	if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
			if (open_desc->write) {
				SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while "
					       "another descriptor is open for writing\n",
					       bdev->name);
				return -EPERM;
			}
		}
	}

	return 0;
}

/* Returns 0 if a read-write-many claim can be taken. */
static int
claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_desc *open_desc;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED);
	assert(desc->claim == NULL);

	if (opts->shared_claim_key == 0) {
		SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n",
			    bdev->name);
		return -EINVAL;
	}
	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
			if (open_desc == desc) {
				continue;
			}
			if (open_desc->write) {
				SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while "
					       "another descriptor is open for writing without a "
					       "claim\n", bdev->name);
				return -EPERM;
			}
		}
		break;
	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
		if (opts->shared_claim_key != bdev->internal.claim.v2.key) {
			LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev);
			return -EPERM;
		}
		break;
	default:
		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
		return -EBUSY;
	}

	return 0;
}

/* Updates desc and its bdev with a v2 claim. */
static int
claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
	   struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_module_claim *claim;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(claim_type_is_v2(type));
	assert(desc->claim == NULL);

	claim = calloc(1, sizeof(*desc->claim));
	if (claim == NULL) {
		SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name);
		return -ENOMEM;
	}
	claim->module = module;
	claim->desc = desc;
	SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match");
	memcpy(claim->name, opts->name, sizeof(claim->name));
	desc->claim = claim;

	if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
		bdev->internal.claim_type = type;
		TAILQ_INIT(&bdev->internal.claim.v2.claims);
		bdev->internal.claim.v2.key = opts->shared_claim_key;
	}
	assert(type == bdev->internal.claim_type);

	TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link);

	if (!desc->write && claim_type_promotes_to_write(type)) {
		desc->write = true;
	}

	return 0;
}

int
spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
				 struct spdk_bdev_claim_opts *_opts,
				 struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev;
	struct spdk_bdev_claim_opts opts;
	int rc = 0;

	if (desc == NULL) {
		SPDK_ERRLOG("descriptor must not be NULL\n");
		return -EINVAL;
	}

	bdev = desc->bdev;

	if (_opts == NULL) {
		spdk_bdev_claim_opts_init(&opts, sizeof(opts));
	} else if (claim_opts_copy(_opts, &opts) != 0) {
		return -EINVAL;
	}

	spdk_spin_lock(&bdev->internal.spinlock);

	if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE &&
	    bdev->internal.claim_type != type) {
		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
		spdk_spin_unlock(&bdev->internal.spinlock);
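		/* A claim of a different type is already held on this bdev; claim types
		 * cannot be mixed. */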
8952 return -EPERM; 8953 } 8954 8955 if (claim_type_is_v2(type) && desc->claim != NULL) { 8956 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 8957 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 8958 spdk_spin_unlock(&bdev->internal.spinlock); 8959 return -EPERM; 8960 } 8961 8962 switch (type) { 8963 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8964 spdk_spin_unlock(&bdev->internal.spinlock); 8965 return spdk_bdev_module_claim_bdev(bdev, desc, module); 8966 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8967 rc = claim_verify_rwo(desc, type, &opts, module); 8968 break; 8969 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8970 rc = claim_verify_rom(desc, type, &opts, module); 8971 break; 8972 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8973 rc = claim_verify_rwm(desc, type, &opts, module); 8974 break; 8975 default: 8976 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 8977 rc = -ENOTSUP; 8978 } 8979 8980 if (rc == 0) { 8981 rc = claim_bdev(desc, type, &opts, module); 8982 } 8983 8984 spdk_spin_unlock(&bdev->internal.spinlock); 8985 return rc; 8986 } 8987 8988 static void 8989 claim_reset(struct spdk_bdev *bdev) 8990 { 8991 assert(spdk_spin_held(&bdev->internal.spinlock)); 8992 assert(claim_type_is_v2(bdev->internal.claim_type)); 8993 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 8994 8995 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 8996 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8997 } 8998 8999 static void 9000 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 9001 { 9002 struct spdk_bdev *bdev = desc->bdev; 9003 9004 assert(spdk_spin_held(&bdev->internal.spinlock)); 9005 assert(claim_type_is_v2(bdev->internal.claim_type)); 9006 9007 if (bdev->internal.examine_in_progress == 0) { 9008 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 9009 free(desc->claim); 9010 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 9011 claim_reset(bdev); 9012 } 9013 } else { 9014 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 9015 desc->claim->module = NULL; 9016 desc->claim->desc = NULL; 9017 } 9018 desc->claim = NULL; 9019 } 9020 9021 /* 9022 * End claims v2 9023 */ 9024 9025 struct spdk_bdev * 9026 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 9027 { 9028 assert(desc != NULL); 9029 return desc->bdev; 9030 } 9031 9032 int 9033 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 9034 { 9035 struct spdk_bdev *bdev, *tmp; 9036 struct spdk_bdev_desc *desc; 9037 int rc = 0; 9038 9039 assert(fn != NULL); 9040 9041 spdk_spin_lock(&g_bdev_mgr.spinlock); 9042 bdev = spdk_bdev_first(); 9043 while (bdev != NULL) { 9044 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 9045 if (rc != 0) { 9046 break; 9047 } 9048 rc = bdev_open(bdev, false, desc); 9049 if (rc != 0) { 9050 bdev_desc_free(desc); 9051 if (rc == -ENODEV) { 9052 /* Ignore the error and move to the next bdev. 
*/ 9053 rc = 0; 9054 bdev = spdk_bdev_next(bdev); 9055 continue; 9056 } 9057 break; 9058 } 9059 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9060 9061 rc = fn(ctx, bdev); 9062 9063 spdk_spin_lock(&g_bdev_mgr.spinlock); 9064 tmp = spdk_bdev_next(bdev); 9065 bdev_close(bdev, desc); 9066 if (rc != 0) { 9067 break; 9068 } 9069 bdev = tmp; 9070 } 9071 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9072 9073 return rc; 9074 } 9075 9076 int 9077 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 9078 { 9079 struct spdk_bdev *bdev, *tmp; 9080 struct spdk_bdev_desc *desc; 9081 int rc = 0; 9082 9083 assert(fn != NULL); 9084 9085 spdk_spin_lock(&g_bdev_mgr.spinlock); 9086 bdev = spdk_bdev_first_leaf(); 9087 while (bdev != NULL) { 9088 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 9089 if (rc != 0) { 9090 break; 9091 } 9092 rc = bdev_open(bdev, false, desc); 9093 if (rc != 0) { 9094 bdev_desc_free(desc); 9095 if (rc == -ENODEV) { 9096 /* Ignore the error and move to the next bdev. */ 9097 rc = 0; 9098 bdev = spdk_bdev_next_leaf(bdev); 9099 continue; 9100 } 9101 break; 9102 } 9103 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9104 9105 rc = fn(ctx, bdev); 9106 9107 spdk_spin_lock(&g_bdev_mgr.spinlock); 9108 tmp = spdk_bdev_next_leaf(bdev); 9109 bdev_close(bdev, desc); 9110 if (rc != 0) { 9111 break; 9112 } 9113 bdev = tmp; 9114 } 9115 spdk_spin_unlock(&g_bdev_mgr.spinlock); 9116 9117 return rc; 9118 } 9119 9120 void 9121 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 9122 { 9123 struct iovec *iovs; 9124 int iovcnt; 9125 9126 if (bdev_io == NULL) { 9127 return; 9128 } 9129 9130 switch (bdev_io->type) { 9131 case SPDK_BDEV_IO_TYPE_READ: 9132 case SPDK_BDEV_IO_TYPE_WRITE: 9133 case SPDK_BDEV_IO_TYPE_ZCOPY: 9134 iovs = bdev_io->u.bdev.iovs; 9135 iovcnt = bdev_io->u.bdev.iovcnt; 9136 break; 9137 default: 9138 iovs = NULL; 9139 iovcnt = 0; 9140 break; 9141 } 9142 9143 if (iovp) { 9144 *iovp = iovs; 9145 } 9146 if (iovcntp) { 9147 *iovcntp = iovcnt; 9148 } 9149 } 9150 9151 void * 9152 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 9153 { 9154 if (bdev_io == NULL) { 9155 return NULL; 9156 } 9157 9158 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 9159 return NULL; 9160 } 9161 9162 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 9163 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 9164 return bdev_io->u.bdev.md_buf; 9165 } 9166 9167 return NULL; 9168 } 9169 9170 void * 9171 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 9172 { 9173 if (bdev_io == NULL) { 9174 assert(false); 9175 return NULL; 9176 } 9177 9178 return bdev_io->internal.caller_ctx; 9179 } 9180 9181 void 9182 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 9183 { 9184 9185 if (spdk_bdev_module_list_find(bdev_module->name)) { 9186 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 9187 assert(false); 9188 } 9189 9190 spdk_spin_init(&bdev_module->internal.spinlock); 9191 TAILQ_INIT(&bdev_module->internal.quiesced_ranges); 9192 9193 /* 9194 * Modules with examine callbacks must be initialized first, so they are 9195 * ready to handle examine callbacks from later modules that will 9196 * register physical bdevs. 
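	 * That is why such modules are inserted at the head of
	 * g_bdev_mgr.bdev_modules below.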
9197 */ 9198 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 9199 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9200 } else { 9201 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9202 } 9203 } 9204 9205 struct spdk_bdev_module * 9206 spdk_bdev_module_list_find(const char *name) 9207 { 9208 struct spdk_bdev_module *bdev_module; 9209 9210 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 9211 if (strcmp(name, bdev_module->name) == 0) { 9212 break; 9213 } 9214 } 9215 9216 return bdev_module; 9217 } 9218 9219 static int 9220 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 9221 { 9222 uint64_t num_blocks; 9223 void *md_buf = NULL; 9224 9225 num_blocks = bdev_io->u.bdev.num_blocks; 9226 9227 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 9228 md_buf = (char *)g_bdev_mgr.zero_buffer + 9229 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 9230 } 9231 9232 return bdev_write_blocks_with_md(bdev_io->internal.desc, 9233 spdk_io_channel_from_ctx(bdev_io->internal.ch), 9234 g_bdev_mgr.zero_buffer, md_buf, 9235 bdev_io->u.bdev.offset_blocks, num_blocks, 9236 bdev_write_zero_buffer_done, bdev_io); 9237 } 9238 9239 static void 9240 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9241 { 9242 struct spdk_bdev_io *parent_io = cb_arg; 9243 9244 spdk_bdev_free_io(bdev_io); 9245 9246 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 9247 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 9248 } 9249 9250 static void 9251 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 9252 { 9253 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9254 ctx->bdev->internal.qos_mod_in_progress = false; 9255 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9256 9257 if (ctx->cb_fn) { 9258 ctx->cb_fn(ctx->cb_arg, status); 9259 } 9260 free(ctx); 9261 } 9262 9263 static void 9264 bdev_disable_qos_done(void *cb_arg) 9265 { 9266 struct set_qos_limit_ctx *ctx = cb_arg; 9267 struct spdk_bdev *bdev = ctx->bdev; 9268 struct spdk_bdev_qos *qos; 9269 9270 spdk_spin_lock(&bdev->internal.spinlock); 9271 qos = bdev->internal.qos; 9272 bdev->internal.qos = NULL; 9273 spdk_spin_unlock(&bdev->internal.spinlock); 9274 9275 if (qos->thread != NULL) { 9276 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 9277 spdk_poller_unregister(&qos->poller); 9278 } 9279 9280 free(qos); 9281 9282 bdev_set_qos_limit_done(ctx, 0); 9283 } 9284 9285 static void 9286 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 9287 { 9288 struct set_qos_limit_ctx *ctx = _ctx; 9289 struct spdk_thread *thread; 9290 9291 spdk_spin_lock(&bdev->internal.spinlock); 9292 thread = bdev->internal.qos->thread; 9293 spdk_spin_unlock(&bdev->internal.spinlock); 9294 9295 if (thread != NULL) { 9296 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 9297 } else { 9298 bdev_disable_qos_done(ctx); 9299 } 9300 } 9301 9302 static void 9303 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9304 struct spdk_io_channel *ch, void *_ctx) 9305 { 9306 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9307 struct spdk_bdev_io *bdev_io; 9308 9309 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 9310 9311 while (!TAILQ_EMPTY(&bdev_ch->qos_queued_io)) { 9312 /* Re-submit the queued I/O. 
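 * BDEV_CH_QOS_ENABLED was cleared above, so _bdev_io_submit() now takes
 * the regular submission path instead of re-queueing the I/O for QoS.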
*/ 9313 bdev_io = TAILQ_FIRST(&bdev_ch->qos_queued_io); 9314 TAILQ_REMOVE(&bdev_ch->qos_queued_io, bdev_io, internal.link); 9315 _bdev_io_submit(bdev_io); 9316 } 9317 9318 spdk_bdev_for_each_channel_continue(i, 0); 9319 } 9320 9321 static void 9322 bdev_update_qos_rate_limit_msg(void *cb_arg) 9323 { 9324 struct set_qos_limit_ctx *ctx = cb_arg; 9325 struct spdk_bdev *bdev = ctx->bdev; 9326 9327 spdk_spin_lock(&bdev->internal.spinlock); 9328 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 9329 spdk_spin_unlock(&bdev->internal.spinlock); 9330 9331 bdev_set_qos_limit_done(ctx, 0); 9332 } 9333 9334 static void 9335 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9336 struct spdk_io_channel *ch, void *_ctx) 9337 { 9338 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9339 9340 spdk_spin_lock(&bdev->internal.spinlock); 9341 bdev_enable_qos(bdev, bdev_ch); 9342 spdk_spin_unlock(&bdev->internal.spinlock); 9343 spdk_bdev_for_each_channel_continue(i, 0); 9344 } 9345 9346 static void 9347 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 9348 { 9349 struct set_qos_limit_ctx *ctx = _ctx; 9350 9351 bdev_set_qos_limit_done(ctx, status); 9352 } 9353 9354 static void 9355 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 9356 { 9357 int i; 9358 9359 assert(bdev->internal.qos != NULL); 9360 9361 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9362 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9363 bdev->internal.qos->rate_limits[i].limit = limits[i]; 9364 9365 if (limits[i] == 0) { 9366 bdev->internal.qos->rate_limits[i].limit = 9367 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 9368 } 9369 } 9370 } 9371 } 9372 9373 void 9374 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 9375 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 9376 { 9377 struct set_qos_limit_ctx *ctx; 9378 uint32_t limit_set_complement; 9379 uint64_t min_limit_per_sec; 9380 int i; 9381 bool disable_rate_limit = true; 9382 9383 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9384 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9385 continue; 9386 } 9387 9388 if (limits[i] > 0) { 9389 disable_rate_limit = false; 9390 } 9391 9392 if (bdev_qos_is_iops_rate_limit(i) == true) { 9393 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 9394 } else { 9395 if (limits[i] > SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC) { 9396 SPDK_WARNLOG("Requested rate limit %" PRIu64 " will result in uint64_t overflow, " 9397 "reset to %" PRIu64 "\n", limits[i], SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC); 9398 limits[i] = SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC; 9399 } 9400 /* Change from megabyte to byte rate limit */ 9401 limits[i] = limits[i] * 1024 * 1024; 9402 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 9403 } 9404 9405 limit_set_complement = limits[i] % min_limit_per_sec; 9406 if (limit_set_complement) { 9407 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 9408 limits[i], min_limit_per_sec); 9409 limits[i] += min_limit_per_sec - limit_set_complement; 9410 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 9411 } 9412 } 9413 9414 ctx = calloc(1, sizeof(*ctx)); 9415 if (ctx == NULL) { 9416 cb_fn(cb_arg, -ENOMEM); 9417 return; 9418 } 9419 9420 ctx->cb_fn = cb_fn; 9421 ctx->cb_arg = cb_arg; 9422 ctx->bdev = bdev; 9423 9424 spdk_spin_lock(&bdev->internal.spinlock); 9425 if (bdev->internal.qos_mod_in_progress) { 9426 spdk_spin_unlock(&bdev->internal.spinlock); 9427 free(ctx); 9428 cb_fn(cb_arg, 
-EAGAIN); 9429 return; 9430 } 9431 bdev->internal.qos_mod_in_progress = true; 9432 9433 if (disable_rate_limit == true && bdev->internal.qos) { 9434 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9435 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 9436 (bdev->internal.qos->rate_limits[i].limit > 0 && 9437 bdev->internal.qos->rate_limits[i].limit != 9438 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 9439 disable_rate_limit = false; 9440 break; 9441 } 9442 } 9443 } 9444 9445 if (disable_rate_limit == false) { 9446 if (bdev->internal.qos == NULL) { 9447 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 9448 if (!bdev->internal.qos) { 9449 spdk_spin_unlock(&bdev->internal.spinlock); 9450 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 9451 bdev_set_qos_limit_done(ctx, -ENOMEM); 9452 return; 9453 } 9454 } 9455 9456 if (bdev->internal.qos->thread == NULL) { 9457 /* Enabling */ 9458 bdev_set_qos_rate_limits(bdev, limits); 9459 9460 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 9461 bdev_enable_qos_done); 9462 } else { 9463 /* Updating */ 9464 bdev_set_qos_rate_limits(bdev, limits); 9465 9466 spdk_thread_send_msg(bdev->internal.qos->thread, 9467 bdev_update_qos_rate_limit_msg, ctx); 9468 } 9469 } else { 9470 if (bdev->internal.qos != NULL) { 9471 bdev_set_qos_rate_limits(bdev, limits); 9472 9473 /* Disabling */ 9474 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 9475 bdev_disable_qos_msg_done); 9476 } else { 9477 spdk_spin_unlock(&bdev->internal.spinlock); 9478 bdev_set_qos_limit_done(ctx, 0); 9479 return; 9480 } 9481 } 9482 9483 spdk_spin_unlock(&bdev->internal.spinlock); 9484 } 9485 9486 struct spdk_bdev_histogram_ctx { 9487 spdk_bdev_histogram_status_cb cb_fn; 9488 void *cb_arg; 9489 struct spdk_bdev *bdev; 9490 int status; 9491 }; 9492 9493 static void 9494 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9495 { 9496 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9497 9498 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9499 ctx->bdev->internal.histogram_in_progress = false; 9500 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9501 ctx->cb_fn(ctx->cb_arg, ctx->status); 9502 free(ctx); 9503 } 9504 9505 static void 9506 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9507 struct spdk_io_channel *_ch, void *_ctx) 9508 { 9509 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9510 9511 if (ch->histogram != NULL) { 9512 spdk_histogram_data_free(ch->histogram); 9513 ch->histogram = NULL; 9514 } 9515 spdk_bdev_for_each_channel_continue(i, 0); 9516 } 9517 9518 static void 9519 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9520 { 9521 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9522 9523 if (status != 0) { 9524 ctx->status = status; 9525 ctx->bdev->internal.histogram_enabled = false; 9526 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 9527 bdev_histogram_disable_channel_cb); 9528 } else { 9529 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9530 ctx->bdev->internal.histogram_in_progress = false; 9531 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9532 ctx->cb_fn(ctx->cb_arg, ctx->status); 9533 free(ctx); 9534 } 9535 } 9536 9537 static void 9538 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9539 struct spdk_io_channel *_ch, void *_ctx) 9540 { 9541 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9542 int status = 0; 9543 9544 if (ch->histogram == NULL) { 9545 
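		/* Allocate the per-channel histogram lazily. If the allocation fails,
		 * the -ENOMEM status propagates to bdev_histogram_enable_channel_cb(),
		 * which rolls the enable back across all channels.
		 */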
ch->histogram = spdk_histogram_data_alloc(); 9546 if (ch->histogram == NULL) { 9547 status = -ENOMEM; 9548 } 9549 } 9550 9551 spdk_bdev_for_each_channel_continue(i, status); 9552 } 9553 9554 void 9555 spdk_bdev_histogram_enable_ext(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9556 void *cb_arg, bool enable, struct spdk_bdev_enable_histogram_opts *opts) 9557 { 9558 struct spdk_bdev_histogram_ctx *ctx; 9559 9560 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 9561 if (ctx == NULL) { 9562 cb_fn(cb_arg, -ENOMEM); 9563 return; 9564 } 9565 9566 ctx->bdev = bdev; 9567 ctx->status = 0; 9568 ctx->cb_fn = cb_fn; 9569 ctx->cb_arg = cb_arg; 9570 9571 spdk_spin_lock(&bdev->internal.spinlock); 9572 if (bdev->internal.histogram_in_progress) { 9573 spdk_spin_unlock(&bdev->internal.spinlock); 9574 free(ctx); 9575 cb_fn(cb_arg, -EAGAIN); 9576 return; 9577 } 9578 9579 bdev->internal.histogram_in_progress = true; 9580 spdk_spin_unlock(&bdev->internal.spinlock); 9581 9582 bdev->internal.histogram_enabled = enable; 9583 bdev->internal.histogram_io_type = opts->io_type; 9584 9585 if (enable) { 9586 /* Allocate histogram for each channel */ 9587 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 9588 bdev_histogram_enable_channel_cb); 9589 } else { 9590 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 9591 bdev_histogram_disable_channel_cb); 9592 } 9593 } 9594 9595 void 9596 spdk_bdev_enable_histogram_opts_init(struct spdk_bdev_enable_histogram_opts *opts, size_t size) 9597 { 9598 if (opts == NULL) { 9599 SPDK_ERRLOG("opts should not be NULL\n"); 9600 assert(opts != NULL); 9601 return; 9602 } 9603 if (size == 0) { 9604 SPDK_ERRLOG("size should not be zero\n"); 9605 assert(size != 0); 9606 return; 9607 } 9608 9609 memset(opts, 0, size); 9610 opts->size = size; 9611 9612 #define FIELD_OK(field) \ 9613 offsetof(struct spdk_bdev_enable_histogram_opts, field) + sizeof(opts->field) <= size 9614 9615 #define SET_FIELD(field, value) \ 9616 if (FIELD_OK(field)) { \ 9617 opts->field = value; \ 9618 } \ 9619 9620 SET_FIELD(io_type, 0); 9621 9622 /* You should not remove this statement, but need to update the assert statement 9623 * if you add a new field, and also add a corresponding SET_FIELD statement */ 9624 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_enable_histogram_opts) == 9, "Incorrect size"); 9625 9626 #undef FIELD_OK 9627 #undef SET_FIELD 9628 } 9629 9630 void 9631 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9632 void *cb_arg, bool enable) 9633 { 9634 struct spdk_bdev_enable_histogram_opts opts; 9635 9636 spdk_bdev_enable_histogram_opts_init(&opts, sizeof(opts)); 9637 spdk_bdev_histogram_enable_ext(bdev, cb_fn, cb_arg, enable, &opts); 9638 } 9639 9640 struct spdk_bdev_histogram_data_ctx { 9641 spdk_bdev_histogram_data_cb cb_fn; 9642 void *cb_arg; 9643 struct spdk_bdev *bdev; 9644 /** merged histogram data from all channels */ 9645 struct spdk_histogram_data *histogram; 9646 }; 9647 9648 static void 9649 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9650 { 9651 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9652 9653 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 9654 free(ctx); 9655 } 9656 9657 static void 9658 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9659 struct spdk_io_channel *_ch, void *_ctx) 9660 { 9661 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9662 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9663 int 
status = 0; 9664 9665 if (ch->histogram == NULL) { 9666 status = -EFAULT; 9667 } else { 9668 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 9669 } 9670 9671 spdk_bdev_for_each_channel_continue(i, status); 9672 } 9673 9674 void 9675 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 9676 spdk_bdev_histogram_data_cb cb_fn, 9677 void *cb_arg) 9678 { 9679 struct spdk_bdev_histogram_data_ctx *ctx; 9680 9681 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 9682 if (ctx == NULL) { 9683 cb_fn(cb_arg, -ENOMEM, NULL); 9684 return; 9685 } 9686 9687 ctx->bdev = bdev; 9688 ctx->cb_fn = cb_fn; 9689 ctx->cb_arg = cb_arg; 9690 9691 ctx->histogram = histogram; 9692 9693 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 9694 bdev_histogram_get_channel_cb); 9695 } 9696 9697 void 9698 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 9699 void *cb_arg) 9700 { 9701 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9702 int status = 0; 9703 9704 assert(cb_fn != NULL); 9705 9706 if (bdev_ch->histogram == NULL) { 9707 status = -EFAULT; 9708 } 9709 cb_fn(cb_arg, status, bdev_ch->histogram); 9710 } 9711 9712 size_t 9713 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 9714 size_t max_events) 9715 { 9716 struct media_event_entry *entry; 9717 size_t num_events = 0; 9718 9719 for (; num_events < max_events; ++num_events) { 9720 entry = TAILQ_FIRST(&desc->pending_media_events); 9721 if (entry == NULL) { 9722 break; 9723 } 9724 9725 events[num_events] = entry->event; 9726 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 9727 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 9728 } 9729 9730 return num_events; 9731 } 9732 9733 int 9734 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 9735 size_t num_events) 9736 { 9737 struct spdk_bdev_desc *desc; 9738 struct media_event_entry *entry; 9739 size_t event_id; 9740 int rc = 0; 9741 9742 assert(bdev->media_events); 9743 9744 spdk_spin_lock(&bdev->internal.spinlock); 9745 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9746 if (desc->write) { 9747 break; 9748 } 9749 } 9750 9751 if (desc == NULL || desc->media_events_buffer == NULL) { 9752 rc = -ENODEV; 9753 goto out; 9754 } 9755 9756 for (event_id = 0; event_id < num_events; ++event_id) { 9757 entry = TAILQ_FIRST(&desc->free_media_events); 9758 if (entry == NULL) { 9759 break; 9760 } 9761 9762 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 9763 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 9764 entry->event = events[event_id]; 9765 } 9766 9767 rc = event_id; 9768 out: 9769 spdk_spin_unlock(&bdev->internal.spinlock); 9770 return rc; 9771 } 9772 9773 static void 9774 _media_management_notify(void *arg) 9775 { 9776 struct spdk_bdev_desc *desc = arg; 9777 9778 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 9779 } 9780 9781 void 9782 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 9783 { 9784 struct spdk_bdev_desc *desc; 9785 9786 spdk_spin_lock(&bdev->internal.spinlock); 9787 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9788 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 9789 event_notify(desc, _media_management_notify); 9790 } 9791 } 9792 spdk_spin_unlock(&bdev->internal.spinlock); 9793 } 9794 9795 struct locked_lba_range_ctx { 9796 struct lba_range range; 9797 struct lba_range *current_range; 9798 struct lba_range *owner_range; 9799 
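	/* Poller used by bdev_lock_lba_range_check_io() to periodically re-check
	 * for outstanding I/O that still overlaps the range being locked.
	 */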
struct spdk_poller *poller; 9800 lock_range_cb cb_fn; 9801 void *cb_arg; 9802 }; 9803 9804 static void 9805 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9806 { 9807 struct locked_lba_range_ctx *ctx = _ctx; 9808 9809 ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM); 9810 free(ctx); 9811 } 9812 9813 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 9814 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 9815 9816 static void 9817 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9818 { 9819 struct locked_lba_range_ctx *ctx = _ctx; 9820 9821 if (status == -ENOMEM) { 9822 /* One of the channels could not allocate a range object. 9823 * So we have to go back and clean up any ranges that were 9824 * allocated successfully before we return error status to 9825 * the caller. We can reuse the unlock function to do that 9826 * clean up. 9827 */ 9828 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9829 bdev_lock_error_cleanup_cb); 9830 return; 9831 } 9832 9833 /* All channels have locked this range and no I/O overlapping the range 9834 * are outstanding! Set the owner_ch for the range object for the 9835 * locking channel, so that this channel will know that it is allowed 9836 * to write to this range. 9837 */ 9838 if (ctx->owner_range != NULL) { 9839 ctx->owner_range->owner_ch = ctx->range.owner_ch; 9840 } 9841 9842 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 9843 9844 /* Don't free the ctx here. Its range is in the bdev's global list of 9845 * locked ranges still, and will be removed and freed when this range 9846 * is later unlocked. 9847 */ 9848 } 9849 9850 static int 9851 bdev_lock_lba_range_check_io(void *_i) 9852 { 9853 struct spdk_bdev_channel_iter *i = _i; 9854 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 9855 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9856 struct locked_lba_range_ctx *ctx = i->ctx; 9857 struct lba_range *range = ctx->current_range; 9858 struct spdk_bdev_io *bdev_io; 9859 9860 spdk_poller_unregister(&ctx->poller); 9861 9862 /* The range is now in the locked_ranges, so no new IO can be submitted to this 9863 * range. But we need to wait until any outstanding IO overlapping with this range 9864 * are completed. 9865 */ 9866 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 9867 if (bdev_io_range_is_locked(bdev_io, range)) { 9868 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 9869 return SPDK_POLLER_BUSY; 9870 } 9871 } 9872 9873 spdk_bdev_for_each_channel_continue(i, 0); 9874 return SPDK_POLLER_BUSY; 9875 } 9876 9877 static void 9878 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9879 struct spdk_io_channel *_ch, void *_ctx) 9880 { 9881 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9882 struct locked_lba_range_ctx *ctx = _ctx; 9883 struct lba_range *range; 9884 9885 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9886 if (range->length == ctx->range.length && 9887 range->offset == ctx->range.offset && 9888 range->locked_ctx == ctx->range.locked_ctx) { 9889 /* This range already exists on this channel, so don't add 9890 * it again. This can happen when a new channel is created 9891 * while the for_each_channel operation is in progress. 9892 * Do not check for outstanding I/O in that case, since the 9893 * range was locked before any I/O could be submitted to the 9894 * new channel. 
9895 */ 9896 spdk_bdev_for_each_channel_continue(i, 0); 9897 return; 9898 } 9899 } 9900 9901 range = calloc(1, sizeof(*range)); 9902 if (range == NULL) { 9903 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 9904 return; 9905 } 9906 9907 range->length = ctx->range.length; 9908 range->offset = ctx->range.offset; 9909 range->locked_ctx = ctx->range.locked_ctx; 9910 range->quiesce = ctx->range.quiesce; 9911 ctx->current_range = range; 9912 if (ctx->range.owner_ch == ch) { 9913 /* This is the range object for the channel that will hold 9914 * the lock. Store it in the ctx object so that we can easily 9915 * set its owner_ch after the lock is finally acquired. 9916 */ 9917 ctx->owner_range = range; 9918 } 9919 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 9920 bdev_lock_lba_range_check_io(i); 9921 } 9922 9923 static void 9924 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 9925 { 9926 assert(spdk_get_thread() == ctx->range.owner_thread); 9927 assert(ctx->range.owner_ch == NULL || 9928 spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread); 9929 9930 /* We will add a copy of this range to each channel now. */ 9931 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 9932 bdev_lock_lba_range_cb); 9933 } 9934 9935 static bool 9936 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 9937 { 9938 struct lba_range *r; 9939 9940 TAILQ_FOREACH(r, tailq, tailq) { 9941 if (bdev_lba_range_overlapped(range, r)) { 9942 return true; 9943 } 9944 } 9945 return false; 9946 } 9947 9948 static void bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status); 9949 9950 static int 9951 _bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch, 9952 uint64_t offset, uint64_t length, 9953 lock_range_cb cb_fn, void *cb_arg) 9954 { 9955 struct locked_lba_range_ctx *ctx; 9956 9957 ctx = calloc(1, sizeof(*ctx)); 9958 if (ctx == NULL) { 9959 return -ENOMEM; 9960 } 9961 9962 ctx->range.offset = offset; 9963 ctx->range.length = length; 9964 ctx->range.owner_thread = spdk_get_thread(); 9965 ctx->range.owner_ch = ch; 9966 ctx->range.locked_ctx = cb_arg; 9967 ctx->range.bdev = bdev; 9968 ctx->range.quiesce = (cb_fn == bdev_quiesce_range_locked); 9969 ctx->cb_fn = cb_fn; 9970 ctx->cb_arg = cb_arg; 9971 9972 spdk_spin_lock(&bdev->internal.spinlock); 9973 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 9974 /* There is an active lock overlapping with this range. 9975 * Put it on the pending list until this range no 9976 * longer overlaps with another. 
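 * bdev_unlock_lba_range_cb() re-checks the pending list each time a range
 * is unlocked and restarts the lock attempt once no conflict remains.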
9977 */ 9978 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 9979 } else { 9980 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 9981 bdev_lock_lba_range_ctx(bdev, ctx); 9982 } 9983 spdk_spin_unlock(&bdev->internal.spinlock); 9984 return 0; 9985 } 9986 9987 static int 9988 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 9989 uint64_t offset, uint64_t length, 9990 lock_range_cb cb_fn, void *cb_arg) 9991 { 9992 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9993 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9994 9995 if (cb_arg == NULL) { 9996 SPDK_ERRLOG("cb_arg must not be NULL\n"); 9997 return -EINVAL; 9998 } 9999 10000 return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg); 10001 } 10002 10003 static void 10004 bdev_lock_lba_range_ctx_msg(void *_ctx) 10005 { 10006 struct locked_lba_range_ctx *ctx = _ctx; 10007 10008 bdev_lock_lba_range_ctx(ctx->range.bdev, ctx); 10009 } 10010 10011 static void 10012 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 10013 { 10014 struct locked_lba_range_ctx *ctx = _ctx; 10015 struct locked_lba_range_ctx *pending_ctx; 10016 struct lba_range *range, *tmp; 10017 10018 spdk_spin_lock(&bdev->internal.spinlock); 10019 /* Check if there are any pending locked ranges that overlap with this range 10020 * that was just unlocked. If there are, check that it doesn't overlap with any 10021 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 10022 * the lock process. 10023 */ 10024 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 10025 if (bdev_lba_range_overlapped(range, &ctx->range) && 10026 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 10027 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 10028 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 10029 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 10030 spdk_thread_send_msg(pending_ctx->range.owner_thread, 10031 bdev_lock_lba_range_ctx_msg, pending_ctx); 10032 } 10033 } 10034 spdk_spin_unlock(&bdev->internal.spinlock); 10035 10036 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 10037 free(ctx); 10038 } 10039 10040 static void 10041 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10042 struct spdk_io_channel *_ch, void *_ctx) 10043 { 10044 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10045 struct locked_lba_range_ctx *ctx = _ctx; 10046 TAILQ_HEAD(, spdk_bdev_io) io_locked; 10047 struct spdk_bdev_io *bdev_io; 10048 struct lba_range *range; 10049 10050 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10051 if (ctx->range.offset == range->offset && 10052 ctx->range.length == range->length && 10053 ctx->range.locked_ctx == range->locked_ctx) { 10054 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 10055 free(range); 10056 break; 10057 } 10058 } 10059 10060 /* Note: we should almost always be able to assert that the range specified 10061 * was found. But there are some very rare corner cases where a new channel 10062 * gets created simultaneously with a range unlock, where this function 10063 * would execute on that new channel and wouldn't have the range. 10064 * We also use this to clean up range allocations when a later allocation 10065 * fails in the locking path. 10066 * So we can't actually assert() here. 
10067 */ 10068 10069 /* Swap the locked IO into a temporary list, and then try to submit them again. 10070 * We could hyper-optimize this to only resubmit locked I/O that overlap 10071 * with the range that was just unlocked, but this isn't a performance path so 10072 * we go for simplicity here. 10073 */ 10074 TAILQ_INIT(&io_locked); 10075 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 10076 while (!TAILQ_EMPTY(&io_locked)) { 10077 bdev_io = TAILQ_FIRST(&io_locked); 10078 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 10079 bdev_io_submit(bdev_io); 10080 } 10081 10082 spdk_bdev_for_each_channel_continue(i, 0); 10083 } 10084 10085 static int 10086 _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length, 10087 lock_range_cb cb_fn, void *cb_arg) 10088 { 10089 struct locked_lba_range_ctx *ctx; 10090 struct lba_range *range; 10091 10092 spdk_spin_lock(&bdev->internal.spinlock); 10093 /* To start the unlock process, we find the range in the bdev's locked_ranges 10094 * and remove it. This ensures new channels don't inherit the locked range. 10095 * Then we will send a message to each channel to remove the range from its 10096 * per-channel list. 10097 */ 10098 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 10099 if (range->offset == offset && range->length == length && 10100 (range->owner_ch == NULL || range->locked_ctx == cb_arg)) { 10101 break; 10102 } 10103 } 10104 if (range == NULL) { 10105 assert(false); 10106 spdk_spin_unlock(&bdev->internal.spinlock); 10107 return -EINVAL; 10108 } 10109 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 10110 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 10111 spdk_spin_unlock(&bdev->internal.spinlock); 10112 10113 ctx->cb_fn = cb_fn; 10114 ctx->cb_arg = cb_arg; 10115 10116 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 10117 bdev_unlock_lba_range_cb); 10118 return 0; 10119 } 10120 10121 static int 10122 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 10123 uint64_t offset, uint64_t length, 10124 lock_range_cb cb_fn, void *cb_arg) 10125 { 10126 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10127 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 10128 struct lba_range *range; 10129 bool range_found = false; 10130 10131 /* Let's make sure the specified channel actually has a lock on 10132 * the specified range. Note that the range must match exactly. 
10133 */ 10134 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 10135 if (range->offset == offset && range->length == length && 10136 range->owner_ch == ch && range->locked_ctx == cb_arg) { 10137 range_found = true; 10138 break; 10139 } 10140 } 10141 10142 if (!range_found) { 10143 return -EINVAL; 10144 } 10145 10146 return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg); 10147 } 10148 10149 struct bdev_quiesce_ctx { 10150 spdk_bdev_quiesce_cb cb_fn; 10151 void *cb_arg; 10152 }; 10153 10154 static void 10155 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status) 10156 { 10157 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 10158 10159 if (quiesce_ctx->cb_fn != NULL) { 10160 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 10161 } 10162 10163 free(quiesce_ctx); 10164 } 10165 10166 static void 10167 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status) 10168 { 10169 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 10170 struct spdk_bdev_module *module = range->bdev->module; 10171 10172 if (status != 0) { 10173 if (quiesce_ctx->cb_fn != NULL) { 10174 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 10175 } 10176 free(quiesce_ctx); 10177 return; 10178 } 10179 10180 spdk_spin_lock(&module->internal.spinlock); 10181 TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module); 10182 spdk_spin_unlock(&module->internal.spinlock); 10183 10184 if (quiesce_ctx->cb_fn != NULL) { 10185 /* copy the context in case the range is unlocked by the callback */ 10186 struct bdev_quiesce_ctx tmp = *quiesce_ctx; 10187 10188 quiesce_ctx->cb_fn = NULL; 10189 quiesce_ctx->cb_arg = NULL; 10190 10191 tmp.cb_fn(tmp.cb_arg, status); 10192 } 10193 /* quiesce_ctx will be freed on unquiesce */ 10194 } 10195 10196 static int 10197 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10198 uint64_t offset, uint64_t length, 10199 spdk_bdev_quiesce_cb cb_fn, void *cb_arg, 10200 bool unquiesce) 10201 { 10202 struct bdev_quiesce_ctx *quiesce_ctx; 10203 int rc; 10204 10205 if (module != bdev->module) { 10206 SPDK_ERRLOG("Bdev does not belong to specified module.\n"); 10207 return -EINVAL; 10208 } 10209 10210 if (!bdev_io_valid_blocks(bdev, offset, length)) { 10211 return -EINVAL; 10212 } 10213 10214 if (unquiesce) { 10215 struct lba_range *range; 10216 10217 /* Make sure the specified range is actually quiesced in the specified module and 10218 * then remove it from the list. Note that the range must match exactly. 
10219 */ 10220 spdk_spin_lock(&module->internal.spinlock); 10221 TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) { 10222 if (range->bdev == bdev && range->offset == offset && range->length == length) { 10223 TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module); 10224 break; 10225 } 10226 } 10227 spdk_spin_unlock(&module->internal.spinlock); 10228 10229 if (range == NULL) { 10230 SPDK_ERRLOG("The range to unquiesce was not found.\n"); 10231 return -EINVAL; 10232 } 10233 10234 quiesce_ctx = range->locked_ctx; 10235 quiesce_ctx->cb_fn = cb_fn; 10236 quiesce_ctx->cb_arg = cb_arg; 10237 10238 rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx); 10239 } else { 10240 quiesce_ctx = malloc(sizeof(*quiesce_ctx)); 10241 if (quiesce_ctx == NULL) { 10242 return -ENOMEM; 10243 } 10244 10245 quiesce_ctx->cb_fn = cb_fn; 10246 quiesce_ctx->cb_arg = cb_arg; 10247 10248 rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx); 10249 if (rc != 0) { 10250 free(quiesce_ctx); 10251 } 10252 } 10253 10254 return rc; 10255 } 10256 10257 int 10258 spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10259 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10260 { 10261 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false); 10262 } 10263 10264 int 10265 spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10266 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10267 { 10268 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true); 10269 } 10270 10271 int 10272 spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10273 uint64_t offset, uint64_t length, 10274 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10275 { 10276 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false); 10277 } 10278 10279 int 10280 spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10281 uint64_t offset, uint64_t length, 10282 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10283 { 10284 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true); 10285 } 10286 10287 int 10288 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 10289 int array_size) 10290 { 10291 if (!bdev) { 10292 return -EINVAL; 10293 } 10294 10295 if (bdev->fn_table->get_memory_domains) { 10296 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 10297 } 10298 10299 return 0; 10300 } 10301 10302 struct spdk_bdev_for_each_io_ctx { 10303 void *ctx; 10304 spdk_bdev_io_fn fn; 10305 spdk_bdev_for_each_io_cb cb; 10306 }; 10307 10308 static void 10309 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10310 struct spdk_io_channel *io_ch, void *_ctx) 10311 { 10312 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10313 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 10314 struct spdk_bdev_io *bdev_io; 10315 int rc = 0; 10316 10317 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 10318 rc = ctx->fn(ctx->ctx, bdev_io); 10319 if (rc != 0) { 10320 break; 10321 } 10322 } 10323 10324 spdk_bdev_for_each_channel_continue(i, rc); 10325 } 10326 10327 static void 10328 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 10329 { 10330 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10331 10332 ctx->cb(ctx->ctx, status); 10333 10334 free(ctx); 10335 } 10336 10337 void 10338 
spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 10339 spdk_bdev_for_each_io_cb cb) 10340 { 10341 struct spdk_bdev_for_each_io_ctx *ctx; 10342 10343 assert(fn != NULL && cb != NULL); 10344 10345 ctx = calloc(1, sizeof(*ctx)); 10346 if (ctx == NULL) { 10347 SPDK_ERRLOG("Failed to allocate context.\n"); 10348 cb(_ctx, -ENOMEM); 10349 return; 10350 } 10351 10352 ctx->ctx = _ctx; 10353 ctx->fn = fn; 10354 ctx->cb = cb; 10355 10356 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 10357 bdev_for_each_io_done); 10358 } 10359 10360 void 10361 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 10362 { 10363 spdk_for_each_channel_continue(iter->i, status); 10364 } 10365 10366 static struct spdk_bdev * 10367 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 10368 { 10369 void *io_device = spdk_io_channel_iter_get_io_device(i); 10370 10371 return __bdev_from_io_dev(io_device); 10372 } 10373 10374 static void 10375 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 10376 { 10377 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10378 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10379 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 10380 10381 iter->i = i; 10382 iter->fn(iter, bdev, ch, iter->ctx); 10383 } 10384 10385 static void 10386 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 10387 { 10388 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10389 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10390 10391 iter->i = i; 10392 iter->cpl(bdev, iter->ctx, status); 10393 10394 free(iter); 10395 } 10396 10397 void 10398 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 10399 void *ctx, spdk_bdev_for_each_channel_done cpl) 10400 { 10401 struct spdk_bdev_channel_iter *iter; 10402 10403 assert(bdev != NULL && fn != NULL && ctx != NULL); 10404 10405 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 10406 if (iter == NULL) { 10407 SPDK_ERRLOG("Unable to allocate iterator\n"); 10408 assert(false); 10409 return; 10410 } 10411 10412 iter->fn = fn; 10413 iter->cpl = cpl; 10414 iter->ctx = ctx; 10415 10416 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 10417 iter, bdev_each_channel_cpl); 10418 } 10419 10420 static void 10421 bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10422 { 10423 struct spdk_bdev_io *parent_io = cb_arg; 10424 10425 spdk_bdev_free_io(bdev_io); 10426 10427 /* Check return status of write */ 10428 parent_io->internal.status = success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 10429 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 10430 } 10431 10432 static void 10433 bdev_copy_do_write(void *_bdev_io) 10434 { 10435 struct spdk_bdev_io *bdev_io = _bdev_io; 10436 int rc; 10437 10438 /* Write blocks */ 10439 rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc, 10440 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10441 bdev_io->u.bdev.iovs[0].iov_base, 10442 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks, 10443 bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io); 10444 10445 if (rc == -ENOMEM) { 10446 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write); 10447 } else if (rc != 0) { 10448 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10449 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10450 } 10451 } 10452 10453 static void 10454 bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10455 { 10456 struct spdk_bdev_io *parent_io = cb_arg; 10457 10458 spdk_bdev_free_io(bdev_io); 10459 10460 /* Check return status of read */ 10461 if (!success) { 10462 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10463 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 10464 return; 10465 } 10466 10467 /* Do write */ 10468 bdev_copy_do_write(parent_io); 10469 } 10470 10471 static void 10472 bdev_copy_do_read(void *_bdev_io) 10473 { 10474 struct spdk_bdev_io *bdev_io = _bdev_io; 10475 int rc; 10476 10477 /* Read blocks */ 10478 rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc, 10479 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10480 bdev_io->u.bdev.iovs[0].iov_base, 10481 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks, 10482 bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io); 10483 10484 if (rc == -ENOMEM) { 10485 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read); 10486 } else if (rc != 0) { 10487 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10488 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10489 } 10490 } 10491 10492 static void 10493 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 10494 { 10495 if (!success) { 10496 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10497 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10498 return; 10499 } 10500 10501 bdev_copy_do_read(bdev_io); 10502 } 10503 10504 int 10505 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 10506 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 10507 spdk_bdev_io_completion_cb cb, void *cb_arg) 10508 { 10509 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10510 struct spdk_bdev_io *bdev_io; 10511 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 10512 10513 if (!desc->write) { 10514 return -EBADF; 10515 } 10516 10517 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 10518 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 10519 SPDK_DEBUGLOG(bdev, 10520 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 10521 dst_offset_blocks, src_offset_blocks, num_blocks); 10522 return -EINVAL; 10523 } 10524 10525 bdev_io = bdev_channel_get_io(channel); 10526 if (!bdev_io) { 10527 return -ENOMEM; 10528 } 10529 10530 bdev_io->internal.ch = channel; 10531 bdev_io->internal.desc = desc; 10532 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 10533 10534 
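	/* For a copy request, offset_blocks carries the destination offset;
	 * the source offset is stored separately in u.bdev.copy.src_offset_blocks.
	 */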
bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 10535 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 10536 bdev_io->u.bdev.num_blocks = num_blocks; 10537 bdev_io->u.bdev.memory_domain = NULL; 10538 bdev_io->u.bdev.memory_domain_ctx = NULL; 10539 bdev_io->u.bdev.iovs = NULL; 10540 bdev_io->u.bdev.iovcnt = 0; 10541 bdev_io->u.bdev.md_buf = NULL; 10542 bdev_io->u.bdev.accel_sequence = NULL; 10543 bdev_io_init(bdev_io, bdev, cb_arg, cb); 10544 10545 if (dst_offset_blocks == src_offset_blocks || num_blocks == 0) { 10546 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 10547 return 0; 10548 } 10549 10550 10551 /* If the copy size is large and should be split, use the generic split logic 10552 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not. 10553 * 10554 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or 10555 * emulate it using regular read and write requests otherwise. 10556 */ 10557 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) || 10558 bdev_io->internal.f.split) { 10559 bdev_io_submit(bdev_io); 10560 return 0; 10561 } 10562 10563 spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev)); 10564 10565 return 0; 10566 } 10567 10568 SPDK_LOG_REGISTER_COMPONENT(bdev) 10569 10570 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 10571 { 10572 struct spdk_trace_tpoint_opts opts[] = { 10573 { 10574 "BDEV_IO_START", TRACE_BDEV_IO_START, 10575 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 1, 10576 { 10577 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10578 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 10579 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10580 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 10581 } 10582 }, 10583 { 10584 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 10585 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 0, 10586 { 10587 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 10588 { "qd", SPDK_TRACE_ARG_TYPE_INT, 4 } 10589 } 10590 }, 10591 { 10592 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 10593 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 10594 { 10595 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 10596 } 10597 }, 10598 { 10599 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 10600 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 10601 { 10602 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 10603 } 10604 }, 10605 }; 10606 10607 10608 spdk_trace_register_owner_type(OWNER_TYPE_BDEV, 'b'); 10609 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 10610 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 10611 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 10612 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 10613 } 10614
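/*
 * Illustrative sketch only (not part of the bdev library itself): one way an
 * application might drive spdk_bdev_copy_blocks(), defined above. The
 * descriptor "desc", the I/O channel "ch" and the callback name below are
 * hypothetical placeholders that would normally come from the application's
 * own spdk_bdev_open_ext()/spdk_bdev_get_io_channel() path.
 *
 *	static void
 *	example_copy_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		SPDK_NOTICELOG("copy %s\n", success ? "completed" : "failed");
 *		spdk_bdev_free_io(bdev_io);
 *	}
 *
 *	// Copy 16 blocks starting at block 0 to block 1024 on the same bdev.
 *	int rc = spdk_bdev_copy_blocks(desc, ch, 1024, 0, 16, example_copy_done, NULL);
 *	if (rc == -ENOMEM) {
 *		// No spdk_bdev_io was available; the caller could retry later,
 *		// e.g. via spdk_bdev_queue_io_wait().
 *	}
 */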