/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"
#include "spdk/dma.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"
#include "spdk_internal/assert.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define SPDK_BDEV_AUTO_EXAMINE			true
#define BUF_SMALL_CACHE_SIZE			128
#define BUF_LARGE_CACHE_SIZE			16
#define NOMEM_THRESHOLD_COUNT			8

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC	(UINT64_MAX / (1024 * 1024))
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000

/* The maximum number of children requests for a UNMAP or WRITE ZEROES command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000

/* The maximum number of children requests for a COPY command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8)

#define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \
	log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev)
#ifdef DEBUG
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \
	log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev)
#else
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0)
#endif

static void log_already_claimed(enum spdk_log_level level, const int line, const char *func,
				const char *detail, struct spdk_bdev *bdev);

static const char *qos_rpc_type[] = {"rw_ios_per_sec",
				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
				    };

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

RB_HEAD(bdev_name_tree, spdk_bdev_name);

static int
bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
{
	return strcmp(name1->name, name2->name);
}

RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	void *zero_buffer;

	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

	struct spdk_bdev_list bdevs;
	struct bdev_name_tree bdev_names;

	bool init_complete;
	bool module_init_complete;

	struct spdk_spinlock spinlock;

	TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain *domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
	.init_complete = false,
	.module_init_complete = false,
	.async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens),
};

static void
__attribute__((constructor))
_bdev_init(void)
{
	spdk_spin_init(&g_bdev_mgr.spinlock);
}

typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status);

typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);

struct lba_range {
	struct spdk_bdev		*bdev;
	uint64_t			offset;
	uint64_t			length;
	bool				quiesce;
	void				*locked_ctx;
	struct spdk_thread		*owner_thread;
	struct spdk_bdev_channel	*owner_ch;
	TAILQ_ENTRY(lba_range)		tailq;
	TAILQ_ENTRY(lba_range)		tailq_module;
};

static struct spdk_bdev_opts g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
	.iobuf_small_cache_size = BUF_SMALL_CACHE_SIZE,
	.iobuf_large_cache_size = BUF_LARGE_CACHE_SIZE,
};

static spdk_bdev_init_cb	g_init_cb_fn = NULL;
static void			*g_init_cb_arg = NULL;

static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
static void			*g_fini_cb_arg = NULL;
static struct spdk_thread	*g_fini_thread = NULL;

struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
	 * For remaining bytes, allowed to run negative if an I/O is submitted when
	 * some bytes are remaining, but the I/O is bigger than that amount. The
	 * excess will be deducted from the next timeslice.
	 */
	int64_t remaining_this_timeslice;

	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO.
	 * If the IO is allowed to pass, the quota will be reduced correspondingly.
	 */
	bool (*queue_io)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to rewind the quota once the IO was allowed to be sent by this
	 * limit but queued due to one of the further limits.
	 */
	void (*rewind_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

struct spdk_bdev_qos {
	/** Rate limits, one entry per limit type. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache.  Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t	per_thread_cache_count;
	uint32_t	bdev_io_cache_size;

	struct spdk_iobuf_channel iobuf;

	TAILQ_HEAD(, spdk_bdev_shared_resource)	shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry)	io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * queue their IO awaiting retry here, which makes it possible to retry sending
 * IO to one bdev after IO from another bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel *shared_ch;

	struct spdk_poller *nomem_poller;

	/* Refcount of bdev channels using this resource */
	uint32_t ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev	*bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel	*channel;

	/* Accel channel */
	struct spdk_io_channel	*accel_channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat *stat;

	/*
	 * Count of I/O submitted to the underlying dev module through this channel
	 * and waiting for completion.
	 */
	uint64_t io_outstanding;

	/*
	 * List of all submitted I/Os including I/O that are generated via splitting.
	 */
	bdev_io_tailq_t io_submitted;

	/*
	 * List of spdk_bdev_io that are currently queued because they write to a locked
	 * LBA range.
	 */
	bdev_io_tailq_t io_locked;

	/* List of I/Os with accel sequence being currently executed */
	bdev_io_tailq_t io_accel_exec;

	/* List of I/Os doing memory domain pull/push */
	bdev_io_tailq_t io_memory_domain;

	uint32_t flags;

	struct spdk_histogram_data *histogram;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t		start_tsc;
	uint64_t		interval_tsc;
	__itt_string_handle	*handle;
	struct spdk_bdev_io_stat *prev_stat;
#endif

	bdev_io_tailq_t queued_resets;

	lba_range_tailq_t locked_ranges;

	/** List of I/Os queued by QoS. */
	bdev_io_tailq_t qos_queued_io;
};

struct media_event_entry {
	struct spdk_bdev_media_event	event;
	TAILQ_ENTRY(media_event_entry)	tailq;
};

#define MEDIA_EVENT_POOL_SIZE 64

struct spdk_bdev_desc {
	struct spdk_bdev		*bdev;
	struct spdk_thread		*thread;
	struct {
		spdk_bdev_event_cb_t event_fn;
		void *ctx;
	}				callback;
	bool				closed;
	bool				write;
	bool				memory_domains_supported;
	bool				accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES];
	struct spdk_spinlock		spinlock;
	uint32_t			refs;
	TAILQ_HEAD(, media_event_entry)	pending_media_events;
	TAILQ_HEAD(, media_event_entry)	free_media_events;
	struct media_event_entry	*media_events_buffer;
	TAILQ_ENTRY(spdk_bdev_desc)	link;

	uint64_t		timeout_in_sec;
	spdk_bdev_io_timeout_cb	cb_fn;
	void			*cb_arg;
	struct spdk_poller	*io_timeout_poller;
	struct spdk_bdev_module_claim	*claim;
};

struct spdk_bdev_iostat_ctx {
	struct spdk_bdev_io_stat *stat;
	spdk_bdev_get_device_stat_cb cb;
	void *cb_arg;
};

struct set_qos_limit_ctx {
	void (*cb_fn)(void *cb_arg, int status);
	void *cb_arg;
	struct spdk_bdev *bdev;
};

struct spdk_bdev_channel_iter {
	spdk_bdev_for_each_channel_msg fn;
	spdk_bdev_for_each_channel_done cpl;
	struct spdk_io_channel_iter *i;
	void *ctx;
};

struct spdk_bdev_io_error_stat {
	uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS];
};

enum bdev_io_retry_state {
	BDEV_IO_RETRY_STATE_INVALID,
	BDEV_IO_RETRY_STATE_PULL,
	BDEV_IO_RETRY_STATE_PULL_MD,
	BDEV_IO_RETRY_STATE_SUBMIT,
	BDEV_IO_RETRY_STATE_PUSH,
	BDEV_IO_RETRY_STATE_PUSH_MD,
};

#define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))
#define __io_ch_to_bdev_ch(io_ch)	((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch))
#define __io_ch_to_bdev_mgmt_ch(io_ch)	((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch))

static inline void bdev_io_complete(void *ctx);
static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io);
static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io);
static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io);

static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io);

static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				struct spdk_io_channel *ch, void *_ctx);
static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status);

static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				     struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
				     uint64_t num_blocks,
				     struct spdk_memory_domain *domain, void *domain_ctx,
				     struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
				     spdk_bdev_io_completion_cb cb, void *cb_arg);
static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				      struct iovec *iov, int iovcnt, void *md_buf,
				      uint64_t offset_blocks, uint64_t num_blocks,
				      struct spdk_memory_domain *domain, void *domain_ctx,
				      struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
				      uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw,
				      spdk_bdev_io_completion_cb cb, void *cb_arg);

static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
			       uint64_t offset, uint64_t length,
			       lock_range_cb cb_fn, void *cb_arg);

static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
				 uint64_t offset, uint64_t length,
				 lock_range_cb cb_fn, void *cb_arg);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);

static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
static void claim_reset(struct spdk_bdev *bdev);

static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch);

#define bdev_get_ext_io_opt(opts, field, defval) \
	(((opts) != NULL && offsetof(struct spdk_bdev_ext_io_opts, field) + \
	  sizeof((opts)->field) <= (opts)->size) ? (opts)->field : (defval))

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(iobuf_small_cache_size);
	SET_FIELD(iobuf_large_cache_size);

	/* Do not remove this statement. When adding a new field, update this assertion
	 * and do not forget to add the SET_FIELD statement for your added field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}

int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	uint32_t min_pool_size;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
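	 * For example, with 3 SPDK threads and bdev_io_cache_size = 256, min_pool_size
	 * works out to 256 * (3 + 1) = 1024 bdev_io entries.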
	 */
	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
	if (opts->bdev_io_pool_size < min_pool_size) {
		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
			    spdk_thread_get_count());
		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
		return -1;
	}

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
		g_bdev_opts.field = opts->field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(iobuf_small_cache_size);
	SET_FIELD(iobuf_large_cache_size);

	g_bdev_opts.opts_size = opts->opts_size;

#undef SET_FIELD

	return 0;
}

static struct spdk_bdev *
bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev_name find;
	struct spdk_bdev_name *res;

	find.name = (char *)bdev_name;
	res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find);
	if (res != NULL) {
		return res->bdev;
	}

	return NULL;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev *bdev;

	spdk_spin_lock(&g_bdev_mgr.spinlock);
	bdev = bdev_get_by_name(bdev_name);
	spdk_spin_unlock(&g_bdev_mgr.spinlock);

	return bdev;
}

struct bdev_io_status_string {
	enum spdk_bdev_io_status status;
	const char *str;
};

static const struct bdev_io_status_string bdev_io_status_strings[] = {
	{ SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" },
	{ SPDK_BDEV_IO_STATUS_ABORTED, "aborted" },
	{ SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" },
	{ SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" },
	{ SPDK_BDEV_IO_STATUS_NOMEM, "nomem" },
	{ SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" },
	{ SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" },
	{ SPDK_BDEV_IO_STATUS_FAILED, "failed" },
	{ SPDK_BDEV_IO_STATUS_PENDING, "pending" },
	{ SPDK_BDEV_IO_STATUS_SUCCESS, "success" },
};

static const char *
bdev_io_status_get_string(enum spdk_bdev_io_status status)
{
	uint32_t i;

	for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) {
		if (bdev_io_status_strings[i].status == status) {
			return bdev_io_status_strings[i].str;
		}
	}

	return "reserved";
}

struct spdk_bdev_wait_for_examine_ctx {
	struct spdk_poller		*poller;
	spdk_bdev_wait_for_examine_cb	cb_fn;
	void				*cb_arg;
};

static bool bdev_module_all_actions_completed(void);

static int
bdev_wait_for_examine_cb(void *arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx = arg;

	if (!bdev_module_all_actions_completed()) {
		return SPDK_POLLER_IDLE;
	}

	spdk_poller_unregister(&ctx->poller);
	ctx->cb_fn(ctx->cb_arg);
	free(ctx);

	return SPDK_POLLER_BUSY;
}

int
spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0);

	return 0;
}
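
/*
 * Illustrative usage (not part of the original file): callers typically defer
 * further configuration until all outstanding examine operations finish, along
 * these lines:
 *
 *	static void examine_done(void *ctx) { ... continue setup ... }
 *	...
 *	if (spdk_bdev_wait_for_examine(examine_done, NULL) != 0) {
 *		... handle -ENOMEM ...
 *	}
 */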

struct spdk_bdev_examine_item {
	char *name;
	TAILQ_ENTRY(spdk_bdev_examine_item) link;
};

TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);

struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
			g_bdev_examine_allowlist);

static inline bool
bdev_examine_allowlist_check(const char *name)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		if (strcmp(name, item->name) == 0) {
			return true;
		}
	}
	return false;
}

static inline void
bdev_examine_allowlist_free(void)
{
	struct spdk_bdev_examine_item *item;
	while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
		item = TAILQ_FIRST(&g_bdev_examine_allowlist);
		TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
		free(item->name);
		free(item);
	}
}

static inline bool
bdev_in_examine_allowlist(struct spdk_bdev *bdev)
{
	struct spdk_bdev_alias *tmp;
	if (bdev_examine_allowlist_check(bdev->name)) {
		return true;
	}
	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
		if (bdev_examine_allowlist_check(tmp->alias.name)) {
			return true;
		}
	}
	return false;
}

static inline bool
bdev_ok_to_examine(struct spdk_bdev *bdev)
{
	if (g_bdev_opts.bdev_auto_examine) {
		return true;
	} else {
		return bdev_in_examine_allowlist(bdev);
	}
}

static void
bdev_examine(struct spdk_bdev *bdev)
{
	struct spdk_bdev_module *module;
	struct spdk_bdev_module_claim *claim, *tmpclaim;
	uint32_t action;

	if (!bdev_ok_to_examine(bdev)) {
		return;
	}

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (module->examine_config) {
			spdk_spin_lock(&module->internal.spinlock);
			action = module->internal.action_in_progress;
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			module->examine_config(bdev);
			if (action != module->internal.action_in_progress) {
				SPDK_ERRLOG("examine_config for module %s did not call "
					    "spdk_bdev_module_examine_done()\n", module->name);
			}
		}
	}

	spdk_spin_lock(&bdev->internal.spinlock);

	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		/* Examine by all bdev modules */
		TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (module->examine_disk) {
				spdk_spin_lock(&module->internal.spinlock);
				module->internal.action_in_progress++;
				spdk_spin_unlock(&module->internal.spinlock);
				spdk_spin_unlock(&bdev->internal.spinlock);
				module->examine_disk(bdev);
				spdk_spin_lock(&bdev->internal.spinlock);
			}
		}
		break;
	case SPDK_BDEV_CLAIM_EXCL_WRITE:
		/* Examine by the one bdev module with a v1 claim */
		module = bdev->internal.claim.v1.module;
		if (module->examine_disk) {
			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			return;
		}
		break;
	default:
		/* Examine by all bdev modules with a v2 claim */
		assert(claim_type_is_v2(bdev->internal.claim_type));
		/*
		 * Removal of tailq nodes while iterating can cause the iteration to jump out of the
		 * list, perhaps accessing freed memory. Without protection, this could happen
		 * while the lock is dropped during the examine callback.
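		 * The examine_in_progress counter below provides that protection: claims
		 * released during examine_disk() are only unlinked once iteration finishes.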
		 */
		bdev->internal.examine_in_progress++;

		TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
			module = claim->module;

			if (module == NULL) {
				/* This is a vestigial claim, held by examine_count */
				continue;
			}

			if (module->examine_disk == NULL) {
				continue;
			}

			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);

			/* Call examine_disk without holding internal.spinlock. */
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			spdk_spin_lock(&bdev->internal.spinlock);
		}

		assert(bdev->internal.examine_in_progress > 0);
		bdev->internal.examine_in_progress--;
		if (bdev->internal.examine_in_progress == 0) {
			/* Remove any claims that were released during examine_disk */
			TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) {
				if (claim->desc != NULL) {
					continue;
				}

				TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link);
				free(claim);
			}
			if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
				claim_reset(bdev);
			}
		}
	}

	spdk_spin_unlock(&bdev->internal.spinlock);
}

int
spdk_bdev_examine(const char *name)
{
	struct spdk_bdev *bdev;
	struct spdk_bdev_examine_item *item;
	struct spdk_thread *thread = spdk_get_thread();

	if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) {
		SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread,
			    thread ? spdk_thread_get_name(thread) : "null");
		return -EINVAL;
	}

	if (g_bdev_opts.bdev_auto_examine) {
		SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n");
		return -EINVAL;
	}

	if (bdev_examine_allowlist_check(name)) {
		SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
		return -EEXIST;
	}

	item = calloc(1, sizeof(*item));
	if (!item) {
		return -ENOMEM;
	}
	item->name = strdup(name);
	if (!item->name) {
		free(item);
		return -ENOMEM;
	}
	TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);

	bdev = spdk_bdev_get_by_name(name);
	if (bdev) {
		bdev_examine(bdev);
	}
	return 0;
}

static inline void
bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "method", "bdev_examine");
		spdk_json_write_named_object_begin(w, "params");
		spdk_json_write_named_string(w, "name", item->name);
		spdk_json_write_object_end(w);
		spdk_json_write_object_end(w);
	}
}

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, internal.link);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, internal.link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static inline bool
bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.memory_domain;
}

static inline bool
bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.has_accel_sequence;
}

static inline void
bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource,
			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	/* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io.
	 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth
	 * channels we will instead wait for half to complete.
	 */
	shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
					   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);

	assert(state != BDEV_IO_RETRY_STATE_INVALID);
	bdev_io->internal.retry_state = state;
	TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
}

static inline void
bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource,
			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	/* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while
	 * the queue isn't empty, so we don't need to update the nomem_threshold here */
	assert(!TAILQ_EMPTY(&shared_resource->nomem_io));

	assert(state != BDEV_IO_RETRY_STATE_INVALID);
	bdev_io->internal.retry_state = state;
	TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
}

void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	struct iovec *iovs;

	if (bdev_io->u.bdev.iovs == NULL) {
		bdev_io->u.bdev.iovs = &bdev_io->iov;
		bdev_io->u.bdev.iovcnt = 1;
	}

	iovs = bdev_io->u.bdev.iovs;

	assert(iovs != NULL);
	assert(bdev_io->u.bdev.iovcnt >= 1);

	iovs[0].iov_base = buf;
	iovs[0].iov_len = len;
}

void
spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
	bdev_io->u.bdev.md_buf = md_buf;
}

static bool
_is_buf_allocated(const struct iovec *iovs)
{
	if (iovs == NULL) {
		return false;
	}

	return iovs[0].iov_base != NULL;
}

static bool
_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
{
	int i;
	uintptr_t iov_base;

	if (spdk_likely(alignment == 1)) {
		return true;
	}

	for (i = 0; i < iovcnt; i++) {
		iov_base = (uintptr_t)iovs[i].iov_base;
		if ((iov_base & (alignment - 1)) != 0) {
			return false;
		}
	}

	return true;
}

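/*
 * Returns true when the accel sequence attached to this I/O has to be executed by the
 * generic bdev layer rather than handed to the bdev module, i.e. when the module does
 * not support sequences for this I/O type or the I/O will be split.
 */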
static inline bool
bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
{
	if (!bdev_io->internal.accel_sequence) {
		return false;
	}

	/* For now, we don't allow splitting IOs with an accel sequence and will treat them as if
	 * bdev module didn't support accel sequences */
	return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.split;
}

static inline void
bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	bdev_ch->io_outstanding++;
	shared_resource->io_outstanding++;
}

static inline void
bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	assert(bdev_ch->io_outstanding > 0);
	assert(shared_resource->io_outstanding > 0);
	bdev_ch->io_outstanding--;
	shared_resource->io_outstanding--;
}

static void
bdev_io_submit_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;

	bdev_io->u.bdev.accel_sequence = NULL;
	bdev_io->internal.accel_sequence = NULL;

	if (spdk_unlikely(status != 0)) {
		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io_complete_unsubmitted(bdev_io);
		return;
	}

	bdev_io_submit(bdev_io);
}

static void
bdev_io_exec_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io->internal.data_transfer_cpl(bdev_io, status);
}

static void
bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status))
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
	assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ);

	/* Since the operations are appended during submission, they're in the opposite order than
	 * how we want to execute them for reads (i.e. we need to execute the most recently added
	 * operation first), so reverse the sequence before executing it.
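	 * Writes are executed in the order the operations were appended, so no reversal
	 * is needed for them.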
	 */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence);
	}

	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	bdev_io_increment_outstanding(ch, ch->shared_resource);
	bdev_io->internal.data_transfer_cpl = cb_fn;

	spdk_accel_sequence_finish(bdev_io->internal.accel_sequence,
				   bdev_io_exec_sequence_cb, bdev_io);
}

static void
bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status)
{
	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
	void *buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		buf = bdev_io->internal.buf;
		bdev_io->internal.buf = NULL;
		bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
		bdev_io->internal.get_aux_buf_cb = NULL;
	} else {
		assert(bdev_io->internal.get_buf_cb != NULL);
		bdev_io->internal.get_buf_cb(ch, bdev_io, status);
		bdev_io->internal.get_buf_cb = NULL;
	}
}

static void
_bdev_io_pull_buffer_cpl(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;

	if (rc) {
		SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	bdev_io_get_buf_complete(bdev_io, !rc);
}

static void
bdev_io_pull_md_buf_done(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	assert(bdev_io->internal.data_transfer_cpl);
	bdev_io->internal.data_transfer_cpl(bdev_io, status);
}

static void
bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  &bdev_io->internal.orig_md_iov, 1,
							  &bdev_io->internal.bounce_md_iov, 1,
							  bdev_io_pull_md_buf_done, bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain), rc);
			}
		} else {
			memcpy(bdev_io->internal.bounce_md_iov.iov_base,
			       bdev_io->internal.orig_md_iov.iov_base,
			       bdev_io->internal.orig_md_iov.iov_len);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD);
	} else {
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
	}
}

static void
_bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	/* save original md_buf */
	bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf;
	bdev_io->internal.orig_md_iov.iov_len = len;
	bdev_io->internal.bounce_md_iov.iov_base = md_buf;
	bdev_io->internal.bounce_md_iov.iov_len = len;
	/* set bounce md_buf */
	bdev_io->u.bdev.md_buf = md_buf;

	bdev_io_pull_md_buf(bdev_io);
}

static void
_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len;
	void *buf;

	if (spdk_bdev_is_md_separate(bdev)) {
		assert(!bdev_io_use_accel_sequence(bdev_io));

		buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len;
		md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;

		assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0);

		if (bdev_io->u.bdev.md_buf != NULL) {
			_bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len);
			return;
		} else {
			spdk_bdev_io_set_md_buf(bdev_io, buf, md_len);
		}
	}

	bdev_io_get_buf_complete(bdev_io, true);
}

static inline void
bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc)
{
	if (rc) {
		SPDK_ERRLOG("Failed to get data buffer\n");
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
		return;
	}

	_bdev_io_set_md_buf(bdev_io);
}

static void
bdev_io_pull_data_done_and_track(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io_pull_data_done(bdev_io, status);
}

static void
bdev_io_pull_data(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	/* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a
	 * sequence, append a copy operation making accel change the src/dst buffers of the previous
	 * operation */
	if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io) ||
	    (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) {
		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL,
						    bdev_io->internal.orig_iovs,
						    bdev_io->internal.orig_iovcnt,
						    bdev_io->internal.memory_domain,
						    bdev_io->internal.memory_domain_ctx,
						    0, NULL, NULL);
		} else {
			/* We need to reverse the src/dst for reads */
			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->internal.orig_iovs,
						    bdev_io->internal.orig_iovcnt,
						    bdev_io->internal.memory_domain,
						    bdev_io->internal.memory_domain_ctx,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL, 0, NULL, NULL);
		}

		if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
			SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n",
				    bdev_io->internal.accel_sequence);
		}
	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		/* if this is write path, copy data from original buffer to bounce buffer */
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  bdev_io->internal.orig_iovs,
							  (uint32_t) bdev_io->internal.orig_iovcnt,
							  bdev_io->u.bdev.iovs, 1,
							  bdev_io_pull_data_done_and_track,
							  bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to pull data from memory domain %s\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain));
			}
		} else {
			assert(bdev_io->u.bdev.iovcnt == 1);
			spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base,
					      bdev_io->u.bdev.iovs[0].iov_len,
					      bdev_io->internal.orig_iovs,
					      bdev_io->internal.orig_iovcnt);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
	} else {
		bdev_io_pull_data_done(bdev_io, rc);
	}
}

static void
_bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len,
			      bdev_copy_bounce_buffer_cpl cpl_cb)
{
	struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource;

	bdev_io->internal.data_transfer_cpl = cpl_cb;
	/* save original iovec */
	bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs;
	bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt;
	/* set bounce iov */
	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov;
	bdev_io->u.bdev.iovcnt = 1;
	/* set bounce buffer for this operation */
	bdev_io->u.bdev.iovs[0].iov_base = buf;
	bdev_io->u.bdev.iovs[0].iov_len = len;

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
	} else {
		bdev_io_pull_data(bdev_io);
	}
}

static void
_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	bool buf_allocated;
	uint64_t alignment;
	void *aligned_buf;

	bdev_io->internal.buf = buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		bdev_io_get_buf_complete(bdev_io, true);
		return;
	}

	alignment = spdk_bdev_get_buf_align(bdev);
	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
	aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));

	if (buf_allocated) {
		_bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl);
		/* Continue in completion callback */
		return;
	} else {
		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
	}

	_bdev_io_set_md_buf(bdev_io);
}

static inline uint64_t
bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len, alignment;

	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;

	/* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */
	alignment = spdk_bdev_get_buf_align(bdev) - 1;

	return len + alignment + md_len;
}

static void
_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
{
	struct spdk_bdev_mgmt_channel *ch;

	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
	spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len));
}

static void
bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	assert(bdev_io->internal.buf != NULL);
	_bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len);
	bdev_io->internal.buf = NULL;
}

void
spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(buf != NULL);
	_bdev_io_put_buf(bdev_io, buf, len);
}

static inline void
bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch,
		    struct spdk_bdev_io *bdev_io)
{
	/* After a request is submitted to a bdev module, the ownership of an accel sequence
	 * associated with that bdev_io is transferred to the bdev module. So, clear the internal
	 * sequence pointer to make sure we won't touch it anymore. */
	if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE ||
	     bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) {
		assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
		bdev_io->internal.accel_sequence = NULL;
	}

	bdev->fn_table->submit_request(ioch, bdev_io);
}

static inline void
bdev_ch_resubmit_io(struct spdk_bdev_shared_resource *shared_resource, struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;

	bdev_io_increment_outstanding(bdev_io->internal.ch, shared_resource);
	bdev_io->internal.error.nvme.cdw0 = 0;
	bdev_io->num_retries++;
	bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
}

static void
bdev_shared_ch_retry_io(struct spdk_bdev_shared_resource *shared_resource)
{
	struct spdk_bdev_io *bdev_io;

	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 * Some drivers (such as nvme) cannot immediately take a new I/O in
		 * the context of a completion, because the resources for the I/O are
		 * not released until control returns to the bdev poller.  Also, we
		 * may require several small I/O to complete before a larger I/O
		 * (that requires splitting) can be submitted.
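		 * Waiting until io_outstanding drops to nomem_threshold therefore avoids
		 * hammering the module with retries that are likely to fail again.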
		 */
		return;
	}

	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);

		switch (bdev_io->internal.retry_state) {
		case BDEV_IO_RETRY_STATE_SUBMIT:
			bdev_ch_resubmit_io(shared_resource, bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL:
			bdev_io_pull_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL_MD:
			bdev_io_pull_md_buf(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH:
			bdev_io_push_bounce_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH_MD:
			bdev_io_push_bounce_md_buf(bdev_io);
			break;
		default:
			assert(0 && "invalid retry state");
			break;
		}

		if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) {
			/* This IO completed again with NOMEM status, so break the loop and
			 * don't try anymore.  Note that a bdev_io that fails with NOMEM
			 * always gets requeued at the front of the list, to maintain
			 * ordering.
			 */
			break;
		}
	}
}

static void
bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	bdev_shared_ch_retry_io(bdev_ch->shared_resource);
}

static int
bdev_no_mem_poller(void *ctx)
{
	struct spdk_bdev_shared_resource *shared_resource = ctx;

	spdk_poller_unregister(&shared_resource->nomem_poller);

	if (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_shared_ch_retry_io(shared_resource);
	}
	/* the retry cb may re-register the poller so double check */
	if (!TAILQ_EMPTY(&shared_resource->nomem_io) &&
	    shared_resource->io_outstanding == 0 && shared_resource->nomem_poller == NULL) {
		/* No IOs were submitted, try again */
		shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
						SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
	}

	return SPDK_POLLER_BUSY;
}

static inline bool
_bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
		bdev_queue_nomem_io_head(shared_resource, bdev_io, state);

		if (shared_resource->io_outstanding == 0 && !shared_resource->nomem_poller) {
			/* Special case: we have nomem IOs and no outstanding IOs whose completions
			 * could trigger a retry of the queued IOs. Any I/O submitted later may trigger
			 * a retry, but this poller handles the case where no new IOs are submitted,
			 * e.g. qd == 1 */
			shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource,
							SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10);
		}
		/* If bdev module completed an I/O that has an accel sequence with NOMEM status, the
		 * ownership of that sequence is transferred back to the bdev layer, so we need to
		 * restore internal.accel_sequence to make sure that the sequence is handled
		 * correctly in case the I/O is later aborted. */
		if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
		     bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) {
			assert(bdev_io->internal.accel_sequence == NULL);
			bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence;
		}

		return true;
	}

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_ch_retry_io(bdev_ch);
	}

	return false;
}

static void
_bdev_io_complete_push_bounce_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	if (rc) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	/* We want to free the bounce buffer here since we know we're done with it (as opposed
	 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()).
	 */
	bdev_io_put_buf(bdev_io);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	/* Continue with IO completion flow */
	bdev_io_complete(bdev_io);
}

static void
bdev_io_push_bounce_md_buf_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io->internal.data_transfer_cpl(bdev_io, rc);
}

static inline void
bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
	/* do the same for metadata buffer */
	if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) {
		assert(spdk_bdev_is_md_separate(bdev_io->bdev));

		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
			if (bdev_io_use_memory_domain(bdev_io)) {
				TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
				bdev_io_increment_outstanding(ch, ch->shared_resource);
				/* If memory domain is used then we need to call async push function */
				rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
								  bdev_io->internal.memory_domain_ctx,
								  &bdev_io->internal.orig_md_iov,
								  (uint32_t)bdev_io->internal.orig_iovcnt,
								  &bdev_io->internal.bounce_md_iov, 1,
								  bdev_io_push_bounce_md_buf_done,
								  bdev_io);
				if (rc == 0) {
					/* Continue IO completion in async callback */
					return;
				}
				TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
				bdev_io_decrement_outstanding(ch, ch->shared_resource);
				if (rc != -ENOMEM) {
					SPDK_ERRLOG("Failed to push md to memory domain %s\n",
						    spdk_memory_domain_get_dma_device_id(
							    bdev_io->internal.memory_domain));
				}
			} else {
				memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf,
				       bdev_io->internal.orig_md_iov.iov_len);
			}
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD);
	} else {
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
	}
}

static inline void
bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc)
{
	assert(bdev_io->internal.data_transfer_cpl);
	if (rc) {
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
		return;
	}

	/* set original buffer for this io */
	bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt;
	bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs;
	/* disable bouncing buffer for this io */
	bdev_io->internal.orig_iovcnt = 0;
	bdev_io->internal.orig_iovs = NULL;

	bdev_io_push_bounce_md_buf(bdev_io);
}

static void
bdev_io_push_bounce_data_done_and_track(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io_push_bounce_data_done(bdev_io, status);
}

static inline void
bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
	assert(!bdev_io_use_accel_sequence(bdev_io));

	/* if this is read path, copy data from bounce buffer to original buffer */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			/* If memory domain is used then we need to call async push function */
			rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  bdev_io->internal.orig_iovs,
							  (uint32_t)bdev_io->internal.orig_iovcnt,
							  &bdev_io->internal.bounce_iov, 1,
							  bdev_io_push_bounce_data_done_and_track,
							  bdev_io);
			if (rc == 0) {
				/* Continue IO completion in async callback */
				return;
			}

			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to push data to memory domain %s\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain));
			}
		} else {
			spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs,
					      bdev_io->internal.orig_iovcnt,
					      bdev_io->internal.bounce_iov.iov_base,
					      bdev_io->internal.bounce_iov.iov_len);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH);
	} else {
		bdev_io_push_bounce_data_done(bdev_io, rc);
	}
}

static inline void
_bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb)
{
	bdev_io->internal.data_transfer_cpl = cpl_cb;
	bdev_io_push_bounce_data(bdev_io);
}

static void
bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf)
{
	struct spdk_bdev_io *bdev_io;

	bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf);
	_bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len);
}

static void
bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev_mgmt_channel *mgmt_ch;
	uint64_t max_len;
	void *buf;

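	/* Buffers come from the per-thread iobuf cache, so the request must be made on the
	 * thread that owns this I/O's channel (checked by the assert below). */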
	assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread());
	mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
	max_len = bdev_io_get_max_buf_len(bdev_io, len);

	if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) {
		SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len);
		bdev_io_get_buf_complete(bdev_io, false);
		return;
	}

	bdev_io->internal.buf_len = len;
	buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf,
			     bdev_io_get_iobuf_cb);
	if (buf != NULL) {
		_bdev_io_set_buf(bdev_io, buf, len);
	}
}

void
spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t alignment;

	assert(cb != NULL);
	bdev_io->internal.get_buf_cb = cb;

	alignment = spdk_bdev_get_buf_align(bdev);

	if (_is_buf_allocated(bdev_io->u.bdev.iovs) &&
	    _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) {
		/* Buffer already present and aligned */
		cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true);
		return;
	}

	bdev_io_get_buf(bdev_io, len);
}

static void
_bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
			      bool success)
{
	if (!success) {
		SPDK_ERRLOG("Failed to get data buffer, completing IO\n");
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io_complete_unsubmitted(bdev_io);
		return;
	}

	if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) {
		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
			bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb);
			return;
		}
		/* For reads we'll execute the sequence after the data is read, so, for now, only
		 * clear out accel_sequence pointer and submit the IO */
		assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
		bdev_io->u.bdev.accel_sequence = NULL;
	}

	bdev_io_submit(bdev_io);
}

static void
_bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb,
			       uint64_t len)
{
	assert(cb != NULL);
	bdev_io->internal.get_buf_cb = cb;

	bdev_io_get_buf(bdev_io, len);
}

void
spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(cb != NULL);
	assert(bdev_io->internal.get_aux_buf_cb == NULL);
	bdev_io->internal.get_aux_buf_cb = cb;
	bdev_io_get_buf(bdev_io, len);
}

static int
bdev_module_get_max_ctx_size(void)
{
	struct spdk_bdev_module *bdev_module;
	int max_bdev_module_size = 0;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
			max_bdev_module_size = bdev_module->get_ctx_size();
		}
	}

	return max_bdev_module_size;
}

static void
bdev_enable_histogram_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	if (!bdev->internal.histogram_enabled) {
		return;
	}

	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_enable_histogram");

	spdk_json_write_named_object_begin(w, "params");
spdk_json_write_named_string(w, "name", bdev->name); 1869 1870 spdk_json_write_named_bool(w, "enable", bdev->internal.histogram_enabled); 1871 spdk_json_write_object_end(w); 1872 1873 spdk_json_write_object_end(w); 1874 } 1875 1876 static void 1877 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1878 { 1879 int i; 1880 struct spdk_bdev_qos *qos = bdev->internal.qos; 1881 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1882 1883 if (!qos) { 1884 return; 1885 } 1886 1887 spdk_bdev_get_qos_rate_limits(bdev, limits); 1888 1889 spdk_json_write_object_begin(w); 1890 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1891 1892 spdk_json_write_named_object_begin(w, "params"); 1893 spdk_json_write_named_string(w, "name", bdev->name); 1894 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1895 if (limits[i] > 0) { 1896 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1897 } 1898 } 1899 spdk_json_write_object_end(w); 1900 1901 spdk_json_write_object_end(w); 1902 } 1903 1904 void 1905 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1906 { 1907 struct spdk_bdev_module *bdev_module; 1908 struct spdk_bdev *bdev; 1909 1910 assert(w != NULL); 1911 1912 spdk_json_write_array_begin(w); 1913 1914 spdk_json_write_object_begin(w); 1915 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1916 spdk_json_write_named_object_begin(w, "params"); 1917 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1918 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1919 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1920 spdk_json_write_named_uint32(w, "iobuf_small_cache_size", g_bdev_opts.iobuf_small_cache_size); 1921 spdk_json_write_named_uint32(w, "iobuf_large_cache_size", g_bdev_opts.iobuf_large_cache_size); 1922 spdk_json_write_object_end(w); 1923 spdk_json_write_object_end(w); 1924 1925 bdev_examine_allowlist_config_json(w); 1926 1927 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1928 if (bdev_module->config_json) { 1929 bdev_module->config_json(w); 1930 } 1931 } 1932 1933 spdk_spin_lock(&g_bdev_mgr.spinlock); 1934 1935 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1936 if (bdev->fn_table->write_config_json) { 1937 bdev->fn_table->write_config_json(bdev, w); 1938 } 1939 1940 bdev_qos_config_json(bdev, w); 1941 bdev_enable_histogram_config_json(bdev, w); 1942 } 1943 1944 spdk_spin_unlock(&g_bdev_mgr.spinlock); 1945 1946 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1947 spdk_json_write_object_begin(w); 1948 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1949 spdk_json_write_object_end(w); 1950 1951 spdk_json_write_array_end(w); 1952 } 1953 1954 static void 1955 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1956 { 1957 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1958 struct spdk_bdev_io *bdev_io; 1959 1960 spdk_iobuf_channel_fini(&ch->iobuf); 1961 1962 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1963 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1964 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1965 ch->per_thread_cache_count--; 1966 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1967 } 1968 1969 assert(ch->per_thread_cache_count == 0); 1970 } 1971 1972 static int 1973 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1974 { 1975 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1976 struct 
spdk_bdev_io *bdev_io; 1977 uint32_t i; 1978 int rc; 1979 1980 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", 1981 g_bdev_opts.iobuf_small_cache_size, 1982 g_bdev_opts.iobuf_large_cache_size); 1983 if (rc != 0) { 1984 SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 1985 return -1; 1986 } 1987 1988 STAILQ_INIT(&ch->per_thread_cache); 1989 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 1990 1991 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 1992 ch->per_thread_cache_count = 0; 1993 for (i = 0; i < ch->bdev_io_cache_size; i++) { 1994 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1995 if (bdev_io == NULL) { 1996 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 1997 assert(false); 1998 bdev_mgmt_channel_destroy(io_device, ctx_buf); 1999 return -1; 2000 } 2001 ch->per_thread_cache_count++; 2002 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2003 } 2004 2005 TAILQ_INIT(&ch->shared_resources); 2006 TAILQ_INIT(&ch->io_wait_queue); 2007 2008 return 0; 2009 } 2010 2011 static void 2012 bdev_init_complete(int rc) 2013 { 2014 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 2015 void *cb_arg = g_init_cb_arg; 2016 struct spdk_bdev_module *m; 2017 2018 g_bdev_mgr.init_complete = true; 2019 g_init_cb_fn = NULL; 2020 g_init_cb_arg = NULL; 2021 2022 /* 2023 * For modules that need to know when subsystem init is complete, 2024 * inform them now. 2025 */ 2026 if (rc == 0) { 2027 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2028 if (m->init_complete) { 2029 m->init_complete(); 2030 } 2031 } 2032 } 2033 2034 cb_fn(cb_arg, rc); 2035 } 2036 2037 static bool 2038 bdev_module_all_actions_completed(void) 2039 { 2040 struct spdk_bdev_module *m; 2041 2042 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2043 if (m->internal.action_in_progress > 0) { 2044 return false; 2045 } 2046 } 2047 return true; 2048 } 2049 2050 static void 2051 bdev_module_action_complete(void) 2052 { 2053 /* 2054 * Don't finish bdev subsystem initialization if 2055 * module pre-initialization is still in progress, or 2056 * the subsystem been already initialized. 2057 */ 2058 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 2059 return; 2060 } 2061 2062 /* 2063 * Check all bdev modules for inits/examinations in progress. If any 2064 * exist, return immediately since we cannot finish bdev subsystem 2065 * initialization until all are completed. 2066 */ 2067 if (!bdev_module_all_actions_completed()) { 2068 return; 2069 } 2070 2071 /* 2072 * Modules already finished initialization - now that all 2073 * the bdev modules have finished their asynchronous I/O 2074 * processing, the entire bdev layer can be marked as complete. 
2075 */ 2076 bdev_init_complete(0); 2077 } 2078 2079 static void 2080 bdev_module_action_done(struct spdk_bdev_module *module) 2081 { 2082 spdk_spin_lock(&module->internal.spinlock); 2083 assert(module->internal.action_in_progress > 0); 2084 module->internal.action_in_progress--; 2085 spdk_spin_unlock(&module->internal.spinlock); 2086 bdev_module_action_complete(); 2087 } 2088 2089 void 2090 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 2091 { 2092 assert(module->async_init); 2093 bdev_module_action_done(module); 2094 } 2095 2096 void 2097 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 2098 { 2099 bdev_module_action_done(module); 2100 } 2101 2102 /** The last initialized bdev module */ 2103 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 2104 2105 static void 2106 bdev_init_failed(void *cb_arg) 2107 { 2108 struct spdk_bdev_module *module = cb_arg; 2109 2110 spdk_spin_lock(&module->internal.spinlock); 2111 assert(module->internal.action_in_progress > 0); 2112 module->internal.action_in_progress--; 2113 spdk_spin_unlock(&module->internal.spinlock); 2114 bdev_init_complete(-1); 2115 } 2116 2117 static int 2118 bdev_modules_init(void) 2119 { 2120 struct spdk_bdev_module *module; 2121 int rc = 0; 2122 2123 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2124 g_resume_bdev_module = module; 2125 if (module->async_init) { 2126 spdk_spin_lock(&module->internal.spinlock); 2127 module->internal.action_in_progress = 1; 2128 spdk_spin_unlock(&module->internal.spinlock); 2129 } 2130 rc = module->module_init(); 2131 if (rc != 0) { 2132 /* Bump action_in_progress to prevent other modules from completion of modules_init 2133 * Send message to defer application shutdown until resources are cleaned up */ 2134 spdk_spin_lock(&module->internal.spinlock); 2135 module->internal.action_in_progress = 1; 2136 spdk_spin_unlock(&module->internal.spinlock); 2137 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 2138 return rc; 2139 } 2140 } 2141 2142 g_resume_bdev_module = NULL; 2143 return 0; 2144 } 2145 2146 void 2147 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 2148 { 2149 int rc = 0; 2150 char mempool_name[32]; 2151 2152 assert(cb_fn != NULL); 2153 2154 g_init_cb_fn = cb_fn; 2155 g_init_cb_arg = cb_arg; 2156 2157 spdk_notify_type_register("bdev_register"); 2158 spdk_notify_type_register("bdev_unregister"); 2159 2160 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2161 2162 rc = spdk_iobuf_register_module("bdev"); 2163 if (rc != 0) { 2164 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2165 bdev_init_complete(-1); 2166 return; 2167 } 2168 2169 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2170 g_bdev_opts.bdev_io_pool_size, 2171 sizeof(struct spdk_bdev_io) + 2172 bdev_module_get_max_ctx_size(), 2173 0, 2174 SPDK_ENV_SOCKET_ID_ANY); 2175 2176 if (g_bdev_mgr.bdev_io_pool == NULL) { 2177 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2178 bdev_init_complete(-1); 2179 return; 2180 } 2181 2182 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2183 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2184 if (!g_bdev_mgr.zero_buffer) { 2185 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2186 bdev_init_complete(-1); 2187 return; 2188 } 2189 2190 #ifdef SPDK_CONFIG_VTUNE 2191 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2192 #endif 2193 2194 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 2195 
bdev_mgmt_channel_destroy, 2196 sizeof(struct spdk_bdev_mgmt_channel), 2197 "bdev_mgr"); 2198 2199 rc = bdev_modules_init(); 2200 g_bdev_mgr.module_init_complete = true; 2201 if (rc != 0) { 2202 SPDK_ERRLOG("bdev modules init failed\n"); 2203 return; 2204 } 2205 2206 bdev_module_action_complete(); 2207 } 2208 2209 static void 2210 bdev_mgr_unregister_cb(void *io_device) 2211 { 2212 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2213 2214 if (g_bdev_mgr.bdev_io_pool) { 2215 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2216 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2217 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2218 g_bdev_opts.bdev_io_pool_size); 2219 } 2220 2221 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2222 } 2223 2224 spdk_free(g_bdev_mgr.zero_buffer); 2225 2226 bdev_examine_allowlist_free(); 2227 2228 cb_fn(g_fini_cb_arg); 2229 g_fini_cb_fn = NULL; 2230 g_fini_cb_arg = NULL; 2231 g_bdev_mgr.init_complete = false; 2232 g_bdev_mgr.module_init_complete = false; 2233 } 2234 2235 static void 2236 bdev_module_fini_iter(void *arg) 2237 { 2238 struct spdk_bdev_module *bdev_module; 2239 2240 /* FIXME: Handling initialization failures is broken now, 2241 * so we won't even try cleaning up after successfully 2242 * initialized modules. if module_init_complete is false, 2243 * just call spdk_bdev_mgr_unregister_cb 2244 */ 2245 if (!g_bdev_mgr.module_init_complete) { 2246 bdev_mgr_unregister_cb(NULL); 2247 return; 2248 } 2249 2250 /* Start iterating from the last touched module */ 2251 if (!g_resume_bdev_module) { 2252 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2253 } else { 2254 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2255 internal.tailq); 2256 } 2257 2258 while (bdev_module) { 2259 if (bdev_module->async_fini) { 2260 /* Save our place so we can resume later. We must 2261 * save the variable here, before calling module_fini() 2262 * below, because in some cases the module may immediately 2263 * call spdk_bdev_module_fini_done() and re-enter 2264 * this function to continue iterating. */ 2265 g_resume_bdev_module = bdev_module; 2266 } 2267 2268 if (bdev_module->module_fini) { 2269 bdev_module->module_fini(); 2270 } 2271 2272 if (bdev_module->async_fini) { 2273 return; 2274 } 2275 2276 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2277 internal.tailq); 2278 } 2279 2280 g_resume_bdev_module = NULL; 2281 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2282 } 2283 2284 void 2285 spdk_bdev_module_fini_done(void) 2286 { 2287 if (spdk_get_thread() != g_fini_thread) { 2288 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2289 } else { 2290 bdev_module_fini_iter(NULL); 2291 } 2292 } 2293 2294 static void 2295 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2296 { 2297 struct spdk_bdev *bdev = cb_arg; 2298 2299 if (bdeverrno && bdev) { 2300 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2301 bdev->name); 2302 2303 /* 2304 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2305 * bdev; try to continue by manually removing this bdev from the list and continue 2306 * with the next bdev in the list. 
2307 */ 2308 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2309 } 2310 2311 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2312 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2313 /* 2314 * Bdev module finish need to be deferred as we might be in the middle of some context 2315 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2316 * after returning. 2317 */ 2318 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2319 return; 2320 } 2321 2322 /* 2323 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 2324 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2325 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2326 * base bdevs. 2327 * 2328 * Also, walk the list in the reverse order. 2329 */ 2330 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2331 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2332 spdk_spin_lock(&bdev->internal.spinlock); 2333 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2334 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2335 spdk_spin_unlock(&bdev->internal.spinlock); 2336 continue; 2337 } 2338 spdk_spin_unlock(&bdev->internal.spinlock); 2339 2340 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2341 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2342 return; 2343 } 2344 2345 /* 2346 * If any bdev fails to unclaim underlying bdev properly, we may face the 2347 * case of bdev list consisting of claimed bdevs only (if claims are managed 2348 * correctly, this would mean there's a loop in the claims graph which is 2349 * clearly impossible). Warn and unregister last bdev on the list then. 2350 */ 2351 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2352 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2353 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2354 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2355 return; 2356 } 2357 } 2358 2359 static void 2360 bdev_module_fini_start_iter(void *arg) 2361 { 2362 struct spdk_bdev_module *bdev_module; 2363 2364 if (!g_resume_bdev_module) { 2365 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2366 } else { 2367 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2368 } 2369 2370 while (bdev_module) { 2371 if (bdev_module->async_fini_start) { 2372 /* Save our place so we can resume later. We must 2373 * save the variable here, before calling fini_start() 2374 * below, because in some cases the module may immediately 2375 * call spdk_bdev_module_fini_start_done() and re-enter 2376 * this function to continue iterating. 
*/ 2377 g_resume_bdev_module = bdev_module; 2378 } 2379 2380 if (bdev_module->fini_start) { 2381 bdev_module->fini_start(); 2382 } 2383 2384 if (bdev_module->async_fini_start) { 2385 return; 2386 } 2387 2388 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2389 } 2390 2391 g_resume_bdev_module = NULL; 2392 2393 bdev_finish_unregister_bdevs_iter(NULL, 0); 2394 } 2395 2396 void 2397 spdk_bdev_module_fini_start_done(void) 2398 { 2399 if (spdk_get_thread() != g_fini_thread) { 2400 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2401 } else { 2402 bdev_module_fini_start_iter(NULL); 2403 } 2404 } 2405 2406 static void 2407 bdev_finish_wait_for_examine_done(void *cb_arg) 2408 { 2409 bdev_module_fini_start_iter(NULL); 2410 } 2411 2412 static void bdev_open_async_fini(void); 2413 2414 void 2415 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2416 { 2417 int rc; 2418 2419 assert(cb_fn != NULL); 2420 2421 g_fini_thread = spdk_get_thread(); 2422 2423 g_fini_cb_fn = cb_fn; 2424 g_fini_cb_arg = cb_arg; 2425 2426 bdev_open_async_fini(); 2427 2428 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2429 if (rc != 0) { 2430 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2431 bdev_finish_wait_for_examine_done(NULL); 2432 } 2433 } 2434 2435 struct spdk_bdev_io * 2436 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2437 { 2438 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2439 struct spdk_bdev_io *bdev_io; 2440 2441 if (ch->per_thread_cache_count > 0) { 2442 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2443 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2444 ch->per_thread_cache_count--; 2445 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2446 /* 2447 * Don't try to look for bdev_ios in the global pool if there are 2448 * waiters on bdev_ios - we don't want this caller to jump the line. 2449 */ 2450 bdev_io = NULL; 2451 } else { 2452 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2453 } 2454 2455 return bdev_io; 2456 } 2457 2458 void 2459 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2460 { 2461 struct spdk_bdev_mgmt_channel *ch; 2462 2463 assert(bdev_io != NULL); 2464 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2465 2466 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2467 2468 if (bdev_io->internal.buf != NULL) { 2469 bdev_io_put_buf(bdev_io); 2470 } 2471 2472 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2473 ch->per_thread_cache_count++; 2474 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2475 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2476 struct spdk_bdev_io_wait_entry *entry; 2477 2478 entry = TAILQ_FIRST(&ch->io_wait_queue); 2479 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2480 entry->cb_fn(entry->cb_arg); 2481 } 2482 } else { 2483 /* We should never have a full cache with entries on the io wait queue. 
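		 * Waiters are serviced as soon as an entry is returned to the cache above,
		 * so a full cache implies an empty wait queue.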
		 */
		assert(TAILQ_EMPTY(&ch->io_wait_queue));
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}
}

static bool
bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit)
{
	assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);

	switch (limit) {
	case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
		return true;
	case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
	case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
	case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
		return false;
	case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES:
	default:
		return false;
	}
}

static bool
bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io)
{
	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_NVME_IO:
	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
		return true;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		if (bdev_io->u.bdev.zcopy.start) {
			return true;
		} else {
			return false;
		}
	default:
		return false;
	}
}

static bool
bdev_is_read_io(struct spdk_bdev_io *bdev_io)
{
	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_NVME_IO:
	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
		/* Bit 1 (0x2) is set for read operations */
		if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) {
			return true;
		} else {
			return false;
		}
	case SPDK_BDEV_IO_TYPE_READ:
		return true;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		/* Populate to read from disk */
		if (bdev_io->u.bdev.zcopy.populate) {
			return true;
		} else {
			return false;
		}
	default:
		return false;
	}
}

static uint64_t
bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_NVME_IO:
	case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
		return bdev_io->u.nvme_passthru.nbytes;
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
		return bdev_io->u.bdev.num_blocks * bdev->blocklen;
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		/* Track the data in the start phase only */
		if (bdev_io->u.bdev.zcopy.start) {
			return bdev_io->u.bdev.num_blocks * bdev->blocklen;
		} else {
			return 0;
		}
	default:
		return 0;
	}
}

static inline bool
bdev_qos_rw_queue_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta)
{
	int64_t remaining_this_timeslice;

	if (!limit->max_per_timeslice) {
		/* The QoS is disabled */
		return false;
	}

	remaining_this_timeslice = __atomic_sub_fetch(&limit->remaining_this_timeslice, delta,
						      __ATOMIC_RELAXED);
	if (remaining_this_timeslice + (int64_t)delta > 0) {
		/* There was still quota left for this delta -> the IO shouldn't be queued.
		 *
		 * We allow a slight quota overrun here so an IO bigger than the per-timeslice
		 * quota can be allowed once in a while. Such an overrun is then taken into
		 * account in the QoS poller, where the next timeslice quota is calculated.
		 */
		return false;
	}

	/* There was no quota for this delta -> the IO should be queued.
	 * The remaining_this_timeslice must be rewound so it reflects the real
	 * amount of IOs or bytes allowed.
2602 */ 2603 __atomic_add_fetch( 2604 &limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2605 return true; 2606 } 2607 2608 static inline void 2609 bdev_qos_rw_rewind_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2610 { 2611 __atomic_add_fetch(&limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2612 } 2613 2614 static bool 2615 bdev_qos_rw_iops_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2616 { 2617 return bdev_qos_rw_queue_io(limit, io, 1); 2618 } 2619 2620 static void 2621 bdev_qos_rw_iops_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2622 { 2623 bdev_qos_rw_rewind_io(limit, io, 1); 2624 } 2625 2626 static bool 2627 bdev_qos_rw_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2628 { 2629 return bdev_qos_rw_queue_io(limit, io, bdev_get_io_size_in_byte(io)); 2630 } 2631 2632 static void 2633 bdev_qos_rw_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2634 { 2635 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2636 } 2637 2638 static bool 2639 bdev_qos_r_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2640 { 2641 if (bdev_is_read_io(io) == false) { 2642 return false; 2643 } 2644 2645 return bdev_qos_rw_bps_queue(limit, io); 2646 } 2647 2648 static void 2649 bdev_qos_r_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2650 { 2651 if (bdev_is_read_io(io) != false) { 2652 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2653 } 2654 } 2655 2656 static bool 2657 bdev_qos_w_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2658 { 2659 if (bdev_is_read_io(io) == true) { 2660 return false; 2661 } 2662 2663 return bdev_qos_rw_bps_queue(limit, io); 2664 } 2665 2666 static void 2667 bdev_qos_w_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2668 { 2669 if (bdev_is_read_io(io) != true) { 2670 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2671 } 2672 } 2673 2674 static void 2675 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2676 { 2677 int i; 2678 2679 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2680 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2681 qos->rate_limits[i].queue_io = NULL; 2682 continue; 2683 } 2684 2685 switch (i) { 2686 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2687 qos->rate_limits[i].queue_io = bdev_qos_rw_iops_queue; 2688 qos->rate_limits[i].rewind_quota = bdev_qos_rw_iops_rewind_quota; 2689 break; 2690 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2691 qos->rate_limits[i].queue_io = bdev_qos_rw_bps_queue; 2692 qos->rate_limits[i].rewind_quota = bdev_qos_rw_bps_rewind_quota; 2693 break; 2694 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2695 qos->rate_limits[i].queue_io = bdev_qos_r_bps_queue; 2696 qos->rate_limits[i].rewind_quota = bdev_qos_r_bps_rewind_quota; 2697 break; 2698 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2699 qos->rate_limits[i].queue_io = bdev_qos_w_bps_queue; 2700 qos->rate_limits[i].rewind_quota = bdev_qos_w_bps_rewind_quota; 2701 break; 2702 default: 2703 break; 2704 } 2705 } 2706 } 2707 2708 static void 2709 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2710 struct spdk_bdev_io *bdev_io, 2711 enum spdk_bdev_io_status status) 2712 { 2713 bdev_io->internal.in_submit_request = true; 2714 bdev_io_increment_outstanding(bdev_ch, bdev_ch->shared_resource); 2715 spdk_bdev_io_complete(bdev_io, status); 2716 bdev_io->internal.in_submit_request = false; 2717 
} 2718 2719 static inline void 2720 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2721 { 2722 struct spdk_bdev *bdev = bdev_io->bdev; 2723 struct spdk_io_channel *ch = bdev_ch->channel; 2724 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2725 2726 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2727 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2728 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2729 2730 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2731 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2732 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2733 SPDK_BDEV_IO_STATUS_SUCCESS); 2734 return; 2735 } 2736 } 2737 2738 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2739 bdev_io->bdev->split_on_write_unit && 2740 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2741 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2742 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2743 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2744 return; 2745 } 2746 2747 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2748 bdev_io_increment_outstanding(bdev_ch, shared_resource); 2749 bdev_io->internal.in_submit_request = true; 2750 bdev_submit_request(bdev, ch, bdev_io); 2751 bdev_io->internal.in_submit_request = false; 2752 } else { 2753 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT); 2754 if (shared_resource->nomem_threshold == 0 && shared_resource->io_outstanding == 0) { 2755 /* Special case when we have nomem IOs and no outstanding IOs which completions 2756 * could trigger retry of queued IOs */ 2757 bdev_shared_ch_retry_io(shared_resource); 2758 } 2759 } 2760 } 2761 2762 static bool 2763 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2764 { 2765 int i; 2766 2767 if (bdev_qos_io_to_limit(bdev_io) == true) { 2768 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2769 if (!qos->rate_limits[i].queue_io) { 2770 continue; 2771 } 2772 2773 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2774 bdev_io) == true) { 2775 for (i -= 1; i >= 0 ; i--) { 2776 if (!qos->rate_limits[i].queue_io) { 2777 continue; 2778 } 2779 2780 qos->rate_limits[i].rewind_quota(&qos->rate_limits[i], bdev_io); 2781 } 2782 return true; 2783 } 2784 } 2785 } 2786 2787 return false; 2788 } 2789 2790 static int 2791 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2792 { 2793 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2794 int submitted_ios = 0; 2795 2796 TAILQ_FOREACH_SAFE(bdev_io, &ch->qos_queued_io, internal.link, tmp) { 2797 if (!bdev_qos_queue_io(qos, bdev_io)) { 2798 TAILQ_REMOVE(&ch->qos_queued_io, bdev_io, internal.link); 2799 bdev_io_do_submit(ch, bdev_io); 2800 2801 submitted_ios++; 2802 } 2803 } 2804 2805 return submitted_ios; 2806 } 2807 2808 static void 2809 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2810 { 2811 int rc; 2812 2813 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2814 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2815 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2816 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2817 &bdev_io->internal.waitq_entry); 2818 if (rc != 0) { 2819 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2820 bdev_io->internal.status = 
SPDK_BDEV_IO_STATUS_FAILED; 2821 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2822 } 2823 } 2824 2825 static bool 2826 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2827 { 2828 uint32_t io_boundary; 2829 struct spdk_bdev *bdev = bdev_io->bdev; 2830 uint32_t max_segment_size = bdev->max_segment_size; 2831 uint32_t max_size = bdev->max_rw_size; 2832 int max_segs = bdev->max_num_segments; 2833 2834 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2835 io_boundary = bdev->write_unit_size; 2836 } else if (bdev->split_on_optimal_io_boundary) { 2837 io_boundary = bdev->optimal_io_boundary; 2838 } else { 2839 io_boundary = 0; 2840 } 2841 2842 if (spdk_likely(!io_boundary && !max_segs && !max_segment_size && !max_size)) { 2843 return false; 2844 } 2845 2846 if (io_boundary) { 2847 uint64_t start_stripe, end_stripe; 2848 2849 start_stripe = bdev_io->u.bdev.offset_blocks; 2850 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2851 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 2852 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2853 start_stripe >>= spdk_u32log2(io_boundary); 2854 end_stripe >>= spdk_u32log2(io_boundary); 2855 } else { 2856 start_stripe /= io_boundary; 2857 end_stripe /= io_boundary; 2858 } 2859 2860 if (start_stripe != end_stripe) { 2861 return true; 2862 } 2863 } 2864 2865 if (max_segs) { 2866 if (bdev_io->u.bdev.iovcnt > max_segs) { 2867 return true; 2868 } 2869 } 2870 2871 if (max_segment_size) { 2872 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2873 if (bdev_io->u.bdev.iovs[i].iov_len > max_segment_size) { 2874 return true; 2875 } 2876 } 2877 } 2878 2879 if (max_size) { 2880 if (bdev_io->u.bdev.num_blocks > max_size) { 2881 return true; 2882 } 2883 } 2884 2885 return false; 2886 } 2887 2888 static bool 2889 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2890 { 2891 uint32_t num_unmap_segments; 2892 2893 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2894 return false; 2895 } 2896 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2897 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2898 return true; 2899 } 2900 2901 return false; 2902 } 2903 2904 static bool 2905 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2906 { 2907 if (!bdev_io->bdev->max_write_zeroes) { 2908 return false; 2909 } 2910 2911 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2912 return true; 2913 } 2914 2915 return false; 2916 } 2917 2918 static bool 2919 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2920 { 2921 if (bdev_io->bdev->max_copy != 0 && 2922 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2923 return true; 2924 } 2925 2926 return false; 2927 } 2928 2929 static bool 2930 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2931 { 2932 switch (bdev_io->type) { 2933 case SPDK_BDEV_IO_TYPE_READ: 2934 case SPDK_BDEV_IO_TYPE_WRITE: 2935 return bdev_rw_should_split(bdev_io); 2936 case SPDK_BDEV_IO_TYPE_UNMAP: 2937 return bdev_unmap_should_split(bdev_io); 2938 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2939 return bdev_write_zeroes_should_split(bdev_io); 2940 case SPDK_BDEV_IO_TYPE_COPY: 2941 return bdev_copy_should_split(bdev_io); 2942 default: 2943 return false; 2944 } 2945 } 2946 2947 static uint32_t 2948 _to_next_boundary(uint64_t offset, uint32_t boundary) 2949 { 2950 return (boundary - (offset % boundary)); 2951 } 2952 2953 static void 
bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2954 2955 static void _bdev_rw_split(void *_bdev_io); 2956 2957 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2958 2959 static void 2960 _bdev_unmap_split(void *_bdev_io) 2961 { 2962 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2963 } 2964 2965 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2966 2967 static void 2968 _bdev_write_zeroes_split(void *_bdev_io) 2969 { 2970 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2971 } 2972 2973 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 2974 2975 static void 2976 _bdev_copy_split(void *_bdev_io) 2977 { 2978 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 2979 } 2980 2981 static int 2982 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2983 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2984 { 2985 int rc; 2986 uint64_t current_offset, current_remaining, current_src_offset; 2987 spdk_bdev_io_wait_cb io_wait_fn; 2988 2989 current_offset = *offset; 2990 current_remaining = *remaining; 2991 2992 bdev_io->u.bdev.split_outstanding++; 2993 2994 io_wait_fn = _bdev_rw_split; 2995 switch (bdev_io->type) { 2996 case SPDK_BDEV_IO_TYPE_READ: 2997 assert(bdev_io->u.bdev.accel_sequence == NULL); 2998 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2999 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3000 iov, iovcnt, md_buf, current_offset, 3001 num_blocks, bdev_io->internal.memory_domain, 3002 bdev_io->internal.memory_domain_ctx, NULL, 3003 bdev_io->u.bdev.dif_check_flags, 3004 bdev_io_split_done, bdev_io); 3005 break; 3006 case SPDK_BDEV_IO_TYPE_WRITE: 3007 assert(bdev_io->u.bdev.accel_sequence == NULL); 3008 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 3009 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3010 iov, iovcnt, md_buf, current_offset, 3011 num_blocks, bdev_io->internal.memory_domain, 3012 bdev_io->internal.memory_domain_ctx, NULL, 3013 bdev_io->u.bdev.dif_check_flags, 3014 bdev_io->u.bdev.nvme_cdw12.raw, 3015 bdev_io->u.bdev.nvme_cdw13.raw, 3016 bdev_io_split_done, bdev_io); 3017 break; 3018 case SPDK_BDEV_IO_TYPE_UNMAP: 3019 io_wait_fn = _bdev_unmap_split; 3020 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 3021 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3022 current_offset, num_blocks, 3023 bdev_io_split_done, bdev_io); 3024 break; 3025 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3026 io_wait_fn = _bdev_write_zeroes_split; 3027 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 3028 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3029 current_offset, num_blocks, 3030 bdev_io_split_done, bdev_io); 3031 break; 3032 case SPDK_BDEV_IO_TYPE_COPY: 3033 io_wait_fn = _bdev_copy_split; 3034 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 3035 (current_offset - bdev_io->u.bdev.offset_blocks); 3036 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 3037 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3038 current_offset, current_src_offset, num_blocks, 3039 bdev_io_split_done, bdev_io); 3040 break; 3041 default: 3042 assert(false); 3043 rc = -EINVAL; 3044 break; 3045 } 3046 3047 if (rc == 0) { 3048 current_offset += num_blocks; 3049 current_remaining -= num_blocks; 3050 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 3051 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 3052 *offset = current_offset; 3053 *remaining = current_remaining; 3054 } else { 3055 
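		/* Child submission failed: undo the outstanding count taken above. On
		 * -ENOMEM, wait for resources once no children are in flight; any other
		 * error fails the parent I/O. */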
bdev_io->u.bdev.split_outstanding--; 3056 if (rc == -ENOMEM) { 3057 if (bdev_io->u.bdev.split_outstanding == 0) { 3058 /* No I/O is outstanding. Hence we should wait here. */ 3059 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 3060 } 3061 } else { 3062 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3063 if (bdev_io->u.bdev.split_outstanding == 0) { 3064 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 3065 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 3066 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3067 } 3068 } 3069 } 3070 3071 return rc; 3072 } 3073 3074 static void 3075 _bdev_rw_split(void *_bdev_io) 3076 { 3077 struct iovec *parent_iov, *iov; 3078 struct spdk_bdev_io *bdev_io = _bdev_io; 3079 struct spdk_bdev *bdev = bdev_io->bdev; 3080 uint64_t parent_offset, current_offset, remaining; 3081 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 3082 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 3083 uint32_t iovcnt, iov_len, child_iovsize; 3084 uint32_t blocklen = bdev->blocklen; 3085 uint32_t io_boundary; 3086 uint32_t max_segment_size = bdev->max_segment_size; 3087 uint32_t max_child_iovcnt = bdev->max_num_segments; 3088 uint32_t max_size = bdev->max_rw_size; 3089 void *md_buf = NULL; 3090 int rc; 3091 3092 max_size = max_size ? max_size : UINT32_MAX; 3093 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 3094 max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 3095 SPDK_BDEV_IO_NUM_CHILD_IOV; 3096 3097 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 3098 io_boundary = bdev->write_unit_size; 3099 } else if (bdev->split_on_optimal_io_boundary) { 3100 io_boundary = bdev->optimal_io_boundary; 3101 } else { 3102 io_boundary = UINT32_MAX; 3103 } 3104 3105 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3106 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 3107 parent_offset = bdev_io->u.bdev.offset_blocks; 3108 parent_iov_offset = (current_offset - parent_offset) * blocklen; 3109 parent_iovcnt = bdev_io->u.bdev.iovcnt; 3110 3111 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 3112 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3113 if (parent_iov_offset < parent_iov->iov_len) { 3114 break; 3115 } 3116 parent_iov_offset -= parent_iov->iov_len; 3117 } 3118 3119 child_iovcnt = 0; 3120 while (remaining > 0 && parent_iovpos < parent_iovcnt && 3121 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 3122 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3123 to_next_boundary = spdk_min(remaining, to_next_boundary); 3124 to_next_boundary = spdk_min(max_size, to_next_boundary); 3125 to_next_boundary_bytes = to_next_boundary * blocklen; 3126 3127 iov = &bdev_io->child_iov[child_iovcnt]; 3128 iovcnt = 0; 3129 3130 if (bdev_io->u.bdev.md_buf) { 3131 md_buf = (char *)bdev_io->u.bdev.md_buf + 3132 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3133 } 3134 3135 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3136 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3137 iovcnt < child_iovsize) { 3138 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3139 iov_len = parent_iov->iov_len - parent_iov_offset; 3140 3141 iov_len = spdk_min(iov_len, max_segment_size); 3142 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 3143 
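			/* Charge this fragment against the bytes remaining until the boundary
			 * and record it as the next child iov element. */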
			to_next_boundary_bytes -= iov_len;

			bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset;
			bdev_io->child_iov[child_iovcnt].iov_len = iov_len;

			if (iov_len < parent_iov->iov_len - parent_iov_offset) {
				parent_iov_offset += iov_len;
			} else {
				parent_iovpos++;
				parent_iov_offset = 0;
			}
			child_iovcnt++;
			iovcnt++;
		}

		if (to_next_boundary_bytes > 0) {
			/* We had to stop this child I/O early because we ran out of
			 * child_iov space or were limited by max_num_segments.
			 * Ensure the iovs are aligned with the block size and
			 * then adjust to_next_boundary before starting the
			 * child I/O.
			 */
			assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV ||
			       iovcnt == child_iovsize);
			to_last_block_bytes = to_next_boundary_bytes % blocklen;
			if (to_last_block_bytes != 0) {
				uint32_t child_iovpos = child_iovcnt - 1;
				/* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV
				 * so the loop will naturally end
				 */

				to_last_block_bytes = blocklen - to_last_block_bytes;
				to_next_boundary_bytes += to_last_block_bytes;
				while (to_last_block_bytes > 0 && iovcnt > 0) {
					iov_len = spdk_min(to_last_block_bytes,
							   bdev_io->child_iov[child_iovpos].iov_len);
					bdev_io->child_iov[child_iovpos].iov_len -= iov_len;
					if (bdev_io->child_iov[child_iovpos].iov_len == 0) {
						child_iovpos--;
						if (--iovcnt == 0) {
							/* If the child I/O is smaller than a block size, just return.
							 * If the first child I/O of any split round is smaller than
							 * a block size, fail the parent I/O with an error.
							 */
							if (bdev_io->u.bdev.split_outstanding == 0) {
								SPDK_ERRLOG("The first child io was less than a block size\n");
								bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
								spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx);
								TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
								bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
							}

							return;
						}
					}

					to_last_block_bytes -= iov_len;

					if (parent_iov_offset == 0) {
						parent_iovpos--;
						parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len;
					}
					parent_iov_offset -= iov_len;
				}

				assert(to_last_block_bytes == 0);
			}
			to_next_boundary -= to_next_boundary_bytes / blocklen;
		}

		rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary,
					  &current_offset, &remaining);
		if (spdk_unlikely(rc)) {
			return;
		}
	}
}

static void
bdev_unmap_split(struct spdk_bdev_io *bdev_io)
{
	uint64_t offset, unmap_blocks, remaining, max_unmap_blocks;
	uint32_t num_children_reqs = 0;
	int rc;

	offset = bdev_io->u.bdev.split_current_offset_blocks;
	remaining = bdev_io->u.bdev.split_remaining_num_blocks;
	max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments;

	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
		unmap_blocks = spdk_min(remaining, max_unmap_blocks);

		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks,
					  &offset, &remaining);
		if (spdk_likely(rc == 0)) {
			num_children_reqs++;
		} else {
			return;
		}
	}
}

static void
bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io)
{
	uint64_t offset,
write_zeroes_blocks, remaining; 3249 uint32_t num_children_reqs = 0; 3250 int rc; 3251 3252 offset = bdev_io->u.bdev.split_current_offset_blocks; 3253 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3254 3255 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3256 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 3257 3258 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 3259 &offset, &remaining); 3260 if (spdk_likely(rc == 0)) { 3261 num_children_reqs++; 3262 } else { 3263 return; 3264 } 3265 } 3266 } 3267 3268 static void 3269 bdev_copy_split(struct spdk_bdev_io *bdev_io) 3270 { 3271 uint64_t offset, copy_blocks, remaining; 3272 uint32_t num_children_reqs = 0; 3273 int rc; 3274 3275 offset = bdev_io->u.bdev.split_current_offset_blocks; 3276 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3277 3278 assert(bdev_io->bdev->max_copy != 0); 3279 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3280 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3281 3282 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3283 &offset, &remaining); 3284 if (spdk_likely(rc == 0)) { 3285 num_children_reqs++; 3286 } else { 3287 return; 3288 } 3289 } 3290 } 3291 3292 static void 3293 parent_bdev_io_complete(void *ctx, int rc) 3294 { 3295 struct spdk_bdev_io *parent_io = ctx; 3296 3297 if (rc) { 3298 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3299 } 3300 3301 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3302 parent_io->internal.caller_ctx); 3303 } 3304 3305 static void 3306 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3307 { 3308 struct spdk_bdev_io *bdev_io = ctx; 3309 3310 /* u.bdev.accel_sequence should have already been cleared at this point */ 3311 assert(bdev_io->u.bdev.accel_sequence == NULL); 3312 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3313 bdev_io->internal.accel_sequence = NULL; 3314 3315 if (spdk_unlikely(status != 0)) { 3316 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3317 } 3318 3319 parent_bdev_io_complete(bdev_io, status); 3320 } 3321 3322 static void 3323 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3324 { 3325 struct spdk_bdev_io *parent_io = cb_arg; 3326 3327 spdk_bdev_free_io(bdev_io); 3328 3329 if (!success) { 3330 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3331 /* If any child I/O failed, stop further splitting process. */ 3332 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 3333 parent_io->u.bdev.split_remaining_num_blocks = 0; 3334 } 3335 parent_io->u.bdev.split_outstanding--; 3336 if (parent_io->u.bdev.split_outstanding != 0) { 3337 return; 3338 } 3339 3340 /* 3341 * Parent I/O finishes when all blocks are consumed. 
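	 * (split_remaining_num_blocks reaches zero) and no child I/O remains outstanding.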
3342 */ 3343 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 3344 assert(parent_io->internal.cb != bdev_io_split_done); 3345 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 3346 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 3347 3348 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3349 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3350 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3351 return; 3352 } else if (parent_io->internal.orig_iovcnt != 0 && 3353 !bdev_io_use_accel_sequence(bdev_io)) { 3354 /* bdev IO will be completed in the callback */ 3355 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3356 return; 3357 } 3358 } 3359 3360 parent_bdev_io_complete(parent_io, 0); 3361 return; 3362 } 3363 3364 /* 3365 * Continue with the splitting process. This function will complete the parent I/O if the 3366 * splitting is done. 3367 */ 3368 switch (parent_io->type) { 3369 case SPDK_BDEV_IO_TYPE_READ: 3370 case SPDK_BDEV_IO_TYPE_WRITE: 3371 _bdev_rw_split(parent_io); 3372 break; 3373 case SPDK_BDEV_IO_TYPE_UNMAP: 3374 bdev_unmap_split(parent_io); 3375 break; 3376 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3377 bdev_write_zeroes_split(parent_io); 3378 break; 3379 case SPDK_BDEV_IO_TYPE_COPY: 3380 bdev_copy_split(parent_io); 3381 break; 3382 default: 3383 assert(false); 3384 break; 3385 } 3386 } 3387 3388 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3389 bool success); 3390 3391 static void 3392 bdev_io_split(struct spdk_bdev_io *bdev_io) 3393 { 3394 assert(bdev_io_should_split(bdev_io)); 3395 3396 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3397 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3398 bdev_io->u.bdev.split_outstanding = 0; 3399 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3400 3401 switch (bdev_io->type) { 3402 case SPDK_BDEV_IO_TYPE_READ: 3403 case SPDK_BDEV_IO_TYPE_WRITE: 3404 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3405 _bdev_rw_split(bdev_io); 3406 } else { 3407 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3408 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3409 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3410 } 3411 break; 3412 case SPDK_BDEV_IO_TYPE_UNMAP: 3413 bdev_unmap_split(bdev_io); 3414 break; 3415 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3416 bdev_write_zeroes_split(bdev_io); 3417 break; 3418 case SPDK_BDEV_IO_TYPE_COPY: 3419 bdev_copy_split(bdev_io); 3420 break; 3421 default: 3422 assert(false); 3423 break; 3424 } 3425 } 3426 3427 static void 3428 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3429 { 3430 if (!success) { 3431 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3432 return; 3433 } 3434 3435 _bdev_rw_split(bdev_io); 3436 } 3437 3438 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 3439 * be inlined, at least on some compilers. 
3440 */ 3441 static inline void 3442 _bdev_io_submit(void *ctx) 3443 { 3444 struct spdk_bdev_io *bdev_io = ctx; 3445 struct spdk_bdev *bdev = bdev_io->bdev; 3446 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3447 3448 if (spdk_likely(bdev_ch->flags == 0)) { 3449 bdev_io_do_submit(bdev_ch, bdev_io); 3450 return; 3451 } 3452 3453 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3454 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3455 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3456 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3457 bdev_abort_queued_io(&bdev_ch->qos_queued_io, bdev_io->u.abort.bio_to_abort)) { 3458 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3459 } else { 3460 TAILQ_INSERT_TAIL(&bdev_ch->qos_queued_io, bdev_io, internal.link); 3461 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3462 } 3463 } else { 3464 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3465 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3466 } 3467 } 3468 3469 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3470 3471 bool 3472 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3473 { 3474 if (range1->length == 0 || range2->length == 0) { 3475 return false; 3476 } 3477 3478 if (range1->offset + range1->length <= range2->offset) { 3479 return false; 3480 } 3481 3482 if (range2->offset + range2->length <= range1->offset) { 3483 return false; 3484 } 3485 3486 return true; 3487 } 3488 3489 static bool 3490 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3491 { 3492 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3493 struct lba_range r; 3494 3495 switch (bdev_io->type) { 3496 case SPDK_BDEV_IO_TYPE_NVME_IO: 3497 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3498 /* Don't try to decode the NVMe command - just assume worst-case and that 3499 * it overlaps a locked range. 3500 */ 3501 return true; 3502 case SPDK_BDEV_IO_TYPE_READ: 3503 if (!range->quiesce) { 3504 return false; 3505 } 3506 /* fallthrough */ 3507 case SPDK_BDEV_IO_TYPE_WRITE: 3508 case SPDK_BDEV_IO_TYPE_UNMAP: 3509 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3510 case SPDK_BDEV_IO_TYPE_ZCOPY: 3511 case SPDK_BDEV_IO_TYPE_COPY: 3512 r.offset = bdev_io->u.bdev.offset_blocks; 3513 r.length = bdev_io->u.bdev.num_blocks; 3514 if (!bdev_lba_range_overlapped(range, &r)) { 3515 /* This I/O doesn't overlap the specified LBA range. */ 3516 return false; 3517 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3518 /* This I/O overlaps, but the I/O is on the same channel that locked this 3519 * range, and the caller_ctx is the same as the locked_ctx. This means 3520 * that this I/O is associated with the lock, and is allowed to execute. 
3521 */ 3522 return false; 3523 } else { 3524 return true; 3525 } 3526 default: 3527 return false; 3528 } 3529 } 3530 3531 void 3532 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3533 { 3534 struct spdk_bdev *bdev = bdev_io->bdev; 3535 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3536 3537 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3538 3539 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3540 struct lba_range *range; 3541 3542 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3543 if (bdev_io_range_is_locked(bdev_io, range)) { 3544 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3545 return; 3546 } 3547 } 3548 } 3549 3550 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 3551 3552 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3553 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 3554 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3555 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3556 spdk_bdev_get_name(bdev)); 3557 3558 if (bdev_io->internal.split) { 3559 bdev_io_split(bdev_io); 3560 return; 3561 } 3562 3563 _bdev_io_submit(bdev_io); 3564 } 3565 3566 static inline void 3567 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3568 { 3569 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 3570 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3571 * For write operation we need to pull buffers from memory domain before submitting IO. 3572 * Once read operation completes, we need to use memory_domain push functionality to 3573 * update data in original memory domain IO buffer 3574 * This IO request will go through a regular IO flow, so clear memory domains pointers */ 3575 bdev_io->u.bdev.memory_domain = NULL; 3576 bdev_io->u.bdev.memory_domain_ctx = NULL; 3577 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3578 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3579 } 3580 3581 static inline void 3582 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3583 { 3584 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3585 bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io); 3586 3587 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3588 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3589 bdev_io_complete_unsubmitted(bdev_io); 3590 return; 3591 } 3592 3593 /* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does 3594 * support them, but we need to execute an accel sequence and the data buffer is from accel 3595 * memory domain (to avoid doing a push/pull from that domain). 
3596 */ 3597 if ((bdev_io->internal.memory_domain && !desc->memory_domains_supported) || 3598 (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3599 _bdev_io_ext_use_bounce_buffer(bdev_io); 3600 return; 3601 } 3602 3603 if (needs_exec) { 3604 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3605 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3606 return; 3607 } 3608 /* For reads we'll execute the sequence after the data is read, so, for now, only 3609 * clear out accel_sequence pointer and submit the IO */ 3610 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3611 bdev_io->u.bdev.accel_sequence = NULL; 3612 } 3613 3614 bdev_io_submit(bdev_io); 3615 } 3616 3617 static void 3618 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3619 { 3620 struct spdk_bdev *bdev = bdev_io->bdev; 3621 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3622 struct spdk_io_channel *ch = bdev_ch->channel; 3623 3624 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3625 3626 bdev_io->internal.in_submit_request = true; 3627 bdev_submit_request(bdev, ch, bdev_io); 3628 bdev_io->internal.in_submit_request = false; 3629 } 3630 3631 void 3632 bdev_io_init(struct spdk_bdev_io *bdev_io, 3633 struct spdk_bdev *bdev, void *cb_arg, 3634 spdk_bdev_io_completion_cb cb) 3635 { 3636 bdev_io->bdev = bdev; 3637 bdev_io->internal.caller_ctx = cb_arg; 3638 bdev_io->internal.cb = cb; 3639 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3640 bdev_io->internal.in_submit_request = false; 3641 bdev_io->internal.buf = NULL; 3642 bdev_io->internal.orig_iovs = NULL; 3643 bdev_io->internal.orig_iovcnt = 0; 3644 bdev_io->internal.orig_md_iov.iov_base = NULL; 3645 bdev_io->internal.error.nvme.cdw0 = 0; 3646 bdev_io->num_retries = 0; 3647 bdev_io->internal.get_buf_cb = NULL; 3648 bdev_io->internal.get_aux_buf_cb = NULL; 3649 bdev_io->internal.memory_domain = NULL; 3650 bdev_io->internal.memory_domain_ctx = NULL; 3651 bdev_io->internal.data_transfer_cpl = NULL; 3652 bdev_io->internal.split = bdev_io_should_split(bdev_io); 3653 bdev_io->internal.accel_sequence = NULL; 3654 bdev_io->internal.has_accel_sequence = false; 3655 } 3656 3657 static bool 3658 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3659 { 3660 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3661 } 3662 3663 bool 3664 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3665 { 3666 bool supported; 3667 3668 supported = bdev_io_type_supported(bdev, io_type); 3669 3670 if (!supported) { 3671 switch (io_type) { 3672 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3673 /* The bdev layer will emulate write zeroes as long as write is supported. 
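			 * Write zeroes is emulated by writing out the zero buffer allocated during
			 * bdev layer initialization.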
 */
3674 			supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE);
3675 			break;
3676 		default:
3677 			break;
3678 		}
3679 	}
3680 
3681 	return supported;
3682 }
3683 
3684 uint64_t
3685 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io)
3686 {
3687 	return bdev_io->internal.submit_tsc;
3688 }
3689 
3690 int
3691 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
3692 {
3693 	if (bdev->fn_table->dump_info_json) {
3694 		return bdev->fn_table->dump_info_json(bdev->ctxt, w);
3695 	}
3696 
3697 	return 0;
3698 }
3699 
3700 static void
3701 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos)
3702 {
3703 	uint32_t max_per_timeslice = 0;
3704 	int i;
3705 
3706 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3707 		if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
3708 			qos->rate_limits[i].max_per_timeslice = 0;
3709 			continue;
3710 		}
3711 
3712 		max_per_timeslice = qos->rate_limits[i].limit *
3713 				    SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC;
3714 
3715 		qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice,
3716 							qos->rate_limits[i].min_per_timeslice);
3717 
3718 		__atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice,
3719 				 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELEASE);
3720 	}
3721 
3722 	bdev_qos_set_ops(qos);
3723 }
3724 
3725 static void
3726 bdev_channel_submit_qos_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
3727 			   struct spdk_io_channel *io_ch, void *ctx)
3728 {
3729 	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
3730 	int status;
3731 
3732 	bdev_qos_io_submit(bdev_ch, bdev->internal.qos);
3733 
3734 	/* If all I/Os were sent, continue the iteration; otherwise, stop it. */
3735 	/* TODO: round robin across the channels */
3736 	status = TAILQ_EMPTY(&bdev_ch->qos_queued_io) ? 0 : 1;
3737 
3738 	spdk_bdev_for_each_channel_continue(i, status);
3739 }
3740 
3741 
3742 static void
3743 bdev_channel_submit_qos_io_done(struct spdk_bdev *bdev, void *ctx, int status)
3744 {
3745 
3746 }
3747 
3748 static int
3749 bdev_channel_poll_qos(void *arg)
3750 {
3751 	struct spdk_bdev *bdev = arg;
3752 	struct spdk_bdev_qos *qos = bdev->internal.qos;
3753 	uint64_t now = spdk_get_ticks();
3754 	int i;
3755 	int64_t remaining_last_timeslice;
3756 
3757 	if (now < (qos->last_timeslice + qos->timeslice_size)) {
3758 		/* We received our callback earlier than expected - return
3759 		 * immediately and wait to do accounting until at least one
3760 		 * timeslice has actually expired. This should never happen
3761 		 * with a well-behaved timer implementation.
3762 		 */
3763 		return SPDK_POLLER_IDLE;
3764 	}
3765 
3766 	/* Reset for next round of rate limiting */
3767 	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3768 		/* We may have allowed the IOs or bytes to slightly overrun in the last
3769 		 * timeslice. remaining_this_timeslice is signed, so if it's negative
3770 		 * here, we'll account for the overrun so that the next timeslice will
3771 		 * be appropriately reduced.
3772 		 */
3773 		remaining_last_timeslice = __atomic_exchange_n(&qos->rate_limits[i].remaining_this_timeslice,
3774 					   0, __ATOMIC_RELAXED);
3775 		if (remaining_last_timeslice < 0) {
3776 			/* There could be a race condition here as both bdev_qos_rw_queue_io() and bdev_channel_poll_qos()
3777 			 * potentially use 2 atomic ops each, so they can intertwine.
3778 			 * This race can potentially cause the limits to be a little fuzzy but won't cause any real damage.
3779 */ 3780 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 3781 remaining_last_timeslice, __ATOMIC_RELAXED); 3782 } 3783 } 3784 3785 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3786 qos->last_timeslice += qos->timeslice_size; 3787 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3788 __atomic_add_fetch(&qos->rate_limits[i].remaining_this_timeslice, 3789 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELAXED); 3790 } 3791 } 3792 3793 spdk_bdev_for_each_channel(bdev, bdev_channel_submit_qos_io, qos, 3794 bdev_channel_submit_qos_io_done); 3795 3796 return SPDK_POLLER_BUSY; 3797 } 3798 3799 static void 3800 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3801 { 3802 struct spdk_bdev_shared_resource *shared_resource; 3803 struct lba_range *range; 3804 3805 bdev_free_io_stat(ch->stat); 3806 #ifdef SPDK_CONFIG_VTUNE 3807 bdev_free_io_stat(ch->prev_stat); 3808 #endif 3809 3810 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3811 range = TAILQ_FIRST(&ch->locked_ranges); 3812 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3813 free(range); 3814 } 3815 3816 spdk_put_io_channel(ch->channel); 3817 spdk_put_io_channel(ch->accel_channel); 3818 3819 shared_resource = ch->shared_resource; 3820 3821 assert(TAILQ_EMPTY(&ch->io_locked)); 3822 assert(TAILQ_EMPTY(&ch->io_submitted)); 3823 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 3824 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 3825 assert(ch->io_outstanding == 0); 3826 assert(shared_resource->ref > 0); 3827 shared_resource->ref--; 3828 if (shared_resource->ref == 0) { 3829 assert(shared_resource->io_outstanding == 0); 3830 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3831 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3832 spdk_poller_unregister(&shared_resource->nomem_poller); 3833 free(shared_resource); 3834 } 3835 } 3836 3837 static void 3838 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3839 { 3840 struct spdk_bdev_qos *qos = bdev->internal.qos; 3841 int i; 3842 3843 assert(spdk_spin_held(&bdev->internal.spinlock)); 3844 3845 /* Rate limiting on this bdev enabled */ 3846 if (qos) { 3847 if (qos->ch == NULL) { 3848 struct spdk_io_channel *io_ch; 3849 3850 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3851 bdev->name, spdk_get_thread()); 3852 3853 /* No qos channel has been selected, so set one up */ 3854 3855 /* Take another reference to ch */ 3856 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3857 assert(io_ch != NULL); 3858 qos->ch = ch; 3859 3860 qos->thread = spdk_io_channel_get_thread(io_ch); 3861 3862 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3863 if (bdev_qos_is_iops_rate_limit(i) == true) { 3864 qos->rate_limits[i].min_per_timeslice = 3865 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3866 } else { 3867 qos->rate_limits[i].min_per_timeslice = 3868 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3869 } 3870 3871 if (qos->rate_limits[i].limit == 0) { 3872 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3873 } 3874 } 3875 bdev_qos_update_max_quota_per_timeslice(qos); 3876 qos->timeslice_size = 3877 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3878 qos->last_timeslice = spdk_get_ticks(); 3879 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3880 bdev, 3881 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3882 } 3883 3884 ch->flags |= BDEV_CH_QOS_ENABLED; 3885 } 3886 } 3887 3888 struct poll_timeout_ctx { 3889 struct spdk_bdev_desc 
*desc; 3890 uint64_t timeout_in_sec; 3891 spdk_bdev_io_timeout_cb cb_fn; 3892 void *cb_arg; 3893 }; 3894 3895 static void 3896 bdev_desc_free(struct spdk_bdev_desc *desc) 3897 { 3898 spdk_spin_destroy(&desc->spinlock); 3899 free(desc->media_events_buffer); 3900 free(desc); 3901 } 3902 3903 static void 3904 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 3905 { 3906 struct poll_timeout_ctx *ctx = _ctx; 3907 struct spdk_bdev_desc *desc = ctx->desc; 3908 3909 free(ctx); 3910 3911 spdk_spin_lock(&desc->spinlock); 3912 desc->refs--; 3913 if (desc->closed == true && desc->refs == 0) { 3914 spdk_spin_unlock(&desc->spinlock); 3915 bdev_desc_free(desc); 3916 return; 3917 } 3918 spdk_spin_unlock(&desc->spinlock); 3919 } 3920 3921 static void 3922 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3923 struct spdk_io_channel *io_ch, void *_ctx) 3924 { 3925 struct poll_timeout_ctx *ctx = _ctx; 3926 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3927 struct spdk_bdev_desc *desc = ctx->desc; 3928 struct spdk_bdev_io *bdev_io; 3929 uint64_t now; 3930 3931 spdk_spin_lock(&desc->spinlock); 3932 if (desc->closed == true) { 3933 spdk_spin_unlock(&desc->spinlock); 3934 spdk_bdev_for_each_channel_continue(i, -1); 3935 return; 3936 } 3937 spdk_spin_unlock(&desc->spinlock); 3938 3939 now = spdk_get_ticks(); 3940 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3941 /* Exclude any I/O that are generated via splitting. */ 3942 if (bdev_io->internal.cb == bdev_io_split_done) { 3943 continue; 3944 } 3945 3946 /* Once we find an I/O that has not timed out, we can immediately 3947 * exit the loop. 3948 */ 3949 if (now < (bdev_io->internal.submit_tsc + 3950 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3951 goto end; 3952 } 3953 3954 if (bdev_io->internal.desc == desc) { 3955 ctx->cb_fn(ctx->cb_arg, bdev_io); 3956 } 3957 } 3958 3959 end: 3960 spdk_bdev_for_each_channel_continue(i, 0); 3961 } 3962 3963 static int 3964 bdev_poll_timeout_io(void *arg) 3965 { 3966 struct spdk_bdev_desc *desc = arg; 3967 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3968 struct poll_timeout_ctx *ctx; 3969 3970 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3971 if (!ctx) { 3972 SPDK_ERRLOG("failed to allocate memory\n"); 3973 return SPDK_POLLER_BUSY; 3974 } 3975 ctx->desc = desc; 3976 ctx->cb_arg = desc->cb_arg; 3977 ctx->cb_fn = desc->cb_fn; 3978 ctx->timeout_in_sec = desc->timeout_in_sec; 3979 3980 /* Take a ref on the descriptor in case it gets closed while we are checking 3981 * all of the channels. 
3982 */ 3983 spdk_spin_lock(&desc->spinlock); 3984 desc->refs++; 3985 spdk_spin_unlock(&desc->spinlock); 3986 3987 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 3988 bdev_channel_poll_timeout_io_done); 3989 3990 return SPDK_POLLER_BUSY; 3991 } 3992 3993 int 3994 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3995 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3996 { 3997 assert(desc->thread == spdk_get_thread()); 3998 3999 spdk_poller_unregister(&desc->io_timeout_poller); 4000 4001 if (timeout_in_sec) { 4002 assert(cb_fn != NULL); 4003 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 4004 desc, 4005 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 4006 1000); 4007 if (desc->io_timeout_poller == NULL) { 4008 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 4009 return -1; 4010 } 4011 } 4012 4013 desc->cb_fn = cb_fn; 4014 desc->cb_arg = cb_arg; 4015 desc->timeout_in_sec = timeout_in_sec; 4016 4017 return 0; 4018 } 4019 4020 static int 4021 bdev_channel_create(void *io_device, void *ctx_buf) 4022 { 4023 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 4024 struct spdk_bdev_channel *ch = ctx_buf; 4025 struct spdk_io_channel *mgmt_io_ch; 4026 struct spdk_bdev_mgmt_channel *mgmt_ch; 4027 struct spdk_bdev_shared_resource *shared_resource; 4028 struct lba_range *range; 4029 4030 ch->bdev = bdev; 4031 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 4032 if (!ch->channel) { 4033 return -1; 4034 } 4035 4036 ch->accel_channel = spdk_accel_get_io_channel(); 4037 if (!ch->accel_channel) { 4038 spdk_put_io_channel(ch->channel); 4039 return -1; 4040 } 4041 4042 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 4043 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4044 4045 assert(ch->histogram == NULL); 4046 if (bdev->internal.histogram_enabled) { 4047 ch->histogram = spdk_histogram_data_alloc(); 4048 if (ch->histogram == NULL) { 4049 SPDK_ERRLOG("Could not allocate histogram\n"); 4050 } 4051 } 4052 4053 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 4054 if (!mgmt_io_ch) { 4055 spdk_put_io_channel(ch->channel); 4056 spdk_put_io_channel(ch->accel_channel); 4057 return -1; 4058 } 4059 4060 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 4061 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 4062 if (shared_resource->shared_ch == ch->channel) { 4063 spdk_put_io_channel(mgmt_io_ch); 4064 shared_resource->ref++; 4065 break; 4066 } 4067 } 4068 4069 if (shared_resource == NULL) { 4070 shared_resource = calloc(1, sizeof(*shared_resource)); 4071 if (shared_resource == NULL) { 4072 spdk_put_io_channel(ch->channel); 4073 spdk_put_io_channel(ch->accel_channel); 4074 spdk_put_io_channel(mgmt_io_ch); 4075 return -1; 4076 } 4077 4078 shared_resource->mgmt_ch = mgmt_ch; 4079 shared_resource->io_outstanding = 0; 4080 TAILQ_INIT(&shared_resource->nomem_io); 4081 shared_resource->nomem_threshold = 0; 4082 shared_resource->shared_ch = ch->channel; 4083 shared_resource->ref = 1; 4084 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 4085 } 4086 4087 ch->io_outstanding = 0; 4088 TAILQ_INIT(&ch->queued_resets); 4089 TAILQ_INIT(&ch->locked_ranges); 4090 TAILQ_INIT(&ch->qos_queued_io); 4091 ch->flags = 0; 4092 ch->shared_resource = shared_resource; 4093 4094 TAILQ_INIT(&ch->io_submitted); 4095 TAILQ_INIT(&ch->io_locked); 4096 TAILQ_INIT(&ch->io_accel_exec); 4097 TAILQ_INIT(&ch->io_memory_domain); 4098 4099 ch->stat = bdev_alloc_io_stat(false); 4100 if 
(ch->stat == NULL) { 4101 bdev_channel_destroy_resource(ch); 4102 return -1; 4103 } 4104 4105 ch->stat->ticks_rate = spdk_get_ticks_hz(); 4106 4107 #ifdef SPDK_CONFIG_VTUNE 4108 { 4109 char *name; 4110 __itt_init_ittlib(NULL, 0); 4111 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 4112 if (!name) { 4113 bdev_channel_destroy_resource(ch); 4114 return -1; 4115 } 4116 ch->handle = __itt_string_handle_create(name); 4117 free(name); 4118 ch->start_tsc = spdk_get_ticks(); 4119 ch->interval_tsc = spdk_get_ticks_hz() / 100; 4120 ch->prev_stat = bdev_alloc_io_stat(false); 4121 if (ch->prev_stat == NULL) { 4122 bdev_channel_destroy_resource(ch); 4123 return -1; 4124 } 4125 } 4126 #endif 4127 4128 spdk_spin_lock(&bdev->internal.spinlock); 4129 bdev_enable_qos(bdev, ch); 4130 4131 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 4132 struct lba_range *new_range; 4133 4134 new_range = calloc(1, sizeof(*new_range)); 4135 if (new_range == NULL) { 4136 spdk_spin_unlock(&bdev->internal.spinlock); 4137 bdev_channel_destroy_resource(ch); 4138 return -1; 4139 } 4140 new_range->length = range->length; 4141 new_range->offset = range->offset; 4142 new_range->locked_ctx = range->locked_ctx; 4143 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 4144 } 4145 4146 spdk_spin_unlock(&bdev->internal.spinlock); 4147 4148 return 0; 4149 } 4150 4151 static int 4152 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 4153 void *cb_ctx) 4154 { 4155 struct spdk_bdev_channel *bdev_ch = cb_ctx; 4156 struct spdk_bdev_io *bdev_io; 4157 uint64_t buf_len; 4158 4159 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4160 if (bdev_io->internal.ch == bdev_ch) { 4161 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4162 spdk_iobuf_entry_abort(ch, entry, buf_len); 4163 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4164 } 4165 4166 return 0; 4167 } 4168 4169 /* 4170 * Abort I/O that are waiting on a data buffer. 4171 */ 4172 static void 4173 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4174 { 4175 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4176 bdev_abort_all_buf_io_cb, ch); 4177 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4178 bdev_abort_all_buf_io_cb, ch); 4179 } 4180 4181 /* 4182 * Abort I/O that are queued waiting for submission. These types of I/O are 4183 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4184 */ 4185 static void 4186 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4187 { 4188 struct spdk_bdev_io *bdev_io, *tmp; 4189 4190 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4191 if (bdev_io->internal.ch == ch) { 4192 TAILQ_REMOVE(queue, bdev_io, internal.link); 4193 /* 4194 * spdk_bdev_io_complete() assumes that the completed I/O had 4195 * been submitted to the bdev module. Since in this case it 4196 * hadn't, bump io_outstanding to account for the decrement 4197 * that spdk_bdev_io_complete() will do. 
4198 */ 4199 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4200 bdev_io_increment_outstanding(ch, ch->shared_resource); 4201 } 4202 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4203 } 4204 } 4205 } 4206 4207 static bool 4208 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4209 { 4210 struct spdk_bdev_io *bdev_io; 4211 4212 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4213 if (bdev_io == bio_to_abort) { 4214 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4215 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4216 return true; 4217 } 4218 } 4219 4220 return false; 4221 } 4222 4223 static int 4224 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4225 { 4226 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4227 uint64_t buf_len; 4228 4229 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4230 if (bdev_io == bio_to_abort) { 4231 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4232 spdk_iobuf_entry_abort(ch, entry, buf_len); 4233 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4234 return 1; 4235 } 4236 4237 return 0; 4238 } 4239 4240 static bool 4241 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4242 { 4243 int rc; 4244 4245 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4246 bdev_abort_buf_io_cb, bio_to_abort); 4247 if (rc == 1) { 4248 return true; 4249 } 4250 4251 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4252 bdev_abort_buf_io_cb, bio_to_abort); 4253 return rc == 1; 4254 } 4255 4256 static void 4257 bdev_qos_channel_destroy(void *cb_arg) 4258 { 4259 struct spdk_bdev_qos *qos = cb_arg; 4260 4261 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4262 spdk_poller_unregister(&qos->poller); 4263 4264 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4265 4266 free(qos); 4267 } 4268 4269 static int 4270 bdev_qos_destroy(struct spdk_bdev *bdev) 4271 { 4272 int i; 4273 4274 /* 4275 * Cleanly shutting down the QoS poller is tricky, because 4276 * during the asynchronous operation the user could open 4277 * a new descriptor and create a new channel, spawning 4278 * a new QoS poller. 4279 * 4280 * The strategy is to create a new QoS structure here and swap it 4281 * in. The shutdown path then continues to refer to the old one 4282 * until it completes and then releases it. 4283 */ 4284 struct spdk_bdev_qos *new_qos, *old_qos; 4285 4286 old_qos = bdev->internal.qos; 4287 4288 new_qos = calloc(1, sizeof(*new_qos)); 4289 if (!new_qos) { 4290 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4291 return -ENOMEM; 4292 } 4293 4294 /* Copy the old QoS data into the newly allocated structure */ 4295 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4296 4297 /* Zero out the key parts of the QoS structure */ 4298 new_qos->ch = NULL; 4299 new_qos->thread = NULL; 4300 new_qos->poller = NULL; 4301 /* 4302 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4303 * It will be used later for the new QoS structure. 
4304 */ 4305 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4306 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4307 new_qos->rate_limits[i].min_per_timeslice = 0; 4308 new_qos->rate_limits[i].max_per_timeslice = 0; 4309 } 4310 4311 bdev->internal.qos = new_qos; 4312 4313 if (old_qos->thread == NULL) { 4314 free(old_qos); 4315 } else { 4316 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4317 } 4318 4319 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4320 * been destroyed yet. The destruction path will end up waiting for the final 4321 * channel to be put before it releases resources. */ 4322 4323 return 0; 4324 } 4325 4326 void 4327 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4328 { 4329 total->bytes_read += add->bytes_read; 4330 total->num_read_ops += add->num_read_ops; 4331 total->bytes_written += add->bytes_written; 4332 total->num_write_ops += add->num_write_ops; 4333 total->bytes_unmapped += add->bytes_unmapped; 4334 total->num_unmap_ops += add->num_unmap_ops; 4335 total->bytes_copied += add->bytes_copied; 4336 total->num_copy_ops += add->num_copy_ops; 4337 total->read_latency_ticks += add->read_latency_ticks; 4338 total->write_latency_ticks += add->write_latency_ticks; 4339 total->unmap_latency_ticks += add->unmap_latency_ticks; 4340 total->copy_latency_ticks += add->copy_latency_ticks; 4341 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4342 total->max_read_latency_ticks = add->max_read_latency_ticks; 4343 } 4344 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4345 total->min_read_latency_ticks = add->min_read_latency_ticks; 4346 } 4347 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4348 total->max_write_latency_ticks = add->max_write_latency_ticks; 4349 } 4350 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4351 total->min_write_latency_ticks = add->min_write_latency_ticks; 4352 } 4353 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4354 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4355 } 4356 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4357 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4358 } 4359 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4360 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4361 } 4362 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4363 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4364 } 4365 } 4366 4367 static void 4368 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4369 { 4370 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4371 4372 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4373 memcpy(to_stat->io_error, from_stat->io_error, 4374 sizeof(struct spdk_bdev_io_error_stat)); 4375 } 4376 } 4377 4378 void 4379 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4380 { 4381 stat->max_read_latency_ticks = 0; 4382 stat->min_read_latency_ticks = UINT64_MAX; 4383 stat->max_write_latency_ticks = 0; 4384 stat->min_write_latency_ticks = UINT64_MAX; 4385 stat->max_unmap_latency_ticks = 0; 4386 stat->min_unmap_latency_ticks = UINT64_MAX; 4387 stat->max_copy_latency_ticks = 0; 4388 stat->min_copy_latency_ticks = UINT64_MAX; 4389 4390 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4391 return; 4392 } 
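	/* SPDK_BDEV_RESET_STAT_ALL additionally clears the cumulative byte/operation counters,
	 * the latency totals, and the per-status error counts below. */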
4393 4394 stat->bytes_read = 0; 4395 stat->num_read_ops = 0; 4396 stat->bytes_written = 0; 4397 stat->num_write_ops = 0; 4398 stat->bytes_unmapped = 0; 4399 stat->num_unmap_ops = 0; 4400 stat->bytes_copied = 0; 4401 stat->num_copy_ops = 0; 4402 stat->read_latency_ticks = 0; 4403 stat->write_latency_ticks = 0; 4404 stat->unmap_latency_ticks = 0; 4405 stat->copy_latency_ticks = 0; 4406 4407 if (stat->io_error != NULL) { 4408 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 4409 } 4410 } 4411 4412 struct spdk_bdev_io_stat * 4413 bdev_alloc_io_stat(bool io_error_stat) 4414 { 4415 struct spdk_bdev_io_stat *stat; 4416 4417 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4418 if (stat == NULL) { 4419 return NULL; 4420 } 4421 4422 if (io_error_stat) { 4423 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4424 if (stat->io_error == NULL) { 4425 free(stat); 4426 return NULL; 4427 } 4428 } else { 4429 stat->io_error = NULL; 4430 } 4431 4432 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4433 4434 return stat; 4435 } 4436 4437 void 4438 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4439 { 4440 if (stat != NULL) { 4441 free(stat->io_error); 4442 free(stat); 4443 } 4444 } 4445 4446 void 4447 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4448 { 4449 int i; 4450 4451 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4452 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4453 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4454 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4455 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4456 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4457 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4458 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4459 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4460 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4461 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4462 stat->min_read_latency_ticks != UINT64_MAX ? 4463 stat->min_read_latency_ticks : 0); 4464 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4465 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4466 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4467 stat->min_write_latency_ticks != UINT64_MAX ? 4468 stat->min_write_latency_ticks : 0); 4469 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4470 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4471 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4472 stat->min_unmap_latency_ticks != UINT64_MAX ? 4473 stat->min_unmap_latency_ticks : 0); 4474 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4475 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4476 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4477 stat->min_copy_latency_ticks != UINT64_MAX ? 
4478 stat->min_copy_latency_ticks : 0); 4479 4480 if (stat->io_error != NULL) { 4481 spdk_json_write_named_object_begin(w, "io_error"); 4482 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4483 if (stat->io_error->error_status[i] != 0) { 4484 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4485 stat->io_error->error_status[i]); 4486 } 4487 } 4488 spdk_json_write_object_end(w); 4489 } 4490 } 4491 4492 static void 4493 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4494 { 4495 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4496 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4497 4498 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4499 bdev_abort_all_buf_io(mgmt_ch, ch); 4500 } 4501 4502 static void 4503 bdev_channel_destroy(void *io_device, void *ctx_buf) 4504 { 4505 struct spdk_bdev_channel *ch = ctx_buf; 4506 4507 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4508 spdk_get_thread()); 4509 4510 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 4511 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4512 4513 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4514 spdk_spin_lock(&ch->bdev->internal.spinlock); 4515 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4516 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4517 4518 bdev_abort_all_queued_io(&ch->queued_resets, ch); 4519 4520 bdev_channel_abort_queued_ios(ch); 4521 4522 if (ch->histogram) { 4523 spdk_histogram_data_free(ch->histogram); 4524 } 4525 4526 bdev_channel_destroy_resource(ch); 4527 } 4528 4529 /* 4530 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4531 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
4532 */ 4533 static int 4534 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4535 { 4536 struct spdk_bdev_name *tmp; 4537 4538 bdev_name->name = strdup(name); 4539 if (bdev_name->name == NULL) { 4540 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4541 return -ENOMEM; 4542 } 4543 4544 bdev_name->bdev = bdev; 4545 4546 spdk_spin_lock(&g_bdev_mgr.spinlock); 4547 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4548 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4549 4550 if (tmp != NULL) { 4551 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4552 free(bdev_name->name); 4553 return -EEXIST; 4554 } 4555 4556 return 0; 4557 } 4558 4559 static void 4560 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4561 { 4562 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4563 free(bdev_name->name); 4564 } 4565 4566 static void 4567 bdev_name_del(struct spdk_bdev_name *bdev_name) 4568 { 4569 spdk_spin_lock(&g_bdev_mgr.spinlock); 4570 bdev_name_del_unsafe(bdev_name); 4571 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4572 } 4573 4574 int 4575 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4576 { 4577 struct spdk_bdev_alias *tmp; 4578 int ret; 4579 4580 if (alias == NULL) { 4581 SPDK_ERRLOG("Empty alias passed\n"); 4582 return -EINVAL; 4583 } 4584 4585 tmp = calloc(1, sizeof(*tmp)); 4586 if (tmp == NULL) { 4587 SPDK_ERRLOG("Unable to allocate alias\n"); 4588 return -ENOMEM; 4589 } 4590 4591 ret = bdev_name_add(&tmp->alias, bdev, alias); 4592 if (ret != 0) { 4593 free(tmp); 4594 return ret; 4595 } 4596 4597 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4598 4599 return 0; 4600 } 4601 4602 static int 4603 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4604 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4605 { 4606 struct spdk_bdev_alias *tmp; 4607 4608 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4609 if (strcmp(alias, tmp->alias.name) == 0) { 4610 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4611 alias_del_fn(&tmp->alias); 4612 free(tmp); 4613 return 0; 4614 } 4615 } 4616 4617 return -ENOENT; 4618 } 4619 4620 int 4621 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4622 { 4623 int rc; 4624 4625 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4626 if (rc == -ENOENT) { 4627 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4628 } 4629 4630 return rc; 4631 } 4632 4633 void 4634 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4635 { 4636 struct spdk_bdev_alias *p, *tmp; 4637 4638 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4639 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4640 bdev_name_del(&p->alias); 4641 free(p); 4642 } 4643 } 4644 4645 struct spdk_io_channel * 4646 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4647 { 4648 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4649 } 4650 4651 void * 4652 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4653 { 4654 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4655 void *ctx = NULL; 4656 4657 if (bdev->fn_table->get_module_ctx) { 4658 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4659 } 4660 4661 return ctx; 4662 } 4663 4664 const char * 4665 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4666 { 4667 return bdev->module->name; 4668 } 4669 4670 const char * 4671 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4672 { 4673 return bdev->name; 4674 } 4675 4676 const char * 4677 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4678 { 4679 return bdev->product_name; 4680 } 4681 4682 
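/*
 * Illustrative sketch (editorial note, kept inside a comment so it is not compiled): how a
 * consumer typically combines the accessors below with an open descriptor and a per-thread
 * I/O channel. The example_* names are hypothetical, NULL checks are trimmed, buf is assumed
 * to be at least one block in size and aligned per spdk_bdev_get_buf_align(), and the channel
 * reference must be kept until the completion callback has run.
 *
 *	static void
 *	example_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		struct spdk_io_channel *ch = cb_arg;
 *
 *		spdk_bdev_free_io(bdev_io);
 *		spdk_put_io_channel(ch);
 *	}
 *
 *	static int
 *	example_read_first_block(struct spdk_bdev_desc *desc, void *buf)
 *	{
 *		struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
 *		struct spdk_io_channel *ch = spdk_bdev_get_io_channel(desc);
 *		int rc;
 *
 *		SPDK_NOTICELOG("reading one %u-byte block from %s\n",
 *			       spdk_bdev_get_block_size(bdev), spdk_bdev_get_name(bdev));
 *
 *		rc = spdk_bdev_read_blocks(desc, ch, buf, 0, 1, example_read_done, ch);
 *		if (rc != 0) {
 *			spdk_put_io_channel(ch);
 *		}
 *		return rc;
 *	}
 */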
const struct spdk_bdev_aliases_list * 4683 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4684 { 4685 return &bdev->aliases; 4686 } 4687 4688 uint32_t 4689 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4690 { 4691 return bdev->blocklen; 4692 } 4693 4694 uint32_t 4695 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4696 { 4697 return bdev->write_unit_size; 4698 } 4699 4700 uint64_t 4701 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4702 { 4703 return bdev->blockcnt; 4704 } 4705 4706 const char * 4707 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4708 { 4709 return qos_rpc_type[type]; 4710 } 4711 4712 void 4713 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4714 { 4715 int i; 4716 4717 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4718 4719 spdk_spin_lock(&bdev->internal.spinlock); 4720 if (bdev->internal.qos) { 4721 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4722 if (bdev->internal.qos->rate_limits[i].limit != 4723 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4724 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4725 if (bdev_qos_is_iops_rate_limit(i) == false) { 4726 /* Change from Byte to Megabyte which is user visible. */ 4727 limits[i] = limits[i] / 1024 / 1024; 4728 } 4729 } 4730 } 4731 } 4732 spdk_spin_unlock(&bdev->internal.spinlock); 4733 } 4734 4735 size_t 4736 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4737 { 4738 return 1 << bdev->required_alignment; 4739 } 4740 4741 uint32_t 4742 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4743 { 4744 return bdev->optimal_io_boundary; 4745 } 4746 4747 bool 4748 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4749 { 4750 return bdev->write_cache; 4751 } 4752 4753 const struct spdk_uuid * 4754 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4755 { 4756 return &bdev->uuid; 4757 } 4758 4759 uint16_t 4760 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4761 { 4762 return bdev->acwu; 4763 } 4764 4765 uint32_t 4766 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4767 { 4768 return bdev->md_len; 4769 } 4770 4771 bool 4772 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4773 { 4774 return (bdev->md_len != 0) && bdev->md_interleave; 4775 } 4776 4777 bool 4778 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4779 { 4780 return (bdev->md_len != 0) && !bdev->md_interleave; 4781 } 4782 4783 bool 4784 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4785 { 4786 return bdev->zoned; 4787 } 4788 4789 uint32_t 4790 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4791 { 4792 if (spdk_bdev_is_md_interleaved(bdev)) { 4793 return bdev->blocklen - bdev->md_len; 4794 } else { 4795 return bdev->blocklen; 4796 } 4797 } 4798 4799 uint32_t 4800 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4801 { 4802 return bdev->phys_blocklen; 4803 } 4804 4805 static uint32_t 4806 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4807 { 4808 if (!spdk_bdev_is_md_interleaved(bdev)) { 4809 return bdev->blocklen + bdev->md_len; 4810 } else { 4811 return bdev->blocklen; 4812 } 4813 } 4814 4815 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4816 typedef enum spdk_dif_type spdk_dif_type_t; 4817 4818 spdk_dif_type_t 4819 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4820 { 4821 if (bdev->md_len != 0) { 4822 return bdev->dif_type; 4823 } else { 4824 return SPDK_DIF_DISABLE; 4825 } 4826 } 4827 4828 bool 4829 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4830 { 4831 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4832 return bdev->dif_is_head_of_md; 4833 } else { 4834 return false; 4835 } 4836 } 4837 4838 bool 4839 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4840 enum spdk_dif_check_type check_type) 4841 { 4842 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4843 return false; 4844 } 4845 4846 switch (check_type) { 4847 case SPDK_DIF_CHECK_TYPE_REFTAG: 4848 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4849 case SPDK_DIF_CHECK_TYPE_APPTAG: 4850 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4851 case SPDK_DIF_CHECK_TYPE_GUARD: 4852 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4853 default: 4854 return false; 4855 } 4856 } 4857 4858 static uint32_t 4859 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 4860 { 4861 uint64_t aligned_length, max_write_blocks; 4862 4863 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 4864 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 4865 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 4866 4867 return max_write_blocks; 4868 } 4869 4870 uint32_t 4871 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4872 { 4873 return bdev->max_copy; 4874 } 4875 4876 uint64_t 4877 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4878 { 4879 return bdev->internal.measured_queue_depth; 4880 } 4881 4882 uint64_t 4883 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 4884 { 4885 return bdev->internal.period; 4886 } 4887 4888 uint64_t 4889 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 4890 { 4891 return bdev->internal.weighted_io_time; 4892 } 4893 4894 uint64_t 4895 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 4896 { 4897 return bdev->internal.io_time; 4898 } 4899 4900 union spdk_bdev_nvme_ctratt spdk_bdev_get_nvme_ctratt(struct spdk_bdev *bdev) 4901 { 4902 return bdev->ctratt; 4903 } 4904 4905 static void bdev_update_qd_sampling_period(void *ctx); 4906 4907 static void 4908 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 4909 { 4910 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 4911 4912 if (bdev->internal.measured_queue_depth) { 4913 bdev->internal.io_time += bdev->internal.period; 4914 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 4915 } 4916 4917 bdev->internal.qd_poll_in_progress = false; 4918 4919 bdev_update_qd_sampling_period(bdev); 4920 } 4921 4922 static void 4923 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4924 struct spdk_io_channel *io_ch, void *_ctx) 4925 { 4926 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 4927 4928 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4929 spdk_bdev_for_each_channel_continue(i, 0); 4930 } 4931 4932 static int 4933 bdev_calculate_measured_queue_depth(void *ctx) 4934 { 4935 struct spdk_bdev *bdev = ctx; 4936 4937 bdev->internal.qd_poll_in_progress = true; 4938 bdev->internal.temporary_queue_depth = 0; 4939 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, 
_calculate_measured_qd_cpl); 4940 return SPDK_POLLER_BUSY; 4941 } 4942 4943 static void 4944 bdev_update_qd_sampling_period(void *ctx) 4945 { 4946 struct spdk_bdev *bdev = ctx; 4947 4948 if (bdev->internal.period == bdev->internal.new_period) { 4949 return; 4950 } 4951 4952 if (bdev->internal.qd_poll_in_progress) { 4953 return; 4954 } 4955 4956 bdev->internal.period = bdev->internal.new_period; 4957 4958 spdk_poller_unregister(&bdev->internal.qd_poller); 4959 if (bdev->internal.period != 0) { 4960 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4961 bdev, bdev->internal.period); 4962 } else { 4963 spdk_bdev_close(bdev->internal.qd_desc); 4964 bdev->internal.qd_desc = NULL; 4965 } 4966 } 4967 4968 static void 4969 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4970 { 4971 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4972 } 4973 4974 void 4975 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4976 { 4977 int rc; 4978 4979 if (bdev->internal.new_period == period) { 4980 return; 4981 } 4982 4983 bdev->internal.new_period = period; 4984 4985 if (bdev->internal.qd_desc != NULL) { 4986 assert(bdev->internal.period != 0); 4987 4988 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4989 bdev_update_qd_sampling_period, bdev); 4990 return; 4991 } 4992 4993 assert(bdev->internal.period == 0); 4994 4995 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4996 NULL, &bdev->internal.qd_desc); 4997 if (rc != 0) { 4998 return; 4999 } 5000 5001 bdev->internal.period = period; 5002 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5003 bdev, period); 5004 } 5005 5006 struct bdev_get_current_qd_ctx { 5007 uint64_t current_qd; 5008 spdk_bdev_get_current_qd_cb cb_fn; 5009 void *cb_arg; 5010 }; 5011 5012 static void 5013 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 5014 { 5015 struct bdev_get_current_qd_ctx *ctx = _ctx; 5016 5017 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 5018 5019 free(ctx); 5020 } 5021 5022 static void 5023 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5024 struct spdk_io_channel *io_ch, void *_ctx) 5025 { 5026 struct bdev_get_current_qd_ctx *ctx = _ctx; 5027 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 5028 5029 ctx->current_qd += bdev_ch->io_outstanding; 5030 5031 spdk_bdev_for_each_channel_continue(i, 0); 5032 } 5033 5034 void 5035 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 5036 void *cb_arg) 5037 { 5038 struct bdev_get_current_qd_ctx *ctx; 5039 5040 assert(cb_fn != NULL); 5041 5042 ctx = calloc(1, sizeof(*ctx)); 5043 if (ctx == NULL) { 5044 cb_fn(bdev, 0, cb_arg, -ENOMEM); 5045 return; 5046 } 5047 5048 ctx->cb_fn = cb_fn; 5049 ctx->cb_arg = cb_arg; 5050 5051 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 5052 } 5053 5054 static void 5055 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 5056 { 5057 assert(desc->thread == spdk_get_thread()); 5058 5059 spdk_spin_lock(&desc->spinlock); 5060 desc->refs--; 5061 if (!desc->closed) { 5062 spdk_spin_unlock(&desc->spinlock); 5063 desc->callback.event_fn(type, 5064 desc->bdev, 5065 desc->callback.ctx); 5066 return; 5067 } else if (desc->refs == 0) { 5068 /* This descriptor was closed after this event_notify message was sent. 
5069 * spdk_bdev_close() could not free the descriptor since this message was 5070 * in flight, so we free it now using bdev_desc_free(). 5071 */ 5072 spdk_spin_unlock(&desc->spinlock); 5073 bdev_desc_free(desc); 5074 return; 5075 } 5076 spdk_spin_unlock(&desc->spinlock); 5077 } 5078 5079 static void 5080 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 5081 { 5082 spdk_spin_lock(&desc->spinlock); 5083 desc->refs++; 5084 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 5085 spdk_spin_unlock(&desc->spinlock); 5086 } 5087 5088 static void 5089 _resize_notify(void *ctx) 5090 { 5091 struct spdk_bdev_desc *desc = ctx; 5092 5093 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 5094 } 5095 5096 int 5097 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 5098 { 5099 struct spdk_bdev_desc *desc; 5100 int ret; 5101 5102 if (size == bdev->blockcnt) { 5103 return 0; 5104 } 5105 5106 spdk_spin_lock(&bdev->internal.spinlock); 5107 5108 /* bdev has open descriptors */ 5109 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 5110 bdev->blockcnt > size) { 5111 ret = -EBUSY; 5112 } else { 5113 bdev->blockcnt = size; 5114 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 5115 event_notify(desc, _resize_notify); 5116 } 5117 ret = 0; 5118 } 5119 5120 spdk_spin_unlock(&bdev->internal.spinlock); 5121 5122 return ret; 5123 } 5124 5125 /* 5126 * Convert I/O offset and length from bytes to blocks. 5127 * 5128 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 5129 */ 5130 static uint64_t 5131 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 5132 uint64_t num_bytes, uint64_t *num_blocks) 5133 { 5134 uint32_t block_size = bdev->blocklen; 5135 uint8_t shift_cnt; 5136 5137 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
*/ 5138 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 5139 shift_cnt = spdk_u32log2(block_size); 5140 *offset_blocks = offset_bytes >> shift_cnt; 5141 *num_blocks = num_bytes >> shift_cnt; 5142 return (offset_bytes - (*offset_blocks << shift_cnt)) | 5143 (num_bytes - (*num_blocks << shift_cnt)); 5144 } else { 5145 *offset_blocks = offset_bytes / block_size; 5146 *num_blocks = num_bytes / block_size; 5147 return (offset_bytes % block_size) | (num_bytes % block_size); 5148 } 5149 } 5150 5151 static bool 5152 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 5153 { 5154 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 5155 * has been an overflow and hence the offset has been wrapped around */ 5156 if (offset_blocks + num_blocks < offset_blocks) { 5157 return false; 5158 } 5159 5160 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5161 if (offset_blocks + num_blocks > bdev->blockcnt) { 5162 return false; 5163 } 5164 5165 return true; 5166 } 5167 5168 static void 5169 bdev_seek_complete_cb(void *ctx) 5170 { 5171 struct spdk_bdev_io *bdev_io = ctx; 5172 5173 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5174 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5175 } 5176 5177 static int 5178 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5179 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5180 spdk_bdev_io_completion_cb cb, void *cb_arg) 5181 { 5182 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5183 struct spdk_bdev_io *bdev_io; 5184 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5185 5186 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5187 5188 /* Check if offset_blocks is valid looking at the validity of one block */ 5189 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5190 return -EINVAL; 5191 } 5192 5193 bdev_io = bdev_channel_get_io(channel); 5194 if (!bdev_io) { 5195 return -ENOMEM; 5196 } 5197 5198 bdev_io->internal.ch = channel; 5199 bdev_io->internal.desc = desc; 5200 bdev_io->type = io_type; 5201 bdev_io->u.bdev.offset_blocks = offset_blocks; 5202 bdev_io->u.bdev.memory_domain = NULL; 5203 bdev_io->u.bdev.memory_domain_ctx = NULL; 5204 bdev_io->u.bdev.accel_sequence = NULL; 5205 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5206 5207 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5208 /* In case bdev doesn't support seek to next data/hole offset, 5209 * it is assumed that only data and no holes are present */ 5210 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5211 bdev_io->u.bdev.seek.offset = offset_blocks; 5212 } else { 5213 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5214 } 5215 5216 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5217 return 0; 5218 } 5219 5220 bdev_io_submit(bdev_io); 5221 return 0; 5222 } 5223 5224 int 5225 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5226 uint64_t offset_blocks, 5227 spdk_bdev_io_completion_cb cb, void *cb_arg) 5228 { 5229 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5230 } 5231 5232 int 5233 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5234 uint64_t offset_blocks, 5235 spdk_bdev_io_completion_cb cb, void *cb_arg) 5236 { 5237 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5238 } 5239 5240 uint64_t 5241 spdk_bdev_io_get_seek_offset(const struct 
spdk_bdev_io *bdev_io) 5242 { 5243 return bdev_io->u.bdev.seek.offset; 5244 } 5245 5246 static int 5247 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5248 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5249 spdk_bdev_io_completion_cb cb, void *cb_arg) 5250 { 5251 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5252 struct spdk_bdev_io *bdev_io; 5253 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5254 5255 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5256 return -EINVAL; 5257 } 5258 5259 bdev_io = bdev_channel_get_io(channel); 5260 if (!bdev_io) { 5261 return -ENOMEM; 5262 } 5263 5264 bdev_io->internal.ch = channel; 5265 bdev_io->internal.desc = desc; 5266 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5267 bdev_io->u.bdev.iovs = &bdev_io->iov; 5268 bdev_io->u.bdev.iovs[0].iov_base = buf; 5269 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5270 bdev_io->u.bdev.iovcnt = 1; 5271 bdev_io->u.bdev.md_buf = md_buf; 5272 bdev_io->u.bdev.num_blocks = num_blocks; 5273 bdev_io->u.bdev.offset_blocks = offset_blocks; 5274 bdev_io->u.bdev.memory_domain = NULL; 5275 bdev_io->u.bdev.memory_domain_ctx = NULL; 5276 bdev_io->u.bdev.accel_sequence = NULL; 5277 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5278 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5279 5280 bdev_io_submit(bdev_io); 5281 return 0; 5282 } 5283 5284 int 5285 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5286 void *buf, uint64_t offset, uint64_t nbytes, 5287 spdk_bdev_io_completion_cb cb, void *cb_arg) 5288 { 5289 uint64_t offset_blocks, num_blocks; 5290 5291 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5292 nbytes, &num_blocks) != 0) { 5293 return -EINVAL; 5294 } 5295 5296 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5297 } 5298 5299 int 5300 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5301 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5302 spdk_bdev_io_completion_cb cb, void *cb_arg) 5303 { 5304 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5305 } 5306 5307 int 5308 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5309 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5310 spdk_bdev_io_completion_cb cb, void *cb_arg) 5311 { 5312 struct iovec iov = { 5313 .iov_base = buf, 5314 }; 5315 5316 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5317 return -EINVAL; 5318 } 5319 5320 if (md_buf && !_is_buf_allocated(&iov)) { 5321 return -EINVAL; 5322 } 5323 5324 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5325 cb, cb_arg); 5326 } 5327 5328 int 5329 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5330 struct iovec *iov, int iovcnt, 5331 uint64_t offset, uint64_t nbytes, 5332 spdk_bdev_io_completion_cb cb, void *cb_arg) 5333 { 5334 uint64_t offset_blocks, num_blocks; 5335 5336 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5337 nbytes, &num_blocks) != 0) { 5338 return -EINVAL; 5339 } 5340 5341 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5342 } 5343 5344 static int 5345 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5346 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 
5347 			  uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx,
5348 			  struct spdk_accel_sequence *seq, uint32_t dif_check_flags,
5349 			  spdk_bdev_io_completion_cb cb, void *cb_arg)
5350 {
5351 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5352 	struct spdk_bdev_io *bdev_io;
5353 	struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5354 
5355 	if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) {
5356 		return -EINVAL;
5357 	}
5358 
5359 	bdev_io = bdev_channel_get_io(channel);
5360 	if (spdk_unlikely(!bdev_io)) {
5361 		return -ENOMEM;
5362 	}
5363 
5364 	bdev_io->internal.ch = channel;
5365 	bdev_io->internal.desc = desc;
5366 	bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
5367 	bdev_io->u.bdev.iovs = iov;
5368 	bdev_io->u.bdev.iovcnt = iovcnt;
5369 	bdev_io->u.bdev.md_buf = md_buf;
5370 	bdev_io->u.bdev.num_blocks = num_blocks;
5371 	bdev_io->u.bdev.offset_blocks = offset_blocks;
5372 	bdev_io_init(bdev_io, bdev, cb_arg, cb);
5373 	bdev_io->internal.memory_domain = domain;
5374 	bdev_io->internal.memory_domain_ctx = domain_ctx;
5375 	bdev_io->internal.accel_sequence = seq;
5376 	bdev_io->internal.has_accel_sequence = seq != NULL;
5377 	bdev_io->u.bdev.memory_domain = domain;
5378 	bdev_io->u.bdev.memory_domain_ctx = domain_ctx;
5379 	bdev_io->u.bdev.accel_sequence = seq;
5380 	bdev_io->u.bdev.dif_check_flags = dif_check_flags;
5381 
5382 	_bdev_io_submit_ext(desc, bdev_io);
5383 
5384 	return 0;
5385 }
5386 
5387 int
5388 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5389 		       struct iovec *iov, int iovcnt,
5390 		       uint64_t offset_blocks, uint64_t num_blocks,
5391 		       spdk_bdev_io_completion_cb cb, void *cb_arg)
5392 {
5393 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5394 
5395 	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
5396 					 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg);
5397 }
5398 
5399 int
5400 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5401 			       struct iovec *iov, int iovcnt, void *md_buf,
5402 			       uint64_t offset_blocks, uint64_t num_blocks,
5403 			       spdk_bdev_io_completion_cb cb, void *cb_arg)
5404 {
5405 	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5406 
5407 	if (md_buf && !spdk_bdev_is_md_separate(bdev)) {
5408 		return -EINVAL;
5409 	}
5410 
5411 	if (md_buf && !_is_buf_allocated(iov)) {
5412 		return -EINVAL;
5413 	}
5414 
5415 	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
5416 					 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg);
5417 }
5418 
5419 static inline bool
5420 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov)
5421 {
5422 	/*
5423 	 * We check that the opts size is at least the size it had when we first introduced
5424 	 * spdk_bdev_ext_io_opts (ac6f2bdd8d), since accesses to those members
5425 	 * are not checked internally.
5426 */ 5427 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5428 sizeof(opts->metadata) && 5429 opts->size <= sizeof(*opts) && 5430 /* When memory domain is used, the user must provide data buffers */ 5431 (!opts->memory_domain || (iov && iov[0].iov_base)); 5432 } 5433 5434 int 5435 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5436 struct iovec *iov, int iovcnt, 5437 uint64_t offset_blocks, uint64_t num_blocks, 5438 spdk_bdev_io_completion_cb cb, void *cb_arg, 5439 struct spdk_bdev_ext_io_opts *opts) 5440 { 5441 struct spdk_memory_domain *domain = NULL; 5442 struct spdk_accel_sequence *seq = NULL; 5443 void *domain_ctx = NULL, *md = NULL; 5444 uint32_t dif_check_flags = 0; 5445 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5446 5447 if (opts) { 5448 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5449 return -EINVAL; 5450 } 5451 5452 md = opts->metadata; 5453 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5454 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5455 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5456 if (md) { 5457 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5458 return -EINVAL; 5459 } 5460 5461 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5462 return -EINVAL; 5463 } 5464 5465 if (spdk_unlikely(seq != NULL)) { 5466 return -EINVAL; 5467 } 5468 } 5469 } 5470 5471 dif_check_flags = bdev->dif_check_flags & 5472 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5473 5474 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5475 num_blocks, domain, domain_ctx, seq, dif_check_flags, cb, cb_arg); 5476 } 5477 5478 static int 5479 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5480 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5481 spdk_bdev_io_completion_cb cb, void *cb_arg) 5482 { 5483 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5484 struct spdk_bdev_io *bdev_io; 5485 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5486 5487 if (!desc->write) { 5488 return -EBADF; 5489 } 5490 5491 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5492 return -EINVAL; 5493 } 5494 5495 bdev_io = bdev_channel_get_io(channel); 5496 if (!bdev_io) { 5497 return -ENOMEM; 5498 } 5499 5500 bdev_io->internal.ch = channel; 5501 bdev_io->internal.desc = desc; 5502 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5503 bdev_io->u.bdev.iovs = &bdev_io->iov; 5504 bdev_io->u.bdev.iovs[0].iov_base = buf; 5505 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5506 bdev_io->u.bdev.iovcnt = 1; 5507 bdev_io->u.bdev.md_buf = md_buf; 5508 bdev_io->u.bdev.num_blocks = num_blocks; 5509 bdev_io->u.bdev.offset_blocks = offset_blocks; 5510 bdev_io->u.bdev.memory_domain = NULL; 5511 bdev_io->u.bdev.memory_domain_ctx = NULL; 5512 bdev_io->u.bdev.accel_sequence = NULL; 5513 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5514 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5515 5516 bdev_io_submit(bdev_io); 5517 return 0; 5518 } 5519 5520 int 5521 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5522 void *buf, uint64_t offset, uint64_t nbytes, 5523 spdk_bdev_io_completion_cb cb, void *cb_arg) 5524 { 5525 uint64_t offset_blocks, num_blocks; 5526 5527 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5528 nbytes, &num_blocks) != 0) { 5529 return -EINVAL; 5530 } 5531 5532 return spdk_bdev_write_blocks(desc, ch, buf, 
offset_blocks, num_blocks, cb, cb_arg); 5533 } 5534 5535 int 5536 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5537 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5538 spdk_bdev_io_completion_cb cb, void *cb_arg) 5539 { 5540 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5541 cb, cb_arg); 5542 } 5543 5544 int 5545 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5546 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5547 spdk_bdev_io_completion_cb cb, void *cb_arg) 5548 { 5549 struct iovec iov = { 5550 .iov_base = buf, 5551 }; 5552 5553 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5554 return -EINVAL; 5555 } 5556 5557 if (md_buf && !_is_buf_allocated(&iov)) { 5558 return -EINVAL; 5559 } 5560 5561 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5562 cb, cb_arg); 5563 } 5564 5565 static int 5566 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5567 struct iovec *iov, int iovcnt, void *md_buf, 5568 uint64_t offset_blocks, uint64_t num_blocks, 5569 struct spdk_memory_domain *domain, void *domain_ctx, 5570 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 5571 uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw, 5572 spdk_bdev_io_completion_cb cb, void *cb_arg) 5573 { 5574 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5575 struct spdk_bdev_io *bdev_io; 5576 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5577 5578 if (spdk_unlikely(!desc->write)) { 5579 return -EBADF; 5580 } 5581 5582 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 5583 return -EINVAL; 5584 } 5585 5586 bdev_io = bdev_channel_get_io(channel); 5587 if (spdk_unlikely(!bdev_io)) { 5588 return -ENOMEM; 5589 } 5590 5591 bdev_io->internal.ch = channel; 5592 bdev_io->internal.desc = desc; 5593 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5594 bdev_io->u.bdev.iovs = iov; 5595 bdev_io->u.bdev.iovcnt = iovcnt; 5596 bdev_io->u.bdev.md_buf = md_buf; 5597 bdev_io->u.bdev.num_blocks = num_blocks; 5598 bdev_io->u.bdev.offset_blocks = offset_blocks; 5599 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5600 bdev_io->internal.memory_domain = domain; 5601 bdev_io->internal.memory_domain_ctx = domain_ctx; 5602 bdev_io->internal.accel_sequence = seq; 5603 bdev_io->internal.has_accel_sequence = seq != NULL; 5604 bdev_io->u.bdev.memory_domain = domain; 5605 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5606 bdev_io->u.bdev.accel_sequence = seq; 5607 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 5608 bdev_io->u.bdev.nvme_cdw12.raw = nvme_cdw12_raw; 5609 bdev_io->u.bdev.nvme_cdw13.raw = nvme_cdw13_raw; 5610 5611 _bdev_io_submit_ext(desc, bdev_io); 5612 5613 return 0; 5614 } 5615 5616 int 5617 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5618 struct iovec *iov, int iovcnt, 5619 uint64_t offset, uint64_t len, 5620 spdk_bdev_io_completion_cb cb, void *cb_arg) 5621 { 5622 uint64_t offset_blocks, num_blocks; 5623 5624 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5625 len, &num_blocks) != 0) { 5626 return -EINVAL; 5627 } 5628 5629 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5630 } 5631 5632 int 5633 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5634 struct iovec *iov, int iovcnt, 5635 uint64_t offset_blocks, uint64_t num_blocks, 
5636 spdk_bdev_io_completion_cb cb, void *cb_arg) 5637 { 5638 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5639 5640 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5641 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 5642 cb, cb_arg); 5643 } 5644 5645 int 5646 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5647 struct iovec *iov, int iovcnt, void *md_buf, 5648 uint64_t offset_blocks, uint64_t num_blocks, 5649 spdk_bdev_io_completion_cb cb, void *cb_arg) 5650 { 5651 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5652 5653 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 5654 return -EINVAL; 5655 } 5656 5657 if (md_buf && !_is_buf_allocated(iov)) { 5658 return -EINVAL; 5659 } 5660 5661 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5662 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 5663 cb, cb_arg); 5664 } 5665 5666 int 5667 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5668 struct iovec *iov, int iovcnt, 5669 uint64_t offset_blocks, uint64_t num_blocks, 5670 spdk_bdev_io_completion_cb cb, void *cb_arg, 5671 struct spdk_bdev_ext_io_opts *opts) 5672 { 5673 struct spdk_memory_domain *domain = NULL; 5674 struct spdk_accel_sequence *seq = NULL; 5675 void *domain_ctx = NULL, *md = NULL; 5676 uint32_t dif_check_flags = 0; 5677 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5678 uint32_t nvme_cdw12_raw = 0; 5679 uint32_t nvme_cdw13_raw = 0; 5680 5681 if (opts) { 5682 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5683 return -EINVAL; 5684 } 5685 md = opts->metadata; 5686 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5687 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5688 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5689 nvme_cdw12_raw = bdev_get_ext_io_opt(opts, nvme_cdw12.raw, 0); 5690 nvme_cdw13_raw = bdev_get_ext_io_opt(opts, nvme_cdw13.raw, 0); 5691 if (md) { 5692 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5693 return -EINVAL; 5694 } 5695 5696 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5697 return -EINVAL; 5698 } 5699 5700 if (spdk_unlikely(seq != NULL)) { 5701 return -EINVAL; 5702 } 5703 } 5704 } 5705 5706 dif_check_flags = bdev->dif_check_flags & 5707 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5708 5709 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5710 domain, domain_ctx, seq, dif_check_flags, 5711 nvme_cdw12_raw, nvme_cdw13_raw, cb, cb_arg); 5712 } 5713 5714 static void 5715 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5716 { 5717 struct spdk_bdev_io *parent_io = cb_arg; 5718 struct spdk_bdev *bdev = parent_io->bdev; 5719 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5720 int i, rc = 0; 5721 5722 if (!success) { 5723 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5724 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5725 spdk_bdev_free_io(bdev_io); 5726 return; 5727 } 5728 5729 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5730 rc = memcmp(read_buf, 5731 parent_io->u.bdev.iovs[i].iov_base, 5732 parent_io->u.bdev.iovs[i].iov_len); 5733 if (rc) { 5734 break; 5735 } 5736 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5737 } 5738 5739 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 5740 rc = memcmp(bdev_io->u.bdev.md_buf, 5741 parent_io->u.bdev.md_buf, 5742 
spdk_bdev_get_md_size(bdev)); 5743 } 5744 5745 spdk_bdev_free_io(bdev_io); 5746 5747 if (rc == 0) { 5748 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5749 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5750 } else { 5751 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5752 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5753 } 5754 } 5755 5756 static void 5757 bdev_compare_do_read(void *_bdev_io) 5758 { 5759 struct spdk_bdev_io *bdev_io = _bdev_io; 5760 int rc; 5761 5762 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5763 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5764 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5765 bdev_compare_do_read_done, bdev_io); 5766 5767 if (rc == -ENOMEM) { 5768 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5769 } else if (rc != 0) { 5770 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5771 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5772 } 5773 } 5774 5775 static int 5776 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5777 struct iovec *iov, int iovcnt, void *md_buf, 5778 uint64_t offset_blocks, uint64_t num_blocks, 5779 spdk_bdev_io_completion_cb cb, void *cb_arg) 5780 { 5781 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5782 struct spdk_bdev_io *bdev_io; 5783 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5784 5785 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5786 return -EINVAL; 5787 } 5788 5789 bdev_io = bdev_channel_get_io(channel); 5790 if (!bdev_io) { 5791 return -ENOMEM; 5792 } 5793 5794 bdev_io->internal.ch = channel; 5795 bdev_io->internal.desc = desc; 5796 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5797 bdev_io->u.bdev.iovs = iov; 5798 bdev_io->u.bdev.iovcnt = iovcnt; 5799 bdev_io->u.bdev.md_buf = md_buf; 5800 bdev_io->u.bdev.num_blocks = num_blocks; 5801 bdev_io->u.bdev.offset_blocks = offset_blocks; 5802 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5803 bdev_io->u.bdev.memory_domain = NULL; 5804 bdev_io->u.bdev.memory_domain_ctx = NULL; 5805 bdev_io->u.bdev.accel_sequence = NULL; 5806 5807 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5808 bdev_io_submit(bdev_io); 5809 return 0; 5810 } 5811 5812 bdev_compare_do_read(bdev_io); 5813 5814 return 0; 5815 } 5816 5817 int 5818 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5819 struct iovec *iov, int iovcnt, 5820 uint64_t offset_blocks, uint64_t num_blocks, 5821 spdk_bdev_io_completion_cb cb, void *cb_arg) 5822 { 5823 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5824 num_blocks, cb, cb_arg); 5825 } 5826 5827 int 5828 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5829 struct iovec *iov, int iovcnt, void *md_buf, 5830 uint64_t offset_blocks, uint64_t num_blocks, 5831 spdk_bdev_io_completion_cb cb, void *cb_arg) 5832 { 5833 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5834 return -EINVAL; 5835 } 5836 5837 if (md_buf && !_is_buf_allocated(iov)) { 5838 return -EINVAL; 5839 } 5840 5841 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5842 num_blocks, cb, cb_arg); 5843 } 5844 5845 static int 5846 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5847 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5848 spdk_bdev_io_completion_cb cb, void 
*cb_arg) 5849 { 5850 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5851 struct spdk_bdev_io *bdev_io; 5852 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5853 5854 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5855 return -EINVAL; 5856 } 5857 5858 bdev_io = bdev_channel_get_io(channel); 5859 if (!bdev_io) { 5860 return -ENOMEM; 5861 } 5862 5863 bdev_io->internal.ch = channel; 5864 bdev_io->internal.desc = desc; 5865 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5866 bdev_io->u.bdev.iovs = &bdev_io->iov; 5867 bdev_io->u.bdev.iovs[0].iov_base = buf; 5868 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5869 bdev_io->u.bdev.iovcnt = 1; 5870 bdev_io->u.bdev.md_buf = md_buf; 5871 bdev_io->u.bdev.num_blocks = num_blocks; 5872 bdev_io->u.bdev.offset_blocks = offset_blocks; 5873 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5874 bdev_io->u.bdev.memory_domain = NULL; 5875 bdev_io->u.bdev.memory_domain_ctx = NULL; 5876 bdev_io->u.bdev.accel_sequence = NULL; 5877 5878 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5879 bdev_io_submit(bdev_io); 5880 return 0; 5881 } 5882 5883 bdev_compare_do_read(bdev_io); 5884 5885 return 0; 5886 } 5887 5888 int 5889 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5890 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5891 spdk_bdev_io_completion_cb cb, void *cb_arg) 5892 { 5893 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5894 cb, cb_arg); 5895 } 5896 5897 int 5898 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5899 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5900 spdk_bdev_io_completion_cb cb, void *cb_arg) 5901 { 5902 struct iovec iov = { 5903 .iov_base = buf, 5904 }; 5905 5906 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5907 return -EINVAL; 5908 } 5909 5910 if (md_buf && !_is_buf_allocated(&iov)) { 5911 return -EINVAL; 5912 } 5913 5914 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5915 cb, cb_arg); 5916 } 5917 5918 static void 5919 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status) 5920 { 5921 struct spdk_bdev_io *bdev_io = ctx; 5922 5923 if (unlock_status) { 5924 SPDK_ERRLOG("LBA range unlock failed\n"); 5925 } 5926 5927 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 5928 false, bdev_io->internal.caller_ctx); 5929 } 5930 5931 static void 5932 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 5933 { 5934 bdev_io->internal.status = status; 5935 5936 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 5937 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5938 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 5939 } 5940 5941 static void 5942 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5943 { 5944 struct spdk_bdev_io *parent_io = cb_arg; 5945 5946 if (!success) { 5947 SPDK_ERRLOG("Compare and write operation failed\n"); 5948 } 5949 5950 spdk_bdev_free_io(bdev_io); 5951 5952 bdev_comparev_and_writev_blocks_unlock(parent_io, 5953 success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 5954 } 5955 5956 static void 5957 bdev_compare_and_write_do_write(void *_bdev_io) 5958 { 5959 struct spdk_bdev_io *bdev_io = _bdev_io; 5960 int rc; 5961 5962 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 5963 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5964 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 5965 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5966 bdev_compare_and_write_do_write_done, bdev_io); 5967 5968 5969 if (rc == -ENOMEM) { 5970 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 5971 } else if (rc != 0) { 5972 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5973 } 5974 } 5975 5976 static void 5977 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5978 { 5979 struct spdk_bdev_io *parent_io = cb_arg; 5980 5981 spdk_bdev_free_io(bdev_io); 5982 5983 if (!success) { 5984 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 5985 return; 5986 } 5987 5988 bdev_compare_and_write_do_write(parent_io); 5989 } 5990 5991 static void 5992 bdev_compare_and_write_do_compare(void *_bdev_io) 5993 { 5994 struct spdk_bdev_io *bdev_io = _bdev_io; 5995 int rc; 5996 5997 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 5998 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 5999 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6000 bdev_compare_and_write_do_compare_done, bdev_io); 6001 6002 if (rc == -ENOMEM) { 6003 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 6004 } else if (rc != 0) { 6005 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 6006 } 6007 } 6008 6009 static void 6010 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status) 6011 { 6012 struct spdk_bdev_io *bdev_io = ctx; 6013 6014 if (status) { 6015 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 6016 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6017 return; 6018 } 6019 6020 bdev_compare_and_write_do_compare(bdev_io); 6021 } 6022 6023 int 6024 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6025 struct iovec *compare_iov, int compare_iovcnt, 6026 struct iovec *write_iov, int write_iovcnt, 6027 uint64_t offset_blocks, uint64_t num_blocks, 6028 spdk_bdev_io_completion_cb cb, void *cb_arg) 6029 { 6030 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6031 struct spdk_bdev_io *bdev_io; 6032 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6033 6034 if (!desc->write) { 6035 return -EBADF; 6036 } 6037 6038 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6039 return -EINVAL; 6040 } 6041 6042 if (num_blocks > bdev->acwu) { 6043 return -EINVAL; 6044 } 6045 6046 bdev_io = bdev_channel_get_io(channel); 6047 if (!bdev_io) { 6048 return -ENOMEM; 6049 } 6050 6051 bdev_io->internal.ch = channel; 6052 bdev_io->internal.desc = desc; 6053 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 6054 bdev_io->u.bdev.iovs = compare_iov; 6055 bdev_io->u.bdev.iovcnt = compare_iovcnt; 6056 bdev_io->u.bdev.fused_iovs = write_iov; 6057 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 6058 bdev_io->u.bdev.md_buf = NULL; 6059 bdev_io->u.bdev.num_blocks = num_blocks; 6060 bdev_io->u.bdev.offset_blocks = offset_blocks; 6061 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6062 
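/* If the bdev does not support COMPARE_AND_WRITE natively, the operation is
 * emulated below: the LBA range is locked and a COMPARE followed by a WRITE
 * is issued (see bdev_comparev_and_writev_blocks_locked()), with the range
 * unlocked again when the fused operation completes. */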
bdev_io->u.bdev.memory_domain = NULL; 6063 bdev_io->u.bdev.memory_domain_ctx = NULL; 6064 bdev_io->u.bdev.accel_sequence = NULL; 6065 6066 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 6067 bdev_io_submit(bdev_io); 6068 return 0; 6069 } 6070 6071 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 6072 bdev_comparev_and_writev_blocks_locked, bdev_io); 6073 } 6074 6075 int 6076 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6077 struct iovec *iov, int iovcnt, 6078 uint64_t offset_blocks, uint64_t num_blocks, 6079 bool populate, 6080 spdk_bdev_io_completion_cb cb, void *cb_arg) 6081 { 6082 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6083 struct spdk_bdev_io *bdev_io; 6084 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6085 6086 if (!desc->write) { 6087 return -EBADF; 6088 } 6089 6090 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6091 return -EINVAL; 6092 } 6093 6094 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 6095 return -ENOTSUP; 6096 } 6097 6098 bdev_io = bdev_channel_get_io(channel); 6099 if (!bdev_io) { 6100 return -ENOMEM; 6101 } 6102 6103 bdev_io->internal.ch = channel; 6104 bdev_io->internal.desc = desc; 6105 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 6106 bdev_io->u.bdev.num_blocks = num_blocks; 6107 bdev_io->u.bdev.offset_blocks = offset_blocks; 6108 bdev_io->u.bdev.iovs = iov; 6109 bdev_io->u.bdev.iovcnt = iovcnt; 6110 bdev_io->u.bdev.md_buf = NULL; 6111 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 6112 bdev_io->u.bdev.zcopy.commit = 0; 6113 bdev_io->u.bdev.zcopy.start = 1; 6114 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6115 bdev_io->u.bdev.memory_domain = NULL; 6116 bdev_io->u.bdev.memory_domain_ctx = NULL; 6117 bdev_io->u.bdev.accel_sequence = NULL; 6118 6119 bdev_io_submit(bdev_io); 6120 6121 return 0; 6122 } 6123 6124 int 6125 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 6126 spdk_bdev_io_completion_cb cb, void *cb_arg) 6127 { 6128 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 6129 return -EINVAL; 6130 } 6131 6132 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 6133 bdev_io->u.bdev.zcopy.start = 0; 6134 bdev_io->internal.caller_ctx = cb_arg; 6135 bdev_io->internal.cb = cb; 6136 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 6137 6138 bdev_io_submit(bdev_io); 6139 6140 return 0; 6141 } 6142 6143 int 6144 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6145 uint64_t offset, uint64_t len, 6146 spdk_bdev_io_completion_cb cb, void *cb_arg) 6147 { 6148 uint64_t offset_blocks, num_blocks; 6149 6150 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6151 len, &num_blocks) != 0) { 6152 return -EINVAL; 6153 } 6154 6155 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6156 } 6157 6158 int 6159 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6160 uint64_t offset_blocks, uint64_t num_blocks, 6161 spdk_bdev_io_completion_cb cb, void *cb_arg) 6162 { 6163 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6164 struct spdk_bdev_io *bdev_io; 6165 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6166 6167 if (!desc->write) { 6168 return -EBADF; 6169 } 6170 6171 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6172 return -EINVAL; 6173 } 6174 6175 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 6176 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 6177 return -ENOTSUP; 6178 } 6179 6180 bdev_io = bdev_channel_get_io(channel); 6181 6182 if (!bdev_io) { 6183 return -ENOMEM; 6184 } 6185 6186 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 6187 bdev_io->internal.ch = channel; 6188 bdev_io->internal.desc = desc; 6189 bdev_io->u.bdev.offset_blocks = offset_blocks; 6190 bdev_io->u.bdev.num_blocks = num_blocks; 6191 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6192 bdev_io->u.bdev.memory_domain = NULL; 6193 bdev_io->u.bdev.memory_domain_ctx = NULL; 6194 bdev_io->u.bdev.accel_sequence = NULL; 6195 6196 /* If the write_zeroes size is large and should be split, use the generic split 6197 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported or not. 6198 * 6199 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported 6200 * or emulate it using regular write requests otherwise.
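 *
 * In the unsupported, non-split case the request falls through to
 * bdev_write_zero_buffer() below, which emulates it with regular WRITE
 * requests filled from the zero buffer.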
6201 */ 6202 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 6203 bdev_io->internal.split) { 6204 bdev_io_submit(bdev_io); 6205 return 0; 6206 } 6207 6208 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6209 6210 return bdev_write_zero_buffer(bdev_io); 6211 } 6212 6213 int 6214 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6215 uint64_t offset, uint64_t nbytes, 6216 spdk_bdev_io_completion_cb cb, void *cb_arg) 6217 { 6218 uint64_t offset_blocks, num_blocks; 6219 6220 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6221 nbytes, &num_blocks) != 0) { 6222 return -EINVAL; 6223 } 6224 6225 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6226 } 6227 6228 static void 6229 bdev_io_complete_cb(void *ctx) 6230 { 6231 struct spdk_bdev_io *bdev_io = ctx; 6232 6233 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6234 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 6235 } 6236 6237 int 6238 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6239 uint64_t offset_blocks, uint64_t num_blocks, 6240 spdk_bdev_io_completion_cb cb, void *cb_arg) 6241 { 6242 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6243 struct spdk_bdev_io *bdev_io; 6244 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6245 6246 if (!desc->write) { 6247 return -EBADF; 6248 } 6249 6250 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6251 return -EINVAL; 6252 } 6253 6254 bdev_io = bdev_channel_get_io(channel); 6255 if (!bdev_io) { 6256 return -ENOMEM; 6257 } 6258 6259 bdev_io->internal.ch = channel; 6260 bdev_io->internal.desc = desc; 6261 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6262 6263 bdev_io->u.bdev.iovs = &bdev_io->iov; 6264 bdev_io->u.bdev.iovs[0].iov_base = NULL; 6265 bdev_io->u.bdev.iovs[0].iov_len = 0; 6266 bdev_io->u.bdev.iovcnt = 1; 6267 6268 bdev_io->u.bdev.offset_blocks = offset_blocks; 6269 bdev_io->u.bdev.num_blocks = num_blocks; 6270 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6271 bdev_io->u.bdev.memory_domain = NULL; 6272 bdev_io->u.bdev.memory_domain_ctx = NULL; 6273 bdev_io->u.bdev.accel_sequence = NULL; 6274 6275 if (num_blocks == 0) { 6276 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 6277 return 0; 6278 } 6279 6280 bdev_io_submit(bdev_io); 6281 return 0; 6282 } 6283 6284 int 6285 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6286 uint64_t offset, uint64_t length, 6287 spdk_bdev_io_completion_cb cb, void *cb_arg) 6288 { 6289 uint64_t offset_blocks, num_blocks; 6290 6291 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6292 length, &num_blocks) != 0) { 6293 return -EINVAL; 6294 } 6295 6296 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6297 } 6298 6299 int 6300 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6301 uint64_t offset_blocks, uint64_t num_blocks, 6302 spdk_bdev_io_completion_cb cb, void *cb_arg) 6303 { 6304 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6305 struct spdk_bdev_io *bdev_io; 6306 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6307 6308 if (!desc->write) { 6309 return -EBADF; 6310 } 6311 6312 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6313 return -EINVAL; 6314 } 6315 6316 bdev_io = bdev_channel_get_io(channel); 6317 if (!bdev_io) { 6318 return -ENOMEM; 6319 } 6320 6321 bdev_io->internal.ch = channel; 
6322 bdev_io->internal.desc = desc; 6323 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 6324 bdev_io->u.bdev.iovs = NULL; 6325 bdev_io->u.bdev.iovcnt = 0; 6326 bdev_io->u.bdev.offset_blocks = offset_blocks; 6327 bdev_io->u.bdev.num_blocks = num_blocks; 6328 bdev_io->u.bdev.memory_domain = NULL; 6329 bdev_io->u.bdev.memory_domain_ctx = NULL; 6330 bdev_io->u.bdev.accel_sequence = NULL; 6331 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6332 6333 bdev_io_submit(bdev_io); 6334 return 0; 6335 } 6336 6337 static int bdev_reset_poll_for_outstanding_io(void *ctx); 6338 6339 static void 6340 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 6341 { 6342 struct spdk_bdev_channel *ch = _ctx; 6343 struct spdk_bdev_io *bdev_io; 6344 6345 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6346 6347 if (status == -EBUSY) { 6348 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 6349 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 6350 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 6351 } else { 6352 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6353 6354 if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) { 6355 /* If outstanding IOs are still present and reset_io_drain_timeout 6356 * seconds have passed, start the reset. */ 6357 bdev_io_submit_reset(bdev_io); 6358 } else { 6359 /* We still have an in-progress memory domain pull/push or we're 6360 * executing an accel sequence. Since we cannot abort either of those 6361 * operations, fail the reset request. */ 6362 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6363 } 6364 } 6365 } else { 6366 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6367 SPDK_DEBUGLOG(bdev, 6368 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 6369 ch->bdev->name); 6370 /* Mark the completion status as a SUCCESS and complete the reset. */ 6371 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 6372 } 6373 } 6374 6375 static void 6376 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6377 struct spdk_io_channel *io_ch, void *_ctx) 6378 { 6379 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 6380 int status = 0; 6381 6382 if (cur_ch->io_outstanding > 0 || 6383 !TAILQ_EMPTY(&cur_ch->io_memory_domain) || 6384 !TAILQ_EMPTY(&cur_ch->io_accel_exec)) { 6385 /* If a channel has outstanding I/O, set the status to -EBUSY. This will stop 6386 * further iteration over the rest of the channels and pass non-zero status 6387 * to the callback function.
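 * bdev_reset_check_outstanding_io_done() then either re-arms the poller to
 * check again after BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD or, once the
 * drain timeout has expired, submits or fails the reset.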
*/ 6388 status = -EBUSY; 6389 } 6390 spdk_bdev_for_each_channel_continue(i, status); 6391 } 6392 6393 static int 6394 bdev_reset_poll_for_outstanding_io(void *ctx) 6395 { 6396 struct spdk_bdev_channel *ch = ctx; 6397 struct spdk_bdev_io *bdev_io; 6398 6399 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6400 6401 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6402 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6403 bdev_reset_check_outstanding_io_done); 6404 6405 return SPDK_POLLER_BUSY; 6406 } 6407 6408 static void 6409 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6410 { 6411 struct spdk_bdev_channel *ch = _ctx; 6412 struct spdk_bdev_io *bdev_io; 6413 6414 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6415 6416 if (bdev->reset_io_drain_timeout == 0) { 6417 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6418 6419 bdev_io_submit_reset(bdev_io); 6420 return; 6421 } 6422 6423 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6424 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6425 6426 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6427 * submit the reset to the underlying module only if outstanding I/O 6428 * remain after reset_io_drain_timeout seconds have passed. */ 6429 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6430 bdev_reset_check_outstanding_io_done); 6431 } 6432 6433 static void 6434 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6435 struct spdk_io_channel *ch, void *_ctx) 6436 { 6437 struct spdk_bdev_channel *channel; 6438 struct spdk_bdev_mgmt_channel *mgmt_channel; 6439 struct spdk_bdev_shared_resource *shared_resource; 6440 bdev_io_tailq_t tmp_queued; 6441 6442 TAILQ_INIT(&tmp_queued); 6443 6444 channel = __io_ch_to_bdev_ch(ch); 6445 shared_resource = channel->shared_resource; 6446 mgmt_channel = shared_resource->mgmt_ch; 6447 6448 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6449 6450 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6451 TAILQ_SWAP(&channel->qos_queued_io, &tmp_queued, spdk_bdev_io, internal.link); 6452 } 6453 6454 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6455 bdev_abort_all_buf_io(mgmt_channel, channel); 6456 bdev_abort_all_queued_io(&tmp_queued, channel); 6457 6458 spdk_bdev_for_each_channel_continue(i, 0); 6459 } 6460 6461 static void 6462 bdev_start_reset(void *ctx) 6463 { 6464 struct spdk_bdev_channel *ch = ctx; 6465 6466 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, 6467 bdev_reset_freeze_channel_done); 6468 } 6469 6470 static void 6471 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 6472 { 6473 struct spdk_bdev *bdev = ch->bdev; 6474 6475 assert(!TAILQ_EMPTY(&ch->queued_resets)); 6476 6477 spdk_spin_lock(&bdev->internal.spinlock); 6478 if (bdev->internal.reset_in_progress == NULL) { 6479 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 6480 /* 6481 * Take a channel reference for the target bdev for the life of this 6482 * reset. This guards against the channel getting destroyed while 6483 * spdk_bdev_for_each_channel() calls related to this reset IO are in 6484 * progress. We will release the reference when this reset is 6485 * completed. 
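 * (bdev_reset_complete() drops this reference via spdk_put_io_channel()
 * once the reset I/O finishes.)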
6486 */ 6487 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6488 bdev_start_reset(ch); 6489 } 6490 spdk_spin_unlock(&bdev->internal.spinlock); 6491 } 6492 6493 int 6494 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6495 spdk_bdev_io_completion_cb cb, void *cb_arg) 6496 { 6497 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6498 struct spdk_bdev_io *bdev_io; 6499 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6500 6501 bdev_io = bdev_channel_get_io(channel); 6502 if (!bdev_io) { 6503 return -ENOMEM; 6504 } 6505 6506 bdev_io->internal.ch = channel; 6507 bdev_io->internal.desc = desc; 6508 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6509 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6510 bdev_io->u.reset.ch_ref = NULL; 6511 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6512 6513 spdk_spin_lock(&bdev->internal.spinlock); 6514 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 6515 spdk_spin_unlock(&bdev->internal.spinlock); 6516 6517 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 6518 internal.ch_link); 6519 6520 bdev_channel_start_reset(channel); 6521 6522 return 0; 6523 } 6524 6525 void 6526 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6527 struct spdk_bdev_io_stat *stat) 6528 { 6529 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6530 6531 bdev_get_io_stat(stat, channel->stat); 6532 } 6533 6534 static void 6535 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6536 { 6537 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6538 6539 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6540 bdev_iostat_ctx->cb_arg, 0); 6541 free(bdev_iostat_ctx); 6542 } 6543 6544 static void 6545 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6546 struct spdk_io_channel *ch, void *_ctx) 6547 { 6548 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6549 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6550 6551 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6552 spdk_bdev_for_each_channel_continue(i, 0); 6553 } 6554 6555 void 6556 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6557 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6558 { 6559 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6560 6561 assert(bdev != NULL); 6562 assert(stat != NULL); 6563 assert(cb != NULL); 6564 6565 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6566 if (bdev_iostat_ctx == NULL) { 6567 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6568 cb(bdev, stat, cb_arg, -ENOMEM); 6569 return; 6570 } 6571 6572 bdev_iostat_ctx->stat = stat; 6573 bdev_iostat_ctx->cb = cb; 6574 bdev_iostat_ctx->cb_arg = cb_arg; 6575 6576 /* Start with the statistics from previously deleted channels. */ 6577 spdk_spin_lock(&bdev->internal.spinlock); 6578 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6579 spdk_spin_unlock(&bdev->internal.spinlock); 6580 6581 /* Then iterate and add the statistics from each existing channel. 
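 * Each channel's counters are accumulated on that channel's thread by
 * bdev_get_each_channel_stat(), and bdev_get_device_stat_done() invokes the
 * user callback once the iteration completes. A rough usage sketch
 * (illustrative only; my_stat_done is a placeholder, and stat must remain
 * valid until the callback runs):
 *
 *   static void
 *   my_stat_done(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
 *                void *cb_arg, int rc)
 *   {
 *           ... consume the aggregated counters in *stat ...
 *   }
 *
 *   spdk_bdev_get_device_stat(bdev, stat, my_stat_done, NULL);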
*/ 6582 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6583 bdev_get_device_stat_done); 6584 } 6585 6586 struct bdev_iostat_reset_ctx { 6587 enum spdk_bdev_reset_stat_mode mode; 6588 bdev_reset_device_stat_cb cb; 6589 void *cb_arg; 6590 }; 6591 6592 static void 6593 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6594 { 6595 struct bdev_iostat_reset_ctx *ctx = _ctx; 6596 6597 ctx->cb(bdev, ctx->cb_arg, 0); 6598 6599 free(ctx); 6600 } 6601 6602 static void 6603 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6604 struct spdk_io_channel *ch, void *_ctx) 6605 { 6606 struct bdev_iostat_reset_ctx *ctx = _ctx; 6607 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6608 6609 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6610 6611 spdk_bdev_for_each_channel_continue(i, 0); 6612 } 6613 6614 void 6615 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6616 bdev_reset_device_stat_cb cb, void *cb_arg) 6617 { 6618 struct bdev_iostat_reset_ctx *ctx; 6619 6620 assert(bdev != NULL); 6621 assert(cb != NULL); 6622 6623 ctx = calloc(1, sizeof(*ctx)); 6624 if (ctx == NULL) { 6625 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6626 cb(bdev, cb_arg, -ENOMEM); 6627 return; 6628 } 6629 6630 ctx->mode = mode; 6631 ctx->cb = cb; 6632 ctx->cb_arg = cb_arg; 6633 6634 spdk_spin_lock(&bdev->internal.spinlock); 6635 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6636 spdk_spin_unlock(&bdev->internal.spinlock); 6637 6638 spdk_bdev_for_each_channel(bdev, 6639 bdev_reset_each_channel_stat, 6640 ctx, 6641 bdev_reset_device_stat_done); 6642 } 6643 6644 int 6645 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6646 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6647 spdk_bdev_io_completion_cb cb, void *cb_arg) 6648 { 6649 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6650 struct spdk_bdev_io *bdev_io; 6651 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6652 6653 if (!desc->write) { 6654 return -EBADF; 6655 } 6656 6657 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6658 return -ENOTSUP; 6659 } 6660 6661 bdev_io = bdev_channel_get_io(channel); 6662 if (!bdev_io) { 6663 return -ENOMEM; 6664 } 6665 6666 bdev_io->internal.ch = channel; 6667 bdev_io->internal.desc = desc; 6668 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6669 bdev_io->u.nvme_passthru.cmd = *cmd; 6670 bdev_io->u.nvme_passthru.buf = buf; 6671 bdev_io->u.nvme_passthru.nbytes = nbytes; 6672 bdev_io->u.nvme_passthru.md_buf = NULL; 6673 bdev_io->u.nvme_passthru.md_len = 0; 6674 6675 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6676 6677 bdev_io_submit(bdev_io); 6678 return 0; 6679 } 6680 6681 int 6682 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6683 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6684 spdk_bdev_io_completion_cb cb, void *cb_arg) 6685 { 6686 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6687 struct spdk_bdev_io *bdev_io; 6688 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6689 6690 if (!desc->write) { 6691 /* 6692 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6693 * to easily determine if the command is a read or write, but for now just 6694 * do not allow io_passthru with a read-only descriptor. 
6695 */ 6696 return -EBADF; 6697 } 6698 6699 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6700 return -ENOTSUP; 6701 } 6702 6703 bdev_io = bdev_channel_get_io(channel); 6704 if (!bdev_io) { 6705 return -ENOMEM; 6706 } 6707 6708 bdev_io->internal.ch = channel; 6709 bdev_io->internal.desc = desc; 6710 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6711 bdev_io->u.nvme_passthru.cmd = *cmd; 6712 bdev_io->u.nvme_passthru.buf = buf; 6713 bdev_io->u.nvme_passthru.nbytes = nbytes; 6714 bdev_io->u.nvme_passthru.md_buf = NULL; 6715 bdev_io->u.nvme_passthru.md_len = 0; 6716 6717 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6718 6719 bdev_io_submit(bdev_io); 6720 return 0; 6721 } 6722 6723 int 6724 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6725 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6726 spdk_bdev_io_completion_cb cb, void *cb_arg) 6727 { 6728 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6729 struct spdk_bdev_io *bdev_io; 6730 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6731 6732 if (!desc->write) { 6733 /* 6734 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6735 * to easily determine if the command is a read or write, but for now just 6736 * do not allow io_passthru with a read-only descriptor. 6737 */ 6738 return -EBADF; 6739 } 6740 6741 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6742 return -ENOTSUP; 6743 } 6744 6745 bdev_io = bdev_channel_get_io(channel); 6746 if (!bdev_io) { 6747 return -ENOMEM; 6748 } 6749 6750 bdev_io->internal.ch = channel; 6751 bdev_io->internal.desc = desc; 6752 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6753 bdev_io->u.nvme_passthru.cmd = *cmd; 6754 bdev_io->u.nvme_passthru.buf = buf; 6755 bdev_io->u.nvme_passthru.nbytes = nbytes; 6756 bdev_io->u.nvme_passthru.md_buf = md_buf; 6757 bdev_io->u.nvme_passthru.md_len = md_len; 6758 6759 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6760 6761 bdev_io_submit(bdev_io); 6762 return 0; 6763 } 6764 6765 int 6766 spdk_bdev_nvme_iov_passthru_md(struct spdk_bdev_desc *desc, 6767 struct spdk_io_channel *ch, 6768 const struct spdk_nvme_cmd *cmd, 6769 struct iovec *iov, int iovcnt, size_t nbytes, 6770 void *md_buf, size_t md_len, 6771 spdk_bdev_io_completion_cb cb, void *cb_arg) 6772 { 6773 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6774 struct spdk_bdev_io *bdev_io; 6775 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6776 6777 if (!desc->write) { 6778 /* 6779 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6780 * to easily determine if the command is a read or write, but for now just 6781 * do not allow io_passthru with a read-only descriptor. 
6782 */ 6783 return -EBADF; 6784 } 6785 6786 if (md_buf && spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6787 return -ENOTSUP; 6788 } else if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6789 return -ENOTSUP; 6790 } 6791 6792 bdev_io = bdev_channel_get_io(channel); 6793 if (!bdev_io) { 6794 return -ENOMEM; 6795 } 6796 6797 bdev_io->internal.ch = channel; 6798 bdev_io->internal.desc = desc; 6799 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IOV_MD; 6800 bdev_io->u.nvme_passthru.cmd = *cmd; 6801 bdev_io->u.nvme_passthru.iovs = iov; 6802 bdev_io->u.nvme_passthru.iovcnt = iovcnt; 6803 bdev_io->u.nvme_passthru.nbytes = nbytes; 6804 bdev_io->u.nvme_passthru.md_buf = md_buf; 6805 bdev_io->u.nvme_passthru.md_len = md_len; 6806 6807 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6808 6809 bdev_io_submit(bdev_io); 6810 return 0; 6811 } 6812 6813 static void bdev_abort_retry(void *ctx); 6814 static void bdev_abort(struct spdk_bdev_io *parent_io); 6815 6816 static void 6817 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6818 { 6819 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6820 struct spdk_bdev_io *parent_io = cb_arg; 6821 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6822 6823 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6824 6825 spdk_bdev_free_io(bdev_io); 6826 6827 if (!success) { 6828 /* Check if the target I/O completed in the meantime. */ 6829 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6830 if (tmp_io == bio_to_abort) { 6831 break; 6832 } 6833 } 6834 6835 /* If the target I/O still exists, set the parent to failed. */ 6836 if (tmp_io != NULL) { 6837 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6838 } 6839 } 6840 6841 parent_io->u.bdev.split_outstanding--; 6842 if (parent_io->u.bdev.split_outstanding == 0) { 6843 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6844 bdev_abort_retry(parent_io); 6845 } else { 6846 bdev_io_complete(parent_io); 6847 } 6848 } 6849 } 6850 6851 static int 6852 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6853 struct spdk_bdev_io *bio_to_abort, 6854 spdk_bdev_io_completion_cb cb, void *cb_arg) 6855 { 6856 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6857 struct spdk_bdev_io *bdev_io; 6858 6859 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 6860 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 6861 /* TODO: Abort reset or abort request. */ 6862 return -ENOTSUP; 6863 } 6864 6865 bdev_io = bdev_channel_get_io(channel); 6866 if (bdev_io == NULL) { 6867 return -ENOMEM; 6868 } 6869 6870 bdev_io->internal.ch = channel; 6871 bdev_io->internal.desc = desc; 6872 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6873 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6874 6875 if (bdev->split_on_optimal_io_boundary && bio_to_abort->internal.split) { 6876 assert(bdev_io_should_split(bio_to_abort)); 6877 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 6878 6879 /* Parent abort request is not submitted directly, but to manage its 6880 * execution add it to the submitted list here. 6881 */ 6882 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6883 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6884 6885 bdev_abort(bdev_io); 6886 6887 return 0; 6888 } 6889 6890 bdev_io->u.abort.bio_to_abort = bio_to_abort; 6891 6892 /* Submit the abort request to the underlying bdev module. 
*/ 6893 bdev_io_submit(bdev_io); 6894 6895 return 0; 6896 } 6897 6898 static bool 6899 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq) 6900 { 6901 struct spdk_bdev_io *iter; 6902 6903 TAILQ_FOREACH(iter, tailq, internal.link) { 6904 if (iter == bdev_io) { 6905 return true; 6906 } 6907 } 6908 6909 return false; 6910 } 6911 6912 static uint32_t 6913 _bdev_abort(struct spdk_bdev_io *parent_io) 6914 { 6915 struct spdk_bdev_desc *desc = parent_io->internal.desc; 6916 struct spdk_bdev_channel *channel = parent_io->internal.ch; 6917 void *bio_cb_arg; 6918 struct spdk_bdev_io *bio_to_abort; 6919 uint32_t matched_ios; 6920 int rc; 6921 6922 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 6923 6924 /* matched_ios is returned and will be kept by the caller. 6925 * 6926 * This function is used for two cases: 1) the same cb_arg is used for 6927 * multiple I/Os, 2) a single large I/O is split into smaller ones. 6928 * Incrementing split_outstanding directly here may confuse readers, 6929 * especially for the 1st case. 6930 * 6931 * Completion of I/O abort is processed after stack unwinding. Hence this trick 6932 * works as expected. 6933 */ 6934 matched_ios = 0; 6935 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6936 6937 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 6938 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 6939 continue; 6940 } 6941 6942 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 6943 /* Any I/O which was submitted after this abort command should be excluded. */ 6944 continue; 6945 } 6946 6947 /* We can't abort a request that's being pushed/pulled or executed by accel */ 6948 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) || 6949 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) { 6950 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6951 break; 6952 } 6953 6954 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 6955 if (rc != 0) { 6956 if (rc == -ENOMEM) { 6957 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 6958 } else { 6959 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6960 } 6961 break; 6962 } 6963 matched_ios++; 6964 } 6965 6966 return matched_ios; 6967 } 6968 6969 static void 6970 bdev_abort_retry(void *ctx) 6971 { 6972 struct spdk_bdev_io *parent_io = ctx; 6973 uint32_t matched_ios; 6974 6975 matched_ios = _bdev_abort(parent_io); 6976 6977 if (matched_ios == 0) { 6978 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6979 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6980 } else { 6981 /* For a retry, the case where no target I/O was found is a success 6982 * because it means the target I/Os completed in the meantime. 6983 */ 6984 bdev_io_complete(parent_io); 6985 } 6986 return; 6987 } 6988 6989 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6990 parent_io->u.bdev.split_outstanding = matched_ios; 6991 } 6992 6993 static void 6994 bdev_abort(struct spdk_bdev_io *parent_io) 6995 { 6996 uint32_t matched_ios; 6997 6998 matched_ios = _bdev_abort(parent_io); 6999 7000 if (matched_ios == 0) { 7001 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7002 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 7003 } else { 7004 /* The case where no target I/O was found is a failure.
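 * _bdev_abort() finds no match when no I/O on the channel's io_submitted
 * list carries the given bio_cb_arg as its caller_ctx (or when every match
 * was submitted after the abort itself). In other words, for a call such as
 * (placeholder names):
 *
 *   spdk_bdev_abort(desc, ch, my_io_ctx, my_abort_done, NULL);
 *
 * my_io_ctx must be the exact cb_arg that was used when the target I/O was
 * submitted.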
*/ 7005 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7006 bdev_io_complete(parent_io); 7007 } 7008 return; 7009 } 7010 7011 /* Use split_outstanding to manage the progress of aborting I/Os. */ 7012 parent_io->u.bdev.split_outstanding = matched_ios; 7013 } 7014 7015 int 7016 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 7017 void *bio_cb_arg, 7018 spdk_bdev_io_completion_cb cb, void *cb_arg) 7019 { 7020 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7021 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7022 struct spdk_bdev_io *bdev_io; 7023 7024 if (bio_cb_arg == NULL) { 7025 return -EINVAL; 7026 } 7027 7028 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 7029 return -ENOTSUP; 7030 } 7031 7032 bdev_io = bdev_channel_get_io(channel); 7033 if (bdev_io == NULL) { 7034 return -ENOMEM; 7035 } 7036 7037 bdev_io->internal.ch = channel; 7038 bdev_io->internal.desc = desc; 7039 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7040 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7041 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7042 7043 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 7044 7045 /* Parent abort request is not submitted directly, but to manage its execution, 7046 * add it to the submitted list here. 7047 */ 7048 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 7049 7050 bdev_abort(bdev_io); 7051 7052 return 0; 7053 } 7054 7055 int 7056 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 7057 struct spdk_bdev_io_wait_entry *entry) 7058 { 7059 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7060 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 7061 7062 if (bdev != entry->bdev) { 7063 SPDK_ERRLOG("bdevs do not match\n"); 7064 return -EINVAL; 7065 } 7066 7067 if (mgmt_ch->per_thread_cache_count > 0) { 7068 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 7069 return -EINVAL; 7070 } 7071 7072 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 7073 return 0; 7074 } 7075 7076 static inline void 7077 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 7078 { 7079 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 7080 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 7081 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 7082 uint32_t blocklen = bdev_io->bdev->blocklen; 7083 7084 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7085 switch (bdev_io->type) { 7086 case SPDK_BDEV_IO_TYPE_READ: 7087 io_stat->bytes_read += num_blocks * blocklen; 7088 io_stat->num_read_ops++; 7089 io_stat->read_latency_ticks += tsc_diff; 7090 if (io_stat->max_read_latency_ticks < tsc_diff) { 7091 io_stat->max_read_latency_ticks = tsc_diff; 7092 } 7093 if (io_stat->min_read_latency_ticks > tsc_diff) { 7094 io_stat->min_read_latency_ticks = tsc_diff; 7095 } 7096 break; 7097 case SPDK_BDEV_IO_TYPE_WRITE: 7098 io_stat->bytes_written += num_blocks * blocklen; 7099 io_stat->num_write_ops++; 7100 io_stat->write_latency_ticks += tsc_diff; 7101 if (io_stat->max_write_latency_ticks < tsc_diff) { 7102 io_stat->max_write_latency_ticks = tsc_diff; 7103 } 7104 if (io_stat->min_write_latency_ticks > tsc_diff) { 7105 io_stat->min_write_latency_ticks = tsc_diff; 7106 } 7107 break; 7108 case SPDK_BDEV_IO_TYPE_UNMAP: 7109 io_stat->bytes_unmapped += num_blocks * blocklen; 7110 io_stat->num_unmap_ops++; 7111 io_stat->unmap_latency_ticks += tsc_diff; 7112 if 
(io_stat->max_unmap_latency_ticks < tsc_diff) { 7113 io_stat->max_unmap_latency_ticks = tsc_diff; 7114 } 7115 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 7116 io_stat->min_unmap_latency_ticks = tsc_diff; 7117 } 7118 break; 7119 case SPDK_BDEV_IO_TYPE_ZCOPY: 7120 /* Track the data in the start phase only */ 7121 if (bdev_io->u.bdev.zcopy.start) { 7122 if (bdev_io->u.bdev.zcopy.populate) { 7123 io_stat->bytes_read += num_blocks * blocklen; 7124 io_stat->num_read_ops++; 7125 io_stat->read_latency_ticks += tsc_diff; 7126 if (io_stat->max_read_latency_ticks < tsc_diff) { 7127 io_stat->max_read_latency_ticks = tsc_diff; 7128 } 7129 if (io_stat->min_read_latency_ticks > tsc_diff) { 7130 io_stat->min_read_latency_ticks = tsc_diff; 7131 } 7132 } else { 7133 io_stat->bytes_written += num_blocks * blocklen; 7134 io_stat->num_write_ops++; 7135 io_stat->write_latency_ticks += tsc_diff; 7136 if (io_stat->max_write_latency_ticks < tsc_diff) { 7137 io_stat->max_write_latency_ticks = tsc_diff; 7138 } 7139 if (io_stat->min_write_latency_ticks > tsc_diff) { 7140 io_stat->min_write_latency_ticks = tsc_diff; 7141 } 7142 } 7143 } 7144 break; 7145 case SPDK_BDEV_IO_TYPE_COPY: 7146 io_stat->bytes_copied += num_blocks * blocklen; 7147 io_stat->num_copy_ops++; 7148 bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; 7149 if (io_stat->max_copy_latency_ticks < tsc_diff) { 7150 io_stat->max_copy_latency_ticks = tsc_diff; 7151 } 7152 if (io_stat->min_copy_latency_ticks > tsc_diff) { 7153 io_stat->min_copy_latency_ticks = tsc_diff; 7154 } 7155 break; 7156 default: 7157 break; 7158 } 7159 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 7160 io_stat = bdev_io->bdev->internal.stat; 7161 assert(io_stat->io_error != NULL); 7162 7163 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 7164 io_stat->io_error->error_status[-io_status - 1]++; 7165 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 7166 } 7167 7168 #ifdef SPDK_CONFIG_VTUNE 7169 uint64_t now_tsc = spdk_get_ticks(); 7170 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 7171 uint64_t data[5]; 7172 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 7173 7174 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 7175 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 7176 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 7177 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 7178 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 
7179 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 7180 7181 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 7182 __itt_metadata_u64, 5, data); 7183 7184 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 7185 bdev_io->internal.ch->start_tsc = now_tsc; 7186 } 7187 #endif 7188 } 7189 7190 static inline void 7191 _bdev_io_complete(void *ctx) 7192 { 7193 struct spdk_bdev_io *bdev_io = ctx; 7194 7195 if (spdk_unlikely(bdev_io->internal.accel_sequence != NULL)) { 7196 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7197 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 7198 } 7199 7200 assert(bdev_io->internal.cb != NULL); 7201 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 7202 7203 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 7204 bdev_io->internal.caller_ctx); 7205 } 7206 7207 static inline void 7208 bdev_io_complete(void *ctx) 7209 { 7210 struct spdk_bdev_io *bdev_io = ctx; 7211 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7212 uint64_t tsc, tsc_diff; 7213 7214 if (spdk_unlikely(bdev_io->internal.in_submit_request)) { 7215 /* 7216 * Defer completion to avoid potential infinite recursion if the 7217 * user's completion callback issues a new I/O. 7218 */ 7219 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7220 bdev_io_complete, bdev_io); 7221 return; 7222 } 7223 7224 tsc = spdk_get_ticks(); 7225 tsc_diff = tsc - bdev_io->internal.submit_tsc; 7226 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 7227 bdev_io->internal.caller_ctx); 7228 7229 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 7230 7231 if (bdev_io->internal.ch->histogram) { 7232 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 7233 } 7234 7235 bdev_io_update_io_stat(bdev_io, tsc_diff); 7236 _bdev_io_complete(bdev_io); 7237 } 7238 7239 /* The difference between this function and bdev_io_complete() is that this should be called to 7240 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 7241 * io_submitted list and don't have submit_tsc updated. 7242 */ 7243 static inline void 7244 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 7245 { 7246 /* Since the IO hasn't been submitted it's bound to be failed */ 7247 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7248 7249 /* At this point we don't know if the IO is completed from submission context or not, but, 7250 * since this is an error path, we can always do an spdk_thread_send_msg(). 
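 * Sending the message also keeps the completion on the thread that owns the
 * I/O, which _bdev_io_complete() asserts.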
*/ 7251 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7252 _bdev_io_complete, bdev_io); 7253 } 7254 7255 static void bdev_destroy_cb(void *io_device); 7256 7257 static void 7258 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7259 { 7260 struct spdk_bdev_io *bdev_io = _ctx; 7261 7262 if (bdev_io->u.reset.ch_ref != NULL) { 7263 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7264 bdev_io->u.reset.ch_ref = NULL; 7265 } 7266 7267 bdev_io_complete(bdev_io); 7268 7269 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7270 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7271 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7272 } 7273 } 7274 7275 static void 7276 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7277 struct spdk_io_channel *_ch, void *_ctx) 7278 { 7279 struct spdk_bdev_io *bdev_io = _ctx; 7280 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7281 struct spdk_bdev_io *queued_reset; 7282 7283 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7284 while (!TAILQ_EMPTY(&ch->queued_resets)) { 7285 queued_reset = TAILQ_FIRST(&ch->queued_resets); 7286 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 7287 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 7288 } 7289 7290 spdk_bdev_for_each_channel_continue(i, 0); 7291 } 7292 7293 static void 7294 bdev_io_complete_sequence_cb(void *ctx, int status) 7295 { 7296 struct spdk_bdev_io *bdev_io = ctx; 7297 7298 /* u.bdev.accel_sequence should have already been cleared at this point */ 7299 assert(bdev_io->u.bdev.accel_sequence == NULL); 7300 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7301 bdev_io->internal.accel_sequence = NULL; 7302 7303 if (spdk_unlikely(status != 0)) { 7304 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7305 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7306 } 7307 7308 bdev_io_complete(bdev_io); 7309 } 7310 7311 void 7312 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7313 { 7314 struct spdk_bdev *bdev = bdev_io->bdev; 7315 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7316 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7317 7318 if (spdk_unlikely(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING)) { 7319 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7320 spdk_bdev_get_module_name(bdev), 7321 bdev_io_status_get_string(bdev_io->internal.status)); 7322 assert(false); 7323 } 7324 bdev_io->internal.status = status; 7325 7326 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7327 bool unlock_channels = false; 7328 7329 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 7330 SPDK_ERRLOG("NOMEM returned for reset\n"); 7331 } 7332 spdk_spin_lock(&bdev->internal.spinlock); 7333 if (bdev_io == bdev->internal.reset_in_progress) { 7334 bdev->internal.reset_in_progress = NULL; 7335 unlock_channels = true; 7336 } 7337 spdk_spin_unlock(&bdev->internal.spinlock); 7338 7339 if (unlock_channels) { 7340 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 7341 bdev_reset_complete); 7342 return; 7343 } 7344 } else { 7345 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7346 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7347 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7348 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7349 return; 7350 } else if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0 && 
7351 !bdev_io_use_accel_sequence(bdev_io))) { 7352 _bdev_io_push_bounce_data_buffer(bdev_io, 7353 _bdev_io_complete_push_bounce_done); 7354 /* bdev IO will be completed in the callback */ 7355 return; 7356 } 7357 } 7358 7359 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7360 return; 7361 } 7362 } 7363 7364 bdev_io_complete(bdev_io); 7365 } 7366 7367 void 7368 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 7369 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7370 { 7371 enum spdk_bdev_io_status status; 7372 7373 if (sc == SPDK_SCSI_STATUS_GOOD) { 7374 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7375 } else { 7376 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7377 bdev_io->internal.error.scsi.sc = sc; 7378 bdev_io->internal.error.scsi.sk = sk; 7379 bdev_io->internal.error.scsi.asc = asc; 7380 bdev_io->internal.error.scsi.ascq = ascq; 7381 } 7382 7383 spdk_bdev_io_complete(bdev_io, status); 7384 } 7385 7386 void 7387 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7388 int *sc, int *sk, int *asc, int *ascq) 7389 { 7390 assert(sc != NULL); 7391 assert(sk != NULL); 7392 assert(asc != NULL); 7393 assert(ascq != NULL); 7394 7395 switch (bdev_io->internal.status) { 7396 case SPDK_BDEV_IO_STATUS_SUCCESS: 7397 *sc = SPDK_SCSI_STATUS_GOOD; 7398 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7399 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7400 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7401 break; 7402 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7403 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7404 break; 7405 case SPDK_BDEV_IO_STATUS_MISCOMPARE: 7406 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7407 *sk = SPDK_SCSI_SENSE_MISCOMPARE; 7408 *asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION; 7409 *ascq = bdev_io->internal.error.scsi.ascq; 7410 break; 7411 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7412 *sc = bdev_io->internal.error.scsi.sc; 7413 *sk = bdev_io->internal.error.scsi.sk; 7414 *asc = bdev_io->internal.error.scsi.asc; 7415 *ascq = bdev_io->internal.error.scsi.ascq; 7416 break; 7417 default: 7418 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7419 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7420 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7421 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7422 break; 7423 } 7424 } 7425 7426 void 7427 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7428 { 7429 enum spdk_bdev_io_status status; 7430 7431 if (aio_result == 0) { 7432 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7433 } else { 7434 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7435 } 7436 7437 bdev_io->internal.error.aio_result = aio_result; 7438 7439 spdk_bdev_io_complete(bdev_io, status); 7440 } 7441 7442 void 7443 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7444 { 7445 assert(aio_result != NULL); 7446 7447 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7448 *aio_result = bdev_io->internal.error.aio_result; 7449 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7450 *aio_result = 0; 7451 } else { 7452 *aio_result = -EIO; 7453 } 7454 } 7455 7456 void 7457 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7458 { 7459 enum spdk_bdev_io_status status; 7460 7461 if (spdk_likely(sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS)) { 7462 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7463 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7464 status = SPDK_BDEV_IO_STATUS_ABORTED; 
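		/* SPDK_NVME_SC_ABORTED_BY_REQUEST is reported as SPDK_BDEV_IO_STATUS_ABORTED;
		 * any other non-success (sct, sc) pair is mapped to SPDK_BDEV_IO_STATUS_NVME_ERROR below. */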
7465 } else { 7466 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7467 } 7468 7469 bdev_io->internal.error.nvme.cdw0 = cdw0; 7470 bdev_io->internal.error.nvme.sct = sct; 7471 bdev_io->internal.error.nvme.sc = sc; 7472 7473 spdk_bdev_io_complete(bdev_io, status); 7474 } 7475 7476 void 7477 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7478 { 7479 assert(sct != NULL); 7480 assert(sc != NULL); 7481 assert(cdw0 != NULL); 7482 7483 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7484 *sct = SPDK_NVME_SCT_GENERIC; 7485 *sc = SPDK_NVME_SC_SUCCESS; 7486 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7487 *cdw0 = 0; 7488 } else { 7489 *cdw0 = 1U; 7490 } 7491 return; 7492 } 7493 7494 if (spdk_likely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7495 *sct = SPDK_NVME_SCT_GENERIC; 7496 *sc = SPDK_NVME_SC_SUCCESS; 7497 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7498 *sct = bdev_io->internal.error.nvme.sct; 7499 *sc = bdev_io->internal.error.nvme.sc; 7500 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7501 *sct = SPDK_NVME_SCT_GENERIC; 7502 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7503 } else { 7504 *sct = SPDK_NVME_SCT_GENERIC; 7505 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7506 } 7507 7508 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7509 } 7510 7511 void 7512 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7513 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7514 { 7515 assert(first_sct != NULL); 7516 assert(first_sc != NULL); 7517 assert(second_sct != NULL); 7518 assert(second_sc != NULL); 7519 assert(cdw0 != NULL); 7520 7521 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7522 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7523 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7524 *first_sct = bdev_io->internal.error.nvme.sct; 7525 *first_sc = bdev_io->internal.error.nvme.sc; 7526 *second_sct = SPDK_NVME_SCT_GENERIC; 7527 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7528 } else { 7529 *first_sct = SPDK_NVME_SCT_GENERIC; 7530 *first_sc = SPDK_NVME_SC_SUCCESS; 7531 *second_sct = bdev_io->internal.error.nvme.sct; 7532 *second_sc = bdev_io->internal.error.nvme.sc; 7533 } 7534 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7535 *first_sct = SPDK_NVME_SCT_GENERIC; 7536 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7537 *second_sct = SPDK_NVME_SCT_GENERIC; 7538 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7539 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7540 *first_sct = SPDK_NVME_SCT_GENERIC; 7541 *first_sc = SPDK_NVME_SC_SUCCESS; 7542 *second_sct = SPDK_NVME_SCT_GENERIC; 7543 *second_sc = SPDK_NVME_SC_SUCCESS; 7544 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7545 *first_sct = SPDK_NVME_SCT_GENERIC; 7546 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7547 *second_sct = SPDK_NVME_SCT_GENERIC; 7548 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7549 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7550 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7551 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 7552 *second_sct = SPDK_NVME_SCT_GENERIC; 7553 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7554 } else { 7555 *first_sct = SPDK_NVME_SCT_GENERIC; 7556 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7557 *second_sct = SPDK_NVME_SCT_GENERIC; 7558 *second_sc 
= SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7559 } 7560 7561 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7562 } 7563 7564 void 7565 spdk_bdev_io_complete_base_io_status(struct spdk_bdev_io *bdev_io, 7566 const struct spdk_bdev_io *base_io) 7567 { 7568 switch (base_io->internal.status) { 7569 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7570 spdk_bdev_io_complete_nvme_status(bdev_io, 7571 base_io->internal.error.nvme.cdw0, 7572 base_io->internal.error.nvme.sct, 7573 base_io->internal.error.nvme.sc); 7574 break; 7575 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7576 spdk_bdev_io_complete_scsi_status(bdev_io, 7577 base_io->internal.error.scsi.sc, 7578 base_io->internal.error.scsi.sk, 7579 base_io->internal.error.scsi.asc, 7580 base_io->internal.error.scsi.ascq); 7581 break; 7582 case SPDK_BDEV_IO_STATUS_AIO_ERROR: 7583 spdk_bdev_io_complete_aio_status(bdev_io, base_io->internal.error.aio_result); 7584 break; 7585 default: 7586 spdk_bdev_io_complete(bdev_io, base_io->internal.status); 7587 break; 7588 } 7589 } 7590 7591 struct spdk_thread * 7592 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7593 { 7594 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7595 } 7596 7597 struct spdk_io_channel * 7598 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7599 { 7600 return bdev_io->internal.ch->channel; 7601 } 7602 7603 static int 7604 bdev_register(struct spdk_bdev *bdev) 7605 { 7606 char *bdev_name; 7607 char uuid[SPDK_UUID_STRING_LEN]; 7608 struct spdk_iobuf_opts iobuf_opts; 7609 int ret; 7610 7611 assert(bdev->module != NULL); 7612 7613 if (!bdev->name) { 7614 SPDK_ERRLOG("Bdev name is NULL\n"); 7615 return -EINVAL; 7616 } 7617 7618 if (!strlen(bdev->name)) { 7619 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7620 return -EINVAL; 7621 } 7622 7623 /* Users often register their own I/O devices using the bdev name. In 7624 * order to avoid conflicts, prepend bdev_. */ 7625 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7626 if (!bdev_name) { 7627 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7628 return -ENOMEM; 7629 } 7630 7631 bdev->internal.stat = bdev_alloc_io_stat(true); 7632 if (!bdev->internal.stat) { 7633 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7634 free(bdev_name); 7635 return -ENOMEM; 7636 } 7637 7638 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7639 bdev->internal.measured_queue_depth = UINT64_MAX; 7640 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7641 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7642 bdev->internal.qd_poller = NULL; 7643 bdev->internal.qos = NULL; 7644 7645 TAILQ_INIT(&bdev->internal.open_descs); 7646 TAILQ_INIT(&bdev->internal.locked_ranges); 7647 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7648 TAILQ_INIT(&bdev->aliases); 7649 7650 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 7651 if (ret != 0) { 7652 bdev_free_io_stat(bdev->internal.stat); 7653 free(bdev_name); 7654 return ret; 7655 } 7656 7657 /* UUID may be specified by the user or defined by bdev itself. 7658 * Otherwise it will be generated here, so this field will never be empty. 
*/ 7659 if (spdk_uuid_is_null(&bdev->uuid)) { 7660 spdk_uuid_generate(&bdev->uuid); 7661 } 7662 7663 /* Add the UUID alias only if it's different than the name */ 7664 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7665 if (strcmp(bdev->name, uuid) != 0) { 7666 ret = spdk_bdev_alias_add(bdev, uuid); 7667 if (ret != 0) { 7668 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7669 bdev_name_del(&bdev->internal.bdev_name); 7670 bdev_free_io_stat(bdev->internal.stat); 7671 free(bdev_name); 7672 return ret; 7673 } 7674 } 7675 7676 spdk_iobuf_get_opts(&iobuf_opts); 7677 if (spdk_bdev_get_buf_align(bdev) > 1) { 7678 bdev->max_rw_size = spdk_min(bdev->max_rw_size ? bdev->max_rw_size : UINT32_MAX, 7679 iobuf_opts.large_bufsize / bdev->blocklen); 7680 } 7681 7682 /* If the user didn't specify a write unit size, set it to one. */ 7683 if (bdev->write_unit_size == 0) { 7684 bdev->write_unit_size = 1; 7685 } 7686 7687 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7688 if (bdev->acwu == 0) { 7689 bdev->acwu = bdev->write_unit_size; 7690 } 7691 7692 if (bdev->phys_blocklen == 0) { 7693 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7694 } 7695 7696 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 7697 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 7698 } 7699 7700 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 7701 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 7702 } 7703 7704 bdev->internal.reset_in_progress = NULL; 7705 bdev->internal.qd_poll_in_progress = false; 7706 bdev->internal.period = 0; 7707 bdev->internal.new_period = 0; 7708 7709 spdk_io_device_register(__bdev_to_io_dev(bdev), 7710 bdev_channel_create, bdev_channel_destroy, 7711 sizeof(struct spdk_bdev_channel), 7712 bdev_name); 7713 7714 free(bdev_name); 7715 7716 spdk_spin_init(&bdev->internal.spinlock); 7717 7718 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 7719 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 7720 7721 return 0; 7722 } 7723 7724 static void 7725 bdev_destroy_cb(void *io_device) 7726 { 7727 int rc; 7728 struct spdk_bdev *bdev; 7729 spdk_bdev_unregister_cb cb_fn; 7730 void *cb_arg; 7731 7732 bdev = __bdev_from_io_dev(io_device); 7733 7734 if (bdev->internal.unregister_td != spdk_get_thread()) { 7735 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 7736 return; 7737 } 7738 7739 cb_fn = bdev->internal.unregister_cb; 7740 cb_arg = bdev->internal.unregister_ctx; 7741 7742 spdk_spin_destroy(&bdev->internal.spinlock); 7743 free(bdev->internal.qos); 7744 bdev_free_io_stat(bdev->internal.stat); 7745 7746 rc = bdev->fn_table->destruct(bdev->ctxt); 7747 if (rc < 0) { 7748 SPDK_ERRLOG("destruct failed\n"); 7749 } 7750 if (rc <= 0 && cb_fn != NULL) { 7751 cb_fn(cb_arg, rc); 7752 } 7753 } 7754 7755 void 7756 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 7757 { 7758 if (bdev->internal.unregister_cb != NULL) { 7759 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 7760 } 7761 } 7762 7763 static void 7764 _remove_notify(void *arg) 7765 { 7766 struct spdk_bdev_desc *desc = arg; 7767 7768 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 7769 } 7770 7771 /* returns: 0 - bdev removed and ready to be destructed. 7772 * -EBUSY - bdev can't be destructed yet. 
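 * Caller must hold both g_bdev_mgr.spinlock and bdev->internal.spinlock.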
*/
7773 static int
7774 bdev_unregister_unsafe(struct spdk_bdev *bdev)
7775 {
7776 struct spdk_bdev_desc *desc, *tmp;
7777 int rc = 0;
7778 char uuid[SPDK_UUID_STRING_LEN];
7779
7780 assert(spdk_spin_held(&g_bdev_mgr.spinlock));
7781 assert(spdk_spin_held(&bdev->internal.spinlock));
7782
7783 /* Notify each descriptor about hot removal */
7784 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) {
7785 rc = -EBUSY;
7786 /*
7787 * Defer invocation of the event_cb to a separate message that will
7788 * run later on its thread. This ensures this context unwinds and
7789 * we don't recursively unregister this bdev again if the event_cb
7790 * immediately closes its descriptor.
7791 */
7792 event_notify(desc, _remove_notify);
7793 }
7794
7795 /* If there are no descriptors, proceed with removing the bdev */
7796 if (rc == 0) {
7797 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
7798 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name);
7799
7800 /* Delete the name and the UUID alias */
7801 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid);
7802 bdev_name_del_unsafe(&bdev->internal.bdev_name);
7803 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe);
7804
7805 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev));
7806
7807 if (bdev->internal.reset_in_progress != NULL) {
7808 /* If a reset is in progress, let the reset's completion callback
7809 * unregister the bdev.
7810 */
7811 rc = -EBUSY;
7812 }
7813 }
7814
7815 return rc;
7816 }
7817
7818 static void
7819 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
7820 struct spdk_io_channel *io_ch, void *_ctx)
7821 {
7822 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
7823
7824 bdev_channel_abort_queued_ios(bdev_ch);
7825 spdk_bdev_for_each_channel_continue(i, 0);
7826 }
7827
7828 static void
7829 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status)
7830 {
7831 int rc;
7832
7833 spdk_spin_lock(&g_bdev_mgr.spinlock);
7834 spdk_spin_lock(&bdev->internal.spinlock);
7835 /*
7836 * Set the status to REMOVING only after aborting the channels has completed.
7837 * Otherwise, the last spdk_bdev_close() may call spdk_io_device_unregister()
7838 * while spdk_bdev_for_each_channel() is still executing, and
7839 * spdk_io_device_unregister() may fail.
7840 */
7841 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING;
7842 rc = bdev_unregister_unsafe(bdev);
7843 spdk_spin_unlock(&bdev->internal.spinlock);
7844 spdk_spin_unlock(&g_bdev_mgr.spinlock);
7845
7846 if (rc == 0) {
7847 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb);
7848 }
7849 }
7850
7851 void
7852 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg)
7853 {
7854 struct spdk_thread *thread;
7855
7856 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name);
7857
7858 thread = spdk_get_thread();
7859 if (!thread) {
7860 /* The user called this from a non-SPDK thread.
*/ 7861 if (cb_fn != NULL) { 7862 cb_fn(cb_arg, -ENOTSUP); 7863 } 7864 return; 7865 } 7866 7867 spdk_spin_lock(&g_bdev_mgr.spinlock); 7868 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7869 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7870 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7871 if (cb_fn) { 7872 cb_fn(cb_arg, -EBUSY); 7873 } 7874 return; 7875 } 7876 7877 spdk_spin_lock(&bdev->internal.spinlock); 7878 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 7879 bdev->internal.unregister_cb = cb_fn; 7880 bdev->internal.unregister_ctx = cb_arg; 7881 bdev->internal.unregister_td = thread; 7882 spdk_spin_unlock(&bdev->internal.spinlock); 7883 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7884 7885 spdk_bdev_set_qd_sampling_period(bdev, 0); 7886 7887 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 7888 bdev_unregister); 7889 } 7890 7891 int 7892 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 7893 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7894 { 7895 struct spdk_bdev_desc *desc; 7896 struct spdk_bdev *bdev; 7897 int rc; 7898 7899 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 7900 if (rc != 0) { 7901 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 7902 return rc; 7903 } 7904 7905 bdev = spdk_bdev_desc_get_bdev(desc); 7906 7907 if (bdev->module != module) { 7908 spdk_bdev_close(desc); 7909 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 7910 bdev_name); 7911 return -ENODEV; 7912 } 7913 7914 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 7915 7916 spdk_bdev_close(desc); 7917 7918 return 0; 7919 } 7920 7921 static int 7922 bdev_start_qos(struct spdk_bdev *bdev) 7923 { 7924 struct set_qos_limit_ctx *ctx; 7925 7926 /* Enable QoS */ 7927 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 7928 ctx = calloc(1, sizeof(*ctx)); 7929 if (ctx == NULL) { 7930 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 7931 return -ENOMEM; 7932 } 7933 ctx->bdev = bdev; 7934 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 7935 } 7936 7937 return 0; 7938 } 7939 7940 static void 7941 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 7942 struct spdk_bdev *bdev) 7943 { 7944 enum spdk_bdev_claim_type type; 7945 const char *typename, *modname; 7946 extern struct spdk_log_flag SPDK_LOG_bdev; 7947 7948 assert(spdk_spin_held(&bdev->internal.spinlock)); 7949 7950 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 7951 return; 7952 } 7953 7954 type = bdev->internal.claim_type; 7955 typename = spdk_bdev_claim_get_name(type); 7956 7957 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 7958 modname = bdev->internal.claim.v1.module->name; 7959 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7960 bdev->name, detail, typename, modname); 7961 return; 7962 } 7963 7964 if (claim_type_is_v2(type)) { 7965 struct spdk_bdev_module_claim *claim; 7966 7967 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 7968 modname = claim->module->name; 7969 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7970 bdev->name, detail, typename, modname); 7971 } 7972 return; 7973 } 7974 7975 assert(false); 7976 } 7977 7978 static int 7979 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 7980 { 7981 struct spdk_thread *thread; 7982 int rc = 0; 7983 7984 thread = spdk_get_thread(); 7985 if (!thread) { 7986 
SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 7987 return -ENOTSUP; 7988 } 7989 7990 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7991 spdk_get_thread()); 7992 7993 desc->bdev = bdev; 7994 desc->thread = thread; 7995 desc->write = write; 7996 7997 spdk_spin_lock(&bdev->internal.spinlock); 7998 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7999 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8000 spdk_spin_unlock(&bdev->internal.spinlock); 8001 return -ENODEV; 8002 } 8003 8004 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8005 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8006 spdk_spin_unlock(&bdev->internal.spinlock); 8007 return -EPERM; 8008 } 8009 8010 rc = bdev_start_qos(bdev); 8011 if (rc != 0) { 8012 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 8013 spdk_spin_unlock(&bdev->internal.spinlock); 8014 return rc; 8015 } 8016 8017 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 8018 8019 spdk_spin_unlock(&bdev->internal.spinlock); 8020 8021 return 0; 8022 } 8023 8024 static int 8025 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 8026 struct spdk_bdev_desc **_desc) 8027 { 8028 struct spdk_bdev_desc *desc; 8029 unsigned int i; 8030 8031 desc = calloc(1, sizeof(*desc)); 8032 if (desc == NULL) { 8033 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 8034 return -ENOMEM; 8035 } 8036 8037 TAILQ_INIT(&desc->pending_media_events); 8038 TAILQ_INIT(&desc->free_media_events); 8039 8040 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 8041 desc->callback.event_fn = event_cb; 8042 desc->callback.ctx = event_ctx; 8043 spdk_spin_init(&desc->spinlock); 8044 8045 if (bdev->media_events) { 8046 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 8047 sizeof(*desc->media_events_buffer)); 8048 if (desc->media_events_buffer == NULL) { 8049 SPDK_ERRLOG("Failed to initialize media event pool\n"); 8050 bdev_desc_free(desc); 8051 return -ENOMEM; 8052 } 8053 8054 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 8055 TAILQ_INSERT_TAIL(&desc->free_media_events, 8056 &desc->media_events_buffer[i], tailq); 8057 } 8058 } 8059 8060 if (bdev->fn_table->accel_sequence_supported != NULL) { 8061 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 8062 desc->accel_sequence_supported[i] = 8063 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 8064 (enum spdk_bdev_io_type)i); 8065 } 8066 } 8067 8068 *_desc = desc; 8069 8070 return 0; 8071 } 8072 8073 static int 8074 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8075 void *event_ctx, struct spdk_bdev_desc **_desc) 8076 { 8077 struct spdk_bdev_desc *desc; 8078 struct spdk_bdev *bdev; 8079 int rc; 8080 8081 bdev = bdev_get_by_name(bdev_name); 8082 8083 if (bdev == NULL) { 8084 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 8085 return -ENODEV; 8086 } 8087 8088 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 8089 if (rc != 0) { 8090 return rc; 8091 } 8092 8093 rc = bdev_open(bdev, write, desc); 8094 if (rc != 0) { 8095 bdev_desc_free(desc); 8096 desc = NULL; 8097 } 8098 8099 *_desc = desc; 8100 8101 return rc; 8102 } 8103 8104 int 8105 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8106 void *event_ctx, struct spdk_bdev_desc **_desc) 8107 { 8108 int rc; 8109 8110 if (event_cb == NULL) { 8111 SPDK_ERRLOG("Missing event callback function\n"); 8112 return 
-EINVAL; 8113 } 8114 8115 spdk_spin_lock(&g_bdev_mgr.spinlock); 8116 rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, _desc); 8117 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8118 8119 return rc; 8120 } 8121 8122 struct spdk_bdev_open_async_ctx { 8123 char *bdev_name; 8124 spdk_bdev_event_cb_t event_cb; 8125 void *event_ctx; 8126 bool write; 8127 int rc; 8128 spdk_bdev_open_async_cb_t cb_fn; 8129 void *cb_arg; 8130 struct spdk_bdev_desc *desc; 8131 struct spdk_bdev_open_async_opts opts; 8132 uint64_t start_ticks; 8133 struct spdk_thread *orig_thread; 8134 struct spdk_poller *poller; 8135 TAILQ_ENTRY(spdk_bdev_open_async_ctx) tailq; 8136 }; 8137 8138 static void 8139 bdev_open_async_done(void *arg) 8140 { 8141 struct spdk_bdev_open_async_ctx *ctx = arg; 8142 8143 ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg); 8144 8145 free(ctx->bdev_name); 8146 free(ctx); 8147 } 8148 8149 static void 8150 bdev_open_async_cancel(void *arg) 8151 { 8152 struct spdk_bdev_open_async_ctx *ctx = arg; 8153 8154 assert(ctx->rc == -ESHUTDOWN); 8155 8156 spdk_poller_unregister(&ctx->poller); 8157 8158 bdev_open_async_done(ctx); 8159 } 8160 8161 /* This is called when the bdev library finishes at shutdown. */ 8162 static void 8163 bdev_open_async_fini(void) 8164 { 8165 struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx; 8166 8167 spdk_spin_lock(&g_bdev_mgr.spinlock); 8168 TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) { 8169 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8170 /* 8171 * We have to move to ctx->orig_thread to unregister ctx->poller. 8172 * However, there is a chance that ctx->poller is executed before 8173 * message is executed, which could result in bdev_open_async_done() 8174 * being called twice. To avoid such race condition, set ctx->rc to 8175 * -ESHUTDOWN. 8176 */ 8177 ctx->rc = -ESHUTDOWN; 8178 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx); 8179 } 8180 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8181 } 8182 8183 static int bdev_open_async(void *arg); 8184 8185 static void 8186 _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx) 8187 { 8188 uint64_t timeout_ticks; 8189 8190 if (ctx->rc == -ESHUTDOWN) { 8191 /* This context is being canceled. Do nothing. */ 8192 return; 8193 } 8194 8195 ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx, 8196 &ctx->desc); 8197 if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) { 8198 goto exit; 8199 } 8200 8201 timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull; 8202 if (spdk_get_ticks() >= timeout_ticks) { 8203 SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name); 8204 ctx->rc = -ETIMEDOUT; 8205 goto exit; 8206 } 8207 8208 return; 8209 8210 exit: 8211 spdk_poller_unregister(&ctx->poller); 8212 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8213 8214 /* Completion callback is processed after stack unwinding. 
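 * The callback runs on the thread that called spdk_bdev_open_async(), via
 * spdk_thread_send_msg().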
*/
8215 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx);
8216 }
8217
8218 static int
8219 bdev_open_async(void *arg)
8220 {
8221 struct spdk_bdev_open_async_ctx *ctx = arg;
8222
8223 spdk_spin_lock(&g_bdev_mgr.spinlock);
8224
8225 _bdev_open_async(ctx);
8226
8227 spdk_spin_unlock(&g_bdev_mgr.spinlock);
8228
8229 return SPDK_POLLER_BUSY;
8230 }
8231
8232 static void
8233 bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts,
8234 struct spdk_bdev_open_async_opts *opts_src,
8235 size_t size)
8236 {
8237 assert(opts);
8238 assert(opts_src);
8239
8240 opts->size = size;
8241
8242 #define SET_FIELD(field) \
8243 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \
8244 opts->field = opts_src->field; \
8245 } \
8246
8247 SET_FIELD(timeout_ms);
8248
8249 /* Do not remove this statement. Update it whenever you add a new field,
8250 * and remember to add a matching SET_FIELD statement for the new field. */
8251 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size");
8252
8253 #undef SET_FIELD
8254 }
8255
8256 static void
8257 bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size)
8258 {
8259 assert(opts);
8260
8261 opts->size = size;
8262
8263 #define SET_FIELD(field, value) \
8264 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \
8265 opts->field = value; \
8266 } \
8267
8268 SET_FIELD(timeout_ms, 0);
8269
8270 #undef SET_FIELD
8271 }
8272
8273 int
8274 spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb,
8275 void *event_ctx, struct spdk_bdev_open_async_opts *opts,
8276 spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg)
8277 {
8278 struct spdk_bdev_open_async_ctx *ctx;
8279
8280 if (event_cb == NULL) {
8281 SPDK_ERRLOG("Missing event callback function\n");
8282 return -EINVAL;
8283 }
8284
8285 if (open_cb == NULL) {
8286 SPDK_ERRLOG("Missing open callback function\n");
8287 return -EINVAL;
8288 }
8289
8290 if (opts != NULL && opts->size == 0) {
8291 SPDK_ERRLOG("size in the options structure should not be zero\n");
8292 return -EINVAL;
8293 }
8294
8295 ctx = calloc(1, sizeof(*ctx));
8296 if (ctx == NULL) {
8297 SPDK_ERRLOG("Failed to allocate open context\n");
8298 return -ENOMEM;
8299 }
8300
8301 ctx->bdev_name = strdup(bdev_name);
8302 if (ctx->bdev_name == NULL) {
8303 SPDK_ERRLOG("Failed to duplicate bdev_name\n");
8304 free(ctx);
8305 return -ENOMEM;
8306 }
8307
8308 ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000);
8309 if (ctx->poller == NULL) {
8310 SPDK_ERRLOG("Failed to register bdev_open_async poller\n");
8311 free(ctx->bdev_name);
8312 free(ctx);
8313 return -ENOMEM;
8314 }
8315
8316 ctx->cb_fn = open_cb;
8317 ctx->cb_arg = open_cb_arg;
8318 ctx->write = write;
8319 ctx->event_cb = event_cb;
8320 ctx->event_ctx = event_ctx;
8321 ctx->orig_thread = spdk_get_thread();
8322 ctx->start_ticks = spdk_get_ticks();
8323
8324 bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts));
8325 if (opts != NULL) {
8326 bdev_open_async_opts_copy(&ctx->opts, opts, opts->size);
8327 }
8328
8329 spdk_spin_lock(&g_bdev_mgr.spinlock);
8330
8331 TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq);
8332 _bdev_open_async(ctx);
8333
8334 spdk_spin_unlock(&g_bdev_mgr.spinlock);
8335
8336 return 0;
8337 }
8338
8339 static void
8340 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc)
8341 {
8342 int rc;
8343
8344
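	/* Detach the descriptor from the bdev, release any v2 claim it holds, and
	 * tear down QoS or the bdev itself if this was the last open descriptor. */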
spdk_spin_lock(&bdev->internal.spinlock); 8345 spdk_spin_lock(&desc->spinlock); 8346 8347 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 8348 8349 desc->closed = true; 8350 8351 if (desc->claim != NULL) { 8352 bdev_desc_release_claims(desc); 8353 } 8354 8355 if (0 == desc->refs) { 8356 spdk_spin_unlock(&desc->spinlock); 8357 bdev_desc_free(desc); 8358 } else { 8359 spdk_spin_unlock(&desc->spinlock); 8360 } 8361 8362 /* If no more descriptors, kill QoS channel */ 8363 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8364 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 8365 bdev->name, spdk_get_thread()); 8366 8367 if (bdev_qos_destroy(bdev)) { 8368 /* There isn't anything we can do to recover here. Just let the 8369 * old QoS poller keep running. The QoS handling won't change 8370 * cores when the user allocates a new channel, but it won't break. */ 8371 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 8372 } 8373 } 8374 8375 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8376 rc = bdev_unregister_unsafe(bdev); 8377 spdk_spin_unlock(&bdev->internal.spinlock); 8378 8379 if (rc == 0) { 8380 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8381 } 8382 } else { 8383 spdk_spin_unlock(&bdev->internal.spinlock); 8384 } 8385 } 8386 8387 void 8388 spdk_bdev_close(struct spdk_bdev_desc *desc) 8389 { 8390 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8391 8392 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8393 spdk_get_thread()); 8394 8395 assert(desc->thread == spdk_get_thread()); 8396 8397 spdk_poller_unregister(&desc->io_timeout_poller); 8398 8399 spdk_spin_lock(&g_bdev_mgr.spinlock); 8400 8401 bdev_close(bdev, desc); 8402 8403 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8404 } 8405 8406 static void 8407 bdev_register_finished(void *arg) 8408 { 8409 struct spdk_bdev_desc *desc = arg; 8410 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8411 8412 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 8413 8414 spdk_spin_lock(&g_bdev_mgr.spinlock); 8415 8416 bdev_close(bdev, desc); 8417 8418 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8419 } 8420 8421 int 8422 spdk_bdev_register(struct spdk_bdev *bdev) 8423 { 8424 struct spdk_bdev_desc *desc; 8425 struct spdk_thread *thread = spdk_get_thread(); 8426 int rc; 8427 8428 if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) { 8429 SPDK_ERRLOG("Cannot register bdev %s on thread %p (%s)\n", bdev->name, thread, 8430 thread ? 
spdk_thread_get_name(thread) : "null"); 8431 return -EINVAL; 8432 } 8433 8434 rc = bdev_register(bdev); 8435 if (rc != 0) { 8436 return rc; 8437 } 8438 8439 /* A descriptor is opened to prevent bdev deletion during examination */ 8440 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8441 if (rc != 0) { 8442 spdk_bdev_unregister(bdev, NULL, NULL); 8443 return rc; 8444 } 8445 8446 rc = bdev_open(bdev, false, desc); 8447 if (rc != 0) { 8448 bdev_desc_free(desc); 8449 spdk_bdev_unregister(bdev, NULL, NULL); 8450 return rc; 8451 } 8452 8453 /* Examine configuration before initializing I/O */ 8454 bdev_examine(bdev); 8455 8456 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 8457 if (rc != 0) { 8458 bdev_close(bdev, desc); 8459 spdk_bdev_unregister(bdev, NULL, NULL); 8460 } 8461 8462 return rc; 8463 } 8464 8465 int 8466 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 8467 struct spdk_bdev_module *module) 8468 { 8469 spdk_spin_lock(&bdev->internal.spinlock); 8470 8471 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8472 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8473 spdk_spin_unlock(&bdev->internal.spinlock); 8474 return -EPERM; 8475 } 8476 8477 if (desc && !desc->write) { 8478 desc->write = true; 8479 } 8480 8481 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 8482 bdev->internal.claim.v1.module = module; 8483 8484 spdk_spin_unlock(&bdev->internal.spinlock); 8485 return 0; 8486 } 8487 8488 void 8489 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 8490 { 8491 spdk_spin_lock(&bdev->internal.spinlock); 8492 8493 assert(bdev->internal.claim.v1.module != NULL); 8494 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 8495 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8496 bdev->internal.claim.v1.module = NULL; 8497 8498 spdk_spin_unlock(&bdev->internal.spinlock); 8499 } 8500 8501 /* 8502 * Start claims v2 8503 */ 8504 8505 const char * 8506 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 8507 { 8508 switch (type) { 8509 case SPDK_BDEV_CLAIM_NONE: 8510 return "not_claimed"; 8511 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8512 return "exclusive_write"; 8513 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8514 return "read_many_write_one"; 8515 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8516 return "read_many_write_none"; 8517 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8518 return "read_many_write_many"; 8519 default: 8520 break; 8521 } 8522 return "invalid_claim"; 8523 } 8524 8525 static bool 8526 claim_type_is_v2(enum spdk_bdev_claim_type type) 8527 { 8528 switch (type) { 8529 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8530 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8531 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8532 return true; 8533 default: 8534 break; 8535 } 8536 return false; 8537 } 8538 8539 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
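 * This applies to the claim types that grant the claiming descriptor write access
 * (read-many-write-one and read-many-write-shared).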
*/ 8540 static bool 8541 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 8542 { 8543 switch (type) { 8544 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8545 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8546 return true; 8547 default: 8548 break; 8549 } 8550 return false; 8551 } 8552 8553 void 8554 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 8555 { 8556 if (opts == NULL) { 8557 SPDK_ERRLOG("opts should not be NULL\n"); 8558 assert(opts != NULL); 8559 return; 8560 } 8561 if (size == 0) { 8562 SPDK_ERRLOG("size should not be zero\n"); 8563 assert(size != 0); 8564 return; 8565 } 8566 8567 memset(opts, 0, size); 8568 opts->opts_size = size; 8569 8570 #define FIELD_OK(field) \ 8571 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 8572 8573 #define SET_FIELD(field, value) \ 8574 if (FIELD_OK(field)) { \ 8575 opts->field = value; \ 8576 } \ 8577 8578 SET_FIELD(shared_claim_key, 0); 8579 8580 #undef FIELD_OK 8581 #undef SET_FIELD 8582 } 8583 8584 static int 8585 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 8586 { 8587 if (src->opts_size == 0) { 8588 SPDK_ERRLOG("size should not be zero\n"); 8589 return -1; 8590 } 8591 8592 memset(dst, 0, sizeof(*dst)); 8593 dst->opts_size = src->opts_size; 8594 8595 #define FIELD_OK(field) \ 8596 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 8597 8598 #define SET_FIELD(field) \ 8599 if (FIELD_OK(field)) { \ 8600 dst->field = src->field; \ 8601 } \ 8602 8603 if (FIELD_OK(name)) { 8604 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 8605 } 8606 8607 SET_FIELD(shared_claim_key); 8608 8609 /* You should not remove this statement, but need to update the assert statement 8610 * if you add a new field, and also add a corresponding SET_FIELD statement */ 8611 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 8612 8613 #undef FIELD_OK 8614 #undef SET_FIELD 8615 return 0; 8616 } 8617 8618 /* Returns 0 if a read-write-once claim can be taken. */ 8619 static int 8620 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8621 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8622 { 8623 struct spdk_bdev *bdev = desc->bdev; 8624 struct spdk_bdev_desc *open_desc; 8625 8626 assert(spdk_spin_held(&bdev->internal.spinlock)); 8627 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 8628 8629 if (opts->shared_claim_key != 0) { 8630 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 8631 bdev->name); 8632 return -EINVAL; 8633 } 8634 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8635 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8636 return -EPERM; 8637 } 8638 if (desc->claim != NULL) { 8639 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 8640 bdev->name, desc->claim->module->name); 8641 return -EPERM; 8642 } 8643 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8644 if (desc != open_desc && open_desc->write) { 8645 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 8646 "another descriptor is open for writing\n", 8647 bdev->name); 8648 return -EPERM; 8649 } 8650 } 8651 8652 return 0; 8653 } 8654 8655 /* Returns 0 if a read-only-many claim can be taken. 
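 * The descriptor must have been opened read-only and no shared_claim_key may be given.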
*/
8656 static int
8657 claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
8658 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
8659 {
8660 struct spdk_bdev *bdev = desc->bdev;
8661 struct spdk_bdev_desc *open_desc;
8662
8663 assert(spdk_spin_held(&bdev->internal.spinlock));
8664 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE);
8665 assert(desc->claim == NULL);
8666
8667 if (desc->write) {
8668 SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n",
8669 bdev->name);
8670 return -EINVAL;
8671 }
8672 if (opts->shared_claim_key != 0) {
8673 SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name);
8674 return -EINVAL;
8675 }
8676 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
8677 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
8678 if (open_desc->write) {
8679 SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while "
8680 "another descriptor is open for writing\n",
8681 bdev->name);
8682 return -EPERM;
8683 }
8684 }
8685 }
8686
8687 return 0;
8688 }
8689
8690 /* Returns 0 if a read-write-many claim can be taken. */
8691 static int
8692 claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
8693 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
8694 {
8695 struct spdk_bdev *bdev = desc->bdev;
8696 struct spdk_bdev_desc *open_desc;
8697
8698 assert(spdk_spin_held(&bdev->internal.spinlock));
8699 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED);
8700 assert(desc->claim == NULL);
8701
8702 if (opts->shared_claim_key == 0) {
8703 SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n",
8704 bdev->name);
8705 return -EINVAL;
8706 }
8707 switch (bdev->internal.claim_type) {
8708 case SPDK_BDEV_CLAIM_NONE:
8709 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
8710 if (open_desc == desc) {
8711 continue;
8712 }
8713 if (open_desc->write) {
8714 SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while "
8715 "another descriptor is open for writing without a "
8716 "claim\n", bdev->name);
8717 return -EPERM;
8718 }
8719 }
8720 break;
8721 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
8722 if (opts->shared_claim_key != bdev->internal.claim.v2.key) {
8723 LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev);
8724 return -EPERM;
8725 }
8726 break;
8727 default:
8728 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
8729 return -EBUSY;
8730 }
8731
8732 return 0;
8733 }
8734
8735 /* Updates desc and its bdev with a v2 claim.
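 * Caller must hold bdev->internal.spinlock and must already have validated the claim
 * with the matching claim_verify_*() helper.
 *
 * Illustrative use of the public wrapper spdk_bdev_module_claim_bdev_desc() (the
 * descriptor and module names below are hypothetical, not part of this file):
 *
 *	struct spdk_bdev_claim_opts opts;
 *	int rc;
 *
 *	spdk_bdev_claim_opts_init(&opts, sizeof(opts));
 *	rc = spdk_bdev_module_claim_bdev_desc(my_desc, SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE,
 *					      &opts, &my_module);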
*/ 8736 static int 8737 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8738 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8739 { 8740 struct spdk_bdev *bdev = desc->bdev; 8741 struct spdk_bdev_module_claim *claim; 8742 8743 assert(spdk_spin_held(&bdev->internal.spinlock)); 8744 assert(claim_type_is_v2(type)); 8745 assert(desc->claim == NULL); 8746 8747 claim = calloc(1, sizeof(*desc->claim)); 8748 if (claim == NULL) { 8749 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 8750 return -ENOMEM; 8751 } 8752 claim->module = module; 8753 claim->desc = desc; 8754 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 8755 memcpy(claim->name, opts->name, sizeof(claim->name)); 8756 desc->claim = claim; 8757 8758 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8759 bdev->internal.claim_type = type; 8760 TAILQ_INIT(&bdev->internal.claim.v2.claims); 8761 bdev->internal.claim.v2.key = opts->shared_claim_key; 8762 } 8763 assert(type == bdev->internal.claim_type); 8764 8765 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 8766 8767 if (!desc->write && claim_type_promotes_to_write(type)) { 8768 desc->write = true; 8769 } 8770 8771 return 0; 8772 } 8773 8774 int 8775 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8776 struct spdk_bdev_claim_opts *_opts, 8777 struct spdk_bdev_module *module) 8778 { 8779 struct spdk_bdev *bdev; 8780 struct spdk_bdev_claim_opts opts; 8781 int rc = 0; 8782 8783 if (desc == NULL) { 8784 SPDK_ERRLOG("descriptor must not be NULL\n"); 8785 return -EINVAL; 8786 } 8787 8788 bdev = desc->bdev; 8789 8790 if (_opts == NULL) { 8791 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 8792 } else if (claim_opts_copy(_opts, &opts) != 0) { 8793 return -EINVAL; 8794 } 8795 8796 spdk_spin_lock(&bdev->internal.spinlock); 8797 8798 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 8799 bdev->internal.claim_type != type) { 8800 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8801 spdk_spin_unlock(&bdev->internal.spinlock); 8802 return -EPERM; 8803 } 8804 8805 if (claim_type_is_v2(type) && desc->claim != NULL) { 8806 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 8807 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 8808 spdk_spin_unlock(&bdev->internal.spinlock); 8809 return -EPERM; 8810 } 8811 8812 switch (type) { 8813 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8814 spdk_spin_unlock(&bdev->internal.spinlock); 8815 return spdk_bdev_module_claim_bdev(bdev, desc, module); 8816 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8817 rc = claim_verify_rwo(desc, type, &opts, module); 8818 break; 8819 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8820 rc = claim_verify_rom(desc, type, &opts, module); 8821 break; 8822 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8823 rc = claim_verify_rwm(desc, type, &opts, module); 8824 break; 8825 default: 8826 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 8827 rc = -ENOTSUP; 8828 } 8829 8830 if (rc == 0) { 8831 rc = claim_bdev(desc, type, &opts, module); 8832 } 8833 8834 spdk_spin_unlock(&bdev->internal.spinlock); 8835 return rc; 8836 } 8837 8838 static void 8839 claim_reset(struct spdk_bdev *bdev) 8840 { 8841 assert(spdk_spin_held(&bdev->internal.spinlock)); 8842 assert(claim_type_is_v2(bdev->internal.claim_type)); 8843 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 8844 8845 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 8846 
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8847 } 8848 8849 static void 8850 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 8851 { 8852 struct spdk_bdev *bdev = desc->bdev; 8853 8854 assert(spdk_spin_held(&bdev->internal.spinlock)); 8855 assert(claim_type_is_v2(bdev->internal.claim_type)); 8856 8857 if (bdev->internal.examine_in_progress == 0) { 8858 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 8859 free(desc->claim); 8860 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 8861 claim_reset(bdev); 8862 } 8863 } else { 8864 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 8865 desc->claim->module = NULL; 8866 desc->claim->desc = NULL; 8867 } 8868 desc->claim = NULL; 8869 } 8870 8871 /* 8872 * End claims v2 8873 */ 8874 8875 struct spdk_bdev * 8876 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 8877 { 8878 assert(desc != NULL); 8879 return desc->bdev; 8880 } 8881 8882 int 8883 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 8884 { 8885 struct spdk_bdev *bdev, *tmp; 8886 struct spdk_bdev_desc *desc; 8887 int rc = 0; 8888 8889 assert(fn != NULL); 8890 8891 spdk_spin_lock(&g_bdev_mgr.spinlock); 8892 bdev = spdk_bdev_first(); 8893 while (bdev != NULL) { 8894 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8895 if (rc != 0) { 8896 break; 8897 } 8898 rc = bdev_open(bdev, false, desc); 8899 if (rc != 0) { 8900 bdev_desc_free(desc); 8901 if (rc == -ENODEV) { 8902 /* Ignore the error and move to the next bdev. */ 8903 rc = 0; 8904 bdev = spdk_bdev_next(bdev); 8905 continue; 8906 } 8907 break; 8908 } 8909 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8910 8911 rc = fn(ctx, bdev); 8912 8913 spdk_spin_lock(&g_bdev_mgr.spinlock); 8914 tmp = spdk_bdev_next(bdev); 8915 bdev_close(bdev, desc); 8916 if (rc != 0) { 8917 break; 8918 } 8919 bdev = tmp; 8920 } 8921 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8922 8923 return rc; 8924 } 8925 8926 int 8927 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 8928 { 8929 struct spdk_bdev *bdev, *tmp; 8930 struct spdk_bdev_desc *desc; 8931 int rc = 0; 8932 8933 assert(fn != NULL); 8934 8935 spdk_spin_lock(&g_bdev_mgr.spinlock); 8936 bdev = spdk_bdev_first_leaf(); 8937 while (bdev != NULL) { 8938 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8939 if (rc != 0) { 8940 break; 8941 } 8942 rc = bdev_open(bdev, false, desc); 8943 if (rc != 0) { 8944 bdev_desc_free(desc); 8945 if (rc == -ENODEV) { 8946 /* Ignore the error and move to the next bdev. 
*/ 8947 rc = 0; 8948 bdev = spdk_bdev_next_leaf(bdev); 8949 continue; 8950 } 8951 break; 8952 } 8953 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8954 8955 rc = fn(ctx, bdev); 8956 8957 spdk_spin_lock(&g_bdev_mgr.spinlock); 8958 tmp = spdk_bdev_next_leaf(bdev); 8959 bdev_close(bdev, desc); 8960 if (rc != 0) { 8961 break; 8962 } 8963 bdev = tmp; 8964 } 8965 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8966 8967 return rc; 8968 } 8969 8970 void 8971 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 8972 { 8973 struct iovec *iovs; 8974 int iovcnt; 8975 8976 if (bdev_io == NULL) { 8977 return; 8978 } 8979 8980 switch (bdev_io->type) { 8981 case SPDK_BDEV_IO_TYPE_READ: 8982 case SPDK_BDEV_IO_TYPE_WRITE: 8983 case SPDK_BDEV_IO_TYPE_ZCOPY: 8984 iovs = bdev_io->u.bdev.iovs; 8985 iovcnt = bdev_io->u.bdev.iovcnt; 8986 break; 8987 default: 8988 iovs = NULL; 8989 iovcnt = 0; 8990 break; 8991 } 8992 8993 if (iovp) { 8994 *iovp = iovs; 8995 } 8996 if (iovcntp) { 8997 *iovcntp = iovcnt; 8998 } 8999 } 9000 9001 void * 9002 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 9003 { 9004 if (bdev_io == NULL) { 9005 return NULL; 9006 } 9007 9008 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 9009 return NULL; 9010 } 9011 9012 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 9013 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 9014 return bdev_io->u.bdev.md_buf; 9015 } 9016 9017 return NULL; 9018 } 9019 9020 void * 9021 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 9022 { 9023 if (bdev_io == NULL) { 9024 assert(false); 9025 return NULL; 9026 } 9027 9028 return bdev_io->internal.caller_ctx; 9029 } 9030 9031 void 9032 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 9033 { 9034 9035 if (spdk_bdev_module_list_find(bdev_module->name)) { 9036 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 9037 assert(false); 9038 } 9039 9040 spdk_spin_init(&bdev_module->internal.spinlock); 9041 TAILQ_INIT(&bdev_module->internal.quiesced_ranges); 9042 9043 /* 9044 * Modules with examine callbacks must be initialized first, so they are 9045 * ready to handle examine callbacks from later modules that will 9046 * register physical bdevs. 
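 * That is why modules with examine callbacks are inserted at the head of the
 * list below, while all other modules go to the tail.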
9047 */ 9048 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 9049 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9050 } else { 9051 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9052 } 9053 } 9054 9055 struct spdk_bdev_module * 9056 spdk_bdev_module_list_find(const char *name) 9057 { 9058 struct spdk_bdev_module *bdev_module; 9059 9060 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 9061 if (strcmp(name, bdev_module->name) == 0) { 9062 break; 9063 } 9064 } 9065 9066 return bdev_module; 9067 } 9068 9069 static int 9070 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 9071 { 9072 uint64_t num_blocks; 9073 void *md_buf = NULL; 9074 9075 num_blocks = bdev_io->u.bdev.num_blocks; 9076 9077 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 9078 md_buf = (char *)g_bdev_mgr.zero_buffer + 9079 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 9080 } 9081 9082 return bdev_write_blocks_with_md(bdev_io->internal.desc, 9083 spdk_io_channel_from_ctx(bdev_io->internal.ch), 9084 g_bdev_mgr.zero_buffer, md_buf, 9085 bdev_io->u.bdev.offset_blocks, num_blocks, 9086 bdev_write_zero_buffer_done, bdev_io); 9087 } 9088 9089 static void 9090 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9091 { 9092 struct spdk_bdev_io *parent_io = cb_arg; 9093 9094 spdk_bdev_free_io(bdev_io); 9095 9096 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 9097 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 9098 } 9099 9100 static void 9101 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 9102 { 9103 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9104 ctx->bdev->internal.qos_mod_in_progress = false; 9105 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9106 9107 if (ctx->cb_fn) { 9108 ctx->cb_fn(ctx->cb_arg, status); 9109 } 9110 free(ctx); 9111 } 9112 9113 static void 9114 bdev_disable_qos_done(void *cb_arg) 9115 { 9116 struct set_qos_limit_ctx *ctx = cb_arg; 9117 struct spdk_bdev *bdev = ctx->bdev; 9118 struct spdk_bdev_qos *qos; 9119 9120 spdk_spin_lock(&bdev->internal.spinlock); 9121 qos = bdev->internal.qos; 9122 bdev->internal.qos = NULL; 9123 spdk_spin_unlock(&bdev->internal.spinlock); 9124 9125 if (qos->thread != NULL) { 9126 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 9127 spdk_poller_unregister(&qos->poller); 9128 } 9129 9130 free(qos); 9131 9132 bdev_set_qos_limit_done(ctx, 0); 9133 } 9134 9135 static void 9136 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 9137 { 9138 struct set_qos_limit_ctx *ctx = _ctx; 9139 struct spdk_thread *thread; 9140 9141 spdk_spin_lock(&bdev->internal.spinlock); 9142 thread = bdev->internal.qos->thread; 9143 spdk_spin_unlock(&bdev->internal.spinlock); 9144 9145 if (thread != NULL) { 9146 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 9147 } else { 9148 bdev_disable_qos_done(ctx); 9149 } 9150 } 9151 9152 static void 9153 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9154 struct spdk_io_channel *ch, void *_ctx) 9155 { 9156 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9157 struct spdk_bdev_io *bdev_io; 9158 9159 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 9160 9161 while (!TAILQ_EMPTY(&bdev_ch->qos_queued_io)) { 9162 /* Re-submit the queued I/O. 
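 * With BDEV_CH_QOS_ENABLED cleared above, the I/O now take the normal submission path.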
*/ 9163 bdev_io = TAILQ_FIRST(&bdev_ch->qos_queued_io); 9164 TAILQ_REMOVE(&bdev_ch->qos_queued_io, bdev_io, internal.link); 9165 _bdev_io_submit(bdev_io); 9166 } 9167 9168 spdk_bdev_for_each_channel_continue(i, 0); 9169 } 9170 9171 static void 9172 bdev_update_qos_rate_limit_msg(void *cb_arg) 9173 { 9174 struct set_qos_limit_ctx *ctx = cb_arg; 9175 struct spdk_bdev *bdev = ctx->bdev; 9176 9177 spdk_spin_lock(&bdev->internal.spinlock); 9178 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 9179 spdk_spin_unlock(&bdev->internal.spinlock); 9180 9181 bdev_set_qos_limit_done(ctx, 0); 9182 } 9183 9184 static void 9185 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9186 struct spdk_io_channel *ch, void *_ctx) 9187 { 9188 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9189 9190 spdk_spin_lock(&bdev->internal.spinlock); 9191 bdev_enable_qos(bdev, bdev_ch); 9192 spdk_spin_unlock(&bdev->internal.spinlock); 9193 spdk_bdev_for_each_channel_continue(i, 0); 9194 } 9195 9196 static void 9197 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 9198 { 9199 struct set_qos_limit_ctx *ctx = _ctx; 9200 9201 bdev_set_qos_limit_done(ctx, status); 9202 } 9203 9204 static void 9205 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 9206 { 9207 int i; 9208 9209 assert(bdev->internal.qos != NULL); 9210 9211 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9212 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9213 bdev->internal.qos->rate_limits[i].limit = limits[i]; 9214 9215 if (limits[i] == 0) { 9216 bdev->internal.qos->rate_limits[i].limit = 9217 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 9218 } 9219 } 9220 } 9221 } 9222 9223 void 9224 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 9225 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 9226 { 9227 struct set_qos_limit_ctx *ctx; 9228 uint32_t limit_set_complement; 9229 uint64_t min_limit_per_sec; 9230 int i; 9231 bool disable_rate_limit = true; 9232 9233 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9234 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9235 continue; 9236 } 9237 9238 if (limits[i] > 0) { 9239 disable_rate_limit = false; 9240 } 9241 9242 if (bdev_qos_is_iops_rate_limit(i) == true) { 9243 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 9244 } else { 9245 if (limits[i] > SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC) { 9246 SPDK_WARNLOG("Requested rate limit %" PRIu64 " will result in uint64_t overflow, " 9247 "reset to %" PRIu64 "\n", limits[i], SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC); 9248 limits[i] = SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC; 9249 } 9250 /* Change from megabyte to byte rate limit */ 9251 limits[i] = limits[i] * 1024 * 1024; 9252 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 9253 } 9254 9255 limit_set_complement = limits[i] % min_limit_per_sec; 9256 if (limit_set_complement) { 9257 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 9258 limits[i], min_limit_per_sec); 9259 limits[i] += min_limit_per_sec - limit_set_complement; 9260 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 9261 } 9262 } 9263 9264 ctx = calloc(1, sizeof(*ctx)); 9265 if (ctx == NULL) { 9266 cb_fn(cb_arg, -ENOMEM); 9267 return; 9268 } 9269 9270 ctx->cb_fn = cb_fn; 9271 ctx->cb_arg = cb_arg; 9272 ctx->bdev = bdev; 9273 9274 spdk_spin_lock(&bdev->internal.spinlock); 9275 if (bdev->internal.qos_mod_in_progress) { 9276 spdk_spin_unlock(&bdev->internal.spinlock); 9277 free(ctx); 9278 cb_fn(cb_arg, 
-EAGAIN); 9279 return; 9280 } 9281 bdev->internal.qos_mod_in_progress = true; 9282 9283 if (disable_rate_limit == true && bdev->internal.qos) { 9284 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9285 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 9286 (bdev->internal.qos->rate_limits[i].limit > 0 && 9287 bdev->internal.qos->rate_limits[i].limit != 9288 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 9289 disable_rate_limit = false; 9290 break; 9291 } 9292 } 9293 } 9294 9295 if (disable_rate_limit == false) { 9296 if (bdev->internal.qos == NULL) { 9297 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 9298 if (!bdev->internal.qos) { 9299 spdk_spin_unlock(&bdev->internal.spinlock); 9300 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 9301 bdev_set_qos_limit_done(ctx, -ENOMEM); 9302 return; 9303 } 9304 } 9305 9306 if (bdev->internal.qos->thread == NULL) { 9307 /* Enabling */ 9308 bdev_set_qos_rate_limits(bdev, limits); 9309 9310 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 9311 bdev_enable_qos_done); 9312 } else { 9313 /* Updating */ 9314 bdev_set_qos_rate_limits(bdev, limits); 9315 9316 spdk_thread_send_msg(bdev->internal.qos->thread, 9317 bdev_update_qos_rate_limit_msg, ctx); 9318 } 9319 } else { 9320 if (bdev->internal.qos != NULL) { 9321 bdev_set_qos_rate_limits(bdev, limits); 9322 9323 /* Disabling */ 9324 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 9325 bdev_disable_qos_msg_done); 9326 } else { 9327 spdk_spin_unlock(&bdev->internal.spinlock); 9328 bdev_set_qos_limit_done(ctx, 0); 9329 return; 9330 } 9331 } 9332 9333 spdk_spin_unlock(&bdev->internal.spinlock); 9334 } 9335 9336 struct spdk_bdev_histogram_ctx { 9337 spdk_bdev_histogram_status_cb cb_fn; 9338 void *cb_arg; 9339 struct spdk_bdev *bdev; 9340 int status; 9341 }; 9342 9343 static void 9344 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9345 { 9346 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9347 9348 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9349 ctx->bdev->internal.histogram_in_progress = false; 9350 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9351 ctx->cb_fn(ctx->cb_arg, ctx->status); 9352 free(ctx); 9353 } 9354 9355 static void 9356 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9357 struct spdk_io_channel *_ch, void *_ctx) 9358 { 9359 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9360 9361 if (ch->histogram != NULL) { 9362 spdk_histogram_data_free(ch->histogram); 9363 ch->histogram = NULL; 9364 } 9365 spdk_bdev_for_each_channel_continue(i, 0); 9366 } 9367 9368 static void 9369 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9370 { 9371 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9372 9373 if (status != 0) { 9374 ctx->status = status; 9375 ctx->bdev->internal.histogram_enabled = false; 9376 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 9377 bdev_histogram_disable_channel_cb); 9378 } else { 9379 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9380 ctx->bdev->internal.histogram_in_progress = false; 9381 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9382 ctx->cb_fn(ctx->cb_arg, ctx->status); 9383 free(ctx); 9384 } 9385 } 9386 9387 static void 9388 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9389 struct spdk_io_channel *_ch, void *_ctx) 9390 { 9391 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9392 int status = 0; 9393 9394 if (ch->histogram == NULL) { 9395 
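		/* The per-channel histogram is allocated lazily here.  If the allocation
		 * fails, the -ENOMEM status is passed to the channel iterator and the
		 * completion callback (bdev_histogram_enable_channel_cb) tears the
		 * already-allocated histograms back down.
		 */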
ch->histogram = spdk_histogram_data_alloc(); 9396 if (ch->histogram == NULL) { 9397 status = -ENOMEM; 9398 } 9399 } 9400 9401 spdk_bdev_for_each_channel_continue(i, status); 9402 } 9403 9404 void 9405 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9406 void *cb_arg, bool enable) 9407 { 9408 struct spdk_bdev_histogram_ctx *ctx; 9409 9410 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 9411 if (ctx == NULL) { 9412 cb_fn(cb_arg, -ENOMEM); 9413 return; 9414 } 9415 9416 ctx->bdev = bdev; 9417 ctx->status = 0; 9418 ctx->cb_fn = cb_fn; 9419 ctx->cb_arg = cb_arg; 9420 9421 spdk_spin_lock(&bdev->internal.spinlock); 9422 if (bdev->internal.histogram_in_progress) { 9423 spdk_spin_unlock(&bdev->internal.spinlock); 9424 free(ctx); 9425 cb_fn(cb_arg, -EAGAIN); 9426 return; 9427 } 9428 9429 bdev->internal.histogram_in_progress = true; 9430 spdk_spin_unlock(&bdev->internal.spinlock); 9431 9432 bdev->internal.histogram_enabled = enable; 9433 9434 if (enable) { 9435 /* Allocate histogram for each channel */ 9436 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 9437 bdev_histogram_enable_channel_cb); 9438 } else { 9439 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 9440 bdev_histogram_disable_channel_cb); 9441 } 9442 } 9443 9444 struct spdk_bdev_histogram_data_ctx { 9445 spdk_bdev_histogram_data_cb cb_fn; 9446 void *cb_arg; 9447 struct spdk_bdev *bdev; 9448 /** merged histogram data from all channels */ 9449 struct spdk_histogram_data *histogram; 9450 }; 9451 9452 static void 9453 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9454 { 9455 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9456 9457 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 9458 free(ctx); 9459 } 9460 9461 static void 9462 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9463 struct spdk_io_channel *_ch, void *_ctx) 9464 { 9465 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9466 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9467 int status = 0; 9468 9469 if (ch->histogram == NULL) { 9470 status = -EFAULT; 9471 } else { 9472 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 9473 } 9474 9475 spdk_bdev_for_each_channel_continue(i, status); 9476 } 9477 9478 void 9479 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 9480 spdk_bdev_histogram_data_cb cb_fn, 9481 void *cb_arg) 9482 { 9483 struct spdk_bdev_histogram_data_ctx *ctx; 9484 9485 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 9486 if (ctx == NULL) { 9487 cb_fn(cb_arg, -ENOMEM, NULL); 9488 return; 9489 } 9490 9491 ctx->bdev = bdev; 9492 ctx->cb_fn = cb_fn; 9493 ctx->cb_arg = cb_arg; 9494 9495 ctx->histogram = histogram; 9496 9497 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 9498 bdev_histogram_get_channel_cb); 9499 } 9500 9501 void 9502 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 9503 void *cb_arg) 9504 { 9505 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9506 int status = 0; 9507 9508 assert(cb_fn != NULL); 9509 9510 if (bdev_ch->histogram == NULL) { 9511 status = -EFAULT; 9512 } 9513 cb_fn(cb_arg, status, bdev_ch->histogram); 9514 } 9515 9516 size_t 9517 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 9518 size_t max_events) 9519 { 9520 struct media_event_entry *entry; 9521 size_t num_events = 0; 9522 9523 for (; num_events < 
max_events; ++num_events) { 9524 entry = TAILQ_FIRST(&desc->pending_media_events); 9525 if (entry == NULL) { 9526 break; 9527 } 9528 9529 events[num_events] = entry->event; 9530 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 9531 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 9532 } 9533 9534 return num_events; 9535 } 9536 9537 int 9538 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 9539 size_t num_events) 9540 { 9541 struct spdk_bdev_desc *desc; 9542 struct media_event_entry *entry; 9543 size_t event_id; 9544 int rc = 0; 9545 9546 assert(bdev->media_events); 9547 9548 spdk_spin_lock(&bdev->internal.spinlock); 9549 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9550 if (desc->write) { 9551 break; 9552 } 9553 } 9554 9555 if (desc == NULL || desc->media_events_buffer == NULL) { 9556 rc = -ENODEV; 9557 goto out; 9558 } 9559 9560 for (event_id = 0; event_id < num_events; ++event_id) { 9561 entry = TAILQ_FIRST(&desc->free_media_events); 9562 if (entry == NULL) { 9563 break; 9564 } 9565 9566 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 9567 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 9568 entry->event = events[event_id]; 9569 } 9570 9571 rc = event_id; 9572 out: 9573 spdk_spin_unlock(&bdev->internal.spinlock); 9574 return rc; 9575 } 9576 9577 static void 9578 _media_management_notify(void *arg) 9579 { 9580 struct spdk_bdev_desc *desc = arg; 9581 9582 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 9583 } 9584 9585 void 9586 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 9587 { 9588 struct spdk_bdev_desc *desc; 9589 9590 spdk_spin_lock(&bdev->internal.spinlock); 9591 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9592 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 9593 event_notify(desc, _media_management_notify); 9594 } 9595 } 9596 spdk_spin_unlock(&bdev->internal.spinlock); 9597 } 9598 9599 struct locked_lba_range_ctx { 9600 struct lba_range range; 9601 struct lba_range *current_range; 9602 struct lba_range *owner_range; 9603 struct spdk_poller *poller; 9604 lock_range_cb cb_fn; 9605 void *cb_arg; 9606 }; 9607 9608 static void 9609 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9610 { 9611 struct locked_lba_range_ctx *ctx = _ctx; 9612 9613 ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM); 9614 free(ctx); 9615 } 9616 9617 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 9618 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 9619 9620 static void 9621 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9622 { 9623 struct locked_lba_range_ctx *ctx = _ctx; 9624 9625 if (status == -ENOMEM) { 9626 /* One of the channels could not allocate a range object. 9627 * So we have to go back and clean up any ranges that were 9628 * allocated successfully before we return error status to 9629 * the caller. We can reuse the unlock function to do that 9630 * clean up. 9631 */ 9632 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9633 bdev_lock_error_cleanup_cb); 9634 return; 9635 } 9636 9637 /* All channels have locked this range and no I/O overlapping the range 9638 * are outstanding! Set the owner_ch for the range object for the 9639 * locking channel, so that this channel will know that it is allowed 9640 * to write to this range. 
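	 * For quiesce locks the range is taken without an owning channel
	 * (owner_ch == NULL), so owner_range stays NULL and the assignment
	 * below is simply skipped.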
9641 */ 9642 if (ctx->owner_range != NULL) { 9643 ctx->owner_range->owner_ch = ctx->range.owner_ch; 9644 } 9645 9646 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 9647 9648 /* Don't free the ctx here. Its range is in the bdev's global list of 9649 * locked ranges still, and will be removed and freed when this range 9650 * is later unlocked. 9651 */ 9652 } 9653 9654 static int 9655 bdev_lock_lba_range_check_io(void *_i) 9656 { 9657 struct spdk_bdev_channel_iter *i = _i; 9658 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 9659 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9660 struct locked_lba_range_ctx *ctx = i->ctx; 9661 struct lba_range *range = ctx->current_range; 9662 struct spdk_bdev_io *bdev_io; 9663 9664 spdk_poller_unregister(&ctx->poller); 9665 9666 /* The range is now in the locked_ranges, so no new IO can be submitted to this 9667 * range. But we need to wait until any outstanding IO overlapping with this range 9668 * are completed. 9669 */ 9670 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 9671 if (bdev_io_range_is_locked(bdev_io, range)) { 9672 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 9673 return SPDK_POLLER_BUSY; 9674 } 9675 } 9676 9677 spdk_bdev_for_each_channel_continue(i, 0); 9678 return SPDK_POLLER_BUSY; 9679 } 9680 9681 static void 9682 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9683 struct spdk_io_channel *_ch, void *_ctx) 9684 { 9685 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9686 struct locked_lba_range_ctx *ctx = _ctx; 9687 struct lba_range *range; 9688 9689 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9690 if (range->length == ctx->range.length && 9691 range->offset == ctx->range.offset && 9692 range->locked_ctx == ctx->range.locked_ctx) { 9693 /* This range already exists on this channel, so don't add 9694 * it again. This can happen when a new channel is created 9695 * while the for_each_channel operation is in progress. 9696 * Do not check for outstanding I/O in that case, since the 9697 * range was locked before any I/O could be submitted to the 9698 * new channel. 9699 */ 9700 spdk_bdev_for_each_channel_continue(i, 0); 9701 return; 9702 } 9703 } 9704 9705 range = calloc(1, sizeof(*range)); 9706 if (range == NULL) { 9707 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 9708 return; 9709 } 9710 9711 range->length = ctx->range.length; 9712 range->offset = ctx->range.offset; 9713 range->locked_ctx = ctx->range.locked_ctx; 9714 range->quiesce = ctx->range.quiesce; 9715 ctx->current_range = range; 9716 if (ctx->range.owner_ch == ch) { 9717 /* This is the range object for the channel that will hold 9718 * the lock. Store it in the ctx object so that we can easily 9719 * set its owner_ch after the lock is finally acquired. 9720 */ 9721 ctx->owner_range = range; 9722 } 9723 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 9724 bdev_lock_lba_range_check_io(i); 9725 } 9726 9727 static void 9728 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 9729 { 9730 assert(spdk_get_thread() == ctx->range.owner_thread); 9731 assert(ctx->range.owner_ch == NULL || 9732 spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread); 9733 9734 /* We will add a copy of this range to each channel now. 
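	 * Channels created after this point inherit the range from
	 * bdev->internal.locked_ranges when they are initialized, so every
	 * channel ends up holding a copy either way.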
*/ 9735 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 9736 bdev_lock_lba_range_cb); 9737 } 9738 9739 static bool 9740 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 9741 { 9742 struct lba_range *r; 9743 9744 TAILQ_FOREACH(r, tailq, tailq) { 9745 if (bdev_lba_range_overlapped(range, r)) { 9746 return true; 9747 } 9748 } 9749 return false; 9750 } 9751 9752 static void bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status); 9753 9754 static int 9755 _bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch, 9756 uint64_t offset, uint64_t length, 9757 lock_range_cb cb_fn, void *cb_arg) 9758 { 9759 struct locked_lba_range_ctx *ctx; 9760 9761 ctx = calloc(1, sizeof(*ctx)); 9762 if (ctx == NULL) { 9763 return -ENOMEM; 9764 } 9765 9766 ctx->range.offset = offset; 9767 ctx->range.length = length; 9768 ctx->range.owner_thread = spdk_get_thread(); 9769 ctx->range.owner_ch = ch; 9770 ctx->range.locked_ctx = cb_arg; 9771 ctx->range.bdev = bdev; 9772 ctx->range.quiesce = (cb_fn == bdev_quiesce_range_locked); 9773 ctx->cb_fn = cb_fn; 9774 ctx->cb_arg = cb_arg; 9775 9776 spdk_spin_lock(&bdev->internal.spinlock); 9777 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 9778 /* There is an active lock overlapping with this range. 9779 * Put it on the pending list until this range no 9780 * longer overlaps with another. 9781 */ 9782 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 9783 } else { 9784 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 9785 bdev_lock_lba_range_ctx(bdev, ctx); 9786 } 9787 spdk_spin_unlock(&bdev->internal.spinlock); 9788 return 0; 9789 } 9790 9791 static int 9792 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 9793 uint64_t offset, uint64_t length, 9794 lock_range_cb cb_fn, void *cb_arg) 9795 { 9796 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9797 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9798 9799 if (cb_arg == NULL) { 9800 SPDK_ERRLOG("cb_arg must not be NULL\n"); 9801 return -EINVAL; 9802 } 9803 9804 return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg); 9805 } 9806 9807 static void 9808 bdev_lock_lba_range_ctx_msg(void *_ctx) 9809 { 9810 struct locked_lba_range_ctx *ctx = _ctx; 9811 9812 bdev_lock_lba_range_ctx(ctx->range.bdev, ctx); 9813 } 9814 9815 static void 9816 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9817 { 9818 struct locked_lba_range_ctx *ctx = _ctx; 9819 struct locked_lba_range_ctx *pending_ctx; 9820 struct lba_range *range, *tmp; 9821 9822 spdk_spin_lock(&bdev->internal.spinlock); 9823 /* Check if there are any pending locked ranges that overlap with this range 9824 * that was just unlocked. If there are, check that it doesn't overlap with any 9825 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 9826 * the lock process. 
9827 	 */
9828 	TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) {
9829 		if (bdev_lba_range_overlapped(range, &ctx->range) &&
9830 		    !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) {
9831 			TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq);
9832 			pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
9833 			TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq);
9834 			spdk_thread_send_msg(pending_ctx->range.owner_thread,
9835 					     bdev_lock_lba_range_ctx_msg, pending_ctx);
9836 		}
9837 	}
9838 	spdk_spin_unlock(&bdev->internal.spinlock);
9839 
9840 	ctx->cb_fn(&ctx->range, ctx->cb_arg, status);
9841 	free(ctx);
9842 }
9843 
9844 static void
9845 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9846 				  struct spdk_io_channel *_ch, void *_ctx)
9847 {
9848 	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
9849 	struct locked_lba_range_ctx *ctx = _ctx;
9850 	TAILQ_HEAD(, spdk_bdev_io) io_locked;
9851 	struct spdk_bdev_io *bdev_io;
9852 	struct lba_range *range;
9853 
9854 	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
9855 		if (ctx->range.offset == range->offset &&
9856 		    ctx->range.length == range->length &&
9857 		    ctx->range.locked_ctx == range->locked_ctx) {
9858 			TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
9859 			free(range);
9860 			break;
9861 		}
9862 	}
9863 
9864 	/* Note: we should almost always be able to assert that the range specified
9865 	 * was found. But there are some very rare corner cases where a new channel
9866 	 * gets created simultaneously with a range unlock, where this function
9867 	 * would execute on that new channel and wouldn't have the range.
9868 	 * We also use this to clean up range allocations when a later allocation
9869 	 * fails in the locking path.
9870 	 * So we can't actually assert() here.
9871 	 */
9872 
9873 	/* Swap the locked IO into a temporary list, and then try to submit them again.
9874 	 * We could hyper-optimize this to only resubmit locked I/O that overlap
9875 	 * with the range that was just unlocked, but this isn't a performance path so
9876 	 * we go for simplicity here.
9877 	 */
9878 	TAILQ_INIT(&io_locked);
9879 	TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link);
9880 	while (!TAILQ_EMPTY(&io_locked)) {
9881 		bdev_io = TAILQ_FIRST(&io_locked);
9882 		TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link);
9883 		bdev_io_submit(bdev_io);
9884 	}
9885 
9886 	spdk_bdev_for_each_channel_continue(i, 0);
9887 }
9888 
9889 static int
9890 _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length,
9891 		       lock_range_cb cb_fn, void *cb_arg)
9892 {
9893 	struct locked_lba_range_ctx *ctx;
9894 	struct lba_range *range;
9895 
9896 	spdk_spin_lock(&bdev->internal.spinlock);
9897 	/* To start the unlock process, we find the range in the bdev's locked_ranges
9898 	 * and remove it. This ensures new channels don't inherit the locked range.
9899 	 * Then we will send a message to each channel to remove the range from its
9900 	 * per-channel list.
9901 */ 9902 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 9903 if (range->offset == offset && range->length == length && 9904 (range->owner_ch == NULL || range->locked_ctx == cb_arg)) { 9905 break; 9906 } 9907 } 9908 if (range == NULL) { 9909 assert(false); 9910 spdk_spin_unlock(&bdev->internal.spinlock); 9911 return -EINVAL; 9912 } 9913 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 9914 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 9915 spdk_spin_unlock(&bdev->internal.spinlock); 9916 9917 ctx->cb_fn = cb_fn; 9918 ctx->cb_arg = cb_arg; 9919 9920 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9921 bdev_unlock_lba_range_cb); 9922 return 0; 9923 } 9924 9925 static int 9926 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 9927 uint64_t offset, uint64_t length, 9928 lock_range_cb cb_fn, void *cb_arg) 9929 { 9930 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9931 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9932 struct lba_range *range; 9933 bool range_found = false; 9934 9935 /* Let's make sure the specified channel actually has a lock on 9936 * the specified range. Note that the range must match exactly. 9937 */ 9938 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9939 if (range->offset == offset && range->length == length && 9940 range->owner_ch == ch && range->locked_ctx == cb_arg) { 9941 range_found = true; 9942 break; 9943 } 9944 } 9945 9946 if (!range_found) { 9947 return -EINVAL; 9948 } 9949 9950 return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg); 9951 } 9952 9953 struct bdev_quiesce_ctx { 9954 spdk_bdev_quiesce_cb cb_fn; 9955 void *cb_arg; 9956 }; 9957 9958 static void 9959 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status) 9960 { 9961 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 9962 9963 if (quiesce_ctx->cb_fn != NULL) { 9964 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 9965 } 9966 9967 free(quiesce_ctx); 9968 } 9969 9970 static void 9971 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status) 9972 { 9973 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 9974 struct spdk_bdev_module *module = range->bdev->module; 9975 9976 if (status != 0) { 9977 if (quiesce_ctx->cb_fn != NULL) { 9978 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 9979 } 9980 free(quiesce_ctx); 9981 return; 9982 } 9983 9984 spdk_spin_lock(&module->internal.spinlock); 9985 TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module); 9986 spdk_spin_unlock(&module->internal.spinlock); 9987 9988 if (quiesce_ctx->cb_fn != NULL) { 9989 /* copy the context in case the range is unlocked by the callback */ 9990 struct bdev_quiesce_ctx tmp = *quiesce_ctx; 9991 9992 quiesce_ctx->cb_fn = NULL; 9993 quiesce_ctx->cb_arg = NULL; 9994 9995 tmp.cb_fn(tmp.cb_arg, status); 9996 } 9997 /* quiesce_ctx will be freed on unquiesce */ 9998 } 9999 10000 static int 10001 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10002 uint64_t offset, uint64_t length, 10003 spdk_bdev_quiesce_cb cb_fn, void *cb_arg, 10004 bool unquiesce) 10005 { 10006 struct bdev_quiesce_ctx *quiesce_ctx; 10007 int rc; 10008 10009 if (module != bdev->module) { 10010 SPDK_ERRLOG("Bdev does not belong to specified module.\n"); 10011 return -EINVAL; 10012 } 10013 10014 if (!bdev_io_valid_blocks(bdev, offset, length)) { 10015 return -EINVAL; 10016 } 10017 10018 if (unquiesce) { 10019 struct lba_range *range; 10020 10021 /* Make sure the specified 
range is actually quiesced in the specified module and 10022 * then remove it from the list. Note that the range must match exactly. 10023 */ 10024 spdk_spin_lock(&module->internal.spinlock); 10025 TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) { 10026 if (range->bdev == bdev && range->offset == offset && range->length == length) { 10027 TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module); 10028 break; 10029 } 10030 } 10031 spdk_spin_unlock(&module->internal.spinlock); 10032 10033 if (range == NULL) { 10034 SPDK_ERRLOG("The range to unquiesce was not found.\n"); 10035 return -EINVAL; 10036 } 10037 10038 quiesce_ctx = range->locked_ctx; 10039 quiesce_ctx->cb_fn = cb_fn; 10040 quiesce_ctx->cb_arg = cb_arg; 10041 10042 rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx); 10043 } else { 10044 quiesce_ctx = malloc(sizeof(*quiesce_ctx)); 10045 if (quiesce_ctx == NULL) { 10046 return -ENOMEM; 10047 } 10048 10049 quiesce_ctx->cb_fn = cb_fn; 10050 quiesce_ctx->cb_arg = cb_arg; 10051 10052 rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx); 10053 if (rc != 0) { 10054 free(quiesce_ctx); 10055 } 10056 } 10057 10058 return rc; 10059 } 10060 10061 int 10062 spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10063 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10064 { 10065 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false); 10066 } 10067 10068 int 10069 spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10070 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10071 { 10072 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true); 10073 } 10074 10075 int 10076 spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10077 uint64_t offset, uint64_t length, 10078 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10079 { 10080 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false); 10081 } 10082 10083 int 10084 spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10085 uint64_t offset, uint64_t length, 10086 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10087 { 10088 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true); 10089 } 10090 10091 int 10092 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 10093 int array_size) 10094 { 10095 if (!bdev) { 10096 return -EINVAL; 10097 } 10098 10099 if (bdev->fn_table->get_memory_domains) { 10100 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 10101 } 10102 10103 return 0; 10104 } 10105 10106 struct spdk_bdev_for_each_io_ctx { 10107 void *ctx; 10108 spdk_bdev_io_fn fn; 10109 spdk_bdev_for_each_io_cb cb; 10110 }; 10111 10112 static void 10113 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10114 struct spdk_io_channel *io_ch, void *_ctx) 10115 { 10116 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10117 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 10118 struct spdk_bdev_io *bdev_io; 10119 int rc = 0; 10120 10121 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 10122 rc = ctx->fn(ctx->ctx, bdev_io); 10123 if (rc != 0) { 10124 break; 10125 } 10126 } 10127 10128 spdk_bdev_for_each_channel_continue(i, rc); 10129 } 10130 10131 static void 10132 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 10133 { 10134 struct 
spdk_bdev_for_each_io_ctx *ctx = _ctx; 10135 10136 ctx->cb(ctx->ctx, status); 10137 10138 free(ctx); 10139 } 10140 10141 void 10142 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 10143 spdk_bdev_for_each_io_cb cb) 10144 { 10145 struct spdk_bdev_for_each_io_ctx *ctx; 10146 10147 assert(fn != NULL && cb != NULL); 10148 10149 ctx = calloc(1, sizeof(*ctx)); 10150 if (ctx == NULL) { 10151 SPDK_ERRLOG("Failed to allocate context.\n"); 10152 cb(_ctx, -ENOMEM); 10153 return; 10154 } 10155 10156 ctx->ctx = _ctx; 10157 ctx->fn = fn; 10158 ctx->cb = cb; 10159 10160 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 10161 bdev_for_each_io_done); 10162 } 10163 10164 void 10165 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 10166 { 10167 spdk_for_each_channel_continue(iter->i, status); 10168 } 10169 10170 static struct spdk_bdev * 10171 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 10172 { 10173 void *io_device = spdk_io_channel_iter_get_io_device(i); 10174 10175 return __bdev_from_io_dev(io_device); 10176 } 10177 10178 static void 10179 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 10180 { 10181 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10182 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10183 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 10184 10185 iter->i = i; 10186 iter->fn(iter, bdev, ch, iter->ctx); 10187 } 10188 10189 static void 10190 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 10191 { 10192 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10193 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10194 10195 iter->i = i; 10196 iter->cpl(bdev, iter->ctx, status); 10197 10198 free(iter); 10199 } 10200 10201 void 10202 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 10203 void *ctx, spdk_bdev_for_each_channel_done cpl) 10204 { 10205 struct spdk_bdev_channel_iter *iter; 10206 10207 assert(bdev != NULL && fn != NULL && ctx != NULL); 10208 10209 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 10210 if (iter == NULL) { 10211 SPDK_ERRLOG("Unable to allocate iterator\n"); 10212 assert(false); 10213 return; 10214 } 10215 10216 iter->fn = fn; 10217 iter->cpl = cpl; 10218 iter->ctx = ctx; 10219 10220 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 10221 iter, bdev_each_channel_cpl); 10222 } 10223 10224 static void 10225 bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10226 { 10227 struct spdk_bdev_io *parent_io = cb_arg; 10228 10229 spdk_bdev_free_io(bdev_io); 10230 10231 /* Check return status of write */ 10232 parent_io->internal.status = success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 10233 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 10234 } 10235 10236 static void 10237 bdev_copy_do_write(void *_bdev_io) 10238 { 10239 struct spdk_bdev_io *bdev_io = _bdev_io; 10240 int rc; 10241 10242 /* Write blocks */ 10243 rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc, 10244 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10245 bdev_io->u.bdev.iovs[0].iov_base, 10246 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks, 10247 bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io); 10248 10249 if (rc == -ENOMEM) { 10250 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write); 10251 } else if (rc != 0) { 10252 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10253 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10254 } 10255 } 10256 10257 static void 10258 bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10259 { 10260 struct spdk_bdev_io *parent_io = cb_arg; 10261 10262 spdk_bdev_free_io(bdev_io); 10263 10264 /* Check return status of read */ 10265 if (!success) { 10266 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10267 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 10268 return; 10269 } 10270 10271 /* Do write */ 10272 bdev_copy_do_write(parent_io); 10273 } 10274 10275 static void 10276 bdev_copy_do_read(void *_bdev_io) 10277 { 10278 struct spdk_bdev_io *bdev_io = _bdev_io; 10279 int rc; 10280 10281 /* Read blocks */ 10282 rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc, 10283 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10284 bdev_io->u.bdev.iovs[0].iov_base, 10285 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks, 10286 bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io); 10287 10288 if (rc == -ENOMEM) { 10289 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read); 10290 } else if (rc != 0) { 10291 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10292 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10293 } 10294 } 10295 10296 static void 10297 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 10298 { 10299 if (!success) { 10300 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10301 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10302 return; 10303 } 10304 10305 bdev_copy_do_read(bdev_io); 10306 } 10307 10308 int 10309 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 10310 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 10311 spdk_bdev_io_completion_cb cb, void *cb_arg) 10312 { 10313 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10314 struct spdk_bdev_io *bdev_io; 10315 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 10316 10317 if (!desc->write) { 10318 return -EBADF; 10319 } 10320 10321 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 10322 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 10323 SPDK_DEBUGLOG(bdev, 10324 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 10325 dst_offset_blocks, src_offset_blocks, num_blocks); 10326 return -EINVAL; 10327 } 10328 10329 bdev_io = bdev_channel_get_io(channel); 10330 if (!bdev_io) { 10331 return -ENOMEM; 10332 } 10333 10334 bdev_io->internal.ch = channel; 10335 bdev_io->internal.desc = desc; 10336 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 10337 10338 
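	/* The destination is described by the generic offset_blocks/num_blocks
	 * fields; only the source offset needs the copy-specific member of the union.
	 */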
bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 10339 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 10340 bdev_io->u.bdev.num_blocks = num_blocks; 10341 bdev_io->u.bdev.memory_domain = NULL; 10342 bdev_io->u.bdev.memory_domain_ctx = NULL; 10343 bdev_io->u.bdev.iovs = NULL; 10344 bdev_io->u.bdev.iovcnt = 0; 10345 bdev_io->u.bdev.md_buf = NULL; 10346 bdev_io->u.bdev.accel_sequence = NULL; 10347 bdev_io_init(bdev_io, bdev, cb_arg, cb); 10348 10349 if (dst_offset_blocks == src_offset_blocks || num_blocks == 0) { 10350 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 10351 return 0; 10352 } 10353 10354 10355 /* If the copy size is large and should be split, use the generic split logic 10356 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not. 10357 * 10358 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or 10359 * emulate it using regular read and write requests otherwise. 10360 */ 10361 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) || 10362 bdev_io->internal.split) { 10363 bdev_io_submit(bdev_io); 10364 return 0; 10365 } 10366 10367 spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev)); 10368 10369 return 0; 10370 } 10371 10372 SPDK_LOG_REGISTER_COMPONENT(bdev) 10373 10374 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 10375 { 10376 struct spdk_trace_tpoint_opts opts[] = { 10377 { 10378 "BDEV_IO_START", TRACE_BDEV_IO_START, 10379 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 1, 10380 { 10381 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10382 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 10383 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10384 { "len", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10385 { "name", SPDK_TRACE_ARG_TYPE_STR, 40} 10386 } 10387 }, 10388 { 10389 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 10390 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 0, 10391 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 10392 }, 10393 { 10394 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 10395 OWNER_TYPE_BDEV, OBJECT_NONE, 1, 10396 { 10397 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 10398 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 10399 } 10400 }, 10401 { 10402 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 10403 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 10404 { 10405 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 10406 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 10407 } 10408 }, 10409 }; 10410 10411 10412 spdk_trace_register_owner_type(OWNER_TYPE_BDEV, 'b'); 10413 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 10414 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 10415 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 10416 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 10417 } 10418
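/* Illustrative sketches (hypothetical callers, not part of the library and not
 * compiled): how the QoS and copy APIs defined above might be driven.  The
 * example_* names and the concrete limits, LBAs and block counts are
 * assumptions made up for the example only.
 */
#if 0
static void
example_qos_done(void *cb_arg, int status)
{
	SPDK_NOTICELOG("QoS rate limit update completed with status %d\n", status);
}

static void
example_set_qos(struct spdk_bdev *bdev)
{
	/* Cap the bdev at 10000 read/write IO/s and 100 MB/s.  Byte limits are
	 * given in MB/s and converted to bytes internally; entries set to
	 * SPDK_BDEV_QOS_LIMIT_NOT_DEFINED are left untouched.
	 */
	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES] = {
		[SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT] = 10000,
		[SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT] = 100,
		[SPDK_BDEV_QOS_R_BPS_RATE_LIMIT] = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED,
		[SPDK_BDEV_QOS_W_BPS_RATE_LIMIT] = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED,
	};

	spdk_bdev_set_qos_rate_limits(bdev, limits, example_qos_done, NULL);
}

static void
example_copy_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	/* Always release the bdev_io once the completion has been inspected. */
	spdk_bdev_free_io(bdev_io);
	SPDK_NOTICELOG("copy %s\n", success ? "succeeded" : "failed");
}

static int
example_copy(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch)
{
	/* Copy 16 blocks from LBA 0 to LBA 1024 on the bdev backing 'desc'.
	 * If the backing module does not support SPDK_BDEV_IO_TYPE_COPY,
	 * spdk_bdev_copy_blocks() falls back to the read/write emulation
	 * implemented above.
	 */
	return spdk_bdev_copy_blocks(desc, ch, 1024, 0, 16, example_copy_done, NULL);
}
#endif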