1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #include "spdk/bdev.h" 10 11 #include "spdk/accel.h" 12 #include "spdk/config.h" 13 #include "spdk/env.h" 14 #include "spdk/thread.h" 15 #include "spdk/likely.h" 16 #include "spdk/queue.h" 17 #include "spdk/nvme_spec.h" 18 #include "spdk/scsi_spec.h" 19 #include "spdk/notify.h" 20 #include "spdk/util.h" 21 #include "spdk/trace.h" 22 #include "spdk/dma.h" 23 24 #include "spdk/bdev_module.h" 25 #include "spdk/log.h" 26 #include "spdk/string.h" 27 28 #include "bdev_internal.h" 29 #include "spdk_internal/trace_defs.h" 30 #include "spdk_internal/assert.h" 31 32 #ifdef SPDK_CONFIG_VTUNE 33 #include "ittnotify.h" 34 #include "ittnotify_types.h" 35 int __itt_init_ittlib(const char *, __itt_group_id); 36 #endif 37 38 #define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1) 39 #define SPDK_BDEV_IO_CACHE_SIZE 256 40 #define SPDK_BDEV_AUTO_EXAMINE true 41 #define BUF_SMALL_CACHE_SIZE 128 42 #define BUF_LARGE_CACHE_SIZE 16 43 #define NOMEM_THRESHOLD_COUNT 8 44 45 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 46 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 47 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512 48 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000 49 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024) 50 #define SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC (UINT64_MAX / (1024 * 1024)) 51 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX 52 #define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000 53 54 /* The maximum number of children requests for a UNMAP or WRITE ZEROES command 55 * when splitting into children requests at a time. 56 */ 57 #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8) 58 #define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000 59 60 /* The maximum number of children requests for a COPY command 61 * when splitting into children requests at a time. 
62 */ 63 #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8) 64 65 #define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \ 66 log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev) 67 #ifdef DEBUG 68 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \ 69 log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev) 70 #else 71 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0) 72 #endif 73 74 static void log_already_claimed(enum spdk_log_level level, const int line, const char *func, 75 const char *detail, struct spdk_bdev *bdev); 76 77 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 78 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 79 }; 80 81 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 82 83 RB_HEAD(bdev_name_tree, spdk_bdev_name); 84 85 static int 86 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2) 87 { 88 return strcmp(name1->name, name2->name); 89 } 90 91 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp); 92 93 struct spdk_bdev_mgr { 94 struct spdk_mempool *bdev_io_pool; 95 96 void *zero_buffer; 97 98 TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 99 100 struct spdk_bdev_list bdevs; 101 struct bdev_name_tree bdev_names; 102 103 bool init_complete; 104 bool module_init_complete; 105 106 struct spdk_spinlock spinlock; 107 108 TAILQ_HEAD(, spdk_bdev_open_async_ctx) async_bdev_opens; 109 110 #ifdef SPDK_CONFIG_VTUNE 111 __itt_domain *domain; 112 #endif 113 }; 114 115 static struct spdk_bdev_mgr g_bdev_mgr = { 116 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 117 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 118 .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names), 119 .init_complete = false, 120 .module_init_complete = false, 121 .async_bdev_opens = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.async_bdev_opens), 122 }; 123 124 static void 125 __attribute__((constructor)) 126 _bdev_init(void) 127 { 128 spdk_spin_init(&g_bdev_mgr.spinlock); 129 } 130 131 typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status); 132 133 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc); 134 135 struct lba_range { 136 struct spdk_bdev *bdev; 137 uint64_t offset; 138 uint64_t length; 139 bool quiesce; 140 void *locked_ctx; 141 struct spdk_thread *owner_thread; 142 struct spdk_bdev_channel *owner_ch; 143 TAILQ_ENTRY(lba_range) tailq; 144 TAILQ_ENTRY(lba_range) tailq_module; 145 }; 146 147 static struct spdk_bdev_opts g_bdev_opts = { 148 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 149 .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, 150 .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, 151 .iobuf_small_cache_size = BUF_SMALL_CACHE_SIZE, 152 .iobuf_large_cache_size = BUF_LARGE_CACHE_SIZE, 153 }; 154 155 static spdk_bdev_init_cb g_init_cb_fn = NULL; 156 static void *g_init_cb_arg = NULL; 157 158 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 159 static void *g_fini_cb_arg = NULL; 160 static struct spdk_thread *g_fini_thread = NULL; 161 162 struct spdk_bdev_qos_limit { 163 /** IOs or bytes allowed per second (i.e., 1s). */ 164 uint64_t limit; 165 166 /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). 167 * For remaining bytes, allowed to run negative if an I/O is submitted when 168 * some bytes are remaining, but the I/O is bigger than that amount. The 169 * excess will be deducted from the next timeslice. 170 */ 171 int64_t remaining_this_timeslice; 172 173 /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). 
*/ 174 uint32_t min_per_timeslice; 175 176 /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 177 uint32_t max_per_timeslice; 178 179 /** Function to check whether to queue the IO. 180 * If The IO is allowed to pass, the quota will be reduced correspondingly. 181 */ 182 bool (*queue_io)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 183 184 /** Function to rewind the quota once the IO was allowed to be sent by this 185 * limit but queued due to one of the further limits. 186 */ 187 void (*rewind_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 188 }; 189 190 struct spdk_bdev_qos { 191 /** Types of structure of rate limits. */ 192 struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 193 194 /** The channel that all I/O are funneled through. */ 195 struct spdk_bdev_channel *ch; 196 197 /** The thread on which the poller is running. */ 198 struct spdk_thread *thread; 199 200 /** Size of a timeslice in tsc ticks. */ 201 uint64_t timeslice_size; 202 203 /** Timestamp of start of last timeslice. */ 204 uint64_t last_timeslice; 205 206 /** Poller that processes queued I/O commands each time slice. */ 207 struct spdk_poller *poller; 208 }; 209 210 struct spdk_bdev_mgmt_channel { 211 /* 212 * Each thread keeps a cache of bdev_io - this allows 213 * bdev threads which are *not* DPDK threads to still 214 * benefit from a per-thread bdev_io cache. Without 215 * this, non-DPDK threads fetching from the mempool 216 * incur a cmpxchg on get and put. 217 */ 218 bdev_io_stailq_t per_thread_cache; 219 uint32_t per_thread_cache_count; 220 uint32_t bdev_io_cache_size; 221 222 struct spdk_iobuf_channel iobuf; 223 224 TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; 225 TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; 226 }; 227 228 /* 229 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device 230 * will queue here their IO that awaits retry. It makes it possible to retry sending 231 * IO to one bdev after IO from other bdev completes. 232 */ 233 struct spdk_bdev_shared_resource { 234 /* The bdev management channel */ 235 struct spdk_bdev_mgmt_channel *mgmt_ch; 236 237 /* 238 * Count of I/O submitted to bdev module and waiting for completion. 239 * Incremented before submit_request() is called on an spdk_bdev_io. 240 */ 241 uint64_t io_outstanding; 242 243 /* 244 * Queue of IO awaiting retry because of a previous NOMEM status returned 245 * on this channel. 246 */ 247 bdev_io_tailq_t nomem_io; 248 249 /* 250 * Threshold which io_outstanding must drop to before retrying nomem_io. 251 */ 252 uint64_t nomem_threshold; 253 254 /* I/O channel allocated by a bdev module */ 255 struct spdk_io_channel *shared_ch; 256 257 struct spdk_poller *nomem_poller; 258 259 /* Refcount of bdev channels using this resource */ 260 uint32_t ref; 261 262 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 263 }; 264 265 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 266 #define BDEV_CH_QOS_ENABLED (1 << 1) 267 268 struct spdk_bdev_channel { 269 struct spdk_bdev *bdev; 270 271 /* The channel for the underlying device */ 272 struct spdk_io_channel *channel; 273 274 /* Accel channel */ 275 struct spdk_io_channel *accel_channel; 276 277 /* Per io_device per thread data */ 278 struct spdk_bdev_shared_resource *shared_resource; 279 280 struct spdk_bdev_io_stat *stat; 281 282 /* 283 * Count of I/O submitted to the underlying dev module through this channel 284 * and waiting for completion. 
285 */ 286 uint64_t io_outstanding; 287 288 /* 289 * List of all submitted I/Os including I/O that are generated via splitting. 290 */ 291 bdev_io_tailq_t io_submitted; 292 293 /* 294 * List of spdk_bdev_io that are currently queued because they write to a locked 295 * LBA range. 296 */ 297 bdev_io_tailq_t io_locked; 298 299 /* List of I/Os with accel sequence being currently executed */ 300 bdev_io_tailq_t io_accel_exec; 301 302 /* List of I/Os doing memory domain pull/push */ 303 bdev_io_tailq_t io_memory_domain; 304 305 uint32_t flags; 306 307 uint16_t trace_id; 308 309 struct spdk_histogram_data *histogram; 310 311 #ifdef SPDK_CONFIG_VTUNE 312 uint64_t start_tsc; 313 uint64_t interval_tsc; 314 __itt_string_handle *handle; 315 struct spdk_bdev_io_stat *prev_stat; 316 #endif 317 318 bdev_io_tailq_t queued_resets; 319 320 lba_range_tailq_t locked_ranges; 321 322 /** List of I/Os queued by QoS. */ 323 bdev_io_tailq_t qos_queued_io; 324 }; 325 326 struct media_event_entry { 327 struct spdk_bdev_media_event event; 328 TAILQ_ENTRY(media_event_entry) tailq; 329 }; 330 331 #define MEDIA_EVENT_POOL_SIZE 64 332 333 struct spdk_bdev_desc { 334 struct spdk_bdev *bdev; 335 struct spdk_thread *thread; 336 struct { 337 spdk_bdev_event_cb_t event_fn; 338 void *ctx; 339 } callback; 340 bool closed; 341 bool write; 342 bool memory_domains_supported; 343 bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES]; 344 struct spdk_spinlock spinlock; 345 uint32_t refs; 346 TAILQ_HEAD(, media_event_entry) pending_media_events; 347 TAILQ_HEAD(, media_event_entry) free_media_events; 348 struct media_event_entry *media_events_buffer; 349 TAILQ_ENTRY(spdk_bdev_desc) link; 350 351 uint64_t timeout_in_sec; 352 spdk_bdev_io_timeout_cb cb_fn; 353 void *cb_arg; 354 struct spdk_poller *io_timeout_poller; 355 struct spdk_bdev_module_claim *claim; 356 }; 357 358 struct spdk_bdev_iostat_ctx { 359 struct spdk_bdev_io_stat *stat; 360 spdk_bdev_get_device_stat_cb cb; 361 void *cb_arg; 362 }; 363 364 struct set_qos_limit_ctx { 365 void (*cb_fn)(void *cb_arg, int status); 366 void *cb_arg; 367 struct spdk_bdev *bdev; 368 }; 369 370 struct spdk_bdev_channel_iter { 371 spdk_bdev_for_each_channel_msg fn; 372 spdk_bdev_for_each_channel_done cpl; 373 struct spdk_io_channel_iter *i; 374 void *ctx; 375 }; 376 377 struct spdk_bdev_io_error_stat { 378 uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS]; 379 }; 380 381 enum bdev_io_retry_state { 382 BDEV_IO_RETRY_STATE_INVALID, 383 BDEV_IO_RETRY_STATE_PULL, 384 BDEV_IO_RETRY_STATE_PULL_MD, 385 BDEV_IO_RETRY_STATE_SUBMIT, 386 BDEV_IO_RETRY_STATE_PUSH, 387 BDEV_IO_RETRY_STATE_PUSH_MD, 388 }; 389 390 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 391 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 392 #define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch)) 393 #define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch)) 394 395 static inline void bdev_io_complete(void *ctx); 396 static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io); 397 static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io); 398 static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io); 399 400 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 401 static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io); 402 403 static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 404 struct 
spdk_io_channel *ch, void *_ctx); 405 static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status); 406 407 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 408 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 409 uint64_t num_blocks, 410 struct spdk_memory_domain *domain, void *domain_ctx, 411 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 412 spdk_bdev_io_completion_cb cb, void *cb_arg); 413 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 414 struct iovec *iov, int iovcnt, void *md_buf, 415 uint64_t offset_blocks, uint64_t num_blocks, 416 struct spdk_memory_domain *domain, void *domain_ctx, 417 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 418 uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw, 419 spdk_bdev_io_completion_cb cb, void *cb_arg); 420 421 static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 422 uint64_t offset, uint64_t length, 423 lock_range_cb cb_fn, void *cb_arg); 424 425 static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 426 uint64_t offset, uint64_t length, 427 lock_range_cb cb_fn, void *cb_arg); 428 429 static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort); 430 static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort); 431 432 static bool claim_type_is_v2(enum spdk_bdev_claim_type type); 433 static void bdev_desc_release_claims(struct spdk_bdev_desc *desc); 434 static void claim_reset(struct spdk_bdev *bdev); 435 436 static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch); 437 438 #define bdev_get_ext_io_opt(opts, field, defval) \ 439 ((opts) != NULL ? SPDK_GET_FIELD(opts, field, defval) : (defval)) 440 441 void 442 spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size) 443 { 444 if (!opts) { 445 SPDK_ERRLOG("opts should not be NULL\n"); 446 return; 447 } 448 449 if (!opts_size) { 450 SPDK_ERRLOG("opts_size should not be zero value\n"); 451 return; 452 } 453 454 opts->opts_size = opts_size; 455 456 #define SET_FIELD(field) \ 457 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \ 458 opts->field = g_bdev_opts.field; \ 459 } \ 460 461 SET_FIELD(bdev_io_pool_size); 462 SET_FIELD(bdev_io_cache_size); 463 SET_FIELD(bdev_auto_examine); 464 SET_FIELD(iobuf_small_cache_size); 465 SET_FIELD(iobuf_large_cache_size); 466 467 /* Do not remove this statement, you should always update this statement when you adding a new field, 468 * and do not forget to add the SET_FIELD statement for your added field. */ 469 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size"); 470 471 #undef SET_FIELD 472 } 473 474 int 475 spdk_bdev_set_opts(struct spdk_bdev_opts *opts) 476 { 477 uint32_t min_pool_size; 478 479 if (!opts) { 480 SPDK_ERRLOG("opts cannot be NULL\n"); 481 return -1; 482 } 483 484 if (!opts->opts_size) { 485 SPDK_ERRLOG("opts_size inside opts cannot be zero value\n"); 486 return -1; 487 } 488 489 /* 490 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem 491 * initialization. A second mgmt_ch will be created on the same thread when the application starts 492 * but before the deferred put_io_channel event is executed for the first mgmt_ch. 
493 */ 494 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 495 if (opts->bdev_io_pool_size < min_pool_size) { 496 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 497 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 498 spdk_thread_get_count()); 499 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 500 return -1; 501 } 502 503 #define SET_FIELD(field) \ 504 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 505 g_bdev_opts.field = opts->field; \ 506 } \ 507 508 SET_FIELD(bdev_io_pool_size); 509 SET_FIELD(bdev_io_cache_size); 510 SET_FIELD(bdev_auto_examine); 511 SET_FIELD(iobuf_small_cache_size); 512 SET_FIELD(iobuf_large_cache_size); 513 514 g_bdev_opts.opts_size = opts->opts_size; 515 516 #undef SET_FIELD 517 518 return 0; 519 } 520 521 static struct spdk_bdev * 522 bdev_get_by_name(const char *bdev_name) 523 { 524 struct spdk_bdev_name find; 525 struct spdk_bdev_name *res; 526 527 find.name = (char *)bdev_name; 528 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 529 if (res != NULL) { 530 return res->bdev; 531 } 532 533 return NULL; 534 } 535 536 struct spdk_bdev * 537 spdk_bdev_get_by_name(const char *bdev_name) 538 { 539 struct spdk_bdev *bdev; 540 541 spdk_spin_lock(&g_bdev_mgr.spinlock); 542 bdev = bdev_get_by_name(bdev_name); 543 spdk_spin_unlock(&g_bdev_mgr.spinlock); 544 545 return bdev; 546 } 547 548 struct bdev_io_status_string { 549 enum spdk_bdev_io_status status; 550 const char *str; 551 }; 552 553 static const struct bdev_io_status_string bdev_io_status_strings[] = { 554 { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" }, 555 { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" }, 556 { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" }, 557 { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" }, 558 { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" }, 559 { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" }, 560 { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" }, 561 { SPDK_BDEV_IO_STATUS_FAILED, "failed" }, 562 { SPDK_BDEV_IO_STATUS_PENDING, "pending" }, 563 { SPDK_BDEV_IO_STATUS_SUCCESS, "success" }, 564 }; 565 566 static const char * 567 bdev_io_status_get_string(enum spdk_bdev_io_status status) 568 { 569 uint32_t i; 570 571 for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) { 572 if (bdev_io_status_strings[i].status == status) { 573 return bdev_io_status_strings[i].str; 574 } 575 } 576 577 return "reserved"; 578 } 579 580 struct spdk_bdev_wait_for_examine_ctx { 581 struct spdk_poller *poller; 582 spdk_bdev_wait_for_examine_cb cb_fn; 583 void *cb_arg; 584 }; 585 586 static bool bdev_module_all_actions_completed(void); 587 588 static int 589 bdev_wait_for_examine_cb(void *arg) 590 { 591 struct spdk_bdev_wait_for_examine_ctx *ctx = arg; 592 593 if (!bdev_module_all_actions_completed()) { 594 return SPDK_POLLER_IDLE; 595 } 596 597 spdk_poller_unregister(&ctx->poller); 598 ctx->cb_fn(ctx->cb_arg); 599 free(ctx); 600 601 return SPDK_POLLER_BUSY; 602 } 603 604 int 605 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 606 { 607 struct spdk_bdev_wait_for_examine_ctx *ctx; 608 609 ctx = calloc(1, sizeof(*ctx)); 610 if (ctx == NULL) { 611 return -ENOMEM; 612 } 613 ctx->cb_fn = cb_fn; 614 ctx->cb_arg = cb_arg; 615 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 616 617 return 0; 618 } 619 620 struct spdk_bdev_examine_item { 621 char *name; 622 
TAILQ_ENTRY(spdk_bdev_examine_item) link; 623 }; 624 625 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); 626 627 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 628 g_bdev_examine_allowlist); 629 630 static inline bool 631 bdev_examine_allowlist_check(const char *name) 632 { 633 struct spdk_bdev_examine_item *item; 634 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 635 if (strcmp(name, item->name) == 0) { 636 return true; 637 } 638 } 639 return false; 640 } 641 642 static inline void 643 bdev_examine_allowlist_free(void) 644 { 645 struct spdk_bdev_examine_item *item; 646 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 647 item = TAILQ_FIRST(&g_bdev_examine_allowlist); 648 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 649 free(item->name); 650 free(item); 651 } 652 } 653 654 static inline bool 655 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 656 { 657 struct spdk_bdev_alias *tmp; 658 if (bdev_examine_allowlist_check(bdev->name)) { 659 return true; 660 } 661 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 662 if (bdev_examine_allowlist_check(tmp->alias.name)) { 663 return true; 664 } 665 } 666 return false; 667 } 668 669 static inline bool 670 bdev_ok_to_examine(struct spdk_bdev *bdev) 671 { 672 if (g_bdev_opts.bdev_auto_examine) { 673 return true; 674 } else { 675 return bdev_in_examine_allowlist(bdev); 676 } 677 } 678 679 static void 680 bdev_examine(struct spdk_bdev *bdev) 681 { 682 struct spdk_bdev_module *module; 683 struct spdk_bdev_module_claim *claim, *tmpclaim; 684 uint32_t action; 685 686 if (!bdev_ok_to_examine(bdev)) { 687 return; 688 } 689 690 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 691 if (module->examine_config) { 692 spdk_spin_lock(&module->internal.spinlock); 693 action = module->internal.action_in_progress; 694 module->internal.action_in_progress++; 695 spdk_spin_unlock(&module->internal.spinlock); 696 module->examine_config(bdev); 697 if (action != module->internal.action_in_progress) { 698 SPDK_ERRLOG("examine_config for module %s did not call " 699 "spdk_bdev_module_examine_done()\n", module->name); 700 } 701 } 702 } 703 704 spdk_spin_lock(&bdev->internal.spinlock); 705 706 switch (bdev->internal.claim_type) { 707 case SPDK_BDEV_CLAIM_NONE: 708 /* Examine by all bdev modules */ 709 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 710 if (module->examine_disk) { 711 spdk_spin_lock(&module->internal.spinlock); 712 module->internal.action_in_progress++; 713 spdk_spin_unlock(&module->internal.spinlock); 714 spdk_spin_unlock(&bdev->internal.spinlock); 715 module->examine_disk(bdev); 716 spdk_spin_lock(&bdev->internal.spinlock); 717 } 718 } 719 break; 720 case SPDK_BDEV_CLAIM_EXCL_WRITE: 721 /* Examine by the one bdev module with a v1 claim */ 722 module = bdev->internal.claim.v1.module; 723 if (module->examine_disk) { 724 spdk_spin_lock(&module->internal.spinlock); 725 module->internal.action_in_progress++; 726 spdk_spin_unlock(&module->internal.spinlock); 727 spdk_spin_unlock(&bdev->internal.spinlock); 728 module->examine_disk(bdev); 729 return; 730 } 731 break; 732 default: 733 /* Examine by all bdev modules with a v2 claim */ 734 assert(claim_type_is_v2(bdev->internal.claim_type)); 735 /* 736 * Removal of tailq nodes while iterating can cause the iteration to jump out of the 737 * list, perhaps accessing freed memory. Without protection, this could happen 738 * while the lock is dropped during the examine callback. 
739 */ 740 bdev->internal.examine_in_progress++; 741 742 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 743 module = claim->module; 744 745 if (module == NULL) { 746 /* This is a vestigial claim, held by examine_count */ 747 continue; 748 } 749 750 if (module->examine_disk == NULL) { 751 continue; 752 } 753 754 spdk_spin_lock(&module->internal.spinlock); 755 module->internal.action_in_progress++; 756 spdk_spin_unlock(&module->internal.spinlock); 757 758 /* Call examine_disk without holding internal.spinlock. */ 759 spdk_spin_unlock(&bdev->internal.spinlock); 760 module->examine_disk(bdev); 761 spdk_spin_lock(&bdev->internal.spinlock); 762 } 763 764 assert(bdev->internal.examine_in_progress > 0); 765 bdev->internal.examine_in_progress--; 766 if (bdev->internal.examine_in_progress == 0) { 767 /* Remove any claims that were released during examine_disk */ 768 TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) { 769 if (claim->desc != NULL) { 770 continue; 771 } 772 773 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link); 774 free(claim); 775 } 776 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 777 claim_reset(bdev); 778 } 779 } 780 } 781 782 spdk_spin_unlock(&bdev->internal.spinlock); 783 } 784 785 int 786 spdk_bdev_examine(const char *name) 787 { 788 struct spdk_bdev *bdev; 789 struct spdk_bdev_examine_item *item; 790 struct spdk_thread *thread = spdk_get_thread(); 791 792 if (spdk_unlikely(!spdk_thread_is_app_thread(thread))) { 793 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread, 794 thread ? spdk_thread_get_name(thread) : "null"); 795 return -EINVAL; 796 } 797 798 if (g_bdev_opts.bdev_auto_examine) { 799 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); 800 return -EINVAL; 801 } 802 803 if (bdev_examine_allowlist_check(name)) { 804 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 805 return -EEXIST; 806 } 807 808 item = calloc(1, sizeof(*item)); 809 if (!item) { 810 return -ENOMEM; 811 } 812 item->name = strdup(name); 813 if (!item->name) { 814 free(item); 815 return -ENOMEM; 816 } 817 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 818 819 bdev = spdk_bdev_get_by_name(name); 820 if (bdev) { 821 bdev_examine(bdev); 822 } 823 return 0; 824 } 825 826 static inline void 827 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 828 { 829 struct spdk_bdev_examine_item *item; 830 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 831 spdk_json_write_object_begin(w); 832 spdk_json_write_named_string(w, "method", "bdev_examine"); 833 spdk_json_write_named_object_begin(w, "params"); 834 spdk_json_write_named_string(w, "name", item->name); 835 spdk_json_write_object_end(w); 836 spdk_json_write_object_end(w); 837 } 838 } 839 840 struct spdk_bdev * 841 spdk_bdev_first(void) 842 { 843 struct spdk_bdev *bdev; 844 845 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 846 if (bdev) { 847 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 848 } 849 850 return bdev; 851 } 852 853 struct spdk_bdev * 854 spdk_bdev_next(struct spdk_bdev *prev) 855 { 856 struct spdk_bdev *bdev; 857 858 bdev = TAILQ_NEXT(prev, internal.link); 859 if (bdev) { 860 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 861 } 862 863 return bdev; 864 } 865 866 static struct spdk_bdev * 867 _bdev_next_leaf(struct spdk_bdev *bdev) 868 { 869 while (bdev != NULL) { 870 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 871 return bdev; 872 } else { 873 bdev = 
TAILQ_NEXT(bdev, internal.link); 874 } 875 } 876 877 return bdev; 878 } 879 880 struct spdk_bdev * 881 spdk_bdev_first_leaf(void) 882 { 883 struct spdk_bdev *bdev; 884 885 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 886 887 if (bdev) { 888 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 889 } 890 891 return bdev; 892 } 893 894 struct spdk_bdev * 895 spdk_bdev_next_leaf(struct spdk_bdev *prev) 896 { 897 struct spdk_bdev *bdev; 898 899 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 900 901 if (bdev) { 902 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 903 } 904 905 return bdev; 906 } 907 908 static inline bool 909 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 910 { 911 return bdev_io->internal.memory_domain; 912 } 913 914 static inline bool 915 bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io) 916 { 917 return bdev_io->internal.has_accel_sequence; 918 } 919 920 static inline void 921 bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource, 922 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 923 { 924 /* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io. 925 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth 926 * channels we will instead wait for half to complete. 927 */ 928 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 929 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 930 931 assert(state != BDEV_IO_RETRY_STATE_INVALID); 932 bdev_io->internal.retry_state = state; 933 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 934 } 935 936 static inline void 937 bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource, 938 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 939 { 940 /* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while 941 * the queue isn't empty, so we don't need to update the nomem_threshold here */ 942 assert(!TAILQ_EMPTY(&shared_resource->nomem_io)); 943 944 assert(state != BDEV_IO_RETRY_STATE_INVALID); 945 bdev_io->internal.retry_state = state; 946 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 947 } 948 949 void 950 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 951 { 952 struct iovec *iovs; 953 954 if (bdev_io->u.bdev.iovs == NULL) { 955 bdev_io->u.bdev.iovs = &bdev_io->iov; 956 bdev_io->u.bdev.iovcnt = 1; 957 } 958 959 iovs = bdev_io->u.bdev.iovs; 960 961 assert(iovs != NULL); 962 assert(bdev_io->u.bdev.iovcnt >= 1); 963 964 iovs[0].iov_base = buf; 965 iovs[0].iov_len = len; 966 } 967 968 void 969 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 970 { 971 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 972 bdev_io->u.bdev.md_buf = md_buf; 973 } 974 975 static bool 976 _is_buf_allocated(const struct iovec *iovs) 977 { 978 if (iovs == NULL) { 979 return false; 980 } 981 982 return iovs[0].iov_base != NULL; 983 } 984 985 static bool 986 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 987 { 988 int i; 989 uintptr_t iov_base; 990 991 if (spdk_likely(alignment == 1)) { 992 return true; 993 } 994 995 for (i = 0; i < iovcnt; i++) { 996 iov_base = (uintptr_t)iovs[i].iov_base; 997 if ((iov_base & (alignment - 1)) != 0) { 998 return false; 999 } 1000 } 1001 1002 return true; 1003 } 1004 1005 static inline bool 
1006 bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 1007 { 1008 if (!bdev_io->internal.accel_sequence) { 1009 return false; 1010 } 1011 1012 /* For now, we don't allow splitting IOs with an accel sequence and will treat them as if 1013 * bdev module didn't support accel sequences */ 1014 return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.split; 1015 } 1016 1017 static inline void 1018 bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch, 1019 struct spdk_bdev_shared_resource *shared_resource) 1020 { 1021 bdev_ch->io_outstanding++; 1022 shared_resource->io_outstanding++; 1023 } 1024 1025 static inline void 1026 bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1027 struct spdk_bdev_shared_resource *shared_resource) 1028 { 1029 assert(bdev_ch->io_outstanding > 0); 1030 assert(shared_resource->io_outstanding > 0); 1031 bdev_ch->io_outstanding--; 1032 shared_resource->io_outstanding--; 1033 } 1034 1035 static void 1036 bdev_io_submit_sequence_cb(void *ctx, int status) 1037 { 1038 struct spdk_bdev_io *bdev_io = ctx; 1039 1040 bdev_io->u.bdev.accel_sequence = NULL; 1041 bdev_io->internal.accel_sequence = NULL; 1042 1043 if (spdk_unlikely(status != 0)) { 1044 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 1045 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1046 bdev_io_complete_unsubmitted(bdev_io); 1047 return; 1048 } 1049 1050 bdev_io_submit(bdev_io); 1051 } 1052 1053 static void 1054 bdev_io_exec_sequence_cb(void *ctx, int status) 1055 { 1056 struct spdk_bdev_io *bdev_io = ctx; 1057 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1058 1059 TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1060 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1061 1062 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1063 bdev_ch_retry_io(ch); 1064 } 1065 1066 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1067 } 1068 1069 static void 1070 bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status)) 1071 { 1072 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1073 1074 assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1075 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1076 1077 /* Since the operations are appended during submission, they're in the opposite order than 1078 * how we want to execute them for reads (i.e. we need to execute the most recently added 1079 * operation first), so reverse the sequence before executing it. 
1080 */ 1081 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1082 spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence); 1083 } 1084 1085 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1086 bdev_io_increment_outstanding(ch, ch->shared_resource); 1087 bdev_io->internal.data_transfer_cpl = cb_fn; 1088 1089 spdk_accel_sequence_finish(bdev_io->internal.accel_sequence, 1090 bdev_io_exec_sequence_cb, bdev_io); 1091 } 1092 1093 static void 1094 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 1095 { 1096 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 1097 void *buf; 1098 1099 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1100 buf = bdev_io->internal.buf; 1101 bdev_io->internal.buf = NULL; 1102 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 1103 bdev_io->internal.get_aux_buf_cb = NULL; 1104 } else { 1105 assert(bdev_io->internal.get_buf_cb != NULL); 1106 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 1107 bdev_io->internal.get_buf_cb = NULL; 1108 } 1109 } 1110 1111 static void 1112 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 1113 { 1114 struct spdk_bdev_io *bdev_io = ctx; 1115 1116 if (rc) { 1117 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 1118 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1119 } 1120 bdev_io_get_buf_complete(bdev_io, !rc); 1121 } 1122 1123 static void 1124 bdev_io_pull_md_buf_done(void *ctx, int status) 1125 { 1126 struct spdk_bdev_io *bdev_io = ctx; 1127 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1128 1129 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1130 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1131 1132 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1133 bdev_ch_retry_io(ch); 1134 } 1135 1136 assert(bdev_io->internal.data_transfer_cpl); 1137 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1138 } 1139 1140 static void 1141 bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io) 1142 { 1143 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1144 int rc = 0; 1145 1146 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1147 if (bdev_io_use_memory_domain(bdev_io)) { 1148 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1149 bdev_io_increment_outstanding(ch, ch->shared_resource); 1150 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1151 bdev_io->internal.memory_domain_ctx, 1152 &bdev_io->internal.orig_md_iov, 1, 1153 &bdev_io->internal.bounce_md_iov, 1, 1154 bdev_io_pull_md_buf_done, bdev_io); 1155 if (rc == 0) { 1156 /* Continue to submit IO in completion callback */ 1157 return; 1158 } 1159 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1160 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1161 if (rc != -ENOMEM) { 1162 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 1163 spdk_memory_domain_get_dma_device_id( 1164 bdev_io->internal.memory_domain), rc); 1165 } 1166 } else { 1167 memcpy(bdev_io->internal.bounce_md_iov.iov_base, 1168 bdev_io->internal.orig_md_iov.iov_base, 1169 bdev_io->internal.orig_md_iov.iov_len); 1170 } 1171 } 1172 1173 if (spdk_unlikely(rc == -ENOMEM)) { 1174 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD); 1175 } else { 1176 assert(bdev_io->internal.data_transfer_cpl); 1177 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1178 } 1179 } 1180 1181 static void 1182 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 1183 { 1184 /* save 
original md_buf */ 1185 bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 1186 bdev_io->internal.orig_md_iov.iov_len = len; 1187 bdev_io->internal.bounce_md_iov.iov_base = md_buf; 1188 bdev_io->internal.bounce_md_iov.iov_len = len; 1189 /* set bounce md_buf */ 1190 bdev_io->u.bdev.md_buf = md_buf; 1191 1192 bdev_io_pull_md_buf(bdev_io); 1193 } 1194 1195 static void 1196 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 1197 { 1198 struct spdk_bdev *bdev = bdev_io->bdev; 1199 uint64_t md_len; 1200 void *buf; 1201 1202 if (spdk_bdev_is_md_separate(bdev)) { 1203 assert(!bdev_io_use_accel_sequence(bdev_io)); 1204 1205 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 1206 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 1207 1208 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 1209 1210 if (bdev_io->u.bdev.md_buf != NULL) { 1211 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 1212 return; 1213 } else { 1214 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 1215 } 1216 } 1217 1218 bdev_io_get_buf_complete(bdev_io, true); 1219 } 1220 1221 static inline void 1222 bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc) 1223 { 1224 if (rc) { 1225 SPDK_ERRLOG("Failed to get data buffer\n"); 1226 assert(bdev_io->internal.data_transfer_cpl); 1227 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1228 return; 1229 } 1230 1231 _bdev_io_set_md_buf(bdev_io); 1232 } 1233 1234 static void 1235 bdev_io_pull_data_done_and_track(void *ctx, int status) 1236 { 1237 struct spdk_bdev_io *bdev_io = ctx; 1238 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1239 1240 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1241 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1242 1243 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1244 bdev_ch_retry_io(ch); 1245 } 1246 1247 bdev_io_pull_data_done(bdev_io, status); 1248 } 1249 1250 static void 1251 bdev_io_pull_data(struct spdk_bdev_io *bdev_io) 1252 { 1253 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1254 int rc = 0; 1255 1256 /* If we need to exec an accel sequence or the IO uses a memory domain buffer and has a 1257 * sequence, append a copy operation making accel change the src/dst buffers of the previous 1258 * operation */ 1259 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io) || 1260 (bdev_io_use_accel_sequence(bdev_io) && bdev_io_use_memory_domain(bdev_io))) { 1261 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1262 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1263 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1264 NULL, NULL, 1265 bdev_io->internal.orig_iovs, 1266 bdev_io->internal.orig_iovcnt, 1267 bdev_io->internal.memory_domain, 1268 bdev_io->internal.memory_domain_ctx, 1269 NULL, NULL); 1270 } else { 1271 /* We need to reverse the src/dst for reads */ 1272 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1273 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1274 bdev_io->internal.orig_iovs, 1275 bdev_io->internal.orig_iovcnt, 1276 bdev_io->internal.memory_domain, 1277 bdev_io->internal.memory_domain_ctx, 1278 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1279 NULL, NULL, NULL, NULL); 1280 } 1281 1282 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 1283 SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n", 1284 bdev_io->internal.accel_sequence); 1285 } 1286 } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1287 /* if this is write path, copy 
data from original buffer to bounce buffer */ 1288 if (bdev_io_use_memory_domain(bdev_io)) { 1289 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1290 bdev_io_increment_outstanding(ch, ch->shared_resource); 1291 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1292 bdev_io->internal.memory_domain_ctx, 1293 bdev_io->internal.orig_iovs, 1294 (uint32_t) bdev_io->internal.orig_iovcnt, 1295 bdev_io->u.bdev.iovs, 1, 1296 bdev_io_pull_data_done_and_track, 1297 bdev_io); 1298 if (rc == 0) { 1299 /* Continue to submit IO in completion callback */ 1300 return; 1301 } 1302 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1303 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1304 if (rc != -ENOMEM) { 1305 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 1306 spdk_memory_domain_get_dma_device_id( 1307 bdev_io->internal.memory_domain)); 1308 } 1309 } else { 1310 assert(bdev_io->u.bdev.iovcnt == 1); 1311 spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base, 1312 bdev_io->u.bdev.iovs[0].iov_len, 1313 bdev_io->internal.orig_iovs, 1314 bdev_io->internal.orig_iovcnt); 1315 } 1316 } 1317 1318 if (spdk_unlikely(rc == -ENOMEM)) { 1319 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1320 } else { 1321 bdev_io_pull_data_done(bdev_io, rc); 1322 } 1323 } 1324 1325 static void 1326 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 1327 bdev_copy_bounce_buffer_cpl cpl_cb) 1328 { 1329 struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource; 1330 1331 bdev_io->internal.data_transfer_cpl = cpl_cb; 1332 /* save original iovec */ 1333 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 1334 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 1335 /* set bounce iov */ 1336 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 1337 bdev_io->u.bdev.iovcnt = 1; 1338 /* set bounce buffer for this operation */ 1339 bdev_io->u.bdev.iovs[0].iov_base = buf; 1340 bdev_io->u.bdev.iovs[0].iov_len = len; 1341 1342 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1343 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1344 } else { 1345 bdev_io_pull_data(bdev_io); 1346 } 1347 } 1348 1349 static void 1350 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) 1351 { 1352 struct spdk_bdev *bdev = bdev_io->bdev; 1353 bool buf_allocated; 1354 uint64_t alignment; 1355 void *aligned_buf; 1356 1357 bdev_io->internal.buf = buf; 1358 1359 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1360 bdev_io_get_buf_complete(bdev_io, true); 1361 return; 1362 } 1363 1364 alignment = spdk_bdev_get_buf_align(bdev); 1365 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 1366 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 1367 1368 if (buf_allocated) { 1369 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 1370 /* Continue in completion callback */ 1371 return; 1372 } else { 1373 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 1374 } 1375 1376 _bdev_io_set_md_buf(bdev_io); 1377 } 1378 1379 static inline uint64_t 1380 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len) 1381 { 1382 struct spdk_bdev *bdev = bdev_io->bdev; 1383 uint64_t md_len, alignment; 1384 1385 md_len = spdk_bdev_is_md_separate(bdev) ? 
bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1386 1387 /* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */ 1388 alignment = spdk_bdev_get_buf_align(bdev) - 1; 1389 1390 return len + alignment + md_len; 1391 } 1392 1393 static void 1394 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 1395 { 1396 struct spdk_bdev_mgmt_channel *ch; 1397 1398 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1399 spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len)); 1400 } 1401 1402 static void 1403 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 1404 { 1405 assert(bdev_io->internal.buf != NULL); 1406 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len); 1407 bdev_io->internal.buf = NULL; 1408 } 1409 1410 void 1411 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1412 { 1413 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1414 1415 assert(buf != NULL); 1416 _bdev_io_put_buf(bdev_io, buf, len); 1417 } 1418 1419 static inline void 1420 bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch, 1421 struct spdk_bdev_io *bdev_io) 1422 { 1423 /* After a request is submitted to a bdev module, the ownership of an accel sequence 1424 * associated with that bdev_io is transferred to the bdev module. So, clear the internal 1425 * sequence pointer to make sure we won't touch it anymore. */ 1426 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || 1427 bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) { 1428 assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1429 bdev_io->internal.accel_sequence = NULL; 1430 } 1431 1432 bdev->fn_table->submit_request(ioch, bdev_io); 1433 } 1434 1435 static inline void 1436 bdev_ch_resubmit_io(struct spdk_bdev_shared_resource *shared_resource, struct spdk_bdev_io *bdev_io) 1437 { 1438 struct spdk_bdev *bdev = bdev_io->bdev; 1439 1440 bdev_io_increment_outstanding(bdev_io->internal.ch, shared_resource); 1441 bdev_io->internal.error.nvme.cdw0 = 0; 1442 bdev_io->num_retries++; 1443 bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 1444 } 1445 1446 static void 1447 bdev_shared_ch_retry_io(struct spdk_bdev_shared_resource *shared_resource) 1448 { 1449 struct spdk_bdev_io *bdev_io; 1450 1451 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 1452 /* 1453 * Allow some more I/O to complete before retrying the nomem_io queue. 1454 * Some drivers (such as nvme) cannot immediately take a new I/O in 1455 * the context of a completion, because the resources for the I/O are 1456 * not released until control returns to the bdev poller. Also, we 1457 * may require several small I/O to complete before a larger I/O 1458 * (that requires splitting) can be submitted. 
1459 */ 1460 return; 1461 } 1462 1463 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1464 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 1465 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 1466 1467 switch (bdev_io->internal.retry_state) { 1468 case BDEV_IO_RETRY_STATE_SUBMIT: 1469 bdev_ch_resubmit_io(shared_resource, bdev_io); 1470 break; 1471 case BDEV_IO_RETRY_STATE_PULL: 1472 bdev_io_pull_data(bdev_io); 1473 break; 1474 case BDEV_IO_RETRY_STATE_PULL_MD: 1475 bdev_io_pull_md_buf(bdev_io); 1476 break; 1477 case BDEV_IO_RETRY_STATE_PUSH: 1478 bdev_io_push_bounce_data(bdev_io); 1479 break; 1480 case BDEV_IO_RETRY_STATE_PUSH_MD: 1481 bdev_io_push_bounce_md_buf(bdev_io); 1482 break; 1483 default: 1484 assert(0 && "invalid retry state"); 1485 break; 1486 } 1487 1488 if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) { 1489 /* This IO completed again with NOMEM status, so break the loop and 1490 * don't try anymore. Note that a bdev_io that fails with NOMEM 1491 * always gets requeued at the front of the list, to maintain 1492 * ordering. 1493 */ 1494 break; 1495 } 1496 } 1497 } 1498 1499 static void 1500 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 1501 { 1502 bdev_shared_ch_retry_io(bdev_ch->shared_resource); 1503 } 1504 1505 static int 1506 bdev_no_mem_poller(void *ctx) 1507 { 1508 struct spdk_bdev_shared_resource *shared_resource = ctx; 1509 1510 spdk_poller_unregister(&shared_resource->nomem_poller); 1511 1512 if (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1513 bdev_shared_ch_retry_io(shared_resource); 1514 } 1515 /* the retry cb may re-register the poller so double check */ 1516 if (!TAILQ_EMPTY(&shared_resource->nomem_io) && 1517 shared_resource->io_outstanding == 0 && shared_resource->nomem_poller == NULL) { 1518 /* No IOs were submitted, try again */ 1519 shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource, 1520 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10); 1521 } 1522 1523 return SPDK_POLLER_BUSY; 1524 } 1525 1526 static inline bool 1527 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 1528 { 1529 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1530 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1531 1532 if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) { 1533 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1534 bdev_queue_nomem_io_head(shared_resource, bdev_io, state); 1535 1536 if (shared_resource->io_outstanding == 0 && !shared_resource->nomem_poller) { 1537 /* Special case when we have nomem IOs and no outstanding IOs which completions 1538 * could trigger retry of queued IOs 1539 * Any IOs submitted may trigger retry of queued IOs. This poller handles a case when no 1540 * new IOs submitted, e.g. qd==1 */ 1541 shared_resource->nomem_poller = SPDK_POLLER_REGISTER(bdev_no_mem_poller, shared_resource, 1542 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * 10); 1543 } 1544 /* If bdev module completed an I/O that has an accel sequence with NOMEM status, the 1545 * ownership of that sequence is transferred back to the bdev layer, so we need to 1546 * restore internal.accel_sequence to make sure that the sequence is handled 1547 * correctly in case the I/O is later aborted. 
*/ 1548 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 1549 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) { 1550 assert(bdev_io->internal.accel_sequence == NULL); 1551 bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence; 1552 } 1553 1554 return true; 1555 } 1556 1557 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1558 bdev_ch_retry_io(bdev_ch); 1559 } 1560 1561 return false; 1562 } 1563 1564 static void 1565 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1566 { 1567 struct spdk_bdev_io *bdev_io = ctx; 1568 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1569 1570 if (rc) { 1571 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1572 } 1573 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1574 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). 1575 */ 1576 bdev_io_put_buf(bdev_io); 1577 1578 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1579 bdev_ch_retry_io(ch); 1580 } 1581 1582 /* Continue with IO completion flow */ 1583 bdev_io_complete(bdev_io); 1584 } 1585 1586 static void 1587 bdev_io_push_bounce_md_buf_done(void *ctx, int rc) 1588 { 1589 struct spdk_bdev_io *bdev_io = ctx; 1590 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1591 1592 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1593 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1594 1595 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1596 bdev_ch_retry_io(ch); 1597 } 1598 1599 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1600 } 1601 1602 static inline void 1603 bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io) 1604 { 1605 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1606 int rc = 0; 1607 1608 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1609 /* do the same for metadata buffer */ 1610 if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) { 1611 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1612 1613 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1614 if (bdev_io_use_memory_domain(bdev_io)) { 1615 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1616 bdev_io_increment_outstanding(ch, ch->shared_resource); 1617 /* If memory domain is used then we need to call async push function */ 1618 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1619 bdev_io->internal.memory_domain_ctx, 1620 &bdev_io->internal.orig_md_iov, 1621 (uint32_t)bdev_io->internal.orig_iovcnt, 1622 &bdev_io->internal.bounce_md_iov, 1, 1623 bdev_io_push_bounce_md_buf_done, 1624 bdev_io); 1625 if (rc == 0) { 1626 /* Continue IO completion in async callback */ 1627 return; 1628 } 1629 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1630 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1631 if (rc != -ENOMEM) { 1632 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1633 spdk_memory_domain_get_dma_device_id( 1634 bdev_io->internal.memory_domain)); 1635 } 1636 } else { 1637 memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1638 bdev_io->internal.orig_md_iov.iov_len); 1639 } 1640 } 1641 } 1642 1643 if (spdk_unlikely(rc == -ENOMEM)) { 1644 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD); 1645 } else { 1646 assert(bdev_io->internal.data_transfer_cpl); 1647 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1648 } 1649 } 1650 1651 static inline void 1652 bdev_io_push_bounce_data_done(struct 
spdk_bdev_io *bdev_io, int rc) 1653 { 1654 assert(bdev_io->internal.data_transfer_cpl); 1655 if (rc) { 1656 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1657 return; 1658 } 1659 1660 /* set original buffer for this io */ 1661 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 1662 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 1663 /* disable bouncing buffer for this io */ 1664 bdev_io->internal.orig_iovcnt = 0; 1665 bdev_io->internal.orig_iovs = NULL; 1666 1667 bdev_io_push_bounce_md_buf(bdev_io); 1668 } 1669 1670 static void 1671 bdev_io_push_bounce_data_done_and_track(void *ctx, int status) 1672 { 1673 struct spdk_bdev_io *bdev_io = ctx; 1674 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1675 1676 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1677 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1678 1679 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1680 bdev_ch_retry_io(ch); 1681 } 1682 1683 bdev_io_push_bounce_data_done(bdev_io, status); 1684 } 1685 1686 static inline void 1687 bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io) 1688 { 1689 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1690 int rc = 0; 1691 1692 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1693 assert(!bdev_io_use_accel_sequence(bdev_io)); 1694 1695 /* if this is read path, copy data from bounce buffer to original buffer */ 1696 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1697 if (bdev_io_use_memory_domain(bdev_io)) { 1698 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1699 bdev_io_increment_outstanding(ch, ch->shared_resource); 1700 /* If memory domain is used then we need to call async push function */ 1701 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1702 bdev_io->internal.memory_domain_ctx, 1703 bdev_io->internal.orig_iovs, 1704 (uint32_t)bdev_io->internal.orig_iovcnt, 1705 &bdev_io->internal.bounce_iov, 1, 1706 bdev_io_push_bounce_data_done_and_track, 1707 bdev_io); 1708 if (rc == 0) { 1709 /* Continue IO completion in async callback */ 1710 return; 1711 } 1712 1713 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1714 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1715 if (rc != -ENOMEM) { 1716 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1717 spdk_memory_domain_get_dma_device_id( 1718 bdev_io->internal.memory_domain)); 1719 } 1720 } else { 1721 spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs, 1722 bdev_io->internal.orig_iovcnt, 1723 bdev_io->internal.bounce_iov.iov_base, 1724 bdev_io->internal.bounce_iov.iov_len); 1725 } 1726 } 1727 1728 if (spdk_unlikely(rc == -ENOMEM)) { 1729 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH); 1730 } else { 1731 bdev_io_push_bounce_data_done(bdev_io, rc); 1732 } 1733 } 1734 1735 static inline void 1736 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1737 { 1738 bdev_io->internal.data_transfer_cpl = cpl_cb; 1739 bdev_io_push_bounce_data(bdev_io); 1740 } 1741 1742 static void 1743 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1744 { 1745 struct spdk_bdev_io *bdev_io; 1746 1747 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1748 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len); 1749 } 1750 1751 static void 1752 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1753 { 1754 struct spdk_bdev_mgmt_channel *mgmt_ch; 1755 uint64_t max_len; 1756 void *buf; 1757 1758 
assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1759 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1760 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1761 1762 if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) { 1763 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1764 bdev_io_get_buf_complete(bdev_io, false); 1765 return; 1766 } 1767 1768 bdev_io->internal.buf_len = len; 1769 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1770 bdev_io_get_iobuf_cb); 1771 if (buf != NULL) { 1772 _bdev_io_set_buf(bdev_io, buf, len); 1773 } 1774 } 1775 1776 void 1777 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1778 { 1779 struct spdk_bdev *bdev = bdev_io->bdev; 1780 uint64_t alignment; 1781 1782 assert(cb != NULL); 1783 bdev_io->internal.get_buf_cb = cb; 1784 1785 alignment = spdk_bdev_get_buf_align(bdev); 1786 1787 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1788 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1789 /* Buffer already present and aligned */ 1790 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1791 return; 1792 } 1793 1794 bdev_io_get_buf(bdev_io, len); 1795 } 1796 1797 static void 1798 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1799 bool success) 1800 { 1801 if (!success) { 1802 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1803 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1804 bdev_io_complete_unsubmitted(bdev_io); 1805 return; 1806 } 1807 1808 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 1809 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1810 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 1811 return; 1812 } 1813 /* For reads we'll execute the sequence after the data is read, so, for now, only 1814 * clear out accel_sequence pointer and submit the IO */ 1815 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1816 bdev_io->u.bdev.accel_sequence = NULL; 1817 } 1818 1819 bdev_io_submit(bdev_io); 1820 } 1821 1822 static void 1823 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1824 uint64_t len) 1825 { 1826 assert(cb != NULL); 1827 bdev_io->internal.get_buf_cb = cb; 1828 1829 bdev_io_get_buf(bdev_io, len); 1830 } 1831 1832 void 1833 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1834 { 1835 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1836 1837 assert(cb != NULL); 1838 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1839 bdev_io->internal.get_aux_buf_cb = cb; 1840 bdev_io_get_buf(bdev_io, len); 1841 } 1842 1843 static int 1844 bdev_module_get_max_ctx_size(void) 1845 { 1846 struct spdk_bdev_module *bdev_module; 1847 int max_bdev_module_size = 0; 1848 1849 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1850 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1851 max_bdev_module_size = bdev_module->get_ctx_size(); 1852 } 1853 } 1854 1855 return max_bdev_module_size; 1856 } 1857 1858 static void 1859 bdev_enable_histogram_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1860 { 1861 if (!bdev->internal.histogram_enabled) { 1862 return; 1863 } 1864 1865 spdk_json_write_object_begin(w); 1866 spdk_json_write_named_string(w, "method", "bdev_enable_histogram"); 1867 1868 spdk_json_write_named_object_begin(w, "params"); 1869 
spdk_json_write_named_string(w, "name", bdev->name); 1870 1871 spdk_json_write_named_bool(w, "enable", bdev->internal.histogram_enabled); 1872 spdk_json_write_object_end(w); 1873 1874 spdk_json_write_object_end(w); 1875 } 1876 1877 static void 1878 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1879 { 1880 int i; 1881 struct spdk_bdev_qos *qos = bdev->internal.qos; 1882 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1883 1884 if (!qos) { 1885 return; 1886 } 1887 1888 spdk_bdev_get_qos_rate_limits(bdev, limits); 1889 1890 spdk_json_write_object_begin(w); 1891 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1892 1893 spdk_json_write_named_object_begin(w, "params"); 1894 spdk_json_write_named_string(w, "name", bdev->name); 1895 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1896 if (limits[i] > 0) { 1897 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1898 } 1899 } 1900 spdk_json_write_object_end(w); 1901 1902 spdk_json_write_object_end(w); 1903 } 1904 1905 void 1906 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1907 { 1908 struct spdk_bdev_module *bdev_module; 1909 struct spdk_bdev *bdev; 1910 1911 assert(w != NULL); 1912 1913 spdk_json_write_array_begin(w); 1914 1915 spdk_json_write_object_begin(w); 1916 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1917 spdk_json_write_named_object_begin(w, "params"); 1918 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1919 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1920 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1921 spdk_json_write_named_uint32(w, "iobuf_small_cache_size", g_bdev_opts.iobuf_small_cache_size); 1922 spdk_json_write_named_uint32(w, "iobuf_large_cache_size", g_bdev_opts.iobuf_large_cache_size); 1923 spdk_json_write_object_end(w); 1924 spdk_json_write_object_end(w); 1925 1926 bdev_examine_allowlist_config_json(w); 1927 1928 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1929 if (bdev_module->config_json) { 1930 bdev_module->config_json(w); 1931 } 1932 } 1933 1934 spdk_spin_lock(&g_bdev_mgr.spinlock); 1935 1936 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1937 if (bdev->fn_table->write_config_json) { 1938 bdev->fn_table->write_config_json(bdev, w); 1939 } 1940 1941 bdev_qos_config_json(bdev, w); 1942 bdev_enable_histogram_config_json(bdev, w); 1943 } 1944 1945 spdk_spin_unlock(&g_bdev_mgr.spinlock); 1946 1947 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1948 spdk_json_write_object_begin(w); 1949 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1950 spdk_json_write_object_end(w); 1951 1952 spdk_json_write_array_end(w); 1953 } 1954 1955 static void 1956 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1957 { 1958 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1959 struct spdk_bdev_io *bdev_io; 1960 1961 spdk_iobuf_channel_fini(&ch->iobuf); 1962 1963 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1964 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1965 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1966 ch->per_thread_cache_count--; 1967 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1968 } 1969 1970 assert(ch->per_thread_cache_count == 0); 1971 } 1972 1973 static int 1974 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1975 { 1976 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1977 struct 
spdk_bdev_io *bdev_io; 1978 uint32_t i; 1979 int rc; 1980 1981 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", 1982 g_bdev_opts.iobuf_small_cache_size, 1983 g_bdev_opts.iobuf_large_cache_size); 1984 if (rc != 0) { 1985 SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 1986 return -1; 1987 } 1988 1989 STAILQ_INIT(&ch->per_thread_cache); 1990 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 1991 1992 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 1993 ch->per_thread_cache_count = 0; 1994 for (i = 0; i < ch->bdev_io_cache_size; i++) { 1995 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1996 if (bdev_io == NULL) { 1997 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 1998 assert(false); 1999 bdev_mgmt_channel_destroy(io_device, ctx_buf); 2000 return -1; 2001 } 2002 ch->per_thread_cache_count++; 2003 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2004 } 2005 2006 TAILQ_INIT(&ch->shared_resources); 2007 TAILQ_INIT(&ch->io_wait_queue); 2008 2009 return 0; 2010 } 2011 2012 static void 2013 bdev_init_complete(int rc) 2014 { 2015 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 2016 void *cb_arg = g_init_cb_arg; 2017 struct spdk_bdev_module *m; 2018 2019 g_bdev_mgr.init_complete = true; 2020 g_init_cb_fn = NULL; 2021 g_init_cb_arg = NULL; 2022 2023 /* 2024 * For modules that need to know when subsystem init is complete, 2025 * inform them now. 2026 */ 2027 if (rc == 0) { 2028 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2029 if (m->init_complete) { 2030 m->init_complete(); 2031 } 2032 } 2033 } 2034 2035 cb_fn(cb_arg, rc); 2036 } 2037 2038 static bool 2039 bdev_module_all_actions_completed(void) 2040 { 2041 struct spdk_bdev_module *m; 2042 2043 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 2044 if (m->internal.action_in_progress > 0) { 2045 return false; 2046 } 2047 } 2048 return true; 2049 } 2050 2051 static void 2052 bdev_module_action_complete(void) 2053 { 2054 /* 2055 * Don't finish bdev subsystem initialization if 2056 * module pre-initialization is still in progress, or 2057 * the subsystem been already initialized. 2058 */ 2059 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 2060 return; 2061 } 2062 2063 /* 2064 * Check all bdev modules for inits/examinations in progress. If any 2065 * exist, return immediately since we cannot finish bdev subsystem 2066 * initialization until all are completed. 2067 */ 2068 if (!bdev_module_all_actions_completed()) { 2069 return; 2070 } 2071 2072 /* 2073 * Modules already finished initialization - now that all 2074 * the bdev modules have finished their asynchronous I/O 2075 * processing, the entire bdev layer can be marked as complete. 
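	 *
	 * Asynchronous modules reach this point via spdk_bdev_module_init_done() or
	 * spdk_bdev_module_examine_done(); both drop internal.action_in_progress
	 * through bdev_module_action_done() and then re-run the checks above.  For
	 * example, a module with .async_init = true returns 0 from its module_init()
	 * and later calls spdk_bdev_module_init_done(&my_module) from its completion
	 * path (my_module standing in for that module's descriptor).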
2076 */ 2077 bdev_init_complete(0); 2078 } 2079 2080 static void 2081 bdev_module_action_done(struct spdk_bdev_module *module) 2082 { 2083 spdk_spin_lock(&module->internal.spinlock); 2084 assert(module->internal.action_in_progress > 0); 2085 module->internal.action_in_progress--; 2086 spdk_spin_unlock(&module->internal.spinlock); 2087 bdev_module_action_complete(); 2088 } 2089 2090 void 2091 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 2092 { 2093 assert(module->async_init); 2094 bdev_module_action_done(module); 2095 } 2096 2097 void 2098 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 2099 { 2100 bdev_module_action_done(module); 2101 } 2102 2103 /** The last initialized bdev module */ 2104 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 2105 2106 static void 2107 bdev_init_failed(void *cb_arg) 2108 { 2109 struct spdk_bdev_module *module = cb_arg; 2110 2111 spdk_spin_lock(&module->internal.spinlock); 2112 assert(module->internal.action_in_progress > 0); 2113 module->internal.action_in_progress--; 2114 spdk_spin_unlock(&module->internal.spinlock); 2115 bdev_init_complete(-1); 2116 } 2117 2118 static int 2119 bdev_modules_init(void) 2120 { 2121 struct spdk_bdev_module *module; 2122 int rc = 0; 2123 2124 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2125 g_resume_bdev_module = module; 2126 if (module->async_init) { 2127 spdk_spin_lock(&module->internal.spinlock); 2128 module->internal.action_in_progress = 1; 2129 spdk_spin_unlock(&module->internal.spinlock); 2130 } 2131 rc = module->module_init(); 2132 if (rc != 0) { 2133 /* Bump action_in_progress to prevent other modules from completion of modules_init 2134 * Send message to defer application shutdown until resources are cleaned up */ 2135 spdk_spin_lock(&module->internal.spinlock); 2136 module->internal.action_in_progress = 1; 2137 spdk_spin_unlock(&module->internal.spinlock); 2138 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 2139 return rc; 2140 } 2141 } 2142 2143 g_resume_bdev_module = NULL; 2144 return 0; 2145 } 2146 2147 void 2148 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 2149 { 2150 int rc = 0; 2151 char mempool_name[32]; 2152 2153 assert(cb_fn != NULL); 2154 2155 g_init_cb_fn = cb_fn; 2156 g_init_cb_arg = cb_arg; 2157 2158 spdk_notify_type_register("bdev_register"); 2159 spdk_notify_type_register("bdev_unregister"); 2160 2161 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2162 2163 rc = spdk_iobuf_register_module("bdev"); 2164 if (rc != 0) { 2165 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2166 bdev_init_complete(-1); 2167 return; 2168 } 2169 2170 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2171 g_bdev_opts.bdev_io_pool_size, 2172 sizeof(struct spdk_bdev_io) + 2173 bdev_module_get_max_ctx_size(), 2174 0, 2175 SPDK_ENV_SOCKET_ID_ANY); 2176 2177 if (g_bdev_mgr.bdev_io_pool == NULL) { 2178 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2179 bdev_init_complete(-1); 2180 return; 2181 } 2182 2183 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2184 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2185 if (!g_bdev_mgr.zero_buffer) { 2186 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2187 bdev_init_complete(-1); 2188 return; 2189 } 2190 2191 #ifdef SPDK_CONFIG_VTUNE 2192 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2193 #endif 2194 2195 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 2196 
bdev_mgmt_channel_destroy, 2197 sizeof(struct spdk_bdev_mgmt_channel), 2198 "bdev_mgr"); 2199 2200 rc = bdev_modules_init(); 2201 g_bdev_mgr.module_init_complete = true; 2202 if (rc != 0) { 2203 SPDK_ERRLOG("bdev modules init failed\n"); 2204 return; 2205 } 2206 2207 bdev_module_action_complete(); 2208 } 2209 2210 static void 2211 bdev_mgr_unregister_cb(void *io_device) 2212 { 2213 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2214 2215 if (g_bdev_mgr.bdev_io_pool) { 2216 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2217 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2218 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2219 g_bdev_opts.bdev_io_pool_size); 2220 } 2221 2222 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2223 } 2224 2225 spdk_free(g_bdev_mgr.zero_buffer); 2226 2227 bdev_examine_allowlist_free(); 2228 2229 cb_fn(g_fini_cb_arg); 2230 g_fini_cb_fn = NULL; 2231 g_fini_cb_arg = NULL; 2232 g_bdev_mgr.init_complete = false; 2233 g_bdev_mgr.module_init_complete = false; 2234 } 2235 2236 static void 2237 bdev_module_fini_iter(void *arg) 2238 { 2239 struct spdk_bdev_module *bdev_module; 2240 2241 /* FIXME: Handling initialization failures is broken now, 2242 * so we won't even try cleaning up after successfully 2243 * initialized modules. if module_init_complete is false, 2244 * just call spdk_bdev_mgr_unregister_cb 2245 */ 2246 if (!g_bdev_mgr.module_init_complete) { 2247 bdev_mgr_unregister_cb(NULL); 2248 return; 2249 } 2250 2251 /* Start iterating from the last touched module */ 2252 if (!g_resume_bdev_module) { 2253 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2254 } else { 2255 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2256 internal.tailq); 2257 } 2258 2259 while (bdev_module) { 2260 if (bdev_module->async_fini) { 2261 /* Save our place so we can resume later. We must 2262 * save the variable here, before calling module_fini() 2263 * below, because in some cases the module may immediately 2264 * call spdk_bdev_module_fini_done() and re-enter 2265 * this function to continue iterating. */ 2266 g_resume_bdev_module = bdev_module; 2267 } 2268 2269 if (bdev_module->module_fini) { 2270 bdev_module->module_fini(); 2271 } 2272 2273 if (bdev_module->async_fini) { 2274 return; 2275 } 2276 2277 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2278 internal.tailq); 2279 } 2280 2281 g_resume_bdev_module = NULL; 2282 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2283 } 2284 2285 void 2286 spdk_bdev_module_fini_done(void) 2287 { 2288 if (spdk_get_thread() != g_fini_thread) { 2289 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2290 } else { 2291 bdev_module_fini_iter(NULL); 2292 } 2293 } 2294 2295 static void 2296 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2297 { 2298 struct spdk_bdev *bdev = cb_arg; 2299 2300 if (bdeverrno && bdev) { 2301 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2302 bdev->name); 2303 2304 /* 2305 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2306 * bdev; try to continue by manually removing this bdev from the list and continue 2307 * with the next bdev in the list. 
2308 */ 2309 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2310 } 2311 2312 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2313 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2314 /* 2315 * Bdev module finish need to be deferred as we might be in the middle of some context 2316 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2317 * after returning. 2318 */ 2319 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2320 return; 2321 } 2322 2323 /* 2324 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 2325 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2326 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2327 * base bdevs. 2328 * 2329 * Also, walk the list in the reverse order. 2330 */ 2331 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2332 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2333 spdk_spin_lock(&bdev->internal.spinlock); 2334 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2335 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2336 spdk_spin_unlock(&bdev->internal.spinlock); 2337 continue; 2338 } 2339 spdk_spin_unlock(&bdev->internal.spinlock); 2340 2341 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2342 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2343 return; 2344 } 2345 2346 /* 2347 * If any bdev fails to unclaim underlying bdev properly, we may face the 2348 * case of bdev list consisting of claimed bdevs only (if claims are managed 2349 * correctly, this would mean there's a loop in the claims graph which is 2350 * clearly impossible). Warn and unregister last bdev on the list then. 2351 */ 2352 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2353 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2354 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2355 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2356 return; 2357 } 2358 } 2359 2360 static void 2361 bdev_module_fini_start_iter(void *arg) 2362 { 2363 struct spdk_bdev_module *bdev_module; 2364 2365 if (!g_resume_bdev_module) { 2366 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2367 } else { 2368 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2369 } 2370 2371 while (bdev_module) { 2372 if (bdev_module->async_fini_start) { 2373 /* Save our place so we can resume later. We must 2374 * save the variable here, before calling fini_start() 2375 * below, because in some cases the module may immediately 2376 * call spdk_bdev_module_fini_start_done() and re-enter 2377 * this function to continue iterating. 
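	 *
	 * A module that needs this sets .async_fini_start = true in its descriptor
	 * and calls spdk_bdev_module_fini_start_done() once its own teardown prep
	 * finishes; the walk then resumes from g_resume_bdev_module.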
*/ 2378 g_resume_bdev_module = bdev_module; 2379 } 2380 2381 if (bdev_module->fini_start) { 2382 bdev_module->fini_start(); 2383 } 2384 2385 if (bdev_module->async_fini_start) { 2386 return; 2387 } 2388 2389 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2390 } 2391 2392 g_resume_bdev_module = NULL; 2393 2394 bdev_finish_unregister_bdevs_iter(NULL, 0); 2395 } 2396 2397 void 2398 spdk_bdev_module_fini_start_done(void) 2399 { 2400 if (spdk_get_thread() != g_fini_thread) { 2401 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2402 } else { 2403 bdev_module_fini_start_iter(NULL); 2404 } 2405 } 2406 2407 static void 2408 bdev_finish_wait_for_examine_done(void *cb_arg) 2409 { 2410 bdev_module_fini_start_iter(NULL); 2411 } 2412 2413 static void bdev_open_async_fini(void); 2414 2415 void 2416 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2417 { 2418 int rc; 2419 2420 assert(cb_fn != NULL); 2421 2422 g_fini_thread = spdk_get_thread(); 2423 2424 g_fini_cb_fn = cb_fn; 2425 g_fini_cb_arg = cb_arg; 2426 2427 bdev_open_async_fini(); 2428 2429 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2430 if (rc != 0) { 2431 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2432 bdev_finish_wait_for_examine_done(NULL); 2433 } 2434 } 2435 2436 struct spdk_bdev_io * 2437 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2438 { 2439 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2440 struct spdk_bdev_io *bdev_io; 2441 2442 if (ch->per_thread_cache_count > 0) { 2443 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2444 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2445 ch->per_thread_cache_count--; 2446 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2447 /* 2448 * Don't try to look for bdev_ios in the global pool if there are 2449 * waiters on bdev_ios - we don't want this caller to jump the line. 2450 */ 2451 bdev_io = NULL; 2452 } else { 2453 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2454 } 2455 2456 return bdev_io; 2457 } 2458 2459 void 2460 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2461 { 2462 struct spdk_bdev_mgmt_channel *ch; 2463 2464 assert(bdev_io != NULL); 2465 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2466 2467 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2468 2469 if (bdev_io->internal.buf != NULL) { 2470 bdev_io_put_buf(bdev_io); 2471 } 2472 2473 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2474 ch->per_thread_cache_count++; 2475 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2476 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2477 struct spdk_bdev_io_wait_entry *entry; 2478 2479 entry = TAILQ_FIRST(&ch->io_wait_queue); 2480 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2481 entry->cb_fn(entry->cb_arg); 2482 } 2483 } else { 2484 /* We should never have a full cache with entries on the io wait queue. 
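	 * Waiters are expected to be queued only after bdev_channel_get_io() came up
	 * empty, and every bdev_io returned to a non-full cache immediately services
	 * them in the loop above, so a full cache with queued waiters should not occur.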
*/
2485 assert(TAILQ_EMPTY(&ch->io_wait_queue));
2486 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
2487 }
2488 }
2489
2490 static bool
2491 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit)
2492 {
2493 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES);
2494
2495 switch (limit) {
2496 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT:
2497 return true;
2498 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT:
2499 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT:
2500 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT:
2501 return false;
2502 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES:
2503 default:
2504 return false;
2505 }
2506 }
2507
2508 static bool
2509 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io)
2510 {
2511 switch (bdev_io->type) {
2512 case SPDK_BDEV_IO_TYPE_NVME_IO:
2513 case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2514 case SPDK_BDEV_IO_TYPE_READ:
2515 case SPDK_BDEV_IO_TYPE_WRITE:
2516 return true;
2517 case SPDK_BDEV_IO_TYPE_ZCOPY:
2518 if (bdev_io->u.bdev.zcopy.start) {
2519 return true;
2520 } else {
2521 return false;
2522 }
2523 default:
2524 return false;
2525 }
2526 }
2527
2528 static bool
2529 bdev_is_read_io(struct spdk_bdev_io *bdev_io)
2530 {
2531 switch (bdev_io->type) {
2532 case SPDK_BDEV_IO_TYPE_NVME_IO:
2533 case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2534 /* Bit 1 (0x2) set for read operation */
2535 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) {
2536 return true;
2537 } else {
2538 return false;
2539 }
2540 case SPDK_BDEV_IO_TYPE_READ:
2541 return true;
2542 case SPDK_BDEV_IO_TYPE_ZCOPY:
2543 /* Populate to read from disk */
2544 if (bdev_io->u.bdev.zcopy.populate) {
2545 return true;
2546 } else {
2547 return false;
2548 }
2549 default:
2550 return false;
2551 }
2552 }
2553
2554 static uint64_t
2555 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io)
2556 {
2557 struct spdk_bdev *bdev = bdev_io->bdev;
2558
2559 switch (bdev_io->type) {
2560 case SPDK_BDEV_IO_TYPE_NVME_IO:
2561 case SPDK_BDEV_IO_TYPE_NVME_IO_MD:
2562 return bdev_io->u.nvme_passthru.nbytes;
2563 case SPDK_BDEV_IO_TYPE_READ:
2564 case SPDK_BDEV_IO_TYPE_WRITE:
2565 return bdev_io->u.bdev.num_blocks * bdev->blocklen;
2566 case SPDK_BDEV_IO_TYPE_ZCOPY:
2567 /* Track the data in the start phase only */
2568 if (bdev_io->u.bdev.zcopy.start) {
2569 return bdev_io->u.bdev.num_blocks * bdev->blocklen;
2570 } else {
2571 return 0;
2572 }
2573 default:
2574 return 0;
2575 }
2576 }
2577
2578 static inline bool
2579 bdev_qos_rw_queue_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta)
2580 {
2581 int64_t remaining_this_timeslice;
2582
2583 if (!limit->max_per_timeslice) {
2584 /* The QoS is disabled */
2585 return false;
2586 }
2587
2588 remaining_this_timeslice = __atomic_sub_fetch(&limit->remaining_this_timeslice, delta,
2589 __ATOMIC_RELAXED);
2590 if (remaining_this_timeslice + (int64_t)delta > 0) {
2591 /* There was still quota for this delta -> the IO shouldn't be queued
2592 *
2593 * We allow a slight quota overrun here so an IO bigger than the per-timeslice
2594 * quota can be allowed once in a while. Such an overrun is then taken into account in
2595 * the QoS poller, where the next timeslice quota is calculated.
2596 */
2597 return false;
2598 }
2599
2600 /* There was no quota for this delta -> the IO should be queued
2601 * The remaining_this_timeslice must be rewound so it reflects the real
2602 * amount of IOs or bytes allowed.
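	 *
	 * Worked example with the 1 ms QoS timeslice and a 10 MB/s limit
	 * (max_per_timeslice of roughly 10,485 bytes): if 4,000 bytes remain and a
	 * 65,536-byte I/O arrives, the subtraction above leaves about -61,500 but the
	 * I/O is still allowed, and the QoS poller pays the deficit back over the
	 * following timeslices.  If the counter was already at or below zero, the same
	 * I/O is queued instead and the subtraction is undone right below.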
2603 */ 2604 __atomic_add_fetch( 2605 &limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2606 return true; 2607 } 2608 2609 static inline void 2610 bdev_qos_rw_rewind_io(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io, uint64_t delta) 2611 { 2612 __atomic_add_fetch(&limit->remaining_this_timeslice, delta, __ATOMIC_RELAXED); 2613 } 2614 2615 static bool 2616 bdev_qos_rw_iops_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2617 { 2618 return bdev_qos_rw_queue_io(limit, io, 1); 2619 } 2620 2621 static void 2622 bdev_qos_rw_iops_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2623 { 2624 bdev_qos_rw_rewind_io(limit, io, 1); 2625 } 2626 2627 static bool 2628 bdev_qos_rw_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2629 { 2630 return bdev_qos_rw_queue_io(limit, io, bdev_get_io_size_in_byte(io)); 2631 } 2632 2633 static void 2634 bdev_qos_rw_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2635 { 2636 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2637 } 2638 2639 static bool 2640 bdev_qos_r_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2641 { 2642 if (bdev_is_read_io(io) == false) { 2643 return false; 2644 } 2645 2646 return bdev_qos_rw_bps_queue(limit, io); 2647 } 2648 2649 static void 2650 bdev_qos_r_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2651 { 2652 if (bdev_is_read_io(io) != false) { 2653 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2654 } 2655 } 2656 2657 static bool 2658 bdev_qos_w_bps_queue(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2659 { 2660 if (bdev_is_read_io(io) == true) { 2661 return false; 2662 } 2663 2664 return bdev_qos_rw_bps_queue(limit, io); 2665 } 2666 2667 static void 2668 bdev_qos_w_bps_rewind_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2669 { 2670 if (bdev_is_read_io(io) != true) { 2671 bdev_qos_rw_rewind_io(limit, io, bdev_get_io_size_in_byte(io)); 2672 } 2673 } 2674 2675 static void 2676 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2677 { 2678 int i; 2679 2680 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2681 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2682 qos->rate_limits[i].queue_io = NULL; 2683 continue; 2684 } 2685 2686 switch (i) { 2687 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2688 qos->rate_limits[i].queue_io = bdev_qos_rw_iops_queue; 2689 qos->rate_limits[i].rewind_quota = bdev_qos_rw_iops_rewind_quota; 2690 break; 2691 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2692 qos->rate_limits[i].queue_io = bdev_qos_rw_bps_queue; 2693 qos->rate_limits[i].rewind_quota = bdev_qos_rw_bps_rewind_quota; 2694 break; 2695 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2696 qos->rate_limits[i].queue_io = bdev_qos_r_bps_queue; 2697 qos->rate_limits[i].rewind_quota = bdev_qos_r_bps_rewind_quota; 2698 break; 2699 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2700 qos->rate_limits[i].queue_io = bdev_qos_w_bps_queue; 2701 qos->rate_limits[i].rewind_quota = bdev_qos_w_bps_rewind_quota; 2702 break; 2703 default: 2704 break; 2705 } 2706 } 2707 } 2708 2709 static void 2710 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2711 struct spdk_bdev_io *bdev_io, 2712 enum spdk_bdev_io_status status) 2713 { 2714 bdev_io->internal.in_submit_request = true; 2715 bdev_io_increment_outstanding(bdev_ch, bdev_ch->shared_resource); 2716 spdk_bdev_io_complete(bdev_io, status); 2717 bdev_io->internal.in_submit_request = false; 2718 
} 2719 2720 static inline void 2721 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2722 { 2723 struct spdk_bdev *bdev = bdev_io->bdev; 2724 struct spdk_io_channel *ch = bdev_ch->channel; 2725 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2726 2727 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2728 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2729 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2730 2731 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2732 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2733 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2734 SPDK_BDEV_IO_STATUS_SUCCESS); 2735 return; 2736 } 2737 } 2738 2739 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2740 bdev_io->bdev->split_on_write_unit && 2741 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2742 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2743 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2744 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2745 return; 2746 } 2747 2748 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2749 bdev_io_increment_outstanding(bdev_ch, shared_resource); 2750 bdev_io->internal.in_submit_request = true; 2751 bdev_submit_request(bdev, ch, bdev_io); 2752 bdev_io->internal.in_submit_request = false; 2753 } else { 2754 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT); 2755 if (shared_resource->nomem_threshold == 0 && shared_resource->io_outstanding == 0) { 2756 /* Special case when we have nomem IOs and no outstanding IOs which completions 2757 * could trigger retry of queued IOs */ 2758 bdev_shared_ch_retry_io(shared_resource); 2759 } 2760 } 2761 } 2762 2763 static bool 2764 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2765 { 2766 int i; 2767 2768 if (bdev_qos_io_to_limit(bdev_io) == true) { 2769 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2770 if (!qos->rate_limits[i].queue_io) { 2771 continue; 2772 } 2773 2774 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2775 bdev_io) == true) { 2776 for (i -= 1; i >= 0 ; i--) { 2777 if (!qos->rate_limits[i].queue_io) { 2778 continue; 2779 } 2780 2781 qos->rate_limits[i].rewind_quota(&qos->rate_limits[i], bdev_io); 2782 } 2783 return true; 2784 } 2785 } 2786 } 2787 2788 return false; 2789 } 2790 2791 static int 2792 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2793 { 2794 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2795 int submitted_ios = 0; 2796 2797 TAILQ_FOREACH_SAFE(bdev_io, &ch->qos_queued_io, internal.link, tmp) { 2798 if (!bdev_qos_queue_io(qos, bdev_io)) { 2799 TAILQ_REMOVE(&ch->qos_queued_io, bdev_io, internal.link); 2800 bdev_io_do_submit(ch, bdev_io); 2801 2802 submitted_ios++; 2803 } 2804 } 2805 2806 return submitted_ios; 2807 } 2808 2809 static void 2810 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2811 { 2812 int rc; 2813 2814 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2815 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2816 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2817 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2818 &bdev_io->internal.waitq_entry); 2819 if (rc != 0) { 2820 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2821 bdev_io->internal.status = 
SPDK_BDEV_IO_STATUS_FAILED; 2822 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2823 } 2824 } 2825 2826 static bool 2827 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2828 { 2829 uint32_t io_boundary; 2830 struct spdk_bdev *bdev = bdev_io->bdev; 2831 uint32_t max_segment_size = bdev->max_segment_size; 2832 uint32_t max_size = bdev->max_rw_size; 2833 int max_segs = bdev->max_num_segments; 2834 2835 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2836 io_boundary = bdev->write_unit_size; 2837 } else if (bdev->split_on_optimal_io_boundary) { 2838 io_boundary = bdev->optimal_io_boundary; 2839 } else { 2840 io_boundary = 0; 2841 } 2842 2843 if (spdk_likely(!io_boundary && !max_segs && !max_segment_size && !max_size)) { 2844 return false; 2845 } 2846 2847 if (io_boundary) { 2848 uint64_t start_stripe, end_stripe; 2849 2850 start_stripe = bdev_io->u.bdev.offset_blocks; 2851 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2852 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 2853 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2854 start_stripe >>= spdk_u32log2(io_boundary); 2855 end_stripe >>= spdk_u32log2(io_boundary); 2856 } else { 2857 start_stripe /= io_boundary; 2858 end_stripe /= io_boundary; 2859 } 2860 2861 if (start_stripe != end_stripe) { 2862 return true; 2863 } 2864 } 2865 2866 if (max_segs) { 2867 if (bdev_io->u.bdev.iovcnt > max_segs) { 2868 return true; 2869 } 2870 } 2871 2872 if (max_segment_size) { 2873 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2874 if (bdev_io->u.bdev.iovs[i].iov_len > max_segment_size) { 2875 return true; 2876 } 2877 } 2878 } 2879 2880 if (max_size) { 2881 if (bdev_io->u.bdev.num_blocks > max_size) { 2882 return true; 2883 } 2884 } 2885 2886 return false; 2887 } 2888 2889 static bool 2890 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2891 { 2892 uint32_t num_unmap_segments; 2893 2894 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2895 return false; 2896 } 2897 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2898 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2899 return true; 2900 } 2901 2902 return false; 2903 } 2904 2905 static bool 2906 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2907 { 2908 if (!bdev_io->bdev->max_write_zeroes) { 2909 return false; 2910 } 2911 2912 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2913 return true; 2914 } 2915 2916 return false; 2917 } 2918 2919 static bool 2920 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2921 { 2922 if (bdev_io->bdev->max_copy != 0 && 2923 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2924 return true; 2925 } 2926 2927 return false; 2928 } 2929 2930 static bool 2931 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2932 { 2933 switch (bdev_io->type) { 2934 case SPDK_BDEV_IO_TYPE_READ: 2935 case SPDK_BDEV_IO_TYPE_WRITE: 2936 return bdev_rw_should_split(bdev_io); 2937 case SPDK_BDEV_IO_TYPE_UNMAP: 2938 return bdev_unmap_should_split(bdev_io); 2939 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2940 return bdev_write_zeroes_should_split(bdev_io); 2941 case SPDK_BDEV_IO_TYPE_COPY: 2942 return bdev_copy_should_split(bdev_io); 2943 default: 2944 return false; 2945 } 2946 } 2947 2948 static uint32_t 2949 _to_next_boundary(uint64_t offset, uint32_t boundary) 2950 { 2951 return (boundary - (offset % boundary)); 2952 } 2953 2954 static void 
bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2955 2956 static void _bdev_rw_split(void *_bdev_io); 2957 2958 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2959 2960 static void 2961 _bdev_unmap_split(void *_bdev_io) 2962 { 2963 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2964 } 2965 2966 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2967 2968 static void 2969 _bdev_write_zeroes_split(void *_bdev_io) 2970 { 2971 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2972 } 2973 2974 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 2975 2976 static void 2977 _bdev_copy_split(void *_bdev_io) 2978 { 2979 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 2980 } 2981 2982 static int 2983 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2984 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2985 { 2986 int rc; 2987 uint64_t current_offset, current_remaining, current_src_offset; 2988 spdk_bdev_io_wait_cb io_wait_fn; 2989 2990 current_offset = *offset; 2991 current_remaining = *remaining; 2992 2993 bdev_io->u.bdev.split_outstanding++; 2994 2995 io_wait_fn = _bdev_rw_split; 2996 switch (bdev_io->type) { 2997 case SPDK_BDEV_IO_TYPE_READ: 2998 assert(bdev_io->u.bdev.accel_sequence == NULL); 2999 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 3000 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3001 iov, iovcnt, md_buf, current_offset, 3002 num_blocks, bdev_io->internal.memory_domain, 3003 bdev_io->internal.memory_domain_ctx, NULL, 3004 bdev_io->u.bdev.dif_check_flags, 3005 bdev_io_split_done, bdev_io); 3006 break; 3007 case SPDK_BDEV_IO_TYPE_WRITE: 3008 assert(bdev_io->u.bdev.accel_sequence == NULL); 3009 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 3010 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3011 iov, iovcnt, md_buf, current_offset, 3012 num_blocks, bdev_io->internal.memory_domain, 3013 bdev_io->internal.memory_domain_ctx, NULL, 3014 bdev_io->u.bdev.dif_check_flags, 3015 bdev_io->u.bdev.nvme_cdw12.raw, 3016 bdev_io->u.bdev.nvme_cdw13.raw, 3017 bdev_io_split_done, bdev_io); 3018 break; 3019 case SPDK_BDEV_IO_TYPE_UNMAP: 3020 io_wait_fn = _bdev_unmap_split; 3021 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 3022 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3023 current_offset, num_blocks, 3024 bdev_io_split_done, bdev_io); 3025 break; 3026 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3027 io_wait_fn = _bdev_write_zeroes_split; 3028 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 3029 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3030 current_offset, num_blocks, 3031 bdev_io_split_done, bdev_io); 3032 break; 3033 case SPDK_BDEV_IO_TYPE_COPY: 3034 io_wait_fn = _bdev_copy_split; 3035 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 3036 (current_offset - bdev_io->u.bdev.offset_blocks); 3037 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 3038 spdk_io_channel_from_ctx(bdev_io->internal.ch), 3039 current_offset, current_src_offset, num_blocks, 3040 bdev_io_split_done, bdev_io); 3041 break; 3042 default: 3043 assert(false); 3044 rc = -EINVAL; 3045 break; 3046 } 3047 3048 if (rc == 0) { 3049 current_offset += num_blocks; 3050 current_remaining -= num_blocks; 3051 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 3052 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 3053 *offset = current_offset; 3054 *remaining = current_remaining; 3055 } else { 3056 
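		/*
		 * Child submission failed: undo the split_outstanding increment taken
		 * above.  For -ENOMEM the parent simply waits, either for an already
		 * outstanding child to complete or, if none are outstanding, by
		 * queueing an io_wait entry below.  Any other error marks the parent
		 * failed and completes it once no children remain outstanding.
		 */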
bdev_io->u.bdev.split_outstanding--; 3057 if (rc == -ENOMEM) { 3058 if (bdev_io->u.bdev.split_outstanding == 0) { 3059 /* No I/O is outstanding. Hence we should wait here. */ 3060 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 3061 } 3062 } else { 3063 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3064 if (bdev_io->u.bdev.split_outstanding == 0) { 3065 spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id, 3066 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 3067 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 3068 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3069 } 3070 } 3071 } 3072 3073 return rc; 3074 } 3075 3076 static void 3077 _bdev_rw_split(void *_bdev_io) 3078 { 3079 struct iovec *parent_iov, *iov; 3080 struct spdk_bdev_io *bdev_io = _bdev_io; 3081 struct spdk_bdev *bdev = bdev_io->bdev; 3082 uint64_t parent_offset, current_offset, remaining; 3083 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 3084 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 3085 uint32_t iovcnt, iov_len, child_iovsize; 3086 uint32_t blocklen = bdev->blocklen; 3087 uint32_t io_boundary; 3088 uint32_t max_segment_size = bdev->max_segment_size; 3089 uint32_t max_child_iovcnt = bdev->max_num_segments; 3090 uint32_t max_size = bdev->max_rw_size; 3091 void *md_buf = NULL; 3092 int rc; 3093 3094 max_size = max_size ? max_size : UINT32_MAX; 3095 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 3096 max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 3097 SPDK_BDEV_IO_NUM_CHILD_IOV; 3098 3099 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 3100 io_boundary = bdev->write_unit_size; 3101 } else if (bdev->split_on_optimal_io_boundary) { 3102 io_boundary = bdev->optimal_io_boundary; 3103 } else { 3104 io_boundary = UINT32_MAX; 3105 } 3106 3107 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3108 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 3109 parent_offset = bdev_io->u.bdev.offset_blocks; 3110 parent_iov_offset = (current_offset - parent_offset) * blocklen; 3111 parent_iovcnt = bdev_io->u.bdev.iovcnt; 3112 3113 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 3114 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3115 if (parent_iov_offset < parent_iov->iov_len) { 3116 break; 3117 } 3118 parent_iov_offset -= parent_iov->iov_len; 3119 } 3120 3121 child_iovcnt = 0; 3122 while (remaining > 0 && parent_iovpos < parent_iovcnt && 3123 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 3124 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3125 to_next_boundary = spdk_min(remaining, to_next_boundary); 3126 to_next_boundary = spdk_min(max_size, to_next_boundary); 3127 to_next_boundary_bytes = to_next_boundary * blocklen; 3128 3129 iov = &bdev_io->child_iov[child_iovcnt]; 3130 iovcnt = 0; 3131 3132 if (bdev_io->u.bdev.md_buf) { 3133 md_buf = (char *)bdev_io->u.bdev.md_buf + 3134 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3135 } 3136 3137 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3138 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3139 iovcnt < child_iovsize) { 3140 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3141 iov_len = parent_iov->iov_len - parent_iov_offset; 3142 3143 iov_len = spdk_min(iov_len, max_segment_size); 3144 iov_len = spdk_min(iov_len, 
to_next_boundary_bytes);
3145 to_next_boundary_bytes -= iov_len;
3146
3147 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset;
3148 bdev_io->child_iov[child_iovcnt].iov_len = iov_len;
3149
3150 if (iov_len < parent_iov->iov_len - parent_iov_offset) {
3151 parent_iov_offset += iov_len;
3152 } else {
3153 parent_iovpos++;
3154 parent_iov_offset = 0;
3155 }
3156 child_iovcnt++;
3157 iovcnt++;
3158 }
3159
3160 if (to_next_boundary_bytes > 0) {
3161 /* We had to stop this child I/O early because we ran out of
3162 * child_iov space or were limited by max_num_segments.
3163 * Ensure the iovs are aligned to the block size and
3164 * then adjust to_next_boundary before starting the
3165 * child I/O.
3166 */
3167 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV ||
3168 iovcnt == child_iovsize);
3169 to_last_block_bytes = to_next_boundary_bytes % blocklen;
3170 if (to_last_block_bytes != 0) {
3171 uint32_t child_iovpos = child_iovcnt - 1;
3172 /* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV,
3173 * so the loop will naturally end
3174 */
3175
3176 to_last_block_bytes = blocklen - to_last_block_bytes;
3177 to_next_boundary_bytes += to_last_block_bytes;
3178 while (to_last_block_bytes > 0 && iovcnt > 0) {
3179 iov_len = spdk_min(to_last_block_bytes,
3180 bdev_io->child_iov[child_iovpos].iov_len);
3181 bdev_io->child_iov[child_iovpos].iov_len -= iov_len;
3182 if (bdev_io->child_iov[child_iovpos].iov_len == 0) {
3183 child_iovpos--;
3184 if (--iovcnt == 0) {
3185 /* If the child IO is less than a block size, just return.
3186 * If the first child IO of any split round is less than
3187 * a block size, exit with an error.
3188 */
3189 if (bdev_io->u.bdev.split_outstanding == 0) {
3190 SPDK_ERRLOG("The first child io was less than a block size\n");
3191 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3192 spdk_trace_record(TRACE_BDEV_IO_DONE, bdev_io->internal.ch->trace_id,
3193 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx);
3194 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
3195 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
3196 }
3197
3198 return;
3199 }
3200 }
3201
3202 to_last_block_bytes -= iov_len;
3203
3204 if (parent_iov_offset == 0) {
3205 parent_iovpos--;
3206 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len;
3207 }
3208 parent_iov_offset -= iov_len;
3209 }
3210
3211 assert(to_last_block_bytes == 0);
3212 }
3213 to_next_boundary -= to_next_boundary_bytes / blocklen;
3214 }
3215
3216 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary,
3217 &current_offset, &remaining);
3218 if (spdk_unlikely(rc)) {
3219 return;
3220 }
3221 }
3222 }
3223
3224 static void
3225 bdev_unmap_split(struct spdk_bdev_io *bdev_io)
3226 {
3227 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks;
3228 uint32_t num_children_reqs = 0;
3229 int rc;
3230
3231 offset = bdev_io->u.bdev.split_current_offset_blocks;
3232 remaining = bdev_io->u.bdev.split_remaining_num_blocks;
3233 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments;
3234
3235 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
3236 unmap_blocks = spdk_min(remaining, max_unmap_blocks);
3237
3238 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks,
3239 &offset, &remaining);
3240 if (spdk_likely(rc == 0)) {
3241 num_children_reqs++;
3242 } else {
3243 return;
3244 }
3245 }
3246 }
3247
3248 static void
3249
bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 3250 { 3251 uint64_t offset, write_zeroes_blocks, remaining; 3252 uint32_t num_children_reqs = 0; 3253 int rc; 3254 3255 offset = bdev_io->u.bdev.split_current_offset_blocks; 3256 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3257 3258 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3259 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 3260 3261 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 3262 &offset, &remaining); 3263 if (spdk_likely(rc == 0)) { 3264 num_children_reqs++; 3265 } else { 3266 return; 3267 } 3268 } 3269 } 3270 3271 static void 3272 bdev_copy_split(struct spdk_bdev_io *bdev_io) 3273 { 3274 uint64_t offset, copy_blocks, remaining; 3275 uint32_t num_children_reqs = 0; 3276 int rc; 3277 3278 offset = bdev_io->u.bdev.split_current_offset_blocks; 3279 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3280 3281 assert(bdev_io->bdev->max_copy != 0); 3282 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3283 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3284 3285 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3286 &offset, &remaining); 3287 if (spdk_likely(rc == 0)) { 3288 num_children_reqs++; 3289 } else { 3290 return; 3291 } 3292 } 3293 } 3294 3295 static void 3296 parent_bdev_io_complete(void *ctx, int rc) 3297 { 3298 struct spdk_bdev_io *parent_io = ctx; 3299 3300 if (rc) { 3301 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3302 } 3303 3304 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3305 parent_io->internal.caller_ctx); 3306 } 3307 3308 static void 3309 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3310 { 3311 struct spdk_bdev_io *bdev_io = ctx; 3312 3313 /* u.bdev.accel_sequence should have already been cleared at this point */ 3314 assert(bdev_io->u.bdev.accel_sequence == NULL); 3315 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3316 bdev_io->internal.accel_sequence = NULL; 3317 3318 if (spdk_unlikely(status != 0)) { 3319 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3320 } 3321 3322 parent_bdev_io_complete(bdev_io, status); 3323 } 3324 3325 static void 3326 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3327 { 3328 struct spdk_bdev_io *parent_io = cb_arg; 3329 3330 spdk_bdev_free_io(bdev_io); 3331 3332 if (!success) { 3333 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3334 /* If any child I/O failed, stop further splitting process. */ 3335 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 3336 parent_io->u.bdev.split_remaining_num_blocks = 0; 3337 } 3338 parent_io->u.bdev.split_outstanding--; 3339 if (parent_io->u.bdev.split_outstanding != 0) { 3340 return; 3341 } 3342 3343 /* 3344 * Parent I/O finishes when all blocks are consumed. 
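	 * "Consumed" means split_remaining_num_blocks reached zero and the child that
	 * just completed was the last one outstanding.  On a child failure the code
	 * above zeroes the remaining count so no further children are issued, but
	 * children already in flight are still drained before the parent completes.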
3345 */ 3346 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 3347 assert(parent_io->internal.cb != bdev_io_split_done); 3348 spdk_trace_record(TRACE_BDEV_IO_DONE, parent_io->internal.ch->trace_id, 3349 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 3350 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 3351 3352 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3353 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3354 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3355 return; 3356 } else if (parent_io->internal.orig_iovcnt != 0 && 3357 !bdev_io_use_accel_sequence(bdev_io)) { 3358 /* bdev IO will be completed in the callback */ 3359 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3360 return; 3361 } 3362 } 3363 3364 parent_bdev_io_complete(parent_io, 0); 3365 return; 3366 } 3367 3368 /* 3369 * Continue with the splitting process. This function will complete the parent I/O if the 3370 * splitting is done. 3371 */ 3372 switch (parent_io->type) { 3373 case SPDK_BDEV_IO_TYPE_READ: 3374 case SPDK_BDEV_IO_TYPE_WRITE: 3375 _bdev_rw_split(parent_io); 3376 break; 3377 case SPDK_BDEV_IO_TYPE_UNMAP: 3378 bdev_unmap_split(parent_io); 3379 break; 3380 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3381 bdev_write_zeroes_split(parent_io); 3382 break; 3383 case SPDK_BDEV_IO_TYPE_COPY: 3384 bdev_copy_split(parent_io); 3385 break; 3386 default: 3387 assert(false); 3388 break; 3389 } 3390 } 3391 3392 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3393 bool success); 3394 3395 static void 3396 bdev_io_split(struct spdk_bdev_io *bdev_io) 3397 { 3398 assert(bdev_io_should_split(bdev_io)); 3399 3400 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3401 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3402 bdev_io->u.bdev.split_outstanding = 0; 3403 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3404 3405 switch (bdev_io->type) { 3406 case SPDK_BDEV_IO_TYPE_READ: 3407 case SPDK_BDEV_IO_TYPE_WRITE: 3408 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3409 _bdev_rw_split(bdev_io); 3410 } else { 3411 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3412 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3413 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3414 } 3415 break; 3416 case SPDK_BDEV_IO_TYPE_UNMAP: 3417 bdev_unmap_split(bdev_io); 3418 break; 3419 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3420 bdev_write_zeroes_split(bdev_io); 3421 break; 3422 case SPDK_BDEV_IO_TYPE_COPY: 3423 bdev_copy_split(bdev_io); 3424 break; 3425 default: 3426 assert(false); 3427 break; 3428 } 3429 } 3430 3431 static void 3432 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3433 { 3434 if (!success) { 3435 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3436 return; 3437 } 3438 3439 _bdev_rw_split(bdev_io); 3440 } 3441 3442 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 3443 * be inlined, at least on some compilers. 
3444 */ 3445 static inline void 3446 _bdev_io_submit(void *ctx) 3447 { 3448 struct spdk_bdev_io *bdev_io = ctx; 3449 struct spdk_bdev *bdev = bdev_io->bdev; 3450 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3451 3452 if (spdk_likely(bdev_ch->flags == 0)) { 3453 bdev_io_do_submit(bdev_ch, bdev_io); 3454 return; 3455 } 3456 3457 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3458 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3459 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3460 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3461 bdev_abort_queued_io(&bdev_ch->qos_queued_io, bdev_io->u.abort.bio_to_abort)) { 3462 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3463 } else { 3464 TAILQ_INSERT_TAIL(&bdev_ch->qos_queued_io, bdev_io, internal.link); 3465 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3466 } 3467 } else { 3468 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3469 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3470 } 3471 } 3472 3473 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3474 3475 bool 3476 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3477 { 3478 if (range1->length == 0 || range2->length == 0) { 3479 return false; 3480 } 3481 3482 if (range1->offset + range1->length <= range2->offset) { 3483 return false; 3484 } 3485 3486 if (range2->offset + range2->length <= range1->offset) { 3487 return false; 3488 } 3489 3490 return true; 3491 } 3492 3493 static bool 3494 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3495 { 3496 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3497 struct lba_range r; 3498 3499 switch (bdev_io->type) { 3500 case SPDK_BDEV_IO_TYPE_NVME_IO: 3501 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3502 /* Don't try to decode the NVMe command - just assume worst-case and that 3503 * it overlaps a locked range. 3504 */ 3505 return true; 3506 case SPDK_BDEV_IO_TYPE_READ: 3507 if (!range->quiesce) { 3508 return false; 3509 } 3510 /* fallthrough */ 3511 case SPDK_BDEV_IO_TYPE_WRITE: 3512 case SPDK_BDEV_IO_TYPE_UNMAP: 3513 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3514 case SPDK_BDEV_IO_TYPE_ZCOPY: 3515 case SPDK_BDEV_IO_TYPE_COPY: 3516 r.offset = bdev_io->u.bdev.offset_blocks; 3517 r.length = bdev_io->u.bdev.num_blocks; 3518 if (!bdev_lba_range_overlapped(range, &r)) { 3519 /* This I/O doesn't overlap the specified LBA range. */ 3520 return false; 3521 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3522 /* This I/O overlaps, but the I/O is on the same channel that locked this 3523 * range, and the caller_ctx is the same as the locked_ctx. This means 3524 * that this I/O is associated with the lock, and is allowed to execute. 
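	 * (Typically this is I/O issued by the module that requested the lock itself,
	 * which must still be able to access the range it quiesced.)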
3525 */ 3526 return false; 3527 } else { 3528 return true; 3529 } 3530 default: 3531 return false; 3532 } 3533 } 3534 3535 void 3536 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3537 { 3538 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3539 3540 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3541 3542 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3543 struct lba_range *range; 3544 3545 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3546 if (bdev_io_range_is_locked(bdev_io, range)) { 3547 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3548 return; 3549 } 3550 } 3551 } 3552 3553 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 3554 3555 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3556 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 3557 ch->trace_id, bdev_io->u.bdev.num_blocks, 3558 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3559 bdev_io->u.bdev.offset_blocks); 3560 3561 if (bdev_io->internal.split) { 3562 bdev_io_split(bdev_io); 3563 return; 3564 } 3565 3566 _bdev_io_submit(bdev_io); 3567 } 3568 3569 static inline void 3570 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3571 { 3572 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 3573 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3574 * For write operation we need to pull buffers from memory domain before submitting IO. 3575 * Once read operation completes, we need to use memory_domain push functionality to 3576 * update data in original memory domain IO buffer 3577 * This IO request will go through a regular IO flow, so clear memory domains pointers */ 3578 bdev_io->u.bdev.memory_domain = NULL; 3579 bdev_io->u.bdev.memory_domain_ctx = NULL; 3580 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3581 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3582 } 3583 3584 static inline void 3585 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3586 { 3587 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3588 bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io); 3589 3590 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3591 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3592 bdev_io_complete_unsubmitted(bdev_io); 3593 return; 3594 } 3595 3596 /* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does 3597 * support them, but we need to execute an accel sequence and the data buffer is from accel 3598 * memory domain (to avoid doing a push/pull from that domain). 
3599 */ 3600 if ((bdev_io->internal.memory_domain && !desc->memory_domains_supported) || 3601 (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3602 _bdev_io_ext_use_bounce_buffer(bdev_io); 3603 return; 3604 } 3605 3606 if (needs_exec) { 3607 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3608 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3609 return; 3610 } 3611 /* For reads we'll execute the sequence after the data is read, so, for now, only 3612 * clear out accel_sequence pointer and submit the IO */ 3613 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3614 bdev_io->u.bdev.accel_sequence = NULL; 3615 } 3616 3617 bdev_io_submit(bdev_io); 3618 } 3619 3620 static void 3621 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3622 { 3623 struct spdk_bdev *bdev = bdev_io->bdev; 3624 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3625 struct spdk_io_channel *ch = bdev_ch->channel; 3626 3627 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3628 3629 bdev_io->internal.in_submit_request = true; 3630 bdev_submit_request(bdev, ch, bdev_io); 3631 bdev_io->internal.in_submit_request = false; 3632 } 3633 3634 void 3635 bdev_io_init(struct spdk_bdev_io *bdev_io, 3636 struct spdk_bdev *bdev, void *cb_arg, 3637 spdk_bdev_io_completion_cb cb) 3638 { 3639 bdev_io->bdev = bdev; 3640 bdev_io->internal.caller_ctx = cb_arg; 3641 bdev_io->internal.cb = cb; 3642 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3643 bdev_io->internal.in_submit_request = false; 3644 bdev_io->internal.buf = NULL; 3645 bdev_io->internal.orig_iovs = NULL; 3646 bdev_io->internal.orig_iovcnt = 0; 3647 bdev_io->internal.orig_md_iov.iov_base = NULL; 3648 bdev_io->internal.error.nvme.cdw0 = 0; 3649 bdev_io->num_retries = 0; 3650 bdev_io->internal.get_buf_cb = NULL; 3651 bdev_io->internal.get_aux_buf_cb = NULL; 3652 bdev_io->internal.memory_domain = NULL; 3653 bdev_io->internal.memory_domain_ctx = NULL; 3654 bdev_io->internal.data_transfer_cpl = NULL; 3655 bdev_io->internal.split = bdev_io_should_split(bdev_io); 3656 bdev_io->internal.accel_sequence = NULL; 3657 bdev_io->internal.has_accel_sequence = false; 3658 } 3659 3660 static bool 3661 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3662 { 3663 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3664 } 3665 3666 bool 3667 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3668 { 3669 bool supported; 3670 3671 supported = bdev_io_type_supported(bdev, io_type); 3672 3673 if (!supported) { 3674 switch (io_type) { 3675 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3676 /* The bdev layer will emulate write zeroes as long as write is supported. 
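	 * The generic bdev code performs that emulation by issuing regular WRITEs of
	 * the shared zero buffer allocated in spdk_bdev_initialize(), so callers may
	 * treat WRITE_ZEROES as usable whenever WRITE is supported.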
*/
3677 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE);
3678 break;
3679 default:
3680 break;
3681 }
3682 }
3683
3684 return supported;
3685 }
3686
3687 uint64_t
3688 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io)
3689 {
3690 return bdev_io->internal.submit_tsc;
3691 }
3692
3693 int
3694 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
3695 {
3696 if (bdev->fn_table->dump_info_json) {
3697 return bdev->fn_table->dump_info_json(bdev->ctxt, w);
3698 }
3699
3700 return 0;
3701 }
3702
3703 static void
3704 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos)
3705 {
3706 uint32_t max_per_timeslice = 0;
3707 int i;
3708
3709 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3710 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) {
3711 qos->rate_limits[i].max_per_timeslice = 0;
3712 continue;
3713 }
3714
3715 max_per_timeslice = qos->rate_limits[i].limit *
3716 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC;
3717
3718 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice,
3719 qos->rate_limits[i].min_per_timeslice);
3720
3721 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice,
3722 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELEASE);
3723 }
3724
3725 bdev_qos_set_ops(qos);
3726 }
3727
3728 static void
3729 bdev_channel_submit_qos_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
3730 struct spdk_io_channel *io_ch, void *ctx)
3731 {
3732 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
3733 int status;
3734
3735 bdev_qos_io_submit(bdev_ch, bdev->internal.qos);
3736
3737 /* If all I/Os were sent then continue the iteration, otherwise stop it. */
3738 /* TODO: round-robin across channels */
3739 status = TAILQ_EMPTY(&bdev_ch->qos_queued_io) ? 0 : 1;
3740
3741 spdk_bdev_for_each_channel_continue(i, status);
3742 }
3743
3744
3745 static void
3746 bdev_channel_submit_qos_io_done(struct spdk_bdev *bdev, void *ctx, int status)
3747 {
3748
3749 }
3750
3751 static int
3752 bdev_channel_poll_qos(void *arg)
3753 {
3754 struct spdk_bdev *bdev = arg;
3755 struct spdk_bdev_qos *qos = bdev->internal.qos;
3756 uint64_t now = spdk_get_ticks();
3757 int i;
3758 int64_t remaining_last_timeslice;
3759
3760 if (now < (qos->last_timeslice + qos->timeslice_size)) {
3761 /* We received our callback earlier than expected - return
3762 * immediately and wait to do accounting until at least one
3763 * timeslice has actually expired. This should never happen
3764 * with a well-behaved timer implementation.
3765 */
3766 return SPDK_POLLER_IDLE;
3767 }
3768
3769 /* Reset for next round of rate limiting */
3770 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
3771 /* We may have allowed the IOs or bytes to slightly overrun in the last
3772 * timeslice. remaining_this_timeslice is signed, so if it's negative
3773 * here, we'll account for the overrun so that the next timeslice will
3774 * be appropriately reduced.
3775 */
3776 remaining_last_timeslice = __atomic_exchange_n(&qos->rate_limits[i].remaining_this_timeslice,
3777 0, __ATOMIC_RELAXED);
3778 if (remaining_last_timeslice < 0) {
3779 /* There could be a race condition here as both bdev_qos_rw_queue_io() and bdev_channel_poll_qos()
3780 * potentially use 2 atomic ops each, so they can intertwine.
3781 * This race can potentially cause the limits to be a little fuzzy but won't cause any real damage.
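 * For example, an I/O admitted between the exchange above and the store below is charged against a
 * budget that is about to be replaced, so a timeslice may occasionally admit slightly more or less
 * than configured.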
3782 */ 3783 __atomic_store_n(&qos->rate_limits[i].remaining_this_timeslice, 3784 remaining_last_timeslice, __ATOMIC_RELAXED); 3785 } 3786 } 3787 3788 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3789 qos->last_timeslice += qos->timeslice_size; 3790 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3791 __atomic_add_fetch(&qos->rate_limits[i].remaining_this_timeslice, 3792 qos->rate_limits[i].max_per_timeslice, __ATOMIC_RELAXED); 3793 } 3794 } 3795 3796 spdk_bdev_for_each_channel(bdev, bdev_channel_submit_qos_io, qos, 3797 bdev_channel_submit_qos_io_done); 3798 3799 return SPDK_POLLER_BUSY; 3800 } 3801 3802 static void 3803 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3804 { 3805 struct spdk_bdev_shared_resource *shared_resource; 3806 struct lba_range *range; 3807 3808 bdev_free_io_stat(ch->stat); 3809 #ifdef SPDK_CONFIG_VTUNE 3810 bdev_free_io_stat(ch->prev_stat); 3811 #endif 3812 3813 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3814 range = TAILQ_FIRST(&ch->locked_ranges); 3815 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3816 free(range); 3817 } 3818 3819 spdk_put_io_channel(ch->channel); 3820 spdk_put_io_channel(ch->accel_channel); 3821 3822 shared_resource = ch->shared_resource; 3823 3824 assert(TAILQ_EMPTY(&ch->io_locked)); 3825 assert(TAILQ_EMPTY(&ch->io_submitted)); 3826 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 3827 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 3828 assert(ch->io_outstanding == 0); 3829 assert(shared_resource->ref > 0); 3830 shared_resource->ref--; 3831 if (shared_resource->ref == 0) { 3832 assert(shared_resource->io_outstanding == 0); 3833 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3834 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3835 spdk_poller_unregister(&shared_resource->nomem_poller); 3836 free(shared_resource); 3837 } 3838 } 3839 3840 static void 3841 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3842 { 3843 struct spdk_bdev_qos *qos = bdev->internal.qos; 3844 int i; 3845 3846 assert(spdk_spin_held(&bdev->internal.spinlock)); 3847 3848 /* Rate limiting on this bdev enabled */ 3849 if (qos) { 3850 if (qos->ch == NULL) { 3851 struct spdk_io_channel *io_ch; 3852 3853 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3854 bdev->name, spdk_get_thread()); 3855 3856 /* No qos channel has been selected, so set one up */ 3857 3858 /* Take another reference to ch */ 3859 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3860 assert(io_ch != NULL); 3861 qos->ch = ch; 3862 3863 qos->thread = spdk_io_channel_get_thread(io_ch); 3864 3865 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3866 if (bdev_qos_is_iops_rate_limit(i) == true) { 3867 qos->rate_limits[i].min_per_timeslice = 3868 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3869 } else { 3870 qos->rate_limits[i].min_per_timeslice = 3871 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3872 } 3873 3874 if (qos->rate_limits[i].limit == 0) { 3875 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3876 } 3877 } 3878 bdev_qos_update_max_quota_per_timeslice(qos); 3879 qos->timeslice_size = 3880 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3881 qos->last_timeslice = spdk_get_ticks(); 3882 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3883 bdev, 3884 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3885 } 3886 3887 ch->flags |= BDEV_CH_QOS_ENABLED; 3888 } 3889 } 3890 3891 struct poll_timeout_ctx { 3892 struct spdk_bdev_desc 
*desc; 3893 uint64_t timeout_in_sec; 3894 spdk_bdev_io_timeout_cb cb_fn; 3895 void *cb_arg; 3896 }; 3897 3898 static void 3899 bdev_desc_free(struct spdk_bdev_desc *desc) 3900 { 3901 spdk_spin_destroy(&desc->spinlock); 3902 free(desc->media_events_buffer); 3903 free(desc); 3904 } 3905 3906 static void 3907 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 3908 { 3909 struct poll_timeout_ctx *ctx = _ctx; 3910 struct spdk_bdev_desc *desc = ctx->desc; 3911 3912 free(ctx); 3913 3914 spdk_spin_lock(&desc->spinlock); 3915 desc->refs--; 3916 if (desc->closed == true && desc->refs == 0) { 3917 spdk_spin_unlock(&desc->spinlock); 3918 bdev_desc_free(desc); 3919 return; 3920 } 3921 spdk_spin_unlock(&desc->spinlock); 3922 } 3923 3924 static void 3925 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3926 struct spdk_io_channel *io_ch, void *_ctx) 3927 { 3928 struct poll_timeout_ctx *ctx = _ctx; 3929 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3930 struct spdk_bdev_desc *desc = ctx->desc; 3931 struct spdk_bdev_io *bdev_io; 3932 uint64_t now; 3933 3934 spdk_spin_lock(&desc->spinlock); 3935 if (desc->closed == true) { 3936 spdk_spin_unlock(&desc->spinlock); 3937 spdk_bdev_for_each_channel_continue(i, -1); 3938 return; 3939 } 3940 spdk_spin_unlock(&desc->spinlock); 3941 3942 now = spdk_get_ticks(); 3943 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3944 /* Exclude any I/O that are generated via splitting. */ 3945 if (bdev_io->internal.cb == bdev_io_split_done) { 3946 continue; 3947 } 3948 3949 /* Once we find an I/O that has not timed out, we can immediately 3950 * exit the loop. 3951 */ 3952 if (now < (bdev_io->internal.submit_tsc + 3953 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3954 goto end; 3955 } 3956 3957 if (bdev_io->internal.desc == desc) { 3958 ctx->cb_fn(ctx->cb_arg, bdev_io); 3959 } 3960 } 3961 3962 end: 3963 spdk_bdev_for_each_channel_continue(i, 0); 3964 } 3965 3966 static int 3967 bdev_poll_timeout_io(void *arg) 3968 { 3969 struct spdk_bdev_desc *desc = arg; 3970 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3971 struct poll_timeout_ctx *ctx; 3972 3973 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3974 if (!ctx) { 3975 SPDK_ERRLOG("failed to allocate memory\n"); 3976 return SPDK_POLLER_BUSY; 3977 } 3978 ctx->desc = desc; 3979 ctx->cb_arg = desc->cb_arg; 3980 ctx->cb_fn = desc->cb_fn; 3981 ctx->timeout_in_sec = desc->timeout_in_sec; 3982 3983 /* Take a ref on the descriptor in case it gets closed while we are checking 3984 * all of the channels. 
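 * The reference is released in bdev_channel_poll_timeout_io_done(), which also frees the
 * descriptor if it was closed while the per-channel iteration was in flight.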
3985 */ 3986 spdk_spin_lock(&desc->spinlock); 3987 desc->refs++; 3988 spdk_spin_unlock(&desc->spinlock); 3989 3990 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 3991 bdev_channel_poll_timeout_io_done); 3992 3993 return SPDK_POLLER_BUSY; 3994 } 3995 3996 int 3997 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3998 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3999 { 4000 assert(desc->thread == spdk_get_thread()); 4001 4002 spdk_poller_unregister(&desc->io_timeout_poller); 4003 4004 if (timeout_in_sec) { 4005 assert(cb_fn != NULL); 4006 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 4007 desc, 4008 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 4009 1000); 4010 if (desc->io_timeout_poller == NULL) { 4011 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 4012 return -1; 4013 } 4014 } 4015 4016 desc->cb_fn = cb_fn; 4017 desc->cb_arg = cb_arg; 4018 desc->timeout_in_sec = timeout_in_sec; 4019 4020 return 0; 4021 } 4022 4023 static int 4024 bdev_channel_create(void *io_device, void *ctx_buf) 4025 { 4026 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 4027 struct spdk_bdev_channel *ch = ctx_buf; 4028 struct spdk_io_channel *mgmt_io_ch; 4029 struct spdk_bdev_mgmt_channel *mgmt_ch; 4030 struct spdk_bdev_shared_resource *shared_resource; 4031 struct lba_range *range; 4032 4033 ch->bdev = bdev; 4034 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 4035 if (!ch->channel) { 4036 return -1; 4037 } 4038 4039 ch->accel_channel = spdk_accel_get_io_channel(); 4040 if (!ch->accel_channel) { 4041 spdk_put_io_channel(ch->channel); 4042 return -1; 4043 } 4044 4045 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, bdev->internal.trace_id, 0, 0, 4046 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4047 4048 assert(ch->histogram == NULL); 4049 if (bdev->internal.histogram_enabled) { 4050 ch->histogram = spdk_histogram_data_alloc(); 4051 if (ch->histogram == NULL) { 4052 SPDK_ERRLOG("Could not allocate histogram\n"); 4053 } 4054 } 4055 4056 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 4057 if (!mgmt_io_ch) { 4058 spdk_put_io_channel(ch->channel); 4059 spdk_put_io_channel(ch->accel_channel); 4060 return -1; 4061 } 4062 4063 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 4064 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 4065 if (shared_resource->shared_ch == ch->channel) { 4066 spdk_put_io_channel(mgmt_io_ch); 4067 shared_resource->ref++; 4068 break; 4069 } 4070 } 4071 4072 if (shared_resource == NULL) { 4073 shared_resource = calloc(1, sizeof(*shared_resource)); 4074 if (shared_resource == NULL) { 4075 spdk_put_io_channel(ch->channel); 4076 spdk_put_io_channel(ch->accel_channel); 4077 spdk_put_io_channel(mgmt_io_ch); 4078 return -1; 4079 } 4080 4081 shared_resource->mgmt_ch = mgmt_ch; 4082 shared_resource->io_outstanding = 0; 4083 TAILQ_INIT(&shared_resource->nomem_io); 4084 shared_resource->nomem_threshold = 0; 4085 shared_resource->shared_ch = ch->channel; 4086 shared_resource->ref = 1; 4087 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 4088 } 4089 4090 ch->io_outstanding = 0; 4091 TAILQ_INIT(&ch->queued_resets); 4092 TAILQ_INIT(&ch->locked_ranges); 4093 TAILQ_INIT(&ch->qos_queued_io); 4094 ch->flags = 0; 4095 ch->trace_id = bdev->internal.trace_id; 4096 ch->shared_resource = shared_resource; 4097 4098 TAILQ_INIT(&ch->io_submitted); 4099 TAILQ_INIT(&ch->io_locked); 4100 TAILQ_INIT(&ch->io_accel_exec); 4101 TAILQ_INIT(&ch->io_memory_domain); 4102 4103 
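/* Allocate the per-channel I/O statistics. They are folded into the bdev-wide totals in
 * bdev_channel_destroy() so the counts are not lost when this channel goes away. */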
ch->stat = bdev_alloc_io_stat(false); 4104 if (ch->stat == NULL) { 4105 bdev_channel_destroy_resource(ch); 4106 return -1; 4107 } 4108 4109 ch->stat->ticks_rate = spdk_get_ticks_hz(); 4110 4111 #ifdef SPDK_CONFIG_VTUNE 4112 { 4113 char *name; 4114 __itt_init_ittlib(NULL, 0); 4115 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 4116 if (!name) { 4117 bdev_channel_destroy_resource(ch); 4118 return -1; 4119 } 4120 ch->handle = __itt_string_handle_create(name); 4121 free(name); 4122 ch->start_tsc = spdk_get_ticks(); 4123 ch->interval_tsc = spdk_get_ticks_hz() / 100; 4124 ch->prev_stat = bdev_alloc_io_stat(false); 4125 if (ch->prev_stat == NULL) { 4126 bdev_channel_destroy_resource(ch); 4127 return -1; 4128 } 4129 } 4130 #endif 4131 4132 spdk_spin_lock(&bdev->internal.spinlock); 4133 bdev_enable_qos(bdev, ch); 4134 4135 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 4136 struct lba_range *new_range; 4137 4138 new_range = calloc(1, sizeof(*new_range)); 4139 if (new_range == NULL) { 4140 spdk_spin_unlock(&bdev->internal.spinlock); 4141 bdev_channel_destroy_resource(ch); 4142 return -1; 4143 } 4144 new_range->length = range->length; 4145 new_range->offset = range->offset; 4146 new_range->locked_ctx = range->locked_ctx; 4147 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 4148 } 4149 4150 spdk_spin_unlock(&bdev->internal.spinlock); 4151 4152 return 0; 4153 } 4154 4155 static int 4156 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 4157 void *cb_ctx) 4158 { 4159 struct spdk_bdev_channel *bdev_ch = cb_ctx; 4160 struct spdk_bdev_io *bdev_io; 4161 uint64_t buf_len; 4162 4163 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4164 if (bdev_io->internal.ch == bdev_ch) { 4165 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4166 spdk_iobuf_entry_abort(ch, entry, buf_len); 4167 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4168 } 4169 4170 return 0; 4171 } 4172 4173 /* 4174 * Abort I/O that are waiting on a data buffer. 4175 */ 4176 static void 4177 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4178 { 4179 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4180 bdev_abort_all_buf_io_cb, ch); 4181 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4182 bdev_abort_all_buf_io_cb, ch); 4183 } 4184 4185 /* 4186 * Abort I/O that are queued waiting for submission. These types of I/O are 4187 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4188 */ 4189 static void 4190 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4191 { 4192 struct spdk_bdev_io *bdev_io, *tmp; 4193 4194 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4195 if (bdev_io->internal.ch == ch) { 4196 TAILQ_REMOVE(queue, bdev_io, internal.link); 4197 /* 4198 * spdk_bdev_io_complete() assumes that the completed I/O had 4199 * been submitted to the bdev module. Since in this case it 4200 * hadn't, bump io_outstanding to account for the decrement 4201 * that spdk_bdev_io_complete() will do. 
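 * Reset I/Os are skipped below because they are never counted in io_outstanding (see
 * bdev_io_submit_reset()).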
4202 */ 4203 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4204 bdev_io_increment_outstanding(ch, ch->shared_resource); 4205 } 4206 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4207 } 4208 } 4209 } 4210 4211 static bool 4212 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4213 { 4214 struct spdk_bdev_io *bdev_io; 4215 4216 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4217 if (bdev_io == bio_to_abort) { 4218 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4219 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4220 return true; 4221 } 4222 } 4223 4224 return false; 4225 } 4226 4227 static int 4228 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4229 { 4230 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4231 uint64_t buf_len; 4232 4233 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4234 if (bdev_io == bio_to_abort) { 4235 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4236 spdk_iobuf_entry_abort(ch, entry, buf_len); 4237 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4238 return 1; 4239 } 4240 4241 return 0; 4242 } 4243 4244 static bool 4245 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4246 { 4247 int rc; 4248 4249 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4250 bdev_abort_buf_io_cb, bio_to_abort); 4251 if (rc == 1) { 4252 return true; 4253 } 4254 4255 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4256 bdev_abort_buf_io_cb, bio_to_abort); 4257 return rc == 1; 4258 } 4259 4260 static void 4261 bdev_qos_channel_destroy(void *cb_arg) 4262 { 4263 struct spdk_bdev_qos *qos = cb_arg; 4264 4265 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4266 spdk_poller_unregister(&qos->poller); 4267 4268 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4269 4270 free(qos); 4271 } 4272 4273 static int 4274 bdev_qos_destroy(struct spdk_bdev *bdev) 4275 { 4276 int i; 4277 4278 /* 4279 * Cleanly shutting down the QoS poller is tricky, because 4280 * during the asynchronous operation the user could open 4281 * a new descriptor and create a new channel, spawning 4282 * a new QoS poller. 4283 * 4284 * The strategy is to create a new QoS structure here and swap it 4285 * in. The shutdown path then continues to refer to the old one 4286 * until it completes and then releases it. 4287 */ 4288 struct spdk_bdev_qos *new_qos, *old_qos; 4289 4290 old_qos = bdev->internal.qos; 4291 4292 new_qos = calloc(1, sizeof(*new_qos)); 4293 if (!new_qos) { 4294 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4295 return -ENOMEM; 4296 } 4297 4298 /* Copy the old QoS data into the newly allocated structure */ 4299 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4300 4301 /* Zero out the key parts of the QoS structure */ 4302 new_qos->ch = NULL; 4303 new_qos->thread = NULL; 4304 new_qos->poller = NULL; 4305 /* 4306 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4307 * It will be used later for the new QoS structure. 
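 * (The memcpy() above already carried the configured limits over to the new structure.)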
4308 */ 4309 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4310 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4311 new_qos->rate_limits[i].min_per_timeslice = 0; 4312 new_qos->rate_limits[i].max_per_timeslice = 0; 4313 } 4314 4315 bdev->internal.qos = new_qos; 4316 4317 if (old_qos->thread == NULL) { 4318 free(old_qos); 4319 } else { 4320 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4321 } 4322 4323 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4324 * been destroyed yet. The destruction path will end up waiting for the final 4325 * channel to be put before it releases resources. */ 4326 4327 return 0; 4328 } 4329 4330 void 4331 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4332 { 4333 total->bytes_read += add->bytes_read; 4334 total->num_read_ops += add->num_read_ops; 4335 total->bytes_written += add->bytes_written; 4336 total->num_write_ops += add->num_write_ops; 4337 total->bytes_unmapped += add->bytes_unmapped; 4338 total->num_unmap_ops += add->num_unmap_ops; 4339 total->bytes_copied += add->bytes_copied; 4340 total->num_copy_ops += add->num_copy_ops; 4341 total->read_latency_ticks += add->read_latency_ticks; 4342 total->write_latency_ticks += add->write_latency_ticks; 4343 total->unmap_latency_ticks += add->unmap_latency_ticks; 4344 total->copy_latency_ticks += add->copy_latency_ticks; 4345 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4346 total->max_read_latency_ticks = add->max_read_latency_ticks; 4347 } 4348 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4349 total->min_read_latency_ticks = add->min_read_latency_ticks; 4350 } 4351 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4352 total->max_write_latency_ticks = add->max_write_latency_ticks; 4353 } 4354 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4355 total->min_write_latency_ticks = add->min_write_latency_ticks; 4356 } 4357 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4358 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4359 } 4360 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4361 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4362 } 4363 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4364 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4365 } 4366 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4367 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4368 } 4369 } 4370 4371 static void 4372 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4373 { 4374 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4375 4376 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4377 memcpy(to_stat->io_error, from_stat->io_error, 4378 sizeof(struct spdk_bdev_io_error_stat)); 4379 } 4380 } 4381 4382 void 4383 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4384 { 4385 stat->max_read_latency_ticks = 0; 4386 stat->min_read_latency_ticks = UINT64_MAX; 4387 stat->max_write_latency_ticks = 0; 4388 stat->min_write_latency_ticks = UINT64_MAX; 4389 stat->max_unmap_latency_ticks = 0; 4390 stat->min_unmap_latency_ticks = UINT64_MAX; 4391 stat->max_copy_latency_ticks = 0; 4392 stat->min_copy_latency_ticks = UINT64_MAX; 4393 4394 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4395 return; 4396 } 
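/* SPDK_BDEV_RESET_STAT_ALL additionally clears the cumulative byte/op/latency counters and the
 * per-error-status counts below. */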
4397 4398 stat->bytes_read = 0; 4399 stat->num_read_ops = 0; 4400 stat->bytes_written = 0; 4401 stat->num_write_ops = 0; 4402 stat->bytes_unmapped = 0; 4403 stat->num_unmap_ops = 0; 4404 stat->bytes_copied = 0; 4405 stat->num_copy_ops = 0; 4406 stat->read_latency_ticks = 0; 4407 stat->write_latency_ticks = 0; 4408 stat->unmap_latency_ticks = 0; 4409 stat->copy_latency_ticks = 0; 4410 4411 if (stat->io_error != NULL) { 4412 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 4413 } 4414 } 4415 4416 struct spdk_bdev_io_stat * 4417 bdev_alloc_io_stat(bool io_error_stat) 4418 { 4419 struct spdk_bdev_io_stat *stat; 4420 4421 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4422 if (stat == NULL) { 4423 return NULL; 4424 } 4425 4426 if (io_error_stat) { 4427 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4428 if (stat->io_error == NULL) { 4429 free(stat); 4430 return NULL; 4431 } 4432 } else { 4433 stat->io_error = NULL; 4434 } 4435 4436 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4437 4438 return stat; 4439 } 4440 4441 void 4442 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4443 { 4444 if (stat != NULL) { 4445 free(stat->io_error); 4446 free(stat); 4447 } 4448 } 4449 4450 void 4451 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4452 { 4453 int i; 4454 4455 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4456 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4457 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4458 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4459 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4460 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4461 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4462 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4463 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4464 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4465 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4466 stat->min_read_latency_ticks != UINT64_MAX ? 4467 stat->min_read_latency_ticks : 0); 4468 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4469 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4470 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4471 stat->min_write_latency_ticks != UINT64_MAX ? 4472 stat->min_write_latency_ticks : 0); 4473 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4474 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4475 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4476 stat->min_unmap_latency_ticks != UINT64_MAX ? 4477 stat->min_unmap_latency_ticks : 0); 4478 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4479 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4480 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4481 stat->min_copy_latency_ticks != UINT64_MAX ? 
4482 stat->min_copy_latency_ticks : 0); 4483 4484 if (stat->io_error != NULL) { 4485 spdk_json_write_named_object_begin(w, "io_error"); 4486 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4487 if (stat->io_error->error_status[i] != 0) { 4488 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4489 stat->io_error->error_status[i]); 4490 } 4491 } 4492 spdk_json_write_object_end(w); 4493 } 4494 } 4495 4496 static void 4497 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4498 { 4499 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4500 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4501 4502 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4503 bdev_abort_all_buf_io(mgmt_ch, ch); 4504 } 4505 4506 static void 4507 bdev_channel_destroy(void *io_device, void *ctx_buf) 4508 { 4509 struct spdk_bdev_channel *ch = ctx_buf; 4510 4511 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4512 spdk_get_thread()); 4513 4514 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, ch->bdev->internal.trace_id, 0, 0, 4515 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4516 4517 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4518 spdk_spin_lock(&ch->bdev->internal.spinlock); 4519 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4520 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4521 4522 bdev_abort_all_queued_io(&ch->queued_resets, ch); 4523 4524 bdev_channel_abort_queued_ios(ch); 4525 4526 if (ch->histogram) { 4527 spdk_histogram_data_free(ch->histogram); 4528 } 4529 4530 bdev_channel_destroy_resource(ch); 4531 } 4532 4533 /* 4534 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4535 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
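 * The duplicate check and the insertion therefore happen as a single step under
 * g_bdev_mgr.spinlock.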
4536 */ 4537 static int 4538 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4539 { 4540 struct spdk_bdev_name *tmp; 4541 4542 bdev_name->name = strdup(name); 4543 if (bdev_name->name == NULL) { 4544 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4545 return -ENOMEM; 4546 } 4547 4548 bdev_name->bdev = bdev; 4549 4550 spdk_spin_lock(&g_bdev_mgr.spinlock); 4551 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4552 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4553 4554 if (tmp != NULL) { 4555 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4556 free(bdev_name->name); 4557 return -EEXIST; 4558 } 4559 4560 return 0; 4561 } 4562 4563 static void 4564 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4565 { 4566 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4567 free(bdev_name->name); 4568 } 4569 4570 static void 4571 bdev_name_del(struct spdk_bdev_name *bdev_name) 4572 { 4573 spdk_spin_lock(&g_bdev_mgr.spinlock); 4574 bdev_name_del_unsafe(bdev_name); 4575 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4576 } 4577 4578 int 4579 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4580 { 4581 struct spdk_bdev_alias *tmp; 4582 int ret; 4583 4584 if (alias == NULL) { 4585 SPDK_ERRLOG("Empty alias passed\n"); 4586 return -EINVAL; 4587 } 4588 4589 tmp = calloc(1, sizeof(*tmp)); 4590 if (tmp == NULL) { 4591 SPDK_ERRLOG("Unable to allocate alias\n"); 4592 return -ENOMEM; 4593 } 4594 4595 ret = bdev_name_add(&tmp->alias, bdev, alias); 4596 if (ret != 0) { 4597 free(tmp); 4598 return ret; 4599 } 4600 4601 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4602 4603 return 0; 4604 } 4605 4606 static int 4607 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4608 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4609 { 4610 struct spdk_bdev_alias *tmp; 4611 4612 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4613 if (strcmp(alias, tmp->alias.name) == 0) { 4614 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4615 alias_del_fn(&tmp->alias); 4616 free(tmp); 4617 return 0; 4618 } 4619 } 4620 4621 return -ENOENT; 4622 } 4623 4624 int 4625 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4626 { 4627 int rc; 4628 4629 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4630 if (rc == -ENOENT) { 4631 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4632 } 4633 4634 return rc; 4635 } 4636 4637 void 4638 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4639 { 4640 struct spdk_bdev_alias *p, *tmp; 4641 4642 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4643 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4644 bdev_name_del(&p->alias); 4645 free(p); 4646 } 4647 } 4648 4649 struct spdk_io_channel * 4650 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4651 { 4652 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4653 } 4654 4655 void * 4656 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4657 { 4658 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4659 void *ctx = NULL; 4660 4661 if (bdev->fn_table->get_module_ctx) { 4662 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4663 } 4664 4665 return ctx; 4666 } 4667 4668 const char * 4669 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4670 { 4671 return bdev->module->name; 4672 } 4673 4674 const char * 4675 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4676 { 4677 return bdev->name; 4678 } 4679 4680 const char * 4681 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4682 { 4683 return bdev->product_name; 4684 } 4685 4686 
const struct spdk_bdev_aliases_list * 4687 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4688 { 4689 return &bdev->aliases; 4690 } 4691 4692 uint32_t 4693 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4694 { 4695 return bdev->blocklen; 4696 } 4697 4698 uint32_t 4699 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4700 { 4701 return bdev->write_unit_size; 4702 } 4703 4704 uint64_t 4705 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4706 { 4707 return bdev->blockcnt; 4708 } 4709 4710 const char * 4711 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4712 { 4713 return qos_rpc_type[type]; 4714 } 4715 4716 void 4717 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4718 { 4719 int i; 4720 4721 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4722 4723 spdk_spin_lock(&bdev->internal.spinlock); 4724 if (bdev->internal.qos) { 4725 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4726 if (bdev->internal.qos->rate_limits[i].limit != 4727 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4728 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4729 if (bdev_qos_is_iops_rate_limit(i) == false) { 4730 /* Change from Byte to Megabyte which is user visible. */ 4731 limits[i] = limits[i] / 1024 / 1024; 4732 } 4733 } 4734 } 4735 } 4736 spdk_spin_unlock(&bdev->internal.spinlock); 4737 } 4738 4739 size_t 4740 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4741 { 4742 return 1 << bdev->required_alignment; 4743 } 4744 4745 uint32_t 4746 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4747 { 4748 return bdev->optimal_io_boundary; 4749 } 4750 4751 bool 4752 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4753 { 4754 return bdev->write_cache; 4755 } 4756 4757 const struct spdk_uuid * 4758 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4759 { 4760 return &bdev->uuid; 4761 } 4762 4763 uint16_t 4764 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4765 { 4766 return bdev->acwu; 4767 } 4768 4769 uint32_t 4770 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4771 { 4772 return bdev->md_len; 4773 } 4774 4775 bool 4776 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4777 { 4778 return (bdev->md_len != 0) && bdev->md_interleave; 4779 } 4780 4781 bool 4782 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4783 { 4784 return (bdev->md_len != 0) && !bdev->md_interleave; 4785 } 4786 4787 bool 4788 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4789 { 4790 return bdev->zoned; 4791 } 4792 4793 uint32_t 4794 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4795 { 4796 if (spdk_bdev_is_md_interleaved(bdev)) { 4797 return bdev->blocklen - bdev->md_len; 4798 } else { 4799 return bdev->blocklen; 4800 } 4801 } 4802 4803 uint32_t 4804 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4805 { 4806 return bdev->phys_blocklen; 4807 } 4808 4809 static uint32_t 4810 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4811 { 4812 if (!spdk_bdev_is_md_interleaved(bdev)) { 4813 return bdev->blocklen + bdev->md_len; 4814 } else { 4815 return bdev->blocklen; 4816 } 4817 } 4818 4819 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4820 typedef enum spdk_dif_type spdk_dif_type_t; 4821 4822 spdk_dif_type_t 4823 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4824 { 4825 if (bdev->md_len != 0) { 4826 return bdev->dif_type; 4827 } else { 4828 return SPDK_DIF_DISABLE; 4829 } 4830 } 4831 4832 bool 4833 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4834 { 4835 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4836 return bdev->dif_is_head_of_md; 4837 } else { 4838 return false; 4839 } 4840 } 4841 4842 bool 4843 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4844 enum spdk_dif_check_type check_type) 4845 { 4846 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4847 return false; 4848 } 4849 4850 switch (check_type) { 4851 case SPDK_DIF_CHECK_TYPE_REFTAG: 4852 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4853 case SPDK_DIF_CHECK_TYPE_APPTAG: 4854 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4855 case SPDK_DIF_CHECK_TYPE_GUARD: 4856 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4857 default: 4858 return false; 4859 } 4860 } 4861 4862 static uint32_t 4863 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 4864 { 4865 uint64_t aligned_length, max_write_blocks; 4866 4867 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 4868 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 4869 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 4870 4871 return max_write_blocks; 4872 } 4873 4874 uint32_t 4875 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4876 { 4877 return bdev->max_copy; 4878 } 4879 4880 uint64_t 4881 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4882 { 4883 return bdev->internal.measured_queue_depth; 4884 } 4885 4886 uint64_t 4887 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 4888 { 4889 return bdev->internal.period; 4890 } 4891 4892 uint64_t 4893 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 4894 { 4895 return bdev->internal.weighted_io_time; 4896 } 4897 4898 uint64_t 4899 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 4900 { 4901 return bdev->internal.io_time; 4902 } 4903 4904 union spdk_bdev_nvme_ctratt spdk_bdev_get_nvme_ctratt(struct spdk_bdev *bdev) 4905 { 4906 return bdev->ctratt; 4907 } 4908 4909 static void bdev_update_qd_sampling_period(void *ctx); 4910 4911 static void 4912 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 4913 { 4914 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 4915 4916 if (bdev->internal.measured_queue_depth) { 4917 bdev->internal.io_time += bdev->internal.period; 4918 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 4919 } 4920 4921 bdev->internal.qd_poll_in_progress = false; 4922 4923 bdev_update_qd_sampling_period(bdev); 4924 } 4925 4926 static void 4927 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4928 struct spdk_io_channel *io_ch, void *_ctx) 4929 { 4930 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 4931 4932 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4933 spdk_bdev_for_each_channel_continue(i, 0); 4934 } 4935 4936 static int 4937 bdev_calculate_measured_queue_depth(void *ctx) 4938 { 4939 struct spdk_bdev *bdev = ctx; 4940 4941 bdev->internal.qd_poll_in_progress = true; 4942 bdev->internal.temporary_queue_depth = 0; 4943 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, 
_calculate_measured_qd_cpl); 4944 return SPDK_POLLER_BUSY; 4945 } 4946 4947 static void 4948 bdev_update_qd_sampling_period(void *ctx) 4949 { 4950 struct spdk_bdev *bdev = ctx; 4951 4952 if (bdev->internal.period == bdev->internal.new_period) { 4953 return; 4954 } 4955 4956 if (bdev->internal.qd_poll_in_progress) { 4957 return; 4958 } 4959 4960 bdev->internal.period = bdev->internal.new_period; 4961 4962 spdk_poller_unregister(&bdev->internal.qd_poller); 4963 if (bdev->internal.period != 0) { 4964 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4965 bdev, bdev->internal.period); 4966 } else { 4967 spdk_bdev_close(bdev->internal.qd_desc); 4968 bdev->internal.qd_desc = NULL; 4969 } 4970 } 4971 4972 static void 4973 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4974 { 4975 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4976 } 4977 4978 void 4979 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4980 { 4981 int rc; 4982 4983 if (bdev->internal.new_period == period) { 4984 return; 4985 } 4986 4987 bdev->internal.new_period = period; 4988 4989 if (bdev->internal.qd_desc != NULL) { 4990 assert(bdev->internal.period != 0); 4991 4992 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4993 bdev_update_qd_sampling_period, bdev); 4994 return; 4995 } 4996 4997 assert(bdev->internal.period == 0); 4998 4999 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 5000 NULL, &bdev->internal.qd_desc); 5001 if (rc != 0) { 5002 return; 5003 } 5004 5005 bdev->internal.period = period; 5006 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 5007 bdev, period); 5008 } 5009 5010 struct bdev_get_current_qd_ctx { 5011 uint64_t current_qd; 5012 spdk_bdev_get_current_qd_cb cb_fn; 5013 void *cb_arg; 5014 }; 5015 5016 static void 5017 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 5018 { 5019 struct bdev_get_current_qd_ctx *ctx = _ctx; 5020 5021 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 5022 5023 free(ctx); 5024 } 5025 5026 static void 5027 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5028 struct spdk_io_channel *io_ch, void *_ctx) 5029 { 5030 struct bdev_get_current_qd_ctx *ctx = _ctx; 5031 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 5032 5033 ctx->current_qd += bdev_ch->io_outstanding; 5034 5035 spdk_bdev_for_each_channel_continue(i, 0); 5036 } 5037 5038 void 5039 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 5040 void *cb_arg) 5041 { 5042 struct bdev_get_current_qd_ctx *ctx; 5043 5044 assert(cb_fn != NULL); 5045 5046 ctx = calloc(1, sizeof(*ctx)); 5047 if (ctx == NULL) { 5048 cb_fn(bdev, 0, cb_arg, -ENOMEM); 5049 return; 5050 } 5051 5052 ctx->cb_fn = cb_fn; 5053 ctx->cb_arg = cb_arg; 5054 5055 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 5056 } 5057 5058 static void 5059 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 5060 { 5061 assert(desc->thread == spdk_get_thread()); 5062 5063 spdk_spin_lock(&desc->spinlock); 5064 desc->refs--; 5065 if (!desc->closed) { 5066 spdk_spin_unlock(&desc->spinlock); 5067 desc->callback.event_fn(type, 5068 desc->bdev, 5069 desc->callback.ctx); 5070 return; 5071 } else if (desc->refs == 0) { 5072 /* This descriptor was closed after this event_notify message was sent. 
5073 * spdk_bdev_close() could not free the descriptor since this message was 5074 * in flight, so we free it now using bdev_desc_free(). 5075 */ 5076 spdk_spin_unlock(&desc->spinlock); 5077 bdev_desc_free(desc); 5078 return; 5079 } 5080 spdk_spin_unlock(&desc->spinlock); 5081 } 5082 5083 static void 5084 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 5085 { 5086 spdk_spin_lock(&desc->spinlock); 5087 desc->refs++; 5088 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 5089 spdk_spin_unlock(&desc->spinlock); 5090 } 5091 5092 static void 5093 _resize_notify(void *ctx) 5094 { 5095 struct spdk_bdev_desc *desc = ctx; 5096 5097 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 5098 } 5099 5100 int 5101 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 5102 { 5103 struct spdk_bdev_desc *desc; 5104 int ret; 5105 5106 if (size == bdev->blockcnt) { 5107 return 0; 5108 } 5109 5110 spdk_spin_lock(&bdev->internal.spinlock); 5111 5112 /* bdev has open descriptors */ 5113 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 5114 bdev->blockcnt > size) { 5115 ret = -EBUSY; 5116 } else { 5117 bdev->blockcnt = size; 5118 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 5119 event_notify(desc, _resize_notify); 5120 } 5121 ret = 0; 5122 } 5123 5124 spdk_spin_unlock(&bdev->internal.spinlock); 5125 5126 return ret; 5127 } 5128 5129 /* 5130 * Convert I/O offset and length from bytes to blocks. 5131 * 5132 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 5133 */ 5134 static uint64_t 5135 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 5136 uint64_t num_bytes, uint64_t *num_blocks) 5137 { 5138 uint32_t block_size = bdev->blocklen; 5139 uint8_t shift_cnt; 5140 5141 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
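 * For example, with a 4096-byte (2^12) block size a 1 MiB offset becomes block 256 via the shift,
 * and the OR of the two remainders below is non-zero only if either byte value is not
 * block-aligned.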
*/ 5142 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 5143 shift_cnt = spdk_u32log2(block_size); 5144 *offset_blocks = offset_bytes >> shift_cnt; 5145 *num_blocks = num_bytes >> shift_cnt; 5146 return (offset_bytes - (*offset_blocks << shift_cnt)) | 5147 (num_bytes - (*num_blocks << shift_cnt)); 5148 } else { 5149 *offset_blocks = offset_bytes / block_size; 5150 *num_blocks = num_bytes / block_size; 5151 return (offset_bytes % block_size) | (num_bytes % block_size); 5152 } 5153 } 5154 5155 static bool 5156 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 5157 { 5158 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 5159 * has been an overflow and hence the offset has been wrapped around */ 5160 if (offset_blocks + num_blocks < offset_blocks) { 5161 return false; 5162 } 5163 5164 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5165 if (offset_blocks + num_blocks > bdev->blockcnt) { 5166 return false; 5167 } 5168 5169 return true; 5170 } 5171 5172 static void 5173 bdev_seek_complete_cb(void *ctx) 5174 { 5175 struct spdk_bdev_io *bdev_io = ctx; 5176 5177 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5178 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5179 } 5180 5181 static int 5182 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5183 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5184 spdk_bdev_io_completion_cb cb, void *cb_arg) 5185 { 5186 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5187 struct spdk_bdev_io *bdev_io; 5188 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5189 5190 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5191 5192 /* Check if offset_blocks is valid looking at the validity of one block */ 5193 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5194 return -EINVAL; 5195 } 5196 5197 bdev_io = bdev_channel_get_io(channel); 5198 if (!bdev_io) { 5199 return -ENOMEM; 5200 } 5201 5202 bdev_io->internal.ch = channel; 5203 bdev_io->internal.desc = desc; 5204 bdev_io->type = io_type; 5205 bdev_io->u.bdev.offset_blocks = offset_blocks; 5206 bdev_io->u.bdev.memory_domain = NULL; 5207 bdev_io->u.bdev.memory_domain_ctx = NULL; 5208 bdev_io->u.bdev.accel_sequence = NULL; 5209 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5210 5211 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5212 /* In case bdev doesn't support seek to next data/hole offset, 5213 * it is assumed that only data and no holes are present */ 5214 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5215 bdev_io->u.bdev.seek.offset = offset_blocks; 5216 } else { 5217 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5218 } 5219 5220 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5221 return 0; 5222 } 5223 5224 bdev_io_submit(bdev_io); 5225 return 0; 5226 } 5227 5228 int 5229 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5230 uint64_t offset_blocks, 5231 spdk_bdev_io_completion_cb cb, void *cb_arg) 5232 { 5233 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5234 } 5235 5236 int 5237 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5238 uint64_t offset_blocks, 5239 spdk_bdev_io_completion_cb cb, void *cb_arg) 5240 { 5241 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5242 } 5243 5244 uint64_t 5245 spdk_bdev_io_get_seek_offset(const struct 
spdk_bdev_io *bdev_io) 5246 { 5247 return bdev_io->u.bdev.seek.offset; 5248 } 5249 5250 static int 5251 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5252 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5253 spdk_bdev_io_completion_cb cb, void *cb_arg) 5254 { 5255 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5256 struct spdk_bdev_io *bdev_io; 5257 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5258 5259 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5260 return -EINVAL; 5261 } 5262 5263 bdev_io = bdev_channel_get_io(channel); 5264 if (!bdev_io) { 5265 return -ENOMEM; 5266 } 5267 5268 bdev_io->internal.ch = channel; 5269 bdev_io->internal.desc = desc; 5270 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5271 bdev_io->u.bdev.iovs = &bdev_io->iov; 5272 bdev_io->u.bdev.iovs[0].iov_base = buf; 5273 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5274 bdev_io->u.bdev.iovcnt = 1; 5275 bdev_io->u.bdev.md_buf = md_buf; 5276 bdev_io->u.bdev.num_blocks = num_blocks; 5277 bdev_io->u.bdev.offset_blocks = offset_blocks; 5278 bdev_io->u.bdev.memory_domain = NULL; 5279 bdev_io->u.bdev.memory_domain_ctx = NULL; 5280 bdev_io->u.bdev.accel_sequence = NULL; 5281 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5282 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5283 5284 bdev_io_submit(bdev_io); 5285 return 0; 5286 } 5287 5288 int 5289 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5290 void *buf, uint64_t offset, uint64_t nbytes, 5291 spdk_bdev_io_completion_cb cb, void *cb_arg) 5292 { 5293 uint64_t offset_blocks, num_blocks; 5294 5295 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5296 nbytes, &num_blocks) != 0) { 5297 return -EINVAL; 5298 } 5299 5300 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5301 } 5302 5303 int 5304 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5305 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5306 spdk_bdev_io_completion_cb cb, void *cb_arg) 5307 { 5308 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5309 } 5310 5311 int 5312 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5313 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5314 spdk_bdev_io_completion_cb cb, void *cb_arg) 5315 { 5316 struct iovec iov = { 5317 .iov_base = buf, 5318 }; 5319 5320 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5321 return -EINVAL; 5322 } 5323 5324 if (md_buf && !_is_buf_allocated(&iov)) { 5325 return -EINVAL; 5326 } 5327 5328 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5329 cb, cb_arg); 5330 } 5331 5332 int 5333 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5334 struct iovec *iov, int iovcnt, 5335 uint64_t offset, uint64_t nbytes, 5336 spdk_bdev_io_completion_cb cb, void *cb_arg) 5337 { 5338 uint64_t offset_blocks, num_blocks; 5339 5340 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5341 nbytes, &num_blocks) != 0) { 5342 return -EINVAL; 5343 } 5344 5345 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5346 } 5347 5348 static int 5349 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5350 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 
5351 uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx, 5352 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 5353 spdk_bdev_io_completion_cb cb, void *cb_arg) 5354 { 5355 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5356 struct spdk_bdev_io *bdev_io; 5357 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5358 5359 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 5360 return -EINVAL; 5361 } 5362 5363 bdev_io = bdev_channel_get_io(channel); 5364 if (spdk_unlikely(!bdev_io)) { 5365 return -ENOMEM; 5366 } 5367 5368 bdev_io->internal.ch = channel; 5369 bdev_io->internal.desc = desc; 5370 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5371 bdev_io->u.bdev.iovs = iov; 5372 bdev_io->u.bdev.iovcnt = iovcnt; 5373 bdev_io->u.bdev.md_buf = md_buf; 5374 bdev_io->u.bdev.num_blocks = num_blocks; 5375 bdev_io->u.bdev.offset_blocks = offset_blocks; 5376 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5377 bdev_io->internal.memory_domain = domain; 5378 bdev_io->internal.memory_domain_ctx = domain_ctx; 5379 bdev_io->internal.accel_sequence = seq; 5380 bdev_io->internal.has_accel_sequence = seq != NULL; 5381 bdev_io->u.bdev.memory_domain = domain; 5382 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5383 bdev_io->u.bdev.accel_sequence = seq; 5384 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 5385 5386 _bdev_io_submit_ext(desc, bdev_io); 5387 5388 return 0; 5389 } 5390 5391 int 5392 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5393 struct iovec *iov, int iovcnt, 5394 uint64_t offset_blocks, uint64_t num_blocks, 5395 spdk_bdev_io_completion_cb cb, void *cb_arg) 5396 { 5397 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5398 5399 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5400 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5401 } 5402 5403 int 5404 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5405 struct iovec *iov, int iovcnt, void *md_buf, 5406 uint64_t offset_blocks, uint64_t num_blocks, 5407 spdk_bdev_io_completion_cb cb, void *cb_arg) 5408 { 5409 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5410 5411 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 5412 return -EINVAL; 5413 } 5414 5415 if (md_buf && !_is_buf_allocated(iov)) { 5416 return -EINVAL; 5417 } 5418 5419 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5420 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, cb, cb_arg); 5421 } 5422 5423 static inline bool 5424 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 5425 { 5426 /* 5427 * We check if opts size is at least of size when we first introduced 5428 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 5429 * are not checked internal. 
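 * (i.e. opts->size must at least cover the metadata field; members beyond the caller's opts->size
 * are never read directly and are instead obtained via bdev_get_ext_io_opt(), which falls back to a
 * default value).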
5430 */ 5431 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5432 sizeof(opts->metadata) && 5433 opts->size <= sizeof(*opts) && 5434 /* When memory domain is used, the user must provide data buffers */ 5435 (!opts->memory_domain || (iov && iov[0].iov_base)); 5436 } 5437 5438 int 5439 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5440 struct iovec *iov, int iovcnt, 5441 uint64_t offset_blocks, uint64_t num_blocks, 5442 spdk_bdev_io_completion_cb cb, void *cb_arg, 5443 struct spdk_bdev_ext_io_opts *opts) 5444 { 5445 struct spdk_memory_domain *domain = NULL; 5446 struct spdk_accel_sequence *seq = NULL; 5447 void *domain_ctx = NULL, *md = NULL; 5448 uint32_t dif_check_flags = 0; 5449 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5450 5451 if (opts) { 5452 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5453 return -EINVAL; 5454 } 5455 5456 md = opts->metadata; 5457 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5458 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5459 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5460 if (md) { 5461 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5462 return -EINVAL; 5463 } 5464 5465 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5466 return -EINVAL; 5467 } 5468 5469 if (spdk_unlikely(seq != NULL)) { 5470 return -EINVAL; 5471 } 5472 } 5473 } 5474 5475 dif_check_flags = bdev->dif_check_flags & 5476 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5477 5478 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5479 num_blocks, domain, domain_ctx, seq, dif_check_flags, cb, cb_arg); 5480 } 5481 5482 static int 5483 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5484 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5485 spdk_bdev_io_completion_cb cb, void *cb_arg) 5486 { 5487 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5488 struct spdk_bdev_io *bdev_io; 5489 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5490 5491 if (!desc->write) { 5492 return -EBADF; 5493 } 5494 5495 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5496 return -EINVAL; 5497 } 5498 5499 bdev_io = bdev_channel_get_io(channel); 5500 if (!bdev_io) { 5501 return -ENOMEM; 5502 } 5503 5504 bdev_io->internal.ch = channel; 5505 bdev_io->internal.desc = desc; 5506 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5507 bdev_io->u.bdev.iovs = &bdev_io->iov; 5508 bdev_io->u.bdev.iovs[0].iov_base = buf; 5509 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5510 bdev_io->u.bdev.iovcnt = 1; 5511 bdev_io->u.bdev.md_buf = md_buf; 5512 bdev_io->u.bdev.num_blocks = num_blocks; 5513 bdev_io->u.bdev.offset_blocks = offset_blocks; 5514 bdev_io->u.bdev.memory_domain = NULL; 5515 bdev_io->u.bdev.memory_domain_ctx = NULL; 5516 bdev_io->u.bdev.accel_sequence = NULL; 5517 bdev_io->u.bdev.dif_check_flags = bdev->dif_check_flags; 5518 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5519 5520 bdev_io_submit(bdev_io); 5521 return 0; 5522 } 5523 5524 int 5525 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5526 void *buf, uint64_t offset, uint64_t nbytes, 5527 spdk_bdev_io_completion_cb cb, void *cb_arg) 5528 { 5529 uint64_t offset_blocks, num_blocks; 5530 5531 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5532 nbytes, &num_blocks) != 0) { 5533 return -EINVAL; 5534 } 5535 5536 return spdk_bdev_write_blocks(desc, ch, buf, 
offset_blocks, num_blocks, cb, cb_arg); 5537 } 5538 5539 int 5540 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5541 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5542 spdk_bdev_io_completion_cb cb, void *cb_arg) 5543 { 5544 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5545 cb, cb_arg); 5546 } 5547 5548 int 5549 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5550 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5551 spdk_bdev_io_completion_cb cb, void *cb_arg) 5552 { 5553 struct iovec iov = { 5554 .iov_base = buf, 5555 }; 5556 5557 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5558 return -EINVAL; 5559 } 5560 5561 if (md_buf && !_is_buf_allocated(&iov)) { 5562 return -EINVAL; 5563 } 5564 5565 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5566 cb, cb_arg); 5567 } 5568 5569 static int 5570 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5571 struct iovec *iov, int iovcnt, void *md_buf, 5572 uint64_t offset_blocks, uint64_t num_blocks, 5573 struct spdk_memory_domain *domain, void *domain_ctx, 5574 struct spdk_accel_sequence *seq, uint32_t dif_check_flags, 5575 uint32_t nvme_cdw12_raw, uint32_t nvme_cdw13_raw, 5576 spdk_bdev_io_completion_cb cb, void *cb_arg) 5577 { 5578 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5579 struct spdk_bdev_io *bdev_io; 5580 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5581 5582 if (spdk_unlikely(!desc->write)) { 5583 return -EBADF; 5584 } 5585 5586 if (spdk_unlikely(!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks))) { 5587 return -EINVAL; 5588 } 5589 5590 bdev_io = bdev_channel_get_io(channel); 5591 if (spdk_unlikely(!bdev_io)) { 5592 return -ENOMEM; 5593 } 5594 5595 bdev_io->internal.ch = channel; 5596 bdev_io->internal.desc = desc; 5597 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5598 bdev_io->u.bdev.iovs = iov; 5599 bdev_io->u.bdev.iovcnt = iovcnt; 5600 bdev_io->u.bdev.md_buf = md_buf; 5601 bdev_io->u.bdev.num_blocks = num_blocks; 5602 bdev_io->u.bdev.offset_blocks = offset_blocks; 5603 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5604 bdev_io->internal.memory_domain = domain; 5605 bdev_io->internal.memory_domain_ctx = domain_ctx; 5606 bdev_io->internal.accel_sequence = seq; 5607 bdev_io->internal.has_accel_sequence = seq != NULL; 5608 bdev_io->u.bdev.memory_domain = domain; 5609 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5610 bdev_io->u.bdev.accel_sequence = seq; 5611 bdev_io->u.bdev.dif_check_flags = dif_check_flags; 5612 bdev_io->u.bdev.nvme_cdw12.raw = nvme_cdw12_raw; 5613 bdev_io->u.bdev.nvme_cdw13.raw = nvme_cdw13_raw; 5614 5615 _bdev_io_submit_ext(desc, bdev_io); 5616 5617 return 0; 5618 } 5619 5620 int 5621 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5622 struct iovec *iov, int iovcnt, 5623 uint64_t offset, uint64_t len, 5624 spdk_bdev_io_completion_cb cb, void *cb_arg) 5625 { 5626 uint64_t offset_blocks, num_blocks; 5627 5628 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5629 len, &num_blocks) != 0) { 5630 return -EINVAL; 5631 } 5632 5633 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5634 } 5635 5636 int 5637 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5638 struct iovec *iov, int iovcnt, 5639 uint64_t offset_blocks, uint64_t num_blocks, 
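/*
 * A short usage sketch for the single-buffer write path, assuming the
 * descriptor was opened for writing; the my_* names and the 512-byte block
 * size are illustrative. The buffer comes from spdk_dma_zmalloc() so it is
 * safe for device DMA.
 *
 *	void *buf = spdk_dma_zmalloc(4 * 512, 0x1000, NULL);
 *	int rc;
 *
 *	if (buf == NULL) {
 *		return -ENOMEM;
 *	}
 *	rc = spdk_bdev_write_blocks(my_desc, my_ch, buf, 0, 4, my_write_done, my_cb_arg);
 *	// -EBADF: descriptor not writable, -EINVAL: offset/num_blocks out of range,
 *	// -ENOMEM: no spdk_bdev_io available right now
 */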
5640 spdk_bdev_io_completion_cb cb, void *cb_arg) 5641 { 5642 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5643 5644 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5645 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 5646 cb, cb_arg); 5647 } 5648 5649 int 5650 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5651 struct iovec *iov, int iovcnt, void *md_buf, 5652 uint64_t offset_blocks, uint64_t num_blocks, 5653 spdk_bdev_io_completion_cb cb, void *cb_arg) 5654 { 5655 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5656 5657 if (md_buf && !spdk_bdev_is_md_separate(bdev)) { 5658 return -EINVAL; 5659 } 5660 5661 if (md_buf && !_is_buf_allocated(iov)) { 5662 return -EINVAL; 5663 } 5664 5665 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5666 num_blocks, NULL, NULL, NULL, bdev->dif_check_flags, 0, 0, 5667 cb, cb_arg); 5668 } 5669 5670 int 5671 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5672 struct iovec *iov, int iovcnt, 5673 uint64_t offset_blocks, uint64_t num_blocks, 5674 spdk_bdev_io_completion_cb cb, void *cb_arg, 5675 struct spdk_bdev_ext_io_opts *opts) 5676 { 5677 struct spdk_memory_domain *domain = NULL; 5678 struct spdk_accel_sequence *seq = NULL; 5679 void *domain_ctx = NULL, *md = NULL; 5680 uint32_t dif_check_flags = 0; 5681 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5682 uint32_t nvme_cdw12_raw = 0; 5683 uint32_t nvme_cdw13_raw = 0; 5684 5685 if (opts) { 5686 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5687 return -EINVAL; 5688 } 5689 md = opts->metadata; 5690 domain = bdev_get_ext_io_opt(opts, memory_domain, NULL); 5691 domain_ctx = bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL); 5692 seq = bdev_get_ext_io_opt(opts, accel_sequence, NULL); 5693 nvme_cdw12_raw = bdev_get_ext_io_opt(opts, nvme_cdw12.raw, 0); 5694 nvme_cdw13_raw = bdev_get_ext_io_opt(opts, nvme_cdw13.raw, 0); 5695 if (md) { 5696 if (spdk_unlikely(!spdk_bdev_is_md_separate(bdev))) { 5697 return -EINVAL; 5698 } 5699 5700 if (spdk_unlikely(!_is_buf_allocated(iov))) { 5701 return -EINVAL; 5702 } 5703 5704 if (spdk_unlikely(seq != NULL)) { 5705 return -EINVAL; 5706 } 5707 } 5708 } 5709 5710 dif_check_flags = bdev->dif_check_flags & 5711 ~(bdev_get_ext_io_opt(opts, dif_check_flags_exclude_mask, 0)); 5712 5713 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5714 domain, domain_ctx, seq, dif_check_flags, 5715 nvme_cdw12_raw, nvme_cdw13_raw, cb, cb_arg); 5716 } 5717 5718 static void 5719 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5720 { 5721 struct spdk_bdev_io *parent_io = cb_arg; 5722 struct spdk_bdev *bdev = parent_io->bdev; 5723 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5724 int i, rc = 0; 5725 5726 if (!success) { 5727 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5728 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5729 spdk_bdev_free_io(bdev_io); 5730 return; 5731 } 5732 5733 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5734 rc = memcmp(read_buf, 5735 parent_io->u.bdev.iovs[i].iov_base, 5736 parent_io->u.bdev.iovs[i].iov_len); 5737 if (rc) { 5738 break; 5739 } 5740 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5741 } 5742 5743 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 5744 rc = memcmp(bdev_io->u.bdev.md_buf, 5745 parent_io->u.bdev.md_buf, 5746 
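/*
 * Sketch of the extended write path defined above. Compared with the plain
 * vectored write it can additionally carry NVMe CDW12/CDW13 values and a DIF
 * exclusion mask through spdk_bdev_ext_io_opts. All my_* names are
 * illustrative.
 *
 *	struct spdk_bdev_ext_io_opts opts = {};
 *	int rc;
 *
 *	opts.size = sizeof(opts);
 *	opts.nvme_cdw13.raw = my_cdw13;			// forwarded as nvme_cdw13_raw
 *	opts.dif_check_flags_exclude_mask = 0;		// keep all of bdev->dif_check_flags
 *	rc = spdk_bdev_writev_blocks_ext(my_desc, my_ch, my_iovs, my_iovcnt,
 *					 my_offset_blocks, my_num_blocks,
 *					 my_write_done, my_cb_arg, &opts);
 */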
spdk_bdev_get_md_size(bdev)); 5747 } 5748 5749 spdk_bdev_free_io(bdev_io); 5750 5751 if (rc == 0) { 5752 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5753 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5754 } else { 5755 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5756 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5757 } 5758 } 5759 5760 static void 5761 bdev_compare_do_read(void *_bdev_io) 5762 { 5763 struct spdk_bdev_io *bdev_io = _bdev_io; 5764 int rc; 5765 5766 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5767 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5768 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5769 bdev_compare_do_read_done, bdev_io); 5770 5771 if (rc == -ENOMEM) { 5772 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5773 } else if (rc != 0) { 5774 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5775 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5776 } 5777 } 5778 5779 static int 5780 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5781 struct iovec *iov, int iovcnt, void *md_buf, 5782 uint64_t offset_blocks, uint64_t num_blocks, 5783 spdk_bdev_io_completion_cb cb, void *cb_arg) 5784 { 5785 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5786 struct spdk_bdev_io *bdev_io; 5787 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5788 5789 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5790 return -EINVAL; 5791 } 5792 5793 bdev_io = bdev_channel_get_io(channel); 5794 if (!bdev_io) { 5795 return -ENOMEM; 5796 } 5797 5798 bdev_io->internal.ch = channel; 5799 bdev_io->internal.desc = desc; 5800 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5801 bdev_io->u.bdev.iovs = iov; 5802 bdev_io->u.bdev.iovcnt = iovcnt; 5803 bdev_io->u.bdev.md_buf = md_buf; 5804 bdev_io->u.bdev.num_blocks = num_blocks; 5805 bdev_io->u.bdev.offset_blocks = offset_blocks; 5806 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5807 bdev_io->u.bdev.memory_domain = NULL; 5808 bdev_io->u.bdev.memory_domain_ctx = NULL; 5809 bdev_io->u.bdev.accel_sequence = NULL; 5810 5811 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5812 bdev_io_submit(bdev_io); 5813 return 0; 5814 } 5815 5816 bdev_compare_do_read(bdev_io); 5817 5818 return 0; 5819 } 5820 5821 int 5822 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5823 struct iovec *iov, int iovcnt, 5824 uint64_t offset_blocks, uint64_t num_blocks, 5825 spdk_bdev_io_completion_cb cb, void *cb_arg) 5826 { 5827 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5828 num_blocks, cb, cb_arg); 5829 } 5830 5831 int 5832 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5833 struct iovec *iov, int iovcnt, void *md_buf, 5834 uint64_t offset_blocks, uint64_t num_blocks, 5835 spdk_bdev_io_completion_cb cb, void *cb_arg) 5836 { 5837 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5838 return -EINVAL; 5839 } 5840 5841 if (md_buf && !_is_buf_allocated(iov)) { 5842 return -EINVAL; 5843 } 5844 5845 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5846 num_blocks, cb, cb_arg); 5847 } 5848 5849 static int 5850 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5851 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5852 spdk_bdev_io_completion_cb cb, void 
*cb_arg) 5853 { 5854 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5855 struct spdk_bdev_io *bdev_io; 5856 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5857 5858 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5859 return -EINVAL; 5860 } 5861 5862 bdev_io = bdev_channel_get_io(channel); 5863 if (!bdev_io) { 5864 return -ENOMEM; 5865 } 5866 5867 bdev_io->internal.ch = channel; 5868 bdev_io->internal.desc = desc; 5869 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5870 bdev_io->u.bdev.iovs = &bdev_io->iov; 5871 bdev_io->u.bdev.iovs[0].iov_base = buf; 5872 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5873 bdev_io->u.bdev.iovcnt = 1; 5874 bdev_io->u.bdev.md_buf = md_buf; 5875 bdev_io->u.bdev.num_blocks = num_blocks; 5876 bdev_io->u.bdev.offset_blocks = offset_blocks; 5877 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5878 bdev_io->u.bdev.memory_domain = NULL; 5879 bdev_io->u.bdev.memory_domain_ctx = NULL; 5880 bdev_io->u.bdev.accel_sequence = NULL; 5881 5882 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5883 bdev_io_submit(bdev_io); 5884 return 0; 5885 } 5886 5887 bdev_compare_do_read(bdev_io); 5888 5889 return 0; 5890 } 5891 5892 int 5893 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5894 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5895 spdk_bdev_io_completion_cb cb, void *cb_arg) 5896 { 5897 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5898 cb, cb_arg); 5899 } 5900 5901 int 5902 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5903 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5904 spdk_bdev_io_completion_cb cb, void *cb_arg) 5905 { 5906 struct iovec iov = { 5907 .iov_base = buf, 5908 }; 5909 5910 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5911 return -EINVAL; 5912 } 5913 5914 if (md_buf && !_is_buf_allocated(&iov)) { 5915 return -EINVAL; 5916 } 5917 5918 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5919 cb, cb_arg); 5920 } 5921 5922 static void 5923 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status) 5924 { 5925 struct spdk_bdev_io *bdev_io = ctx; 5926 5927 if (unlock_status) { 5928 SPDK_ERRLOG("LBA range unlock failed\n"); 5929 } 5930 5931 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 5932 false, bdev_io->internal.caller_ctx); 5933 } 5934 5935 static void 5936 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 5937 { 5938 bdev_io->internal.status = status; 5939 5940 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 5941 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5942 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 5943 } 5944 5945 static void 5946 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5947 { 5948 struct spdk_bdev_io *parent_io = cb_arg; 5949 5950 if (!success) { 5951 SPDK_ERRLOG("Compare and write operation failed\n"); 5952 } 5953 5954 spdk_bdev_free_io(bdev_io); 5955 5956 bdev_comparev_and_writev_blocks_unlock(parent_io, 5957 success ? 
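/*
 * Compare usage sketch. When the backing module does not support COMPARE
 * natively, the helpers above emulate it with a read followed by memcmp();
 * either way a mismatch reaches the caller as an unsuccessful completion with
 * SPDK_BDEV_IO_STATUS_MISCOMPARE. my_* names are illustrative.
 *
 *	static void
 *	my_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		// success == false covers both I/O errors and miscompares
 *		spdk_bdev_free_io(bdev_io);
 *	}
 *
 *	rc = spdk_bdev_compare_blocks(my_desc, my_ch, my_buf, my_offset_blocks,
 *				      my_num_blocks, my_compare_done, my_cb_arg);
 */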
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 5958 } 5959 5960 static void 5961 bdev_compare_and_write_do_write(void *_bdev_io) 5962 { 5963 struct spdk_bdev_io *bdev_io = _bdev_io; 5964 int rc; 5965 5966 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 5967 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5968 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 5969 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5970 bdev_compare_and_write_do_write_done, bdev_io); 5971 5972 5973 if (rc == -ENOMEM) { 5974 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 5975 } else if (rc != 0) { 5976 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5977 } 5978 } 5979 5980 static void 5981 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5982 { 5983 struct spdk_bdev_io *parent_io = cb_arg; 5984 5985 spdk_bdev_free_io(bdev_io); 5986 5987 if (!success) { 5988 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 5989 return; 5990 } 5991 5992 bdev_compare_and_write_do_write(parent_io); 5993 } 5994 5995 static void 5996 bdev_compare_and_write_do_compare(void *_bdev_io) 5997 { 5998 struct spdk_bdev_io *bdev_io = _bdev_io; 5999 int rc; 6000 6001 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 6002 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 6003 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 6004 bdev_compare_and_write_do_compare_done, bdev_io); 6005 6006 if (rc == -ENOMEM) { 6007 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 6008 } else if (rc != 0) { 6009 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 6010 } 6011 } 6012 6013 static void 6014 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status) 6015 { 6016 struct spdk_bdev_io *bdev_io = ctx; 6017 6018 if (status) { 6019 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 6020 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6021 return; 6022 } 6023 6024 bdev_compare_and_write_do_compare(bdev_io); 6025 } 6026 6027 int 6028 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6029 struct iovec *compare_iov, int compare_iovcnt, 6030 struct iovec *write_iov, int write_iovcnt, 6031 uint64_t offset_blocks, uint64_t num_blocks, 6032 spdk_bdev_io_completion_cb cb, void *cb_arg) 6033 { 6034 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6035 struct spdk_bdev_io *bdev_io; 6036 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6037 6038 if (!desc->write) { 6039 return -EBADF; 6040 } 6041 6042 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6043 return -EINVAL; 6044 } 6045 6046 if (num_blocks > bdev->acwu) { 6047 return -EINVAL; 6048 } 6049 6050 bdev_io = bdev_channel_get_io(channel); 6051 if (!bdev_io) { 6052 return -ENOMEM; 6053 } 6054 6055 bdev_io->internal.ch = channel; 6056 bdev_io->internal.desc = desc; 6057 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 6058 bdev_io->u.bdev.iovs = compare_iov; 6059 bdev_io->u.bdev.iovcnt = compare_iovcnt; 6060 bdev_io->u.bdev.fused_iovs = write_iov; 6061 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 6062 bdev_io->u.bdev.md_buf = NULL; 6063 bdev_io->u.bdev.num_blocks = num_blocks; 6064 bdev_io->u.bdev.offset_blocks = offset_blocks; 6065 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6066 
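/*
 * Caller sketch for the fused compare-and-write defined here. num_blocks must
 * not exceed the bdev's atomic compare-and-write unit (the bdev->acwu check
 * above) and the descriptor must be writable. my_* names and buffers are
 * illustrative.
 *
 *	struct iovec cmp_iov = { .iov_base = my_expected_buf, .iov_len = my_blocklen };
 *	struct iovec write_iov = { .iov_base = my_new_buf, .iov_len = my_blocklen };
 *	int rc;
 *
 *	rc = spdk_bdev_comparev_and_writev_blocks(my_desc, my_ch, &cmp_iov, 1,
 *						  &write_iov, 1, my_offset_blocks, 1,
 *						  my_caw_done, my_cb_arg);
 *	// a mismatch completes with SPDK_BDEV_IO_STATUS_MISCOMPARE
 */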
bdev_io->u.bdev.memory_domain = NULL; 6067 bdev_io->u.bdev.memory_domain_ctx = NULL; 6068 bdev_io->u.bdev.accel_sequence = NULL; 6069 6070 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 6071 bdev_io_submit(bdev_io); 6072 return 0; 6073 } 6074 6075 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 6076 bdev_comparev_and_writev_blocks_locked, bdev_io); 6077 } 6078 6079 int 6080 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6081 struct iovec *iov, int iovcnt, 6082 uint64_t offset_blocks, uint64_t num_blocks, 6083 bool populate, 6084 spdk_bdev_io_completion_cb cb, void *cb_arg) 6085 { 6086 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6087 struct spdk_bdev_io *bdev_io; 6088 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6089 6090 if (!desc->write) { 6091 return -EBADF; 6092 } 6093 6094 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6095 return -EINVAL; 6096 } 6097 6098 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 6099 return -ENOTSUP; 6100 } 6101 6102 bdev_io = bdev_channel_get_io(channel); 6103 if (!bdev_io) { 6104 return -ENOMEM; 6105 } 6106 6107 bdev_io->internal.ch = channel; 6108 bdev_io->internal.desc = desc; 6109 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 6110 bdev_io->u.bdev.num_blocks = num_blocks; 6111 bdev_io->u.bdev.offset_blocks = offset_blocks; 6112 bdev_io->u.bdev.iovs = iov; 6113 bdev_io->u.bdev.iovcnt = iovcnt; 6114 bdev_io->u.bdev.md_buf = NULL; 6115 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 6116 bdev_io->u.bdev.zcopy.commit = 0; 6117 bdev_io->u.bdev.zcopy.start = 1; 6118 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6119 bdev_io->u.bdev.memory_domain = NULL; 6120 bdev_io->u.bdev.memory_domain_ctx = NULL; 6121 bdev_io->u.bdev.accel_sequence = NULL; 6122 6123 bdev_io_submit(bdev_io); 6124 6125 return 0; 6126 } 6127 6128 int 6129 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 6130 spdk_bdev_io_completion_cb cb, void *cb_arg) 6131 { 6132 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 6133 return -EINVAL; 6134 } 6135 6136 bdev_io->u.bdev.zcopy.commit = commit ? 
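/*
 * Zero-copy sketch: start a populated zcopy to borrow buffers that describe
 * the requested blocks, then commit (or abandon) them with
 * spdk_bdev_zcopy_end(). The bdev must support SPDK_BDEV_IO_TYPE_ZCOPY or
 * -ENOTSUP is returned. my_* names are illustrative.
 *
 *	rc = spdk_bdev_zcopy_start(my_desc, my_ch, my_iovs, my_iovcnt,
 *				   my_offset_blocks, my_num_blocks, true,
 *				   my_zcopy_start_done, my_cb_arg);
 *
 *	static void
 *	my_zcopy_start_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		// on success, bdev_io->u.bdev.iovs describes the buffers to use;
 *		// finish with commit = true once they have been consumed or filled
 *		spdk_bdev_zcopy_end(bdev_io, true, my_zcopy_end_done, cb_arg);
 *	}
 */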
1 : 0;
6137 bdev_io->u.bdev.zcopy.start = 0;
6138 bdev_io->internal.caller_ctx = cb_arg;
6139 bdev_io->internal.cb = cb;
6140 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
6141
6142 bdev_io_submit(bdev_io);
6143
6144 return 0;
6145 }
6146
6147 int
6148 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6149 uint64_t offset, uint64_t len,
6150 spdk_bdev_io_completion_cb cb, void *cb_arg)
6151 {
6152 uint64_t offset_blocks, num_blocks;
6153
6154 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks,
6155 len, &num_blocks) != 0) {
6156 return -EINVAL;
6157 }
6158
6159 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg);
6160 }
6161
6162 int
6163 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
6164 uint64_t offset_blocks, uint64_t num_blocks,
6165 spdk_bdev_io_completion_cb cb, void *cb_arg)
6166 {
6167 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
6168 struct spdk_bdev_io *bdev_io;
6169 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
6170
6171 if (!desc->write) {
6172 return -EBADF;
6173 }
6174
6175 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
6176 return -EINVAL;
6177 }
6178
6179 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) &&
6180 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) {
6181 return -ENOTSUP;
6182 }
6183
6184 bdev_io = bdev_channel_get_io(channel);
6185
6186 if (!bdev_io) {
6187 return -ENOMEM;
6188 }
6189
6190 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES;
6191 bdev_io->internal.ch = channel;
6192 bdev_io->internal.desc = desc;
6193 bdev_io->u.bdev.offset_blocks = offset_blocks;
6194 bdev_io->u.bdev.num_blocks = num_blocks;
6195 bdev_io_init(bdev_io, bdev, cb_arg, cb);
6196 bdev_io->u.bdev.memory_domain = NULL;
6197 bdev_io->u.bdev.memory_domain_ctx = NULL;
6198 bdev_io->u.bdev.accel_sequence = NULL;
6199
6200 /* If the write_zeroes size is large and should be split, use the generic split
6201 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported or not.
6202 *
6203 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported
6204 * or emulate it using a regular write request otherwise.
6205 */ 6206 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 6207 bdev_io->internal.split) { 6208 bdev_io_submit(bdev_io); 6209 return 0; 6210 } 6211 6212 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6213 6214 return bdev_write_zero_buffer(bdev_io); 6215 } 6216 6217 int 6218 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6219 uint64_t offset, uint64_t nbytes, 6220 spdk_bdev_io_completion_cb cb, void *cb_arg) 6221 { 6222 uint64_t offset_blocks, num_blocks; 6223 6224 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6225 nbytes, &num_blocks) != 0) { 6226 return -EINVAL; 6227 } 6228 6229 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6230 } 6231 6232 static void 6233 bdev_io_complete_cb(void *ctx) 6234 { 6235 struct spdk_bdev_io *bdev_io = ctx; 6236 6237 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6238 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 6239 } 6240 6241 int 6242 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6243 uint64_t offset_blocks, uint64_t num_blocks, 6244 spdk_bdev_io_completion_cb cb, void *cb_arg) 6245 { 6246 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6247 struct spdk_bdev_io *bdev_io; 6248 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6249 6250 if (!desc->write) { 6251 return -EBADF; 6252 } 6253 6254 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6255 return -EINVAL; 6256 } 6257 6258 bdev_io = bdev_channel_get_io(channel); 6259 if (!bdev_io) { 6260 return -ENOMEM; 6261 } 6262 6263 bdev_io->internal.ch = channel; 6264 bdev_io->internal.desc = desc; 6265 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6266 6267 bdev_io->u.bdev.iovs = &bdev_io->iov; 6268 bdev_io->u.bdev.iovs[0].iov_base = NULL; 6269 bdev_io->u.bdev.iovs[0].iov_len = 0; 6270 bdev_io->u.bdev.iovcnt = 1; 6271 6272 bdev_io->u.bdev.offset_blocks = offset_blocks; 6273 bdev_io->u.bdev.num_blocks = num_blocks; 6274 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6275 bdev_io->u.bdev.memory_domain = NULL; 6276 bdev_io->u.bdev.memory_domain_ctx = NULL; 6277 bdev_io->u.bdev.accel_sequence = NULL; 6278 6279 if (num_blocks == 0) { 6280 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 6281 return 0; 6282 } 6283 6284 bdev_io_submit(bdev_io); 6285 return 0; 6286 } 6287 6288 int 6289 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6290 uint64_t offset, uint64_t length, 6291 spdk_bdev_io_completion_cb cb, void *cb_arg) 6292 { 6293 uint64_t offset_blocks, num_blocks; 6294 6295 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6296 length, &num_blocks) != 0) { 6297 return -EINVAL; 6298 } 6299 6300 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6301 } 6302 6303 int 6304 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6305 uint64_t offset_blocks, uint64_t num_blocks, 6306 spdk_bdev_io_completion_cb cb, void *cb_arg) 6307 { 6308 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6309 struct spdk_bdev_io *bdev_io; 6310 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6311 6312 if (!desc->write) { 6313 return -EBADF; 6314 } 6315 6316 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6317 return -EINVAL; 6318 } 6319 6320 bdev_io = bdev_channel_get_io(channel); 6321 if (!bdev_io) { 6322 return -ENOMEM; 6323 } 6324 6325 bdev_io->internal.ch = channel; 
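/*
 * Sketch combining the block-based unmap and flush calls above (my_* names are
 * illustrative). Note that an unmap of zero blocks is completed successfully
 * via a deferred message instead of being submitted to the module.
 *
 *	rc = spdk_bdev_unmap_blocks(my_desc, my_ch, my_offset_blocks, my_num_blocks,
 *				    my_unmap_done, my_cb_arg);
 *
 *	rc = spdk_bdev_flush_blocks(my_desc, my_ch, 0,
 *				    spdk_bdev_get_num_blocks(my_bdev),
 *				    my_flush_done, my_cb_arg);
 */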
6326 bdev_io->internal.desc = desc;
6327 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH;
6328 bdev_io->u.bdev.iovs = NULL;
6329 bdev_io->u.bdev.iovcnt = 0;
6330 bdev_io->u.bdev.offset_blocks = offset_blocks;
6331 bdev_io->u.bdev.num_blocks = num_blocks;
6332 bdev_io->u.bdev.memory_domain = NULL;
6333 bdev_io->u.bdev.memory_domain_ctx = NULL;
6334 bdev_io->u.bdev.accel_sequence = NULL;
6335 bdev_io_init(bdev_io, bdev, cb_arg, cb);
6336
6337 bdev_io_submit(bdev_io);
6338 return 0;
6339 }
6340
6341 static int bdev_reset_poll_for_outstanding_io(void *ctx);
6342
6343 static void
6344 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
6345 {
6346 struct spdk_bdev_channel *ch = _ctx;
6347 struct spdk_bdev_io *bdev_io;
6348
6349 bdev_io = TAILQ_FIRST(&ch->queued_resets);
6350
6351 if (status == -EBUSY) {
6352 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) {
6353 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io,
6354 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD);
6355 } else {
6356 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
6357
6358 if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) {
6359 /* If outstanding IOs are still present and reset_io_drain_timeout
6360 * seconds have passed, start the reset. */
6361 bdev_io_submit_reset(bdev_io);
6362 } else {
6363 /* We still have an in-progress memory domain pull/push or we're
6364 * executing an accel sequence. Since we cannot abort either of those
6365 * operations, fail the reset request. */
6366 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
6367 }
6368 }
6369 } else {
6370 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
6371 SPDK_DEBUGLOG(bdev,
6372 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n",
6373 ch->bdev->name);
6374 /* Mark the completion status as a SUCCESS and complete the reset. */
6375 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
6376 }
6377 }
6378
6379 static void
6380 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
6381 struct spdk_io_channel *io_ch, void *_ctx)
6382 {
6383 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch);
6384 int status = 0;
6385
6386 if (cur_ch->io_outstanding > 0 ||
6387 !TAILQ_EMPTY(&cur_ch->io_memory_domain) ||
6388 !TAILQ_EMPTY(&cur_ch->io_accel_exec)) {
6389 /* If a channel has outstanding IO, set status to -EBUSY. This will stop
6390 * further iteration over the rest of the channels and pass non-zero status
6391 * to the callback function.
*/ 6392 status = -EBUSY; 6393 } 6394 spdk_bdev_for_each_channel_continue(i, status); 6395 } 6396 6397 static int 6398 bdev_reset_poll_for_outstanding_io(void *ctx) 6399 { 6400 struct spdk_bdev_channel *ch = ctx; 6401 struct spdk_bdev_io *bdev_io; 6402 6403 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6404 6405 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6406 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6407 bdev_reset_check_outstanding_io_done); 6408 6409 return SPDK_POLLER_BUSY; 6410 } 6411 6412 static void 6413 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6414 { 6415 struct spdk_bdev_channel *ch = _ctx; 6416 struct spdk_bdev_io *bdev_io; 6417 6418 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6419 6420 if (bdev->reset_io_drain_timeout == 0) { 6421 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6422 6423 bdev_io_submit_reset(bdev_io); 6424 return; 6425 } 6426 6427 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6428 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6429 6430 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6431 * submit the reset to the underlying module only if outstanding I/O 6432 * remain after reset_io_drain_timeout seconds have passed. */ 6433 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6434 bdev_reset_check_outstanding_io_done); 6435 } 6436 6437 static void 6438 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6439 struct spdk_io_channel *ch, void *_ctx) 6440 { 6441 struct spdk_bdev_channel *channel; 6442 struct spdk_bdev_mgmt_channel *mgmt_channel; 6443 struct spdk_bdev_shared_resource *shared_resource; 6444 bdev_io_tailq_t tmp_queued; 6445 6446 TAILQ_INIT(&tmp_queued); 6447 6448 channel = __io_ch_to_bdev_ch(ch); 6449 shared_resource = channel->shared_resource; 6450 mgmt_channel = shared_resource->mgmt_ch; 6451 6452 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6453 6454 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6455 TAILQ_SWAP(&channel->qos_queued_io, &tmp_queued, spdk_bdev_io, internal.link); 6456 } 6457 6458 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6459 bdev_abort_all_buf_io(mgmt_channel, channel); 6460 bdev_abort_all_queued_io(&tmp_queued, channel); 6461 6462 spdk_bdev_for_each_channel_continue(i, 0); 6463 } 6464 6465 static void 6466 bdev_start_reset(void *ctx) 6467 { 6468 struct spdk_bdev_channel *ch = ctx; 6469 6470 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, 6471 bdev_reset_freeze_channel_done); 6472 } 6473 6474 static void 6475 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 6476 { 6477 struct spdk_bdev *bdev = ch->bdev; 6478 6479 assert(!TAILQ_EMPTY(&ch->queued_resets)); 6480 6481 spdk_spin_lock(&bdev->internal.spinlock); 6482 if (bdev->internal.reset_in_progress == NULL) { 6483 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 6484 /* 6485 * Take a channel reference for the target bdev for the life of this 6486 * reset. This guards against the channel getting destroyed while 6487 * spdk_bdev_for_each_channel() calls related to this reset IO are in 6488 * progress. We will release the reference when this reset is 6489 * completed. 
6490 */ 6491 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6492 bdev_start_reset(ch); 6493 } 6494 spdk_spin_unlock(&bdev->internal.spinlock); 6495 } 6496 6497 int 6498 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6499 spdk_bdev_io_completion_cb cb, void *cb_arg) 6500 { 6501 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6502 struct spdk_bdev_io *bdev_io; 6503 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6504 6505 bdev_io = bdev_channel_get_io(channel); 6506 if (!bdev_io) { 6507 return -ENOMEM; 6508 } 6509 6510 bdev_io->internal.ch = channel; 6511 bdev_io->internal.desc = desc; 6512 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6513 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6514 bdev_io->u.reset.ch_ref = NULL; 6515 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6516 6517 spdk_spin_lock(&bdev->internal.spinlock); 6518 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 6519 spdk_spin_unlock(&bdev->internal.spinlock); 6520 6521 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 6522 internal.ch_link); 6523 6524 bdev_channel_start_reset(channel); 6525 6526 return 0; 6527 } 6528 6529 void 6530 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6531 struct spdk_bdev_io_stat *stat) 6532 { 6533 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6534 6535 bdev_get_io_stat(stat, channel->stat); 6536 } 6537 6538 static void 6539 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6540 { 6541 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6542 6543 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6544 bdev_iostat_ctx->cb_arg, 0); 6545 free(bdev_iostat_ctx); 6546 } 6547 6548 static void 6549 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6550 struct spdk_io_channel *ch, void *_ctx) 6551 { 6552 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6553 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6554 6555 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6556 spdk_bdev_for_each_channel_continue(i, 0); 6557 } 6558 6559 void 6560 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6561 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6562 { 6563 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6564 6565 assert(bdev != NULL); 6566 assert(stat != NULL); 6567 assert(cb != NULL); 6568 6569 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6570 if (bdev_iostat_ctx == NULL) { 6571 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6572 cb(bdev, stat, cb_arg, -ENOMEM); 6573 return; 6574 } 6575 6576 bdev_iostat_ctx->stat = stat; 6577 bdev_iostat_ctx->cb = cb; 6578 bdev_iostat_ctx->cb_arg = cb_arg; 6579 6580 /* Start with the statistics from previously deleted channels. */ 6581 spdk_spin_lock(&bdev->internal.spinlock); 6582 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6583 spdk_spin_unlock(&bdev->internal.spinlock); 6584 6585 /* Then iterate and add the statistics from each existing channel. 
*/ 6586 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6587 bdev_get_device_stat_done); 6588 } 6589 6590 struct bdev_iostat_reset_ctx { 6591 enum spdk_bdev_reset_stat_mode mode; 6592 bdev_reset_device_stat_cb cb; 6593 void *cb_arg; 6594 }; 6595 6596 static void 6597 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6598 { 6599 struct bdev_iostat_reset_ctx *ctx = _ctx; 6600 6601 ctx->cb(bdev, ctx->cb_arg, 0); 6602 6603 free(ctx); 6604 } 6605 6606 static void 6607 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6608 struct spdk_io_channel *ch, void *_ctx) 6609 { 6610 struct bdev_iostat_reset_ctx *ctx = _ctx; 6611 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6612 6613 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6614 6615 spdk_bdev_for_each_channel_continue(i, 0); 6616 } 6617 6618 void 6619 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6620 bdev_reset_device_stat_cb cb, void *cb_arg) 6621 { 6622 struct bdev_iostat_reset_ctx *ctx; 6623 6624 assert(bdev != NULL); 6625 assert(cb != NULL); 6626 6627 ctx = calloc(1, sizeof(*ctx)); 6628 if (ctx == NULL) { 6629 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6630 cb(bdev, cb_arg, -ENOMEM); 6631 return; 6632 } 6633 6634 ctx->mode = mode; 6635 ctx->cb = cb; 6636 ctx->cb_arg = cb_arg; 6637 6638 spdk_spin_lock(&bdev->internal.spinlock); 6639 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6640 spdk_spin_unlock(&bdev->internal.spinlock); 6641 6642 spdk_bdev_for_each_channel(bdev, 6643 bdev_reset_each_channel_stat, 6644 ctx, 6645 bdev_reset_device_stat_done); 6646 } 6647 6648 int 6649 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6650 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6651 spdk_bdev_io_completion_cb cb, void *cb_arg) 6652 { 6653 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6654 struct spdk_bdev_io *bdev_io; 6655 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6656 6657 if (!desc->write) { 6658 return -EBADF; 6659 } 6660 6661 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6662 return -ENOTSUP; 6663 } 6664 6665 bdev_io = bdev_channel_get_io(channel); 6666 if (!bdev_io) { 6667 return -ENOMEM; 6668 } 6669 6670 bdev_io->internal.ch = channel; 6671 bdev_io->internal.desc = desc; 6672 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6673 bdev_io->u.nvme_passthru.cmd = *cmd; 6674 bdev_io->u.nvme_passthru.buf = buf; 6675 bdev_io->u.nvme_passthru.nbytes = nbytes; 6676 bdev_io->u.nvme_passthru.md_buf = NULL; 6677 bdev_io->u.nvme_passthru.md_len = 0; 6678 6679 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6680 6681 bdev_io_submit(bdev_io); 6682 return 0; 6683 } 6684 6685 int 6686 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6687 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6688 spdk_bdev_io_completion_cb cb, void *cb_arg) 6689 { 6690 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6691 struct spdk_bdev_io *bdev_io; 6692 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6693 6694 if (!desc->write) { 6695 /* 6696 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6697 * to easily determine if the command is a read or write, but for now just 6698 * do not allow io_passthru with a read-only descriptor. 
6699 */ 6700 return -EBADF; 6701 } 6702 6703 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6704 return -ENOTSUP; 6705 } 6706 6707 bdev_io = bdev_channel_get_io(channel); 6708 if (!bdev_io) { 6709 return -ENOMEM; 6710 } 6711 6712 bdev_io->internal.ch = channel; 6713 bdev_io->internal.desc = desc; 6714 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6715 bdev_io->u.nvme_passthru.cmd = *cmd; 6716 bdev_io->u.nvme_passthru.buf = buf; 6717 bdev_io->u.nvme_passthru.nbytes = nbytes; 6718 bdev_io->u.nvme_passthru.md_buf = NULL; 6719 bdev_io->u.nvme_passthru.md_len = 0; 6720 6721 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6722 6723 bdev_io_submit(bdev_io); 6724 return 0; 6725 } 6726 6727 int 6728 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6729 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6730 spdk_bdev_io_completion_cb cb, void *cb_arg) 6731 { 6732 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6733 struct spdk_bdev_io *bdev_io; 6734 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6735 6736 if (!desc->write) { 6737 /* 6738 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6739 * to easily determine if the command is a read or write, but for now just 6740 * do not allow io_passthru with a read-only descriptor. 6741 */ 6742 return -EBADF; 6743 } 6744 6745 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6746 return -ENOTSUP; 6747 } 6748 6749 bdev_io = bdev_channel_get_io(channel); 6750 if (!bdev_io) { 6751 return -ENOMEM; 6752 } 6753 6754 bdev_io->internal.ch = channel; 6755 bdev_io->internal.desc = desc; 6756 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6757 bdev_io->u.nvme_passthru.cmd = *cmd; 6758 bdev_io->u.nvme_passthru.buf = buf; 6759 bdev_io->u.nvme_passthru.nbytes = nbytes; 6760 bdev_io->u.nvme_passthru.md_buf = md_buf; 6761 bdev_io->u.nvme_passthru.md_len = md_len; 6762 6763 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6764 6765 bdev_io_submit(bdev_io); 6766 return 0; 6767 } 6768 6769 int 6770 spdk_bdev_nvme_iov_passthru_md(struct spdk_bdev_desc *desc, 6771 struct spdk_io_channel *ch, 6772 const struct spdk_nvme_cmd *cmd, 6773 struct iovec *iov, int iovcnt, size_t nbytes, 6774 void *md_buf, size_t md_len, 6775 spdk_bdev_io_completion_cb cb, void *cb_arg) 6776 { 6777 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6778 struct spdk_bdev_io *bdev_io; 6779 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6780 6781 if (!desc->write) { 6782 /* 6783 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6784 * to easily determine if the command is a read or write, but for now just 6785 * do not allow io_passthru with a read-only descriptor. 
6786 */ 6787 return -EBADF; 6788 } 6789 6790 if (md_buf && spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6791 return -ENOTSUP; 6792 } else if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6793 return -ENOTSUP; 6794 } 6795 6796 bdev_io = bdev_channel_get_io(channel); 6797 if (!bdev_io) { 6798 return -ENOMEM; 6799 } 6800 6801 bdev_io->internal.ch = channel; 6802 bdev_io->internal.desc = desc; 6803 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IOV_MD; 6804 bdev_io->u.nvme_passthru.cmd = *cmd; 6805 bdev_io->u.nvme_passthru.iovs = iov; 6806 bdev_io->u.nvme_passthru.iovcnt = iovcnt; 6807 bdev_io->u.nvme_passthru.nbytes = nbytes; 6808 bdev_io->u.nvme_passthru.md_buf = md_buf; 6809 bdev_io->u.nvme_passthru.md_len = md_len; 6810 6811 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6812 6813 bdev_io_submit(bdev_io); 6814 return 0; 6815 } 6816 6817 static void bdev_abort_retry(void *ctx); 6818 static void bdev_abort(struct spdk_bdev_io *parent_io); 6819 6820 static void 6821 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6822 { 6823 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6824 struct spdk_bdev_io *parent_io = cb_arg; 6825 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6826 6827 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6828 6829 spdk_bdev_free_io(bdev_io); 6830 6831 if (!success) { 6832 /* Check if the target I/O completed in the meantime. */ 6833 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6834 if (tmp_io == bio_to_abort) { 6835 break; 6836 } 6837 } 6838 6839 /* If the target I/O still exists, set the parent to failed. */ 6840 if (tmp_io != NULL) { 6841 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6842 } 6843 } 6844 6845 parent_io->u.bdev.split_outstanding--; 6846 if (parent_io->u.bdev.split_outstanding == 0) { 6847 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6848 bdev_abort_retry(parent_io); 6849 } else { 6850 bdev_io_complete(parent_io); 6851 } 6852 } 6853 } 6854 6855 static int 6856 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6857 struct spdk_bdev_io *bio_to_abort, 6858 spdk_bdev_io_completion_cb cb, void *cb_arg) 6859 { 6860 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6861 struct spdk_bdev_io *bdev_io; 6862 6863 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 6864 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 6865 /* TODO: Abort reset or abort request. */ 6866 return -ENOTSUP; 6867 } 6868 6869 bdev_io = bdev_channel_get_io(channel); 6870 if (bdev_io == NULL) { 6871 return -ENOMEM; 6872 } 6873 6874 bdev_io->internal.ch = channel; 6875 bdev_io->internal.desc = desc; 6876 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6877 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6878 6879 if (bdev->split_on_optimal_io_boundary && bio_to_abort->internal.split) { 6880 assert(bdev_io_should_split(bio_to_abort)); 6881 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 6882 6883 /* Parent abort request is not submitted directly, but to manage its 6884 * execution add it to the submitted list here. 6885 */ 6886 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6887 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6888 6889 bdev_abort(bdev_io); 6890 6891 return 0; 6892 } 6893 6894 bdev_io->u.abort.bio_to_abort = bio_to_abort; 6895 6896 /* Submit the abort request to the underlying bdev module. 
*/ 6897 bdev_io_submit(bdev_io); 6898 6899 return 0; 6900 } 6901 6902 static bool 6903 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq) 6904 { 6905 struct spdk_bdev_io *iter; 6906 6907 TAILQ_FOREACH(iter, tailq, internal.link) { 6908 if (iter == bdev_io) { 6909 return true; 6910 } 6911 } 6912 6913 return false; 6914 } 6915 6916 static uint32_t 6917 _bdev_abort(struct spdk_bdev_io *parent_io) 6918 { 6919 struct spdk_bdev_desc *desc = parent_io->internal.desc; 6920 struct spdk_bdev_channel *channel = parent_io->internal.ch; 6921 void *bio_cb_arg; 6922 struct spdk_bdev_io *bio_to_abort; 6923 uint32_t matched_ios; 6924 int rc; 6925 6926 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 6927 6928 /* matched_ios is returned and will be kept by the caller. 6929 * 6930 * This function will be used for two cases, 1) the same cb_arg is used for 6931 * multiple I/Os, 2) a single large I/O is split into smaller ones. 6932 * Incrementing split_outstanding directly here may confuse readers especially 6933 * for the 1st case. 6934 * 6935 * Completion of I/O abort is processed after stack unwinding. Hence this trick 6936 * works as expected. 6937 */ 6938 matched_ios = 0; 6939 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6940 6941 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 6942 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 6943 continue; 6944 } 6945 6946 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 6947 /* Any I/O which was submitted after this abort command should be excluded. */ 6948 continue; 6949 } 6950 6951 /* We can't abort a request that's being pushed/pulled or executed by accel */ 6952 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) || 6953 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) { 6954 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6955 break; 6956 } 6957 6958 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 6959 if (rc != 0) { 6960 if (rc == -ENOMEM) { 6961 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 6962 } else { 6963 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6964 } 6965 break; 6966 } 6967 matched_ios++; 6968 } 6969 6970 return matched_ios; 6971 } 6972 6973 static void 6974 bdev_abort_retry(void *ctx) 6975 { 6976 struct spdk_bdev_io *parent_io = ctx; 6977 uint32_t matched_ios; 6978 6979 matched_ios = _bdev_abort(parent_io); 6980 6981 if (matched_ios == 0) { 6982 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6983 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6984 } else { 6985 /* For retry, the case that no target I/O was found is success 6986 * because it means target I/Os completed in the meantime. 6987 */ 6988 bdev_io_complete(parent_io); 6989 } 6990 return; 6991 } 6992 6993 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6994 parent_io->u.bdev.split_outstanding = matched_ios; 6995 } 6996 6997 static void 6998 bdev_abort(struct spdk_bdev_io *parent_io) 6999 { 7000 uint32_t matched_ios; 7001 7002 matched_ios = _bdev_abort(parent_io); 7003 7004 if (matched_ios == 0) { 7005 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 7006 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 7007 } else { 7008 /* The case the no target I/O was found is failure. 
*/ 7009 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7010 bdev_io_complete(parent_io); 7011 } 7012 return; 7013 } 7014 7015 /* Use split_outstanding to manage the progress of aborting I/Os. */ 7016 parent_io->u.bdev.split_outstanding = matched_ios; 7017 } 7018 7019 int 7020 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 7021 void *bio_cb_arg, 7022 spdk_bdev_io_completion_cb cb, void *cb_arg) 7023 { 7024 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7025 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7026 struct spdk_bdev_io *bdev_io; 7027 7028 if (bio_cb_arg == NULL) { 7029 return -EINVAL; 7030 } 7031 7032 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 7033 return -ENOTSUP; 7034 } 7035 7036 bdev_io = bdev_channel_get_io(channel); 7037 if (bdev_io == NULL) { 7038 return -ENOMEM; 7039 } 7040 7041 bdev_io->internal.ch = channel; 7042 bdev_io->internal.desc = desc; 7043 bdev_io->internal.submit_tsc = spdk_get_ticks(); 7044 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 7045 bdev_io_init(bdev_io, bdev, cb_arg, cb); 7046 7047 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 7048 7049 /* Parent abort request is not submitted directly, but to manage its execution, 7050 * add it to the submitted list here. 7051 */ 7052 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 7053 7054 bdev_abort(bdev_io); 7055 7056 return 0; 7057 } 7058 7059 int 7060 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 7061 struct spdk_bdev_io_wait_entry *entry) 7062 { 7063 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 7064 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 7065 7066 if (bdev != entry->bdev) { 7067 SPDK_ERRLOG("bdevs do not match\n"); 7068 return -EINVAL; 7069 } 7070 7071 if (mgmt_ch->per_thread_cache_count > 0) { 7072 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 7073 return -EINVAL; 7074 } 7075 7076 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 7077 return 0; 7078 } 7079 7080 static inline void 7081 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 7082 { 7083 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 7084 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 7085 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 7086 uint32_t blocklen = bdev_io->bdev->blocklen; 7087 7088 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7089 switch (bdev_io->type) { 7090 case SPDK_BDEV_IO_TYPE_READ: 7091 io_stat->bytes_read += num_blocks * blocklen; 7092 io_stat->num_read_ops++; 7093 io_stat->read_latency_ticks += tsc_diff; 7094 if (io_stat->max_read_latency_ticks < tsc_diff) { 7095 io_stat->max_read_latency_ticks = tsc_diff; 7096 } 7097 if (io_stat->min_read_latency_ticks > tsc_diff) { 7098 io_stat->min_read_latency_ticks = tsc_diff; 7099 } 7100 break; 7101 case SPDK_BDEV_IO_TYPE_WRITE: 7102 io_stat->bytes_written += num_blocks * blocklen; 7103 io_stat->num_write_ops++; 7104 io_stat->write_latency_ticks += tsc_diff; 7105 if (io_stat->max_write_latency_ticks < tsc_diff) { 7106 io_stat->max_write_latency_ticks = tsc_diff; 7107 } 7108 if (io_stat->min_write_latency_ticks > tsc_diff) { 7109 io_stat->min_write_latency_ticks = tsc_diff; 7110 } 7111 break; 7112 case SPDK_BDEV_IO_TYPE_UNMAP: 7113 io_stat->bytes_unmapped += num_blocks * blocklen; 7114 io_stat->num_unmap_ops++; 7115 io_stat->unmap_latency_ticks += tsc_diff; 7116 if 
(io_stat->max_unmap_latency_ticks < tsc_diff) { 7117 io_stat->max_unmap_latency_ticks = tsc_diff; 7118 } 7119 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 7120 io_stat->min_unmap_latency_ticks = tsc_diff; 7121 } 7122 break; 7123 case SPDK_BDEV_IO_TYPE_ZCOPY: 7124 /* Track the data in the start phase only */ 7125 if (bdev_io->u.bdev.zcopy.start) { 7126 if (bdev_io->u.bdev.zcopy.populate) { 7127 io_stat->bytes_read += num_blocks * blocklen; 7128 io_stat->num_read_ops++; 7129 io_stat->read_latency_ticks += tsc_diff; 7130 if (io_stat->max_read_latency_ticks < tsc_diff) { 7131 io_stat->max_read_latency_ticks = tsc_diff; 7132 } 7133 if (io_stat->min_read_latency_ticks > tsc_diff) { 7134 io_stat->min_read_latency_ticks = tsc_diff; 7135 } 7136 } else { 7137 io_stat->bytes_written += num_blocks * blocklen; 7138 io_stat->num_write_ops++; 7139 io_stat->write_latency_ticks += tsc_diff; 7140 if (io_stat->max_write_latency_ticks < tsc_diff) { 7141 io_stat->max_write_latency_ticks = tsc_diff; 7142 } 7143 if (io_stat->min_write_latency_ticks > tsc_diff) { 7144 io_stat->min_write_latency_ticks = tsc_diff; 7145 } 7146 } 7147 } 7148 break; 7149 case SPDK_BDEV_IO_TYPE_COPY: 7150 io_stat->bytes_copied += num_blocks * blocklen; 7151 io_stat->num_copy_ops++; 7152 bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; 7153 if (io_stat->max_copy_latency_ticks < tsc_diff) { 7154 io_stat->max_copy_latency_ticks = tsc_diff; 7155 } 7156 if (io_stat->min_copy_latency_ticks > tsc_diff) { 7157 io_stat->min_copy_latency_ticks = tsc_diff; 7158 } 7159 break; 7160 default: 7161 break; 7162 } 7163 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 7164 io_stat = bdev_io->bdev->internal.stat; 7165 assert(io_stat->io_error != NULL); 7166 7167 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 7168 io_stat->io_error->error_status[-io_status - 1]++; 7169 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 7170 } 7171 7172 #ifdef SPDK_CONFIG_VTUNE 7173 uint64_t now_tsc = spdk_get_ticks(); 7174 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 7175 uint64_t data[5]; 7176 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 7177 7178 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 7179 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 7180 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 7181 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 7182 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 
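/*
 * The per-channel counters updated in this function are what
 * spdk_bdev_get_device_stat() (defined earlier in this file) aggregates across
 * all channels, on top of the stats of already deleted channels. A polling
 * sketch, with my_* names illustrative:
 *
 *	static void
 *	my_stat_done(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
 *		     void *cb_arg, int rc)
 *	{
 *		if (rc == 0) {
 *			// consume stat->num_read_ops, stat->bytes_read, ...
 *		}
 *		free(stat);
 *	}
 *
 *	struct spdk_bdev_io_stat *stat = calloc(1, sizeof(*stat));
 *
 *	if (stat != NULL) {
 *		spdk_bdev_get_device_stat(my_bdev, stat, my_stat_done, NULL);
 *	}
 */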
7183 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 7184 7185 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 7186 __itt_metadata_u64, 5, data); 7187 7188 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 7189 bdev_io->internal.ch->start_tsc = now_tsc; 7190 } 7191 #endif 7192 } 7193 7194 static inline void 7195 _bdev_io_complete(void *ctx) 7196 { 7197 struct spdk_bdev_io *bdev_io = ctx; 7198 7199 if (spdk_unlikely(bdev_io->internal.accel_sequence != NULL)) { 7200 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7201 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 7202 } 7203 7204 assert(bdev_io->internal.cb != NULL); 7205 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 7206 7207 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 7208 bdev_io->internal.caller_ctx); 7209 } 7210 7211 static inline void 7212 bdev_io_complete(void *ctx) 7213 { 7214 struct spdk_bdev_io *bdev_io = ctx; 7215 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7216 uint64_t tsc, tsc_diff; 7217 7218 if (spdk_unlikely(bdev_io->internal.in_submit_request)) { 7219 /* 7220 * Defer completion to avoid potential infinite recursion if the 7221 * user's completion callback issues a new I/O. 7222 */ 7223 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7224 bdev_io_complete, bdev_io); 7225 return; 7226 } 7227 7228 tsc = spdk_get_ticks(); 7229 tsc_diff = tsc - bdev_io->internal.submit_tsc; 7230 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, bdev_ch->trace_id, 0, (uintptr_t)bdev_io, 7231 bdev_io->internal.caller_ctx); 7232 7233 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 7234 7235 if (bdev_ch->histogram) { 7236 spdk_histogram_data_tally(bdev_ch->histogram, tsc_diff); 7237 } 7238 7239 bdev_io_update_io_stat(bdev_io, tsc_diff); 7240 _bdev_io_complete(bdev_io); 7241 } 7242 7243 /* The difference between this function and bdev_io_complete() is that this should be called to 7244 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 7245 * io_submitted list and don't have submit_tsc updated. 7246 */ 7247 static inline void 7248 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 7249 { 7250 /* Since the IO hasn't been submitted it's bound to be failed */ 7251 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7252 7253 /* At this point we don't know if the IO is completed from submission context or not, but, 7254 * since this is an error path, we can always do an spdk_thread_send_msg(). 
*/ 7255 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7256 _bdev_io_complete, bdev_io); 7257 } 7258 7259 static void bdev_destroy_cb(void *io_device); 7260 7261 static void 7262 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7263 { 7264 struct spdk_bdev_io *bdev_io = _ctx; 7265 7266 if (bdev_io->u.reset.ch_ref != NULL) { 7267 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7268 bdev_io->u.reset.ch_ref = NULL; 7269 } 7270 7271 bdev_io_complete(bdev_io); 7272 7273 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7274 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7275 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7276 } 7277 } 7278 7279 static void 7280 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7281 struct spdk_io_channel *_ch, void *_ctx) 7282 { 7283 struct spdk_bdev_io *bdev_io = _ctx; 7284 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7285 struct spdk_bdev_io *queued_reset; 7286 7287 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7288 while (!TAILQ_EMPTY(&ch->queued_resets)) { 7289 queued_reset = TAILQ_FIRST(&ch->queued_resets); 7290 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 7291 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 7292 } 7293 7294 spdk_bdev_for_each_channel_continue(i, 0); 7295 } 7296 7297 static void 7298 bdev_io_complete_sequence_cb(void *ctx, int status) 7299 { 7300 struct spdk_bdev_io *bdev_io = ctx; 7301 7302 /* u.bdev.accel_sequence should have already been cleared at this point */ 7303 assert(bdev_io->u.bdev.accel_sequence == NULL); 7304 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7305 bdev_io->internal.accel_sequence = NULL; 7306 7307 if (spdk_unlikely(status != 0)) { 7308 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7309 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7310 } 7311 7312 bdev_io_complete(bdev_io); 7313 } 7314 7315 void 7316 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7317 { 7318 struct spdk_bdev *bdev = bdev_io->bdev; 7319 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7320 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7321 7322 if (spdk_unlikely(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING)) { 7323 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7324 spdk_bdev_get_module_name(bdev), 7325 bdev_io_status_get_string(bdev_io->internal.status)); 7326 assert(false); 7327 } 7328 bdev_io->internal.status = status; 7329 7330 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7331 bool unlock_channels = false; 7332 7333 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 7334 SPDK_ERRLOG("NOMEM returned for reset\n"); 7335 } 7336 spdk_spin_lock(&bdev->internal.spinlock); 7337 if (bdev_io == bdev->internal.reset_in_progress) { 7338 bdev->internal.reset_in_progress = NULL; 7339 unlock_channels = true; 7340 } 7341 spdk_spin_unlock(&bdev->internal.spinlock); 7342 7343 if (unlock_channels) { 7344 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 7345 bdev_reset_complete); 7346 return; 7347 } 7348 } else { 7349 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7350 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7351 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7352 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7353 return; 7354 } else if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0 && 
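/*
 * From a backing module's point of view, the function above is the single
 * entry point for reporting completion of an I/O it received via
 * submit_request. A typical module completion callback is just (my_* names
 * illustrative):
 *
 *	static void
 *	my_module_io_done(void *ctx, int rc)
 *	{
 *		struct spdk_bdev_io *bdev_io = ctx;
 *
 *		spdk_bdev_io_complete(bdev_io, rc == 0 ? SPDK_BDEV_IO_STATUS_SUCCESS :
 *				      SPDK_BDEV_IO_STATUS_FAILED);
 *	}
 */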
7355 !bdev_io_use_accel_sequence(bdev_io))) { 7356 _bdev_io_push_bounce_data_buffer(bdev_io, 7357 _bdev_io_complete_push_bounce_done); 7358 /* bdev IO will be completed in the callback */ 7359 return; 7360 } 7361 } 7362 7363 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7364 return; 7365 } 7366 } 7367 7368 bdev_io_complete(bdev_io); 7369 } 7370 7371 void 7372 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 7373 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7374 { 7375 enum spdk_bdev_io_status status; 7376 7377 if (sc == SPDK_SCSI_STATUS_GOOD) { 7378 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7379 } else { 7380 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7381 bdev_io->internal.error.scsi.sc = sc; 7382 bdev_io->internal.error.scsi.sk = sk; 7383 bdev_io->internal.error.scsi.asc = asc; 7384 bdev_io->internal.error.scsi.ascq = ascq; 7385 } 7386 7387 spdk_bdev_io_complete(bdev_io, status); 7388 } 7389 7390 void 7391 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7392 int *sc, int *sk, int *asc, int *ascq) 7393 { 7394 assert(sc != NULL); 7395 assert(sk != NULL); 7396 assert(asc != NULL); 7397 assert(ascq != NULL); 7398 7399 switch (bdev_io->internal.status) { 7400 case SPDK_BDEV_IO_STATUS_SUCCESS: 7401 *sc = SPDK_SCSI_STATUS_GOOD; 7402 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7403 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7404 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7405 break; 7406 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7407 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7408 break; 7409 case SPDK_BDEV_IO_STATUS_MISCOMPARE: 7410 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7411 *sk = SPDK_SCSI_SENSE_MISCOMPARE; 7412 *asc = SPDK_SCSI_ASC_MISCOMPARE_DURING_VERIFY_OPERATION; 7413 *ascq = bdev_io->internal.error.scsi.ascq; 7414 break; 7415 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7416 *sc = bdev_io->internal.error.scsi.sc; 7417 *sk = bdev_io->internal.error.scsi.sk; 7418 *asc = bdev_io->internal.error.scsi.asc; 7419 *ascq = bdev_io->internal.error.scsi.ascq; 7420 break; 7421 default: 7422 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7423 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7424 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7425 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7426 break; 7427 } 7428 } 7429 7430 void 7431 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7432 { 7433 enum spdk_bdev_io_status status; 7434 7435 if (aio_result == 0) { 7436 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7437 } else { 7438 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7439 } 7440 7441 bdev_io->internal.error.aio_result = aio_result; 7442 7443 spdk_bdev_io_complete(bdev_io, status); 7444 } 7445 7446 void 7447 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7448 { 7449 assert(aio_result != NULL); 7450 7451 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7452 *aio_result = bdev_io->internal.error.aio_result; 7453 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7454 *aio_result = 0; 7455 } else { 7456 *aio_result = -EIO; 7457 } 7458 } 7459 7460 void 7461 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7462 { 7463 enum spdk_bdev_io_status status; 7464 7465 if (spdk_likely(sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS)) { 7466 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7467 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7468 status = SPDK_BDEV_IO_STATUS_ABORTED; 
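 /* Any other failure status falls through to the generic NVME_ERROR mapping below. */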
7469 } else { 7470 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7471 } 7472 7473 bdev_io->internal.error.nvme.cdw0 = cdw0; 7474 bdev_io->internal.error.nvme.sct = sct; 7475 bdev_io->internal.error.nvme.sc = sc; 7476 7477 spdk_bdev_io_complete(bdev_io, status); 7478 } 7479 7480 void 7481 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7482 { 7483 assert(sct != NULL); 7484 assert(sc != NULL); 7485 assert(cdw0 != NULL); 7486 7487 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7488 *sct = SPDK_NVME_SCT_GENERIC; 7489 *sc = SPDK_NVME_SC_SUCCESS; 7490 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7491 *cdw0 = 0; 7492 } else { 7493 *cdw0 = 1U; 7494 } 7495 return; 7496 } 7497 7498 if (spdk_likely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7499 *sct = SPDK_NVME_SCT_GENERIC; 7500 *sc = SPDK_NVME_SC_SUCCESS; 7501 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7502 *sct = bdev_io->internal.error.nvme.sct; 7503 *sc = bdev_io->internal.error.nvme.sc; 7504 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7505 *sct = SPDK_NVME_SCT_GENERIC; 7506 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7507 } else { 7508 *sct = SPDK_NVME_SCT_GENERIC; 7509 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7510 } 7511 7512 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7513 } 7514 7515 void 7516 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7517 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7518 { 7519 assert(first_sct != NULL); 7520 assert(first_sc != NULL); 7521 assert(second_sct != NULL); 7522 assert(second_sc != NULL); 7523 assert(cdw0 != NULL); 7524 7525 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7526 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7527 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7528 *first_sct = bdev_io->internal.error.nvme.sct; 7529 *first_sc = bdev_io->internal.error.nvme.sc; 7530 *second_sct = SPDK_NVME_SCT_GENERIC; 7531 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7532 } else { 7533 *first_sct = SPDK_NVME_SCT_GENERIC; 7534 *first_sc = SPDK_NVME_SC_SUCCESS; 7535 *second_sct = bdev_io->internal.error.nvme.sct; 7536 *second_sc = bdev_io->internal.error.nvme.sc; 7537 } 7538 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7539 *first_sct = SPDK_NVME_SCT_GENERIC; 7540 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7541 *second_sct = SPDK_NVME_SCT_GENERIC; 7542 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7543 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7544 *first_sct = SPDK_NVME_SCT_GENERIC; 7545 *first_sc = SPDK_NVME_SC_SUCCESS; 7546 *second_sct = SPDK_NVME_SCT_GENERIC; 7547 *second_sc = SPDK_NVME_SC_SUCCESS; 7548 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7549 *first_sct = SPDK_NVME_SCT_GENERIC; 7550 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7551 *second_sct = SPDK_NVME_SCT_GENERIC; 7552 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7553 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7554 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7555 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 7556 *second_sct = SPDK_NVME_SCT_GENERIC; 7557 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7558 } else { 7559 *first_sct = SPDK_NVME_SCT_GENERIC; 7560 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7561 *second_sct = SPDK_NVME_SCT_GENERIC; 7562 *second_sc 
= SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7563 } 7564 7565 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7566 } 7567 7568 void 7569 spdk_bdev_io_complete_base_io_status(struct spdk_bdev_io *bdev_io, 7570 const struct spdk_bdev_io *base_io) 7571 { 7572 switch (base_io->internal.status) { 7573 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7574 spdk_bdev_io_complete_nvme_status(bdev_io, 7575 base_io->internal.error.nvme.cdw0, 7576 base_io->internal.error.nvme.sct, 7577 base_io->internal.error.nvme.sc); 7578 break; 7579 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7580 spdk_bdev_io_complete_scsi_status(bdev_io, 7581 base_io->internal.error.scsi.sc, 7582 base_io->internal.error.scsi.sk, 7583 base_io->internal.error.scsi.asc, 7584 base_io->internal.error.scsi.ascq); 7585 break; 7586 case SPDK_BDEV_IO_STATUS_AIO_ERROR: 7587 spdk_bdev_io_complete_aio_status(bdev_io, base_io->internal.error.aio_result); 7588 break; 7589 default: 7590 spdk_bdev_io_complete(bdev_io, base_io->internal.status); 7591 break; 7592 } 7593 } 7594 7595 struct spdk_thread * 7596 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7597 { 7598 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7599 } 7600 7601 struct spdk_io_channel * 7602 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7603 { 7604 return bdev_io->internal.ch->channel; 7605 } 7606 7607 static int 7608 bdev_register(struct spdk_bdev *bdev) 7609 { 7610 char *bdev_name; 7611 char uuid[SPDK_UUID_STRING_LEN]; 7612 struct spdk_iobuf_opts iobuf_opts; 7613 int ret; 7614 7615 assert(bdev->module != NULL); 7616 7617 if (!bdev->name) { 7618 SPDK_ERRLOG("Bdev name is NULL\n"); 7619 return -EINVAL; 7620 } 7621 7622 if (!strlen(bdev->name)) { 7623 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7624 return -EINVAL; 7625 } 7626 7627 /* Users often register their own I/O devices using the bdev name. In 7628 * order to avoid conflicts, prepend bdev_. */ 7629 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7630 if (!bdev_name) { 7631 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7632 return -ENOMEM; 7633 } 7634 7635 bdev->internal.stat = bdev_alloc_io_stat(true); 7636 if (!bdev->internal.stat) { 7637 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7638 free(bdev_name); 7639 return -ENOMEM; 7640 } 7641 7642 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7643 bdev->internal.measured_queue_depth = UINT64_MAX; 7644 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7645 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7646 bdev->internal.qd_poller = NULL; 7647 bdev->internal.qos = NULL; 7648 7649 TAILQ_INIT(&bdev->internal.open_descs); 7650 TAILQ_INIT(&bdev->internal.locked_ranges); 7651 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7652 TAILQ_INIT(&bdev->aliases); 7653 7654 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 7655 if (ret != 0) { 7656 bdev_free_io_stat(bdev->internal.stat); 7657 free(bdev_name); 7658 return ret; 7659 } 7660 7661 /* UUID may be specified by the user or defined by bdev itself. 7662 * Otherwise it will be generated here, so this field will never be empty. 
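 * When the generated or provided UUID differs from the bdev name, it is also registered
 * as an alias just below.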
*/ 7663 if (spdk_uuid_is_null(&bdev->uuid)) { 7664 spdk_uuid_generate(&bdev->uuid); 7665 } 7666 7667 /* Add the UUID alias only if it's different than the name */ 7668 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7669 if (strcmp(bdev->name, uuid) != 0) { 7670 ret = spdk_bdev_alias_add(bdev, uuid); 7671 if (ret != 0) { 7672 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7673 bdev_name_del(&bdev->internal.bdev_name); 7674 bdev_free_io_stat(bdev->internal.stat); 7675 free(bdev_name); 7676 return ret; 7677 } 7678 } 7679 7680 spdk_iobuf_get_opts(&iobuf_opts, sizeof(iobuf_opts)); 7681 if (spdk_bdev_get_buf_align(bdev) > 1) { 7682 bdev->max_rw_size = spdk_min(bdev->max_rw_size ? bdev->max_rw_size : UINT32_MAX, 7683 iobuf_opts.large_bufsize / bdev->blocklen); 7684 } 7685 7686 /* If the user didn't specify a write unit size, set it to one. */ 7687 if (bdev->write_unit_size == 0) { 7688 bdev->write_unit_size = 1; 7689 } 7690 7691 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7692 if (bdev->acwu == 0) { 7693 bdev->acwu = bdev->write_unit_size; 7694 } 7695 7696 if (bdev->phys_blocklen == 0) { 7697 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7698 } 7699 7700 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 7701 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 7702 } 7703 7704 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 7705 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 7706 } 7707 7708 bdev->internal.reset_in_progress = NULL; 7709 bdev->internal.qd_poll_in_progress = false; 7710 bdev->internal.period = 0; 7711 bdev->internal.new_period = 0; 7712 bdev->internal.trace_id = spdk_trace_register_owner(OWNER_TYPE_BDEV, bdev_name); 7713 7714 spdk_io_device_register(__bdev_to_io_dev(bdev), 7715 bdev_channel_create, bdev_channel_destroy, 7716 sizeof(struct spdk_bdev_channel), 7717 bdev_name); 7718 7719 free(bdev_name); 7720 7721 spdk_spin_init(&bdev->internal.spinlock); 7722 7723 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 7724 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 7725 7726 return 0; 7727 } 7728 7729 static void 7730 bdev_destroy_cb(void *io_device) 7731 { 7732 int rc; 7733 struct spdk_bdev *bdev; 7734 spdk_bdev_unregister_cb cb_fn; 7735 void *cb_arg; 7736 7737 bdev = __bdev_from_io_dev(io_device); 7738 7739 if (bdev->internal.unregister_td != spdk_get_thread()) { 7740 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 7741 return; 7742 } 7743 7744 cb_fn = bdev->internal.unregister_cb; 7745 cb_arg = bdev->internal.unregister_ctx; 7746 7747 spdk_spin_destroy(&bdev->internal.spinlock); 7748 free(bdev->internal.qos); 7749 bdev_free_io_stat(bdev->internal.stat); 7750 spdk_trace_unregister_owner(bdev->internal.trace_id); 7751 7752 rc = bdev->fn_table->destruct(bdev->ctxt); 7753 if (rc < 0) { 7754 SPDK_ERRLOG("destruct failed\n"); 7755 } 7756 if (rc <= 0 && cb_fn != NULL) { 7757 cb_fn(cb_arg, rc); 7758 } 7759 } 7760 7761 void 7762 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 7763 { 7764 if (bdev->internal.unregister_cb != NULL) { 7765 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 7766 } 7767 } 7768 7769 static void 7770 _remove_notify(void *arg) 7771 { 7772 struct spdk_bdev_desc *desc = arg; 7773 7774 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 7775 } 7776 7777 /* returns: 0 - bdev removed and ready to be 
destructed. 7778 * -EBUSY - bdev can't be destructed yet. */ 7779 static int 7780 bdev_unregister_unsafe(struct spdk_bdev *bdev) 7781 { 7782 struct spdk_bdev_desc *desc, *tmp; 7783 int rc = 0; 7784 char uuid[SPDK_UUID_STRING_LEN]; 7785 7786 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 7787 assert(spdk_spin_held(&bdev->internal.spinlock)); 7788 7789 /* Notify each descriptor about hotremoval */ 7790 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 7791 rc = -EBUSY; 7792 /* 7793 * Defer invocation of the event_cb to a separate message that will 7794 * run later on its thread. This ensures this context unwinds and 7795 * we don't recursively unregister this bdev again if the event_cb 7796 * immediately closes its descriptor. 7797 */ 7798 event_notify(desc, _remove_notify); 7799 } 7800 7801 /* If there are no descriptors, proceed removing the bdev */ 7802 if (rc == 0) { 7803 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 7804 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 7805 7806 /* Delete the name and the UUID alias */ 7807 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7808 bdev_name_del_unsafe(&bdev->internal.bdev_name); 7809 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 7810 7811 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 7812 7813 if (bdev->internal.reset_in_progress != NULL) { 7814 /* If reset is in progress, let the completion callback for reset 7815 * unregister the bdev. 7816 */ 7817 rc = -EBUSY; 7818 } 7819 } 7820 7821 return rc; 7822 } 7823 7824 static void 7825 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7826 struct spdk_io_channel *io_ch, void *_ctx) 7827 { 7828 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 7829 7830 bdev_channel_abort_queued_ios(bdev_ch); 7831 spdk_bdev_for_each_channel_continue(i, 0); 7832 } 7833 7834 static void 7835 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 7836 { 7837 int rc; 7838 7839 spdk_spin_lock(&g_bdev_mgr.spinlock); 7840 spdk_spin_lock(&bdev->internal.spinlock); 7841 /* 7842 * Set the status to REMOVING after completing to abort channels. Otherwise, 7843 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 7844 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 7845 * may fail. 7846 */ 7847 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 7848 rc = bdev_unregister_unsafe(bdev); 7849 spdk_spin_unlock(&bdev->internal.spinlock); 7850 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7851 7852 if (rc == 0) { 7853 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7854 } 7855 } 7856 7857 void 7858 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7859 { 7860 struct spdk_thread *thread; 7861 7862 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 7863 7864 thread = spdk_get_thread(); 7865 if (!thread) { 7866 /* The user called this from a non-SPDK thread. 
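 * Unregistration iterates the bdev's channels with spdk_bdev_for_each_channel(), which
 * requires an SPDK thread, so report -ENOTSUP to the caller instead.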
*/ 7867 if (cb_fn != NULL) { 7868 cb_fn(cb_arg, -ENOTSUP); 7869 } 7870 return; 7871 } 7872 7873 spdk_spin_lock(&g_bdev_mgr.spinlock); 7874 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7875 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7876 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7877 if (cb_fn) { 7878 cb_fn(cb_arg, -EBUSY); 7879 } 7880 return; 7881 } 7882 7883 spdk_spin_lock(&bdev->internal.spinlock); 7884 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 7885 bdev->internal.unregister_cb = cb_fn; 7886 bdev->internal.unregister_ctx = cb_arg; 7887 bdev->internal.unregister_td = thread; 7888 spdk_spin_unlock(&bdev->internal.spinlock); 7889 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7890 7891 spdk_bdev_set_qd_sampling_period(bdev, 0); 7892 7893 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 7894 bdev_unregister); 7895 } 7896 7897 int 7898 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 7899 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7900 { 7901 struct spdk_bdev_desc *desc; 7902 struct spdk_bdev *bdev; 7903 int rc; 7904 7905 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 7906 if (rc != 0) { 7907 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 7908 return rc; 7909 } 7910 7911 bdev = spdk_bdev_desc_get_bdev(desc); 7912 7913 if (bdev->module != module) { 7914 spdk_bdev_close(desc); 7915 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 7916 bdev_name); 7917 return -ENODEV; 7918 } 7919 7920 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 7921 7922 spdk_bdev_close(desc); 7923 7924 return 0; 7925 } 7926 7927 static int 7928 bdev_start_qos(struct spdk_bdev *bdev) 7929 { 7930 struct set_qos_limit_ctx *ctx; 7931 7932 /* Enable QoS */ 7933 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 7934 ctx = calloc(1, sizeof(*ctx)); 7935 if (ctx == NULL) { 7936 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 7937 return -ENOMEM; 7938 } 7939 ctx->bdev = bdev; 7940 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 7941 } 7942 7943 return 0; 7944 } 7945 7946 static void 7947 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 7948 struct spdk_bdev *bdev) 7949 { 7950 enum spdk_bdev_claim_type type; 7951 const char *typename, *modname; 7952 extern struct spdk_log_flag SPDK_LOG_bdev; 7953 7954 assert(spdk_spin_held(&bdev->internal.spinlock)); 7955 7956 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 7957 return; 7958 } 7959 7960 type = bdev->internal.claim_type; 7961 typename = spdk_bdev_claim_get_name(type); 7962 7963 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 7964 modname = bdev->internal.claim.v1.module->name; 7965 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7966 bdev->name, detail, typename, modname); 7967 return; 7968 } 7969 7970 if (claim_type_is_v2(type)) { 7971 struct spdk_bdev_module_claim *claim; 7972 7973 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 7974 modname = claim->module->name; 7975 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7976 bdev->name, detail, typename, modname); 7977 } 7978 return; 7979 } 7980 7981 assert(false); 7982 } 7983 7984 static int 7985 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 7986 { 7987 struct spdk_thread *thread; 7988 int rc = 0; 7989 7990 thread = spdk_get_thread(); 7991 if (!thread) { 7992 
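 /* The descriptor is bound to the opening thread (desc->thread below), so opening
  * from a non-SPDK thread is not supported. */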
SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 7993 return -ENOTSUP; 7994 } 7995 7996 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7997 spdk_get_thread()); 7998 7999 desc->bdev = bdev; 8000 desc->thread = thread; 8001 desc->write = write; 8002 8003 spdk_spin_lock(&bdev->internal.spinlock); 8004 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 8005 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 8006 spdk_spin_unlock(&bdev->internal.spinlock); 8007 return -ENODEV; 8008 } 8009 8010 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8011 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8012 spdk_spin_unlock(&bdev->internal.spinlock); 8013 return -EPERM; 8014 } 8015 8016 rc = bdev_start_qos(bdev); 8017 if (rc != 0) { 8018 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 8019 spdk_spin_unlock(&bdev->internal.spinlock); 8020 return rc; 8021 } 8022 8023 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 8024 8025 spdk_spin_unlock(&bdev->internal.spinlock); 8026 8027 return 0; 8028 } 8029 8030 static int 8031 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 8032 struct spdk_bdev_desc **_desc) 8033 { 8034 struct spdk_bdev_desc *desc; 8035 unsigned int i; 8036 8037 desc = calloc(1, sizeof(*desc)); 8038 if (desc == NULL) { 8039 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 8040 return -ENOMEM; 8041 } 8042 8043 TAILQ_INIT(&desc->pending_media_events); 8044 TAILQ_INIT(&desc->free_media_events); 8045 8046 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 8047 desc->callback.event_fn = event_cb; 8048 desc->callback.ctx = event_ctx; 8049 spdk_spin_init(&desc->spinlock); 8050 8051 if (bdev->media_events) { 8052 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 8053 sizeof(*desc->media_events_buffer)); 8054 if (desc->media_events_buffer == NULL) { 8055 SPDK_ERRLOG("Failed to initialize media event pool\n"); 8056 bdev_desc_free(desc); 8057 return -ENOMEM; 8058 } 8059 8060 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 8061 TAILQ_INSERT_TAIL(&desc->free_media_events, 8062 &desc->media_events_buffer[i], tailq); 8063 } 8064 } 8065 8066 if (bdev->fn_table->accel_sequence_supported != NULL) { 8067 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 8068 desc->accel_sequence_supported[i] = 8069 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 8070 (enum spdk_bdev_io_type)i); 8071 } 8072 } 8073 8074 *_desc = desc; 8075 8076 return 0; 8077 } 8078 8079 static int 8080 bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8081 void *event_ctx, struct spdk_bdev_desc **_desc) 8082 { 8083 struct spdk_bdev_desc *desc; 8084 struct spdk_bdev *bdev; 8085 int rc; 8086 8087 bdev = bdev_get_by_name(bdev_name); 8088 8089 if (bdev == NULL) { 8090 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 8091 return -ENODEV; 8092 } 8093 8094 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 8095 if (rc != 0) { 8096 return rc; 8097 } 8098 8099 rc = bdev_open(bdev, write, desc); 8100 if (rc != 0) { 8101 bdev_desc_free(desc); 8102 desc = NULL; 8103 } 8104 8105 *_desc = desc; 8106 8107 return rc; 8108 } 8109 8110 int 8111 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8112 void *event_ctx, struct spdk_bdev_desc **_desc) 8113 { 8114 int rc; 8115 8116 if (event_cb == NULL) { 8117 SPDK_ERRLOG("Missing event callback function\n"); 8118 return 
-EINVAL; 8119 } 8120 8121 spdk_spin_lock(&g_bdev_mgr.spinlock); 8122 rc = bdev_open_ext(bdev_name, write, event_cb, event_ctx, _desc); 8123 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8124 8125 return rc; 8126 } 8127 8128 struct spdk_bdev_open_async_ctx { 8129 char *bdev_name; 8130 spdk_bdev_event_cb_t event_cb; 8131 void *event_ctx; 8132 bool write; 8133 int rc; 8134 spdk_bdev_open_async_cb_t cb_fn; 8135 void *cb_arg; 8136 struct spdk_bdev_desc *desc; 8137 struct spdk_bdev_open_async_opts opts; 8138 uint64_t start_ticks; 8139 struct spdk_thread *orig_thread; 8140 struct spdk_poller *poller; 8141 TAILQ_ENTRY(spdk_bdev_open_async_ctx) tailq; 8142 }; 8143 8144 static void 8145 bdev_open_async_done(void *arg) 8146 { 8147 struct spdk_bdev_open_async_ctx *ctx = arg; 8148 8149 ctx->cb_fn(ctx->desc, ctx->rc, ctx->cb_arg); 8150 8151 free(ctx->bdev_name); 8152 free(ctx); 8153 } 8154 8155 static void 8156 bdev_open_async_cancel(void *arg) 8157 { 8158 struct spdk_bdev_open_async_ctx *ctx = arg; 8159 8160 assert(ctx->rc == -ESHUTDOWN); 8161 8162 spdk_poller_unregister(&ctx->poller); 8163 8164 bdev_open_async_done(ctx); 8165 } 8166 8167 /* This is called when the bdev library finishes at shutdown. */ 8168 static void 8169 bdev_open_async_fini(void) 8170 { 8171 struct spdk_bdev_open_async_ctx *ctx, *tmp_ctx; 8172 8173 spdk_spin_lock(&g_bdev_mgr.spinlock); 8174 TAILQ_FOREACH_SAFE(ctx, &g_bdev_mgr.async_bdev_opens, tailq, tmp_ctx) { 8175 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8176 /* 8177 * We have to move to ctx->orig_thread to unregister ctx->poller. 8178 * However, there is a chance that ctx->poller is executed before 8179 * message is executed, which could result in bdev_open_async_done() 8180 * being called twice. To avoid such race condition, set ctx->rc to 8181 * -ESHUTDOWN. 8182 */ 8183 ctx->rc = -ESHUTDOWN; 8184 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_cancel, ctx); 8185 } 8186 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8187 } 8188 8189 static int bdev_open_async(void *arg); 8190 8191 static void 8192 _bdev_open_async(struct spdk_bdev_open_async_ctx *ctx) 8193 { 8194 uint64_t timeout_ticks; 8195 8196 if (ctx->rc == -ESHUTDOWN) { 8197 /* This context is being canceled. Do nothing. */ 8198 return; 8199 } 8200 8201 ctx->rc = bdev_open_ext(ctx->bdev_name, ctx->write, ctx->event_cb, ctx->event_ctx, 8202 &ctx->desc); 8203 if (ctx->rc == 0 || ctx->opts.timeout_ms == 0) { 8204 goto exit; 8205 } 8206 8207 timeout_ticks = ctx->start_ticks + ctx->opts.timeout_ms * spdk_get_ticks_hz() / 1000ull; 8208 if (spdk_get_ticks() >= timeout_ticks) { 8209 SPDK_ERRLOG("Timed out while waiting for bdev '%s' to appear\n", ctx->bdev_name); 8210 ctx->rc = -ETIMEDOUT; 8211 goto exit; 8212 } 8213 8214 return; 8215 8216 exit: 8217 spdk_poller_unregister(&ctx->poller); 8218 TAILQ_REMOVE(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8219 8220 /* Completion callback is processed after stack unwinding. 
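 * Deferring via spdk_thread_send_msg() keeps the user's open callback from running inside
 * the poller's or spdk_bdev_open_async()'s call stack.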
*/ 8221 spdk_thread_send_msg(ctx->orig_thread, bdev_open_async_done, ctx); 8222 } 8223 8224 static int 8225 bdev_open_async(void *arg) 8226 { 8227 struct spdk_bdev_open_async_ctx *ctx = arg; 8228 8229 spdk_spin_lock(&g_bdev_mgr.spinlock); 8230 8231 _bdev_open_async(ctx); 8232 8233 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8234 8235 return SPDK_POLLER_BUSY; 8236 } 8237 8238 static void 8239 bdev_open_async_opts_copy(struct spdk_bdev_open_async_opts *opts, 8240 struct spdk_bdev_open_async_opts *opts_src, 8241 size_t size) 8242 { 8243 assert(opts); 8244 assert(opts_src); 8245 8246 opts->size = size; 8247 8248 #define SET_FIELD(field) \ 8249 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8250 opts->field = opts_src->field; \ 8251 } \ 8252 8253 SET_FIELD(timeout_ms); 8254 8255 /* Do not remove this statement, you should always update this statement when you adding a new field, 8256 * and do not forget to add the SET_FIELD statement for your added field. */ 8257 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_open_async_opts) == 16, "Incorrect size"); 8258 8259 #undef SET_FIELD 8260 } 8261 8262 static void 8263 bdev_open_async_opts_get_default(struct spdk_bdev_open_async_opts *opts, size_t size) 8264 { 8265 assert(opts); 8266 8267 opts->size = size; 8268 8269 #define SET_FIELD(field, value) \ 8270 if (offsetof(struct spdk_bdev_open_async_opts, field) + sizeof(opts->field) <= size) { \ 8271 opts->field = value; \ 8272 } \ 8273 8274 SET_FIELD(timeout_ms, 0); 8275 8276 #undef SET_FIELD 8277 } 8278 8279 int 8280 spdk_bdev_open_async(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 8281 void *event_ctx, struct spdk_bdev_open_async_opts *opts, 8282 spdk_bdev_open_async_cb_t open_cb, void *open_cb_arg) 8283 { 8284 struct spdk_bdev_open_async_ctx *ctx; 8285 8286 if (event_cb == NULL) { 8287 SPDK_ERRLOG("Missing event callback function\n"); 8288 return -EINVAL; 8289 } 8290 8291 if (open_cb == NULL) { 8292 SPDK_ERRLOG("Missing open callback function\n"); 8293 return -EINVAL; 8294 } 8295 8296 if (opts != NULL && opts->size == 0) { 8297 SPDK_ERRLOG("size in the options structure should not be zero\n"); 8298 return -EINVAL; 8299 } 8300 8301 ctx = calloc(1, sizeof(*ctx)); 8302 if (ctx == NULL) { 8303 SPDK_ERRLOG("Failed to allocate open context\n"); 8304 return -ENOMEM; 8305 } 8306 8307 ctx->bdev_name = strdup(bdev_name); 8308 if (ctx->bdev_name == NULL) { 8309 SPDK_ERRLOG("Failed to duplicate bdev_name\n"); 8310 free(ctx); 8311 return -ENOMEM; 8312 } 8313 8314 ctx->poller = SPDK_POLLER_REGISTER(bdev_open_async, ctx, 100 * 1000); 8315 if (ctx->poller == NULL) { 8316 SPDK_ERRLOG("Failed to register bdev_open_async poller\n"); 8317 free(ctx->bdev_name); 8318 free(ctx); 8319 return -ENOMEM; 8320 } 8321 8322 ctx->cb_fn = open_cb; 8323 ctx->cb_arg = open_cb_arg; 8324 ctx->write = write; 8325 ctx->event_cb = event_cb; 8326 ctx->event_ctx = event_ctx; 8327 ctx->orig_thread = spdk_get_thread(); 8328 ctx->start_ticks = spdk_get_ticks(); 8329 8330 bdev_open_async_opts_get_default(&ctx->opts, sizeof(ctx->opts)); 8331 if (opts != NULL) { 8332 bdev_open_async_opts_copy(&ctx->opts, opts, opts->size); 8333 } 8334 8335 spdk_spin_lock(&g_bdev_mgr.spinlock); 8336 8337 TAILQ_INSERT_TAIL(&g_bdev_mgr.async_bdev_opens, ctx, tailq); 8338 _bdev_open_async(ctx); 8339 8340 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8341 8342 return 0; 8343 } 8344 8345 static void 8346 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 8347 { 8348 int rc; 8349 8350 
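 /* Called with g_bdev_mgr.spinlock held (see spdk_bdev_close(), bdev_register_finished()
  * and spdk_for_each_bdev()); the bdev and descriptor spinlocks are taken below. */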
spdk_spin_lock(&bdev->internal.spinlock); 8351 spdk_spin_lock(&desc->spinlock); 8352 8353 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 8354 8355 desc->closed = true; 8356 8357 if (desc->claim != NULL) { 8358 bdev_desc_release_claims(desc); 8359 } 8360 8361 if (0 == desc->refs) { 8362 spdk_spin_unlock(&desc->spinlock); 8363 bdev_desc_free(desc); 8364 } else { 8365 spdk_spin_unlock(&desc->spinlock); 8366 } 8367 8368 /* If no more descriptors, kill QoS channel */ 8369 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8370 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 8371 bdev->name, spdk_get_thread()); 8372 8373 if (bdev_qos_destroy(bdev)) { 8374 /* There isn't anything we can do to recover here. Just let the 8375 * old QoS poller keep running. The QoS handling won't change 8376 * cores when the user allocates a new channel, but it won't break. */ 8377 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 8378 } 8379 } 8380 8381 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 8382 rc = bdev_unregister_unsafe(bdev); 8383 spdk_spin_unlock(&bdev->internal.spinlock); 8384 8385 if (rc == 0) { 8386 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 8387 } 8388 } else { 8389 spdk_spin_unlock(&bdev->internal.spinlock); 8390 } 8391 } 8392 8393 void 8394 spdk_bdev_close(struct spdk_bdev_desc *desc) 8395 { 8396 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8397 8398 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 8399 spdk_get_thread()); 8400 8401 assert(desc->thread == spdk_get_thread()); 8402 8403 spdk_poller_unregister(&desc->io_timeout_poller); 8404 8405 spdk_spin_lock(&g_bdev_mgr.spinlock); 8406 8407 bdev_close(bdev, desc); 8408 8409 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8410 } 8411 8412 static void 8413 bdev_register_finished(void *arg) 8414 { 8415 struct spdk_bdev_desc *desc = arg; 8416 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8417 8418 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 8419 8420 spdk_spin_lock(&g_bdev_mgr.spinlock); 8421 8422 bdev_close(bdev, desc); 8423 8424 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8425 } 8426 8427 int 8428 spdk_bdev_register(struct spdk_bdev *bdev) 8429 { 8430 struct spdk_bdev_desc *desc; 8431 struct spdk_thread *thread = spdk_get_thread(); 8432 int rc; 8433 8434 if (spdk_unlikely(!spdk_thread_is_app_thread(NULL))) { 8435 SPDK_ERRLOG("Cannot register bdev %s on thread %p (%s)\n", bdev->name, thread, 8436 thread ? 
spdk_thread_get_name(thread) : "null"); 8437 return -EINVAL; 8438 } 8439 8440 rc = bdev_register(bdev); 8441 if (rc != 0) { 8442 return rc; 8443 } 8444 8445 /* A descriptor is opened to prevent bdev deletion during examination */ 8446 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8447 if (rc != 0) { 8448 spdk_bdev_unregister(bdev, NULL, NULL); 8449 return rc; 8450 } 8451 8452 rc = bdev_open(bdev, false, desc); 8453 if (rc != 0) { 8454 bdev_desc_free(desc); 8455 spdk_bdev_unregister(bdev, NULL, NULL); 8456 return rc; 8457 } 8458 8459 /* Examine configuration before initializing I/O */ 8460 bdev_examine(bdev); 8461 8462 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 8463 if (rc != 0) { 8464 bdev_close(bdev, desc); 8465 spdk_bdev_unregister(bdev, NULL, NULL); 8466 } 8467 8468 return rc; 8469 } 8470 8471 int 8472 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 8473 struct spdk_bdev_module *module) 8474 { 8475 spdk_spin_lock(&bdev->internal.spinlock); 8476 8477 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8478 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8479 spdk_spin_unlock(&bdev->internal.spinlock); 8480 return -EPERM; 8481 } 8482 8483 if (desc && !desc->write) { 8484 desc->write = true; 8485 } 8486 8487 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 8488 bdev->internal.claim.v1.module = module; 8489 8490 spdk_spin_unlock(&bdev->internal.spinlock); 8491 return 0; 8492 } 8493 8494 void 8495 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 8496 { 8497 spdk_spin_lock(&bdev->internal.spinlock); 8498 8499 assert(bdev->internal.claim.v1.module != NULL); 8500 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 8501 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8502 bdev->internal.claim.v1.module = NULL; 8503 8504 spdk_spin_unlock(&bdev->internal.spinlock); 8505 } 8506 8507 /* 8508 * Start claims v2 8509 */ 8510 8511 const char * 8512 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 8513 { 8514 switch (type) { 8515 case SPDK_BDEV_CLAIM_NONE: 8516 return "not_claimed"; 8517 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8518 return "exclusive_write"; 8519 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8520 return "read_many_write_one"; 8521 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8522 return "read_many_write_none"; 8523 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8524 return "read_many_write_many"; 8525 default: 8526 break; 8527 } 8528 return "invalid_claim"; 8529 } 8530 8531 static bool 8532 claim_type_is_v2(enum spdk_bdev_claim_type type) 8533 { 8534 switch (type) { 8535 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8536 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8537 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8538 return true; 8539 default: 8540 break; 8541 } 8542 return false; 8543 } 8544 8545 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
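 * Only the v2 claim types that grant write access promote the descriptor;
 * read_many_write_none never does.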
*/ 8546 static bool 8547 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 8548 { 8549 switch (type) { 8550 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8551 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8552 return true; 8553 default: 8554 break; 8555 } 8556 return false; 8557 } 8558 8559 void 8560 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 8561 { 8562 if (opts == NULL) { 8563 SPDK_ERRLOG("opts should not be NULL\n"); 8564 assert(opts != NULL); 8565 return; 8566 } 8567 if (size == 0) { 8568 SPDK_ERRLOG("size should not be zero\n"); 8569 assert(size != 0); 8570 return; 8571 } 8572 8573 memset(opts, 0, size); 8574 opts->opts_size = size; 8575 8576 #define FIELD_OK(field) \ 8577 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 8578 8579 #define SET_FIELD(field, value) \ 8580 if (FIELD_OK(field)) { \ 8581 opts->field = value; \ 8582 } \ 8583 8584 SET_FIELD(shared_claim_key, 0); 8585 8586 #undef FIELD_OK 8587 #undef SET_FIELD 8588 } 8589 8590 static int 8591 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 8592 { 8593 if (src->opts_size == 0) { 8594 SPDK_ERRLOG("size should not be zero\n"); 8595 return -1; 8596 } 8597 8598 memset(dst, 0, sizeof(*dst)); 8599 dst->opts_size = src->opts_size; 8600 8601 #define FIELD_OK(field) \ 8602 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 8603 8604 #define SET_FIELD(field) \ 8605 if (FIELD_OK(field)) { \ 8606 dst->field = src->field; \ 8607 } \ 8608 8609 if (FIELD_OK(name)) { 8610 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 8611 } 8612 8613 SET_FIELD(shared_claim_key); 8614 8615 /* You should not remove this statement, but need to update the assert statement 8616 * if you add a new field, and also add a corresponding SET_FIELD statement */ 8617 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 8618 8619 #undef FIELD_OK 8620 #undef SET_FIELD 8621 return 0; 8622 } 8623 8624 /* Returns 0 if a read-write-once claim can be taken. */ 8625 static int 8626 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8627 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8628 { 8629 struct spdk_bdev *bdev = desc->bdev; 8630 struct spdk_bdev_desc *open_desc; 8631 8632 assert(spdk_spin_held(&bdev->internal.spinlock)); 8633 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 8634 8635 if (opts->shared_claim_key != 0) { 8636 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 8637 bdev->name); 8638 return -EINVAL; 8639 } 8640 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8641 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8642 return -EPERM; 8643 } 8644 if (desc->claim != NULL) { 8645 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 8646 bdev->name, desc->claim->module->name); 8647 return -EPERM; 8648 } 8649 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8650 if (desc != open_desc && open_desc->write) { 8651 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 8652 "another descriptor is open for writing\n", 8653 bdev->name); 8654 return -EPERM; 8655 } 8656 } 8657 8658 return 0; 8659 } 8660 8661 /* Returns 0 if a read-only-many claim can be taken. 
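 * Requires a read-only descriptor, no shared_claim_key, and, while the bdev is still
 * unclaimed, no other descriptor open for writing.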
*/ 8662 static int 8663 claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8664 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8665 { 8666 struct spdk_bdev *bdev = desc->bdev; 8667 struct spdk_bdev_desc *open_desc; 8668 8669 assert(spdk_spin_held(&bdev->internal.spinlock)); 8670 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE); 8671 assert(desc->claim == NULL); 8672 8673 if (desc->write) { 8674 SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n", 8675 bdev->name); 8676 return -EINVAL; 8677 } 8678 if (opts->shared_claim_key != 0) { 8679 SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name); 8680 return -EINVAL; 8681 } 8682 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8683 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8684 if (open_desc->write) { 8685 SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while " 8686 "another descriptor is open for writing\n", 8687 bdev->name); 8688 return -EPERM; 8689 } 8690 } 8691 } 8692 8693 return 0; 8694 } 8695 8696 /* Returns 0 if a read-write-many claim can be taken. */ 8697 static int 8698 claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8699 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8700 { 8701 struct spdk_bdev *bdev = desc->bdev; 8702 struct spdk_bdev_desc *open_desc; 8703 8704 assert(spdk_spin_held(&bdev->internal.spinlock)); 8705 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED); 8706 assert(desc->claim == NULL); 8707 8708 if (opts->shared_claim_key == 0) { 8709 SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n", 8710 bdev->name); 8711 return -EINVAL; 8712 } 8713 switch (bdev->internal.claim_type) { 8714 case SPDK_BDEV_CLAIM_NONE: 8715 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8716 if (open_desc == desc) { 8717 continue; 8718 } 8719 if (open_desc->write) { 8720 SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while " 8721 "another descriptor is open for writing without a " 8722 "claim\n", bdev->name); 8723 return -EPERM; 8724 } 8725 } 8726 break; 8727 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8728 if (opts->shared_claim_key != bdev->internal.claim.v2.key) { 8729 LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev); 8730 return -EPERM; 8731 } 8732 break; 8733 default: 8734 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8735 return -EBUSY; 8736 } 8737 8738 return 0; 8739 } 8740 8741 /* Updates desc and its bdev with a v2 claim.
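 * The caller holds bdev->internal.spinlock and has already validated the request with one
 * of the claim_verify_*() helpers above. Illustrative caller-side sketch (the module and
 * descriptor names here are hypothetical, not part of this file):
 *
 *   struct spdk_bdev_claim_opts opts;
 *   int rc;
 *
 *   spdk_bdev_claim_opts_init(&opts, sizeof(opts));
 *   rc = spdk_bdev_module_claim_bdev_desc(desc, SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE,
 *                                         &opts, &my_bdev_module);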
*/ 8742 static int 8743 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8744 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8745 { 8746 struct spdk_bdev *bdev = desc->bdev; 8747 struct spdk_bdev_module_claim *claim; 8748 8749 assert(spdk_spin_held(&bdev->internal.spinlock)); 8750 assert(claim_type_is_v2(type)); 8751 assert(desc->claim == NULL); 8752 8753 claim = calloc(1, sizeof(*desc->claim)); 8754 if (claim == NULL) { 8755 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 8756 return -ENOMEM; 8757 } 8758 claim->module = module; 8759 claim->desc = desc; 8760 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 8761 memcpy(claim->name, opts->name, sizeof(claim->name)); 8762 desc->claim = claim; 8763 8764 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8765 bdev->internal.claim_type = type; 8766 TAILQ_INIT(&bdev->internal.claim.v2.claims); 8767 bdev->internal.claim.v2.key = opts->shared_claim_key; 8768 } 8769 assert(type == bdev->internal.claim_type); 8770 8771 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 8772 8773 if (!desc->write && claim_type_promotes_to_write(type)) { 8774 desc->write = true; 8775 } 8776 8777 return 0; 8778 } 8779 8780 int 8781 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8782 struct spdk_bdev_claim_opts *_opts, 8783 struct spdk_bdev_module *module) 8784 { 8785 struct spdk_bdev *bdev; 8786 struct spdk_bdev_claim_opts opts; 8787 int rc = 0; 8788 8789 if (desc == NULL) { 8790 SPDK_ERRLOG("descriptor must not be NULL\n"); 8791 return -EINVAL; 8792 } 8793 8794 bdev = desc->bdev; 8795 8796 if (_opts == NULL) { 8797 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 8798 } else if (claim_opts_copy(_opts, &opts) != 0) { 8799 return -EINVAL; 8800 } 8801 8802 spdk_spin_lock(&bdev->internal.spinlock); 8803 8804 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 8805 bdev->internal.claim_type != type) { 8806 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8807 spdk_spin_unlock(&bdev->internal.spinlock); 8808 return -EPERM; 8809 } 8810 8811 if (claim_type_is_v2(type) && desc->claim != NULL) { 8812 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 8813 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 8814 spdk_spin_unlock(&bdev->internal.spinlock); 8815 return -EPERM; 8816 } 8817 8818 switch (type) { 8819 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8820 spdk_spin_unlock(&bdev->internal.spinlock); 8821 return spdk_bdev_module_claim_bdev(bdev, desc, module); 8822 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8823 rc = claim_verify_rwo(desc, type, &opts, module); 8824 break; 8825 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8826 rc = claim_verify_rom(desc, type, &opts, module); 8827 break; 8828 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8829 rc = claim_verify_rwm(desc, type, &opts, module); 8830 break; 8831 default: 8832 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 8833 rc = -ENOTSUP; 8834 } 8835 8836 if (rc == 0) { 8837 rc = claim_bdev(desc, type, &opts, module); 8838 } 8839 8840 spdk_spin_unlock(&bdev->internal.spinlock); 8841 return rc; 8842 } 8843 8844 static void 8845 claim_reset(struct spdk_bdev *bdev) 8846 { 8847 assert(spdk_spin_held(&bdev->internal.spinlock)); 8848 assert(claim_type_is_v2(bdev->internal.claim_type)); 8849 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 8850 8851 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 8852 
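 /* With the claim union cleared, return the bdev to the unclaimed state. */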
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8853 } 8854 8855 static void 8856 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 8857 { 8858 struct spdk_bdev *bdev = desc->bdev; 8859 8860 assert(spdk_spin_held(&bdev->internal.spinlock)); 8861 assert(claim_type_is_v2(bdev->internal.claim_type)); 8862 8863 if (bdev->internal.examine_in_progress == 0) { 8864 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 8865 free(desc->claim); 8866 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 8867 claim_reset(bdev); 8868 } 8869 } else { 8870 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 8871 desc->claim->module = NULL; 8872 desc->claim->desc = NULL; 8873 } 8874 desc->claim = NULL; 8875 } 8876 8877 /* 8878 * End claims v2 8879 */ 8880 8881 struct spdk_bdev * 8882 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 8883 { 8884 assert(desc != NULL); 8885 return desc->bdev; 8886 } 8887 8888 int 8889 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 8890 { 8891 struct spdk_bdev *bdev, *tmp; 8892 struct spdk_bdev_desc *desc; 8893 int rc = 0; 8894 8895 assert(fn != NULL); 8896 8897 spdk_spin_lock(&g_bdev_mgr.spinlock); 8898 bdev = spdk_bdev_first(); 8899 while (bdev != NULL) { 8900 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8901 if (rc != 0) { 8902 break; 8903 } 8904 rc = bdev_open(bdev, false, desc); 8905 if (rc != 0) { 8906 bdev_desc_free(desc); 8907 if (rc == -ENODEV) { 8908 /* Ignore the error and move to the next bdev. */ 8909 rc = 0; 8910 bdev = spdk_bdev_next(bdev); 8911 continue; 8912 } 8913 break; 8914 } 8915 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8916 8917 rc = fn(ctx, bdev); 8918 8919 spdk_spin_lock(&g_bdev_mgr.spinlock); 8920 tmp = spdk_bdev_next(bdev); 8921 bdev_close(bdev, desc); 8922 if (rc != 0) { 8923 break; 8924 } 8925 bdev = tmp; 8926 } 8927 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8928 8929 return rc; 8930 } 8931 8932 int 8933 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 8934 { 8935 struct spdk_bdev *bdev, *tmp; 8936 struct spdk_bdev_desc *desc; 8937 int rc = 0; 8938 8939 assert(fn != NULL); 8940 8941 spdk_spin_lock(&g_bdev_mgr.spinlock); 8942 bdev = spdk_bdev_first_leaf(); 8943 while (bdev != NULL) { 8944 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8945 if (rc != 0) { 8946 break; 8947 } 8948 rc = bdev_open(bdev, false, desc); 8949 if (rc != 0) { 8950 bdev_desc_free(desc); 8951 if (rc == -ENODEV) { 8952 /* Ignore the error and move to the next bdev. 
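 * -ENODEV here means the bdev is already being unregistered or removed, so it is simply
 * skipped.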
*/ 8953 rc = 0; 8954 bdev = spdk_bdev_next_leaf(bdev); 8955 continue; 8956 } 8957 break; 8958 } 8959 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8960 8961 rc = fn(ctx, bdev); 8962 8963 spdk_spin_lock(&g_bdev_mgr.spinlock); 8964 tmp = spdk_bdev_next_leaf(bdev); 8965 bdev_close(bdev, desc); 8966 if (rc != 0) { 8967 break; 8968 } 8969 bdev = tmp; 8970 } 8971 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8972 8973 return rc; 8974 } 8975 8976 void 8977 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 8978 { 8979 struct iovec *iovs; 8980 int iovcnt; 8981 8982 if (bdev_io == NULL) { 8983 return; 8984 } 8985 8986 switch (bdev_io->type) { 8987 case SPDK_BDEV_IO_TYPE_READ: 8988 case SPDK_BDEV_IO_TYPE_WRITE: 8989 case SPDK_BDEV_IO_TYPE_ZCOPY: 8990 iovs = bdev_io->u.bdev.iovs; 8991 iovcnt = bdev_io->u.bdev.iovcnt; 8992 break; 8993 default: 8994 iovs = NULL; 8995 iovcnt = 0; 8996 break; 8997 } 8998 8999 if (iovp) { 9000 *iovp = iovs; 9001 } 9002 if (iovcntp) { 9003 *iovcntp = iovcnt; 9004 } 9005 } 9006 9007 void * 9008 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 9009 { 9010 if (bdev_io == NULL) { 9011 return NULL; 9012 } 9013 9014 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 9015 return NULL; 9016 } 9017 9018 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 9019 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 9020 return bdev_io->u.bdev.md_buf; 9021 } 9022 9023 return NULL; 9024 } 9025 9026 void * 9027 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 9028 { 9029 if (bdev_io == NULL) { 9030 assert(false); 9031 return NULL; 9032 } 9033 9034 return bdev_io->internal.caller_ctx; 9035 } 9036 9037 void 9038 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 9039 { 9040 9041 if (spdk_bdev_module_list_find(bdev_module->name)) { 9042 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 9043 assert(false); 9044 } 9045 9046 spdk_spin_init(&bdev_module->internal.spinlock); 9047 TAILQ_INIT(&bdev_module->internal.quiesced_ranges); 9048 9049 /* 9050 * Modules with examine callbacks must be initialized first, so they are 9051 * ready to handle examine callbacks from later modules that will 9052 * register physical bdevs. 
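 * That ordering is achieved by inserting such modules at the head of the list below.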
9053 */ 9054 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 9055 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9056 } else { 9057 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 9058 } 9059 } 9060 9061 struct spdk_bdev_module * 9062 spdk_bdev_module_list_find(const char *name) 9063 { 9064 struct spdk_bdev_module *bdev_module; 9065 9066 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 9067 if (strcmp(name, bdev_module->name) == 0) { 9068 break; 9069 } 9070 } 9071 9072 return bdev_module; 9073 } 9074 9075 static int 9076 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 9077 { 9078 uint64_t num_blocks; 9079 void *md_buf = NULL; 9080 9081 num_blocks = bdev_io->u.bdev.num_blocks; 9082 9083 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 9084 md_buf = (char *)g_bdev_mgr.zero_buffer + 9085 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 9086 } 9087 9088 return bdev_write_blocks_with_md(bdev_io->internal.desc, 9089 spdk_io_channel_from_ctx(bdev_io->internal.ch), 9090 g_bdev_mgr.zero_buffer, md_buf, 9091 bdev_io->u.bdev.offset_blocks, num_blocks, 9092 bdev_write_zero_buffer_done, bdev_io); 9093 } 9094 9095 static void 9096 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9097 { 9098 struct spdk_bdev_io *parent_io = cb_arg; 9099 9100 spdk_bdev_free_io(bdev_io); 9101 9102 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 9103 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 9104 } 9105 9106 static void 9107 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 9108 { 9109 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9110 ctx->bdev->internal.qos_mod_in_progress = false; 9111 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9112 9113 if (ctx->cb_fn) { 9114 ctx->cb_fn(ctx->cb_arg, status); 9115 } 9116 free(ctx); 9117 } 9118 9119 static void 9120 bdev_disable_qos_done(void *cb_arg) 9121 { 9122 struct set_qos_limit_ctx *ctx = cb_arg; 9123 struct spdk_bdev *bdev = ctx->bdev; 9124 struct spdk_bdev_qos *qos; 9125 9126 spdk_spin_lock(&bdev->internal.spinlock); 9127 qos = bdev->internal.qos; 9128 bdev->internal.qos = NULL; 9129 spdk_spin_unlock(&bdev->internal.spinlock); 9130 9131 if (qos->thread != NULL) { 9132 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 9133 spdk_poller_unregister(&qos->poller); 9134 } 9135 9136 free(qos); 9137 9138 bdev_set_qos_limit_done(ctx, 0); 9139 } 9140 9141 static void 9142 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 9143 { 9144 struct set_qos_limit_ctx *ctx = _ctx; 9145 struct spdk_thread *thread; 9146 9147 spdk_spin_lock(&bdev->internal.spinlock); 9148 thread = bdev->internal.qos->thread; 9149 spdk_spin_unlock(&bdev->internal.spinlock); 9150 9151 if (thread != NULL) { 9152 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 9153 } else { 9154 bdev_disable_qos_done(ctx); 9155 } 9156 } 9157 9158 static void 9159 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9160 struct spdk_io_channel *ch, void *_ctx) 9161 { 9162 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9163 struct spdk_bdev_io *bdev_io; 9164 9165 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 9166 9167 while (!TAILQ_EMPTY(&bdev_ch->qos_queued_io)) { 9168 /* Re-submit the queued I/O. 
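 * QoS has just been disabled on this channel, so the I/O now goes straight to
 * _bdev_io_submit().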
*/ 9169 bdev_io = TAILQ_FIRST(&bdev_ch->qos_queued_io); 9170 TAILQ_REMOVE(&bdev_ch->qos_queued_io, bdev_io, internal.link); 9171 _bdev_io_submit(bdev_io); 9172 } 9173 9174 spdk_bdev_for_each_channel_continue(i, 0); 9175 } 9176 9177 static void 9178 bdev_update_qos_rate_limit_msg(void *cb_arg) 9179 { 9180 struct set_qos_limit_ctx *ctx = cb_arg; 9181 struct spdk_bdev *bdev = ctx->bdev; 9182 9183 spdk_spin_lock(&bdev->internal.spinlock); 9184 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 9185 spdk_spin_unlock(&bdev->internal.spinlock); 9186 9187 bdev_set_qos_limit_done(ctx, 0); 9188 } 9189 9190 static void 9191 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9192 struct spdk_io_channel *ch, void *_ctx) 9193 { 9194 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9195 9196 spdk_spin_lock(&bdev->internal.spinlock); 9197 bdev_enable_qos(bdev, bdev_ch); 9198 spdk_spin_unlock(&bdev->internal.spinlock); 9199 spdk_bdev_for_each_channel_continue(i, 0); 9200 } 9201 9202 static void 9203 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 9204 { 9205 struct set_qos_limit_ctx *ctx = _ctx; 9206 9207 bdev_set_qos_limit_done(ctx, status); 9208 } 9209 9210 static void 9211 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 9212 { 9213 int i; 9214 9215 assert(bdev->internal.qos != NULL); 9216 9217 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9218 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9219 bdev->internal.qos->rate_limits[i].limit = limits[i]; 9220 9221 if (limits[i] == 0) { 9222 bdev->internal.qos->rate_limits[i].limit = 9223 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 9224 } 9225 } 9226 } 9227 } 9228 9229 void 9230 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 9231 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 9232 { 9233 struct set_qos_limit_ctx *ctx; 9234 uint32_t limit_set_complement; 9235 uint64_t min_limit_per_sec; 9236 int i; 9237 bool disable_rate_limit = true; 9238 9239 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9240 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 9241 continue; 9242 } 9243 9244 if (limits[i] > 0) { 9245 disable_rate_limit = false; 9246 } 9247 9248 if (bdev_qos_is_iops_rate_limit(i) == true) { 9249 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 9250 } else { 9251 if (limits[i] > SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC) { 9252 SPDK_WARNLOG("Requested rate limit %" PRIu64 " will result in uint64_t overflow, " 9253 "reset to %" PRIu64 "\n", limits[i], SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC); 9254 limits[i] = SPDK_BDEV_QOS_MAX_MBYTES_PER_SEC; 9255 } 9256 /* Change from megabyte to byte rate limit */ 9257 limits[i] = limits[i] * 1024 * 1024; 9258 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 9259 } 9260 9261 limit_set_complement = limits[i] % min_limit_per_sec; 9262 if (limit_set_complement) { 9263 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 9264 limits[i], min_limit_per_sec); 9265 limits[i] += min_limit_per_sec - limit_set_complement; 9266 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 9267 } 9268 } 9269 9270 ctx = calloc(1, sizeof(*ctx)); 9271 if (ctx == NULL) { 9272 cb_fn(cb_arg, -ENOMEM); 9273 return; 9274 } 9275 9276 ctx->cb_fn = cb_fn; 9277 ctx->cb_arg = cb_arg; 9278 ctx->bdev = bdev; 9279 9280 spdk_spin_lock(&bdev->internal.spinlock); 9281 if (bdev->internal.qos_mod_in_progress) { 9282 spdk_spin_unlock(&bdev->internal.spinlock); 9283 free(ctx); 9284 cb_fn(cb_arg, 
-EAGAIN); 9285 return; 9286 } 9287 bdev->internal.qos_mod_in_progress = true; 9288 9289 if (disable_rate_limit == true && bdev->internal.qos) { 9290 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 9291 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 9292 (bdev->internal.qos->rate_limits[i].limit > 0 && 9293 bdev->internal.qos->rate_limits[i].limit != 9294 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 9295 disable_rate_limit = false; 9296 break; 9297 } 9298 } 9299 } 9300 9301 if (disable_rate_limit == false) { 9302 if (bdev->internal.qos == NULL) { 9303 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 9304 if (!bdev->internal.qos) { 9305 spdk_spin_unlock(&bdev->internal.spinlock); 9306 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 9307 bdev_set_qos_limit_done(ctx, -ENOMEM); 9308 return; 9309 } 9310 } 9311 9312 if (bdev->internal.qos->thread == NULL) { 9313 /* Enabling */ 9314 bdev_set_qos_rate_limits(bdev, limits); 9315 9316 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 9317 bdev_enable_qos_done); 9318 } else { 9319 /* Updating */ 9320 bdev_set_qos_rate_limits(bdev, limits); 9321 9322 spdk_thread_send_msg(bdev->internal.qos->thread, 9323 bdev_update_qos_rate_limit_msg, ctx); 9324 } 9325 } else { 9326 if (bdev->internal.qos != NULL) { 9327 bdev_set_qos_rate_limits(bdev, limits); 9328 9329 /* Disabling */ 9330 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 9331 bdev_disable_qos_msg_done); 9332 } else { 9333 spdk_spin_unlock(&bdev->internal.spinlock); 9334 bdev_set_qos_limit_done(ctx, 0); 9335 return; 9336 } 9337 } 9338 9339 spdk_spin_unlock(&bdev->internal.spinlock); 9340 } 9341 9342 struct spdk_bdev_histogram_ctx { 9343 spdk_bdev_histogram_status_cb cb_fn; 9344 void *cb_arg; 9345 struct spdk_bdev *bdev; 9346 int status; 9347 }; 9348 9349 static void 9350 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9351 { 9352 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9353 9354 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9355 ctx->bdev->internal.histogram_in_progress = false; 9356 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9357 ctx->cb_fn(ctx->cb_arg, ctx->status); 9358 free(ctx); 9359 } 9360 9361 static void 9362 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9363 struct spdk_io_channel *_ch, void *_ctx) 9364 { 9365 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9366 9367 if (ch->histogram != NULL) { 9368 spdk_histogram_data_free(ch->histogram); 9369 ch->histogram = NULL; 9370 } 9371 spdk_bdev_for_each_channel_continue(i, 0); 9372 } 9373 9374 static void 9375 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9376 { 9377 struct spdk_bdev_histogram_ctx *ctx = _ctx; 9378 9379 if (status != 0) { 9380 ctx->status = status; 9381 ctx->bdev->internal.histogram_enabled = false; 9382 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 9383 bdev_histogram_disable_channel_cb); 9384 } else { 9385 spdk_spin_lock(&ctx->bdev->internal.spinlock); 9386 ctx->bdev->internal.histogram_in_progress = false; 9387 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 9388 ctx->cb_fn(ctx->cb_arg, ctx->status); 9389 free(ctx); 9390 } 9391 } 9392 9393 static void 9394 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9395 struct spdk_io_channel *_ch, void *_ctx) 9396 { 9397 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9398 int status = 0; 9399 9400 if (ch->histogram == NULL) { 9401 
ch->histogram = spdk_histogram_data_alloc(); 9402 if (ch->histogram == NULL) { 9403 status = -ENOMEM; 9404 } 9405 } 9406 9407 spdk_bdev_for_each_channel_continue(i, status); 9408 } 9409 9410 void 9411 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 9412 void *cb_arg, bool enable) 9413 { 9414 struct spdk_bdev_histogram_ctx *ctx; 9415 9416 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 9417 if (ctx == NULL) { 9418 cb_fn(cb_arg, -ENOMEM); 9419 return; 9420 } 9421 9422 ctx->bdev = bdev; 9423 ctx->status = 0; 9424 ctx->cb_fn = cb_fn; 9425 ctx->cb_arg = cb_arg; 9426 9427 spdk_spin_lock(&bdev->internal.spinlock); 9428 if (bdev->internal.histogram_in_progress) { 9429 spdk_spin_unlock(&bdev->internal.spinlock); 9430 free(ctx); 9431 cb_fn(cb_arg, -EAGAIN); 9432 return; 9433 } 9434 9435 bdev->internal.histogram_in_progress = true; 9436 spdk_spin_unlock(&bdev->internal.spinlock); 9437 9438 bdev->internal.histogram_enabled = enable; 9439 9440 if (enable) { 9441 /* Allocate histogram for each channel */ 9442 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 9443 bdev_histogram_enable_channel_cb); 9444 } else { 9445 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 9446 bdev_histogram_disable_channel_cb); 9447 } 9448 } 9449 9450 struct spdk_bdev_histogram_data_ctx { 9451 spdk_bdev_histogram_data_cb cb_fn; 9452 void *cb_arg; 9453 struct spdk_bdev *bdev; 9454 /** merged histogram data from all channels */ 9455 struct spdk_histogram_data *histogram; 9456 }; 9457 9458 static void 9459 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9460 { 9461 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9462 9463 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 9464 free(ctx); 9465 } 9466 9467 static void 9468 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9469 struct spdk_io_channel *_ch, void *_ctx) 9470 { 9471 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9472 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9473 int status = 0; 9474 9475 if (ch->histogram == NULL) { 9476 status = -EFAULT; 9477 } else { 9478 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 9479 } 9480 9481 spdk_bdev_for_each_channel_continue(i, status); 9482 } 9483 9484 void 9485 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 9486 spdk_bdev_histogram_data_cb cb_fn, 9487 void *cb_arg) 9488 { 9489 struct spdk_bdev_histogram_data_ctx *ctx; 9490 9491 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 9492 if (ctx == NULL) { 9493 cb_fn(cb_arg, -ENOMEM, NULL); 9494 return; 9495 } 9496 9497 ctx->bdev = bdev; 9498 ctx->cb_fn = cb_fn; 9499 ctx->cb_arg = cb_arg; 9500 9501 ctx->histogram = histogram; 9502 9503 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 9504 bdev_histogram_get_channel_cb); 9505 } 9506 9507 void 9508 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 9509 void *cb_arg) 9510 { 9511 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9512 int status = 0; 9513 9514 assert(cb_fn != NULL); 9515 9516 if (bdev_ch->histogram == NULL) { 9517 status = -EFAULT; 9518 } 9519 cb_fn(cb_arg, status, bdev_ch->histogram); 9520 } 9521 9522 size_t 9523 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 9524 size_t max_events) 9525 { 9526 struct media_event_entry *entry; 9527 size_t num_events = 0; 9528 9529 for (; num_events < 
max_events; ++num_events) { 9530 entry = TAILQ_FIRST(&desc->pending_media_events); 9531 if (entry == NULL) { 9532 break; 9533 } 9534 9535 events[num_events] = entry->event; 9536 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 9537 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 9538 } 9539 9540 return num_events; 9541 } 9542 9543 int 9544 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 9545 size_t num_events) 9546 { 9547 struct spdk_bdev_desc *desc; 9548 struct media_event_entry *entry; 9549 size_t event_id; 9550 int rc = 0; 9551 9552 assert(bdev->media_events); 9553 9554 spdk_spin_lock(&bdev->internal.spinlock); 9555 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9556 if (desc->write) { 9557 break; 9558 } 9559 } 9560 9561 if (desc == NULL || desc->media_events_buffer == NULL) { 9562 rc = -ENODEV; 9563 goto out; 9564 } 9565 9566 for (event_id = 0; event_id < num_events; ++event_id) { 9567 entry = TAILQ_FIRST(&desc->free_media_events); 9568 if (entry == NULL) { 9569 break; 9570 } 9571 9572 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 9573 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 9574 entry->event = events[event_id]; 9575 } 9576 9577 rc = event_id; 9578 out: 9579 spdk_spin_unlock(&bdev->internal.spinlock); 9580 return rc; 9581 } 9582 9583 static void 9584 _media_management_notify(void *arg) 9585 { 9586 struct spdk_bdev_desc *desc = arg; 9587 9588 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 9589 } 9590 9591 void 9592 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 9593 { 9594 struct spdk_bdev_desc *desc; 9595 9596 spdk_spin_lock(&bdev->internal.spinlock); 9597 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9598 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 9599 event_notify(desc, _media_management_notify); 9600 } 9601 } 9602 spdk_spin_unlock(&bdev->internal.spinlock); 9603 } 9604 9605 struct locked_lba_range_ctx { 9606 struct lba_range range; 9607 struct lba_range *current_range; 9608 struct lba_range *owner_range; 9609 struct spdk_poller *poller; 9610 lock_range_cb cb_fn; 9611 void *cb_arg; 9612 }; 9613 9614 static void 9615 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9616 { 9617 struct locked_lba_range_ctx *ctx = _ctx; 9618 9619 ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM); 9620 free(ctx); 9621 } 9622 9623 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 9624 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 9625 9626 static void 9627 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9628 { 9629 struct locked_lba_range_ctx *ctx = _ctx; 9630 9631 if (status == -ENOMEM) { 9632 /* One of the channels could not allocate a range object. 9633 * So we have to go back and clean up any ranges that were 9634 * allocated successfully before we return error status to 9635 * the caller. We can reuse the unlock function to do that 9636 * clean up. 9637 */ 9638 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9639 bdev_lock_error_cleanup_cb); 9640 return; 9641 } 9642 9643 /* All channels have locked this range and no I/O overlapping the range 9644 * are outstanding! Set the owner_ch for the range object for the 9645 * locking channel, so that this channel will know that it is allowed 9646 * to write to this range. 
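 * Note that owner_range stays NULL when the lock was taken without an owning
 * channel (for example the quiesce path, which calls _bdev_lock_lba_range()
 * with a NULL channel), which is why it is checked before use below.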
9647 */ 9648 if (ctx->owner_range != NULL) { 9649 ctx->owner_range->owner_ch = ctx->range.owner_ch; 9650 } 9651 9652 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 9653 9654 /* Don't free the ctx here. Its range is in the bdev's global list of 9655 * locked ranges still, and will be removed and freed when this range 9656 * is later unlocked. 9657 */ 9658 } 9659 9660 static int 9661 bdev_lock_lba_range_check_io(void *_i) 9662 { 9663 struct spdk_bdev_channel_iter *i = _i; 9664 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 9665 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9666 struct locked_lba_range_ctx *ctx = i->ctx; 9667 struct lba_range *range = ctx->current_range; 9668 struct spdk_bdev_io *bdev_io; 9669 9670 spdk_poller_unregister(&ctx->poller); 9671 9672 /* The range is now in the locked_ranges, so no new IO can be submitted to this 9673 * range. But we need to wait until any outstanding IO overlapping with this range 9674 * are completed. 9675 */ 9676 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 9677 if (bdev_io_range_is_locked(bdev_io, range)) { 9678 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 9679 return SPDK_POLLER_BUSY; 9680 } 9681 } 9682 9683 spdk_bdev_for_each_channel_continue(i, 0); 9684 return SPDK_POLLER_BUSY; 9685 } 9686 9687 static void 9688 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9689 struct spdk_io_channel *_ch, void *_ctx) 9690 { 9691 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9692 struct locked_lba_range_ctx *ctx = _ctx; 9693 struct lba_range *range; 9694 9695 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9696 if (range->length == ctx->range.length && 9697 range->offset == ctx->range.offset && 9698 range->locked_ctx == ctx->range.locked_ctx) { 9699 /* This range already exists on this channel, so don't add 9700 * it again. This can happen when a new channel is created 9701 * while the for_each_channel operation is in progress. 9702 * Do not check for outstanding I/O in that case, since the 9703 * range was locked before any I/O could be submitted to the 9704 * new channel. 9705 */ 9706 spdk_bdev_for_each_channel_continue(i, 0); 9707 return; 9708 } 9709 } 9710 9711 range = calloc(1, sizeof(*range)); 9712 if (range == NULL) { 9713 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 9714 return; 9715 } 9716 9717 range->length = ctx->range.length; 9718 range->offset = ctx->range.offset; 9719 range->locked_ctx = ctx->range.locked_ctx; 9720 range->quiesce = ctx->range.quiesce; 9721 ctx->current_range = range; 9722 if (ctx->range.owner_ch == ch) { 9723 /* This is the range object for the channel that will hold 9724 * the lock. Store it in the ctx object so that we can easily 9725 * set its owner_ch after the lock is finally acquired. 9726 */ 9727 ctx->owner_range = range; 9728 } 9729 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 9730 bdev_lock_lba_range_check_io(i); 9731 } 9732 9733 static void 9734 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 9735 { 9736 assert(spdk_get_thread() == ctx->range.owner_thread); 9737 assert(ctx->range.owner_ch == NULL || 9738 spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread); 9739 9740 /* We will add a copy of this range to each channel now. 
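 * The fan-out uses spdk_bdev_for_each_channel(): every channel gets its own
 * lba_range copy on ch->locked_ranges, and bdev_lock_lba_range_cb() runs once
 * each channel has confirmed that no overlapping I/O is still outstanding.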
*/ 9741 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 9742 bdev_lock_lba_range_cb); 9743 } 9744 9745 static bool 9746 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 9747 { 9748 struct lba_range *r; 9749 9750 TAILQ_FOREACH(r, tailq, tailq) { 9751 if (bdev_lba_range_overlapped(range, r)) { 9752 return true; 9753 } 9754 } 9755 return false; 9756 } 9757 9758 static void bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status); 9759 9760 static int 9761 _bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch, 9762 uint64_t offset, uint64_t length, 9763 lock_range_cb cb_fn, void *cb_arg) 9764 { 9765 struct locked_lba_range_ctx *ctx; 9766 9767 ctx = calloc(1, sizeof(*ctx)); 9768 if (ctx == NULL) { 9769 return -ENOMEM; 9770 } 9771 9772 ctx->range.offset = offset; 9773 ctx->range.length = length; 9774 ctx->range.owner_thread = spdk_get_thread(); 9775 ctx->range.owner_ch = ch; 9776 ctx->range.locked_ctx = cb_arg; 9777 ctx->range.bdev = bdev; 9778 ctx->range.quiesce = (cb_fn == bdev_quiesce_range_locked); 9779 ctx->cb_fn = cb_fn; 9780 ctx->cb_arg = cb_arg; 9781 9782 spdk_spin_lock(&bdev->internal.spinlock); 9783 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 9784 /* There is an active lock overlapping with this range. 9785 * Put it on the pending list until this range no 9786 * longer overlaps with another. 9787 */ 9788 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 9789 } else { 9790 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 9791 bdev_lock_lba_range_ctx(bdev, ctx); 9792 } 9793 spdk_spin_unlock(&bdev->internal.spinlock); 9794 return 0; 9795 } 9796 9797 static int 9798 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 9799 uint64_t offset, uint64_t length, 9800 lock_range_cb cb_fn, void *cb_arg) 9801 { 9802 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9803 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9804 9805 if (cb_arg == NULL) { 9806 SPDK_ERRLOG("cb_arg must not be NULL\n"); 9807 return -EINVAL; 9808 } 9809 9810 return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg); 9811 } 9812 9813 static void 9814 bdev_lock_lba_range_ctx_msg(void *_ctx) 9815 { 9816 struct locked_lba_range_ctx *ctx = _ctx; 9817 9818 bdev_lock_lba_range_ctx(ctx->range.bdev, ctx); 9819 } 9820 9821 static void 9822 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9823 { 9824 struct locked_lba_range_ctx *ctx = _ctx; 9825 struct locked_lba_range_ctx *pending_ctx; 9826 struct lba_range *range, *tmp; 9827 9828 spdk_spin_lock(&bdev->internal.spinlock); 9829 /* Check if there are any pending locked ranges that overlap with this range 9830 * that was just unlocked. If there are, check that it doesn't overlap with any 9831 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 9832 * the lock process. 
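 * Any pending range that can now proceed is moved to locked_ranges here, and
 * the actual per-channel locking is kicked off on that range's owner_thread
 * via spdk_thread_send_msg() below.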
9833 */ 9834 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 9835 if (bdev_lba_range_overlapped(range, &ctx->range) && 9836 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 9837 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 9838 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 9839 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 9840 spdk_thread_send_msg(pending_ctx->range.owner_thread, 9841 bdev_lock_lba_range_ctx_msg, pending_ctx); 9842 } 9843 } 9844 spdk_spin_unlock(&bdev->internal.spinlock); 9845 9846 ctx->cb_fn(&ctx->range, ctx->cb_arg, status); 9847 free(ctx); 9848 } 9849 9850 static void 9851 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9852 struct spdk_io_channel *_ch, void *_ctx) 9853 { 9854 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9855 struct locked_lba_range_ctx *ctx = _ctx; 9856 TAILQ_HEAD(, spdk_bdev_io) io_locked; 9857 struct spdk_bdev_io *bdev_io; 9858 struct lba_range *range; 9859 9860 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9861 if (ctx->range.offset == range->offset && 9862 ctx->range.length == range->length && 9863 ctx->range.locked_ctx == range->locked_ctx) { 9864 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 9865 free(range); 9866 break; 9867 } 9868 } 9869 9870 /* Note: we should almost always be able to assert that the range specified 9871 * was found. But there are some very rare corner cases where a new channel 9872 * gets created simultaneously with a range unlock, where this function 9873 * would execute on that new channel and wouldn't have the range. 9874 * We also use this to clean up range allocations when a later allocation 9875 * fails in the locking path. 9876 * So we can't actually assert() here. 9877 */ 9878 9879 /* Swap the locked IO into a temporary list, and then try to submit them again. 9880 * We could hyper-optimize this to only resubmit locked I/O that overlap 9881 * with the range that was just unlocked, but this isn't a performance path so 9882 * we go for simplicity here. 9883 */ 9884 TAILQ_INIT(&io_locked); 9885 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 9886 while (!TAILQ_EMPTY(&io_locked)) { 9887 bdev_io = TAILQ_FIRST(&io_locked); 9888 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 9889 bdev_io_submit(bdev_io); 9890 } 9891 9892 spdk_bdev_for_each_channel_continue(i, 0); 9893 } 9894 9895 static int 9896 _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length, 9897 lock_range_cb cb_fn, void *cb_arg) 9898 { 9899 struct locked_lba_range_ctx *ctx; 9900 struct lba_range *range; 9901 9902 spdk_spin_lock(&bdev->internal.spinlock); 9903 /* To start the unlock process, we find the range in the bdev's locked_ranges 9904 * and remove it. This ensures new channels don't inherit the locked range. 9905 * Then we will send a message to each channel to remove the range from its 9906 * per-channel list.
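 * The lba_range found here is embedded in a locked_lba_range_ctx, so the
 * original lock context is recovered with SPDK_CONTAINEROF() and completed
 * in bdev_unlock_lba_range_cb() once every channel has dropped the range.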
9907 */ 9908 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 9909 if (range->offset == offset && range->length == length && 9910 (range->owner_ch == NULL || range->locked_ctx == cb_arg)) { 9911 break; 9912 } 9913 } 9914 if (range == NULL) { 9915 assert(false); 9916 spdk_spin_unlock(&bdev->internal.spinlock); 9917 return -EINVAL; 9918 } 9919 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 9920 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 9921 spdk_spin_unlock(&bdev->internal.spinlock); 9922 9923 ctx->cb_fn = cb_fn; 9924 ctx->cb_arg = cb_arg; 9925 9926 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9927 bdev_unlock_lba_range_cb); 9928 return 0; 9929 } 9930 9931 static int 9932 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 9933 uint64_t offset, uint64_t length, 9934 lock_range_cb cb_fn, void *cb_arg) 9935 { 9936 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9937 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9938 struct lba_range *range; 9939 bool range_found = false; 9940 9941 /* Let's make sure the specified channel actually has a lock on 9942 * the specified range. Note that the range must match exactly. 9943 */ 9944 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9945 if (range->offset == offset && range->length == length && 9946 range->owner_ch == ch && range->locked_ctx == cb_arg) { 9947 range_found = true; 9948 break; 9949 } 9950 } 9951 9952 if (!range_found) { 9953 return -EINVAL; 9954 } 9955 9956 return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg); 9957 } 9958 9959 struct bdev_quiesce_ctx { 9960 spdk_bdev_quiesce_cb cb_fn; 9961 void *cb_arg; 9962 }; 9963 9964 static void 9965 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status) 9966 { 9967 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 9968 9969 if (quiesce_ctx->cb_fn != NULL) { 9970 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 9971 } 9972 9973 free(quiesce_ctx); 9974 } 9975 9976 static void 9977 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status) 9978 { 9979 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 9980 struct spdk_bdev_module *module = range->bdev->module; 9981 9982 if (status != 0) { 9983 if (quiesce_ctx->cb_fn != NULL) { 9984 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 9985 } 9986 free(quiesce_ctx); 9987 return; 9988 } 9989 9990 spdk_spin_lock(&module->internal.spinlock); 9991 TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module); 9992 spdk_spin_unlock(&module->internal.spinlock); 9993 9994 if (quiesce_ctx->cb_fn != NULL) { 9995 /* copy the context in case the range is unlocked by the callback */ 9996 struct bdev_quiesce_ctx tmp = *quiesce_ctx; 9997 9998 quiesce_ctx->cb_fn = NULL; 9999 quiesce_ctx->cb_arg = NULL; 10000 10001 tmp.cb_fn(tmp.cb_arg, status); 10002 } 10003 /* quiesce_ctx will be freed on unquiesce */ 10004 } 10005 10006 static int 10007 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10008 uint64_t offset, uint64_t length, 10009 spdk_bdev_quiesce_cb cb_fn, void *cb_arg, 10010 bool unquiesce) 10011 { 10012 struct bdev_quiesce_ctx *quiesce_ctx; 10013 int rc; 10014 10015 if (module != bdev->module) { 10016 SPDK_ERRLOG("Bdev does not belong to specified module.\n"); 10017 return -EINVAL; 10018 } 10019 10020 if (!bdev_io_valid_blocks(bdev, offset, length)) { 10021 return -EINVAL; 10022 } 10023 10024 if (unquiesce) { 10025 struct lba_range *range; 10026 10027 /* Make sure the 
specified range is actually quiesced in the specified module and 10028 * then remove it from the list. Note that the range must match exactly. 10029 */ 10030 spdk_spin_lock(&module->internal.spinlock); 10031 TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) { 10032 if (range->bdev == bdev && range->offset == offset && range->length == length) { 10033 TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module); 10034 break; 10035 } 10036 } 10037 spdk_spin_unlock(&module->internal.spinlock); 10038 10039 if (range == NULL) { 10040 SPDK_ERRLOG("The range to unquiesce was not found.\n"); 10041 return -EINVAL; 10042 } 10043 10044 quiesce_ctx = range->locked_ctx; 10045 quiesce_ctx->cb_fn = cb_fn; 10046 quiesce_ctx->cb_arg = cb_arg; 10047 10048 rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx); 10049 } else { 10050 quiesce_ctx = malloc(sizeof(*quiesce_ctx)); 10051 if (quiesce_ctx == NULL) { 10052 return -ENOMEM; 10053 } 10054 10055 quiesce_ctx->cb_fn = cb_fn; 10056 quiesce_ctx->cb_arg = cb_arg; 10057 10058 rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx); 10059 if (rc != 0) { 10060 free(quiesce_ctx); 10061 } 10062 } 10063 10064 return rc; 10065 } 10066 10067 int 10068 spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10069 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10070 { 10071 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false); 10072 } 10073 10074 int 10075 spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10076 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10077 { 10078 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true); 10079 } 10080 10081 int 10082 spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10083 uint64_t offset, uint64_t length, 10084 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10085 { 10086 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false); 10087 } 10088 10089 int 10090 spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 10091 uint64_t offset, uint64_t length, 10092 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 10093 { 10094 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true); 10095 } 10096 10097 int 10098 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 10099 int array_size) 10100 { 10101 if (!bdev) { 10102 return -EINVAL; 10103 } 10104 10105 if (bdev->fn_table->get_memory_domains) { 10106 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 10107 } 10108 10109 return 0; 10110 } 10111 10112 struct spdk_bdev_for_each_io_ctx { 10113 void *ctx; 10114 spdk_bdev_io_fn fn; 10115 spdk_bdev_for_each_io_cb cb; 10116 }; 10117 10118 static void 10119 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 10120 struct spdk_io_channel *io_ch, void *_ctx) 10121 { 10122 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10123 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 10124 struct spdk_bdev_io *bdev_io; 10125 int rc = 0; 10126 10127 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 10128 rc = ctx->fn(ctx->ctx, bdev_io); 10129 if (rc != 0) { 10130 break; 10131 } 10132 } 10133 10134 spdk_bdev_for_each_channel_continue(i, rc); 10135 } 10136 10137 static void 10138 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 10139 { 10140 
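	/* Completion callback for spdk_bdev_for_each_bdev_io(): hand the final
	 * iteration status back to the caller and release the context.
	 */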
struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 10141 10142 ctx->cb(ctx->ctx, status); 10143 10144 free(ctx); 10145 } 10146 10147 void 10148 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 10149 spdk_bdev_for_each_io_cb cb) 10150 { 10151 struct spdk_bdev_for_each_io_ctx *ctx; 10152 10153 assert(fn != NULL && cb != NULL); 10154 10155 ctx = calloc(1, sizeof(*ctx)); 10156 if (ctx == NULL) { 10157 SPDK_ERRLOG("Failed to allocate context.\n"); 10158 cb(_ctx, -ENOMEM); 10159 return; 10160 } 10161 10162 ctx->ctx = _ctx; 10163 ctx->fn = fn; 10164 ctx->cb = cb; 10165 10166 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 10167 bdev_for_each_io_done); 10168 } 10169 10170 void 10171 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 10172 { 10173 spdk_for_each_channel_continue(iter->i, status); 10174 } 10175 10176 static struct spdk_bdev * 10177 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 10178 { 10179 void *io_device = spdk_io_channel_iter_get_io_device(i); 10180 10181 return __bdev_from_io_dev(io_device); 10182 } 10183 10184 static void 10185 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 10186 { 10187 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10188 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10189 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 10190 10191 iter->i = i; 10192 iter->fn(iter, bdev, ch, iter->ctx); 10193 } 10194 10195 static void 10196 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 10197 { 10198 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 10199 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 10200 10201 iter->i = i; 10202 iter->cpl(bdev, iter->ctx, status); 10203 10204 free(iter); 10205 } 10206 10207 void 10208 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 10209 void *ctx, spdk_bdev_for_each_channel_done cpl) 10210 { 10211 struct spdk_bdev_channel_iter *iter; 10212 10213 assert(bdev != NULL && fn != NULL && ctx != NULL); 10214 10215 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 10216 if (iter == NULL) { 10217 SPDK_ERRLOG("Unable to allocate iterator\n"); 10218 assert(false); 10219 return; 10220 } 10221 10222 iter->fn = fn; 10223 iter->cpl = cpl; 10224 iter->ctx = ctx; 10225 10226 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 10227 iter, bdev_each_channel_cpl); 10228 } 10229 10230 static void 10231 bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10232 { 10233 struct spdk_bdev_io *parent_io = cb_arg; 10234 10235 spdk_bdev_free_io(bdev_io); 10236 10237 /* Check return status of write */ 10238 parent_io->internal.status = success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 10239 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 10240 } 10241 10242 static void 10243 bdev_copy_do_write(void *_bdev_io) 10244 { 10245 struct spdk_bdev_io *bdev_io = _bdev_io; 10246 int rc; 10247 10248 /* Write blocks */ 10249 rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc, 10250 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10251 bdev_io->u.bdev.iovs[0].iov_base, 10252 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks, 10253 bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io); 10254 10255 if (rc == -ENOMEM) { 10256 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write); 10257 } else if (rc != 0) { 10258 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10259 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10260 } 10261 } 10262 10263 static void 10264 bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 10265 { 10266 struct spdk_bdev_io *parent_io = cb_arg; 10267 10268 spdk_bdev_free_io(bdev_io); 10269 10270 /* Check return status of read */ 10271 if (!success) { 10272 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10273 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 10274 return; 10275 } 10276 10277 /* Do write */ 10278 bdev_copy_do_write(parent_io); 10279 } 10280 10281 static void 10282 bdev_copy_do_read(void *_bdev_io) 10283 { 10284 struct spdk_bdev_io *bdev_io = _bdev_io; 10285 int rc; 10286 10287 /* Read blocks */ 10288 rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc, 10289 spdk_io_channel_from_ctx(bdev_io->internal.ch), 10290 bdev_io->u.bdev.iovs[0].iov_base, 10291 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks, 10292 bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io); 10293 10294 if (rc == -ENOMEM) { 10295 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read); 10296 } else if (rc != 0) { 10297 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10298 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10299 } 10300 } 10301 10302 static void 10303 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 10304 { 10305 if (!success) { 10306 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 10307 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 10308 return; 10309 } 10310 10311 bdev_copy_do_read(bdev_io); 10312 } 10313 10314 int 10315 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 10316 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 10317 spdk_bdev_io_completion_cb cb, void *cb_arg) 10318 { 10319 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 10320 struct spdk_bdev_io *bdev_io; 10321 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 10322 10323 if (!desc->write) { 10324 return -EBADF; 10325 } 10326 10327 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 10328 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 10329 SPDK_DEBUGLOG(bdev, 10330 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 10331 dst_offset_blocks, src_offset_blocks, num_blocks); 10332 return -EINVAL; 10333 } 10334 10335 bdev_io = bdev_channel_get_io(channel); 10336 if (!bdev_io) { 10337 return -ENOMEM; 10338 } 10339 10340 bdev_io->internal.ch = channel; 10341 bdev_io->internal.desc = desc; 10342 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 10343 10344 
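	/* Fill in the copy parameters next. iovs/md_buf are left NULL here; a
	 * bounce buffer is only allocated via spdk_bdev_io_get_buf() further down
	 * when the copy has to be emulated with separate read and write requests.
	 */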
bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 10345 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 10346 bdev_io->u.bdev.num_blocks = num_blocks; 10347 bdev_io->u.bdev.memory_domain = NULL; 10348 bdev_io->u.bdev.memory_domain_ctx = NULL; 10349 bdev_io->u.bdev.iovs = NULL; 10350 bdev_io->u.bdev.iovcnt = 0; 10351 bdev_io->u.bdev.md_buf = NULL; 10352 bdev_io->u.bdev.accel_sequence = NULL; 10353 bdev_io_init(bdev_io, bdev, cb_arg, cb); 10354 10355 if (dst_offset_blocks == src_offset_blocks || num_blocks == 0) { 10356 spdk_thread_send_msg(spdk_get_thread(), bdev_io_complete_cb, bdev_io); 10357 return 0; 10358 } 10359 10360 10361 /* If the copy size is large and should be split, use the generic split logic 10362 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not. 10363 * 10364 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or 10365 * emulate it using regular read and write requests otherwise. 10366 */ 10367 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) || 10368 bdev_io->internal.split) { 10369 bdev_io_submit(bdev_io); 10370 return 0; 10371 } 10372 10373 spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev)); 10374 10375 return 0; 10376 } 10377 10378 SPDK_LOG_REGISTER_COMPONENT(bdev) 10379 10380 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 10381 { 10382 struct spdk_trace_tpoint_opts opts[] = { 10383 { 10384 "BDEV_IO_START", TRACE_BDEV_IO_START, 10385 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 1, 10386 { 10387 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10388 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 10389 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 10390 } 10391 }, 10392 { 10393 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 10394 OWNER_TYPE_BDEV, OBJECT_BDEV_IO, 0, 10395 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 10396 }, 10397 { 10398 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 10399 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 10400 { 10401 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 10402 } 10403 }, 10404 { 10405 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 10406 OWNER_TYPE_BDEV, OBJECT_NONE, 0, 10407 { 10408 { "tid", SPDK_TRACE_ARG_TYPE_INT, 8 } 10409 } 10410 }, 10411 }; 10412 10413 10414 spdk_trace_register_owner_type(OWNER_TYPE_BDEV, 'b'); 10415 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 10416 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 10417 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 10418 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 10419 } 10420
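/* Minimal usage sketch for the QoS rate-limit API defined above. This is
 * illustrative only and not part of the library: it assumes the caller has
 * already opened the bdev with spdk_bdev_open_ext(), resolved it via
 * spdk_bdev_desc_get_bdev(), and is running on an SPDK thread.
 *
 *	#include "spdk/bdev.h"
 *	#include "spdk/log.h"
 *
 *	static void
 *	qos_set_done(void *cb_arg, int status)
 *	{
 *		SPDK_NOTICELOG("QoS update completed: %d\n", status);
 *	}
 *
 *	static void
 *	limit_bdev_iops(struct spdk_bdev *bdev)
 *	{
 *		uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
 *		int i;
 *
 *		// UINT64_MAX (SPDK_BDEV_QOS_LIMIT_NOT_DEFINED above) leaves a
 *		// limit unchanged; 0 disables it.
 *		for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
 *			limits[i] = UINT64_MAX;
 *		}
 *
 *		// 10000 read/write IOs per second; values are rounded up to a
 *		// multiple of SPDK_BDEV_QOS_MIN_IOS_PER_SEC if needed.
 *		limits[SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT] = 10000;
 *
 *		// Bandwidth limits, if set instead, are passed in MB/s and are
 *		// converted to bytes/s by spdk_bdev_set_qos_rate_limits().
 *		spdk_bdev_set_qos_rate_limits(bdev, limits, qos_set_done, NULL);
 *	}
 */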