/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"
#include "spdk/dma.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"
#include "spdk_internal/assert.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define SPDK_BDEV_AUTO_EXAMINE			true
#define BUF_SMALL_POOL_SIZE			8191
#define BUF_LARGE_POOL_SIZE			1023
#define BUF_SMALL_CACHE_SIZE			128
#define BUF_LARGE_CACHE_SIZE			16
#define NOMEM_THRESHOLD_COUNT			8

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000

/* The maximum number of child requests generated at a time when a UNMAP or
 * WRITE ZEROES command is split into child requests.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000

/* The maximum number of child requests generated at a time when a COPY
 * command is split into child requests.
62 */ 63 #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8) 64 65 #define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \ 66 log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev) 67 #ifdef DEBUG 68 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \ 69 log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev) 70 #else 71 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0) 72 #endif 73 74 static void log_already_claimed(enum spdk_log_level level, const int line, const char *func, 75 const char *detail, struct spdk_bdev *bdev); 76 77 SPDK_LOG_DEPRECATION_REGISTER(vtune_support, "Intel(R) VTune integration", "SPDK 23.05", 0); 78 79 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 80 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 81 }; 82 83 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 84 85 RB_HEAD(bdev_name_tree, spdk_bdev_name); 86 87 static int 88 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2) 89 { 90 return strcmp(name1->name, name2->name); 91 } 92 93 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp); 94 95 struct spdk_bdev_mgr { 96 struct spdk_mempool *bdev_io_pool; 97 98 void *zero_buffer; 99 100 TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 101 102 struct spdk_bdev_list bdevs; 103 struct bdev_name_tree bdev_names; 104 105 bool init_complete; 106 bool module_init_complete; 107 108 struct spdk_spinlock spinlock; 109 110 #ifdef SPDK_CONFIG_VTUNE 111 __itt_domain *domain; 112 #endif 113 }; 114 115 static struct spdk_bdev_mgr g_bdev_mgr = { 116 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 117 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 118 .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names), 119 .init_complete = false, 120 .module_init_complete = false, 121 }; 122 123 static void 124 __attribute__((constructor)) 125 _bdev_init(void) 126 { 127 spdk_spin_init(&g_bdev_mgr.spinlock); 128 } 129 130 typedef void (*lock_range_cb)(void *ctx, int status); 131 132 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc); 133 134 struct lba_range { 135 uint64_t offset; 136 uint64_t length; 137 void *locked_ctx; 138 struct spdk_bdev_channel *owner_ch; 139 TAILQ_ENTRY(lba_range) tailq; 140 }; 141 142 static struct spdk_bdev_opts g_bdev_opts = { 143 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 144 .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, 145 .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, 146 .small_buf_pool_size = BUF_SMALL_POOL_SIZE, 147 .large_buf_pool_size = BUF_LARGE_POOL_SIZE, 148 }; 149 150 static spdk_bdev_init_cb g_init_cb_fn = NULL; 151 static void *g_init_cb_arg = NULL; 152 153 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 154 static void *g_fini_cb_arg = NULL; 155 static struct spdk_thread *g_fini_thread = NULL; 156 157 struct spdk_bdev_qos_limit { 158 /** IOs or bytes allowed per second (i.e., 1s). */ 159 uint64_t limit; 160 161 /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). 162 * For remaining bytes, allowed to run negative if an I/O is submitted when 163 * some bytes are remaining, but the I/O is bigger than that amount. The 164 * excess will be deducted from the next timeslice. 165 */ 166 int64_t remaining_this_timeslice; 167 168 /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 169 uint32_t min_per_timeslice; 170 171 /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 172 uint32_t max_per_timeslice; 173 174 /** Function to check whether to queue the IO. 
*/ 175 bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 176 177 /** Function to update for the submitted IO. */ 178 void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 179 }; 180 181 struct spdk_bdev_qos { 182 /** Types of structure of rate limits. */ 183 struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 184 185 /** The channel that all I/O are funneled through. */ 186 struct spdk_bdev_channel *ch; 187 188 /** The thread on which the poller is running. */ 189 struct spdk_thread *thread; 190 191 /** Queue of I/O waiting to be issued. */ 192 bdev_io_tailq_t queued; 193 194 /** Size of a timeslice in tsc ticks. */ 195 uint64_t timeslice_size; 196 197 /** Timestamp of start of last timeslice. */ 198 uint64_t last_timeslice; 199 200 /** Poller that processes queued I/O commands each time slice. */ 201 struct spdk_poller *poller; 202 }; 203 204 struct spdk_bdev_mgmt_channel { 205 /* 206 * Each thread keeps a cache of bdev_io - this allows 207 * bdev threads which are *not* DPDK threads to still 208 * benefit from a per-thread bdev_io cache. Without 209 * this, non-DPDK threads fetching from the mempool 210 * incur a cmpxchg on get and put. 211 */ 212 bdev_io_stailq_t per_thread_cache; 213 uint32_t per_thread_cache_count; 214 uint32_t bdev_io_cache_size; 215 216 struct spdk_iobuf_channel iobuf; 217 218 TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; 219 TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; 220 }; 221 222 /* 223 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device 224 * will queue here their IO that awaits retry. It makes it possible to retry sending 225 * IO to one bdev after IO from other bdev completes. 226 */ 227 struct spdk_bdev_shared_resource { 228 /* The bdev management channel */ 229 struct spdk_bdev_mgmt_channel *mgmt_ch; 230 231 /* 232 * Count of I/O submitted to bdev module and waiting for completion. 233 * Incremented before submit_request() is called on an spdk_bdev_io. 234 */ 235 uint64_t io_outstanding; 236 237 /* 238 * Queue of IO awaiting retry because of a previous NOMEM status returned 239 * on this channel. 240 */ 241 bdev_io_tailq_t nomem_io; 242 243 /* 244 * Threshold which io_outstanding must drop to before retrying nomem_io. 245 */ 246 uint64_t nomem_threshold; 247 248 /* I/O channel allocated by a bdev module */ 249 struct spdk_io_channel *shared_ch; 250 251 /* Refcount of bdev channels using this resource */ 252 uint32_t ref; 253 254 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 255 }; 256 257 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 258 #define BDEV_CH_QOS_ENABLED (1 << 1) 259 260 struct spdk_bdev_channel { 261 struct spdk_bdev *bdev; 262 263 /* The channel for the underlying device */ 264 struct spdk_io_channel *channel; 265 266 /* Per io_device per thread data */ 267 struct spdk_bdev_shared_resource *shared_resource; 268 269 struct spdk_bdev_io_stat *stat; 270 271 /* 272 * Count of I/O submitted to the underlying dev module through this channel 273 * and waiting for completion. 274 */ 275 uint64_t io_outstanding; 276 277 /* 278 * List of all submitted I/Os including I/O that are generated via splitting. 279 */ 280 bdev_io_tailq_t io_submitted; 281 282 /* 283 * List of spdk_bdev_io that are currently queued because they write to a locked 284 * LBA range. 
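
/*
 * Illustrative usage sketch (not part of this file): enabling per-bdev QoS from
 * application code, which is what populates struct spdk_bdev_qos above. The
 * limits array is indexed by the rate limit types; judging from
 * bdev_qos_config_json() later in this file, the byte-based limits are expressed
 * in MiB/s at this API level (matching the *_mbytes_per_sec RPC parameters),
 * while SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT is in IO/s. The bdev name "Malloc0"
 * and the helper names are placeholders.
 */
static void
example_qos_set_done(void *cb_arg, int status)
{
	if (status != 0) {
		SPDK_ERRLOG("Setting QoS limits failed: %s\n", spdk_strerror(-status));
	}
}

static void
example_enable_qos(void)
{
	struct spdk_bdev *bdev = spdk_bdev_get_by_name("Malloc0");
	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES] = {};

	if (bdev == NULL) {
		return;
	}

	/* Start from the current limits so unrelated limit types are preserved. */
	spdk_bdev_get_qos_rate_limits(bdev, limits);

	limits[SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT] = 10000;	/* 10k IO/s */
	limits[SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT] = 100;		/* assumed MiB/s */

	spdk_bdev_set_qos_rate_limits(bdev, limits, example_qos_set_done, NULL);
}
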
285 */ 286 bdev_io_tailq_t io_locked; 287 288 uint32_t flags; 289 290 struct spdk_histogram_data *histogram; 291 292 #ifdef SPDK_CONFIG_VTUNE 293 uint64_t start_tsc; 294 uint64_t interval_tsc; 295 __itt_string_handle *handle; 296 struct spdk_bdev_io_stat *prev_stat; 297 #endif 298 299 bdev_io_tailq_t queued_resets; 300 301 lba_range_tailq_t locked_ranges; 302 }; 303 304 struct media_event_entry { 305 struct spdk_bdev_media_event event; 306 TAILQ_ENTRY(media_event_entry) tailq; 307 }; 308 309 #define MEDIA_EVENT_POOL_SIZE 64 310 311 struct spdk_bdev_desc { 312 struct spdk_bdev *bdev; 313 struct spdk_thread *thread; 314 struct { 315 spdk_bdev_event_cb_t event_fn; 316 void *ctx; 317 } callback; 318 bool closed; 319 bool write; 320 bool memory_domains_supported; 321 struct spdk_spinlock spinlock; 322 uint32_t refs; 323 TAILQ_HEAD(, media_event_entry) pending_media_events; 324 TAILQ_HEAD(, media_event_entry) free_media_events; 325 struct media_event_entry *media_events_buffer; 326 TAILQ_ENTRY(spdk_bdev_desc) link; 327 328 uint64_t timeout_in_sec; 329 spdk_bdev_io_timeout_cb cb_fn; 330 void *cb_arg; 331 struct spdk_poller *io_timeout_poller; 332 struct spdk_bdev_module_claim *claim; 333 }; 334 335 struct spdk_bdev_iostat_ctx { 336 struct spdk_bdev_io_stat *stat; 337 spdk_bdev_get_device_stat_cb cb; 338 void *cb_arg; 339 }; 340 341 struct set_qos_limit_ctx { 342 void (*cb_fn)(void *cb_arg, int status); 343 void *cb_arg; 344 struct spdk_bdev *bdev; 345 }; 346 347 struct spdk_bdev_channel_iter { 348 spdk_bdev_for_each_channel_msg fn; 349 spdk_bdev_for_each_channel_done cpl; 350 struct spdk_io_channel_iter *i; 351 void *ctx; 352 }; 353 354 struct spdk_bdev_io_error_stat { 355 uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS]; 356 }; 357 358 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 359 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 360 #define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch)) 361 #define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch)) 362 363 static inline void bdev_io_complete(void *ctx); 364 365 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 366 static void bdev_write_zero_buffer_next(void *_bdev_io); 367 368 static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 369 struct spdk_io_channel *ch, void *_ctx); 370 static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status); 371 372 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 373 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 374 uint64_t num_blocks, 375 struct spdk_memory_domain *domain, void *domain_ctx, 376 spdk_bdev_io_completion_cb cb, void *cb_arg); 377 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 378 struct iovec *iov, int iovcnt, void *md_buf, 379 uint64_t offset_blocks, uint64_t num_blocks, 380 struct spdk_memory_domain *domain, void *domain_ctx, 381 spdk_bdev_io_completion_cb cb, void *cb_arg); 382 383 static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 384 uint64_t offset, uint64_t length, 385 lock_range_cb cb_fn, void *cb_arg); 386 387 static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 388 uint64_t offset, uint64_t length, 389 lock_range_cb cb_fn, void *cb_arg); 390 391 static inline void 
bdev_io_complete(void *ctx);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);

static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
static void claim_reset(struct spdk_bdev *bdev);

#define bdev_get_ext_io_opt(opts, field, defval) \
	(((opts) != NULL && offsetof(struct spdk_bdev_ext_io_opts, field) + \
	  sizeof((opts)->field) <= sizeof(*(opts))) ? (opts)->field : (defval))

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(small_buf_pool_size);
	SET_FIELD(large_buf_pool_size);

	/* Do not remove this statement. Always update it when adding a new field,
	 * and do not forget to add the SET_FIELD statement for the new field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}

SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_small_buf_pool_size, "spdk_bdev_opts.small_buf_pool_size",
			      "v23.05", 0);
SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_large_buf_pool_size, "spdk_bdev_opts.large_buf_pool_size",
			      "v23.05", 0);
int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	struct spdk_iobuf_opts iobuf_opts;
	uint32_t min_pool_size;
	int rc;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
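
/*
 * Illustrative usage sketch (not part of this file): adjusting the global bdev
 * options before subsystem initialization. The opts_size handshake used by
 * SET_FIELD above is what keeps this working across ABI changes: the caller
 * passes sizeof() of the struct it was compiled against, and only fields that
 * fit inside that size are copied in either direction. The function name and
 * the chosen numbers are placeholders.
 */
static int
example_tune_bdev_opts(void)
{
	struct spdk_bdev_opts opts = {};

	/* Fill in the current values for every field this binary knows about. */
	spdk_bdev_get_opts(&opts, sizeof(opts));

	opts.bdev_io_pool_size = 128 * 1024 - 1;
	opts.bdev_io_cache_size = 512;
	opts.bdev_auto_examine = false;	/* rely on explicit spdk_bdev_examine() calls */

	/* opts.opts_size was already set by spdk_bdev_get_opts(). */
	return spdk_bdev_set_opts(&opts);
}
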
462 */ 463 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 464 if (opts->bdev_io_pool_size < min_pool_size) { 465 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 466 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 467 spdk_thread_get_count()); 468 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 469 return -1; 470 } 471 472 if (opts->small_buf_pool_size != BUF_SMALL_POOL_SIZE) { 473 SPDK_LOG_DEPRECATED(bdev_opts_small_buf_pool_size); 474 } 475 if (opts->large_buf_pool_size != BUF_LARGE_POOL_SIZE) { 476 SPDK_LOG_DEPRECATED(bdev_opts_large_buf_pool_size); 477 } 478 479 #define SET_FIELD(field) \ 480 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 481 g_bdev_opts.field = opts->field; \ 482 } \ 483 484 SET_FIELD(bdev_io_pool_size); 485 SET_FIELD(bdev_io_cache_size); 486 SET_FIELD(bdev_auto_examine); 487 SET_FIELD(small_buf_pool_size); 488 SET_FIELD(large_buf_pool_size); 489 490 spdk_iobuf_get_opts(&iobuf_opts); 491 iobuf_opts.small_pool_count = opts->small_buf_pool_size; 492 iobuf_opts.large_pool_count = opts->large_buf_pool_size; 493 494 rc = spdk_iobuf_set_opts(&iobuf_opts); 495 if (rc != 0) { 496 SPDK_ERRLOG("Failed to set iobuf opts\n"); 497 return -1; 498 } 499 500 g_bdev_opts.opts_size = opts->opts_size; 501 502 #undef SET_FIELD 503 504 return 0; 505 } 506 507 static struct spdk_bdev * 508 bdev_get_by_name(const char *bdev_name) 509 { 510 struct spdk_bdev_name find; 511 struct spdk_bdev_name *res; 512 513 find.name = (char *)bdev_name; 514 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 515 if (res != NULL) { 516 return res->bdev; 517 } 518 519 return NULL; 520 } 521 522 struct spdk_bdev * 523 spdk_bdev_get_by_name(const char *bdev_name) 524 { 525 struct spdk_bdev *bdev; 526 527 spdk_spin_lock(&g_bdev_mgr.spinlock); 528 bdev = bdev_get_by_name(bdev_name); 529 spdk_spin_unlock(&g_bdev_mgr.spinlock); 530 531 return bdev; 532 } 533 534 struct bdev_io_status_string { 535 enum spdk_bdev_io_status status; 536 const char *str; 537 }; 538 539 static const struct bdev_io_status_string bdev_io_status_strings[] = { 540 { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" }, 541 { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" }, 542 { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" }, 543 { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" }, 544 { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" }, 545 { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" }, 546 { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" }, 547 { SPDK_BDEV_IO_STATUS_FAILED, "failed" }, 548 { SPDK_BDEV_IO_STATUS_PENDING, "pending" }, 549 { SPDK_BDEV_IO_STATUS_SUCCESS, "success" }, 550 }; 551 552 static const char * 553 bdev_io_status_get_string(enum spdk_bdev_io_status status) 554 { 555 uint32_t i; 556 557 for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) { 558 if (bdev_io_status_strings[i].status == status) { 559 return bdev_io_status_strings[i].str; 560 } 561 } 562 563 return "reserved"; 564 } 565 566 struct spdk_bdev_wait_for_examine_ctx { 567 struct spdk_poller *poller; 568 spdk_bdev_wait_for_examine_cb cb_fn; 569 void *cb_arg; 570 }; 571 572 static bool bdev_module_all_actions_completed(void); 573 574 static int 575 bdev_wait_for_examine_cb(void *arg) 576 { 577 struct spdk_bdev_wait_for_examine_ctx *ctx = arg; 578 579 if (!bdev_module_all_actions_completed()) { 580 return SPDK_POLLER_IDLE; 581 } 582 583 spdk_poller_unregister(&ctx->poller); 584 
ctx->cb_fn(ctx->cb_arg); 585 free(ctx); 586 587 return SPDK_POLLER_BUSY; 588 } 589 590 int 591 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 592 { 593 struct spdk_bdev_wait_for_examine_ctx *ctx; 594 595 ctx = calloc(1, sizeof(*ctx)); 596 if (ctx == NULL) { 597 return -ENOMEM; 598 } 599 ctx->cb_fn = cb_fn; 600 ctx->cb_arg = cb_arg; 601 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 602 603 return 0; 604 } 605 606 struct spdk_bdev_examine_item { 607 char *name; 608 TAILQ_ENTRY(spdk_bdev_examine_item) link; 609 }; 610 611 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); 612 613 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 614 g_bdev_examine_allowlist); 615 616 static inline bool 617 bdev_examine_allowlist_check(const char *name) 618 { 619 struct spdk_bdev_examine_item *item; 620 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 621 if (strcmp(name, item->name) == 0) { 622 return true; 623 } 624 } 625 return false; 626 } 627 628 static inline void 629 bdev_examine_allowlist_free(void) 630 { 631 struct spdk_bdev_examine_item *item; 632 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 633 item = TAILQ_FIRST(&g_bdev_examine_allowlist); 634 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 635 free(item->name); 636 free(item); 637 } 638 } 639 640 static inline bool 641 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 642 { 643 struct spdk_bdev_alias *tmp; 644 if (bdev_examine_allowlist_check(bdev->name)) { 645 return true; 646 } 647 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 648 if (bdev_examine_allowlist_check(tmp->alias.name)) { 649 return true; 650 } 651 } 652 return false; 653 } 654 655 static inline bool 656 bdev_ok_to_examine(struct spdk_bdev *bdev) 657 { 658 if (g_bdev_opts.bdev_auto_examine) { 659 return true; 660 } else { 661 return bdev_in_examine_allowlist(bdev); 662 } 663 } 664 665 static void 666 bdev_examine(struct spdk_bdev *bdev) 667 { 668 struct spdk_bdev_module *module; 669 struct spdk_bdev_module_claim *claim, *tmpclaim; 670 uint32_t action; 671 672 if (!bdev_ok_to_examine(bdev)) { 673 return; 674 } 675 676 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 677 if (module->examine_config) { 678 spdk_spin_lock(&module->internal.spinlock); 679 action = module->internal.action_in_progress; 680 module->internal.action_in_progress++; 681 spdk_spin_unlock(&module->internal.spinlock); 682 module->examine_config(bdev); 683 if (action != module->internal.action_in_progress) { 684 SPDK_ERRLOG("examine_config for module %s did not call " 685 "spdk_bdev_module_examine_done()\n", module->name); 686 } 687 } 688 } 689 690 spdk_spin_lock(&bdev->internal.spinlock); 691 692 switch (bdev->internal.claim_type) { 693 case SPDK_BDEV_CLAIM_NONE: 694 /* Examine by all bdev modules */ 695 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 696 if (module->examine_disk) { 697 spdk_spin_lock(&module->internal.spinlock); 698 module->internal.action_in_progress++; 699 spdk_spin_unlock(&module->internal.spinlock); 700 spdk_spin_unlock(&bdev->internal.spinlock); 701 module->examine_disk(bdev); 702 spdk_spin_lock(&bdev->internal.spinlock); 703 } 704 } 705 break; 706 case SPDK_BDEV_CLAIM_EXCL_WRITE: 707 /* Examine by the one bdev module with a v1 claim */ 708 module = bdev->internal.claim.v1.module; 709 if (module->examine_disk) { 710 spdk_spin_lock(&module->internal.spinlock); 711 module->internal.action_in_progress++; 712 
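
/*
 * Illustrative usage sketch (not part of this file): driving examine manually.
 * With bdev_auto_examine disabled (see example_tune_bdev_opts() earlier, an
 * assumed helper), spdk_bdev_examine() -- defined just below -- adds a name to
 * the allowlist and triggers examination, and spdk_bdev_wait_for_examine()
 * reports once every module has finished. This must run on the app thread.
 */
static void
example_examine_done(void *cb_arg)
{
	SPDK_NOTICELOG("All bdev modules finished examining\n");
}

static void
example_examine_base_bdev(const char *name)
{
	int rc;

	rc = spdk_bdev_examine(name);
	if (rc != 0 && rc != -EEXIST) {
		SPDK_ERRLOG("examine of %s failed: %s\n", name, spdk_strerror(-rc));
		return;
	}

	rc = spdk_bdev_wait_for_examine(example_examine_done, NULL);
	if (rc != 0) {
		SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc));
	}
}
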
spdk_spin_unlock(&module->internal.spinlock); 713 spdk_spin_unlock(&bdev->internal.spinlock); 714 module->examine_disk(bdev); 715 return; 716 } 717 break; 718 default: 719 /* Examine by all bdev modules with a v2 claim */ 720 assert(claim_type_is_v2(bdev->internal.claim_type)); 721 /* 722 * Removal of tailq nodes while iterating can cause the iteration to jump out of the 723 * list, perhaps accessing freed memory. Without protection, this could happen 724 * while the lock is dropped during the examine callback. 725 */ 726 bdev->internal.examine_in_progress++; 727 728 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 729 module = claim->module; 730 731 if (module == NULL) { 732 /* This is a vestigial claim, held by examine_count */ 733 continue; 734 } 735 736 if (module->examine_disk == NULL) { 737 continue; 738 } 739 740 spdk_spin_lock(&module->internal.spinlock); 741 module->internal.action_in_progress++; 742 spdk_spin_unlock(&module->internal.spinlock); 743 744 /* Call examine_disk without holding internal.spinlock. */ 745 spdk_spin_unlock(&bdev->internal.spinlock); 746 module->examine_disk(bdev); 747 spdk_spin_lock(&bdev->internal.spinlock); 748 } 749 750 assert(bdev->internal.examine_in_progress > 0); 751 bdev->internal.examine_in_progress--; 752 if (bdev->internal.examine_in_progress == 0) { 753 /* Remove any claims that were released during examine_disk */ 754 TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) { 755 if (claim->desc != NULL) { 756 continue; 757 } 758 759 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link); 760 free(claim); 761 } 762 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 763 claim_reset(bdev); 764 } 765 } 766 } 767 768 spdk_spin_unlock(&bdev->internal.spinlock); 769 } 770 771 int 772 spdk_bdev_examine(const char *name) 773 { 774 struct spdk_bdev *bdev; 775 struct spdk_bdev_examine_item *item; 776 struct spdk_thread *thread = spdk_get_thread(); 777 778 if (spdk_unlikely(spdk_thread_get_app_thread() != thread)) { 779 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread, 780 thread ? 
spdk_thread_get_name(thread) : "null"); 781 return -EINVAL; 782 } 783 784 if (g_bdev_opts.bdev_auto_examine) { 785 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); 786 return -EINVAL; 787 } 788 789 if (bdev_examine_allowlist_check(name)) { 790 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 791 return -EEXIST; 792 } 793 794 item = calloc(1, sizeof(*item)); 795 if (!item) { 796 return -ENOMEM; 797 } 798 item->name = strdup(name); 799 if (!item->name) { 800 free(item); 801 return -ENOMEM; 802 } 803 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 804 805 bdev = spdk_bdev_get_by_name(name); 806 if (bdev) { 807 bdev_examine(bdev); 808 } 809 return 0; 810 } 811 812 static inline void 813 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 814 { 815 struct spdk_bdev_examine_item *item; 816 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 817 spdk_json_write_object_begin(w); 818 spdk_json_write_named_string(w, "method", "bdev_examine"); 819 spdk_json_write_named_object_begin(w, "params"); 820 spdk_json_write_named_string(w, "name", item->name); 821 spdk_json_write_object_end(w); 822 spdk_json_write_object_end(w); 823 } 824 } 825 826 struct spdk_bdev * 827 spdk_bdev_first(void) 828 { 829 struct spdk_bdev *bdev; 830 831 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 832 if (bdev) { 833 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 834 } 835 836 return bdev; 837 } 838 839 struct spdk_bdev * 840 spdk_bdev_next(struct spdk_bdev *prev) 841 { 842 struct spdk_bdev *bdev; 843 844 bdev = TAILQ_NEXT(prev, internal.link); 845 if (bdev) { 846 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 847 } 848 849 return bdev; 850 } 851 852 static struct spdk_bdev * 853 _bdev_next_leaf(struct spdk_bdev *bdev) 854 { 855 while (bdev != NULL) { 856 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 857 return bdev; 858 } else { 859 bdev = TAILQ_NEXT(bdev, internal.link); 860 } 861 } 862 863 return bdev; 864 } 865 866 struct spdk_bdev * 867 spdk_bdev_first_leaf(void) 868 { 869 struct spdk_bdev *bdev; 870 871 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 872 873 if (bdev) { 874 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 875 } 876 877 return bdev; 878 } 879 880 struct spdk_bdev * 881 spdk_bdev_next_leaf(struct spdk_bdev *prev) 882 { 883 struct spdk_bdev *bdev; 884 885 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 886 887 if (bdev) { 888 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 889 } 890 891 return bdev; 892 } 893 894 static inline bool 895 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 896 { 897 return bdev_io->internal.memory_domain; 898 } 899 900 void 901 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 902 { 903 struct iovec *iovs; 904 905 if (bdev_io->u.bdev.iovs == NULL) { 906 bdev_io->u.bdev.iovs = &bdev_io->iov; 907 bdev_io->u.bdev.iovcnt = 1; 908 } 909 910 iovs = bdev_io->u.bdev.iovs; 911 912 assert(iovs != NULL); 913 assert(bdev_io->u.bdev.iovcnt >= 1); 914 915 iovs[0].iov_base = buf; 916 iovs[0].iov_len = len; 917 } 918 919 void 920 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 921 { 922 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 923 bdev_io->u.bdev.md_buf = md_buf; 924 } 925 926 static bool 927 _is_buf_allocated(const struct iovec *iovs) 928 { 929 if (iovs == NULL) { 930 return false; 931 } 932 933 return iovs[0].iov_base 
!= NULL; 934 } 935 936 static bool 937 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 938 { 939 int i; 940 uintptr_t iov_base; 941 942 if (spdk_likely(alignment == 1)) { 943 return true; 944 } 945 946 for (i = 0; i < iovcnt; i++) { 947 iov_base = (uintptr_t)iovs[i].iov_base; 948 if ((iov_base & (alignment - 1)) != 0) { 949 return false; 950 } 951 } 952 953 return true; 954 } 955 956 static void 957 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 958 { 959 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 960 void *buf; 961 962 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 963 buf = bdev_io->internal.buf; 964 bdev_io->internal.buf = NULL; 965 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 966 bdev_io->internal.get_aux_buf_cb = NULL; 967 } else { 968 assert(bdev_io->internal.get_buf_cb != NULL); 969 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 970 bdev_io->internal.get_buf_cb = NULL; 971 } 972 } 973 974 static void 975 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 976 { 977 struct spdk_bdev_io *bdev_io = ctx; 978 979 if (rc) { 980 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 981 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 982 } 983 bdev_io_get_buf_complete(bdev_io, !rc); 984 } 985 986 static void 987 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 988 { 989 int rc = 0; 990 991 /* save original md_buf */ 992 bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 993 bdev_io->internal.orig_md_iov.iov_len = len; 994 bdev_io->internal.bounce_md_iov.iov_base = md_buf; 995 bdev_io->internal.bounce_md_iov.iov_len = len; 996 /* set bounce md_buf */ 997 bdev_io->u.bdev.md_buf = md_buf; 998 999 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1000 if (bdev_io_use_memory_domain(bdev_io)) { 1001 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1002 bdev_io->internal.memory_domain_ctx, 1003 &bdev_io->internal.orig_md_iov, 1, 1004 &bdev_io->internal.bounce_md_iov, 1, 1005 bdev_io->internal.data_transfer_cpl, 1006 bdev_io); 1007 if (rc == 0) { 1008 /* Continue to submit IO in completion callback */ 1009 return; 1010 } 1011 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 1012 spdk_memory_domain_get_dma_device_id(bdev_io->internal.memory_domain), rc); 1013 } else { 1014 memcpy(md_buf, bdev_io->internal.orig_md_iov.iov_base, bdev_io->internal.orig_md_iov.iov_len); 1015 } 1016 } 1017 1018 assert(bdev_io->internal.data_transfer_cpl); 1019 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1020 } 1021 1022 static void 1023 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 1024 { 1025 struct spdk_bdev *bdev = bdev_io->bdev; 1026 uint64_t md_len; 1027 void *buf; 1028 1029 if (spdk_bdev_is_md_separate(bdev)) { 1030 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 1031 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 1032 1033 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 1034 1035 if (bdev_io->u.bdev.md_buf != NULL) { 1036 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 1037 return; 1038 } else { 1039 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 1040 } 1041 } 1042 1043 bdev_io_get_buf_complete(bdev_io, true); 1044 } 1045 1046 static void 1047 _bdev_io_pull_bounce_data_buf_done(void *ctx, int rc) 1048 { 1049 struct spdk_bdev_io *bdev_io = ctx; 1050 1051 if (rc) { 1052 SPDK_ERRLOG("Failed to get data buffer\n"); 1053 assert(bdev_io->internal.data_transfer_cpl); 1054 
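
/*
 * Illustrative usage sketch (not part of this file): a caller-side way to stay
 * off the bounce path guarded by _are_iovs_aligned() above. Allocating payload
 * buffers with the bdev's reported alignment means the bdev layer can use the
 * caller's iovec directly instead of copying through a bounce buffer. The
 * function name is a placeholder.
 */
static void *
example_alloc_io_buffer(struct spdk_bdev *bdev, uint64_t num_blocks)
{
	size_t align = spdk_bdev_get_buf_align(bdev);
	uint64_t len = num_blocks * spdk_bdev_get_block_size(bdev);

	/* DMA-safe, zeroed memory with the required alignment. */
	return spdk_dma_zmalloc(len, align, NULL);
}
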
bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1055 return; 1056 } 1057 1058 _bdev_io_set_md_buf(bdev_io); 1059 } 1060 1061 static void 1062 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 1063 bdev_copy_bounce_buffer_cpl cpl_cb) 1064 { 1065 int rc = 0; 1066 1067 bdev_io->internal.data_transfer_cpl = cpl_cb; 1068 /* save original iovec */ 1069 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 1070 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 1071 /* set bounce iov */ 1072 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 1073 bdev_io->u.bdev.iovcnt = 1; 1074 /* set bounce buffer for this operation */ 1075 bdev_io->u.bdev.iovs[0].iov_base = buf; 1076 bdev_io->u.bdev.iovs[0].iov_len = len; 1077 /* if this is write path, copy data from original buffer to bounce buffer */ 1078 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1079 if (bdev_io_use_memory_domain(bdev_io)) { 1080 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1081 bdev_io->internal.memory_domain_ctx, 1082 bdev_io->internal.orig_iovs, 1083 (uint32_t) bdev_io->internal.orig_iovcnt, 1084 bdev_io->u.bdev.iovs, 1, 1085 _bdev_io_pull_bounce_data_buf_done, 1086 bdev_io); 1087 if (rc == 0) { 1088 /* Continue to submit IO in completion callback */ 1089 return; 1090 } 1091 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 1092 spdk_memory_domain_get_dma_device_id(bdev_io->internal.memory_domain)); 1093 } else { 1094 spdk_copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt); 1095 } 1096 } 1097 1098 _bdev_io_pull_bounce_data_buf_done(bdev_io, rc); 1099 } 1100 1101 static void 1102 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) 1103 { 1104 struct spdk_bdev *bdev = bdev_io->bdev; 1105 bool buf_allocated; 1106 uint64_t alignment; 1107 void *aligned_buf; 1108 1109 bdev_io->internal.buf = buf; 1110 1111 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1112 bdev_io_get_buf_complete(bdev_io, true); 1113 return; 1114 } 1115 1116 alignment = spdk_bdev_get_buf_align(bdev); 1117 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 1118 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 1119 1120 if (buf_allocated) { 1121 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 1122 /* Continue in completion callback */ 1123 return; 1124 } else { 1125 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 1126 } 1127 1128 _bdev_io_set_md_buf(bdev_io); 1129 } 1130 1131 static inline uint64_t 1132 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len) 1133 { 1134 struct spdk_bdev *bdev = bdev_io->bdev; 1135 uint64_t md_len, alignment; 1136 1137 md_len = spdk_bdev_is_md_separate(bdev) ? 
bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1138 alignment = spdk_bdev_get_buf_align(bdev); 1139 1140 return len + alignment + md_len; 1141 } 1142 1143 static void 1144 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 1145 { 1146 struct spdk_bdev_mgmt_channel *ch; 1147 1148 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1149 spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len)); 1150 } 1151 1152 static void 1153 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 1154 { 1155 assert(bdev_io->internal.buf != NULL); 1156 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len); 1157 bdev_io->internal.buf = NULL; 1158 } 1159 1160 void 1161 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1162 { 1163 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1164 1165 assert(buf != NULL); 1166 _bdev_io_put_buf(bdev_io, buf, len); 1167 } 1168 1169 static void 1170 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 1171 { 1172 struct spdk_bdev *bdev = bdev_ch->bdev; 1173 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1174 struct spdk_bdev_io *bdev_io; 1175 1176 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 1177 /* 1178 * Allow some more I/O to complete before retrying the nomem_io queue. 1179 * Some drivers (such as nvme) cannot immediately take a new I/O in 1180 * the context of a completion, because the resources for the I/O are 1181 * not released until control returns to the bdev poller. Also, we 1182 * may require several small I/O to complete before a larger I/O 1183 * (that requires splitting) can be submitted. 1184 */ 1185 return; 1186 } 1187 1188 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1189 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 1190 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 1191 bdev_io->internal.ch->io_outstanding++; 1192 shared_resource->io_outstanding++; 1193 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1194 bdev_io->internal.error.nvme.cdw0 = 0; 1195 bdev_io->num_retries++; 1196 bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 1197 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 1198 break; 1199 } 1200 } 1201 } 1202 1203 static inline void 1204 _bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1205 struct spdk_bdev_shared_resource *shared_resource) 1206 { 1207 assert(bdev_ch->io_outstanding > 0); 1208 assert(shared_resource->io_outstanding > 0); 1209 bdev_ch->io_outstanding--; 1210 shared_resource->io_outstanding--; 1211 } 1212 1213 static inline bool 1214 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io) 1215 { 1216 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1217 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1218 1219 if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) { 1220 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 1221 /* 1222 * Wait for some of the outstanding I/O to complete before we 1223 * retry any of the nomem_io. Normally we will wait for 1224 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 1225 * depth channels we will instead wait for half to complete. 
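
/*
 * Illustrative usage sketch (not part of this file): the caller-facing side of
 * resource exhaustion. The nomem_io queue above retries I/O that a bdev module
 * rejected, while a submitter whose spdk_bdev_read_blocks() call itself fails
 * with -ENOMEM (no spdk_bdev_io available) can park a wait entry on the
 * io_wait_queue via spdk_bdev_queue_io_wait() and resubmit from the callback.
 * The context struct and helper names are assumptions for the example.
 */
struct example_io_ctx {
	struct spdk_bdev_desc *desc;
	struct spdk_io_channel *ch;
	void *buf;
	uint64_t offset_blocks;
	uint64_t num_blocks;
	struct spdk_bdev_io_wait_entry wait_entry;
};

static void example_read_retry(void *arg);

static void
example_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	spdk_bdev_free_io(bdev_io);
}

static void
example_submit_read(struct example_io_ctx *ctx)
{
	int rc;

	rc = spdk_bdev_read_blocks(ctx->desc, ctx->ch, ctx->buf, ctx->offset_blocks,
				   ctx->num_blocks, example_read_done, ctx);
	if (rc == -ENOMEM) {
		/* Wait until a bdev_io is freed on this thread, then try again. */
		ctx->wait_entry.bdev = spdk_bdev_desc_get_bdev(ctx->desc);
		ctx->wait_entry.cb_fn = example_read_retry;
		ctx->wait_entry.cb_arg = ctx;
		spdk_bdev_queue_io_wait(ctx->wait_entry.bdev, ctx->ch, &ctx->wait_entry);
	} else if (rc != 0) {
		SPDK_ERRLOG("read submission failed: %s\n", spdk_strerror(-rc));
	}
}

static void
example_read_retry(void *arg)
{
	example_submit_read(arg);
}
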
1226 */ 1227 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 1228 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 1229 return true; 1230 } 1231 1232 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1233 bdev_ch_retry_io(bdev_ch); 1234 } 1235 1236 return false; 1237 } 1238 1239 static void 1240 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1241 { 1242 struct spdk_bdev_io *bdev_io = ctx; 1243 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1244 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1245 1246 if (rc) { 1247 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1248 } 1249 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1250 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). 1251 */ 1252 bdev_io_put_buf(bdev_io); 1253 1254 /* Continue with IO completion flow */ 1255 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 1256 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 1257 return; 1258 } 1259 1260 bdev_io_complete(bdev_io); 1261 } 1262 1263 static inline void 1264 _bdev_io_push_bounce_md_buffer(struct spdk_bdev_io *bdev_io) 1265 { 1266 int rc = 0; 1267 1268 /* do the same for metadata buffer */ 1269 if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) { 1270 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1271 1272 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1273 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 1274 if (bdev_io_use_memory_domain(bdev_io)) { 1275 /* If memory domain is used then we need to call async push function */ 1276 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1277 bdev_io->internal.memory_domain_ctx, 1278 &bdev_io->internal.orig_md_iov, 1279 (uint32_t)bdev_io->internal.orig_iovcnt, 1280 &bdev_io->internal.bounce_md_iov, 1, 1281 bdev_io->internal.data_transfer_cpl, 1282 bdev_io); 1283 if (rc == 0) { 1284 /* Continue IO completion in async callback */ 1285 return; 1286 } 1287 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1288 spdk_memory_domain_get_dma_device_id(bdev_io->internal.memory_domain)); 1289 } else { 1290 memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1291 bdev_io->internal.orig_md_iov.iov_len); 1292 } 1293 } 1294 } 1295 1296 assert(bdev_io->internal.data_transfer_cpl); 1297 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1298 } 1299 1300 static void 1301 _bdev_io_push_bounce_data_buffer_done(void *ctx, int rc) 1302 { 1303 struct spdk_bdev_io *bdev_io = ctx; 1304 1305 assert(bdev_io->internal.data_transfer_cpl); 1306 1307 if (rc) { 1308 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1309 return; 1310 } 1311 1312 /* set original buffer for this io */ 1313 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 1314 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 1315 /* disable bouncing buffer for this io */ 1316 bdev_io->internal.orig_iovcnt = 0; 1317 bdev_io->internal.orig_iovs = NULL; 1318 1319 _bdev_io_push_bounce_md_buffer(bdev_io); 1320 } 1321 1322 static inline void 1323 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1324 { 1325 int rc = 0; 1326 1327 bdev_io->internal.data_transfer_cpl = cpl_cb; 1328 1329 /* if this is read path, copy data from bounce buffer to original buffer */ 1330 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1331 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 1332 if 
(bdev_io_use_memory_domain(bdev_io)) { 1333 /* If memory domain is used then we need to call async push function */ 1334 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1335 bdev_io->internal.memory_domain_ctx, 1336 bdev_io->internal.orig_iovs, 1337 (uint32_t)bdev_io->internal.orig_iovcnt, 1338 &bdev_io->internal.bounce_iov, 1, 1339 _bdev_io_push_bounce_data_buffer_done, 1340 bdev_io); 1341 if (rc == 0) { 1342 /* Continue IO completion in async callback */ 1343 return; 1344 } 1345 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1346 spdk_memory_domain_get_dma_device_id(bdev_io->internal.memory_domain)); 1347 } else { 1348 spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs, 1349 bdev_io->internal.orig_iovcnt, 1350 bdev_io->internal.bounce_iov.iov_base, 1351 bdev_io->internal.bounce_iov.iov_len); 1352 } 1353 } 1354 1355 _bdev_io_push_bounce_data_buffer_done(bdev_io, rc); 1356 } 1357 1358 static void 1359 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1360 { 1361 struct spdk_bdev_io *bdev_io; 1362 1363 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1364 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len); 1365 } 1366 1367 static void 1368 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1369 { 1370 struct spdk_bdev_mgmt_channel *mgmt_ch; 1371 uint64_t max_len; 1372 void *buf; 1373 1374 assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1375 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1376 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1377 1378 if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) { 1379 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1380 bdev_io_get_buf_complete(bdev_io, false); 1381 return; 1382 } 1383 1384 bdev_io->internal.buf_len = len; 1385 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1386 bdev_io_get_iobuf_cb); 1387 if (buf != NULL) { 1388 _bdev_io_set_buf(bdev_io, buf, len); 1389 } 1390 } 1391 1392 void 1393 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1394 { 1395 struct spdk_bdev *bdev = bdev_io->bdev; 1396 uint64_t alignment; 1397 1398 assert(cb != NULL); 1399 bdev_io->internal.get_buf_cb = cb; 1400 1401 alignment = spdk_bdev_get_buf_align(bdev); 1402 1403 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1404 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1405 /* Buffer already present and aligned */ 1406 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1407 return; 1408 } 1409 1410 bdev_io_get_buf(bdev_io, len); 1411 } 1412 1413 static void 1414 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1415 bool success) 1416 { 1417 if (!success) { 1418 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1419 bdev_io_complete(bdev_io); 1420 } else { 1421 bdev_io_submit(bdev_io); 1422 } 1423 } 1424 1425 static void 1426 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1427 uint64_t len) 1428 { 1429 assert(cb != NULL); 1430 bdev_io->internal.get_buf_cb = cb; 1431 1432 bdev_io_get_buf(bdev_io, len); 1433 } 1434 1435 void 1436 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1437 { 1438 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1439 1440 assert(cb != NULL); 1441 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1442 bdev_io->internal.get_aux_buf_cb = cb; 1443 
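
/*
 * Illustrative usage sketch (not part of this file): how a bdev module's
 * submit_request() typically consumes spdk_bdev_io_get_buf() above. For READ,
 * the module asks the bdev layer to make sure a data buffer is present (it may
 * be taken from the iobuf pools and bounced, as implemented above) before
 * touching the iovecs. Function names are placeholders, and the read itself is
 * stubbed out.
 */
static void
example_module_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
			       bool success)
{
	if (!success) {
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
		return;
	}

	/* bdev_io->u.bdev.iovs is now valid. A real module would issue its
	 * backend read here and complete from that completion; we complete
	 * immediately for brevity. */
	spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
}

static void
example_module_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
{
	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
		spdk_bdev_io_get_buf(bdev_io, example_module_read_get_buf_cb,
				     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
		break;
	default:
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
		break;
	}
}
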
bdev_io_get_buf(bdev_io, len); 1444 } 1445 1446 static int 1447 bdev_module_get_max_ctx_size(void) 1448 { 1449 struct spdk_bdev_module *bdev_module; 1450 int max_bdev_module_size = 0; 1451 1452 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1453 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1454 max_bdev_module_size = bdev_module->get_ctx_size(); 1455 } 1456 } 1457 1458 return max_bdev_module_size; 1459 } 1460 1461 static void 1462 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1463 { 1464 int i; 1465 struct spdk_bdev_qos *qos = bdev->internal.qos; 1466 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1467 1468 if (!qos) { 1469 return; 1470 } 1471 1472 spdk_bdev_get_qos_rate_limits(bdev, limits); 1473 1474 spdk_json_write_object_begin(w); 1475 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1476 1477 spdk_json_write_named_object_begin(w, "params"); 1478 spdk_json_write_named_string(w, "name", bdev->name); 1479 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1480 if (limits[i] > 0) { 1481 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1482 } 1483 } 1484 spdk_json_write_object_end(w); 1485 1486 spdk_json_write_object_end(w); 1487 } 1488 1489 void 1490 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1491 { 1492 struct spdk_bdev_module *bdev_module; 1493 struct spdk_bdev *bdev; 1494 1495 assert(w != NULL); 1496 1497 spdk_json_write_array_begin(w); 1498 1499 spdk_json_write_object_begin(w); 1500 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1501 spdk_json_write_named_object_begin(w, "params"); 1502 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1503 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1504 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1505 spdk_json_write_object_end(w); 1506 spdk_json_write_object_end(w); 1507 1508 bdev_examine_allowlist_config_json(w); 1509 1510 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1511 if (bdev_module->config_json) { 1512 bdev_module->config_json(w); 1513 } 1514 } 1515 1516 spdk_spin_lock(&g_bdev_mgr.spinlock); 1517 1518 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1519 if (bdev->fn_table->write_config_json) { 1520 bdev->fn_table->write_config_json(bdev, w); 1521 } 1522 1523 bdev_qos_config_json(bdev, w); 1524 } 1525 1526 spdk_spin_unlock(&g_bdev_mgr.spinlock); 1527 1528 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1529 spdk_json_write_object_begin(w); 1530 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1531 spdk_json_write_object_end(w); 1532 1533 spdk_json_write_array_end(w); 1534 } 1535 1536 static void 1537 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1538 { 1539 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1540 struct spdk_bdev_io *bdev_io; 1541 1542 spdk_iobuf_channel_fini(&ch->iobuf); 1543 1544 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1545 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1546 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1547 ch->per_thread_cache_count--; 1548 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1549 } 1550 1551 assert(ch->per_thread_cache_count == 0); 1552 } 1553 1554 static int 1555 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1556 { 1557 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1558 
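
/*
 * Illustrative usage sketch (not part of this file): the shape of a bdev
 * module's config_json() callback invoked by spdk_bdev_subsystem_config_json()
 * above. Each module replays the RPCs needed to recreate its state, just as
 * bdev_qos_config_json() replays bdev_set_qos_limit. The method and parameter
 * names here are placeholders.
 */
static void
example_module_config_json(struct spdk_json_write_ctx *w)
{
	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "example_bdev_create");
	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", "Example0");
	spdk_json_write_named_uint32(w, "block_size", 512);
	spdk_json_write_object_end(w);
	spdk_json_write_object_end(w);
}
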
	struct spdk_bdev_io *bdev_io;
	uint32_t i;
	int rc;

	rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", BUF_SMALL_CACHE_SIZE, BUF_LARGE_CACHE_SIZE);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc));
		return -1;
	}

	STAILQ_INIT(&ch->per_thread_cache);
	ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;

	/* Pre-populate bdev_io cache to ensure this thread cannot be starved. */
	ch->per_thread_cache_count = 0;
	for (i = 0; i < ch->bdev_io_cache_size; i++) {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
		if (bdev_io == NULL) {
			SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n");
			assert(false);
			bdev_mgmt_channel_destroy(io_device, ctx_buf);
			return -1;
		}
		ch->per_thread_cache_count++;
		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
	}

	TAILQ_INIT(&ch->shared_resources);
	TAILQ_INIT(&ch->io_wait_queue);

	return 0;
}

static void
bdev_init_complete(int rc)
{
	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
	void *cb_arg = g_init_cb_arg;
	struct spdk_bdev_module *m;

	g_bdev_mgr.init_complete = true;
	g_init_cb_fn = NULL;
	g_init_cb_arg = NULL;

	/*
	 * For modules that need to know when subsystem init is complete,
	 * inform them now.
	 */
	if (rc == 0) {
		TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (m->init_complete) {
				m->init_complete();
			}
		}
	}

	cb_fn(cb_arg, rc);
}

static bool
bdev_module_all_actions_completed(void)
{
	struct spdk_bdev_module *m;

	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (m->internal.action_in_progress > 0) {
			return false;
		}
	}
	return true;
}

static void
bdev_module_action_complete(void)
{
	/*
	 * Don't finish bdev subsystem initialization if
	 * module pre-initialization is still in progress, or
	 * the subsystem has already been initialized.
	 */
	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
		return;
	}

	/*
	 * Check all bdev modules for inits/examinations in progress. If any
	 * exist, return immediately since we cannot finish bdev subsystem
	 * initialization until all are completed.
	 */
	if (!bdev_module_all_actions_completed()) {
		return;
	}

	/*
	 * Modules already finished initialization - now that all
	 * the bdev modules have finished their asynchronous I/O
	 * processing, the entire bdev layer can be marked as complete.
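
/*
 * Illustrative usage sketch (not part of this file): the minimal shape of a
 * bdev module that participates in the init/examine accounting above. Every
 * examine_config()/examine_disk() invocation must eventually be answered with
 * spdk_bdev_module_examine_done(), which is what lets
 * bdev_module_all_actions_completed() return true. All names are placeholders.
 */
static struct spdk_bdev_module example_if;

static int
example_module_init(void)
{
	/* Synchronous init: returning 0 completes this module's init step. */
	return 0;
}

static int
example_module_get_ctx_size(void)
{
	/* Per-I/O context reserved after each spdk_bdev_io; see
	 * bdev_module_get_max_ctx_size() above. */
	return 64;
}

static void
example_module_examine_disk(struct spdk_bdev *bdev)
{
	/* Look at the bdev, optionally claim it or build a virtual bdev on
	 * top of it, then always report completion. */
	spdk_bdev_module_examine_done(&example_if);
}

static struct spdk_bdev_module example_if = {
	.name = "example",
	.module_init = example_module_init,
	.get_ctx_size = example_module_get_ctx_size,
	.examine_disk = example_module_examine_disk,
};

SPDK_BDEV_MODULE_REGISTER(example, &example_if)
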
1655 */ 1656 bdev_init_complete(0); 1657 } 1658 1659 static void 1660 bdev_module_action_done(struct spdk_bdev_module *module) 1661 { 1662 spdk_spin_lock(&module->internal.spinlock); 1663 assert(module->internal.action_in_progress > 0); 1664 module->internal.action_in_progress--; 1665 spdk_spin_unlock(&module->internal.spinlock); 1666 bdev_module_action_complete(); 1667 } 1668 1669 void 1670 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 1671 { 1672 assert(module->async_init); 1673 bdev_module_action_done(module); 1674 } 1675 1676 void 1677 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 1678 { 1679 bdev_module_action_done(module); 1680 } 1681 1682 /** The last initialized bdev module */ 1683 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 1684 1685 static void 1686 bdev_init_failed(void *cb_arg) 1687 { 1688 struct spdk_bdev_module *module = cb_arg; 1689 1690 spdk_spin_lock(&module->internal.spinlock); 1691 assert(module->internal.action_in_progress > 0); 1692 module->internal.action_in_progress--; 1693 spdk_spin_unlock(&module->internal.spinlock); 1694 bdev_init_complete(-1); 1695 } 1696 1697 static int 1698 bdev_modules_init(void) 1699 { 1700 struct spdk_bdev_module *module; 1701 int rc = 0; 1702 1703 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1704 g_resume_bdev_module = module; 1705 if (module->async_init) { 1706 spdk_spin_lock(&module->internal.spinlock); 1707 module->internal.action_in_progress = 1; 1708 spdk_spin_unlock(&module->internal.spinlock); 1709 } 1710 rc = module->module_init(); 1711 if (rc != 0) { 1712 /* Bump action_in_progress to prevent other modules from completion of modules_init 1713 * Send message to defer application shutdown until resources are cleaned up */ 1714 spdk_spin_lock(&module->internal.spinlock); 1715 module->internal.action_in_progress = 1; 1716 spdk_spin_unlock(&module->internal.spinlock); 1717 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 1718 return rc; 1719 } 1720 } 1721 1722 g_resume_bdev_module = NULL; 1723 return 0; 1724 } 1725 1726 void 1727 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 1728 { 1729 int rc = 0; 1730 char mempool_name[32]; 1731 1732 assert(cb_fn != NULL); 1733 1734 g_init_cb_fn = cb_fn; 1735 g_init_cb_arg = cb_arg; 1736 1737 spdk_notify_type_register("bdev_register"); 1738 spdk_notify_type_register("bdev_unregister"); 1739 1740 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 1741 1742 rc = spdk_iobuf_register_module("bdev"); 1743 if (rc != 0) { 1744 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 1745 bdev_init_complete(-1); 1746 return; 1747 } 1748 1749 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 1750 g_bdev_opts.bdev_io_pool_size, 1751 sizeof(struct spdk_bdev_io) + 1752 bdev_module_get_max_ctx_size(), 1753 0, 1754 SPDK_ENV_SOCKET_ID_ANY); 1755 1756 if (g_bdev_mgr.bdev_io_pool == NULL) { 1757 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 1758 bdev_init_complete(-1); 1759 return; 1760 } 1761 1762 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 1763 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 1764 if (!g_bdev_mgr.zero_buffer) { 1765 SPDK_ERRLOG("create bdev zero buffer failed\n"); 1766 bdev_init_complete(-1); 1767 return; 1768 } 1769 1770 #ifdef SPDK_CONFIG_VTUNE 1771 SPDK_LOG_DEPRECATED(vtune_support); 1772 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 1773 #endif 1774 1775 spdk_io_device_register(&g_bdev_mgr, 
bdev_mgmt_channel_create, 1776 bdev_mgmt_channel_destroy, 1777 sizeof(struct spdk_bdev_mgmt_channel), 1778 "bdev_mgr"); 1779 1780 rc = bdev_modules_init(); 1781 g_bdev_mgr.module_init_complete = true; 1782 if (rc != 0) { 1783 SPDK_ERRLOG("bdev modules init failed\n"); 1784 return; 1785 } 1786 1787 bdev_module_action_complete(); 1788 } 1789 1790 static void 1791 bdev_mgr_unregister_cb(void *io_device) 1792 { 1793 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 1794 1795 if (g_bdev_mgr.bdev_io_pool) { 1796 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 1797 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 1798 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 1799 g_bdev_opts.bdev_io_pool_size); 1800 } 1801 1802 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 1803 } 1804 1805 spdk_free(g_bdev_mgr.zero_buffer); 1806 1807 bdev_examine_allowlist_free(); 1808 1809 cb_fn(g_fini_cb_arg); 1810 g_fini_cb_fn = NULL; 1811 g_fini_cb_arg = NULL; 1812 g_bdev_mgr.init_complete = false; 1813 g_bdev_mgr.module_init_complete = false; 1814 } 1815 1816 static void 1817 bdev_module_fini_iter(void *arg) 1818 { 1819 struct spdk_bdev_module *bdev_module; 1820 1821 /* FIXME: Handling initialization failures is broken now, 1822 * so we won't even try cleaning up after successfully 1823 * initialized modules. if module_init_complete is false, 1824 * just call spdk_bdev_mgr_unregister_cb 1825 */ 1826 if (!g_bdev_mgr.module_init_complete) { 1827 bdev_mgr_unregister_cb(NULL); 1828 return; 1829 } 1830 1831 /* Start iterating from the last touched module */ 1832 if (!g_resume_bdev_module) { 1833 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1834 } else { 1835 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 1836 internal.tailq); 1837 } 1838 1839 while (bdev_module) { 1840 if (bdev_module->async_fini) { 1841 /* Save our place so we can resume later. We must 1842 * save the variable here, before calling module_fini() 1843 * below, because in some cases the module may immediately 1844 * call spdk_bdev_module_fini_done() and re-enter 1845 * this function to continue iterating. */ 1846 g_resume_bdev_module = bdev_module; 1847 } 1848 1849 if (bdev_module->module_fini) { 1850 bdev_module->module_fini(); 1851 } 1852 1853 if (bdev_module->async_fini) { 1854 return; 1855 } 1856 1857 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 1858 internal.tailq); 1859 } 1860 1861 g_resume_bdev_module = NULL; 1862 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 1863 } 1864 1865 void 1866 spdk_bdev_module_fini_done(void) 1867 { 1868 if (spdk_get_thread() != g_fini_thread) { 1869 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 1870 } else { 1871 bdev_module_fini_iter(NULL); 1872 } 1873 } 1874 1875 static void 1876 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 1877 { 1878 struct spdk_bdev *bdev = cb_arg; 1879 1880 if (bdeverrno && bdev) { 1881 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 1882 bdev->name); 1883 1884 /* 1885 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 1886 * bdev; try to continue by manually removing this bdev from the list and continue 1887 * with the next bdev in the list. 
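
/*
 * Illustrative usage sketch (not part of this file): how a module with
 * async_fini set cooperates with bdev_module_fini_iter() above. module_fini()
 * kicks off asynchronous teardown and returns; once the module's resources are
 * actually released it calls spdk_bdev_module_fini_done(), which resumes the
 * iteration on the fini thread. Names are placeholders; the corresponding
 * struct spdk_bdev_module would set .module_fini and .async_fini = true.
 */
static void
example_module_teardown_complete(void *ctx)
{
	/* All outstanding resources for this module are now released. */
	spdk_bdev_module_fini_done();
}

static void
example_module_fini(void)
{
	/* Defer completion, e.g. until open descriptors are closed. */
	spdk_thread_send_msg(spdk_get_thread(), example_module_teardown_complete, NULL);
}
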
1888 */ 1889 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 1890 } 1891 1892 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 1893 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 1894 /* 1895 * Bdev module finish need to be deferred as we might be in the middle of some context 1896 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 1897 * after returning. 1898 */ 1899 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 1900 return; 1901 } 1902 1903 /* 1904 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 1905 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 1906 * to detect clean shutdown as opposed to run-time hot removal of the underlying 1907 * base bdevs. 1908 * 1909 * Also, walk the list in the reverse order. 1910 */ 1911 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1912 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1913 spdk_spin_lock(&bdev->internal.spinlock); 1914 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 1915 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 1916 spdk_spin_unlock(&bdev->internal.spinlock); 1917 continue; 1918 } 1919 spdk_spin_unlock(&bdev->internal.spinlock); 1920 1921 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 1922 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 1923 return; 1924 } 1925 1926 /* 1927 * If any bdev fails to unclaim underlying bdev properly, we may face the 1928 * case of bdev list consisting of claimed bdevs only (if claims are managed 1929 * correctly, this would mean there's a loop in the claims graph which is 1930 * clearly impossible). Warn and unregister last bdev on the list then. 1931 */ 1932 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1933 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1934 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 1935 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 1936 return; 1937 } 1938 } 1939 1940 static void 1941 bdev_module_fini_start_iter(void *arg) 1942 { 1943 struct spdk_bdev_module *bdev_module; 1944 1945 if (!g_resume_bdev_module) { 1946 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1947 } else { 1948 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 1949 } 1950 1951 while (bdev_module) { 1952 if (bdev_module->async_fini_start) { 1953 /* Save our place so we can resume later. We must 1954 * save the variable here, before calling fini_start() 1955 * below, because in some cases the module may immediately 1956 * call spdk_bdev_module_fini_start_done() and re-enter 1957 * this function to continue iterating. 
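* All fini_start hooks are driven to completion by this loop before
* bdev_finish_unregister_bdevs_iter() starts tearing down any bdevs.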
*/ 1958 g_resume_bdev_module = bdev_module; 1959 } 1960 1961 if (bdev_module->fini_start) { 1962 bdev_module->fini_start(); 1963 } 1964 1965 if (bdev_module->async_fini_start) { 1966 return; 1967 } 1968 1969 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 1970 } 1971 1972 g_resume_bdev_module = NULL; 1973 1974 bdev_finish_unregister_bdevs_iter(NULL, 0); 1975 } 1976 1977 void 1978 spdk_bdev_module_fini_start_done(void) 1979 { 1980 if (spdk_get_thread() != g_fini_thread) { 1981 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 1982 } else { 1983 bdev_module_fini_start_iter(NULL); 1984 } 1985 } 1986 1987 static void 1988 bdev_finish_wait_for_examine_done(void *cb_arg) 1989 { 1990 bdev_module_fini_start_iter(NULL); 1991 } 1992 1993 void 1994 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 1995 { 1996 int rc; 1997 1998 assert(cb_fn != NULL); 1999 2000 g_fini_thread = spdk_get_thread(); 2001 2002 g_fini_cb_fn = cb_fn; 2003 g_fini_cb_arg = cb_arg; 2004 2005 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2006 if (rc != 0) { 2007 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2008 bdev_finish_wait_for_examine_done(NULL); 2009 } 2010 } 2011 2012 struct spdk_bdev_io * 2013 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2014 { 2015 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2016 struct spdk_bdev_io *bdev_io; 2017 2018 if (ch->per_thread_cache_count > 0) { 2019 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2020 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2021 ch->per_thread_cache_count--; 2022 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2023 /* 2024 * Don't try to look for bdev_ios in the global pool if there are 2025 * waiters on bdev_ios - we don't want this caller to jump the line. 2026 */ 2027 bdev_io = NULL; 2028 } else { 2029 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2030 } 2031 2032 return bdev_io; 2033 } 2034 2035 void 2036 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2037 { 2038 struct spdk_bdev_mgmt_channel *ch; 2039 2040 assert(bdev_io != NULL); 2041 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2042 2043 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2044 2045 if (bdev_io->internal.buf != NULL) { 2046 bdev_io_put_buf(bdev_io); 2047 } 2048 2049 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2050 ch->per_thread_cache_count++; 2051 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2052 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2053 struct spdk_bdev_io_wait_entry *entry; 2054 2055 entry = TAILQ_FIRST(&ch->io_wait_queue); 2056 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2057 entry->cb_fn(entry->cb_arg); 2058 } 2059 } else { 2060 /* We should never have a full cache with entries on the io wait queue. 
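* The loop above drains the wait queue whenever an entry goes back into a
* non-full cache, so a full cache implies the wait queue is already empty.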
*/ 2061 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2062 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2063 } 2064 } 2065 2066 static bool 2067 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2068 { 2069 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2070 2071 switch (limit) { 2072 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2073 return true; 2074 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2075 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2076 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2077 return false; 2078 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2079 default: 2080 return false; 2081 } 2082 } 2083 2084 static bool 2085 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2086 { 2087 switch (bdev_io->type) { 2088 case SPDK_BDEV_IO_TYPE_NVME_IO: 2089 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2090 case SPDK_BDEV_IO_TYPE_READ: 2091 case SPDK_BDEV_IO_TYPE_WRITE: 2092 return true; 2093 case SPDK_BDEV_IO_TYPE_ZCOPY: 2094 if (bdev_io->u.bdev.zcopy.start) { 2095 return true; 2096 } else { 2097 return false; 2098 } 2099 default: 2100 return false; 2101 } 2102 } 2103 2104 static bool 2105 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2106 { 2107 switch (bdev_io->type) { 2108 case SPDK_BDEV_IO_TYPE_NVME_IO: 2109 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2110 /* Bit 1 (0x2) set for read operation */ 2111 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2112 return true; 2113 } else { 2114 return false; 2115 } 2116 case SPDK_BDEV_IO_TYPE_READ: 2117 return true; 2118 case SPDK_BDEV_IO_TYPE_ZCOPY: 2119 /* Populate to read from disk */ 2120 if (bdev_io->u.bdev.zcopy.populate) { 2121 return true; 2122 } else { 2123 return false; 2124 } 2125 default: 2126 return false; 2127 } 2128 } 2129 2130 static uint64_t 2131 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2132 { 2133 struct spdk_bdev *bdev = bdev_io->bdev; 2134 2135 switch (bdev_io->type) { 2136 case SPDK_BDEV_IO_TYPE_NVME_IO: 2137 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2138 return bdev_io->u.nvme_passthru.nbytes; 2139 case SPDK_BDEV_IO_TYPE_READ: 2140 case SPDK_BDEV_IO_TYPE_WRITE: 2141 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2142 case SPDK_BDEV_IO_TYPE_ZCOPY: 2143 /* Track the data in the start phase only */ 2144 if (bdev_io->u.bdev.zcopy.start) { 2145 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2146 } else { 2147 return 0; 2148 } 2149 default: 2150 return 0; 2151 } 2152 } 2153 2154 static bool 2155 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2156 { 2157 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 2158 return true; 2159 } else { 2160 return false; 2161 } 2162 } 2163 2164 static bool 2165 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2166 { 2167 if (bdev_is_read_io(io) == false) { 2168 return false; 2169 } 2170 2171 return bdev_qos_rw_queue_io(limit, io); 2172 } 2173 2174 static bool 2175 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2176 { 2177 if (bdev_is_read_io(io) == true) { 2178 return false; 2179 } 2180 2181 return bdev_qos_rw_queue_io(limit, io); 2182 } 2183 2184 static void 2185 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2186 { 2187 limit->remaining_this_timeslice--; 2188 } 2189 2190 static void 2191 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2192 { 2193 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 2194 } 2195 2196 static void 2197 
bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2198 { 2199 if (bdev_is_read_io(io) == false) { 2200 return; 2201 } 2202 2203 return bdev_qos_rw_bps_update_quota(limit, io); 2204 } 2205 2206 static void 2207 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2208 { 2209 if (bdev_is_read_io(io) == true) { 2210 return; 2211 } 2212 2213 return bdev_qos_rw_bps_update_quota(limit, io); 2214 } 2215 2216 static void 2217 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2218 { 2219 int i; 2220 2221 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2222 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2223 qos->rate_limits[i].queue_io = NULL; 2224 qos->rate_limits[i].update_quota = NULL; 2225 continue; 2226 } 2227 2228 switch (i) { 2229 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2230 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2231 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2232 break; 2233 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2234 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2235 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2236 break; 2237 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2238 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2239 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2240 break; 2241 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2242 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2243 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2244 break; 2245 default: 2246 break; 2247 } 2248 } 2249 } 2250 2251 static void 2252 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2253 struct spdk_bdev_io *bdev_io, 2254 enum spdk_bdev_io_status status) 2255 { 2256 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2257 2258 bdev_io->internal.in_submit_request = true; 2259 bdev_ch->io_outstanding++; 2260 shared_resource->io_outstanding++; 2261 spdk_bdev_io_complete(bdev_io, status); 2262 bdev_io->internal.in_submit_request = false; 2263 } 2264 2265 static inline void 2266 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2267 { 2268 struct spdk_bdev *bdev = bdev_io->bdev; 2269 struct spdk_io_channel *ch = bdev_ch->channel; 2270 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2271 2272 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2273 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2274 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2275 2276 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2277 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2278 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2279 SPDK_BDEV_IO_STATUS_SUCCESS); 2280 return; 2281 } 2282 } 2283 2284 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2285 bdev_io->bdev->split_on_write_unit && 2286 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2287 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2288 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2289 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2290 return; 2291 } 2292 2293 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2294 bdev_ch->io_outstanding++; 2295 shared_resource->io_outstanding++; 2296 bdev_io->internal.in_submit_request = true; 2297 bdev->fn_table->submit_request(ch, bdev_io); 2298 
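/* Note: in_submit_request remains set for the duration of the fn_table call so that
 * I/O completed synchronously from within submit_request() can be recognized as such
 * by the completion path. */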
bdev_io->internal.in_submit_request = false; 2299 } else { 2300 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 2301 } 2302 } 2303 2304 static bool 2305 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2306 { 2307 int i; 2308 2309 if (bdev_qos_io_to_limit(bdev_io) == true) { 2310 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2311 if (!qos->rate_limits[i].queue_io) { 2312 continue; 2313 } 2314 2315 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2316 bdev_io) == true) { 2317 return true; 2318 } 2319 } 2320 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2321 if (!qos->rate_limits[i].update_quota) { 2322 continue; 2323 } 2324 2325 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2326 } 2327 } 2328 2329 return false; 2330 } 2331 2332 static inline void 2333 _bdev_io_do_submit(void *ctx) 2334 { 2335 struct spdk_bdev_io *bdev_io = ctx; 2336 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2337 2338 bdev_io_do_submit(ch, bdev_io); 2339 } 2340 2341 static int 2342 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2343 { 2344 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2345 int submitted_ios = 0; 2346 2347 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2348 if (!bdev_qos_queue_io(qos, bdev_io)) { 2349 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2350 2351 if (bdev_io->internal.io_submit_ch) { 2352 /* Send back the IO to the original thread for the actual processing. */ 2353 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 2354 bdev_io->internal.io_submit_ch = NULL; 2355 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 2356 _bdev_io_do_submit, bdev_io); 2357 } else { 2358 bdev_io_do_submit(ch, bdev_io); 2359 } 2360 2361 submitted_ios++; 2362 } 2363 } 2364 2365 return submitted_ios; 2366 } 2367 2368 static void 2369 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2370 { 2371 int rc; 2372 2373 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2374 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2375 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2376 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2377 &bdev_io->internal.waitq_entry); 2378 if (rc != 0) { 2379 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2380 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2381 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2382 } 2383 } 2384 2385 static bool 2386 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2387 { 2388 uint32_t io_boundary; 2389 struct spdk_bdev *bdev = bdev_io->bdev; 2390 uint32_t max_size = bdev->max_segment_size; 2391 int max_segs = bdev->max_num_segments; 2392 2393 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2394 io_boundary = bdev->write_unit_size; 2395 } else if (bdev->split_on_optimal_io_boundary) { 2396 io_boundary = bdev->optimal_io_boundary; 2397 } else { 2398 io_boundary = 0; 2399 } 2400 2401 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2402 return false; 2403 } 2404 2405 if (io_boundary) { 2406 uint64_t start_stripe, end_stripe; 2407 2408 start_stripe = bdev_io->u.bdev.offset_blocks; 2409 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2410 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
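* For example, with io_boundary = 8 an I/O covering blocks 6..9 maps to
* start_stripe 0 and end_stripe 1 and therefore must be split, while an I/O
* covering blocks 8..15 stays within stripe 1 and is left intact.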
*/ 2411 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2412 start_stripe >>= spdk_u32log2(io_boundary); 2413 end_stripe >>= spdk_u32log2(io_boundary); 2414 } else { 2415 start_stripe /= io_boundary; 2416 end_stripe /= io_boundary; 2417 } 2418 2419 if (start_stripe != end_stripe) { 2420 return true; 2421 } 2422 } 2423 2424 if (max_segs) { 2425 if (bdev_io->u.bdev.iovcnt > max_segs) { 2426 return true; 2427 } 2428 } 2429 2430 if (max_size) { 2431 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2432 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2433 return true; 2434 } 2435 } 2436 } 2437 2438 return false; 2439 } 2440 2441 static bool 2442 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2443 { 2444 uint32_t num_unmap_segments; 2445 2446 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2447 return false; 2448 } 2449 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2450 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2451 return true; 2452 } 2453 2454 return false; 2455 } 2456 2457 static bool 2458 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2459 { 2460 if (!bdev_io->bdev->max_write_zeroes) { 2461 return false; 2462 } 2463 2464 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2465 return true; 2466 } 2467 2468 return false; 2469 } 2470 2471 static bool 2472 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2473 { 2474 if (bdev_io->bdev->max_copy != 0 && 2475 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2476 return true; 2477 } 2478 2479 return false; 2480 } 2481 2482 static bool 2483 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2484 { 2485 switch (bdev_io->type) { 2486 case SPDK_BDEV_IO_TYPE_READ: 2487 case SPDK_BDEV_IO_TYPE_WRITE: 2488 return bdev_rw_should_split(bdev_io); 2489 case SPDK_BDEV_IO_TYPE_UNMAP: 2490 return bdev_unmap_should_split(bdev_io); 2491 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2492 return bdev_write_zeroes_should_split(bdev_io); 2493 case SPDK_BDEV_IO_TYPE_COPY: 2494 return bdev_copy_should_split(bdev_io); 2495 default: 2496 return false; 2497 } 2498 } 2499 2500 static uint32_t 2501 _to_next_boundary(uint64_t offset, uint32_t boundary) 2502 { 2503 return (boundary - (offset % boundary)); 2504 } 2505 2506 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2507 2508 static void _bdev_rw_split(void *_bdev_io); 2509 2510 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2511 2512 static void 2513 _bdev_unmap_split(void *_bdev_io) 2514 { 2515 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2516 } 2517 2518 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2519 2520 static void 2521 _bdev_write_zeroes_split(void *_bdev_io) 2522 { 2523 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2524 } 2525 2526 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 2527 2528 static void 2529 _bdev_copy_split(void *_bdev_io) 2530 { 2531 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 2532 } 2533 2534 static int 2535 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2536 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2537 { 2538 int rc; 2539 uint64_t current_offset, current_remaining, current_src_offset; 2540 spdk_bdev_io_wait_cb io_wait_fn; 2541 2542 current_offset = *offset; 2543 current_remaining = *remaining; 2544 2545 bdev_io->u.bdev.split_outstanding++; 2546 2547 io_wait_fn = 
_bdev_rw_split; 2548 switch (bdev_io->type) { 2549 case SPDK_BDEV_IO_TYPE_READ: 2550 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2551 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2552 iov, iovcnt, md_buf, current_offset, 2553 num_blocks, bdev_io->internal.memory_domain, 2554 bdev_io->internal.memory_domain_ctx, 2555 bdev_io_split_done, bdev_io); 2556 break; 2557 case SPDK_BDEV_IO_TYPE_WRITE: 2558 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2559 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2560 iov, iovcnt, md_buf, current_offset, 2561 num_blocks, bdev_io->internal.memory_domain, 2562 bdev_io->internal.memory_domain_ctx, 2563 bdev_io_split_done, bdev_io); 2564 break; 2565 case SPDK_BDEV_IO_TYPE_UNMAP: 2566 io_wait_fn = _bdev_unmap_split; 2567 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2568 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2569 current_offset, num_blocks, 2570 bdev_io_split_done, bdev_io); 2571 break; 2572 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2573 io_wait_fn = _bdev_write_zeroes_split; 2574 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2575 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2576 current_offset, num_blocks, 2577 bdev_io_split_done, bdev_io); 2578 break; 2579 case SPDK_BDEV_IO_TYPE_COPY: 2580 io_wait_fn = _bdev_copy_split; 2581 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 2582 (current_offset - bdev_io->u.bdev.offset_blocks); 2583 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 2584 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2585 current_offset, current_src_offset, num_blocks, 2586 bdev_io_split_done, bdev_io); 2587 break; 2588 default: 2589 assert(false); 2590 rc = -EINVAL; 2591 break; 2592 } 2593 2594 if (rc == 0) { 2595 current_offset += num_blocks; 2596 current_remaining -= num_blocks; 2597 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2598 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2599 *offset = current_offset; 2600 *remaining = current_remaining; 2601 } else { 2602 bdev_io->u.bdev.split_outstanding--; 2603 if (rc == -ENOMEM) { 2604 if (bdev_io->u.bdev.split_outstanding == 0) { 2605 /* No I/O is outstanding. Hence we should wait here. */ 2606 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2607 } 2608 } else { 2609 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2610 if (bdev_io->u.bdev.split_outstanding == 0) { 2611 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2612 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2613 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2614 } 2615 } 2616 } 2617 2618 return rc; 2619 } 2620 2621 static void 2622 _bdev_rw_split(void *_bdev_io) 2623 { 2624 struct iovec *parent_iov, *iov; 2625 struct spdk_bdev_io *bdev_io = _bdev_io; 2626 struct spdk_bdev *bdev = bdev_io->bdev; 2627 uint64_t parent_offset, current_offset, remaining; 2628 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2629 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2630 uint32_t iovcnt, iov_len, child_iovsize; 2631 uint32_t blocklen = bdev->blocklen; 2632 uint32_t io_boundary; 2633 uint32_t max_segment_size = bdev->max_segment_size; 2634 uint32_t max_child_iovcnt = bdev->max_num_segments; 2635 void *md_buf = NULL; 2636 int rc; 2637 2638 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 2639 max_child_iovcnt = max_child_iovcnt ? 
spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 2640 SPDK_BDEV_IO_NUM_CHILD_IOV; 2641 2642 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2643 io_boundary = bdev->write_unit_size; 2644 } else if (bdev->split_on_optimal_io_boundary) { 2645 io_boundary = bdev->optimal_io_boundary; 2646 } else { 2647 io_boundary = UINT32_MAX; 2648 } 2649 2650 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2651 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 2652 parent_offset = bdev_io->u.bdev.offset_blocks; 2653 parent_iov_offset = (current_offset - parent_offset) * blocklen; 2654 parent_iovcnt = bdev_io->u.bdev.iovcnt; 2655 2656 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 2657 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2658 if (parent_iov_offset < parent_iov->iov_len) { 2659 break; 2660 } 2661 parent_iov_offset -= parent_iov->iov_len; 2662 } 2663 2664 child_iovcnt = 0; 2665 while (remaining > 0 && parent_iovpos < parent_iovcnt && 2666 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 2667 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 2668 to_next_boundary = spdk_min(remaining, to_next_boundary); 2669 to_next_boundary_bytes = to_next_boundary * blocklen; 2670 2671 iov = &bdev_io->child_iov[child_iovcnt]; 2672 iovcnt = 0; 2673 2674 if (bdev_io->u.bdev.md_buf) { 2675 md_buf = (char *)bdev_io->u.bdev.md_buf + 2676 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 2677 } 2678 2679 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 2680 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 2681 iovcnt < child_iovsize) { 2682 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2683 iov_len = parent_iov->iov_len - parent_iov_offset; 2684 2685 iov_len = spdk_min(iov_len, max_segment_size); 2686 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 2687 to_next_boundary_bytes -= iov_len; 2688 2689 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 2690 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 2691 2692 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 2693 parent_iov_offset += iov_len; 2694 } else { 2695 parent_iovpos++; 2696 parent_iov_offset = 0; 2697 } 2698 child_iovcnt++; 2699 iovcnt++; 2700 } 2701 2702 if (to_next_boundary_bytes > 0) { 2703 /* We had to stop this child I/O early because we ran out of 2704 * child_iov space or were limited by max_num_segments. 2705 * Ensure the iovs to be aligned with block size and 2706 * then adjust to_next_boundary before starting the 2707 * child I/O. 2708 */ 2709 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || 2710 iovcnt == child_iovsize); 2711 to_last_block_bytes = to_next_boundary_bytes % blocklen; 2712 if (to_last_block_bytes != 0) { 2713 uint32_t child_iovpos = child_iovcnt - 1; 2714 /* don't decrease child_iovcnt when it equals to SPDK_BDEV_IO_NUM_CHILD_IOV 2715 * so the loop will naturally end 2716 */ 2717 2718 to_last_block_bytes = blocklen - to_last_block_bytes; 2719 to_next_boundary_bytes += to_last_block_bytes; 2720 while (to_last_block_bytes > 0 && iovcnt > 0) { 2721 iov_len = spdk_min(to_last_block_bytes, 2722 bdev_io->child_iov[child_iovpos].iov_len); 2723 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 2724 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 2725 child_iovpos--; 2726 if (--iovcnt == 0) { 2727 /* If the child IO is less than a block size just return. 
2728 * If the first child IO of any split round is less than 2729 * a block size, an error exit. 2730 */ 2731 if (bdev_io->u.bdev.split_outstanding == 0) { 2732 SPDK_ERRLOG("The first child io was less than a block size\n"); 2733 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2734 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2735 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2736 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2737 } 2738 2739 return; 2740 } 2741 } 2742 2743 to_last_block_bytes -= iov_len; 2744 2745 if (parent_iov_offset == 0) { 2746 parent_iovpos--; 2747 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 2748 } 2749 parent_iov_offset -= iov_len; 2750 } 2751 2752 assert(to_last_block_bytes == 0); 2753 } 2754 to_next_boundary -= to_next_boundary_bytes / blocklen; 2755 } 2756 2757 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 2758 ¤t_offset, &remaining); 2759 if (spdk_unlikely(rc)) { 2760 return; 2761 } 2762 } 2763 } 2764 2765 static void 2766 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 2767 { 2768 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 2769 uint32_t num_children_reqs = 0; 2770 int rc; 2771 2772 offset = bdev_io->u.bdev.split_current_offset_blocks; 2773 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2774 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 2775 2776 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2777 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 2778 2779 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 2780 &offset, &remaining); 2781 if (spdk_likely(rc == 0)) { 2782 num_children_reqs++; 2783 } else { 2784 return; 2785 } 2786 } 2787 } 2788 2789 static void 2790 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 2791 { 2792 uint64_t offset, write_zeroes_blocks, remaining; 2793 uint32_t num_children_reqs = 0; 2794 int rc; 2795 2796 offset = bdev_io->u.bdev.split_current_offset_blocks; 2797 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2798 2799 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2800 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 2801 2802 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 2803 &offset, &remaining); 2804 if (spdk_likely(rc == 0)) { 2805 num_children_reqs++; 2806 } else { 2807 return; 2808 } 2809 } 2810 } 2811 2812 static void 2813 bdev_copy_split(struct spdk_bdev_io *bdev_io) 2814 { 2815 uint64_t offset, copy_blocks, remaining; 2816 uint32_t num_children_reqs = 0; 2817 int rc; 2818 2819 offset = bdev_io->u.bdev.split_current_offset_blocks; 2820 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2821 2822 assert(bdev_io->bdev->max_copy != 0); 2823 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 2824 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 2825 2826 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 2827 &offset, &remaining); 2828 if (spdk_likely(rc == 0)) { 2829 num_children_reqs++; 2830 } else { 2831 return; 2832 } 2833 } 2834 } 2835 2836 static void 2837 parent_bdev_io_complete(void *ctx, int rc) 2838 { 2839 struct spdk_bdev_io *parent_io = ctx; 2840 2841 if (rc) { 2842 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2843 } 2844 2845 parent_io->internal.cb(parent_io, 
parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 2846 parent_io->internal.caller_ctx); 2847 } 2848 2849 static void 2850 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 2851 { 2852 struct spdk_bdev_io *parent_io = cb_arg; 2853 2854 spdk_bdev_free_io(bdev_io); 2855 2856 if (!success) { 2857 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2858 /* If any child I/O failed, stop further splitting process. */ 2859 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 2860 parent_io->u.bdev.split_remaining_num_blocks = 0; 2861 } 2862 parent_io->u.bdev.split_outstanding--; 2863 if (parent_io->u.bdev.split_outstanding != 0) { 2864 return; 2865 } 2866 2867 /* 2868 * Parent I/O finishes when all blocks are consumed. 2869 */ 2870 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 2871 assert(parent_io->internal.cb != bdev_io_split_done); 2872 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 2873 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 2874 2875 if (parent_io->internal.orig_iovcnt != 0) { 2876 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 2877 /* bdev IO will be completed in the callback */ 2878 } else { 2879 parent_bdev_io_complete(parent_io, 0); 2880 } 2881 return; 2882 } 2883 2884 /* 2885 * Continue with the splitting process. This function will complete the parent I/O if the 2886 * splitting is done. 2887 */ 2888 switch (parent_io->type) { 2889 case SPDK_BDEV_IO_TYPE_READ: 2890 case SPDK_BDEV_IO_TYPE_WRITE: 2891 _bdev_rw_split(parent_io); 2892 break; 2893 case SPDK_BDEV_IO_TYPE_UNMAP: 2894 bdev_unmap_split(parent_io); 2895 break; 2896 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2897 bdev_write_zeroes_split(parent_io); 2898 break; 2899 case SPDK_BDEV_IO_TYPE_COPY: 2900 bdev_copy_split(parent_io); 2901 break; 2902 default: 2903 assert(false); 2904 break; 2905 } 2906 } 2907 2908 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2909 bool success); 2910 2911 static void 2912 bdev_io_split(struct spdk_bdev_io *bdev_io) 2913 { 2914 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 2915 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 2916 bdev_io->u.bdev.split_outstanding = 0; 2917 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 2918 2919 switch (bdev_io->type) { 2920 case SPDK_BDEV_IO_TYPE_READ: 2921 case SPDK_BDEV_IO_TYPE_WRITE: 2922 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 2923 _bdev_rw_split(bdev_io); 2924 } else { 2925 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 2926 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 2927 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 2928 } 2929 break; 2930 case SPDK_BDEV_IO_TYPE_UNMAP: 2931 bdev_unmap_split(bdev_io); 2932 break; 2933 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2934 bdev_write_zeroes_split(bdev_io); 2935 break; 2936 case SPDK_BDEV_IO_TYPE_COPY: 2937 bdev_copy_split(bdev_io); 2938 break; 2939 default: 2940 assert(false); 2941 break; 2942 } 2943 } 2944 2945 static void 2946 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 2947 { 2948 if (!success) { 2949 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2950 return; 2951 } 2952 2953 _bdev_rw_split(bdev_io); 2954 } 2955 2956 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 2957 * be 
inlined, at least on some compilers. 2958 */ 2959 static inline void 2960 _bdev_io_submit(void *ctx) 2961 { 2962 struct spdk_bdev_io *bdev_io = ctx; 2963 struct spdk_bdev *bdev = bdev_io->bdev; 2964 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2965 2966 if (spdk_likely(bdev_ch->flags == 0)) { 2967 bdev_io_do_submit(bdev_ch, bdev_io); 2968 return; 2969 } 2970 2971 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 2972 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 2973 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 2974 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 2975 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 2976 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2977 } else { 2978 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 2979 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 2980 } 2981 } else { 2982 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 2983 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2984 } 2985 } 2986 2987 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 2988 2989 bool 2990 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 2991 { 2992 if (range1->length == 0 || range2->length == 0) { 2993 return false; 2994 } 2995 2996 if (range1->offset + range1->length <= range2->offset) { 2997 return false; 2998 } 2999 3000 if (range2->offset + range2->length <= range1->offset) { 3001 return false; 3002 } 3003 3004 return true; 3005 } 3006 3007 static bool 3008 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3009 { 3010 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3011 struct lba_range r; 3012 3013 switch (bdev_io->type) { 3014 case SPDK_BDEV_IO_TYPE_NVME_IO: 3015 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3016 /* Don't try to decode the NVMe command - just assume worst-case and that 3017 * it overlaps a locked range. 3018 */ 3019 return true; 3020 case SPDK_BDEV_IO_TYPE_WRITE: 3021 case SPDK_BDEV_IO_TYPE_UNMAP: 3022 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3023 case SPDK_BDEV_IO_TYPE_ZCOPY: 3024 case SPDK_BDEV_IO_TYPE_COPY: 3025 r.offset = bdev_io->u.bdev.offset_blocks; 3026 r.length = bdev_io->u.bdev.num_blocks; 3027 if (!bdev_lba_range_overlapped(range, &r)) { 3028 /* This I/O doesn't overlap the specified LBA range. */ 3029 return false; 3030 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3031 /* This I/O overlaps, but the I/O is on the same channel that locked this 3032 * range, and the caller_ctx is the same as the locked_ctx. This means 3033 * that this I/O is associated with the lock, and is allowed to execute. 
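* Note that plain read I/O are not listed in this switch at all; they fall through
* to the default case and are never held back by a locked range.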
3034 */ 3035 return false; 3036 } else { 3037 return true; 3038 } 3039 default: 3040 return false; 3041 } 3042 } 3043 3044 void 3045 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3046 { 3047 struct spdk_bdev *bdev = bdev_io->bdev; 3048 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 3049 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3050 3051 assert(thread != NULL); 3052 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3053 3054 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3055 struct lba_range *range; 3056 3057 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3058 if (bdev_io_range_is_locked(bdev_io, range)) { 3059 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3060 return; 3061 } 3062 } 3063 } 3064 3065 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 3066 3067 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3068 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 3069 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3070 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3071 spdk_bdev_get_name(bdev)); 3072 3073 if (bdev_io_should_split(bdev_io)) { 3074 bdev_io_split(bdev_io); 3075 return; 3076 } 3077 3078 if (ch->flags & BDEV_CH_QOS_ENABLED) { 3079 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 3080 _bdev_io_submit(bdev_io); 3081 } else { 3082 bdev_io->internal.io_submit_ch = ch; 3083 bdev_io->internal.ch = bdev->internal.qos->ch; 3084 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 3085 } 3086 } else { 3087 _bdev_io_submit(bdev_io); 3088 } 3089 } 3090 3091 static inline void 3092 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3093 { 3094 /* The bdev doesn't support memory domains, so the buffers in this I/O request can't 3095 * be accessed directly; buffers must be allocated before issuing the I/O operation. 3096 * For a write operation we need to pull the buffers from the memory domain before submitting the I/O.
3097 * Once read operation completes, we need to use memory_domain push functionality to 3098 * update data in original memory domain IO buffer 3099 * This IO request will go through a regular IO flow, so clear memory domains pointers */ 3100 bdev_io->u.bdev.memory_domain = NULL; 3101 bdev_io->u.bdev.memory_domain_ctx = NULL; 3102 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3103 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3104 } 3105 3106 static inline void 3107 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3108 { 3109 if (bdev_io->internal.memory_domain && !desc->memory_domains_supported) { 3110 _bdev_io_ext_use_bounce_buffer(bdev_io); 3111 return; 3112 } 3113 3114 bdev_io_submit(bdev_io); 3115 } 3116 3117 static void 3118 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3119 { 3120 struct spdk_bdev *bdev = bdev_io->bdev; 3121 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3122 struct spdk_io_channel *ch = bdev_ch->channel; 3123 3124 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3125 3126 bdev_io->internal.in_submit_request = true; 3127 bdev->fn_table->submit_request(ch, bdev_io); 3128 bdev_io->internal.in_submit_request = false; 3129 } 3130 3131 void 3132 bdev_io_init(struct spdk_bdev_io *bdev_io, 3133 struct spdk_bdev *bdev, void *cb_arg, 3134 spdk_bdev_io_completion_cb cb) 3135 { 3136 bdev_io->bdev = bdev; 3137 bdev_io->internal.caller_ctx = cb_arg; 3138 bdev_io->internal.cb = cb; 3139 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3140 bdev_io->internal.in_submit_request = false; 3141 bdev_io->internal.buf = NULL; 3142 bdev_io->internal.io_submit_ch = NULL; 3143 bdev_io->internal.orig_iovs = NULL; 3144 bdev_io->internal.orig_iovcnt = 0; 3145 bdev_io->internal.orig_md_iov.iov_base = NULL; 3146 bdev_io->internal.error.nvme.cdw0 = 0; 3147 bdev_io->num_retries = 0; 3148 bdev_io->internal.get_buf_cb = NULL; 3149 bdev_io->internal.get_aux_buf_cb = NULL; 3150 bdev_io->internal.memory_domain = NULL; 3151 bdev_io->internal.memory_domain_ctx = NULL; 3152 bdev_io->internal.data_transfer_cpl = NULL; 3153 } 3154 3155 static bool 3156 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3157 { 3158 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3159 } 3160 3161 bool 3162 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3163 { 3164 bool supported; 3165 3166 supported = bdev_io_type_supported(bdev, io_type); 3167 3168 if (!supported) { 3169 switch (io_type) { 3170 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3171 /* The bdev layer will emulate write zeroes as long as write is supported. 
*/ 3172 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3173 break; 3174 default: 3175 break; 3176 } 3177 } 3178 3179 return supported; 3180 } 3181 3182 uint64_t 3183 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3184 { 3185 return bdev_io->internal.submit_tsc; 3186 } 3187 3188 int 3189 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3190 { 3191 if (bdev->fn_table->dump_info_json) { 3192 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3193 } 3194 3195 return 0; 3196 } 3197 3198 static void 3199 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3200 { 3201 uint32_t max_per_timeslice = 0; 3202 int i; 3203 3204 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3205 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3206 qos->rate_limits[i].max_per_timeslice = 0; 3207 continue; 3208 } 3209 3210 max_per_timeslice = qos->rate_limits[i].limit * 3211 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3212 3213 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3214 qos->rate_limits[i].min_per_timeslice); 3215 3216 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 3217 } 3218 3219 bdev_qos_set_ops(qos); 3220 } 3221 3222 static int 3223 bdev_channel_poll_qos(void *arg) 3224 { 3225 struct spdk_bdev_qos *qos = arg; 3226 uint64_t now = spdk_get_ticks(); 3227 int i; 3228 3229 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3230 /* We received our callback earlier than expected - return 3231 * immediately and wait to do accounting until at least one 3232 * timeslice has actually expired. This should never happen 3233 * with a well-behaved timer implementation. 3234 */ 3235 return SPDK_POLLER_IDLE; 3236 } 3237 3238 /* Reset for next round of rate limiting */ 3239 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3240 /* We may have allowed the IOs or bytes to slightly overrun in the last 3241 * timeslice. remaining_this_timeslice is signed, so if it's negative 3242 * here, we'll account for the overrun so that the next timeslice will 3243 * be appropriately reduced. 
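* For example, with a 10 MiB/s byte limit and the 1 ms timeslice, max_per_timeslice is
* 10485760 * 1000 / 1000000 = 10485 bytes. If a 64 KiB I/O was admitted while only 4096
* bytes remained, remaining_this_timeslice ends up at 4096 - 65536 = -61440, so the next
* timeslice starts at -61440 + 10485 = -50955 and the overrun is repaid over the
* following timeslices.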
3244 */ 3245 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 3246 qos->rate_limits[i].remaining_this_timeslice = 0; 3247 } 3248 } 3249 3250 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3251 qos->last_timeslice += qos->timeslice_size; 3252 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3253 qos->rate_limits[i].remaining_this_timeslice += 3254 qos->rate_limits[i].max_per_timeslice; 3255 } 3256 } 3257 3258 return bdev_qos_io_submit(qos->ch, qos); 3259 } 3260 3261 static void 3262 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3263 { 3264 struct spdk_bdev_shared_resource *shared_resource; 3265 struct lba_range *range; 3266 3267 bdev_free_io_stat(ch->stat); 3268 #ifdef SPDK_CONFIG_VTUNE 3269 bdev_free_io_stat(ch->prev_stat); 3270 #endif 3271 3272 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3273 range = TAILQ_FIRST(&ch->locked_ranges); 3274 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3275 free(range); 3276 } 3277 3278 spdk_put_io_channel(ch->channel); 3279 3280 shared_resource = ch->shared_resource; 3281 3282 assert(TAILQ_EMPTY(&ch->io_locked)); 3283 assert(TAILQ_EMPTY(&ch->io_submitted)); 3284 assert(ch->io_outstanding == 0); 3285 assert(shared_resource->ref > 0); 3286 shared_resource->ref--; 3287 if (shared_resource->ref == 0) { 3288 assert(shared_resource->io_outstanding == 0); 3289 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3290 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3291 free(shared_resource); 3292 } 3293 } 3294 3295 static void 3296 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3297 { 3298 struct spdk_bdev_qos *qos = bdev->internal.qos; 3299 int i; 3300 3301 assert(spdk_spin_held(&bdev->internal.spinlock)); 3302 3303 /* Rate limiting on this bdev enabled */ 3304 if (qos) { 3305 if (qos->ch == NULL) { 3306 struct spdk_io_channel *io_ch; 3307 3308 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3309 bdev->name, spdk_get_thread()); 3310 3311 /* No qos channel has been selected, so set one up */ 3312 3313 /* Take another reference to ch */ 3314 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3315 assert(io_ch != NULL); 3316 qos->ch = ch; 3317 3318 qos->thread = spdk_io_channel_get_thread(io_ch); 3319 3320 TAILQ_INIT(&qos->queued); 3321 3322 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3323 if (bdev_qos_is_iops_rate_limit(i) == true) { 3324 qos->rate_limits[i].min_per_timeslice = 3325 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3326 } else { 3327 qos->rate_limits[i].min_per_timeslice = 3328 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3329 } 3330 3331 if (qos->rate_limits[i].limit == 0) { 3332 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3333 } 3334 } 3335 bdev_qos_update_max_quota_per_timeslice(qos); 3336 qos->timeslice_size = 3337 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3338 qos->last_timeslice = spdk_get_ticks(); 3339 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3340 qos, 3341 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3342 } 3343 3344 ch->flags |= BDEV_CH_QOS_ENABLED; 3345 } 3346 } 3347 3348 struct poll_timeout_ctx { 3349 struct spdk_bdev_desc *desc; 3350 uint64_t timeout_in_sec; 3351 spdk_bdev_io_timeout_cb cb_fn; 3352 void *cb_arg; 3353 }; 3354 3355 static void 3356 bdev_desc_free(struct spdk_bdev_desc *desc) 3357 { 3358 spdk_spin_destroy(&desc->spinlock); 3359 free(desc->media_events_buffer); 3360 free(desc); 3361 } 3362 3363 static void 3364 
bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 3365 { 3366 struct poll_timeout_ctx *ctx = _ctx; 3367 struct spdk_bdev_desc *desc = ctx->desc; 3368 3369 free(ctx); 3370 3371 spdk_spin_lock(&desc->spinlock); 3372 desc->refs--; 3373 if (desc->closed == true && desc->refs == 0) { 3374 spdk_spin_unlock(&desc->spinlock); 3375 bdev_desc_free(desc); 3376 return; 3377 } 3378 spdk_spin_unlock(&desc->spinlock); 3379 } 3380 3381 static void 3382 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3383 struct spdk_io_channel *io_ch, void *_ctx) 3384 { 3385 struct poll_timeout_ctx *ctx = _ctx; 3386 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3387 struct spdk_bdev_desc *desc = ctx->desc; 3388 struct spdk_bdev_io *bdev_io; 3389 uint64_t now; 3390 3391 spdk_spin_lock(&desc->spinlock); 3392 if (desc->closed == true) { 3393 spdk_spin_unlock(&desc->spinlock); 3394 spdk_bdev_for_each_channel_continue(i, -1); 3395 return; 3396 } 3397 spdk_spin_unlock(&desc->spinlock); 3398 3399 now = spdk_get_ticks(); 3400 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3401 /* Exclude any I/O that are generated via splitting. */ 3402 if (bdev_io->internal.cb == bdev_io_split_done) { 3403 continue; 3404 } 3405 3406 /* Once we find an I/O that has not timed out, we can immediately 3407 * exit the loop. 3408 */ 3409 if (now < (bdev_io->internal.submit_tsc + 3410 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3411 goto end; 3412 } 3413 3414 if (bdev_io->internal.desc == desc) { 3415 ctx->cb_fn(ctx->cb_arg, bdev_io); 3416 } 3417 } 3418 3419 end: 3420 spdk_bdev_for_each_channel_continue(i, 0); 3421 } 3422 3423 static int 3424 bdev_poll_timeout_io(void *arg) 3425 { 3426 struct spdk_bdev_desc *desc = arg; 3427 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3428 struct poll_timeout_ctx *ctx; 3429 3430 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3431 if (!ctx) { 3432 SPDK_ERRLOG("failed to allocate memory\n"); 3433 return SPDK_POLLER_BUSY; 3434 } 3435 ctx->desc = desc; 3436 ctx->cb_arg = desc->cb_arg; 3437 ctx->cb_fn = desc->cb_fn; 3438 ctx->timeout_in_sec = desc->timeout_in_sec; 3439 3440 /* Take a ref on the descriptor in case it gets closed while we are checking 3441 * all of the channels. 
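* The matching decrement happens in bdev_channel_poll_timeout_io_done(); if the
* descriptor was closed while we were iterating, that callback also performs the
* final bdev_desc_free().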
3442 */ 3443 spdk_spin_lock(&desc->spinlock); 3444 desc->refs++; 3445 spdk_spin_unlock(&desc->spinlock); 3446 3447 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 3448 bdev_channel_poll_timeout_io_done); 3449 3450 return SPDK_POLLER_BUSY; 3451 } 3452 3453 int 3454 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3455 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3456 { 3457 assert(desc->thread == spdk_get_thread()); 3458 3459 spdk_poller_unregister(&desc->io_timeout_poller); 3460 3461 if (timeout_in_sec) { 3462 assert(cb_fn != NULL); 3463 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3464 desc, 3465 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3466 1000); 3467 if (desc->io_timeout_poller == NULL) { 3468 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3469 return -1; 3470 } 3471 } 3472 3473 desc->cb_fn = cb_fn; 3474 desc->cb_arg = cb_arg; 3475 desc->timeout_in_sec = timeout_in_sec; 3476 3477 return 0; 3478 } 3479 3480 static int 3481 bdev_channel_create(void *io_device, void *ctx_buf) 3482 { 3483 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3484 struct spdk_bdev_channel *ch = ctx_buf; 3485 struct spdk_io_channel *mgmt_io_ch; 3486 struct spdk_bdev_mgmt_channel *mgmt_ch; 3487 struct spdk_bdev_shared_resource *shared_resource; 3488 struct lba_range *range; 3489 3490 ch->bdev = bdev; 3491 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3492 if (!ch->channel) { 3493 return -1; 3494 } 3495 3496 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3497 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3498 3499 assert(ch->histogram == NULL); 3500 if (bdev->internal.histogram_enabled) { 3501 ch->histogram = spdk_histogram_data_alloc(); 3502 if (ch->histogram == NULL) { 3503 SPDK_ERRLOG("Could not allocate histogram\n"); 3504 } 3505 } 3506 3507 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3508 if (!mgmt_io_ch) { 3509 spdk_put_io_channel(ch->channel); 3510 return -1; 3511 } 3512 3513 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 3514 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3515 if (shared_resource->shared_ch == ch->channel) { 3516 spdk_put_io_channel(mgmt_io_ch); 3517 shared_resource->ref++; 3518 break; 3519 } 3520 } 3521 3522 if (shared_resource == NULL) { 3523 shared_resource = calloc(1, sizeof(*shared_resource)); 3524 if (shared_resource == NULL) { 3525 spdk_put_io_channel(ch->channel); 3526 spdk_put_io_channel(mgmt_io_ch); 3527 return -1; 3528 } 3529 3530 shared_resource->mgmt_ch = mgmt_ch; 3531 shared_resource->io_outstanding = 0; 3532 TAILQ_INIT(&shared_resource->nomem_io); 3533 shared_resource->nomem_threshold = 0; 3534 shared_resource->shared_ch = ch->channel; 3535 shared_resource->ref = 1; 3536 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3537 } 3538 3539 ch->io_outstanding = 0; 3540 TAILQ_INIT(&ch->queued_resets); 3541 TAILQ_INIT(&ch->locked_ranges); 3542 ch->flags = 0; 3543 ch->shared_resource = shared_resource; 3544 3545 TAILQ_INIT(&ch->io_submitted); 3546 TAILQ_INIT(&ch->io_locked); 3547 3548 ch->stat = bdev_alloc_io_stat(false); 3549 if (ch->stat == NULL) { 3550 bdev_channel_destroy_resource(ch); 3551 return -1; 3552 } 3553 3554 ch->stat->ticks_rate = spdk_get_ticks_hz(); 3555 3556 #ifdef SPDK_CONFIG_VTUNE 3557 { 3558 char *name; 3559 __itt_init_ittlib(NULL, 0); 3560 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3561 if (!name) { 3562 bdev_channel_destroy_resource(ch); 3563 
return -1; 3564 } 3565 ch->handle = __itt_string_handle_create(name); 3566 free(name); 3567 ch->start_tsc = spdk_get_ticks(); 3568 ch->interval_tsc = spdk_get_ticks_hz() / 100; 3569 ch->prev_stat = bdev_alloc_io_stat(false); 3570 if (ch->prev_stat == NULL) { 3571 bdev_channel_destroy_resource(ch); 3572 return -1; 3573 } 3574 } 3575 #endif 3576 3577 spdk_spin_lock(&bdev->internal.spinlock); 3578 bdev_enable_qos(bdev, ch); 3579 3580 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 3581 struct lba_range *new_range; 3582 3583 new_range = calloc(1, sizeof(*new_range)); 3584 if (new_range == NULL) { 3585 spdk_spin_unlock(&bdev->internal.spinlock); 3586 bdev_channel_destroy_resource(ch); 3587 return -1; 3588 } 3589 new_range->length = range->length; 3590 new_range->offset = range->offset; 3591 new_range->locked_ctx = range->locked_ctx; 3592 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 3593 } 3594 3595 spdk_spin_unlock(&bdev->internal.spinlock); 3596 3597 return 0; 3598 } 3599 3600 static int 3601 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 3602 void *cb_ctx) 3603 { 3604 struct spdk_bdev_channel *bdev_ch = cb_ctx; 3605 struct spdk_bdev_io *bdev_io; 3606 uint64_t buf_len; 3607 3608 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 3609 if (bdev_io->internal.ch == bdev_ch) { 3610 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 3611 spdk_iobuf_entry_abort(ch, entry, buf_len); 3612 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3613 } 3614 3615 return 0; 3616 } 3617 3618 /* 3619 * Abort I/O that are waiting on a data buffer. 3620 */ 3621 static void 3622 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 3623 { 3624 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 3625 bdev_abort_all_buf_io_cb, ch); 3626 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 3627 bdev_abort_all_buf_io_cb, ch); 3628 } 3629 3630 /* 3631 * Abort I/O that are queued waiting for submission. These types of I/O are 3632 * linked using the spdk_bdev_io link TAILQ_ENTRY. 3633 */ 3634 static void 3635 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 3636 { 3637 struct spdk_bdev_io *bdev_io, *tmp; 3638 3639 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 3640 if (bdev_io->internal.ch == ch) { 3641 TAILQ_REMOVE(queue, bdev_io, internal.link); 3642 /* 3643 * spdk_bdev_io_complete() assumes that the completed I/O had 3644 * been submitted to the bdev module. Since in this case it 3645 * hadn't, bump io_outstanding to account for the decrement 3646 * that spdk_bdev_io_complete() will do. 
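* Reset I/O are skipped below; their completion is not expected to go through the
* normal io_outstanding accounting.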
3647 */ 3648 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 3649 ch->io_outstanding++; 3650 ch->shared_resource->io_outstanding++; 3651 } 3652 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3653 } 3654 } 3655 } 3656 3657 static bool 3658 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3659 { 3660 struct spdk_bdev_io *bdev_io; 3661 3662 TAILQ_FOREACH(bdev_io, queue, internal.link) { 3663 if (bdev_io == bio_to_abort) { 3664 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 3665 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3666 return true; 3667 } 3668 } 3669 3670 return false; 3671 } 3672 3673 static int 3674 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 3675 { 3676 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 3677 uint64_t buf_len; 3678 3679 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 3680 if (bdev_io == bio_to_abort) { 3681 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 3682 spdk_iobuf_entry_abort(ch, entry, buf_len); 3683 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3684 return 1; 3685 } 3686 3687 return 0; 3688 } 3689 3690 static bool 3691 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 3692 { 3693 int rc; 3694 3695 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 3696 bdev_abort_buf_io_cb, bio_to_abort); 3697 if (rc == 1) { 3698 return true; 3699 } 3700 3701 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 3702 bdev_abort_buf_io_cb, bio_to_abort); 3703 return rc == 1; 3704 } 3705 3706 static void 3707 bdev_qos_channel_destroy(void *cb_arg) 3708 { 3709 struct spdk_bdev_qos *qos = cb_arg; 3710 3711 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 3712 spdk_poller_unregister(&qos->poller); 3713 3714 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 3715 3716 free(qos); 3717 } 3718 3719 static int 3720 bdev_qos_destroy(struct spdk_bdev *bdev) 3721 { 3722 int i; 3723 3724 /* 3725 * Cleanly shutting down the QoS poller is tricky, because 3726 * during the asynchronous operation the user could open 3727 * a new descriptor and create a new channel, spawning 3728 * a new QoS poller. 3729 * 3730 * The strategy is to create a new QoS structure here and swap it 3731 * in. The shutdown path then continues to refer to the old one 3732 * until it completes and then releases it. 3733 */ 3734 struct spdk_bdev_qos *new_qos, *old_qos; 3735 3736 old_qos = bdev->internal.qos; 3737 3738 new_qos = calloc(1, sizeof(*new_qos)); 3739 if (!new_qos) { 3740 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 3741 return -ENOMEM; 3742 } 3743 3744 /* Copy the old QoS data into the newly allocated structure */ 3745 memcpy(new_qos, old_qos, sizeof(*new_qos)); 3746 3747 /* Zero out the key parts of the QoS structure */ 3748 new_qos->ch = NULL; 3749 new_qos->thread = NULL; 3750 new_qos->poller = NULL; 3751 TAILQ_INIT(&new_qos->queued); 3752 /* 3753 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 3754 * It will be used later for the new QoS structure. 
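* Only the runtime state (channel, thread, poller and the per-timeslice counters) is
* cleared below, so the bdev keeps its configured rate limits across the swap.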
3755 */ 3756 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3757 new_qos->rate_limits[i].remaining_this_timeslice = 0; 3758 new_qos->rate_limits[i].min_per_timeslice = 0; 3759 new_qos->rate_limits[i].max_per_timeslice = 0; 3760 } 3761 3762 bdev->internal.qos = new_qos; 3763 3764 if (old_qos->thread == NULL) { 3765 free(old_qos); 3766 } else { 3767 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 3768 } 3769 3770 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 3771 * been destroyed yet. The destruction path will end up waiting for the final 3772 * channel to be put before it releases resources. */ 3773 3774 return 0; 3775 } 3776 3777 void 3778 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 3779 { 3780 total->bytes_read += add->bytes_read; 3781 total->num_read_ops += add->num_read_ops; 3782 total->bytes_written += add->bytes_written; 3783 total->num_write_ops += add->num_write_ops; 3784 total->bytes_unmapped += add->bytes_unmapped; 3785 total->num_unmap_ops += add->num_unmap_ops; 3786 total->bytes_copied += add->bytes_copied; 3787 total->num_copy_ops += add->num_copy_ops; 3788 total->read_latency_ticks += add->read_latency_ticks; 3789 total->write_latency_ticks += add->write_latency_ticks; 3790 total->unmap_latency_ticks += add->unmap_latency_ticks; 3791 total->copy_latency_ticks += add->copy_latency_ticks; 3792 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 3793 total->max_read_latency_ticks = add->max_read_latency_ticks; 3794 } 3795 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 3796 total->min_read_latency_ticks = add->min_read_latency_ticks; 3797 } 3798 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 3799 total->max_write_latency_ticks = add->max_write_latency_ticks; 3800 } 3801 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 3802 total->min_write_latency_ticks = add->min_write_latency_ticks; 3803 } 3804 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 3805 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 3806 } 3807 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 3808 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 3809 } 3810 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 3811 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 3812 } 3813 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 3814 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 3815 } 3816 } 3817 3818 static void 3819 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 3820 { 3821 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 3822 3823 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 3824 memcpy(to_stat->io_error, from_stat->io_error, 3825 sizeof(struct spdk_bdev_io_error_stat)); 3826 } 3827 } 3828 3829 void 3830 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 3831 { 3832 stat->max_read_latency_ticks = 0; 3833 stat->min_read_latency_ticks = UINT64_MAX; 3834 stat->max_write_latency_ticks = 0; 3835 stat->min_write_latency_ticks = UINT64_MAX; 3836 stat->max_unmap_latency_ticks = 0; 3837 stat->min_unmap_latency_ticks = UINT64_MAX; 3838 stat->max_copy_latency_ticks = 0; 3839 stat->min_copy_latency_ticks = UINT64_MAX; 3840 3841 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 3842 return; 3843 } 
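/* SPDK_BDEV_RESET_STAT_ALL additionally clears the cumulative byte/op counters,
 * the latency sums, and (if allocated) the per-status error counts below. */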
3844 3845 stat->bytes_read = 0; 3846 stat->num_read_ops = 0; 3847 stat->bytes_written = 0; 3848 stat->num_write_ops = 0; 3849 stat->bytes_unmapped = 0; 3850 stat->num_unmap_ops = 0; 3851 stat->bytes_copied = 0; 3852 stat->num_copy_ops = 0; 3853 stat->read_latency_ticks = 0; 3854 stat->write_latency_ticks = 0; 3855 stat->unmap_latency_ticks = 0; 3856 stat->copy_latency_ticks = 0; 3857 3858 if (stat->io_error != NULL) { 3859 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 3860 } 3861 } 3862 3863 struct spdk_bdev_io_stat * 3864 bdev_alloc_io_stat(bool io_error_stat) 3865 { 3866 struct spdk_bdev_io_stat *stat; 3867 3868 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 3869 if (stat == NULL) { 3870 return NULL; 3871 } 3872 3873 if (io_error_stat) { 3874 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 3875 if (stat->io_error == NULL) { 3876 free(stat); 3877 return NULL; 3878 } 3879 } else { 3880 stat->io_error = NULL; 3881 } 3882 3883 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 3884 3885 return stat; 3886 } 3887 3888 void 3889 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 3890 { 3891 if (stat != NULL) { 3892 free(stat->io_error); 3893 free(stat); 3894 } 3895 } 3896 3897 void 3898 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 3899 { 3900 int i; 3901 3902 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 3903 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 3904 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 3905 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 3906 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 3907 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 3908 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 3909 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 3910 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 3911 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 3912 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 3913 stat->min_read_latency_ticks != UINT64_MAX ? 3914 stat->min_read_latency_ticks : 0); 3915 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 3916 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 3917 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 3918 stat->min_write_latency_ticks != UINT64_MAX ? 3919 stat->min_write_latency_ticks : 0); 3920 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 3921 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 3922 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 3923 stat->min_unmap_latency_ticks != UINT64_MAX ? 3924 stat->min_unmap_latency_ticks : 0); 3925 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 3926 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 3927 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 3928 stat->min_copy_latency_ticks != UINT64_MAX ? 
stat->min_copy_latency_ticks : 0); 3930
3931 if (stat->io_error != NULL) {
3932 spdk_json_write_named_object_begin(w, "io_error");
3933 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) {
3934 if (stat->io_error->error_status[i] != 0) {
3935 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)),
3936 stat->io_error->error_status[i]);
3937 }
3938 }
3939 spdk_json_write_object_end(w);
3940 }
3941 }
3942
3943 static void
3944 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch)
3945 {
3946 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;
3947 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch;
3948
3949 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch);
3950 bdev_abort_all_buf_io(mgmt_ch, ch);
3952 }
3953
3954 static void
3955 bdev_channel_destroy(void *io_device, void *ctx_buf)
3956 {
3957 struct spdk_bdev_channel *ch = ctx_buf;
3958
3959 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name,
3960 spdk_get_thread());
3961
3962 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name,
3963 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel)));
3964
3965 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */
3966 spdk_spin_lock(&ch->bdev->internal.spinlock);
3967 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat);
3968 spdk_spin_unlock(&ch->bdev->internal.spinlock);
3969
3970 bdev_abort_all_queued_io(&ch->queued_resets, ch);
3971
3972 bdev_channel_abort_queued_ios(ch);
3973
3974 if (ch->histogram) {
3975 spdk_histogram_data_free(ch->histogram);
3976 }
3977
3978 bdev_channel_destroy_resource(ch);
3979 }
3980
3981 /*
3982 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer
3983 * to it. Hence we do not have to call bdev_get_by_name() when using this function.
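* In other words, a non-NULL return from RB_INSERT() means a node with the same
* name is already present and the new entry was not inserted.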
3984 */ 3985 static int 3986 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 3987 { 3988 struct spdk_bdev_name *tmp; 3989 3990 bdev_name->name = strdup(name); 3991 if (bdev_name->name == NULL) { 3992 SPDK_ERRLOG("Unable to allocate bdev name\n"); 3993 return -ENOMEM; 3994 } 3995 3996 bdev_name->bdev = bdev; 3997 3998 spdk_spin_lock(&g_bdev_mgr.spinlock); 3999 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4000 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4001 4002 if (tmp != NULL) { 4003 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4004 free(bdev_name->name); 4005 return -EEXIST; 4006 } 4007 4008 return 0; 4009 } 4010 4011 static void 4012 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4013 { 4014 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4015 free(bdev_name->name); 4016 } 4017 4018 static void 4019 bdev_name_del(struct spdk_bdev_name *bdev_name) 4020 { 4021 spdk_spin_lock(&g_bdev_mgr.spinlock); 4022 bdev_name_del_unsafe(bdev_name); 4023 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4024 } 4025 4026 int 4027 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4028 { 4029 struct spdk_bdev_alias *tmp; 4030 int ret; 4031 4032 if (alias == NULL) { 4033 SPDK_ERRLOG("Empty alias passed\n"); 4034 return -EINVAL; 4035 } 4036 4037 tmp = calloc(1, sizeof(*tmp)); 4038 if (tmp == NULL) { 4039 SPDK_ERRLOG("Unable to allocate alias\n"); 4040 return -ENOMEM; 4041 } 4042 4043 ret = bdev_name_add(&tmp->alias, bdev, alias); 4044 if (ret != 0) { 4045 free(tmp); 4046 return ret; 4047 } 4048 4049 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4050 4051 return 0; 4052 } 4053 4054 static int 4055 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4056 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4057 { 4058 struct spdk_bdev_alias *tmp; 4059 4060 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4061 if (strcmp(alias, tmp->alias.name) == 0) { 4062 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4063 alias_del_fn(&tmp->alias); 4064 free(tmp); 4065 return 0; 4066 } 4067 } 4068 4069 return -ENOENT; 4070 } 4071 4072 int 4073 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4074 { 4075 int rc; 4076 4077 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4078 if (rc == -ENOENT) { 4079 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4080 } 4081 4082 return rc; 4083 } 4084 4085 void 4086 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4087 { 4088 struct spdk_bdev_alias *p, *tmp; 4089 4090 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4091 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4092 bdev_name_del(&p->alias); 4093 free(p); 4094 } 4095 } 4096 4097 struct spdk_io_channel * 4098 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4099 { 4100 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4101 } 4102 4103 void * 4104 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4105 { 4106 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4107 void *ctx = NULL; 4108 4109 if (bdev->fn_table->get_module_ctx) { 4110 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4111 } 4112 4113 return ctx; 4114 } 4115 4116 const char * 4117 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4118 { 4119 return bdev->module->name; 4120 } 4121 4122 const char * 4123 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4124 { 4125 return bdev->name; 4126 } 4127 4128 const char * 4129 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4130 { 4131 return bdev->product_name; 4132 } 4133 4134 
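/*
 * Illustrative sketch, not part of this file's logic: a caller that has just
 * registered a bdev could publish a secondary name with the alias helpers
 * above and later withdraw it. The bdev pointer and the alias string used
 * here are hypothetical.
 *
 *	if (spdk_bdev_alias_add(bdev, "bdev0_alt") == 0) {
 *		// "bdev0_alt" now resolves in the global bdev name tree
 *		spdk_bdev_alias_del(bdev, "bdev0_alt");
 *	}
 */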
const struct spdk_bdev_aliases_list * 4135 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4136 { 4137 return &bdev->aliases; 4138 } 4139 4140 uint32_t 4141 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4142 { 4143 return bdev->blocklen; 4144 } 4145 4146 uint32_t 4147 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4148 { 4149 return bdev->write_unit_size; 4150 } 4151 4152 uint64_t 4153 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4154 { 4155 return bdev->blockcnt; 4156 } 4157 4158 const char * 4159 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4160 { 4161 return qos_rpc_type[type]; 4162 } 4163 4164 void 4165 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4166 { 4167 int i; 4168 4169 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4170 4171 spdk_spin_lock(&bdev->internal.spinlock); 4172 if (bdev->internal.qos) { 4173 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4174 if (bdev->internal.qos->rate_limits[i].limit != 4175 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4176 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4177 if (bdev_qos_is_iops_rate_limit(i) == false) { 4178 /* Change from Byte to Megabyte which is user visible. */ 4179 limits[i] = limits[i] / 1024 / 1024; 4180 } 4181 } 4182 } 4183 } 4184 spdk_spin_unlock(&bdev->internal.spinlock); 4185 } 4186 4187 size_t 4188 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4189 { 4190 return 1 << bdev->required_alignment; 4191 } 4192 4193 uint32_t 4194 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4195 { 4196 return bdev->optimal_io_boundary; 4197 } 4198 4199 bool 4200 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4201 { 4202 return bdev->write_cache; 4203 } 4204 4205 const struct spdk_uuid * 4206 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4207 { 4208 return &bdev->uuid; 4209 } 4210 4211 uint16_t 4212 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4213 { 4214 return bdev->acwu; 4215 } 4216 4217 uint32_t 4218 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4219 { 4220 return bdev->md_len; 4221 } 4222 4223 bool 4224 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4225 { 4226 return (bdev->md_len != 0) && bdev->md_interleave; 4227 } 4228 4229 bool 4230 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4231 { 4232 return (bdev->md_len != 0) && !bdev->md_interleave; 4233 } 4234 4235 bool 4236 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4237 { 4238 return bdev->zoned; 4239 } 4240 4241 uint32_t 4242 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4243 { 4244 if (spdk_bdev_is_md_interleaved(bdev)) { 4245 return bdev->blocklen - bdev->md_len; 4246 } else { 4247 return bdev->blocklen; 4248 } 4249 } 4250 4251 uint32_t 4252 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4253 { 4254 return bdev->phys_blocklen; 4255 } 4256 4257 static uint32_t 4258 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4259 { 4260 if (!spdk_bdev_is_md_interleaved(bdev)) { 4261 return bdev->blocklen + bdev->md_len; 4262 } else { 4263 return bdev->blocklen; 4264 } 4265 } 4266 4267 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4268 typedef enum spdk_dif_type spdk_dif_type_t; 4269 4270 spdk_dif_type_t 4271 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4272 { 4273 if (bdev->md_len != 0) { 4274 return bdev->dif_type; 4275 } else { 4276 return SPDK_DIF_DISABLE; 4277 } 4278 } 4279 4280 bool 4281 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4282 { 4283 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4284 return bdev->dif_is_head_of_md; 4285 } else { 4286 return false; 4287 } 4288 } 4289 4290 bool 4291 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4292 enum spdk_dif_check_type check_type) 4293 { 4294 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4295 return false; 4296 } 4297 4298 switch (check_type) { 4299 case SPDK_DIF_CHECK_TYPE_REFTAG: 4300 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4301 case SPDK_DIF_CHECK_TYPE_APPTAG: 4302 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4303 case SPDK_DIF_CHECK_TYPE_GUARD: 4304 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4305 default: 4306 return false; 4307 } 4308 } 4309 4310 uint32_t 4311 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4312 { 4313 uint64_t alighed_length; 4314 uint64_t max_copy_blocks; 4315 uint64_t temp_max_copy_blocks; 4316 struct spdk_iobuf_opts opts; 4317 4318 if (spdk_bdev_io_type_supported((struct spdk_bdev *)bdev, SPDK_BDEV_IO_TYPE_COPY)) { 4319 return bdev->max_copy; 4320 } else { 4321 spdk_iobuf_get_opts(&opts); 4322 alighed_length = opts.large_bufsize - spdk_bdev_get_buf_align(bdev); 4323 temp_max_copy_blocks = spdk_bdev_is_md_separate(bdev) ? 4324 alighed_length / (bdev->blocklen + bdev->md_len) : 4325 alighed_length / bdev->blocklen; 4326 max_copy_blocks = 1 << spdk_u64log2(temp_max_copy_blocks); 4327 return max_copy_blocks; 4328 } 4329 } 4330 4331 uint64_t 4332 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4333 { 4334 return bdev->internal.measured_queue_depth; 4335 } 4336 4337 uint64_t 4338 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 4339 { 4340 return bdev->internal.period; 4341 } 4342 4343 uint64_t 4344 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 4345 { 4346 return bdev->internal.weighted_io_time; 4347 } 4348 4349 uint64_t 4350 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 4351 { 4352 return bdev->internal.io_time; 4353 } 4354 4355 static void bdev_update_qd_sampling_period(void *ctx); 4356 4357 static void 4358 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 4359 { 4360 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 4361 4362 if (bdev->internal.measured_queue_depth) { 4363 bdev->internal.io_time += bdev->internal.period; 4364 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 4365 } 4366 4367 bdev->internal.qd_poll_in_progress = false; 4368 4369 bdev_update_qd_sampling_period(bdev); 4370 } 4371 4372 static void 4373 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4374 struct spdk_io_channel *io_ch, void *_ctx) 4375 { 4376 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 4377 4378 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4379 spdk_bdev_for_each_channel_continue(i, 0); 4380 } 4381 4382 static int 4383 bdev_calculate_measured_queue_depth(void *ctx) 4384 { 4385 struct spdk_bdev *bdev = ctx; 4386 4387 bdev->internal.qd_poll_in_progress = true; 4388 bdev->internal.temporary_queue_depth = 0; 4389 
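/* Walk every I/O channel to sum its io_outstanding; _calculate_measured_qd_cpl
 * then publishes the total and applies any pending sampling-period change. */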
spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 4390 return SPDK_POLLER_BUSY; 4391 } 4392 4393 static void 4394 bdev_update_qd_sampling_period(void *ctx) 4395 { 4396 struct spdk_bdev *bdev = ctx; 4397 4398 if (bdev->internal.period == bdev->internal.new_period) { 4399 return; 4400 } 4401 4402 if (bdev->internal.qd_poll_in_progress) { 4403 return; 4404 } 4405 4406 bdev->internal.period = bdev->internal.new_period; 4407 4408 spdk_poller_unregister(&bdev->internal.qd_poller); 4409 if (bdev->internal.period != 0) { 4410 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4411 bdev, bdev->internal.period); 4412 } else { 4413 spdk_bdev_close(bdev->internal.qd_desc); 4414 bdev->internal.qd_desc = NULL; 4415 } 4416 } 4417 4418 static void 4419 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4420 { 4421 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4422 } 4423 4424 void 4425 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4426 { 4427 int rc; 4428 4429 if (bdev->internal.new_period == period) { 4430 return; 4431 } 4432 4433 bdev->internal.new_period = period; 4434 4435 if (bdev->internal.qd_desc != NULL) { 4436 assert(bdev->internal.period != 0); 4437 4438 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4439 bdev_update_qd_sampling_period, bdev); 4440 return; 4441 } 4442 4443 assert(bdev->internal.period == 0); 4444 4445 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4446 NULL, &bdev->internal.qd_desc); 4447 if (rc != 0) { 4448 return; 4449 } 4450 4451 bdev->internal.period = period; 4452 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4453 bdev, period); 4454 } 4455 4456 struct bdev_get_current_qd_ctx { 4457 uint64_t current_qd; 4458 spdk_bdev_get_current_qd_cb cb_fn; 4459 void *cb_arg; 4460 }; 4461 4462 static void 4463 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 4464 { 4465 struct bdev_get_current_qd_ctx *ctx = _ctx; 4466 4467 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 4468 4469 free(ctx); 4470 } 4471 4472 static void 4473 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4474 struct spdk_io_channel *io_ch, void *_ctx) 4475 { 4476 struct bdev_get_current_qd_ctx *ctx = _ctx; 4477 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4478 4479 ctx->current_qd += bdev_ch->io_outstanding; 4480 4481 spdk_bdev_for_each_channel_continue(i, 0); 4482 } 4483 4484 void 4485 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 4486 void *cb_arg) 4487 { 4488 struct bdev_get_current_qd_ctx *ctx; 4489 4490 assert(cb_fn != NULL); 4491 4492 ctx = calloc(1, sizeof(*ctx)); 4493 if (ctx == NULL) { 4494 cb_fn(bdev, 0, cb_arg, -ENOMEM); 4495 return; 4496 } 4497 4498 ctx->cb_fn = cb_fn; 4499 ctx->cb_arg = cb_arg; 4500 4501 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 4502 } 4503 4504 static void 4505 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 4506 { 4507 assert(desc->thread == spdk_get_thread()); 4508 4509 spdk_spin_lock(&desc->spinlock); 4510 desc->refs--; 4511 if (!desc->closed) { 4512 spdk_spin_unlock(&desc->spinlock); 4513 desc->callback.event_fn(type, 4514 desc->bdev, 4515 desc->callback.ctx); 4516 return; 4517 } else if (desc->refs == 0) { 4518 /* This descriptor was closed after this event_notify message was sent. 
4519 * spdk_bdev_close() could not free the descriptor since this message was 4520 * in flight, so we free it now using bdev_desc_free(). 4521 */ 4522 spdk_spin_unlock(&desc->spinlock); 4523 bdev_desc_free(desc); 4524 return; 4525 } 4526 spdk_spin_unlock(&desc->spinlock); 4527 } 4528 4529 static void 4530 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 4531 { 4532 spdk_spin_lock(&desc->spinlock); 4533 desc->refs++; 4534 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 4535 spdk_spin_unlock(&desc->spinlock); 4536 } 4537 4538 static void 4539 _resize_notify(void *ctx) 4540 { 4541 struct spdk_bdev_desc *desc = ctx; 4542 4543 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 4544 } 4545 4546 int 4547 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 4548 { 4549 struct spdk_bdev_desc *desc; 4550 int ret; 4551 4552 if (size == bdev->blockcnt) { 4553 return 0; 4554 } 4555 4556 spdk_spin_lock(&bdev->internal.spinlock); 4557 4558 /* bdev has open descriptors */ 4559 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 4560 bdev->blockcnt > size) { 4561 ret = -EBUSY; 4562 } else { 4563 bdev->blockcnt = size; 4564 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 4565 event_notify(desc, _resize_notify); 4566 } 4567 ret = 0; 4568 } 4569 4570 spdk_spin_unlock(&bdev->internal.spinlock); 4571 4572 return ret; 4573 } 4574 4575 /* 4576 * Convert I/O offset and length from bytes to blocks. 4577 * 4578 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 4579 */ 4580 static uint64_t 4581 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 4582 uint64_t num_bytes, uint64_t *num_blocks) 4583 { 4584 uint32_t block_size = bdev->blocklen; 4585 uint8_t shift_cnt; 4586 4587 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
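For example, a 4096-byte block size gives shift_cnt = 12, so offset_bytes = 8192 maps to offset_blocks = 2; any low-order remainder bits are OR-ed into the return value to flag a misaligned request.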
*/ 4588 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 4589 shift_cnt = spdk_u32log2(block_size); 4590 *offset_blocks = offset_bytes >> shift_cnt; 4591 *num_blocks = num_bytes >> shift_cnt; 4592 return (offset_bytes - (*offset_blocks << shift_cnt)) | 4593 (num_bytes - (*num_blocks << shift_cnt)); 4594 } else { 4595 *offset_blocks = offset_bytes / block_size; 4596 *num_blocks = num_bytes / block_size; 4597 return (offset_bytes % block_size) | (num_bytes % block_size); 4598 } 4599 } 4600 4601 static bool 4602 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 4603 { 4604 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 4605 * has been an overflow and hence the offset has been wrapped around */ 4606 if (offset_blocks + num_blocks < offset_blocks) { 4607 return false; 4608 } 4609 4610 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 4611 if (offset_blocks + num_blocks > bdev->blockcnt) { 4612 return false; 4613 } 4614 4615 return true; 4616 } 4617 4618 static void 4619 bdev_seek_complete_cb(void *ctx) 4620 { 4621 struct spdk_bdev_io *bdev_io = ctx; 4622 4623 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4624 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 4625 } 4626 4627 static int 4628 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4629 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 4630 spdk_bdev_io_completion_cb cb, void *cb_arg) 4631 { 4632 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4633 struct spdk_bdev_io *bdev_io; 4634 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4635 4636 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 4637 4638 /* Check if offset_blocks is valid looking at the validity of one block */ 4639 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 4640 return -EINVAL; 4641 } 4642 4643 bdev_io = bdev_channel_get_io(channel); 4644 if (!bdev_io) { 4645 return -ENOMEM; 4646 } 4647 4648 bdev_io->internal.ch = channel; 4649 bdev_io->internal.desc = desc; 4650 bdev_io->type = io_type; 4651 bdev_io->u.bdev.offset_blocks = offset_blocks; 4652 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4653 4654 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 4655 /* In case bdev doesn't support seek to next data/hole offset, 4656 * it is assumed that only data and no holes are present */ 4657 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 4658 bdev_io->u.bdev.seek.offset = offset_blocks; 4659 } else { 4660 bdev_io->u.bdev.seek.offset = UINT64_MAX; 4661 } 4662 4663 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 4664 return 0; 4665 } 4666 4667 bdev_io_submit(bdev_io); 4668 return 0; 4669 } 4670 4671 int 4672 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4673 uint64_t offset_blocks, 4674 spdk_bdev_io_completion_cb cb, void *cb_arg) 4675 { 4676 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 4677 } 4678 4679 int 4680 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4681 uint64_t offset_blocks, 4682 spdk_bdev_io_completion_cb cb, void *cb_arg) 4683 { 4684 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 4685 } 4686 4687 uint64_t 4688 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io) 4689 { 4690 return bdev_io->u.bdev.seek.offset; 4691 } 4692 4693 static int 4694 bdev_read_blocks_with_md(struct 
spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 4695 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4696 spdk_bdev_io_completion_cb cb, void *cb_arg) 4697 { 4698 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4699 struct spdk_bdev_io *bdev_io; 4700 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4701 4702 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4703 return -EINVAL; 4704 } 4705 4706 bdev_io = bdev_channel_get_io(channel); 4707 if (!bdev_io) { 4708 return -ENOMEM; 4709 } 4710 4711 bdev_io->internal.ch = channel; 4712 bdev_io->internal.desc = desc; 4713 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4714 bdev_io->u.bdev.iovs = &bdev_io->iov; 4715 bdev_io->u.bdev.iovs[0].iov_base = buf; 4716 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4717 bdev_io->u.bdev.iovcnt = 1; 4718 bdev_io->u.bdev.md_buf = md_buf; 4719 bdev_io->u.bdev.num_blocks = num_blocks; 4720 bdev_io->u.bdev.offset_blocks = offset_blocks; 4721 bdev_io->u.bdev.memory_domain = NULL; 4722 bdev_io->u.bdev.memory_domain_ctx = NULL; 4723 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4724 4725 bdev_io_submit(bdev_io); 4726 return 0; 4727 } 4728 4729 int 4730 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4731 void *buf, uint64_t offset, uint64_t nbytes, 4732 spdk_bdev_io_completion_cb cb, void *cb_arg) 4733 { 4734 uint64_t offset_blocks, num_blocks; 4735 4736 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4737 nbytes, &num_blocks) != 0) { 4738 return -EINVAL; 4739 } 4740 4741 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4742 } 4743 4744 int 4745 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4746 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4747 spdk_bdev_io_completion_cb cb, void *cb_arg) 4748 { 4749 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 4750 } 4751 4752 int 4753 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4754 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4755 spdk_bdev_io_completion_cb cb, void *cb_arg) 4756 { 4757 struct iovec iov = { 4758 .iov_base = buf, 4759 }; 4760 4761 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4762 return -EINVAL; 4763 } 4764 4765 if (md_buf && !_is_buf_allocated(&iov)) { 4766 return -EINVAL; 4767 } 4768 4769 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4770 cb, cb_arg); 4771 } 4772 4773 int 4774 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4775 struct iovec *iov, int iovcnt, 4776 uint64_t offset, uint64_t nbytes, 4777 spdk_bdev_io_completion_cb cb, void *cb_arg) 4778 { 4779 uint64_t offset_blocks, num_blocks; 4780 4781 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4782 nbytes, &num_blocks) != 0) { 4783 return -EINVAL; 4784 } 4785 4786 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4787 } 4788 4789 static int 4790 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4791 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 4792 uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx, 4793 spdk_bdev_io_completion_cb cb, void *cb_arg) 4794 { 4795 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4796 struct spdk_bdev_io *bdev_io; 4797 struct 
spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4798
4799 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
4800 return -EINVAL;
4801 }
4802
4803 bdev_io = bdev_channel_get_io(channel);
4804 if (!bdev_io) {
4805 return -ENOMEM;
4806 }
4807
4808 bdev_io->internal.ch = channel;
4809 bdev_io->internal.desc = desc;
4810 bdev_io->type = SPDK_BDEV_IO_TYPE_READ;
4811 bdev_io->u.bdev.iovs = iov;
4812 bdev_io->u.bdev.iovcnt = iovcnt;
4813 bdev_io->u.bdev.md_buf = md_buf;
4814 bdev_io->u.bdev.num_blocks = num_blocks;
4815 bdev_io->u.bdev.offset_blocks = offset_blocks;
4816 bdev_io_init(bdev_io, bdev, cb_arg, cb);
4817 bdev_io->internal.memory_domain = domain;
4818 bdev_io->internal.memory_domain_ctx = domain_ctx;
4819 bdev_io->u.bdev.memory_domain = domain;
4820 bdev_io->u.bdev.memory_domain_ctx = domain_ctx;
4821
4822 _bdev_io_submit_ext(desc, bdev_io);
4823
4824 return 0;
4825 }
4826
4827 int
4828 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4829 struct iovec *iov, int iovcnt,
4830 uint64_t offset_blocks, uint64_t num_blocks,
4831 spdk_bdev_io_completion_cb cb, void *cb_arg)
4832 {
4833 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
4834 num_blocks, NULL, NULL, cb, cb_arg);
4835 }
4836
4837 int
4838 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
4839 struct iovec *iov, int iovcnt, void *md_buf,
4840 uint64_t offset_blocks, uint64_t num_blocks,
4841 spdk_bdev_io_completion_cb cb, void *cb_arg)
4842 {
4843 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
4844 return -EINVAL;
4845 }
4846
4847 if (md_buf && !_is_buf_allocated(iov)) {
4848 return -EINVAL;
4849 }
4850
4851 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
4852 num_blocks, NULL, NULL, cb, cb_arg);
4853 }
4854
4855 static inline bool
4856 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov)
4857 {
4858 /*
4859 * Require opts->size to cover at least the members that existed when
4860 * spdk_bdev_ext_io_opts was first introduced (ac6f2bdd8d), since access to
4861 * those members is not otherwise checked internally.
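* For example, an opts structure sized only up to and including the metadata
* member still passes this check; newer members such as memory_domain are then
* read via bdev_get_ext_io_opt(), which is expected to fall back to a default
* when opts->size does not cover them.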
4862 */ 4863 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 4864 sizeof(opts->metadata) && 4865 opts->size <= sizeof(*opts) && 4866 /* When memory domain is used, the user must provide data buffers */ 4867 (!opts->memory_domain || (iov && iov[0].iov_base)); 4868 } 4869 4870 int 4871 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4872 struct iovec *iov, int iovcnt, 4873 uint64_t offset_blocks, uint64_t num_blocks, 4874 spdk_bdev_io_completion_cb cb, void *cb_arg, 4875 struct spdk_bdev_ext_io_opts *opts) 4876 { 4877 void *md = NULL; 4878 4879 if (opts) { 4880 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 4881 return -EINVAL; 4882 } 4883 md = opts->metadata; 4884 } 4885 4886 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4887 return -EINVAL; 4888 } 4889 4890 if (md && !_is_buf_allocated(iov)) { 4891 return -EINVAL; 4892 } 4893 4894 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4895 num_blocks, 4896 bdev_get_ext_io_opt(opts, memory_domain, NULL), 4897 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 4898 cb, cb_arg); 4899 } 4900 4901 static int 4902 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4903 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4904 spdk_bdev_io_completion_cb cb, void *cb_arg) 4905 { 4906 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4907 struct spdk_bdev_io *bdev_io; 4908 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4909 4910 if (!desc->write) { 4911 return -EBADF; 4912 } 4913 4914 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4915 return -EINVAL; 4916 } 4917 4918 bdev_io = bdev_channel_get_io(channel); 4919 if (!bdev_io) { 4920 return -ENOMEM; 4921 } 4922 4923 bdev_io->internal.ch = channel; 4924 bdev_io->internal.desc = desc; 4925 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4926 bdev_io->u.bdev.iovs = &bdev_io->iov; 4927 bdev_io->u.bdev.iovs[0].iov_base = buf; 4928 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4929 bdev_io->u.bdev.iovcnt = 1; 4930 bdev_io->u.bdev.md_buf = md_buf; 4931 bdev_io->u.bdev.num_blocks = num_blocks; 4932 bdev_io->u.bdev.offset_blocks = offset_blocks; 4933 bdev_io->u.bdev.memory_domain = NULL; 4934 bdev_io->u.bdev.memory_domain_ctx = NULL; 4935 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4936 4937 bdev_io_submit(bdev_io); 4938 return 0; 4939 } 4940 4941 int 4942 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4943 void *buf, uint64_t offset, uint64_t nbytes, 4944 spdk_bdev_io_completion_cb cb, void *cb_arg) 4945 { 4946 uint64_t offset_blocks, num_blocks; 4947 4948 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4949 nbytes, &num_blocks) != 0) { 4950 return -EINVAL; 4951 } 4952 4953 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4954 } 4955 4956 int 4957 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4958 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4959 spdk_bdev_io_completion_cb cb, void *cb_arg) 4960 { 4961 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4962 cb, cb_arg); 4963 } 4964 4965 int 4966 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4967 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4968 spdk_bdev_io_completion_cb cb, void *cb_arg) 4969 { 4970 struct iovec iov = { 4971 
.iov_base = buf, 4972 }; 4973 4974 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4975 return -EINVAL; 4976 } 4977 4978 if (md_buf && !_is_buf_allocated(&iov)) { 4979 return -EINVAL; 4980 } 4981 4982 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4983 cb, cb_arg); 4984 } 4985 4986 static int 4987 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4988 struct iovec *iov, int iovcnt, void *md_buf, 4989 uint64_t offset_blocks, uint64_t num_blocks, 4990 struct spdk_memory_domain *domain, void *domain_ctx, 4991 spdk_bdev_io_completion_cb cb, void *cb_arg) 4992 { 4993 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4994 struct spdk_bdev_io *bdev_io; 4995 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4996 4997 if (!desc->write) { 4998 return -EBADF; 4999 } 5000 5001 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5002 return -EINVAL; 5003 } 5004 5005 bdev_io = bdev_channel_get_io(channel); 5006 if (!bdev_io) { 5007 return -ENOMEM; 5008 } 5009 5010 bdev_io->internal.ch = channel; 5011 bdev_io->internal.desc = desc; 5012 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5013 bdev_io->u.bdev.iovs = iov; 5014 bdev_io->u.bdev.iovcnt = iovcnt; 5015 bdev_io->u.bdev.md_buf = md_buf; 5016 bdev_io->u.bdev.num_blocks = num_blocks; 5017 bdev_io->u.bdev.offset_blocks = offset_blocks; 5018 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5019 bdev_io->internal.memory_domain = domain; 5020 bdev_io->internal.memory_domain_ctx = domain_ctx; 5021 bdev_io->u.bdev.memory_domain = domain; 5022 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5023 5024 _bdev_io_submit_ext(desc, bdev_io); 5025 5026 return 0; 5027 } 5028 5029 int 5030 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5031 struct iovec *iov, int iovcnt, 5032 uint64_t offset, uint64_t len, 5033 spdk_bdev_io_completion_cb cb, void *cb_arg) 5034 { 5035 uint64_t offset_blocks, num_blocks; 5036 5037 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5038 len, &num_blocks) != 0) { 5039 return -EINVAL; 5040 } 5041 5042 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5043 } 5044 5045 int 5046 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5047 struct iovec *iov, int iovcnt, 5048 uint64_t offset_blocks, uint64_t num_blocks, 5049 spdk_bdev_io_completion_cb cb, void *cb_arg) 5050 { 5051 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5052 num_blocks, NULL, NULL, cb, cb_arg); 5053 } 5054 5055 int 5056 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5057 struct iovec *iov, int iovcnt, void *md_buf, 5058 uint64_t offset_blocks, uint64_t num_blocks, 5059 spdk_bdev_io_completion_cb cb, void *cb_arg) 5060 { 5061 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5062 return -EINVAL; 5063 } 5064 5065 if (md_buf && !_is_buf_allocated(iov)) { 5066 return -EINVAL; 5067 } 5068 5069 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5070 num_blocks, NULL, NULL, cb, cb_arg); 5071 } 5072 5073 int 5074 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5075 struct iovec *iov, int iovcnt, 5076 uint64_t offset_blocks, uint64_t num_blocks, 5077 spdk_bdev_io_completion_cb cb, void *cb_arg, 5078 struct spdk_bdev_ext_io_opts *opts) 5079 { 5080 void *md = NULL; 5081 5082 if (opts) { 5083 if 
(spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5084 return -EINVAL; 5085 } 5086 md = opts->metadata; 5087 } 5088 5089 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5090 return -EINVAL; 5091 } 5092 5093 if (md && !_is_buf_allocated(iov)) { 5094 return -EINVAL; 5095 } 5096 5097 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5098 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5099 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5100 cb, cb_arg); 5101 } 5102 5103 static void 5104 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5105 { 5106 struct spdk_bdev_io *parent_io = cb_arg; 5107 struct spdk_bdev *bdev = parent_io->bdev; 5108 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5109 int i, rc = 0; 5110 5111 if (!success) { 5112 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5113 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5114 spdk_bdev_free_io(bdev_io); 5115 return; 5116 } 5117 5118 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5119 rc = memcmp(read_buf, 5120 parent_io->u.bdev.iovs[i].iov_base, 5121 parent_io->u.bdev.iovs[i].iov_len); 5122 if (rc) { 5123 break; 5124 } 5125 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5126 } 5127 5128 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 5129 rc = memcmp(bdev_io->u.bdev.md_buf, 5130 parent_io->u.bdev.md_buf, 5131 spdk_bdev_get_md_size(bdev)); 5132 } 5133 5134 spdk_bdev_free_io(bdev_io); 5135 5136 if (rc == 0) { 5137 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5138 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5139 } else { 5140 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5141 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5142 } 5143 } 5144 5145 static void 5146 bdev_compare_do_read(void *_bdev_io) 5147 { 5148 struct spdk_bdev_io *bdev_io = _bdev_io; 5149 int rc; 5150 5151 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5152 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5153 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5154 bdev_compare_do_read_done, bdev_io); 5155 5156 if (rc == -ENOMEM) { 5157 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5158 } else if (rc != 0) { 5159 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5160 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5161 } 5162 } 5163 5164 static int 5165 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5166 struct iovec *iov, int iovcnt, void *md_buf, 5167 uint64_t offset_blocks, uint64_t num_blocks, 5168 spdk_bdev_io_completion_cb cb, void *cb_arg) 5169 { 5170 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5171 struct spdk_bdev_io *bdev_io; 5172 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5173 5174 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5175 return -EINVAL; 5176 } 5177 5178 bdev_io = bdev_channel_get_io(channel); 5179 if (!bdev_io) { 5180 return -ENOMEM; 5181 } 5182 5183 bdev_io->internal.ch = channel; 5184 bdev_io->internal.desc = desc; 5185 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5186 bdev_io->u.bdev.iovs = iov; 5187 bdev_io->u.bdev.iovcnt = iovcnt; 5188 bdev_io->u.bdev.md_buf = md_buf; 5189 bdev_io->u.bdev.num_blocks = num_blocks; 5190 bdev_io->u.bdev.offset_blocks = offset_blocks; 5191 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5192 bdev_io->u.bdev.memory_domain = NULL; 
5193 bdev_io->u.bdev.memory_domain_ctx = NULL; 5194 5195 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5196 bdev_io_submit(bdev_io); 5197 return 0; 5198 } 5199 5200 bdev_compare_do_read(bdev_io); 5201 5202 return 0; 5203 } 5204 5205 int 5206 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5207 struct iovec *iov, int iovcnt, 5208 uint64_t offset_blocks, uint64_t num_blocks, 5209 spdk_bdev_io_completion_cb cb, void *cb_arg) 5210 { 5211 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5212 num_blocks, cb, cb_arg); 5213 } 5214 5215 int 5216 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5217 struct iovec *iov, int iovcnt, void *md_buf, 5218 uint64_t offset_blocks, uint64_t num_blocks, 5219 spdk_bdev_io_completion_cb cb, void *cb_arg) 5220 { 5221 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5222 return -EINVAL; 5223 } 5224 5225 if (md_buf && !_is_buf_allocated(iov)) { 5226 return -EINVAL; 5227 } 5228 5229 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5230 num_blocks, cb, cb_arg); 5231 } 5232 5233 static int 5234 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5235 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5236 spdk_bdev_io_completion_cb cb, void *cb_arg) 5237 { 5238 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5239 struct spdk_bdev_io *bdev_io; 5240 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5241 5242 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5243 return -EINVAL; 5244 } 5245 5246 bdev_io = bdev_channel_get_io(channel); 5247 if (!bdev_io) { 5248 return -ENOMEM; 5249 } 5250 5251 bdev_io->internal.ch = channel; 5252 bdev_io->internal.desc = desc; 5253 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5254 bdev_io->u.bdev.iovs = &bdev_io->iov; 5255 bdev_io->u.bdev.iovs[0].iov_base = buf; 5256 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5257 bdev_io->u.bdev.iovcnt = 1; 5258 bdev_io->u.bdev.md_buf = md_buf; 5259 bdev_io->u.bdev.num_blocks = num_blocks; 5260 bdev_io->u.bdev.offset_blocks = offset_blocks; 5261 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5262 bdev_io->u.bdev.memory_domain = NULL; 5263 bdev_io->u.bdev.memory_domain_ctx = NULL; 5264 5265 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5266 bdev_io_submit(bdev_io); 5267 return 0; 5268 } 5269 5270 bdev_compare_do_read(bdev_io); 5271 5272 return 0; 5273 } 5274 5275 int 5276 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5277 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5278 spdk_bdev_io_completion_cb cb, void *cb_arg) 5279 { 5280 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5281 cb, cb_arg); 5282 } 5283 5284 int 5285 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5286 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5287 spdk_bdev_io_completion_cb cb, void *cb_arg) 5288 { 5289 struct iovec iov = { 5290 .iov_base = buf, 5291 }; 5292 5293 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5294 return -EINVAL; 5295 } 5296 5297 if (md_buf && !_is_buf_allocated(&iov)) { 5298 return -EINVAL; 5299 } 5300 5301 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5302 cb, cb_arg); 5303 } 5304 5305 static void 5306 
bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) 5307 { 5308 struct spdk_bdev_io *bdev_io = ctx; 5309 5310 if (unlock_status) { 5311 SPDK_ERRLOG("LBA range unlock failed\n"); 5312 } 5313 5314 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 5315 false, bdev_io->internal.caller_ctx); 5316 } 5317 5318 static void 5319 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 5320 { 5321 bdev_io->internal.status = status; 5322 5323 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 5324 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5325 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 5326 } 5327 5328 static void 5329 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5330 { 5331 struct spdk_bdev_io *parent_io = cb_arg; 5332 5333 if (!success) { 5334 SPDK_ERRLOG("Compare and write operation failed\n"); 5335 } 5336 5337 spdk_bdev_free_io(bdev_io); 5338 5339 bdev_comparev_and_writev_blocks_unlock(parent_io, 5340 success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 5341 } 5342 5343 static void 5344 bdev_compare_and_write_do_write(void *_bdev_io) 5345 { 5346 struct spdk_bdev_io *bdev_io = _bdev_io; 5347 int rc; 5348 5349 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 5350 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5351 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 5352 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5353 bdev_compare_and_write_do_write_done, bdev_io); 5354 5355 5356 if (rc == -ENOMEM) { 5357 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 5358 } else if (rc != 0) { 5359 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5360 } 5361 } 5362 5363 static void 5364 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5365 { 5366 struct spdk_bdev_io *parent_io = cb_arg; 5367 5368 spdk_bdev_free_io(bdev_io); 5369 5370 if (!success) { 5371 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 5372 return; 5373 } 5374 5375 bdev_compare_and_write_do_write(parent_io); 5376 } 5377 5378 static void 5379 bdev_compare_and_write_do_compare(void *_bdev_io) 5380 { 5381 struct spdk_bdev_io *bdev_io = _bdev_io; 5382 int rc; 5383 5384 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 5385 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 5386 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5387 bdev_compare_and_write_do_compare_done, bdev_io); 5388 5389 if (rc == -ENOMEM) { 5390 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 5391 } else if (rc != 0) { 5392 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 5393 } 5394 } 5395 5396 static void 5397 bdev_comparev_and_writev_blocks_locked(void *ctx, int status) 5398 { 5399 struct spdk_bdev_io *bdev_io = ctx; 5400 5401 if (status) { 5402 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 5403 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5404 return; 5405 } 5406 5407 bdev_compare_and_write_do_compare(bdev_io); 5408 } 5409 5410 int 5411 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5412 struct iovec *compare_iov, int compare_iovcnt, 5413 struct iovec *write_iov, int write_iovcnt, 5414 uint64_t 
offset_blocks, uint64_t num_blocks, 5415 spdk_bdev_io_completion_cb cb, void *cb_arg) 5416 { 5417 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5418 struct spdk_bdev_io *bdev_io; 5419 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5420 5421 if (!desc->write) { 5422 return -EBADF; 5423 } 5424 5425 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5426 return -EINVAL; 5427 } 5428 5429 if (num_blocks > bdev->acwu) { 5430 return -EINVAL; 5431 } 5432 5433 bdev_io = bdev_channel_get_io(channel); 5434 if (!bdev_io) { 5435 return -ENOMEM; 5436 } 5437 5438 bdev_io->internal.ch = channel; 5439 bdev_io->internal.desc = desc; 5440 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 5441 bdev_io->u.bdev.iovs = compare_iov; 5442 bdev_io->u.bdev.iovcnt = compare_iovcnt; 5443 bdev_io->u.bdev.fused_iovs = write_iov; 5444 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 5445 bdev_io->u.bdev.md_buf = NULL; 5446 bdev_io->u.bdev.num_blocks = num_blocks; 5447 bdev_io->u.bdev.offset_blocks = offset_blocks; 5448 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5449 bdev_io->u.bdev.memory_domain = NULL; 5450 bdev_io->u.bdev.memory_domain_ctx = NULL; 5451 5452 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 5453 bdev_io_submit(bdev_io); 5454 return 0; 5455 } 5456 5457 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 5458 bdev_comparev_and_writev_blocks_locked, bdev_io); 5459 } 5460 5461 int 5462 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5463 struct iovec *iov, int iovcnt, 5464 uint64_t offset_blocks, uint64_t num_blocks, 5465 bool populate, 5466 spdk_bdev_io_completion_cb cb, void *cb_arg) 5467 { 5468 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5469 struct spdk_bdev_io *bdev_io; 5470 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5471 5472 if (!desc->write) { 5473 return -EBADF; 5474 } 5475 5476 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5477 return -EINVAL; 5478 } 5479 5480 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 5481 return -ENOTSUP; 5482 } 5483 5484 bdev_io = bdev_channel_get_io(channel); 5485 if (!bdev_io) { 5486 return -ENOMEM; 5487 } 5488 5489 bdev_io->internal.ch = channel; 5490 bdev_io->internal.desc = desc; 5491 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 5492 bdev_io->u.bdev.num_blocks = num_blocks; 5493 bdev_io->u.bdev.offset_blocks = offset_blocks; 5494 bdev_io->u.bdev.iovs = iov; 5495 bdev_io->u.bdev.iovcnt = iovcnt; 5496 bdev_io->u.bdev.md_buf = NULL; 5497 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 5498 bdev_io->u.bdev.zcopy.commit = 0; 5499 bdev_io->u.bdev.zcopy.start = 1; 5500 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5501 bdev_io->u.bdev.memory_domain = NULL; 5502 bdev_io->u.bdev.memory_domain_ctx = NULL; 5503 5504 bdev_io_submit(bdev_io); 5505 5506 return 0; 5507 } 5508 5509 int 5510 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 5511 spdk_bdev_io_completion_cb cb, void *cb_arg) 5512 { 5513 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 5514 return -EINVAL; 5515 } 5516 5517 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 5518 bdev_io->u.bdev.zcopy.start = 0; 5519 bdev_io->internal.caller_ctx = cb_arg; 5520 bdev_io->internal.cb = cb; 5521 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 5522 5523 bdev_io_submit(bdev_io); 5524 5525 return 0; 5526 } 5527 5528 int 5529 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5530 uint64_t offset, uint64_t len, 5531 spdk_bdev_io_completion_cb cb, void *cb_arg) 5532 { 5533 uint64_t offset_blocks, num_blocks; 5534 5535 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5536 len, &num_blocks) != 0) { 5537 return -EINVAL; 5538 } 5539 5540 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5541 } 5542 5543 int 5544 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5545 uint64_t offset_blocks, uint64_t num_blocks, 5546 spdk_bdev_io_completion_cb cb, void *cb_arg) 5547 { 5548 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5549 struct spdk_bdev_io *bdev_io; 5550 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5551 5552 if (!desc->write) { 5553 return -EBADF; 5554 } 5555 5556 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5557 return -EINVAL; 5558 } 5559 5560 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 5561 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 5562 return -ENOTSUP; 5563 } 5564 5565 bdev_io = bdev_channel_get_io(channel); 5566 5567 if (!bdev_io) { 5568 return -ENOMEM; 5569 } 5570 5571 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 5572 bdev_io->internal.ch = channel; 5573 bdev_io->internal.desc = desc; 5574 bdev_io->u.bdev.offset_blocks = offset_blocks; 5575 bdev_io->u.bdev.num_blocks = num_blocks; 5576 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5577 bdev_io->u.bdev.memory_domain = NULL; 5578 bdev_io->u.bdev.memory_domain_ctx = NULL; 5579 5580 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 5581 bdev_io_submit(bdev_io); 5582 return 0; 5583 } 5584 5585 assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); 5586 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 5587 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 5588 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 5589 bdev_write_zero_buffer_next(bdev_io); 5590 5591 return 0; 5592 } 5593 5594 int 5595 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5596 uint64_t offset, uint64_t nbytes, 5597 spdk_bdev_io_completion_cb cb, void *cb_arg) 5598 { 5599 uint64_t offset_blocks, num_blocks; 5600 5601 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5602 nbytes, &num_blocks) != 0) { 5603 return -EINVAL; 5604 } 5605 5606 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5607 } 5608 5609 int 5610 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5611 uint64_t offset_blocks, uint64_t num_blocks, 5612 spdk_bdev_io_completion_cb cb, void *cb_arg) 5613 { 5614 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5615 struct spdk_bdev_io *bdev_io; 5616 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5617 5618 if (!desc->write) { 5619 return -EBADF; 5620 } 5621 5622 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5623 return -EINVAL; 5624 } 5625 5626 if (num_blocks == 0) { 5627 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 5628 return -EINVAL; 5629 } 5630 5631 bdev_io = bdev_channel_get_io(channel); 5632 if (!bdev_io) { 5633 
return -ENOMEM; 5634 } 5635 5636 bdev_io->internal.ch = channel; 5637 bdev_io->internal.desc = desc; 5638 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 5639 5640 bdev_io->u.bdev.iovs = &bdev_io->iov; 5641 bdev_io->u.bdev.iovs[0].iov_base = NULL; 5642 bdev_io->u.bdev.iovs[0].iov_len = 0; 5643 bdev_io->u.bdev.iovcnt = 1; 5644 5645 bdev_io->u.bdev.offset_blocks = offset_blocks; 5646 bdev_io->u.bdev.num_blocks = num_blocks; 5647 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5648 bdev_io->u.bdev.memory_domain = NULL; 5649 bdev_io->u.bdev.memory_domain_ctx = NULL; 5650 5651 bdev_io_submit(bdev_io); 5652 return 0; 5653 } 5654 5655 int 5656 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5657 uint64_t offset, uint64_t length, 5658 spdk_bdev_io_completion_cb cb, void *cb_arg) 5659 { 5660 uint64_t offset_blocks, num_blocks; 5661 5662 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5663 length, &num_blocks) != 0) { 5664 return -EINVAL; 5665 } 5666 5667 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5668 } 5669 5670 int 5671 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5672 uint64_t offset_blocks, uint64_t num_blocks, 5673 spdk_bdev_io_completion_cb cb, void *cb_arg) 5674 { 5675 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5676 struct spdk_bdev_io *bdev_io; 5677 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5678 5679 if (!desc->write) { 5680 return -EBADF; 5681 } 5682 5683 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5684 return -EINVAL; 5685 } 5686 5687 bdev_io = bdev_channel_get_io(channel); 5688 if (!bdev_io) { 5689 return -ENOMEM; 5690 } 5691 5692 bdev_io->internal.ch = channel; 5693 bdev_io->internal.desc = desc; 5694 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 5695 bdev_io->u.bdev.iovs = NULL; 5696 bdev_io->u.bdev.iovcnt = 0; 5697 bdev_io->u.bdev.offset_blocks = offset_blocks; 5698 bdev_io->u.bdev.num_blocks = num_blocks; 5699 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5700 5701 bdev_io_submit(bdev_io); 5702 return 0; 5703 } 5704 5705 static int bdev_reset_poll_for_outstanding_io(void *ctx); 5706 5707 static void 5708 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 5709 { 5710 struct spdk_bdev_channel *ch = _ctx; 5711 struct spdk_bdev_io *bdev_io; 5712 5713 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5714 5715 if (status == -EBUSY) { 5716 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 5717 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 5718 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 5719 } else { 5720 /* If outstanding IOs are still present and reset_io_drain_timeout seconds passed, 5721 * start the reset. */ 5722 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5723 bdev_io_submit_reset(bdev_io); 5724 } 5725 } else { 5726 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5727 SPDK_DEBUGLOG(bdev, 5728 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 5729 ch->bdev->name); 5730 /* Mark the completion status as a SUCCESS and complete the reset. 
*/ 5731 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 5732 } 5733 } 5734 5735 static void 5736 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5737 struct spdk_io_channel *io_ch, void *_ctx) 5738 { 5739 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 5740 int status = 0; 5741 5742 if (cur_ch->io_outstanding > 0) { 5743 /* If a channel has outstanding IO, set status to -EBUSY code. This will stop 5744 * further iteration over the rest of the channels and pass non-zero status 5745 * to the callback function. */ 5746 status = -EBUSY; 5747 } 5748 spdk_bdev_for_each_channel_continue(i, status); 5749 } 5750 5751 static int 5752 bdev_reset_poll_for_outstanding_io(void *ctx) 5753 { 5754 struct spdk_bdev_channel *ch = ctx; 5755 struct spdk_bdev_io *bdev_io; 5756 5757 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5758 5759 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 5760 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 5761 bdev_reset_check_outstanding_io_done); 5762 5763 return SPDK_POLLER_BUSY; 5764 } 5765 5766 static void 5767 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 5768 { 5769 struct spdk_bdev_channel *ch = _ctx; 5770 struct spdk_bdev_io *bdev_io; 5771 5772 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5773 5774 if (bdev->reset_io_drain_timeout == 0) { 5775 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5776 5777 bdev_io_submit_reset(bdev_io); 5778 return; 5779 } 5780 5781 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 5782 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 5783 5784 /* In case bdev->reset_io_drain_timeout is not equal to zero, 5785 * submit the reset to the underlying module only if outstanding I/O 5786 * remain after reset_io_drain_timeout seconds have passed. */ 5787 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 5788 bdev_reset_check_outstanding_io_done); 5789 } 5790 5791 static void 5792 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5793 struct spdk_io_channel *ch, void *_ctx) 5794 { 5795 struct spdk_bdev_channel *channel; 5796 struct spdk_bdev_mgmt_channel *mgmt_channel; 5797 struct spdk_bdev_shared_resource *shared_resource; 5798 bdev_io_tailq_t tmp_queued; 5799 5800 TAILQ_INIT(&tmp_queued); 5801 5802 channel = __io_ch_to_bdev_ch(ch); 5803 shared_resource = channel->shared_resource; 5804 mgmt_channel = shared_resource->mgmt_ch; 5805 5806 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 5807 5808 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 5809 /* The QoS object is always valid and readable while 5810 * the channel flag is set, so the lock here should not 5811 * be necessary. We're not in the fast path though, so 5812 * just take it anyway. 
*/ 5813 spdk_spin_lock(&channel->bdev->internal.spinlock); 5814 if (channel->bdev->internal.qos->ch == channel) { 5815 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 5816 } 5817 spdk_spin_unlock(&channel->bdev->internal.spinlock); 5818 } 5819 5820 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 5821 bdev_abort_all_buf_io(mgmt_channel, channel); 5823 bdev_abort_all_queued_io(&tmp_queued, channel); 5824 5825 spdk_bdev_for_each_channel_continue(i, 0); 5826 } 5827 5828 static void 5829 bdev_start_reset(void *ctx) 5830 { 5831 struct spdk_bdev_channel *ch = ctx; 5832 5833 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, 5834 bdev_reset_freeze_channel_done); 5835 } 5836 5837 static void 5838 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 5839 { 5840 struct spdk_bdev *bdev = ch->bdev; 5841 5842 assert(!TAILQ_EMPTY(&ch->queued_resets)); 5843 5844 spdk_spin_lock(&bdev->internal.spinlock); 5845 if (bdev->internal.reset_in_progress == NULL) { 5846 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 5847 /* 5848 * Take a channel reference for the target bdev for the life of this 5849 * reset. This guards against the channel getting destroyed while 5850 * spdk_bdev_for_each_channel() calls related to this reset IO are in 5851 * progress. We will release the reference when this reset is 5852 * completed. 5853 */ 5854 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 5855 bdev_start_reset(ch); 5856 } 5857 spdk_spin_unlock(&bdev->internal.spinlock); 5858 } 5859 5860 int 5861 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5862 spdk_bdev_io_completion_cb cb, void *cb_arg) 5863 { 5864 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5865 struct spdk_bdev_io *bdev_io; 5866 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5867 5868 bdev_io = bdev_channel_get_io(channel); 5869 if (!bdev_io) { 5870 return -ENOMEM; 5871 } 5872 5873 bdev_io->internal.ch = channel; 5874 bdev_io->internal.desc = desc; 5875 bdev_io->internal.submit_tsc = spdk_get_ticks(); 5876 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 5877 bdev_io->u.reset.ch_ref = NULL; 5878 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5879 5880 spdk_spin_lock(&bdev->internal.spinlock); 5881 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 5882 spdk_spin_unlock(&bdev->internal.spinlock); 5883 5884 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 5885 internal.ch_link); 5886 5887 bdev_channel_start_reset(channel); 5888 5889 return 0; 5890 } 5891 5892 void 5893 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 5894 struct spdk_bdev_io_stat *stat) 5895 { 5896 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5897 5898 bdev_get_io_stat(stat, channel->stat); 5899 } 5900 5901 static void 5902 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 5903 { 5904 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 5905 5906 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 5907 bdev_iostat_ctx->cb_arg, 0); 5908 free(bdev_iostat_ctx); 5909 } 5910 5911 static void 5912 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5913 struct spdk_io_channel *ch, void *_ctx) 5914 { 5915 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 5916 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5917 5918
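	/* Fold this channel's counters into the aggregate owned by the iteration
	 * context; the combined totals are handed to the caller's callback once
	 * every channel has been visited (see bdev_get_device_stat_done() above).
	 */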
spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 5919 spdk_bdev_for_each_channel_continue(i, 0); 5920 } 5921 5922 void 5923 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 5924 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 5925 { 5926 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 5927 5928 assert(bdev != NULL); 5929 assert(stat != NULL); 5930 assert(cb != NULL); 5931 5932 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 5933 if (bdev_iostat_ctx == NULL) { 5934 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 5935 cb(bdev, stat, cb_arg, -ENOMEM); 5936 return; 5937 } 5938 5939 bdev_iostat_ctx->stat = stat; 5940 bdev_iostat_ctx->cb = cb; 5941 bdev_iostat_ctx->cb_arg = cb_arg; 5942 5943 /* Start with the statistics from previously deleted channels. */ 5944 spdk_spin_lock(&bdev->internal.spinlock); 5945 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 5946 spdk_spin_unlock(&bdev->internal.spinlock); 5947 5948 /* Then iterate and add the statistics from each existing channel. */ 5949 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 5950 bdev_get_device_stat_done); 5951 } 5952 5953 struct bdev_iostat_reset_ctx { 5954 enum spdk_bdev_reset_stat_mode mode; 5955 bdev_reset_device_stat_cb cb; 5956 void *cb_arg; 5957 }; 5958 5959 static void 5960 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 5961 { 5962 struct bdev_iostat_reset_ctx *ctx = _ctx; 5963 5964 ctx->cb(bdev, ctx->cb_arg, 0); 5965 5966 free(ctx); 5967 } 5968 5969 static void 5970 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5971 struct spdk_io_channel *ch, void *_ctx) 5972 { 5973 struct bdev_iostat_reset_ctx *ctx = _ctx; 5974 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5975 5976 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 5977 5978 spdk_bdev_for_each_channel_continue(i, 0); 5979 } 5980 5981 void 5982 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 5983 bdev_reset_device_stat_cb cb, void *cb_arg) 5984 { 5985 struct bdev_iostat_reset_ctx *ctx; 5986 5987 assert(bdev != NULL); 5988 assert(cb != NULL); 5989 5990 ctx = calloc(1, sizeof(*ctx)); 5991 if (ctx == NULL) { 5992 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 5993 cb(bdev, cb_arg, -ENOMEM); 5994 return; 5995 } 5996 5997 ctx->mode = mode; 5998 ctx->cb = cb; 5999 ctx->cb_arg = cb_arg; 6000 6001 spdk_spin_lock(&bdev->internal.spinlock); 6002 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6003 spdk_spin_unlock(&bdev->internal.spinlock); 6004 6005 spdk_bdev_for_each_channel(bdev, 6006 bdev_reset_each_channel_stat, 6007 ctx, 6008 bdev_reset_device_stat_done); 6009 } 6010 6011 int 6012 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6013 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6014 spdk_bdev_io_completion_cb cb, void *cb_arg) 6015 { 6016 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6017 struct spdk_bdev_io *bdev_io; 6018 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6019 6020 if (!desc->write) { 6021 return -EBADF; 6022 } 6023 6024 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6025 return -ENOTSUP; 6026 } 6027 6028 bdev_io = bdev_channel_get_io(channel); 6029 if (!bdev_io) { 6030 return -ENOMEM; 6031 } 6032 6033 bdev_io->internal.ch = channel; 6034 bdev_io->internal.desc = desc; 6035 bdev_io->type = 
SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6036 bdev_io->u.nvme_passthru.cmd = *cmd; 6037 bdev_io->u.nvme_passthru.buf = buf; 6038 bdev_io->u.nvme_passthru.nbytes = nbytes; 6039 bdev_io->u.nvme_passthru.md_buf = NULL; 6040 bdev_io->u.nvme_passthru.md_len = 0; 6041 6042 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6043 6044 bdev_io_submit(bdev_io); 6045 return 0; 6046 } 6047 6048 int 6049 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6050 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6051 spdk_bdev_io_completion_cb cb, void *cb_arg) 6052 { 6053 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6054 struct spdk_bdev_io *bdev_io; 6055 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6056 6057 if (!desc->write) { 6058 /* 6059 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6060 * to easily determine if the command is a read or write, but for now just 6061 * do not allow io_passthru with a read-only descriptor. 6062 */ 6063 return -EBADF; 6064 } 6065 6066 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6067 return -ENOTSUP; 6068 } 6069 6070 bdev_io = bdev_channel_get_io(channel); 6071 if (!bdev_io) { 6072 return -ENOMEM; 6073 } 6074 6075 bdev_io->internal.ch = channel; 6076 bdev_io->internal.desc = desc; 6077 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6078 bdev_io->u.nvme_passthru.cmd = *cmd; 6079 bdev_io->u.nvme_passthru.buf = buf; 6080 bdev_io->u.nvme_passthru.nbytes = nbytes; 6081 bdev_io->u.nvme_passthru.md_buf = NULL; 6082 bdev_io->u.nvme_passthru.md_len = 0; 6083 6084 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6085 6086 bdev_io_submit(bdev_io); 6087 return 0; 6088 } 6089 6090 int 6091 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6092 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6093 spdk_bdev_io_completion_cb cb, void *cb_arg) 6094 { 6095 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6096 struct spdk_bdev_io *bdev_io; 6097 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6098 6099 if (!desc->write) { 6100 /* 6101 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6102 * to easily determine if the command is a read or write, but for now just 6103 * do not allow io_passthru with a read-only descriptor. 
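	 * Callers that need to issue NVMe passthru commands must therefore open the
	 * bdev with write set to true in spdk_bdev_open_ext(), even for read-like opcodes.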
6104 */ 6105 return -EBADF; 6106 } 6107 6108 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6109 return -ENOTSUP; 6110 } 6111 6112 bdev_io = bdev_channel_get_io(channel); 6113 if (!bdev_io) { 6114 return -ENOMEM; 6115 } 6116 6117 bdev_io->internal.ch = channel; 6118 bdev_io->internal.desc = desc; 6119 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6120 bdev_io->u.nvme_passthru.cmd = *cmd; 6121 bdev_io->u.nvme_passthru.buf = buf; 6122 bdev_io->u.nvme_passthru.nbytes = nbytes; 6123 bdev_io->u.nvme_passthru.md_buf = md_buf; 6124 bdev_io->u.nvme_passthru.md_len = md_len; 6125 6126 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6127 6128 bdev_io_submit(bdev_io); 6129 return 0; 6130 } 6131 6132 static void bdev_abort_retry(void *ctx); 6133 static void bdev_abort(struct spdk_bdev_io *parent_io); 6134 6135 static void 6136 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6137 { 6138 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6139 struct spdk_bdev_io *parent_io = cb_arg; 6140 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6141 6142 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6143 6144 spdk_bdev_free_io(bdev_io); 6145 6146 if (!success) { 6147 /* Check if the target I/O completed in the meantime. */ 6148 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6149 if (tmp_io == bio_to_abort) { 6150 break; 6151 } 6152 } 6153 6154 /* If the target I/O still exists, set the parent to failed. */ 6155 if (tmp_io != NULL) { 6156 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6157 } 6158 } 6159 6160 parent_io->u.bdev.split_outstanding--; 6161 if (parent_io->u.bdev.split_outstanding == 0) { 6162 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6163 bdev_abort_retry(parent_io); 6164 } else { 6165 bdev_io_complete(parent_io); 6166 } 6167 } 6168 } 6169 6170 static int 6171 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6172 struct spdk_bdev_io *bio_to_abort, 6173 spdk_bdev_io_completion_cb cb, void *cb_arg) 6174 { 6175 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6176 struct spdk_bdev_io *bdev_io; 6177 6178 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 6179 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 6180 /* TODO: Abort reset or abort request. */ 6181 return -ENOTSUP; 6182 } 6183 6184 bdev_io = bdev_channel_get_io(channel); 6185 if (bdev_io == NULL) { 6186 return -ENOMEM; 6187 } 6188 6189 bdev_io->internal.ch = channel; 6190 bdev_io->internal.desc = desc; 6191 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6192 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6193 6194 if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bio_to_abort)) { 6195 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 6196 6197 /* Parent abort request is not submitted directly, but to manage its 6198 * execution add it to the submitted list here. 6199 */ 6200 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6201 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6202 6203 bdev_abort(bdev_io); 6204 6205 return 0; 6206 } 6207 6208 bdev_io->u.abort.bio_to_abort = bio_to_abort; 6209 6210 /* Submit the abort request to the underlying bdev module. 
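	 * The module's submit_request() callback receives a bdev_io of type
	 * SPDK_BDEV_IO_TYPE_ABORT with u.abort.bio_to_abort pointing at the victim I/O.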
*/ 6211 bdev_io_submit(bdev_io); 6212 6213 return 0; 6214 } 6215 6216 static uint32_t 6217 _bdev_abort(struct spdk_bdev_io *parent_io) 6218 { 6219 struct spdk_bdev_desc *desc = parent_io->internal.desc; 6220 struct spdk_bdev_channel *channel = parent_io->internal.ch; 6221 void *bio_cb_arg; 6222 struct spdk_bdev_io *bio_to_abort; 6223 uint32_t matched_ios; 6224 int rc; 6225 6226 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 6227 6228 /* matched_ios is returned and will be kept by the caller. 6229 * 6230 * This function will be used for two cases, 1) the same cb_arg is used for 6231 * multiple I/Os, 2) a single large I/O is split into smaller ones. 6232 * Incrementing split_outstanding directly here may confuse readers especially 6233 * for the 1st case. 6234 * 6235 * Completion of I/O abort is processed after stack unwinding. Hence this trick 6236 * works as expected. 6237 */ 6238 matched_ios = 0; 6239 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6240 6241 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 6242 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 6243 continue; 6244 } 6245 6246 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 6247 /* Any I/O which was submitted after this abort command should be excluded. */ 6248 continue; 6249 } 6250 6251 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 6252 if (rc != 0) { 6253 if (rc == -ENOMEM) { 6254 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 6255 } else { 6256 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6257 } 6258 break; 6259 } 6260 matched_ios++; 6261 } 6262 6263 return matched_ios; 6264 } 6265 6266 static void 6267 bdev_abort_retry(void *ctx) 6268 { 6269 struct spdk_bdev_io *parent_io = ctx; 6270 uint32_t matched_ios; 6271 6272 matched_ios = _bdev_abort(parent_io); 6273 6274 if (matched_ios == 0) { 6275 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6276 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6277 } else { 6278 /* For retry, the case that no target I/O was found is success 6279 * because it means target I/Os completed in the meantime. 6280 */ 6281 bdev_io_complete(parent_io); 6282 } 6283 return; 6284 } 6285 6286 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6287 parent_io->u.bdev.split_outstanding = matched_ios; 6288 } 6289 6290 static void 6291 bdev_abort(struct spdk_bdev_io *parent_io) 6292 { 6293 uint32_t matched_ios; 6294 6295 matched_ios = _bdev_abort(parent_io); 6296 6297 if (matched_ios == 0) { 6298 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6299 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6300 } else { 6301 /* The case the no target I/O was found is failure. */ 6302 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6303 bdev_io_complete(parent_io); 6304 } 6305 return; 6306 } 6307 6308 /* Use split_outstanding to manage the progress of aborting I/Os. 
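	 * Each child abort completion decrements it in bdev_abort_io_done(); the parent
	 * completes (or is retried on -ENOMEM) once the count reaches zero.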
*/ 6309 parent_io->u.bdev.split_outstanding = matched_ios; 6310 } 6311 6312 int 6313 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6314 void *bio_cb_arg, 6315 spdk_bdev_io_completion_cb cb, void *cb_arg) 6316 { 6317 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6318 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6319 struct spdk_bdev_io *bdev_io; 6320 6321 if (bio_cb_arg == NULL) { 6322 return -EINVAL; 6323 } 6324 6325 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 6326 return -ENOTSUP; 6327 } 6328 6329 bdev_io = bdev_channel_get_io(channel); 6330 if (bdev_io == NULL) { 6331 return -ENOMEM; 6332 } 6333 6334 bdev_io->internal.ch = channel; 6335 bdev_io->internal.desc = desc; 6336 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6337 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6338 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6339 6340 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 6341 6342 /* Parent abort request is not submitted directly, but to manage its execution, 6343 * add it to the submitted list here. 6344 */ 6345 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6346 6347 bdev_abort(bdev_io); 6348 6349 return 0; 6350 } 6351 6352 int 6353 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6354 struct spdk_bdev_io_wait_entry *entry) 6355 { 6356 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6357 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 6358 6359 if (bdev != entry->bdev) { 6360 SPDK_ERRLOG("bdevs do not match\n"); 6361 return -EINVAL; 6362 } 6363 6364 if (mgmt_ch->per_thread_cache_count > 0) { 6365 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 6366 return -EINVAL; 6367 } 6368 6369 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 6370 return 0; 6371 } 6372 6373 static inline void 6374 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 6375 { 6376 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 6377 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 6378 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 6379 uint32_t blocklen = bdev_io->bdev->blocklen; 6380 6381 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 6382 switch (bdev_io->type) { 6383 case SPDK_BDEV_IO_TYPE_READ: 6384 io_stat->bytes_read += num_blocks * blocklen; 6385 io_stat->num_read_ops++; 6386 io_stat->read_latency_ticks += tsc_diff; 6387 if (io_stat->max_read_latency_ticks < tsc_diff) { 6388 io_stat->max_read_latency_ticks = tsc_diff; 6389 } 6390 if (io_stat->min_read_latency_ticks > tsc_diff) { 6391 io_stat->min_read_latency_ticks = tsc_diff; 6392 } 6393 break; 6394 case SPDK_BDEV_IO_TYPE_WRITE: 6395 io_stat->bytes_written += num_blocks * blocklen; 6396 io_stat->num_write_ops++; 6397 io_stat->write_latency_ticks += tsc_diff; 6398 if (io_stat->max_write_latency_ticks < tsc_diff) { 6399 io_stat->max_write_latency_ticks = tsc_diff; 6400 } 6401 if (io_stat->min_write_latency_ticks > tsc_diff) { 6402 io_stat->min_write_latency_ticks = tsc_diff; 6403 } 6404 break; 6405 case SPDK_BDEV_IO_TYPE_UNMAP: 6406 io_stat->bytes_unmapped += num_blocks * blocklen; 6407 io_stat->num_unmap_ops++; 6408 io_stat->unmap_latency_ticks += tsc_diff; 6409 if (io_stat->max_unmap_latency_ticks < tsc_diff) { 6410 io_stat->max_unmap_latency_ticks = tsc_diff; 6411 } 6412 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 6413 io_stat->min_unmap_latency_ticks = tsc_diff; 6414 } 6415 break; 6416 
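	/* Zero-copy I/O is charged to the read or write counters below, depending on
	 * whether the start phase populates the buffers (read-like) or the buffers
	 * will be committed at the end phase (write-like). Only the start phase is
	 * tallied, so a start/end pair is not counted twice.
	 */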
case SPDK_BDEV_IO_TYPE_ZCOPY: 6417 /* Track the data in the start phase only */ 6418 if (bdev_io->u.bdev.zcopy.start) { 6419 if (bdev_io->u.bdev.zcopy.populate) { 6420 io_stat->bytes_read += num_blocks * blocklen; 6421 io_stat->num_read_ops++; 6422 io_stat->read_latency_ticks += tsc_diff; 6423 if (io_stat->max_read_latency_ticks < tsc_diff) { 6424 io_stat->max_read_latency_ticks = tsc_diff; 6425 } 6426 if (io_stat->min_read_latency_ticks > tsc_diff) { 6427 io_stat->min_read_latency_ticks = tsc_diff; 6428 } 6429 } else { 6430 io_stat->bytes_written += num_blocks * blocklen; 6431 io_stat->num_write_ops++; 6432 io_stat->write_latency_ticks += tsc_diff; 6433 if (io_stat->max_write_latency_ticks < tsc_diff) { 6434 io_stat->max_write_latency_ticks = tsc_diff; 6435 } 6436 if (io_stat->min_write_latency_ticks > tsc_diff) { 6437 io_stat->min_write_latency_ticks = tsc_diff; 6438 } 6439 } 6440 } 6441 break; 6442 case SPDK_BDEV_IO_TYPE_COPY: 6443 io_stat->bytes_copied += num_blocks * blocklen; 6444 io_stat->num_copy_ops++; 6445 bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; 6446 if (io_stat->max_copy_latency_ticks < tsc_diff) { 6447 io_stat->max_copy_latency_ticks = tsc_diff; 6448 } 6449 if (io_stat->min_copy_latency_ticks > tsc_diff) { 6450 io_stat->min_copy_latency_ticks = tsc_diff; 6451 } 6452 break; 6453 default: 6454 break; 6455 } 6456 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 6457 io_stat = bdev_io->bdev->internal.stat; 6458 assert(io_stat->io_error != NULL); 6459 6460 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 6461 io_stat->io_error->error_status[-io_status - 1]++; 6462 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 6463 } 6464 6465 #ifdef SPDK_CONFIG_VTUNE 6466 uint64_t now_tsc = spdk_get_ticks(); 6467 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 6468 uint64_t data[5]; 6469 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 6470 6471 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 6472 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 6473 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 6474 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 6475 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 6476 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 6477 6478 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 6479 __itt_metadata_u64, 5, data); 6480 6481 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 6482 bdev_io->internal.ch->start_tsc = now_tsc; 6483 } 6484 #endif 6485 } 6486 6487 static inline void 6488 bdev_io_complete(void *ctx) 6489 { 6490 struct spdk_bdev_io *bdev_io = ctx; 6491 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6492 uint64_t tsc, tsc_diff; 6493 6494 if (spdk_unlikely(bdev_io->internal.in_submit_request)) { 6495 /* 6496 * Defer completion to avoid potential infinite recursion if the 6497 * user's completion callback issues a new I/O. 
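		 * The completion is re-driven as a message on the I/O's owning thread, so the
		 * callback always runs from message context rather than nested inside the
		 * submission path.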
6498 */ 6499 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6500 bdev_io_complete, bdev_io); 6501 return; 6502 } 6503 6504 tsc = spdk_get_ticks(); 6505 tsc_diff = tsc - bdev_io->internal.submit_tsc; 6506 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 6507 bdev_io->internal.caller_ctx); 6508 6509 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 6510 6511 if (bdev_io->internal.ch->histogram) { 6512 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 6513 } 6514 6515 bdev_io_update_io_stat(bdev_io, tsc_diff); 6516 6517 assert(bdev_io->internal.cb != NULL); 6518 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 6519 6520 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 6521 bdev_io->internal.caller_ctx); 6522 } 6523 6524 static void bdev_destroy_cb(void *io_device); 6525 6526 static void 6527 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 6528 { 6529 struct spdk_bdev_io *bdev_io = _ctx; 6530 6531 if (bdev_io->u.reset.ch_ref != NULL) { 6532 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 6533 bdev_io->u.reset.ch_ref = NULL; 6534 } 6535 6536 bdev_io_complete(bdev_io); 6537 6538 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 6539 TAILQ_EMPTY(&bdev->internal.open_descs)) { 6540 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6541 } 6542 } 6543 6544 static void 6545 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6546 struct spdk_io_channel *_ch, void *_ctx) 6547 { 6548 struct spdk_bdev_io *bdev_io = _ctx; 6549 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 6550 struct spdk_bdev_io *queued_reset; 6551 6552 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 6553 while (!TAILQ_EMPTY(&ch->queued_resets)) { 6554 queued_reset = TAILQ_FIRST(&ch->queued_resets); 6555 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 6556 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 6557 } 6558 6559 spdk_bdev_for_each_channel_continue(i, 0); 6560 } 6561 6562 void 6563 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 6564 { 6565 struct spdk_bdev *bdev = bdev_io->bdev; 6566 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6567 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 6568 6569 bdev_io->internal.status = status; 6570 6571 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 6572 bool unlock_channels = false; 6573 6574 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 6575 SPDK_ERRLOG("NOMEM returned for reset\n"); 6576 } 6577 spdk_spin_lock(&bdev->internal.spinlock); 6578 if (bdev_io == bdev->internal.reset_in_progress) { 6579 bdev->internal.reset_in_progress = NULL; 6580 unlock_channels = true; 6581 } 6582 spdk_spin_unlock(&bdev->internal.spinlock); 6583 6584 if (unlock_channels) { 6585 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 6586 bdev_reset_complete); 6587 return; 6588 } 6589 } else { 6590 if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0)) { 6591 _bdev_io_push_bounce_data_buffer(bdev_io, _bdev_io_complete_push_bounce_done); 6592 /* bdev IO will be completed in the callback */ 6593 return; 6594 } 6595 6596 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 6597 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 6598 return; 6599 } 6600 } 6601 6602 bdev_io_complete(bdev_io); 6603 } 6604 6605 void 6606 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum 
spdk_scsi_status sc, 6607 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 6608 { 6609 if (sc == SPDK_SCSI_STATUS_GOOD) { 6610 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6611 } else { 6612 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 6613 bdev_io->internal.error.scsi.sc = sc; 6614 bdev_io->internal.error.scsi.sk = sk; 6615 bdev_io->internal.error.scsi.asc = asc; 6616 bdev_io->internal.error.scsi.ascq = ascq; 6617 } 6618 6619 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6620 } 6621 6622 void 6623 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 6624 int *sc, int *sk, int *asc, int *ascq) 6625 { 6626 assert(sc != NULL); 6627 assert(sk != NULL); 6628 assert(asc != NULL); 6629 assert(ascq != NULL); 6630 6631 switch (bdev_io->internal.status) { 6632 case SPDK_BDEV_IO_STATUS_SUCCESS: 6633 *sc = SPDK_SCSI_STATUS_GOOD; 6634 *sk = SPDK_SCSI_SENSE_NO_SENSE; 6635 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 6636 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 6637 break; 6638 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 6639 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 6640 break; 6641 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 6642 *sc = bdev_io->internal.error.scsi.sc; 6643 *sk = bdev_io->internal.error.scsi.sk; 6644 *asc = bdev_io->internal.error.scsi.asc; 6645 *ascq = bdev_io->internal.error.scsi.ascq; 6646 break; 6647 default: 6648 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 6649 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 6650 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 6651 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 6652 break; 6653 } 6654 } 6655 6656 void 6657 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 6658 { 6659 if (aio_result == 0) { 6660 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6661 } else { 6662 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 6663 } 6664 6665 bdev_io->internal.error.aio_result = aio_result; 6666 6667 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6668 } 6669 6670 void 6671 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 6672 { 6673 assert(aio_result != NULL); 6674 6675 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 6676 *aio_result = bdev_io->internal.error.aio_result; 6677 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6678 *aio_result = 0; 6679 } else { 6680 *aio_result = -EIO; 6681 } 6682 } 6683 6684 void 6685 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 6686 { 6687 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 6688 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6689 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 6690 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 6691 } else { 6692 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 6693 } 6694 6695 bdev_io->internal.error.nvme.cdw0 = cdw0; 6696 bdev_io->internal.error.nvme.sct = sct; 6697 bdev_io->internal.error.nvme.sc = sc; 6698 6699 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6700 } 6701 6702 void 6703 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 6704 { 6705 assert(sct != NULL); 6706 assert(sc != NULL); 6707 assert(cdw0 != NULL); 6708 6709 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 6710 *sct = SPDK_NVME_SCT_GENERIC; 6711 *sc = SPDK_NVME_SC_SUCCESS; 6712 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) 
{ 6713 *cdw0 = 0; 6714 } else { 6715 *cdw0 = 1U; 6716 } 6717 return; 6718 } 6719 6720 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 6721 *sct = bdev_io->internal.error.nvme.sct; 6722 *sc = bdev_io->internal.error.nvme.sc; 6723 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6724 *sct = SPDK_NVME_SCT_GENERIC; 6725 *sc = SPDK_NVME_SC_SUCCESS; 6726 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 6727 *sct = SPDK_NVME_SCT_GENERIC; 6728 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6729 } else { 6730 *sct = SPDK_NVME_SCT_GENERIC; 6731 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6732 } 6733 6734 *cdw0 = bdev_io->internal.error.nvme.cdw0; 6735 } 6736 6737 void 6738 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 6739 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 6740 { 6741 assert(first_sct != NULL); 6742 assert(first_sc != NULL); 6743 assert(second_sct != NULL); 6744 assert(second_sc != NULL); 6745 assert(cdw0 != NULL); 6746 6747 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 6748 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 6749 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 6750 *first_sct = bdev_io->internal.error.nvme.sct; 6751 *first_sc = bdev_io->internal.error.nvme.sc; 6752 *second_sct = SPDK_NVME_SCT_GENERIC; 6753 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6754 } else { 6755 *first_sct = SPDK_NVME_SCT_GENERIC; 6756 *first_sc = SPDK_NVME_SC_SUCCESS; 6757 *second_sct = bdev_io->internal.error.nvme.sct; 6758 *second_sc = bdev_io->internal.error.nvme.sc; 6759 } 6760 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 6761 *first_sct = SPDK_NVME_SCT_GENERIC; 6762 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6763 *second_sct = SPDK_NVME_SCT_GENERIC; 6764 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6765 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6766 *first_sct = SPDK_NVME_SCT_GENERIC; 6767 *first_sc = SPDK_NVME_SC_SUCCESS; 6768 *second_sct = SPDK_NVME_SCT_GENERIC; 6769 *second_sc = SPDK_NVME_SC_SUCCESS; 6770 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 6771 *first_sct = SPDK_NVME_SCT_GENERIC; 6772 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6773 *second_sct = SPDK_NVME_SCT_GENERIC; 6774 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6775 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 6776 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 6777 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 6778 *second_sct = SPDK_NVME_SCT_GENERIC; 6779 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6780 } else { 6781 *first_sct = SPDK_NVME_SCT_GENERIC; 6782 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6783 *second_sct = SPDK_NVME_SCT_GENERIC; 6784 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6785 } 6786 6787 *cdw0 = bdev_io->internal.error.nvme.cdw0; 6788 } 6789 6790 struct spdk_thread * 6791 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 6792 { 6793 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 6794 } 6795 6796 struct spdk_io_channel * 6797 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 6798 { 6799 return bdev_io->internal.ch->channel; 6800 } 6801 6802 static int 6803 bdev_register(struct spdk_bdev *bdev) 6804 { 6805 char *bdev_name; 6806 char uuid[SPDK_UUID_STRING_LEN]; 6807 int ret; 6808 6809 assert(bdev->module != NULL); 6810 6811 if (!bdev->name) { 6812 SPDK_ERRLOG("Bdev name is 
NULL\n"); 6813 return -EINVAL; 6814 } 6815 6816 if (!strlen(bdev->name)) { 6817 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 6818 return -EINVAL; 6819 } 6820 6821 /* Users often register their own I/O devices using the bdev name. In 6822 * order to avoid conflicts, prepend bdev_. */ 6823 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 6824 if (!bdev_name) { 6825 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 6826 return -ENOMEM; 6827 } 6828 6829 bdev->internal.stat = bdev_alloc_io_stat(true); 6830 if (!bdev->internal.stat) { 6831 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 6832 free(bdev_name); 6833 return -ENOMEM; 6834 } 6835 6836 bdev->internal.status = SPDK_BDEV_STATUS_READY; 6837 bdev->internal.measured_queue_depth = UINT64_MAX; 6838 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 6839 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 6840 bdev->internal.qd_poller = NULL; 6841 bdev->internal.qos = NULL; 6842 6843 TAILQ_INIT(&bdev->internal.open_descs); 6844 TAILQ_INIT(&bdev->internal.locked_ranges); 6845 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 6846 TAILQ_INIT(&bdev->aliases); 6847 6848 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 6849 if (ret != 0) { 6850 bdev_free_io_stat(bdev->internal.stat); 6851 free(bdev_name); 6852 return ret; 6853 } 6854 6855 /* UUID has to be specified by the user or defined by bdev itself. 6856 * Otherwise this field must remain empty, to indicate that this 6857 * value cannot be depended upon. */ 6858 if (!spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { 6859 /* Add the UUID alias only if it's different than the name */ 6860 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 6861 if (strcmp(bdev->name, uuid) != 0) { 6862 ret = spdk_bdev_alias_add(bdev, uuid); 6863 if (ret != 0) { 6864 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 6865 bdev_name_del(&bdev->internal.bdev_name); 6866 bdev_free_io_stat(bdev->internal.stat); 6867 free(bdev_name); 6868 return ret; 6869 } 6870 } 6871 } 6872 6873 if (spdk_bdev_get_buf_align(bdev) > 1) { 6874 if (bdev->split_on_optimal_io_boundary) { 6875 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 6876 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 6877 } else { 6878 bdev->split_on_optimal_io_boundary = true; 6879 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 6880 } 6881 } 6882 6883 /* If the user didn't specify a write unit size, set it to one. 
*/ 6884 if (bdev->write_unit_size == 0) { 6885 bdev->write_unit_size = 1; 6886 } 6887 6888 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 6889 if (bdev->acwu == 0) { 6890 bdev->acwu = bdev->write_unit_size; 6891 } 6892 6893 if (bdev->phys_blocklen == 0) { 6894 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 6895 } 6896 6897 bdev->internal.reset_in_progress = NULL; 6898 bdev->internal.qd_poll_in_progress = false; 6899 bdev->internal.period = 0; 6900 bdev->internal.new_period = 0; 6901 6902 spdk_io_device_register(__bdev_to_io_dev(bdev), 6903 bdev_channel_create, bdev_channel_destroy, 6904 sizeof(struct spdk_bdev_channel), 6905 bdev_name); 6906 6907 free(bdev_name); 6908 6909 spdk_spin_init(&bdev->internal.spinlock); 6910 6911 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 6912 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 6913 6914 return 0; 6915 } 6916 6917 static void 6918 bdev_destroy_cb(void *io_device) 6919 { 6920 int rc; 6921 struct spdk_bdev *bdev; 6922 spdk_bdev_unregister_cb cb_fn; 6923 void *cb_arg; 6924 6925 bdev = __bdev_from_io_dev(io_device); 6926 6927 if (bdev->internal.unregister_td != spdk_get_thread()) { 6928 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 6929 return; 6930 } 6931 6932 cb_fn = bdev->internal.unregister_cb; 6933 cb_arg = bdev->internal.unregister_ctx; 6934 6935 spdk_spin_destroy(&bdev->internal.spinlock); 6936 free(bdev->internal.qos); 6937 bdev_free_io_stat(bdev->internal.stat); 6938 6939 rc = bdev->fn_table->destruct(bdev->ctxt); 6940 if (rc < 0) { 6941 SPDK_ERRLOG("destruct failed\n"); 6942 } 6943 if (rc <= 0 && cb_fn != NULL) { 6944 cb_fn(cb_arg, rc); 6945 } 6946 } 6947 6948 void 6949 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 6950 { 6951 if (bdev->internal.unregister_cb != NULL) { 6952 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 6953 } 6954 } 6955 6956 static void 6957 _remove_notify(void *arg) 6958 { 6959 struct spdk_bdev_desc *desc = arg; 6960 6961 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 6962 } 6963 6964 /* returns: 0 - bdev removed and ready to be destructed. 6965 * -EBUSY - bdev can't be destructed yet. */ 6966 static int 6967 bdev_unregister_unsafe(struct spdk_bdev *bdev) 6968 { 6969 struct spdk_bdev_desc *desc, *tmp; 6970 int rc = 0; 6971 char uuid[SPDK_UUID_STRING_LEN]; 6972 6973 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 6974 assert(spdk_spin_held(&bdev->internal.spinlock)); 6975 6976 /* Notify each descriptor about hotremoval */ 6977 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 6978 rc = -EBUSY; 6979 /* 6980 * Defer invocation of the event_cb to a separate message that will 6981 * run later on its thread. This ensures this context unwinds and 6982 * we don't recursively unregister this bdev again if the event_cb 6983 * immediately closes its descriptor. 
6984 */ 6985 event_notify(desc, _remove_notify); 6986 } 6987 6988 /* If there are no descriptors, proceed removing the bdev */ 6989 if (rc == 0) { 6990 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 6991 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 6992 6993 /* Delete the name and the UUID alias */ 6994 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 6995 bdev_name_del_unsafe(&bdev->internal.bdev_name); 6996 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 6997 6998 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 6999 7000 if (bdev->internal.reset_in_progress != NULL) { 7001 /* If reset is in progress, let the completion callback for reset 7002 * unregister the bdev. 7003 */ 7004 rc = -EBUSY; 7005 } 7006 } 7007 7008 return rc; 7009 } 7010 7011 static void 7012 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7013 struct spdk_io_channel *io_ch, void *_ctx) 7014 { 7015 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 7016 7017 bdev_channel_abort_queued_ios(bdev_ch); 7018 spdk_bdev_for_each_channel_continue(i, 0); 7019 } 7020 7021 static void 7022 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 7023 { 7024 int rc; 7025 7026 spdk_spin_lock(&g_bdev_mgr.spinlock); 7027 spdk_spin_lock(&bdev->internal.spinlock); 7028 /* 7029 * Set the status to REMOVING after completing to abort channels. Otherwise, 7030 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 7031 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 7032 * may fail. 7033 */ 7034 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 7035 rc = bdev_unregister_unsafe(bdev); 7036 spdk_spin_unlock(&bdev->internal.spinlock); 7037 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7038 7039 if (rc == 0) { 7040 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7041 } 7042 } 7043 7044 void 7045 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7046 { 7047 struct spdk_thread *thread; 7048 7049 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 7050 7051 thread = spdk_get_thread(); 7052 if (!thread) { 7053 /* The user called this from a non-SPDK thread. 
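		 * Unregistration must run on an SPDK thread because it iterates the bdev's
		 * I/O channels, so report -ENOTSUP instead of proceeding.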
*/ 7054 if (cb_fn != NULL) { 7055 cb_fn(cb_arg, -ENOTSUP); 7056 } 7057 return; 7058 } 7059 7060 spdk_spin_lock(&g_bdev_mgr.spinlock); 7061 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7062 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7063 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7064 if (cb_fn) { 7065 cb_fn(cb_arg, -EBUSY); 7066 } 7067 return; 7068 } 7069 7070 spdk_spin_lock(&bdev->internal.spinlock); 7071 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 7072 bdev->internal.unregister_cb = cb_fn; 7073 bdev->internal.unregister_ctx = cb_arg; 7074 bdev->internal.unregister_td = thread; 7075 spdk_spin_unlock(&bdev->internal.spinlock); 7076 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7077 7078 spdk_bdev_set_qd_sampling_period(bdev, 0); 7079 7080 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 7081 bdev_unregister); 7082 } 7083 7084 int 7085 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 7086 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7087 { 7088 struct spdk_bdev_desc *desc; 7089 struct spdk_bdev *bdev; 7090 int rc; 7091 7092 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 7093 if (rc != 0) { 7094 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 7095 return rc; 7096 } 7097 7098 bdev = spdk_bdev_desc_get_bdev(desc); 7099 7100 if (bdev->module != module) { 7101 spdk_bdev_close(desc); 7102 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 7103 bdev_name); 7104 return -ENODEV; 7105 } 7106 7107 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 7108 7109 spdk_bdev_close(desc); 7110 7111 return 0; 7112 } 7113 7114 static int 7115 bdev_start_qos(struct spdk_bdev *bdev) 7116 { 7117 struct set_qos_limit_ctx *ctx; 7118 7119 /* Enable QoS */ 7120 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 7121 ctx = calloc(1, sizeof(*ctx)); 7122 if (ctx == NULL) { 7123 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 7124 return -ENOMEM; 7125 } 7126 ctx->bdev = bdev; 7127 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 7128 } 7129 7130 return 0; 7131 } 7132 7133 static void 7134 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 7135 struct spdk_bdev *bdev) 7136 { 7137 enum spdk_bdev_claim_type type; 7138 const char *typename, *modname; 7139 extern struct spdk_log_flag SPDK_LOG_bdev; 7140 7141 assert(spdk_spin_held(&bdev->internal.spinlock)); 7142 7143 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 7144 return; 7145 } 7146 7147 type = bdev->internal.claim_type; 7148 typename = spdk_bdev_claim_get_name(type); 7149 7150 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 7151 modname = bdev->internal.claim.v1.module->name; 7152 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7153 bdev->name, detail, typename, modname); 7154 return; 7155 } 7156 7157 if (claim_type_is_v2(type)) { 7158 struct spdk_bdev_module_claim *claim; 7159 7160 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 7161 modname = claim->module->name; 7162 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7163 bdev->name, detail, typename, modname); 7164 } 7165 return; 7166 } 7167 7168 assert(false); 7169 } 7170 7171 static int 7172 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 7173 { 7174 struct spdk_thread *thread; 7175 int rc = 0; 7176 7177 thread = spdk_get_thread(); 7178 if (!thread) { 7179 
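		/* Descriptors are bound to the SPDK thread that opens them: the event
		 * callback and the I/O timeout poller both run on desc->thread, so an
		 * open from a non-SPDK context cannot be supported.
		 */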
SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 7180 return -ENOTSUP; 7181 } 7182 7183 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7184 spdk_get_thread()); 7185 7186 desc->bdev = bdev; 7187 desc->thread = thread; 7188 desc->write = write; 7189 7190 spdk_spin_lock(&bdev->internal.spinlock); 7191 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7192 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7193 spdk_spin_unlock(&bdev->internal.spinlock); 7194 return -ENODEV; 7195 } 7196 7197 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7198 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7199 spdk_spin_unlock(&bdev->internal.spinlock); 7200 return -EPERM; 7201 } 7202 7203 rc = bdev_start_qos(bdev); 7204 if (rc != 0) { 7205 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 7206 spdk_spin_unlock(&bdev->internal.spinlock); 7207 return rc; 7208 } 7209 7210 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 7211 7212 spdk_spin_unlock(&bdev->internal.spinlock); 7213 7214 return 0; 7215 } 7216 7217 static int 7218 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 7219 struct spdk_bdev_desc **_desc) 7220 { 7221 struct spdk_bdev_desc *desc; 7222 unsigned int event_id; 7223 7224 desc = calloc(1, sizeof(*desc)); 7225 if (desc == NULL) { 7226 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 7227 return -ENOMEM; 7228 } 7229 7230 TAILQ_INIT(&desc->pending_media_events); 7231 TAILQ_INIT(&desc->free_media_events); 7232 7233 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 7234 desc->callback.event_fn = event_cb; 7235 desc->callback.ctx = event_ctx; 7236 spdk_spin_init(&desc->spinlock); 7237 7238 if (bdev->media_events) { 7239 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 7240 sizeof(*desc->media_events_buffer)); 7241 if (desc->media_events_buffer == NULL) { 7242 SPDK_ERRLOG("Failed to initialize media event pool\n"); 7243 bdev_desc_free(desc); 7244 return -ENOMEM; 7245 } 7246 7247 for (event_id = 0; event_id < MEDIA_EVENT_POOL_SIZE; ++event_id) { 7248 TAILQ_INSERT_TAIL(&desc->free_media_events, 7249 &desc->media_events_buffer[event_id], tailq); 7250 } 7251 } 7252 7253 *_desc = desc; 7254 7255 return 0; 7256 } 7257 7258 int 7259 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7260 void *event_ctx, struct spdk_bdev_desc **_desc) 7261 { 7262 struct spdk_bdev_desc *desc; 7263 struct spdk_bdev *bdev; 7264 int rc; 7265 7266 if (event_cb == NULL) { 7267 SPDK_ERRLOG("Missing event callback function\n"); 7268 return -EINVAL; 7269 } 7270 7271 spdk_spin_lock(&g_bdev_mgr.spinlock); 7272 7273 bdev = bdev_get_by_name(bdev_name); 7274 7275 if (bdev == NULL) { 7276 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 7277 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7278 return -ENODEV; 7279 } 7280 7281 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 7282 if (rc != 0) { 7283 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7284 return rc; 7285 } 7286 7287 rc = bdev_open(bdev, write, desc); 7288 if (rc != 0) { 7289 bdev_desc_free(desc); 7290 desc = NULL; 7291 } 7292 7293 *_desc = desc; 7294 7295 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7296 7297 return rc; 7298 } 7299 7300 static void 7301 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 7302 { 7303 int rc; 7304 7305 spdk_spin_lock(&bdev->internal.spinlock); 7306 spdk_spin_lock(&desc->spinlock); 7307 
7308 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 7309 7310 desc->closed = true; 7311 7312 if (desc->claim != NULL) { 7313 bdev_desc_release_claims(desc); 7314 } 7315 7316 if (0 == desc->refs) { 7317 spdk_spin_unlock(&desc->spinlock); 7318 bdev_desc_free(desc); 7319 } else { 7320 spdk_spin_unlock(&desc->spinlock); 7321 } 7322 7323 /* If no more descriptors, kill QoS channel */ 7324 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7325 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 7326 bdev->name, spdk_get_thread()); 7327 7328 if (bdev_qos_destroy(bdev)) { 7329 /* There isn't anything we can do to recover here. Just let the 7330 * old QoS poller keep running. The QoS handling won't change 7331 * cores when the user allocates a new channel, but it won't break. */ 7332 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 7333 } 7334 } 7335 7336 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7337 rc = bdev_unregister_unsafe(bdev); 7338 spdk_spin_unlock(&bdev->internal.spinlock); 7339 7340 if (rc == 0) { 7341 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7342 } 7343 } else { 7344 spdk_spin_unlock(&bdev->internal.spinlock); 7345 } 7346 } 7347 7348 void 7349 spdk_bdev_close(struct spdk_bdev_desc *desc) 7350 { 7351 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7352 7353 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7354 spdk_get_thread()); 7355 7356 assert(desc->thread == spdk_get_thread()); 7357 7358 spdk_poller_unregister(&desc->io_timeout_poller); 7359 7360 spdk_spin_lock(&g_bdev_mgr.spinlock); 7361 7362 bdev_close(bdev, desc); 7363 7364 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7365 } 7366 7367 static void 7368 bdev_register_finished(void *arg) 7369 { 7370 struct spdk_bdev_desc *desc = arg; 7371 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7372 7373 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 7374 7375 spdk_spin_lock(&g_bdev_mgr.spinlock); 7376 7377 bdev_close(bdev, desc); 7378 7379 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7380 } 7381 7382 int 7383 spdk_bdev_register(struct spdk_bdev *bdev) 7384 { 7385 struct spdk_bdev_desc *desc; 7386 struct spdk_thread *thread = spdk_get_thread(); 7387 int rc; 7388 7389 if (spdk_unlikely(spdk_thread_get_app_thread() != spdk_get_thread())) { 7390 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", bdev->name, thread, 7391 thread ? 
spdk_thread_get_name(thread) : "null"); 7392 return -EINVAL; 7393 } 7394 7395 rc = bdev_register(bdev); 7396 if (rc != 0) { 7397 return rc; 7398 } 7399 7400 /* A descriptor is opened to prevent bdev deletion during examination */ 7401 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7402 if (rc != 0) { 7403 spdk_bdev_unregister(bdev, NULL, NULL); 7404 return rc; 7405 } 7406 7407 rc = bdev_open(bdev, false, desc); 7408 if (rc != 0) { 7409 bdev_desc_free(desc); 7410 spdk_bdev_unregister(bdev, NULL, NULL); 7411 return rc; 7412 } 7413 7414 /* Examine configuration before initializing I/O */ 7415 bdev_examine(bdev); 7416 7417 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 7418 if (rc != 0) { 7419 bdev_close(bdev, desc); 7420 spdk_bdev_unregister(bdev, NULL, NULL); 7421 } 7422 7423 return rc; 7424 } 7425 7426 int 7427 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 7428 struct spdk_bdev_module *module) 7429 { 7430 spdk_spin_lock(&bdev->internal.spinlock); 7431 7432 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7433 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7434 spdk_spin_unlock(&bdev->internal.spinlock); 7435 return -EPERM; 7436 } 7437 7438 if (desc && !desc->write) { 7439 desc->write = true; 7440 } 7441 7442 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 7443 bdev->internal.claim.v1.module = module; 7444 7445 spdk_spin_unlock(&bdev->internal.spinlock); 7446 return 0; 7447 } 7448 7449 void 7450 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 7451 { 7452 spdk_spin_lock(&bdev->internal.spinlock); 7453 7454 assert(bdev->internal.claim.v1.module != NULL); 7455 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 7456 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7457 bdev->internal.claim.v1.module = NULL; 7458 7459 spdk_spin_unlock(&bdev->internal.spinlock); 7460 } 7461 7462 /* 7463 * Start claims v2 7464 */ 7465 7466 const char * 7467 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 7468 { 7469 switch (type) { 7470 case SPDK_BDEV_CLAIM_NONE: 7471 return "not_claimed"; 7472 case SPDK_BDEV_CLAIM_EXCL_WRITE: 7473 return "exclusive_write"; 7474 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 7475 return "read_many_write_one"; 7476 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 7477 return "read_many_write_none"; 7478 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 7479 return "read_many_write_many"; 7480 default: 7481 break; 7482 } 7483 return "invalid_claim"; 7484 } 7485 7486 static bool 7487 claim_type_is_v2(enum spdk_bdev_claim_type type) 7488 { 7489 switch (type) { 7490 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 7491 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 7492 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 7493 return true; 7494 default: 7495 break; 7496 } 7497 return false; 7498 } 7499 7500 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
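 * read_many_write_one and read_many_write_shared claims imply write access for
 * the claiming descriptor; read_many_write_none does not.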
*/ 7501 static bool 7502 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 7503 { 7504 switch (type) { 7505 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 7506 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 7507 return true; 7508 default: 7509 break; 7510 } 7511 return false; 7512 } 7513 7514 void 7515 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 7516 { 7517 if (opts == NULL) { 7518 SPDK_ERRLOG("opts should not be NULL\n"); 7519 assert(opts != NULL); 7520 return; 7521 } 7522 if (size == 0) { 7523 SPDK_ERRLOG("size should not be zero\n"); 7524 assert(size != 0); 7525 return; 7526 } 7527 7528 memset(opts, 0, size); 7529 opts->opts_size = size; 7530 7531 #define FIELD_OK(field) \ 7532 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 7533 7534 #define SET_FIELD(field, value) \ 7535 if (FIELD_OK(field)) { \ 7536 opts->field = value; \ 7537 } \ 7538 7539 SET_FIELD(shared_claim_key, 0); 7540 7541 #undef FIELD_OK 7542 #undef SET_FIELD 7543 } 7544 7545 static int 7546 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 7547 { 7548 if (src->opts_size == 0) { 7549 SPDK_ERRLOG("size should not be zero\n"); 7550 return -1; 7551 } 7552 7553 memset(dst, 0, sizeof(*dst)); 7554 dst->opts_size = src->opts_size; 7555 7556 #define FIELD_OK(field) \ 7557 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 7558 7559 #define SET_FIELD(field) \ 7560 if (FIELD_OK(field)) { \ 7561 dst->field = src->field; \ 7562 } \ 7563 7564 if (FIELD_OK(name)) { 7565 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 7566 } 7567 7568 SET_FIELD(shared_claim_key); 7569 7570 /* You should not remove this statement, but need to update the assert statement 7571 * if you add a new field, and also add a corresponding SET_FIELD statement */ 7572 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 7573 7574 #undef FIELD_OK 7575 #undef SET_FIELD 7576 return 0; 7577 } 7578 7579 /* Returns 0 if a read-write-once claim can be taken. */ 7580 static int 7581 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 7582 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 7583 { 7584 struct spdk_bdev *bdev = desc->bdev; 7585 struct spdk_bdev_desc *open_desc; 7586 7587 assert(spdk_spin_held(&bdev->internal.spinlock)); 7588 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 7589 7590 if (opts->shared_claim_key != 0) { 7591 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 7592 bdev->name); 7593 return -EINVAL; 7594 } 7595 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7596 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7597 return -EPERM; 7598 } 7599 if (desc->claim != NULL) { 7600 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 7601 bdev->name, desc->claim->module->name); 7602 return -EPERM; 7603 } 7604 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 7605 if (desc != open_desc && open_desc->write) { 7606 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 7607 "another descriptor is open for writing\n", 7608 bdev->name); 7609 return -EPERM; 7610 } 7611 } 7612 7613 return 0; 7614 } 7615 7616 /* Returns 0 if a read-only-many claim can be taken. 
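 * A read-only-many claim requires a read-only descriptor, does not use a shared
 * claim key, and cannot be taken while another descriptor is open for writing.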
 */
static int
claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_desc *open_desc;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE);
	assert(desc->claim == NULL);

	if (desc->write) {
		SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n",
			    bdev->name);
		return -EINVAL;
	}
	if (opts->shared_claim_key != 0) {
		SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name);
		return -EINVAL;
	}
	if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
			if (open_desc->write) {
				SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while "
					       "another descriptor is open for writing\n",
					       bdev->name);
				return -EPERM;
			}
		}
	}

	return 0;
}

/* Returns 0 if a read-write-many claim can be taken. */
static int
claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_desc *open_desc;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED);
	assert(desc->claim == NULL);

	if (opts->shared_claim_key == 0) {
		SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n",
			    bdev->name);
		return -EINVAL;
	}
	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
			if (open_desc == desc) {
				continue;
			}
			if (open_desc->write) {
				SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while "
					       "another descriptor is open for writing without a "
					       "claim\n", bdev->name);
				return -EPERM;
			}
		}
		break;
	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
		if (opts->shared_claim_key != bdev->internal.claim.v2.key) {
			LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev);
			return -EPERM;
		}
		break;
	default:
		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
		return -EBUSY;
	}

	return 0;
}
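/*
 * Illustrative sketch (not compiled here): a bdev module that already has an
 * open descriptor could take a shared-writer v2 claim roughly as follows.
 * The module object "g_my_module", the claim name and the key value are
 * hypothetical.
 *
 *	struct spdk_bdev_claim_opts opts;
 *	int rc;
 *
 *	spdk_bdev_claim_opts_init(&opts, sizeof(opts));
 *	snprintf(opts.name, sizeof(opts.name), "my_claim");
 *	opts.shared_claim_key = 0x1234;
 *	rc = spdk_bdev_module_claim_bdev_desc(desc, SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED,
 *					      &opts, &g_my_module);
 *	if (rc != 0) {
 *		SPDK_ERRLOG("failed to claim bdev: %s\n", spdk_strerror(-rc));
 *	}
 */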
/* Updates desc and its bdev with a v2 claim. */
static int
claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
	   struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_module_claim *claim;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(claim_type_is_v2(type));
	assert(desc->claim == NULL);

	claim = calloc(1, sizeof(*desc->claim));
	if (claim == NULL) {
		SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name);
		return -ENOMEM;
	}
	claim->module = module;
	claim->desc = desc;
	SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match");
	memcpy(claim->name, opts->name, sizeof(claim->name));
	desc->claim = claim;

	if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
		bdev->internal.claim_type = type;
		TAILQ_INIT(&bdev->internal.claim.v2.claims);
		bdev->internal.claim.v2.key = opts->shared_claim_key;
	}
	assert(type == bdev->internal.claim_type);

	TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link);

	if (!desc->write && claim_type_promotes_to_write(type)) {
		desc->write = true;
	}

	return 0;
}

int
spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
				 struct spdk_bdev_claim_opts *_opts,
				 struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev;
	struct spdk_bdev_claim_opts opts;
	int rc = 0;

	if (desc == NULL) {
		SPDK_ERRLOG("descriptor must not be NULL\n");
		return -EINVAL;
	}

	bdev = desc->bdev;

	if (_opts == NULL) {
		spdk_bdev_claim_opts_init(&opts, sizeof(opts));
	} else if (claim_opts_copy(_opts, &opts) != 0) {
		return -EINVAL;
	}

	spdk_spin_lock(&bdev->internal.spinlock);

	if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE &&
	    bdev->internal.claim_type != type) {
		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
		spdk_spin_unlock(&bdev->internal.spinlock);
		return -EPERM;
	}

	if (claim_type_is_v2(type) && desc->claim != NULL) {
		SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n",
			    bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name);
		spdk_spin_unlock(&bdev->internal.spinlock);
		return -EPERM;
	}

	switch (type) {
	case SPDK_BDEV_CLAIM_EXCL_WRITE:
		spdk_spin_unlock(&bdev->internal.spinlock);
		return spdk_bdev_module_claim_bdev(bdev, desc, module);
	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
		rc = claim_verify_rwo(desc, type, &opts, module);
		break;
	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE:
		rc = claim_verify_rom(desc, type, &opts, module);
		break;
	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
		rc = claim_verify_rwm(desc, type, &opts, module);
		break;
	default:
		SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type);
		rc = -ENOTSUP;
	}

	if (rc == 0) {
		rc = claim_bdev(desc, type, &opts, module);
	}

	spdk_spin_unlock(&bdev->internal.spinlock);
	return rc;
}

static void
claim_reset(struct spdk_bdev *bdev)
{
	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(claim_type_is_v2(bdev->internal.claim_type));
	assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims));

	memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim));
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7806 } 7807 7808 static void 7809 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 7810 { 7811 struct spdk_bdev *bdev = desc->bdev; 7812 7813 assert(spdk_spin_held(&bdev->internal.spinlock)); 7814 assert(claim_type_is_v2(bdev->internal.claim_type)); 7815 7816 if (bdev->internal.examine_in_progress == 0) { 7817 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 7818 free(desc->claim); 7819 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 7820 claim_reset(bdev); 7821 } 7822 } else { 7823 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 7824 desc->claim->module = NULL; 7825 desc->claim->desc = NULL; 7826 } 7827 desc->claim = NULL; 7828 } 7829 7830 /* 7831 * End claims v2 7832 */ 7833 7834 struct spdk_bdev * 7835 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 7836 { 7837 assert(desc != NULL); 7838 return desc->bdev; 7839 } 7840 7841 int 7842 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 7843 { 7844 struct spdk_bdev *bdev, *tmp; 7845 struct spdk_bdev_desc *desc; 7846 int rc = 0; 7847 7848 assert(fn != NULL); 7849 7850 spdk_spin_lock(&g_bdev_mgr.spinlock); 7851 bdev = spdk_bdev_first(); 7852 while (bdev != NULL) { 7853 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7854 if (rc != 0) { 7855 break; 7856 } 7857 rc = bdev_open(bdev, false, desc); 7858 if (rc != 0) { 7859 bdev_desc_free(desc); 7860 if (rc == -ENODEV) { 7861 /* Ignore the error and move to the next bdev. */ 7862 rc = 0; 7863 bdev = spdk_bdev_next(bdev); 7864 continue; 7865 } 7866 break; 7867 } 7868 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7869 7870 rc = fn(ctx, bdev); 7871 7872 spdk_spin_lock(&g_bdev_mgr.spinlock); 7873 tmp = spdk_bdev_next(bdev); 7874 bdev_close(bdev, desc); 7875 if (rc != 0) { 7876 break; 7877 } 7878 bdev = tmp; 7879 } 7880 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7881 7882 return rc; 7883 } 7884 7885 int 7886 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 7887 { 7888 struct spdk_bdev *bdev, *tmp; 7889 struct spdk_bdev_desc *desc; 7890 int rc = 0; 7891 7892 assert(fn != NULL); 7893 7894 spdk_spin_lock(&g_bdev_mgr.spinlock); 7895 bdev = spdk_bdev_first_leaf(); 7896 while (bdev != NULL) { 7897 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7898 if (rc != 0) { 7899 break; 7900 } 7901 rc = bdev_open(bdev, false, desc); 7902 if (rc != 0) { 7903 bdev_desc_free(desc); 7904 if (rc == -ENODEV) { 7905 /* Ignore the error and move to the next bdev. 
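				 * A return of -ENODEV means the bdev began unregistering after it
				 * was returned by spdk_bdev_first()/spdk_bdev_next(), so it is
				 * skipped rather than treated as a failure of the whole iteration.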
*/ 7906 rc = 0; 7907 bdev = spdk_bdev_next_leaf(bdev); 7908 continue; 7909 } 7910 break; 7911 } 7912 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7913 7914 rc = fn(ctx, bdev); 7915 7916 spdk_spin_lock(&g_bdev_mgr.spinlock); 7917 tmp = spdk_bdev_next_leaf(bdev); 7918 bdev_close(bdev, desc); 7919 if (rc != 0) { 7920 break; 7921 } 7922 bdev = tmp; 7923 } 7924 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7925 7926 return rc; 7927 } 7928 7929 void 7930 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 7931 { 7932 struct iovec *iovs; 7933 int iovcnt; 7934 7935 if (bdev_io == NULL) { 7936 return; 7937 } 7938 7939 switch (bdev_io->type) { 7940 case SPDK_BDEV_IO_TYPE_READ: 7941 case SPDK_BDEV_IO_TYPE_WRITE: 7942 case SPDK_BDEV_IO_TYPE_ZCOPY: 7943 iovs = bdev_io->u.bdev.iovs; 7944 iovcnt = bdev_io->u.bdev.iovcnt; 7945 break; 7946 default: 7947 iovs = NULL; 7948 iovcnt = 0; 7949 break; 7950 } 7951 7952 if (iovp) { 7953 *iovp = iovs; 7954 } 7955 if (iovcntp) { 7956 *iovcntp = iovcnt; 7957 } 7958 } 7959 7960 void * 7961 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 7962 { 7963 if (bdev_io == NULL) { 7964 return NULL; 7965 } 7966 7967 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 7968 return NULL; 7969 } 7970 7971 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 7972 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 7973 return bdev_io->u.bdev.md_buf; 7974 } 7975 7976 return NULL; 7977 } 7978 7979 void * 7980 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 7981 { 7982 if (bdev_io == NULL) { 7983 assert(false); 7984 return NULL; 7985 } 7986 7987 return bdev_io->internal.caller_ctx; 7988 } 7989 7990 void 7991 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 7992 { 7993 7994 if (spdk_bdev_module_list_find(bdev_module->name)) { 7995 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 7996 assert(false); 7997 } 7998 7999 spdk_spin_init(&bdev_module->internal.spinlock); 8000 8001 /* 8002 * Modules with examine callbacks must be initialized first, so they are 8003 * ready to handle examine callbacks from later modules that will 8004 * register physical bdevs. 
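	 * Modules with an examine callback are therefore inserted at the head of the
	 * module list below, while all other modules are appended at the tail.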
8005 */ 8006 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 8007 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8008 } else { 8009 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8010 } 8011 } 8012 8013 struct spdk_bdev_module * 8014 spdk_bdev_module_list_find(const char *name) 8015 { 8016 struct spdk_bdev_module *bdev_module; 8017 8018 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 8019 if (strcmp(name, bdev_module->name) == 0) { 8020 break; 8021 } 8022 } 8023 8024 return bdev_module; 8025 } 8026 8027 static void 8028 bdev_write_zero_buffer_next(void *_bdev_io) 8029 { 8030 struct spdk_bdev_io *bdev_io = _bdev_io; 8031 uint64_t num_bytes, num_blocks; 8032 void *md_buf = NULL; 8033 int rc; 8034 8035 num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * 8036 bdev_io->u.bdev.split_remaining_num_blocks, 8037 ZERO_BUFFER_SIZE); 8038 num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); 8039 num_blocks -= num_blocks % bdev_io->bdev->write_unit_size; 8040 8041 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 8042 md_buf = (char *)g_bdev_mgr.zero_buffer + 8043 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 8044 } 8045 8046 rc = bdev_write_blocks_with_md(bdev_io->internal.desc, 8047 spdk_io_channel_from_ctx(bdev_io->internal.ch), 8048 g_bdev_mgr.zero_buffer, md_buf, 8049 bdev_io->u.bdev.split_current_offset_blocks, num_blocks, 8050 bdev_write_zero_buffer_done, bdev_io); 8051 if (rc == 0) { 8052 bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; 8053 bdev_io->u.bdev.split_current_offset_blocks += num_blocks; 8054 } else if (rc == -ENOMEM) { 8055 bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); 8056 } else { 8057 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 8058 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 8059 } 8060 } 8061 8062 static void 8063 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 8064 { 8065 struct spdk_bdev_io *parent_io = cb_arg; 8066 8067 spdk_bdev_free_io(bdev_io); 8068 8069 if (!success) { 8070 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 8071 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 8072 return; 8073 } 8074 8075 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 8076 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 8077 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 8078 return; 8079 } 8080 8081 bdev_write_zero_buffer_next(parent_io); 8082 } 8083 8084 static void 8085 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 8086 { 8087 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8088 ctx->bdev->internal.qos_mod_in_progress = false; 8089 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8090 8091 if (ctx->cb_fn) { 8092 ctx->cb_fn(ctx->cb_arg, status); 8093 } 8094 free(ctx); 8095 } 8096 8097 static void 8098 bdev_disable_qos_done(void *cb_arg) 8099 { 8100 struct set_qos_limit_ctx *ctx = cb_arg; 8101 struct spdk_bdev *bdev = ctx->bdev; 8102 struct spdk_bdev_io *bdev_io; 8103 struct spdk_bdev_qos *qos; 8104 8105 spdk_spin_lock(&bdev->internal.spinlock); 8106 qos = bdev->internal.qos; 8107 bdev->internal.qos = NULL; 8108 spdk_spin_unlock(&bdev->internal.spinlock); 8109 8110 while (!TAILQ_EMPTY(&qos->queued)) { 8111 /* Send queued I/O back to their original thread for resubmission. 
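		 * The QoS thread is not necessarily the thread that originally submitted
		 * these I/O, so each one is handed back to its submitting thread with
		 * spdk_thread_send_msg() before being resubmitted.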
*/ 8112 bdev_io = TAILQ_FIRST(&qos->queued); 8113 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 8114 8115 if (bdev_io->internal.io_submit_ch) { 8116 /* 8117 * Channel was changed when sending it to the QoS thread - change it back 8118 * before sending it back to the original thread. 8119 */ 8120 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 8121 bdev_io->internal.io_submit_ch = NULL; 8122 } 8123 8124 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 8125 _bdev_io_submit, bdev_io); 8126 } 8127 8128 if (qos->thread != NULL) { 8129 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 8130 spdk_poller_unregister(&qos->poller); 8131 } 8132 8133 free(qos); 8134 8135 bdev_set_qos_limit_done(ctx, 0); 8136 } 8137 8138 static void 8139 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 8140 { 8141 struct set_qos_limit_ctx *ctx = _ctx; 8142 struct spdk_thread *thread; 8143 8144 spdk_spin_lock(&bdev->internal.spinlock); 8145 thread = bdev->internal.qos->thread; 8146 spdk_spin_unlock(&bdev->internal.spinlock); 8147 8148 if (thread != NULL) { 8149 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 8150 } else { 8151 bdev_disable_qos_done(ctx); 8152 } 8153 } 8154 8155 static void 8156 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8157 struct spdk_io_channel *ch, void *_ctx) 8158 { 8159 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8160 8161 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 8162 8163 spdk_bdev_for_each_channel_continue(i, 0); 8164 } 8165 8166 static void 8167 bdev_update_qos_rate_limit_msg(void *cb_arg) 8168 { 8169 struct set_qos_limit_ctx *ctx = cb_arg; 8170 struct spdk_bdev *bdev = ctx->bdev; 8171 8172 spdk_spin_lock(&bdev->internal.spinlock); 8173 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 8174 spdk_spin_unlock(&bdev->internal.spinlock); 8175 8176 bdev_set_qos_limit_done(ctx, 0); 8177 } 8178 8179 static void 8180 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8181 struct spdk_io_channel *ch, void *_ctx) 8182 { 8183 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8184 8185 spdk_spin_lock(&bdev->internal.spinlock); 8186 bdev_enable_qos(bdev, bdev_ch); 8187 spdk_spin_unlock(&bdev->internal.spinlock); 8188 spdk_bdev_for_each_channel_continue(i, 0); 8189 } 8190 8191 static void 8192 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 8193 { 8194 struct set_qos_limit_ctx *ctx = _ctx; 8195 8196 bdev_set_qos_limit_done(ctx, status); 8197 } 8198 8199 static void 8200 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 8201 { 8202 int i; 8203 8204 assert(bdev->internal.qos != NULL); 8205 8206 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8207 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 8208 bdev->internal.qos->rate_limits[i].limit = limits[i]; 8209 8210 if (limits[i] == 0) { 8211 bdev->internal.qos->rate_limits[i].limit = 8212 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 8213 } 8214 } 8215 } 8216 } 8217 8218 void 8219 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 8220 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 8221 { 8222 struct set_qos_limit_ctx *ctx; 8223 uint32_t limit_set_complement; 8224 uint64_t min_limit_per_sec; 8225 int i; 8226 bool disable_rate_limit = true; 8227 8228 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8229 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 8230 continue; 8231 } 8232 8233 if (limits[i] > 0) { 8234 disable_rate_limit = 
false; 8235 } 8236 8237 if (bdev_qos_is_iops_rate_limit(i) == true) { 8238 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 8239 } else { 8240 /* Change from megabyte to byte rate limit */ 8241 limits[i] = limits[i] * 1024 * 1024; 8242 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 8243 } 8244 8245 limit_set_complement = limits[i] % min_limit_per_sec; 8246 if (limit_set_complement) { 8247 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 8248 limits[i], min_limit_per_sec); 8249 limits[i] += min_limit_per_sec - limit_set_complement; 8250 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 8251 } 8252 } 8253 8254 ctx = calloc(1, sizeof(*ctx)); 8255 if (ctx == NULL) { 8256 cb_fn(cb_arg, -ENOMEM); 8257 return; 8258 } 8259 8260 ctx->cb_fn = cb_fn; 8261 ctx->cb_arg = cb_arg; 8262 ctx->bdev = bdev; 8263 8264 spdk_spin_lock(&bdev->internal.spinlock); 8265 if (bdev->internal.qos_mod_in_progress) { 8266 spdk_spin_unlock(&bdev->internal.spinlock); 8267 free(ctx); 8268 cb_fn(cb_arg, -EAGAIN); 8269 return; 8270 } 8271 bdev->internal.qos_mod_in_progress = true; 8272 8273 if (disable_rate_limit == true && bdev->internal.qos) { 8274 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8275 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 8276 (bdev->internal.qos->rate_limits[i].limit > 0 && 8277 bdev->internal.qos->rate_limits[i].limit != 8278 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 8279 disable_rate_limit = false; 8280 break; 8281 } 8282 } 8283 } 8284 8285 if (disable_rate_limit == false) { 8286 if (bdev->internal.qos == NULL) { 8287 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 8288 if (!bdev->internal.qos) { 8289 spdk_spin_unlock(&bdev->internal.spinlock); 8290 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 8291 bdev_set_qos_limit_done(ctx, -ENOMEM); 8292 return; 8293 } 8294 } 8295 8296 if (bdev->internal.qos->thread == NULL) { 8297 /* Enabling */ 8298 bdev_set_qos_rate_limits(bdev, limits); 8299 8300 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 8301 bdev_enable_qos_done); 8302 } else { 8303 /* Updating */ 8304 bdev_set_qos_rate_limits(bdev, limits); 8305 8306 spdk_thread_send_msg(bdev->internal.qos->thread, 8307 bdev_update_qos_rate_limit_msg, ctx); 8308 } 8309 } else { 8310 if (bdev->internal.qos != NULL) { 8311 bdev_set_qos_rate_limits(bdev, limits); 8312 8313 /* Disabling */ 8314 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 8315 bdev_disable_qos_msg_done); 8316 } else { 8317 spdk_spin_unlock(&bdev->internal.spinlock); 8318 bdev_set_qos_limit_done(ctx, 0); 8319 return; 8320 } 8321 } 8322 8323 spdk_spin_unlock(&bdev->internal.spinlock); 8324 } 8325 8326 struct spdk_bdev_histogram_ctx { 8327 spdk_bdev_histogram_status_cb cb_fn; 8328 void *cb_arg; 8329 struct spdk_bdev *bdev; 8330 int status; 8331 }; 8332 8333 static void 8334 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8335 { 8336 struct spdk_bdev_histogram_ctx *ctx = _ctx; 8337 8338 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8339 ctx->bdev->internal.histogram_in_progress = false; 8340 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8341 ctx->cb_fn(ctx->cb_arg, ctx->status); 8342 free(ctx); 8343 } 8344 8345 static void 8346 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8347 struct spdk_io_channel *_ch, void *_ctx) 8348 { 8349 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8350 8351 if (ch->histogram != NULL) { 8352 
spdk_histogram_data_free(ch->histogram); 8353 ch->histogram = NULL; 8354 } 8355 spdk_bdev_for_each_channel_continue(i, 0); 8356 } 8357 8358 static void 8359 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8360 { 8361 struct spdk_bdev_histogram_ctx *ctx = _ctx; 8362 8363 if (status != 0) { 8364 ctx->status = status; 8365 ctx->bdev->internal.histogram_enabled = false; 8366 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 8367 bdev_histogram_disable_channel_cb); 8368 } else { 8369 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8370 ctx->bdev->internal.histogram_in_progress = false; 8371 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8372 ctx->cb_fn(ctx->cb_arg, ctx->status); 8373 free(ctx); 8374 } 8375 } 8376 8377 static void 8378 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8379 struct spdk_io_channel *_ch, void *_ctx) 8380 { 8381 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8382 int status = 0; 8383 8384 if (ch->histogram == NULL) { 8385 ch->histogram = spdk_histogram_data_alloc(); 8386 if (ch->histogram == NULL) { 8387 status = -ENOMEM; 8388 } 8389 } 8390 8391 spdk_bdev_for_each_channel_continue(i, status); 8392 } 8393 8394 void 8395 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 8396 void *cb_arg, bool enable) 8397 { 8398 struct spdk_bdev_histogram_ctx *ctx; 8399 8400 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 8401 if (ctx == NULL) { 8402 cb_fn(cb_arg, -ENOMEM); 8403 return; 8404 } 8405 8406 ctx->bdev = bdev; 8407 ctx->status = 0; 8408 ctx->cb_fn = cb_fn; 8409 ctx->cb_arg = cb_arg; 8410 8411 spdk_spin_lock(&bdev->internal.spinlock); 8412 if (bdev->internal.histogram_in_progress) { 8413 spdk_spin_unlock(&bdev->internal.spinlock); 8414 free(ctx); 8415 cb_fn(cb_arg, -EAGAIN); 8416 return; 8417 } 8418 8419 bdev->internal.histogram_in_progress = true; 8420 spdk_spin_unlock(&bdev->internal.spinlock); 8421 8422 bdev->internal.histogram_enabled = enable; 8423 8424 if (enable) { 8425 /* Allocate histogram for each channel */ 8426 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 8427 bdev_histogram_enable_channel_cb); 8428 } else { 8429 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 8430 bdev_histogram_disable_channel_cb); 8431 } 8432 } 8433 8434 struct spdk_bdev_histogram_data_ctx { 8435 spdk_bdev_histogram_data_cb cb_fn; 8436 void *cb_arg; 8437 struct spdk_bdev *bdev; 8438 /** merged histogram data from all channels */ 8439 struct spdk_histogram_data *histogram; 8440 }; 8441 8442 static void 8443 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8444 { 8445 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 8446 8447 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 8448 free(ctx); 8449 } 8450 8451 static void 8452 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8453 struct spdk_io_channel *_ch, void *_ctx) 8454 { 8455 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8456 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 8457 int status = 0; 8458 8459 if (ch->histogram == NULL) { 8460 status = -EFAULT; 8461 } else { 8462 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 8463 } 8464 8465 spdk_bdev_for_each_channel_continue(i, status); 8466 } 8467 8468 void 8469 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 8470 spdk_bdev_histogram_data_cb cb_fn, 8471 void *cb_arg) 8472 { 
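	/* The caller-provided histogram acts as the accumulator: each channel's
	 * histogram is merged into it and the merged result is passed to cb_fn.
	 */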
8473 struct spdk_bdev_histogram_data_ctx *ctx; 8474 8475 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 8476 if (ctx == NULL) { 8477 cb_fn(cb_arg, -ENOMEM, NULL); 8478 return; 8479 } 8480 8481 ctx->bdev = bdev; 8482 ctx->cb_fn = cb_fn; 8483 ctx->cb_arg = cb_arg; 8484 8485 ctx->histogram = histogram; 8486 8487 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 8488 bdev_histogram_get_channel_cb); 8489 } 8490 8491 void 8492 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 8493 void *cb_arg) 8494 { 8495 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8496 int status = 0; 8497 8498 assert(cb_fn != NULL); 8499 8500 if (bdev_ch->histogram == NULL) { 8501 status = -EFAULT; 8502 } 8503 cb_fn(cb_arg, status, bdev_ch->histogram); 8504 } 8505 8506 size_t 8507 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 8508 size_t max_events) 8509 { 8510 struct media_event_entry *entry; 8511 size_t num_events = 0; 8512 8513 for (; num_events < max_events; ++num_events) { 8514 entry = TAILQ_FIRST(&desc->pending_media_events); 8515 if (entry == NULL) { 8516 break; 8517 } 8518 8519 events[num_events] = entry->event; 8520 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 8521 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 8522 } 8523 8524 return num_events; 8525 } 8526 8527 int 8528 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 8529 size_t num_events) 8530 { 8531 struct spdk_bdev_desc *desc; 8532 struct media_event_entry *entry; 8533 size_t event_id; 8534 int rc = 0; 8535 8536 assert(bdev->media_events); 8537 8538 spdk_spin_lock(&bdev->internal.spinlock); 8539 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 8540 if (desc->write) { 8541 break; 8542 } 8543 } 8544 8545 if (desc == NULL || desc->media_events_buffer == NULL) { 8546 rc = -ENODEV; 8547 goto out; 8548 } 8549 8550 for (event_id = 0; event_id < num_events; ++event_id) { 8551 entry = TAILQ_FIRST(&desc->free_media_events); 8552 if (entry == NULL) { 8553 break; 8554 } 8555 8556 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 8557 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 8558 entry->event = events[event_id]; 8559 } 8560 8561 rc = event_id; 8562 out: 8563 spdk_spin_unlock(&bdev->internal.spinlock); 8564 return rc; 8565 } 8566 8567 static void 8568 _media_management_notify(void *arg) 8569 { 8570 struct spdk_bdev_desc *desc = arg; 8571 8572 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 8573 } 8574 8575 void 8576 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 8577 { 8578 struct spdk_bdev_desc *desc; 8579 8580 spdk_spin_lock(&bdev->internal.spinlock); 8581 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 8582 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 8583 event_notify(desc, _media_management_notify); 8584 } 8585 } 8586 spdk_spin_unlock(&bdev->internal.spinlock); 8587 } 8588 8589 struct locked_lba_range_ctx { 8590 struct lba_range range; 8591 struct spdk_bdev *bdev; 8592 struct lba_range *current_range; 8593 struct lba_range *owner_range; 8594 struct spdk_poller *poller; 8595 lock_range_cb cb_fn; 8596 void *cb_arg; 8597 }; 8598 8599 static void 8600 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8601 { 8602 struct locked_lba_range_ctx *ctx = _ctx; 8603 8604 ctx->cb_fn(ctx->cb_arg, -ENOMEM); 8605 free(ctx); 8606 } 8607 8608 static void bdev_unlock_lba_range_get_channel(struct 
spdk_bdev_channel_iter *i, 8609 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 8610 8611 static void 8612 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8613 { 8614 struct locked_lba_range_ctx *ctx = _ctx; 8615 8616 if (status == -ENOMEM) { 8617 /* One of the channels could not allocate a range object. 8618 * So we have to go back and clean up any ranges that were 8619 * allocated successfully before we return error status to 8620 * the caller. We can reuse the unlock function to do that 8621 * clean up. 8622 */ 8623 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 8624 bdev_lock_error_cleanup_cb); 8625 return; 8626 } 8627 8628 /* All channels have locked this range and no I/O overlapping the range 8629 * are outstanding! Set the owner_ch for the range object for the 8630 * locking channel, so that this channel will know that it is allowed 8631 * to write to this range. 8632 */ 8633 ctx->owner_range->owner_ch = ctx->range.owner_ch; 8634 ctx->cb_fn(ctx->cb_arg, status); 8635 8636 /* Don't free the ctx here. Its range is in the bdev's global list of 8637 * locked ranges still, and will be removed and freed when this range 8638 * is later unlocked. 8639 */ 8640 } 8641 8642 static int 8643 bdev_lock_lba_range_check_io(void *_i) 8644 { 8645 struct spdk_bdev_channel_iter *i = _i; 8646 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 8647 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8648 struct locked_lba_range_ctx *ctx = i->ctx; 8649 struct lba_range *range = ctx->current_range; 8650 struct spdk_bdev_io *bdev_io; 8651 8652 spdk_poller_unregister(&ctx->poller); 8653 8654 /* The range is now in the locked_ranges, so no new IO can be submitted to this 8655 * range. But we need to wait until any outstanding IO overlapping with this range 8656 * are completed. 8657 */ 8658 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 8659 if (bdev_io_range_is_locked(bdev_io, range)) { 8660 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 8661 return SPDK_POLLER_BUSY; 8662 } 8663 } 8664 8665 spdk_bdev_for_each_channel_continue(i, 0); 8666 return SPDK_POLLER_BUSY; 8667 } 8668 8669 static void 8670 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8671 struct spdk_io_channel *_ch, void *_ctx) 8672 { 8673 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8674 struct locked_lba_range_ctx *ctx = _ctx; 8675 struct lba_range *range; 8676 8677 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 8678 if (range->length == ctx->range.length && 8679 range->offset == ctx->range.offset && 8680 range->locked_ctx == ctx->range.locked_ctx) { 8681 /* This range already exists on this channel, so don't add 8682 * it again. This can happen when a new channel is created 8683 * while the for_each_channel operation is in progress. 8684 * Do not check for outstanding I/O in that case, since the 8685 * range was locked before any I/O could be submitted to the 8686 * new channel. 
8687 */ 8688 spdk_bdev_for_each_channel_continue(i, 0); 8689 return; 8690 } 8691 } 8692 8693 range = calloc(1, sizeof(*range)); 8694 if (range == NULL) { 8695 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 8696 return; 8697 } 8698 8699 range->length = ctx->range.length; 8700 range->offset = ctx->range.offset; 8701 range->locked_ctx = ctx->range.locked_ctx; 8702 ctx->current_range = range; 8703 if (ctx->range.owner_ch == ch) { 8704 /* This is the range object for the channel that will hold 8705 * the lock. Store it in the ctx object so that we can easily 8706 * set its owner_ch after the lock is finally acquired. 8707 */ 8708 ctx->owner_range = range; 8709 } 8710 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 8711 bdev_lock_lba_range_check_io(i); 8712 } 8713 8714 static void 8715 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 8716 { 8717 assert(spdk_get_thread() == spdk_io_channel_get_thread(ctx->range.owner_ch->channel)); 8718 8719 /* We will add a copy of this range to each channel now. */ 8720 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 8721 bdev_lock_lba_range_cb); 8722 } 8723 8724 static bool 8725 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 8726 { 8727 struct lba_range *r; 8728 8729 TAILQ_FOREACH(r, tailq, tailq) { 8730 if (bdev_lba_range_overlapped(range, r)) { 8731 return true; 8732 } 8733 } 8734 return false; 8735 } 8736 8737 static int 8738 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 8739 uint64_t offset, uint64_t length, 8740 lock_range_cb cb_fn, void *cb_arg) 8741 { 8742 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8743 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8744 struct locked_lba_range_ctx *ctx; 8745 8746 if (cb_arg == NULL) { 8747 SPDK_ERRLOG("cb_arg must not be NULL\n"); 8748 return -EINVAL; 8749 } 8750 8751 ctx = calloc(1, sizeof(*ctx)); 8752 if (ctx == NULL) { 8753 return -ENOMEM; 8754 } 8755 8756 ctx->range.offset = offset; 8757 ctx->range.length = length; 8758 ctx->range.owner_ch = ch; 8759 ctx->range.locked_ctx = cb_arg; 8760 ctx->bdev = bdev; 8761 ctx->cb_fn = cb_fn; 8762 ctx->cb_arg = cb_arg; 8763 8764 spdk_spin_lock(&bdev->internal.spinlock); 8765 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 8766 /* There is an active lock overlapping with this range. 8767 * Put it on the pending list until this range no 8768 * longer overlaps with another. 8769 */ 8770 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 8771 } else { 8772 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 8773 bdev_lock_lba_range_ctx(bdev, ctx); 8774 } 8775 spdk_spin_unlock(&bdev->internal.spinlock); 8776 return 0; 8777 } 8778 8779 static void 8780 bdev_lock_lba_range_ctx_msg(void *_ctx) 8781 { 8782 struct locked_lba_range_ctx *ctx = _ctx; 8783 8784 bdev_lock_lba_range_ctx(ctx->bdev, ctx); 8785 } 8786 8787 static void 8788 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8789 { 8790 struct locked_lba_range_ctx *ctx = _ctx; 8791 struct locked_lba_range_ctx *pending_ctx; 8792 struct lba_range *range, *tmp; 8793 8794 spdk_spin_lock(&bdev->internal.spinlock); 8795 /* Check if there are any pending locked ranges that overlap with this range 8796 * that was just unlocked. If there are, check that it doesn't overlap with any 8797 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 8798 * the lock process. 
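	 * A pending range that is ready to proceed is moved onto locked_ranges before
	 * its owner thread is messaged, so that later lock requests see it and queue
	 * behind it instead of racing with it.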
8799 */ 8800 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 8801 if (bdev_lba_range_overlapped(range, &ctx->range) && 8802 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 8803 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 8804 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 8805 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 8806 spdk_thread_send_msg(spdk_io_channel_get_thread(pending_ctx->range.owner_ch->channel), 8807 bdev_lock_lba_range_ctx_msg, pending_ctx); 8808 } 8809 } 8810 spdk_spin_unlock(&bdev->internal.spinlock); 8811 8812 ctx->cb_fn(ctx->cb_arg, status); 8813 free(ctx); 8814 } 8815 8816 static void 8817 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8818 struct spdk_io_channel *_ch, void *_ctx) 8819 { 8820 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8821 struct locked_lba_range_ctx *ctx = _ctx; 8822 TAILQ_HEAD(, spdk_bdev_io) io_locked; 8823 struct spdk_bdev_io *bdev_io; 8824 struct lba_range *range; 8825 8826 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 8827 if (ctx->range.offset == range->offset && 8828 ctx->range.length == range->length && 8829 ctx->range.locked_ctx == range->locked_ctx) { 8830 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 8831 free(range); 8832 break; 8833 } 8834 } 8835 8836 /* Note: we should almost always be able to assert that the range specified 8837 * was found. But there are some very rare corner cases where a new channel 8838 * gets created simultaneously with a range unlock, where this function 8839 * would execute on that new channel and wouldn't have the range. 8840 * We also use this to clean up range allocations when a later allocation 8841 * fails in the locking path. 8842 * So we can't actually assert() here. 8843 */ 8844 8845 /* Swap the locked IO into a temporary list, and then try to submit them again. 8846 * We could hyper-optimize this to only resubmit locked I/O that overlap 8847 * with the range that was just unlocked, but this isn't a performance path so 8848 * we go for simplicity here. 8849 */ 8850 TAILQ_INIT(&io_locked); 8851 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 8852 while (!TAILQ_EMPTY(&io_locked)) { 8853 bdev_io = TAILQ_FIRST(&io_locked); 8854 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 8855 bdev_io_submit(bdev_io); 8856 } 8857 8858 spdk_bdev_for_each_channel_continue(i, 0); 8859 } 8860 8861 static int 8862 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 8863 uint64_t offset, uint64_t length, 8864 lock_range_cb cb_fn, void *cb_arg) 8865 { 8866 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8867 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8868 struct locked_lba_range_ctx *ctx; 8869 struct lba_range *range; 8870 bool range_found = false; 8871 8872 /* Let's make sure the specified channel actually has a lock on 8873 * the specified range. Note that the range must match exactly. 8874 */ 8875 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 8876 if (range->offset == offset && range->length == length && 8877 range->owner_ch == ch && range->locked_ctx == cb_arg) { 8878 range_found = true; 8879 break; 8880 } 8881 } 8882 8883 if (!range_found) { 8884 return -EINVAL; 8885 } 8886 8887 spdk_spin_lock(&bdev->internal.spinlock); 8888 /* We confirmed that this channel has locked the specified range. 
To 8889 * start the unlock the process, we find the range in the bdev's locked_ranges 8890 * and remove it. This ensures new channels don't inherit the locked range. 8891 * Then we will send a message to each channel (including the one specified 8892 * here) to remove the range from its per-channel list. 8893 */ 8894 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 8895 if (range->offset == offset && range->length == length && 8896 range->locked_ctx == cb_arg) { 8897 break; 8898 } 8899 } 8900 if (range == NULL) { 8901 assert(false); 8902 spdk_spin_unlock(&bdev->internal.spinlock); 8903 return -EINVAL; 8904 } 8905 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 8906 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 8907 spdk_spin_unlock(&bdev->internal.spinlock); 8908 8909 ctx->cb_fn = cb_fn; 8910 ctx->cb_arg = cb_arg; 8911 8912 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 8913 bdev_unlock_lba_range_cb); 8914 return 0; 8915 } 8916 8917 int 8918 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 8919 int array_size) 8920 { 8921 if (!bdev) { 8922 return -EINVAL; 8923 } 8924 8925 if (bdev->fn_table->get_memory_domains) { 8926 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 8927 } 8928 8929 return 0; 8930 } 8931 8932 struct spdk_bdev_for_each_io_ctx { 8933 void *ctx; 8934 spdk_bdev_io_fn fn; 8935 spdk_bdev_for_each_io_cb cb; 8936 }; 8937 8938 static void 8939 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8940 struct spdk_io_channel *io_ch, void *_ctx) 8941 { 8942 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 8943 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 8944 struct spdk_bdev_io *bdev_io; 8945 int rc = 0; 8946 8947 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 8948 rc = ctx->fn(ctx->ctx, bdev_io); 8949 if (rc != 0) { 8950 break; 8951 } 8952 } 8953 8954 spdk_bdev_for_each_channel_continue(i, rc); 8955 } 8956 8957 static void 8958 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 8959 { 8960 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 8961 8962 ctx->cb(ctx->ctx, status); 8963 8964 free(ctx); 8965 } 8966 8967 void 8968 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 8969 spdk_bdev_for_each_io_cb cb) 8970 { 8971 struct spdk_bdev_for_each_io_ctx *ctx; 8972 8973 assert(fn != NULL && cb != NULL); 8974 8975 ctx = calloc(1, sizeof(*ctx)); 8976 if (ctx == NULL) { 8977 SPDK_ERRLOG("Failed to allocate context.\n"); 8978 cb(_ctx, -ENOMEM); 8979 return; 8980 } 8981 8982 ctx->ctx = _ctx; 8983 ctx->fn = fn; 8984 ctx->cb = cb; 8985 8986 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 8987 bdev_for_each_io_done); 8988 } 8989 8990 void 8991 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 8992 { 8993 spdk_for_each_channel_continue(iter->i, status); 8994 } 8995 8996 static struct spdk_bdev * 8997 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 8998 { 8999 void *io_device = spdk_io_channel_iter_get_io_device(i); 9000 9001 return __bdev_from_io_dev(io_device); 9002 } 9003 9004 static void 9005 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 9006 { 9007 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 9008 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 9009 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 9010 9011 iter->i = i; 9012 
iter->fn(iter, bdev, ch, iter->ctx); 9013 } 9014 9015 static void 9016 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 9017 { 9018 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 9019 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 9020 9021 iter->i = i; 9022 iter->cpl(bdev, iter->ctx, status); 9023 9024 free(iter); 9025 } 9026 9027 void 9028 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 9029 void *ctx, spdk_bdev_for_each_channel_done cpl) 9030 { 9031 struct spdk_bdev_channel_iter *iter; 9032 9033 assert(bdev != NULL && fn != NULL && ctx != NULL); 9034 9035 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 9036 if (iter == NULL) { 9037 SPDK_ERRLOG("Unable to allocate iterator\n"); 9038 assert(false); 9039 return; 9040 } 9041 9042 iter->fn = fn; 9043 iter->cpl = cpl; 9044 iter->ctx = ctx; 9045 9046 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 9047 iter, bdev_each_channel_cpl); 9048 } 9049 9050 static void 9051 bdev_copy_do_write_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9052 { 9053 struct spdk_bdev_io *parent_io = cb_arg; 9054 9055 /* Check return status of write */ 9056 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 9057 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 9058 spdk_bdev_free_io(bdev_io); 9059 } 9060 9061 static void 9062 bdev_copy_do_write(void *_bdev_io) 9063 { 9064 struct spdk_bdev_io *bdev_io = _bdev_io; 9065 int rc; 9066 9067 /* Write blocks */ 9068 rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc, 9069 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs[0].iov_base, 9070 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks, 9071 bdev_io->u.bdev.num_blocks, bdev_copy_do_write_complete, bdev_io); 9072 9073 if (rc == -ENOMEM) { 9074 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write); 9075 } else if (rc != 0) { 9076 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 9077 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 9078 } 9079 } 9080 9081 static void 9082 bdev_copy_do_read_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9083 { 9084 struct spdk_bdev_io *parent_io = cb_arg; 9085 9086 /* Check return status of read */ 9087 if (!success) { 9088 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 9089 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 9090 spdk_bdev_free_io(bdev_io); 9091 return; 9092 } 9093 9094 spdk_bdev_free_io(bdev_io); 9095 9096 /* Do write */ 9097 bdev_copy_do_write(parent_io); 9098 } 9099 9100 static void 9101 bdev_copy_do_read(void *_bdev_io) 9102 { 9103 struct spdk_bdev_io *bdev_io = _bdev_io; 9104 int rc; 9105 9106 /* Read blocks */ 9107 rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc, 9108 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs[0].iov_base, 9109 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks, 9110 bdev_io->u.bdev.num_blocks, bdev_copy_do_read_complete, bdev_io); 9111 9112 if (rc == -ENOMEM) { 9113 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read); 9114 } else if (rc != 0) { 9115 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 9116 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 9117 } 9118 } 9119 9120 static void 9121 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 9122 { 9123 if (!success) { 9124 
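		/* Failed to get a bounce buffer for the emulated copy; fail the
		 * copy I/O without attempting the read/write sequence.
		 */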
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
		return;
	}

	bdev_copy_do_read(bdev_io);
}

int
spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		      uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks,
		      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (num_blocks == 0) {
		SPDK_ERRLOG("Can't copy 0 blocks\n");
		return -EINVAL;
	}

	if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) ||
	    !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) {
		SPDK_DEBUGLOG(bdev,
			      "Invalid offset or number of blocks: dst %" PRIu64 ", src %" PRIu64 ", count %" PRIu64 "\n",
			      dst_offset_blocks, src_offset_blocks, num_blocks);
		return -EINVAL;
	}

	bdev_io = bdev_channel_get_io(channel);
	if (!bdev_io) {
		return -ENOMEM;
	}

	bdev_io->internal.ch = channel;
	bdev_io->internal.desc = desc;
	bdev_io->type = SPDK_BDEV_IO_TYPE_COPY;

	bdev_io->u.bdev.offset_blocks = dst_offset_blocks;
	bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.memory_domain = NULL;
	bdev_io->u.bdev.memory_domain_ctx = NULL;
	bdev_io->u.bdev.iovs = NULL;
	bdev_io->u.bdev.iovcnt = 0;
	bdev_io->u.bdev.md_buf = NULL;
	bdev_io_init(bdev_io, bdev, cb_arg, cb);

	if (dst_offset_blocks == src_offset_blocks) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
		bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx);

		return 0;
	}

	/* If the backing bdev supports the copy operation directly, pass the request
	 * through to it. Otherwise, emulate the copy in the bdev layer.
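	 *
	 * The emulated path allocates a bounce buffer sized for the whole request with
	 * spdk_bdev_io_get_buf(), reads num_blocks starting at src_offset_blocks, and
	 * then writes the data to dst_offset_blocks, completing the parent I/O when
	 * the write finishes.
	 *
	 * Illustrative caller sketch (not compiled here; "copy_done_cb", "io_ch",
	 * "dst_lba", "src_lba", "nblocks" and "cb_ctx" are hypothetical):
	 *
	 *	rc = spdk_bdev_copy_blocks(desc, io_ch, dst_lba, src_lba, nblocks,
	 *				   copy_done_cb, cb_ctx);
	 *	if (rc != 0) {
	 *		SPDK_ERRLOG("copy submit failed: %s\n", spdk_strerror(-rc));
	 *	}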
9186 */ 9187 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 9188 bdev_io_submit(bdev_io); 9189 return 0; 9190 } 9191 9192 spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev)); 9193 9194 return 0; 9195 } 9196 9197 SPDK_LOG_REGISTER_COMPONENT(bdev) 9198 9199 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 9200 { 9201 struct spdk_trace_tpoint_opts opts[] = { 9202 { 9203 "BDEV_IO_START", TRACE_BDEV_IO_START, 9204 OWNER_BDEV, OBJECT_BDEV_IO, 1, 9205 { 9206 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 9207 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 9208 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 9209 { "len", SPDK_TRACE_ARG_TYPE_INT, 8 }, 9210 { "name", SPDK_TRACE_ARG_TYPE_STR, 40} 9211 } 9212 }, 9213 { 9214 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 9215 OWNER_BDEV, OBJECT_BDEV_IO, 0, 9216 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 9217 }, 9218 { 9219 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 9220 OWNER_BDEV, OBJECT_NONE, 1, 9221 { 9222 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 9223 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 9224 } 9225 }, 9226 { 9227 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 9228 OWNER_BDEV, OBJECT_NONE, 0, 9229 { 9230 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 9231 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 9232 } 9233 }, 9234 }; 9235 9236 9237 spdk_trace_register_owner(OWNER_BDEV, 'b'); 9238 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 9239 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 9240 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 9241 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 9242 } 9243