/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"
#include "spdk/dma.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define SPDK_BDEV_AUTO_EXAMINE			true
#define BUF_SMALL_POOL_SIZE			8191
#define BUF_LARGE_POOL_SIZE			1023
#define BUF_SMALL_CACHE_SIZE			128
#define BUF_LARGE_CACHE_SIZE			16
#define NOMEM_THRESHOLD_COUNT			8

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000

/* The maximum number of children requests for a UNMAP or WRITE ZEROES command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000

/* The maximum number of children requests for a COPY command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8)

#define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \
	log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev)
#ifdef DEBUG
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \
	log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev)
#else
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0)
#endif

static void log_already_claimed(enum spdk_log_level level, const int line, const char *func,
				const char *detail, struct spdk_bdev *bdev);

SPDK_LOG_DEPRECATION_REGISTER(vtune_support, "Intel(R) VTune integration", "SPDK 23.05", 0);

static const char *qos_rpc_type[] = {"rw_ios_per_sec",
				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
				    };

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

RB_HEAD(bdev_name_tree, spdk_bdev_name);

static int
bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
{
	return strcmp(name1->name, name2->name);
}

RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	void *zero_buffer;

	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

	struct spdk_bdev_list bdevs;
	struct bdev_name_tree bdev_names;

	bool init_complete;
	bool module_init_complete;

	struct spdk_spinlock spinlock;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain *domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
	.init_complete = false,
	.module_init_complete = false,
};

static void
__attribute__((constructor))
_bdev_init(void)
{
	spdk_spin_init(&g_bdev_mgr.spinlock);
}

typedef void (*lock_range_cb)(void *ctx, int status);

typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);

struct lba_range {
	uint64_t			offset;
	uint64_t			length;
	void				*locked_ctx;
	struct spdk_bdev_channel	*owner_ch;
	TAILQ_ENTRY(lba_range)		tailq;
};

static struct spdk_bdev_opts g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
	.small_buf_pool_size = BUF_SMALL_POOL_SIZE,
	.large_buf_pool_size = BUF_LARGE_POOL_SIZE,
};

static spdk_bdev_init_cb	g_init_cb_fn = NULL;
static void			*g_init_cb_arg = NULL;

static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
static void			*g_fini_cb_arg = NULL;
static struct spdk_thread	*g_fini_thread = NULL;

struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
	 * For remaining bytes, allowed to run negative if an I/O is submitted when
	 * some bytes are remaining, but the I/O is bigger than that amount. The
	 * excess will be deducted from the next timeslice.
	 */
	int64_t remaining_this_timeslice;

	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO. */
	bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to update the quota for the submitted IO. */
	void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

struct spdk_bdev_qos {
	/** Rate limit settings and state, one entry per rate limit type. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Queue of I/O waiting to be issued. */
	bdev_io_tailq_t queued;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache. Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t per_thread_cache_count;
	uint32_t bdev_io_cache_size;

	struct spdk_iobuf_channel iobuf;

	TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * queue their I/O awaiting retry here. This makes it possible to retry sending
 * I/O to one bdev after I/O from another bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel *shared_ch;

	/* Refcount of bdev channels using this resource */
	uint32_t ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev *bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel *channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat *stat;

	/*
	 * Count of I/O submitted to the underlying dev module through this channel
	 * and waiting for completion.
	 */
	uint64_t io_outstanding;

	/*
	 * List of all submitted I/Os including I/O that are generated via splitting.
	 */
	bdev_io_tailq_t io_submitted;

	/*
	 * List of spdk_bdev_io that are currently queued because they write to a locked
	 * LBA range.
	 */
	bdev_io_tailq_t io_locked;

	uint32_t flags;

	struct spdk_histogram_data *histogram;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t start_tsc;
	uint64_t interval_tsc;
	__itt_string_handle *handle;
	struct spdk_bdev_io_stat *prev_stat;
#endif

	bdev_io_tailq_t queued_resets;

	lba_range_tailq_t locked_ranges;
};

struct media_event_entry {
	struct spdk_bdev_media_event event;
	TAILQ_ENTRY(media_event_entry) tailq;
};

#define MEDIA_EVENT_POOL_SIZE 64

struct spdk_bdev_desc {
	struct spdk_bdev *bdev;
	struct spdk_thread *thread;
	struct {
		spdk_bdev_event_cb_t event_fn;
		void *ctx;
	} callback;
	bool closed;
	bool write;
	bool memory_domains_supported;
	struct spdk_spinlock spinlock;
	uint32_t refs;
	TAILQ_HEAD(, media_event_entry) pending_media_events;
	TAILQ_HEAD(, media_event_entry) free_media_events;
	struct media_event_entry *media_events_buffer;
	TAILQ_ENTRY(spdk_bdev_desc) link;

	uint64_t timeout_in_sec;
	spdk_bdev_io_timeout_cb cb_fn;
	void *cb_arg;
	struct spdk_poller *io_timeout_poller;
	struct spdk_bdev_module_claim *claim;
};

struct spdk_bdev_iostat_ctx {
	struct spdk_bdev_io_stat *stat;
	spdk_bdev_get_device_stat_cb cb;
	void *cb_arg;
};

struct set_qos_limit_ctx {
	void (*cb_fn)(void *cb_arg, int status);
	void *cb_arg;
	struct spdk_bdev *bdev;
};

struct spdk_bdev_channel_iter {
	spdk_bdev_for_each_channel_msg fn;
	spdk_bdev_for_each_channel_done cpl;
	struct spdk_io_channel_iter *i;
	void *ctx;
};

struct spdk_bdev_io_error_stat {
	uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS];
};

/*
 * Helper macros: a bdev is registered as an io_device at the address of the
 * bdev structure plus one byte, keeping the io_device handle unique to the
 * bdev while remaining distinct from the bdev pointer itself;
 * __bdev_from_io_dev() undoes that offset. The __io_ch_to_* macros fetch the
 * per-channel context structures.
 */
#define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))
#define __io_ch_to_bdev_ch(io_ch)	((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch))
#define __io_ch_to_bdev_mgmt_ch(io_ch)	((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch))

static inline void bdev_io_complete(void *ctx);

static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
static void bdev_write_zero_buffer_next(void *_bdev_io);

static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				struct spdk_io_channel *ch, void *_ctx);
static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status);

static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				     struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
				     uint64_t num_blocks,
				     struct spdk_memory_domain *domain, void *domain_ctx,
				     spdk_bdev_io_completion_cb cb, void *cb_arg);
static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				      struct iovec *iov, int iovcnt, void *md_buf,
				      uint64_t offset_blocks, uint64_t num_blocks,
				      struct spdk_memory_domain *domain, void *domain_ctx,
				      spdk_bdev_io_completion_cb cb, void *cb_arg);

static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
			       uint64_t offset, uint64_t length,
			       lock_range_cb cb_fn, void *cb_arg);

static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
				 uint64_t offset, uint64_t length,
				 lock_range_cb cb_fn, void *cb_arg);

static inline void bdev_io_complete(void *ctx);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);

static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
static void claim_reset(struct spdk_bdev *bdev);

#define bdev_get_ext_io_opt(opts, field, defval) \
	(((opts) != NULL && offsetof(struct spdk_bdev_ext_io_opts, field) + \
	  sizeof((opts)->field) <= sizeof(*(opts))) ? (opts)->field : (defval))

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(small_buf_pool_size);
	SET_FIELD(large_buf_pool_size);

	/* Do not remove this statement. Always update it when adding a new field,
	 * and do not forget to add the SET_FIELD statement for the new field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}
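/*
 * Illustrative usage of spdk_bdev_get_opts()/spdk_bdev_set_opts() (defined
 * just below); the pool size value is only an example. Passing sizeof(opts)
 * lets SET_FIELD() copy only the fields both sides know about:
 *
 *	struct spdk_bdev_opts opts = {};
 *
 *	spdk_bdev_get_opts(&opts, sizeof(opts));
 *	opts.bdev_io_pool_size = 2 * SPDK_BDEV_IO_POOL_SIZE;
 *	if (spdk_bdev_set_opts(&opts) != 0) {
 *		SPDK_ERRLOG("failed to set bdev options\n");
 *	}
 */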
SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_small_buf_pool_size, "spdk_bdev_opts.small_buf_pool_size",
			      "v23.05", 0);
SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_large_buf_pool_size, "spdk_bdev_opts.large_buf_pool_size",
			      "v23.05", 0);

int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	struct spdk_iobuf_opts iobuf_opts;
	uint32_t min_pool_size;
	int rc;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
	 */
	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
	if (opts->bdev_io_pool_size < min_pool_size) {
		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
			    spdk_thread_get_count());
		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
		return -1;
	}

	if (opts->small_buf_pool_size != BUF_SMALL_POOL_SIZE) {
		SPDK_LOG_DEPRECATED(bdev_opts_small_buf_pool_size);
	}
	if (opts->large_buf_pool_size != BUF_LARGE_POOL_SIZE) {
		SPDK_LOG_DEPRECATED(bdev_opts_large_buf_pool_size);
	}

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
		g_bdev_opts.field = opts->field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(small_buf_pool_size);
	SET_FIELD(large_buf_pool_size);

	spdk_iobuf_get_opts(&iobuf_opts);
	iobuf_opts.small_pool_count = opts->small_buf_pool_size;
	iobuf_opts.large_pool_count = opts->large_buf_pool_size;

	rc = spdk_iobuf_set_opts(&iobuf_opts);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to set iobuf opts\n");
		return -1;
	}

	g_bdev_opts.opts_size = opts->opts_size;

#undef SET_FIELD

	return 0;
}
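/*
 * Worked example for the minimum-pool-size check in spdk_bdev_set_opts()
 * above: with the default bdev_io_cache_size of 256 and 4 existing threads,
 * min_pool_size is 256 * (4 + 1) = 1280, so any bdev_io_pool_size below 1280
 * is rejected.
 */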
static struct spdk_bdev *
bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev_name find;
	struct spdk_bdev_name *res;

	find.name = (char *)bdev_name;
	res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find);
	if (res != NULL) {
		return res->bdev;
	}

	return NULL;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev *bdev;

	spdk_spin_lock(&g_bdev_mgr.spinlock);
	bdev = bdev_get_by_name(bdev_name);
	spdk_spin_unlock(&g_bdev_mgr.spinlock);

	return bdev;
}

struct bdev_io_status_string {
	enum spdk_bdev_io_status status;
	const char *str;
};

static const struct bdev_io_status_string bdev_io_status_strings[] = {
	{ SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" },
	{ SPDK_BDEV_IO_STATUS_ABORTED, "aborted" },
	{ SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" },
	{ SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" },
	{ SPDK_BDEV_IO_STATUS_NOMEM, "nomem" },
	{ SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" },
	{ SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" },
	{ SPDK_BDEV_IO_STATUS_FAILED, "failed" },
	{ SPDK_BDEV_IO_STATUS_PENDING, "pending" },
	{ SPDK_BDEV_IO_STATUS_SUCCESS, "success" },
};

static const char *
bdev_io_status_get_string(enum spdk_bdev_io_status status)
{
	uint32_t i;

	for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) {
		if (bdev_io_status_strings[i].status == status) {
			return bdev_io_status_strings[i].str;
		}
	}

	return "reserved";
}

struct spdk_bdev_wait_for_examine_ctx {
	struct spdk_poller *poller;
	spdk_bdev_wait_for_examine_cb cb_fn;
	void *cb_arg;
};

static bool bdev_module_all_actions_completed(void);

static int
bdev_wait_for_examine_cb(void *arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx = arg;

	if (!bdev_module_all_actions_completed()) {
		return SPDK_POLLER_IDLE;
	}

	spdk_poller_unregister(&ctx->poller);
	ctx->cb_fn(ctx->cb_arg);
	free(ctx);

	return SPDK_POLLER_BUSY;
}

int
spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0);

	return 0;
}
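/*
 * Illustrative usage of spdk_bdev_wait_for_examine(); the callback name is
 * made up for the example:
 *
 *	static void
 *	examine_done(void *ctx)
 *	{
 *		... all module examine callbacks have finished ...
 *	}
 *
 *	if (spdk_bdev_wait_for_examine(examine_done, NULL) != 0) {
 *		... the only failure mode is -ENOMEM ...
 *	}
 */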
struct spdk_bdev_examine_item {
	char *name;
	TAILQ_ENTRY(spdk_bdev_examine_item) link;
};

TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);

struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
			g_bdev_examine_allowlist);

static inline bool
bdev_examine_allowlist_check(const char *name)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		if (strcmp(name, item->name) == 0) {
			return true;
		}
	}
	return false;
}

static inline void
bdev_examine_allowlist_free(void)
{
	struct spdk_bdev_examine_item *item;
	while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
		item = TAILQ_FIRST(&g_bdev_examine_allowlist);
		TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
		free(item->name);
		free(item);
	}
}

static inline bool
bdev_in_examine_allowlist(struct spdk_bdev *bdev)
{
	struct spdk_bdev_alias *tmp;
	if (bdev_examine_allowlist_check(bdev->name)) {
		return true;
	}
	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
		if (bdev_examine_allowlist_check(tmp->alias.name)) {
			return true;
		}
	}
	return false;
}

static inline bool
bdev_ok_to_examine(struct spdk_bdev *bdev)
{
	if (g_bdev_opts.bdev_auto_examine) {
		return true;
	} else {
		return bdev_in_examine_allowlist(bdev);
	}
}

static void
bdev_examine(struct spdk_bdev *bdev)
{
	struct spdk_bdev_module *module;
	struct spdk_bdev_module_claim *claim, *tmpclaim;
	uint32_t action;

	if (!bdev_ok_to_examine(bdev)) {
		return;
	}

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (module->examine_config) {
			spdk_spin_lock(&module->internal.spinlock);
			action = module->internal.action_in_progress;
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			module->examine_config(bdev);
			if (action != module->internal.action_in_progress) {
				SPDK_ERRLOG("examine_config for module %s did not call "
					    "spdk_bdev_module_examine_done()\n", module->name);
			}
		}
	}

	spdk_spin_lock(&bdev->internal.spinlock);

	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		/* Examine by all bdev modules */
		TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (module->examine_disk) {
				spdk_spin_lock(&module->internal.spinlock);
				module->internal.action_in_progress++;
				spdk_spin_unlock(&module->internal.spinlock);
				spdk_spin_unlock(&bdev->internal.spinlock);
				module->examine_disk(bdev);
				spdk_spin_lock(&bdev->internal.spinlock);
			}
		}
		break;
	case SPDK_BDEV_CLAIM_EXCL_WRITE:
		/* Examine by the one bdev module with a v1 claim */
		module = bdev->internal.claim.v1.module;
		if (module->examine_disk) {
			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			return;
		}
		break;
	default:
		/* Examine by all bdev modules with a v2 claim */
		assert(claim_type_is_v2(bdev->internal.claim_type));
		/*
		 * Removal of tailq nodes while iterating can cause the iteration to jump out of the
		 * list, perhaps accessing freed memory. Without protection, this could happen
		 * while the lock is dropped during the examine callback.
		 */
		bdev->internal.examine_in_progress++;

		TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
			module = claim->module;

			if (module == NULL) {
				/* This is a vestigial claim, held by examine_count */
				continue;
			}

			if (module->examine_disk == NULL) {
				continue;
			}

			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);

			/* Call examine_disk without holding internal.spinlock. */
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			spdk_spin_lock(&bdev->internal.spinlock);
		}

		assert(bdev->internal.examine_in_progress > 0);
		bdev->internal.examine_in_progress--;
		if (bdev->internal.examine_in_progress == 0) {
			/* Remove any claims that were released during examine_disk */
			TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) {
				if (claim->desc != NULL) {
					continue;
				}

				TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link);
				free(claim);
			}
			if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
				claim_reset(bdev);
			}
		}
	}

	spdk_spin_unlock(&bdev->internal.spinlock);
}
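/*
 * Add a bdev name to the manual-examine allowlist and, if a bdev with that
 * name is already registered, examine it immediately. Per the checks below,
 * this is only valid on the app thread and only when auto-examine is
 * disabled.
 */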
int
spdk_bdev_examine(const char *name)
{
	struct spdk_bdev *bdev;
	struct spdk_bdev_examine_item *item;
	struct spdk_thread *thread = spdk_get_thread();

	if (spdk_unlikely(spdk_thread_get_app_thread() != thread)) {
		SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread,
			    thread ? spdk_thread_get_name(thread) : "null");
		return -EINVAL;
	}

	if (g_bdev_opts.bdev_auto_examine) {
		SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n");
		return -EINVAL;
	}

	if (bdev_examine_allowlist_check(name)) {
		SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
		return -EEXIST;
	}

	item = calloc(1, sizeof(*item));
	if (!item) {
		return -ENOMEM;
	}
	item->name = strdup(name);
	if (!item->name) {
		free(item);
		return -ENOMEM;
	}
	TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);

	bdev = spdk_bdev_get_by_name(name);
	if (bdev) {
		bdev_examine(bdev);
	}
	return 0;
}

static inline void
bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "method", "bdev_examine");
		spdk_json_write_named_object_begin(w, "params");
		spdk_json_write_named_string(w, "name", item->name);
		spdk_json_write_object_end(w);
		spdk_json_write_object_end(w);
	}
}

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, internal.link);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, internal.link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static inline bool
bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.memory_domain;
}

void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	struct iovec *iovs;

	if (bdev_io->u.bdev.iovs == NULL) {
		bdev_io->u.bdev.iovs = &bdev_io->iov;
		bdev_io->u.bdev.iovcnt = 1;
	}

	iovs = bdev_io->u.bdev.iovs;

	assert(iovs != NULL);
	assert(bdev_io->u.bdev.iovcnt >= 1);

	iovs[0].iov_base = buf;
	iovs[0].iov_len = len;
}

void
spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
	bdev_io->u.bdev.md_buf = md_buf;
}

static bool
_is_buf_allocated(const struct iovec *iovs)
{
	if (iovs == NULL) {
		return false;
	}

	return iovs[0].iov_base != NULL;
}
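/*
 * Alignment check for caller-provided buffers. The mask test below assumes a
 * power-of-two alignment (as produced by spdk_bdev_get_buf_align()); e.g.,
 * with a 4096-byte alignment, any iov_base that is not 4 KiB aligned fails
 * the check.
 */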
static bool
_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
{
	int i;
	uintptr_t iov_base;

	if (spdk_likely(alignment == 1)) {
		return true;
	}

	for (i = 0; i < iovcnt; i++) {
		iov_base = (uintptr_t)iovs[i].iov_base;
		if ((iov_base & (alignment - 1)) != 0) {
			return false;
		}
	}

	return true;
}

static void
bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status)
{
	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
	void *buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		buf = bdev_io->internal.buf;
		bdev_io->internal.buf = NULL;
		bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
		bdev_io->internal.get_aux_buf_cb = NULL;
	} else {
		assert(bdev_io->internal.get_buf_cb != NULL);
		bdev_io->internal.get_buf_cb(ch, bdev_io, status);
		bdev_io->internal.get_buf_cb = NULL;
	}
}

static void
_bdev_io_pull_buffer_cpl(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;

	if (rc) {
		SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	bdev_io_get_buf_complete(bdev_io, !rc);
}

static void
_bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	int rc = 0;

	/* save original md_buf */
	bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf;
	bdev_io->internal.orig_md_iov.iov_len = len;
	bdev_io->internal.bounce_md_iov.iov_base = md_buf;
	bdev_io->internal.bounce_md_iov.iov_len = len;
	/* set bounce md_buf */
	bdev_io->u.bdev.md_buf = md_buf;

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		if (bdev_io_use_memory_domain(bdev_io)) {
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  &bdev_io->internal.orig_md_iov, 1,
							  &bdev_io->internal.bounce_md_iov, 1,
							  bdev_io->internal.data_transfer_cpl,
							  bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n",
				    spdk_memory_domain_get_dma_device_id(bdev_io->internal.memory_domain), rc);
		} else {
			memcpy(md_buf, bdev_io->internal.orig_md_iov.iov_base, bdev_io->internal.orig_md_iov.iov_len);
		}
	}

	assert(bdev_io->internal.data_transfer_cpl);
	bdev_io->internal.data_transfer_cpl(bdev_io, rc);
}

static void
_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len;
	void *buf;

	if (spdk_bdev_is_md_separate(bdev)) {
		buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len;
		md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;

		assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0);

		if (bdev_io->u.bdev.md_buf != NULL) {
			_bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len);
			return;
		} else {
			spdk_bdev_io_set_md_buf(bdev_io, buf, md_len);
		}
	}

	bdev_io_get_buf_complete(bdev_io, true);
}
static void
_bdev_io_pull_bounce_data_buf_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;

	if (rc) {
		SPDK_ERRLOG("Failed to get data buffer\n");
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
		return;
	}

	_bdev_io_set_md_buf(bdev_io);
}

static void
_bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len,
			      bdev_copy_bounce_buffer_cpl cpl_cb)
{
	int rc = 0;

	bdev_io->internal.data_transfer_cpl = cpl_cb;
	/* save original iovec */
	bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs;
	bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt;
	/* set bounce iov */
	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov;
	bdev_io->u.bdev.iovcnt = 1;
	/* set bounce buffer for this operation */
	bdev_io->u.bdev.iovs[0].iov_base = buf;
	bdev_io->u.bdev.iovs[0].iov_len = len;
	/* if this is write path, copy data from original buffer to bounce buffer */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		if (bdev_io_use_memory_domain(bdev_io)) {
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  bdev_io->internal.orig_iovs,
							  (uint32_t) bdev_io->internal.orig_iovcnt,
							  bdev_io->u.bdev.iovs, 1,
							  _bdev_io_pull_bounce_data_buf_done,
							  bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			SPDK_ERRLOG("Failed to pull data from memory domain %s\n",
				    spdk_memory_domain_get_dma_device_id(bdev_io->internal.memory_domain));
		} else {
			spdk_copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt);
		}
	}

	_bdev_io_pull_bounce_data_buf_done(bdev_io, rc);
}

static void
_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	bool buf_allocated;
	uint64_t alignment;
	void *aligned_buf;

	bdev_io->internal.buf = buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		bdev_io_get_buf_complete(bdev_io, true);
		return;
	}

	alignment = spdk_bdev_get_buf_align(bdev);
	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
	aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));

	if (buf_allocated) {
		_bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl);
		/* Continue in completion callback */
		return;
	} else {
		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
	}

	_bdev_io_set_md_buf(bdev_io);
}
static inline uint64_t
bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len, alignment;

	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;
	alignment = spdk_bdev_get_buf_align(bdev);

	return len + alignment + md_len;
}

static void
_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
{
	struct spdk_bdev_mgmt_channel *ch;

	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
	spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len));
}

static void
bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	assert(bdev_io->internal.buf != NULL);
	_bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len);
	bdev_io->internal.buf = NULL;
}

void
spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(buf != NULL);
	_bdev_io_put_buf(bdev_io, buf, len);
}

static void
bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	struct spdk_bdev *bdev = bdev_ch->bdev;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
	struct spdk_bdev_io *bdev_io;

	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 * Some drivers (such as nvme) cannot immediately take a new I/O in
		 * the context of a completion, because the resources for the I/O are
		 * not released until control returns to the bdev poller. Also, we
		 * may require several small I/O to complete before a larger I/O
		 * (that requires splitting) can be submitted.
		 */
		return;
	}

	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);
		bdev_io->internal.ch->io_outstanding++;
		shared_resource->io_outstanding++;
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
		bdev_io->internal.error.nvme.cdw0 = 0;
		bdev_io->num_retries++;
		bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
		if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
			break;
		}
	}
}

static inline void
_bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
			       struct spdk_bdev_shared_resource *shared_resource)
{
	assert(bdev_ch->io_outstanding > 0);
	assert(shared_resource->io_outstanding > 0);
	bdev_ch->io_outstanding--;
	shared_resource->io_outstanding--;
}
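/*
 * NOMEM handling: an I/O that the module rejected with
 * SPDK_BDEV_IO_STATUS_NOMEM is parked on nomem_io and retried once
 * io_outstanding drops to max(io_outstanding / 2,
 * io_outstanding - NOMEM_THRESHOLD_COUNT). For example, with 64 I/O
 * outstanding the retry threshold is max(32, 56) = 56; with only 10
 * outstanding it is max(5, 2) = 5.
 */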
static inline bool
_bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
		TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
		/*
		 * Wait for some of the outstanding I/O to complete before we
		 * retry any of the nomem_io. Normally we will wait for
		 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue
		 * depth channels we will instead wait for half to complete.
		 */
		shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
						   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);
		return true;
	}

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_ch_retry_io(bdev_ch);
	}

	return false;
}

static void
_bdev_io_complete_push_bounce_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	if (rc) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	/* We want to free the bounce buffer here since we know we're done with it (as opposed
	 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()).
	 */
	bdev_io_put_buf(bdev_io);

	/* Continue with IO completion flow */
	_bdev_io_decrement_outstanding(bdev_ch, shared_resource);
	if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) {
		return;
	}

	bdev_io_complete(bdev_io);
}

static inline void
_bdev_io_push_bounce_md_buffer(struct spdk_bdev_io *bdev_io)
{
	int rc = 0;

	/* do the same for metadata buffer */
	if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) {
		assert(spdk_bdev_is_md_separate(bdev_io->bdev));

		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
		    bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
			if (bdev_io_use_memory_domain(bdev_io)) {
				/* If memory domain is used then we need to call async push function */
				rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
								  bdev_io->internal.memory_domain_ctx,
								  &bdev_io->internal.orig_md_iov, 1,
								  &bdev_io->internal.bounce_md_iov, 1,
								  bdev_io->internal.data_transfer_cpl,
								  bdev_io);
				if (rc == 0) {
					/* Continue IO completion in async callback */
					return;
				}
				SPDK_ERRLOG("Failed to push md to memory domain %s\n",
					    spdk_memory_domain_get_dma_device_id(bdev_io->internal.memory_domain));
			} else {
				memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf,
				       bdev_io->internal.orig_md_iov.iov_len);
			}
		}
	}

	assert(bdev_io->internal.data_transfer_cpl);
	bdev_io->internal.data_transfer_cpl(bdev_io, rc);
}
static void
_bdev_io_push_bounce_data_buffer_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;

	assert(bdev_io->internal.data_transfer_cpl);

	if (rc) {
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
		return;
	}

	/* set original buffer for this io */
	bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt;
	bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs;
	/* disable bouncing buffer for this io */
	bdev_io->internal.orig_iovcnt = 0;
	bdev_io->internal.orig_iovs = NULL;

	_bdev_io_push_bounce_md_buffer(bdev_io);
}

static inline void
_bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb)
{
	int rc = 0;

	bdev_io->internal.data_transfer_cpl = cpl_cb;

	/* if this is read path, copy data from bounce buffer to original buffer */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
	    bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		if (bdev_io_use_memory_domain(bdev_io)) {
			/* If memory domain is used then we need to call async push function */
			rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  bdev_io->internal.orig_iovs,
							  (uint32_t)bdev_io->internal.orig_iovcnt,
							  &bdev_io->internal.bounce_iov, 1,
							  _bdev_io_push_bounce_data_buffer_done,
							  bdev_io);
			if (rc == 0) {
				/* Continue IO completion in async callback */
				return;
			}
			SPDK_ERRLOG("Failed to push data to memory domain %s\n",
				    spdk_memory_domain_get_dma_device_id(bdev_io->internal.memory_domain));
		} else {
			spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs,
					      bdev_io->internal.orig_iovcnt,
					      bdev_io->internal.bounce_iov.iov_base,
					      bdev_io->internal.bounce_iov.iov_len);
		}
	}

	_bdev_io_push_bounce_data_buffer_done(bdev_io, rc);
}
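/*
 * Summary of the bounce-buffer flow implemented above: when the caller's
 * buffers do not satisfy the bdev's alignment, an internal buffer is used
 * instead. On the write path, data (and separate metadata) is pulled from the
 * original buffers, or from the caller's memory domain, into the bounce
 * buffer before submission; on the read path it is pushed back to the
 * original buffers at completion, after which the original iovecs are
 * restored.
 */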
static void
bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf)
{
	struct spdk_bdev_io *bdev_io;

	bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf);
	_bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len);
}

static void
bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev_mgmt_channel *mgmt_ch;
	uint64_t max_len;
	void *buf;

	assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread());
	mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
	max_len = bdev_io_get_max_buf_len(bdev_io, len);

	if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) {
		SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len);
		bdev_io_get_buf_complete(bdev_io, false);
		return;
	}

	bdev_io->internal.buf_len = len;
	buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf,
			     bdev_io_get_iobuf_cb);
	if (buf != NULL) {
		_bdev_io_set_buf(bdev_io, buf, len);
	}
}

void
spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t alignment;

	assert(cb != NULL);
	bdev_io->internal.get_buf_cb = cb;

	alignment = spdk_bdev_get_buf_align(bdev);

	if (_is_buf_allocated(bdev_io->u.bdev.iovs) &&
	    _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) {
		/* Buffer already present and aligned */
		cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true);
		return;
	}

	bdev_io_get_buf(bdev_io, len);
}

static void
_bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
			      bool success)
{
	if (!success) {
		SPDK_ERRLOG("Failed to get data buffer, completing IO\n");
		bdev_io_complete(bdev_io);
	} else {
		bdev_io_submit(bdev_io);
	}
}

static void
_bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb,
			       uint64_t len)
{
	assert(cb != NULL);
	bdev_io->internal.get_buf_cb = cb;

	bdev_io_get_buf(bdev_io, len);
}

void
spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(cb != NULL);
	assert(bdev_io->internal.get_aux_buf_cb == NULL);
	bdev_io->internal.get_aux_buf_cb = cb;
	bdev_io_get_buf(bdev_io, len);
}

static int
bdev_module_get_max_ctx_size(void)
{
	struct spdk_bdev_module *bdev_module;
	int max_bdev_module_size = 0;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
			max_bdev_module_size = bdev_module->get_ctx_size();
		}
	}

	return max_bdev_module_size;
}
static void
bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	int i;
	struct spdk_bdev_qos *qos = bdev->internal.qos;
	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	if (!qos) {
		return;
	}

	spdk_bdev_get_qos_rate_limits(bdev, limits);

	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_set_qos_limit");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", bdev->name);
	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		if (limits[i] > 0) {
			spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]);
		}
	}
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

void
spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_module *bdev_module;
	struct spdk_bdev *bdev;

	assert(w != NULL);

	spdk_json_write_array_begin(w);

	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_set_options");
	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size);
	spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size);
	spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine);
	spdk_json_write_object_end(w);
	spdk_json_write_object_end(w);

	bdev_examine_allowlist_config_json(w);

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (bdev_module->config_json) {
			bdev_module->config_json(w);
		}
	}

	spdk_spin_lock(&g_bdev_mgr.spinlock);

	TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) {
		if (bdev->fn_table->write_config_json) {
			bdev->fn_table->write_config_json(bdev, w);
		}

		bdev_qos_config_json(bdev, w);
	}

	spdk_spin_unlock(&g_bdev_mgr.spinlock);

	/* This has to be last RPC in array to make sure all bdevs finished examine */
	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_wait_for_examine");
	spdk_json_write_object_end(w);

	spdk_json_write_array_end(w);
}

static void
bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;

	spdk_iobuf_channel_fini(&ch->iobuf);

	while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
		ch->per_thread_cache_count--;
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}

	assert(ch->per_thread_cache_count == 0);
}

static int
bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;
	uint32_t i;
	int rc;

	rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", BUF_SMALL_CACHE_SIZE, BUF_LARGE_CACHE_SIZE);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc));
		return -1;
	}

	STAILQ_INIT(&ch->per_thread_cache);
	ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;

	/* Pre-populate bdev_io cache to ensure this thread cannot be starved. */
	ch->per_thread_cache_count = 0;
	for (i = 0; i < ch->bdev_io_cache_size; i++) {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
		if (bdev_io == NULL) {
			SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n");
			assert(false);
			bdev_mgmt_channel_destroy(io_device, ctx_buf);
			return -1;
		}
		ch->per_thread_cache_count++;
		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
	}

	TAILQ_INIT(&ch->shared_resources);
	TAILQ_INIT(&ch->io_wait_queue);

	return 0;
}

static void
bdev_init_complete(int rc)
{
	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
	void *cb_arg = g_init_cb_arg;
	struct spdk_bdev_module *m;

	g_bdev_mgr.init_complete = true;
	g_init_cb_fn = NULL;
	g_init_cb_arg = NULL;

	/*
	 * For modules that need to know when subsystem init is complete,
	 * inform them now.
	 */
	if (rc == 0) {
		TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (m->init_complete) {
				m->init_complete();
			}
		}
	}

	cb_fn(cb_arg, rc);
}

static bool
bdev_module_all_actions_completed(void)
{
	struct spdk_bdev_module *m;

	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (m->internal.action_in_progress > 0) {
			return false;
		}
	}
	return true;
}

static void
bdev_module_action_complete(void)
{
	/*
	 * Don't finish bdev subsystem initialization if
	 * module pre-initialization is still in progress, or
	 * the subsystem has already been initialized.
	 */
	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
		return;
	}

	/*
	 * Check all bdev modules for inits/examinations in progress. If any
	 * exist, return immediately since we cannot finish bdev subsystem
	 * initialization until all are completed.
	 */
	if (!bdev_module_all_actions_completed()) {
		return;
	}

	/*
	 * Modules already finished initialization - now that all
	 * the bdev modules have finished their asynchronous I/O
	 * processing, the entire bdev layer can be marked as complete.
	 */
	bdev_init_complete(0);
}
static void
bdev_module_action_done(struct spdk_bdev_module *module)
{
	spdk_spin_lock(&module->internal.spinlock);
	assert(module->internal.action_in_progress > 0);
	module->internal.action_in_progress--;
	spdk_spin_unlock(&module->internal.spinlock);
	bdev_module_action_complete();
}

void
spdk_bdev_module_init_done(struct spdk_bdev_module *module)
{
	assert(module->async_init);
	bdev_module_action_done(module);
}

void
spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
{
	bdev_module_action_done(module);
}

/** The last initialized bdev module */
static struct spdk_bdev_module *g_resume_bdev_module = NULL;

static void
bdev_init_failed(void *cb_arg)
{
	struct spdk_bdev_module *module = cb_arg;

	spdk_spin_lock(&module->internal.spinlock);
	assert(module->internal.action_in_progress > 0);
	module->internal.action_in_progress--;
	spdk_spin_unlock(&module->internal.spinlock);
	bdev_init_complete(-1);
}

static int
bdev_modules_init(void)
{
	struct spdk_bdev_module *module;
	int rc = 0;

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		g_resume_bdev_module = module;
		if (module->async_init) {
			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress = 1;
			spdk_spin_unlock(&module->internal.spinlock);
		}
		rc = module->module_init();
		if (rc != 0) {
			/* Bump action_in_progress to prevent other modules from completing
			 * module initialization. Send a message to defer application shutdown
			 * until resources are cleaned up. */
			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress = 1;
			spdk_spin_unlock(&module->internal.spinlock);
			spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module);
			return rc;
		}
	}

	g_resume_bdev_module = NULL;
	return 0;
}
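/*
 * Illustrative: spdk_bdev_initialize() below completes asynchronously through
 * the supplied callback (the callback name is made up for the example):
 *
 *	static void
 *	bdev_subsystem_init_done(void *cb_arg, int rc)
 *	{
 *		... rc == 0 on success ...
 *	}
 *
 *	spdk_bdev_initialize(bdev_subsystem_init_done, NULL);
 */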
bdev_mgmt_channel_create, 1775 bdev_mgmt_channel_destroy, 1776 sizeof(struct spdk_bdev_mgmt_channel), 1777 "bdev_mgr"); 1778 1779 rc = bdev_modules_init(); 1780 g_bdev_mgr.module_init_complete = true; 1781 if (rc != 0) { 1782 SPDK_ERRLOG("bdev modules init failed\n"); 1783 return; 1784 } 1785 1786 bdev_module_action_complete(); 1787 } 1788 1789 static void 1790 bdev_mgr_unregister_cb(void *io_device) 1791 { 1792 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 1793 1794 if (g_bdev_mgr.bdev_io_pool) { 1795 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 1796 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 1797 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 1798 g_bdev_opts.bdev_io_pool_size); 1799 } 1800 1801 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 1802 } 1803 1804 spdk_free(g_bdev_mgr.zero_buffer); 1805 1806 bdev_examine_allowlist_free(); 1807 1808 cb_fn(g_fini_cb_arg); 1809 g_fini_cb_fn = NULL; 1810 g_fini_cb_arg = NULL; 1811 g_bdev_mgr.init_complete = false; 1812 g_bdev_mgr.module_init_complete = false; 1813 } 1814 1815 static void 1816 bdev_module_fini_iter(void *arg) 1817 { 1818 struct spdk_bdev_module *bdev_module; 1819 1820 /* FIXME: Handling initialization failures is broken now, 1821 * so we won't even try cleaning up after successfully 1822 * initialized modules. if module_init_complete is false, 1823 * just call spdk_bdev_mgr_unregister_cb 1824 */ 1825 if (!g_bdev_mgr.module_init_complete) { 1826 bdev_mgr_unregister_cb(NULL); 1827 return; 1828 } 1829 1830 /* Start iterating from the last touched module */ 1831 if (!g_resume_bdev_module) { 1832 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1833 } else { 1834 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 1835 internal.tailq); 1836 } 1837 1838 while (bdev_module) { 1839 if (bdev_module->async_fini) { 1840 /* Save our place so we can resume later. We must 1841 * save the variable here, before calling module_fini() 1842 * below, because in some cases the module may immediately 1843 * call spdk_bdev_module_fini_done() and re-enter 1844 * this function to continue iterating. */ 1845 g_resume_bdev_module = bdev_module; 1846 } 1847 1848 if (bdev_module->module_fini) { 1849 bdev_module->module_fini(); 1850 } 1851 1852 if (bdev_module->async_fini) { 1853 return; 1854 } 1855 1856 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 1857 internal.tailq); 1858 } 1859 1860 g_resume_bdev_module = NULL; 1861 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 1862 } 1863 1864 void 1865 spdk_bdev_module_fini_done(void) 1866 { 1867 if (spdk_get_thread() != g_fini_thread) { 1868 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 1869 } else { 1870 bdev_module_fini_iter(NULL); 1871 } 1872 } 1873 1874 static void 1875 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 1876 { 1877 struct spdk_bdev *bdev = cb_arg; 1878 1879 if (bdeverrno && bdev) { 1880 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 1881 bdev->name); 1882 1883 /* 1884 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 1885 * bdev; try to continue by manually removing this bdev from the list and continue 1886 * with the next bdev in the list. 
1887 */ 1888 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 1889 } 1890 1891 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 1892 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 1893 /* 1894 * Bdev module finish need to be deferred as we might be in the middle of some context 1895 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 1896 * after returning. 1897 */ 1898 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 1899 return; 1900 } 1901 1902 /* 1903 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 1904 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 1905 * to detect clean shutdown as opposed to run-time hot removal of the underlying 1906 * base bdevs. 1907 * 1908 * Also, walk the list in the reverse order. 1909 */ 1910 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1911 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1912 spdk_spin_lock(&bdev->internal.spinlock); 1913 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 1914 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 1915 spdk_spin_unlock(&bdev->internal.spinlock); 1916 continue; 1917 } 1918 spdk_spin_unlock(&bdev->internal.spinlock); 1919 1920 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 1921 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 1922 return; 1923 } 1924 1925 /* 1926 * If any bdev fails to unclaim underlying bdev properly, we may face the 1927 * case of bdev list consisting of claimed bdevs only (if claims are managed 1928 * correctly, this would mean there's a loop in the claims graph which is 1929 * clearly impossible). Warn and unregister last bdev on the list then. 1930 */ 1931 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1932 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1933 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 1934 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 1935 return; 1936 } 1937 } 1938 1939 static void 1940 bdev_module_fini_start_iter(void *arg) 1941 { 1942 struct spdk_bdev_module *bdev_module; 1943 1944 if (!g_resume_bdev_module) { 1945 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1946 } else { 1947 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 1948 } 1949 1950 while (bdev_module) { 1951 if (bdev_module->async_fini_start) { 1952 /* Save our place so we can resume later. We must 1953 * save the variable here, before calling fini_start() 1954 * below, because in some cases the module may immediately 1955 * call spdk_bdev_module_fini_start_done() and re-enter 1956 * this function to continue iterating. 
*/ 1957 g_resume_bdev_module = bdev_module; 1958 } 1959 1960 if (bdev_module->fini_start) { 1961 bdev_module->fini_start(); 1962 } 1963 1964 if (bdev_module->async_fini_start) { 1965 return; 1966 } 1967 1968 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 1969 } 1970 1971 g_resume_bdev_module = NULL; 1972 1973 bdev_finish_unregister_bdevs_iter(NULL, 0); 1974 } 1975 1976 void 1977 spdk_bdev_module_fini_start_done(void) 1978 { 1979 if (spdk_get_thread() != g_fini_thread) { 1980 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 1981 } else { 1982 bdev_module_fini_start_iter(NULL); 1983 } 1984 } 1985 1986 static void 1987 bdev_finish_wait_for_examine_done(void *cb_arg) 1988 { 1989 bdev_module_fini_start_iter(NULL); 1990 } 1991 1992 void 1993 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 1994 { 1995 int rc; 1996 1997 assert(cb_fn != NULL); 1998 1999 g_fini_thread = spdk_get_thread(); 2000 2001 g_fini_cb_fn = cb_fn; 2002 g_fini_cb_arg = cb_arg; 2003 2004 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2005 if (rc != 0) { 2006 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2007 bdev_finish_wait_for_examine_done(NULL); 2008 } 2009 } 2010 2011 struct spdk_bdev_io * 2012 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2013 { 2014 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2015 struct spdk_bdev_io *bdev_io; 2016 2017 if (ch->per_thread_cache_count > 0) { 2018 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2019 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2020 ch->per_thread_cache_count--; 2021 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2022 /* 2023 * Don't try to look for bdev_ios in the global pool if there are 2024 * waiters on bdev_ios - we don't want this caller to jump the line. 2025 */ 2026 bdev_io = NULL; 2027 } else { 2028 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2029 } 2030 2031 return bdev_io; 2032 } 2033 2034 void 2035 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2036 { 2037 struct spdk_bdev_mgmt_channel *ch; 2038 2039 assert(bdev_io != NULL); 2040 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2041 2042 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2043 2044 if (bdev_io->internal.buf != NULL) { 2045 bdev_io_put_buf(bdev_io); 2046 } 2047 2048 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2049 ch->per_thread_cache_count++; 2050 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2051 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2052 struct spdk_bdev_io_wait_entry *entry; 2053 2054 entry = TAILQ_FIRST(&ch->io_wait_queue); 2055 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2056 entry->cb_fn(entry->cb_arg); 2057 } 2058 } else { 2059 /* We should never have a full cache with entries on the io wait queue. 
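 * bdev_channel_get_io() never dips into the global pool while waiters exist,
 * and the branch above drains the wait queue whenever a bdev_io is returned
 * to a cache that still has room.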
*/ 2060 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2061 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2062 } 2063 } 2064 2065 static bool 2066 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2067 { 2068 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2069 2070 switch (limit) { 2071 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2072 return true; 2073 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2074 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2075 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2076 return false; 2077 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2078 default: 2079 return false; 2080 } 2081 } 2082 2083 static bool 2084 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2085 { 2086 switch (bdev_io->type) { 2087 case SPDK_BDEV_IO_TYPE_NVME_IO: 2088 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2089 case SPDK_BDEV_IO_TYPE_READ: 2090 case SPDK_BDEV_IO_TYPE_WRITE: 2091 return true; 2092 case SPDK_BDEV_IO_TYPE_ZCOPY: 2093 if (bdev_io->u.bdev.zcopy.start) { 2094 return true; 2095 } else { 2096 return false; 2097 } 2098 default: 2099 return false; 2100 } 2101 } 2102 2103 static bool 2104 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2105 { 2106 switch (bdev_io->type) { 2107 case SPDK_BDEV_IO_TYPE_NVME_IO: 2108 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2109 /* Bit 1 (0x2) set for read operation */ 2110 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2111 return true; 2112 } else { 2113 return false; 2114 } 2115 case SPDK_BDEV_IO_TYPE_READ: 2116 return true; 2117 case SPDK_BDEV_IO_TYPE_ZCOPY: 2118 /* Populate to read from disk */ 2119 if (bdev_io->u.bdev.zcopy.populate) { 2120 return true; 2121 } else { 2122 return false; 2123 } 2124 default: 2125 return false; 2126 } 2127 } 2128 2129 static uint64_t 2130 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2131 { 2132 struct spdk_bdev *bdev = bdev_io->bdev; 2133 2134 switch (bdev_io->type) { 2135 case SPDK_BDEV_IO_TYPE_NVME_IO: 2136 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2137 return bdev_io->u.nvme_passthru.nbytes; 2138 case SPDK_BDEV_IO_TYPE_READ: 2139 case SPDK_BDEV_IO_TYPE_WRITE: 2140 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2141 case SPDK_BDEV_IO_TYPE_ZCOPY: 2142 /* Track the data in the start phase only */ 2143 if (bdev_io->u.bdev.zcopy.start) { 2144 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2145 } else { 2146 return 0; 2147 } 2148 default: 2149 return 0; 2150 } 2151 } 2152 2153 static bool 2154 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2155 { 2156 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 2157 return true; 2158 } else { 2159 return false; 2160 } 2161 } 2162 2163 static bool 2164 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2165 { 2166 if (bdev_is_read_io(io) == false) { 2167 return false; 2168 } 2169 2170 return bdev_qos_rw_queue_io(limit, io); 2171 } 2172 2173 static bool 2174 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2175 { 2176 if (bdev_is_read_io(io) == true) { 2177 return false; 2178 } 2179 2180 return bdev_qos_rw_queue_io(limit, io); 2181 } 2182 2183 static void 2184 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2185 { 2186 limit->remaining_this_timeslice--; 2187 } 2188 2189 static void 2190 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2191 { 2192 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 2193 } 2194 2195 static void 2196 
bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2197 { 2198 if (bdev_is_read_io(io) == false) { 2199 return; 2200 } 2201 2202 return bdev_qos_rw_bps_update_quota(limit, io); 2203 } 2204 2205 static void 2206 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2207 { 2208 if (bdev_is_read_io(io) == true) { 2209 return; 2210 } 2211 2212 return bdev_qos_rw_bps_update_quota(limit, io); 2213 } 2214 2215 static void 2216 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2217 { 2218 int i; 2219 2220 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2221 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2222 qos->rate_limits[i].queue_io = NULL; 2223 qos->rate_limits[i].update_quota = NULL; 2224 continue; 2225 } 2226 2227 switch (i) { 2228 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2229 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2230 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2231 break; 2232 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2233 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2234 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2235 break; 2236 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2237 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2238 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2239 break; 2240 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2241 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2242 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2243 break; 2244 default: 2245 break; 2246 } 2247 } 2248 } 2249 2250 static void 2251 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2252 struct spdk_bdev_io *bdev_io, 2253 enum spdk_bdev_io_status status) 2254 { 2255 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2256 2257 bdev_io->internal.in_submit_request = true; 2258 bdev_ch->io_outstanding++; 2259 shared_resource->io_outstanding++; 2260 spdk_bdev_io_complete(bdev_io, status); 2261 bdev_io->internal.in_submit_request = false; 2262 } 2263 2264 static inline void 2265 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2266 { 2267 struct spdk_bdev *bdev = bdev_io->bdev; 2268 struct spdk_io_channel *ch = bdev_ch->channel; 2269 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2270 2271 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2272 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2273 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2274 2275 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2276 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2277 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2278 SPDK_BDEV_IO_STATUS_SUCCESS); 2279 return; 2280 } 2281 } 2282 2283 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2284 bdev_io->bdev->split_on_write_unit && 2285 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2286 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2287 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2288 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2289 return; 2290 } 2291 2292 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2293 bdev_ch->io_outstanding++; 2294 shared_resource->io_outstanding++; 2295 bdev_io->internal.in_submit_request = true; 2296 bdev->fn_table->submit_request(ch, bdev_io); 2297 
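/* If the module completes the I/O synchronously inside submit_request(), the
 * completion path can see in_submit_request == true and defer the user
 * callback rather than recursing back into the submit path. */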
bdev_io->internal.in_submit_request = false; 2298 } else { 2299 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 2300 } 2301 } 2302 2303 static bool 2304 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2305 { 2306 int i; 2307 2308 if (bdev_qos_io_to_limit(bdev_io) == true) { 2309 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2310 if (!qos->rate_limits[i].queue_io) { 2311 continue; 2312 } 2313 2314 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2315 bdev_io) == true) { 2316 return true; 2317 } 2318 } 2319 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2320 if (!qos->rate_limits[i].update_quota) { 2321 continue; 2322 } 2323 2324 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2325 } 2326 } 2327 2328 return false; 2329 } 2330 2331 static inline void 2332 _bdev_io_do_submit(void *ctx) 2333 { 2334 struct spdk_bdev_io *bdev_io = ctx; 2335 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2336 2337 bdev_io_do_submit(ch, bdev_io); 2338 } 2339 2340 static int 2341 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2342 { 2343 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2344 int submitted_ios = 0; 2345 2346 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2347 if (!bdev_qos_queue_io(qos, bdev_io)) { 2348 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2349 2350 if (bdev_io->internal.io_submit_ch) { 2351 /* Send back the IO to the original thread for the actual processing. */ 2352 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 2353 bdev_io->internal.io_submit_ch = NULL; 2354 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 2355 _bdev_io_do_submit, bdev_io); 2356 } else { 2357 bdev_io_do_submit(ch, bdev_io); 2358 } 2359 2360 submitted_ios++; 2361 } 2362 } 2363 2364 return submitted_ios; 2365 } 2366 2367 static void 2368 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2369 { 2370 int rc; 2371 2372 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2373 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2374 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2375 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2376 &bdev_io->internal.waitq_entry); 2377 if (rc != 0) { 2378 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2379 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2380 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2381 } 2382 } 2383 2384 static bool 2385 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2386 { 2387 uint32_t io_boundary; 2388 struct spdk_bdev *bdev = bdev_io->bdev; 2389 uint32_t max_size = bdev->max_segment_size; 2390 int max_segs = bdev->max_num_segments; 2391 2392 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2393 io_boundary = bdev->write_unit_size; 2394 } else if (bdev->split_on_optimal_io_boundary) { 2395 io_boundary = bdev->optimal_io_boundary; 2396 } else { 2397 io_boundary = 0; 2398 } 2399 2400 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2401 return false; 2402 } 2403 2404 if (io_boundary) { 2405 uint64_t start_stripe, end_stripe; 2406 2407 start_stripe = bdev_io->u.bdev.offset_blocks; 2408 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2409 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
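 * For a power-of-two boundary, dividing by the boundary equals shifting right
 * by log2(boundary); e.g. a boundary of 8 blocks becomes a shift by 3.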
*/ 2410 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2411 start_stripe >>= spdk_u32log2(io_boundary); 2412 end_stripe >>= spdk_u32log2(io_boundary); 2413 } else { 2414 start_stripe /= io_boundary; 2415 end_stripe /= io_boundary; 2416 } 2417 2418 if (start_stripe != end_stripe) { 2419 return true; 2420 } 2421 } 2422 2423 if (max_segs) { 2424 if (bdev_io->u.bdev.iovcnt > max_segs) { 2425 return true; 2426 } 2427 } 2428 2429 if (max_size) { 2430 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2431 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2432 return true; 2433 } 2434 } 2435 } 2436 2437 return false; 2438 } 2439 2440 static bool 2441 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2442 { 2443 uint32_t num_unmap_segments; 2444 2445 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2446 return false; 2447 } 2448 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2449 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2450 return true; 2451 } 2452 2453 return false; 2454 } 2455 2456 static bool 2457 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2458 { 2459 if (!bdev_io->bdev->max_write_zeroes) { 2460 return false; 2461 } 2462 2463 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2464 return true; 2465 } 2466 2467 return false; 2468 } 2469 2470 static bool 2471 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2472 { 2473 if (bdev_io->bdev->max_copy != 0 && 2474 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2475 return true; 2476 } 2477 2478 return false; 2479 } 2480 2481 static bool 2482 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2483 { 2484 switch (bdev_io->type) { 2485 case SPDK_BDEV_IO_TYPE_READ: 2486 case SPDK_BDEV_IO_TYPE_WRITE: 2487 return bdev_rw_should_split(bdev_io); 2488 case SPDK_BDEV_IO_TYPE_UNMAP: 2489 return bdev_unmap_should_split(bdev_io); 2490 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2491 return bdev_write_zeroes_should_split(bdev_io); 2492 case SPDK_BDEV_IO_TYPE_COPY: 2493 return bdev_copy_should_split(bdev_io); 2494 default: 2495 return false; 2496 } 2497 } 2498 2499 static uint32_t 2500 _to_next_boundary(uint64_t offset, uint32_t boundary) 2501 { 2502 return (boundary - (offset % boundary)); 2503 } 2504 2505 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2506 2507 static void _bdev_rw_split(void *_bdev_io); 2508 2509 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2510 2511 static void 2512 _bdev_unmap_split(void *_bdev_io) 2513 { 2514 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2515 } 2516 2517 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2518 2519 static void 2520 _bdev_write_zeroes_split(void *_bdev_io) 2521 { 2522 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2523 } 2524 2525 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 2526 2527 static void 2528 _bdev_copy_split(void *_bdev_io) 2529 { 2530 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 2531 } 2532 2533 static int 2534 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2535 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2536 { 2537 int rc; 2538 uint64_t current_offset, current_remaining, current_src_offset; 2539 spdk_bdev_io_wait_cb io_wait_fn; 2540 2541 current_offset = *offset; 2542 current_remaining = *remaining; 2543 2544 bdev_io->u.bdev.split_outstanding++; 2545 2546 io_wait_fn = 
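/* Callback used to resume this split round if a child submission returns
 * -ENOMEM while no other children are outstanding; overridden below for
 * unmap, write zeroes and copy. */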
_bdev_rw_split; 2547 switch (bdev_io->type) { 2548 case SPDK_BDEV_IO_TYPE_READ: 2549 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2550 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2551 iov, iovcnt, md_buf, current_offset, 2552 num_blocks, bdev_io->internal.memory_domain, 2553 bdev_io->internal.memory_domain_ctx, 2554 bdev_io_split_done, bdev_io); 2555 break; 2556 case SPDK_BDEV_IO_TYPE_WRITE: 2557 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2558 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2559 iov, iovcnt, md_buf, current_offset, 2560 num_blocks, bdev_io->internal.memory_domain, 2561 bdev_io->internal.memory_domain_ctx, 2562 bdev_io_split_done, bdev_io); 2563 break; 2564 case SPDK_BDEV_IO_TYPE_UNMAP: 2565 io_wait_fn = _bdev_unmap_split; 2566 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2567 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2568 current_offset, num_blocks, 2569 bdev_io_split_done, bdev_io); 2570 break; 2571 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2572 io_wait_fn = _bdev_write_zeroes_split; 2573 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2574 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2575 current_offset, num_blocks, 2576 bdev_io_split_done, bdev_io); 2577 break; 2578 case SPDK_BDEV_IO_TYPE_COPY: 2579 io_wait_fn = _bdev_copy_split; 2580 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 2581 (current_offset - bdev_io->u.bdev.offset_blocks); 2582 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 2583 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2584 current_offset, current_src_offset, num_blocks, 2585 bdev_io_split_done, bdev_io); 2586 break; 2587 default: 2588 assert(false); 2589 rc = -EINVAL; 2590 break; 2591 } 2592 2593 if (rc == 0) { 2594 current_offset += num_blocks; 2595 current_remaining -= num_blocks; 2596 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2597 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2598 *offset = current_offset; 2599 *remaining = current_remaining; 2600 } else { 2601 bdev_io->u.bdev.split_outstanding--; 2602 if (rc == -ENOMEM) { 2603 if (bdev_io->u.bdev.split_outstanding == 0) { 2604 /* No I/O is outstanding. Hence we should wait here. */ 2605 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2606 } 2607 } else { 2608 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2609 if (bdev_io->u.bdev.split_outstanding == 0) { 2610 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2611 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2612 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2613 } 2614 } 2615 } 2616 2617 return rc; 2618 } 2619 2620 static void 2621 _bdev_rw_split(void *_bdev_io) 2622 { 2623 struct iovec *parent_iov, *iov; 2624 struct spdk_bdev_io *bdev_io = _bdev_io; 2625 struct spdk_bdev *bdev = bdev_io->bdev; 2626 uint64_t parent_offset, current_offset, remaining; 2627 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2628 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2629 uint32_t iovcnt, iov_len, child_iovsize; 2630 uint32_t blocklen = bdev->blocklen; 2631 uint32_t io_boundary; 2632 uint32_t max_segment_size = bdev->max_segment_size; 2633 uint32_t max_child_iovcnt = bdev->max_num_segments; 2634 void *md_buf = NULL; 2635 int rc; 2636 2637 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 2638 max_child_iovcnt = max_child_iovcnt ? 
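/* never hand a child more iovecs than bdev_io->child_iov[] can hold */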
spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 2639 SPDK_BDEV_IO_NUM_CHILD_IOV; 2640 2641 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2642 io_boundary = bdev->write_unit_size; 2643 } else if (bdev->split_on_optimal_io_boundary) { 2644 io_boundary = bdev->optimal_io_boundary; 2645 } else { 2646 io_boundary = UINT32_MAX; 2647 } 2648 2649 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2650 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 2651 parent_offset = bdev_io->u.bdev.offset_blocks; 2652 parent_iov_offset = (current_offset - parent_offset) * blocklen; 2653 parent_iovcnt = bdev_io->u.bdev.iovcnt; 2654 2655 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 2656 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2657 if (parent_iov_offset < parent_iov->iov_len) { 2658 break; 2659 } 2660 parent_iov_offset -= parent_iov->iov_len; 2661 } 2662 2663 child_iovcnt = 0; 2664 while (remaining > 0 && parent_iovpos < parent_iovcnt && 2665 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 2666 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 2667 to_next_boundary = spdk_min(remaining, to_next_boundary); 2668 to_next_boundary_bytes = to_next_boundary * blocklen; 2669 2670 iov = &bdev_io->child_iov[child_iovcnt]; 2671 iovcnt = 0; 2672 2673 if (bdev_io->u.bdev.md_buf) { 2674 md_buf = (char *)bdev_io->u.bdev.md_buf + 2675 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 2676 } 2677 2678 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 2679 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 2680 iovcnt < child_iovsize) { 2681 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2682 iov_len = parent_iov->iov_len - parent_iov_offset; 2683 2684 iov_len = spdk_min(iov_len, max_segment_size); 2685 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 2686 to_next_boundary_bytes -= iov_len; 2687 2688 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 2689 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 2690 2691 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 2692 parent_iov_offset += iov_len; 2693 } else { 2694 parent_iovpos++; 2695 parent_iov_offset = 0; 2696 } 2697 child_iovcnt++; 2698 iovcnt++; 2699 } 2700 2701 if (to_next_boundary_bytes > 0) { 2702 /* We had to stop this child I/O early because we ran out of 2703 * child_iov space or were limited by max_num_segments. 2704 * Ensure the iovs to be aligned with block size and 2705 * then adjust to_next_boundary before starting the 2706 * child I/O. 2707 */ 2708 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || 2709 iovcnt == child_iovsize); 2710 to_last_block_bytes = to_next_boundary_bytes % blocklen; 2711 if (to_last_block_bytes != 0) { 2712 uint32_t child_iovpos = child_iovcnt - 1; 2713 /* don't decrease child_iovcnt when it equals to SPDK_BDEV_IO_NUM_CHILD_IOV 2714 * so the loop will naturally end 2715 */ 2716 2717 to_last_block_bytes = blocklen - to_last_block_bytes; 2718 to_next_boundary_bytes += to_last_block_bytes; 2719 while (to_last_block_bytes > 0 && iovcnt > 0) { 2720 iov_len = spdk_min(to_last_block_bytes, 2721 bdev_io->child_iov[child_iovpos].iov_len); 2722 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 2723 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 2724 child_iovpos--; 2725 if (--iovcnt == 0) { 2726 /* If the child IO is less than a block size just return. 
2727 * If the first child IO of any split round is less than 2728 * a block size, exit with an error. 2729 */ 2730 if (bdev_io->u.bdev.split_outstanding == 0) { 2731 SPDK_ERRLOG("The first child io was less than a block size\n"); 2732 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2733 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2734 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2735 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2736 } 2737 2738 return; 2739 } 2740 } 2741 2742 to_last_block_bytes -= iov_len; 2743 2744 if (parent_iov_offset == 0) { 2745 parent_iovpos--; 2746 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 2747 } 2748 parent_iov_offset -= iov_len; 2749 } 2750 2751 assert(to_last_block_bytes == 0); 2752 } 2753 to_next_boundary -= to_next_boundary_bytes / blocklen; 2754 } 2755 2756 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 2757 &current_offset, &remaining); 2758 if (spdk_unlikely(rc)) { 2759 return; 2760 } 2761 } 2762 } 2763 2764 static void 2765 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 2766 { 2767 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 2768 uint32_t num_children_reqs = 0; 2769 int rc; 2770 2771 offset = bdev_io->u.bdev.split_current_offset_blocks; 2772 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2773 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 2774 2775 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2776 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 2777 2778 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 2779 &offset, &remaining); 2780 if (spdk_likely(rc == 0)) { 2781 num_children_reqs++; 2782 } else { 2783 return; 2784 } 2785 } 2786 } 2787 2788 static void 2789 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 2790 { 2791 uint64_t offset, write_zeroes_blocks, remaining; 2792 uint32_t num_children_reqs = 0; 2793 int rc; 2794 2795 offset = bdev_io->u.bdev.split_current_offset_blocks; 2796 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2797 2798 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2799 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 2800 2801 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 2802 &offset, &remaining); 2803 if (spdk_likely(rc == 0)) { 2804 num_children_reqs++; 2805 } else { 2806 return; 2807 } 2808 } 2809 } 2810 2811 static void 2812 bdev_copy_split(struct spdk_bdev_io *bdev_io) 2813 { 2814 uint64_t offset, copy_blocks, remaining; 2815 uint32_t num_children_reqs = 0; 2816 int rc; 2817 2818 offset = bdev_io->u.bdev.split_current_offset_blocks; 2819 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2820 2821 assert(bdev_io->bdev->max_copy != 0); 2822 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 2823 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 2824 2825 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 2826 &offset, &remaining); 2827 if (spdk_likely(rc == 0)) { 2828 num_children_reqs++; 2829 } else { 2830 return; 2831 } 2832 } 2833 } 2834 2835 static void 2836 parent_bdev_io_complete(void *ctx, int rc) 2837 { 2838 struct spdk_bdev_io *parent_io = ctx; 2839 2840 if (rc) { 2841 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2842 } 2843 2844 parent_io->internal.cb(parent_io,
parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 2845 parent_io->internal.caller_ctx); 2846 } 2847 2848 static void 2849 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 2850 { 2851 struct spdk_bdev_io *parent_io = cb_arg; 2852 2853 spdk_bdev_free_io(bdev_io); 2854 2855 if (!success) { 2856 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2857 /* If any child I/O failed, stop further splitting process. */ 2858 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 2859 parent_io->u.bdev.split_remaining_num_blocks = 0; 2860 } 2861 parent_io->u.bdev.split_outstanding--; 2862 if (parent_io->u.bdev.split_outstanding != 0) { 2863 return; 2864 } 2865 2866 /* 2867 * Parent I/O finishes when all blocks are consumed. 2868 */ 2869 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 2870 assert(parent_io->internal.cb != bdev_io_split_done); 2871 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 2872 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 2873 2874 if (parent_io->internal.orig_iovcnt != 0) { 2875 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 2876 /* bdev IO will be completed in the callback */ 2877 } else { 2878 parent_bdev_io_complete(parent_io, 0); 2879 } 2880 return; 2881 } 2882 2883 /* 2884 * Continue with the splitting process. This function will complete the parent I/O if the 2885 * splitting is done. 2886 */ 2887 switch (parent_io->type) { 2888 case SPDK_BDEV_IO_TYPE_READ: 2889 case SPDK_BDEV_IO_TYPE_WRITE: 2890 _bdev_rw_split(parent_io); 2891 break; 2892 case SPDK_BDEV_IO_TYPE_UNMAP: 2893 bdev_unmap_split(parent_io); 2894 break; 2895 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2896 bdev_write_zeroes_split(parent_io); 2897 break; 2898 case SPDK_BDEV_IO_TYPE_COPY: 2899 bdev_copy_split(parent_io); 2900 break; 2901 default: 2902 assert(false); 2903 break; 2904 } 2905 } 2906 2907 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2908 bool success); 2909 2910 static void 2911 bdev_io_split(struct spdk_bdev_io *bdev_io) 2912 { 2913 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 2914 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 2915 bdev_io->u.bdev.split_outstanding = 0; 2916 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 2917 2918 switch (bdev_io->type) { 2919 case SPDK_BDEV_IO_TYPE_READ: 2920 case SPDK_BDEV_IO_TYPE_WRITE: 2921 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 2922 _bdev_rw_split(bdev_io); 2923 } else { 2924 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 2925 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 2926 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 2927 } 2928 break; 2929 case SPDK_BDEV_IO_TYPE_UNMAP: 2930 bdev_unmap_split(bdev_io); 2931 break; 2932 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2933 bdev_write_zeroes_split(bdev_io); 2934 break; 2935 case SPDK_BDEV_IO_TYPE_COPY: 2936 bdev_copy_split(bdev_io); 2937 break; 2938 default: 2939 assert(false); 2940 break; 2941 } 2942 } 2943 2944 static void 2945 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 2946 { 2947 if (!success) { 2948 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2949 return; 2950 } 2951 2952 _bdev_rw_split(bdev_io); 2953 } 2954 2955 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 2956 * be 
inlined, at least on some compilers. 2957 */ 2958 static inline void 2959 _bdev_io_submit(void *ctx) 2960 { 2961 struct spdk_bdev_io *bdev_io = ctx; 2962 struct spdk_bdev *bdev = bdev_io->bdev; 2963 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2964 2965 if (spdk_likely(bdev_ch->flags == 0)) { 2966 bdev_io_do_submit(bdev_ch, bdev_io); 2967 return; 2968 } 2969 2970 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 2971 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 2972 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 2973 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 2974 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 2975 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2976 } else { 2977 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 2978 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 2979 } 2980 } else { 2981 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 2982 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2983 } 2984 } 2985 2986 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 2987 2988 bool 2989 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 2990 { 2991 if (range1->length == 0 || range2->length == 0) { 2992 return false; 2993 } 2994 2995 if (range1->offset + range1->length <= range2->offset) { 2996 return false; 2997 } 2998 2999 if (range2->offset + range2->length <= range1->offset) { 3000 return false; 3001 } 3002 3003 return true; 3004 } 3005 3006 static bool 3007 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3008 { 3009 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3010 struct lba_range r; 3011 3012 switch (bdev_io->type) { 3013 case SPDK_BDEV_IO_TYPE_NVME_IO: 3014 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3015 /* Don't try to decode the NVMe command - just assume worst-case and that 3016 * it overlaps a locked range. 3017 */ 3018 return true; 3019 case SPDK_BDEV_IO_TYPE_WRITE: 3020 case SPDK_BDEV_IO_TYPE_UNMAP: 3021 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3022 case SPDK_BDEV_IO_TYPE_ZCOPY: 3023 case SPDK_BDEV_IO_TYPE_COPY: 3024 r.offset = bdev_io->u.bdev.offset_blocks; 3025 r.length = bdev_io->u.bdev.num_blocks; 3026 if (!bdev_lba_range_overlapped(range, &r)) { 3027 /* This I/O doesn't overlap the specified LBA range. */ 3028 return false; 3029 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3030 /* This I/O overlaps, but the I/O is on the same channel that locked this 3031 * range, and the caller_ctx is the same as the locked_ctx. This means 3032 * that this I/O is associated with the lock, and is allowed to execute. 
3033 */ 3034 return false; 3035 } else { 3036 return true; 3037 } 3038 default: 3039 return false; 3040 } 3041 } 3042 3043 void 3044 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3045 { 3046 struct spdk_bdev *bdev = bdev_io->bdev; 3047 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 3048 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3049 3050 assert(thread != NULL); 3051 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3052 3053 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3054 struct lba_range *range; 3055 3056 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3057 if (bdev_io_range_is_locked(bdev_io, range)) { 3058 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3059 return; 3060 } 3061 } 3062 } 3063 3064 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 3065 3066 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3067 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 3068 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3069 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3070 spdk_bdev_get_name(bdev)); 3071 3072 if (bdev_io_should_split(bdev_io)) { 3073 bdev_io_split(bdev_io); 3074 return; 3075 } 3076 3077 if (ch->flags & BDEV_CH_QOS_ENABLED) { 3078 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 3079 _bdev_io_submit(bdev_io); 3080 } else { 3081 bdev_io->internal.io_submit_ch = ch; 3082 bdev_io->internal.ch = bdev->internal.qos->ch; 3083 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 3084 } 3085 } else { 3086 _bdev_io_submit(bdev_io); 3087 } 3088 } 3089 3090 static inline void 3091 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3092 { 3093 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 3094 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3095 * For write operation we need to pull buffers from memory domain before submitting IO. 
3096 * Once read operation completes, we need to use memory_domain push functionality to 3097 * update data in original memory domain IO buffer 3098 * This IO request will go through a regular IO flow, so clear memory domains pointers */ 3099 bdev_io->u.bdev.memory_domain = NULL; 3100 bdev_io->u.bdev.memory_domain_ctx = NULL; 3101 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3102 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3103 } 3104 3105 static inline void 3106 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3107 { 3108 if (bdev_io->internal.memory_domain && !desc->memory_domains_supported) { 3109 _bdev_io_ext_use_bounce_buffer(bdev_io); 3110 return; 3111 } 3112 3113 bdev_io_submit(bdev_io); 3114 } 3115 3116 static void 3117 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3118 { 3119 struct spdk_bdev *bdev = bdev_io->bdev; 3120 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3121 struct spdk_io_channel *ch = bdev_ch->channel; 3122 3123 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3124 3125 bdev_io->internal.in_submit_request = true; 3126 bdev->fn_table->submit_request(ch, bdev_io); 3127 bdev_io->internal.in_submit_request = false; 3128 } 3129 3130 void 3131 bdev_io_init(struct spdk_bdev_io *bdev_io, 3132 struct spdk_bdev *bdev, void *cb_arg, 3133 spdk_bdev_io_completion_cb cb) 3134 { 3135 bdev_io->bdev = bdev; 3136 bdev_io->internal.caller_ctx = cb_arg; 3137 bdev_io->internal.cb = cb; 3138 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3139 bdev_io->internal.in_submit_request = false; 3140 bdev_io->internal.buf = NULL; 3141 bdev_io->internal.io_submit_ch = NULL; 3142 bdev_io->internal.orig_iovs = NULL; 3143 bdev_io->internal.orig_iovcnt = 0; 3144 bdev_io->internal.orig_md_iov.iov_base = NULL; 3145 bdev_io->internal.error.nvme.cdw0 = 0; 3146 bdev_io->num_retries = 0; 3147 bdev_io->internal.get_buf_cb = NULL; 3148 bdev_io->internal.get_aux_buf_cb = NULL; 3149 bdev_io->internal.memory_domain = NULL; 3150 bdev_io->internal.memory_domain_ctx = NULL; 3151 bdev_io->internal.data_transfer_cpl = NULL; 3152 } 3153 3154 static bool 3155 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3156 { 3157 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3158 } 3159 3160 bool 3161 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3162 { 3163 bool supported; 3164 3165 supported = bdev_io_type_supported(bdev, io_type); 3166 3167 if (!supported) { 3168 switch (io_type) { 3169 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3170 /* The bdev layer will emulate write zeroes as long as write is supported. 
*/ 3171 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3172 break; 3173 default: 3174 break; 3175 } 3176 } 3177 3178 return supported; 3179 } 3180 3181 uint64_t 3182 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3183 { 3184 return bdev_io->internal.submit_tsc; 3185 } 3186 3187 int 3188 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3189 { 3190 if (bdev->fn_table->dump_info_json) { 3191 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3192 } 3193 3194 return 0; 3195 } 3196 3197 static void 3198 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3199 { 3200 uint32_t max_per_timeslice = 0; 3201 int i; 3202 3203 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3204 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3205 qos->rate_limits[i].max_per_timeslice = 0; 3206 continue; 3207 } 3208 3209 max_per_timeslice = qos->rate_limits[i].limit * 3210 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3211 3212 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3213 qos->rate_limits[i].min_per_timeslice); 3214 3215 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 3216 } 3217 3218 bdev_qos_set_ops(qos); 3219 } 3220 3221 static int 3222 bdev_channel_poll_qos(void *arg) 3223 { 3224 struct spdk_bdev_qos *qos = arg; 3225 uint64_t now = spdk_get_ticks(); 3226 int i; 3227 3228 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3229 /* We received our callback earlier than expected - return 3230 * immediately and wait to do accounting until at least one 3231 * timeslice has actually expired. This should never happen 3232 * with a well-behaved timer implementation. 3233 */ 3234 return SPDK_POLLER_IDLE; 3235 } 3236 3237 /* Reset for next round of rate limiting */ 3238 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3239 /* We may have allowed the IOs or bytes to slightly overrun in the last 3240 * timeslice. remaining_this_timeslice is signed, so if it's negative 3241 * here, we'll account for the overrun so that the next timeslice will 3242 * be appropriately reduced. 
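 * Positive leftovers, on the other hand, are discarded, so unused allowance
 * never accumulates across timeslices.
 *
 * Worked example (hypothetical numbers): with a byte limit that grants 1 MiB
 * per 1 ms timeslice, a 1.5 MiB write submitted while 1 MiB is still
 * remaining drives remaining_this_timeslice to -0.5 MiB; after the reset
 * below, the following timeslice starts with only 0.5 MiB of allowance.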
3243 */ 3244 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 3245 qos->rate_limits[i].remaining_this_timeslice = 0; 3246 } 3247 } 3248 3249 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3250 qos->last_timeslice += qos->timeslice_size; 3251 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3252 qos->rate_limits[i].remaining_this_timeslice += 3253 qos->rate_limits[i].max_per_timeslice; 3254 } 3255 } 3256 3257 return bdev_qos_io_submit(qos->ch, qos); 3258 } 3259 3260 static void 3261 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3262 { 3263 struct spdk_bdev_shared_resource *shared_resource; 3264 struct lba_range *range; 3265 3266 bdev_free_io_stat(ch->stat); 3267 #ifdef SPDK_CONFIG_VTUNE 3268 bdev_free_io_stat(ch->prev_stat); 3269 #endif 3270 3271 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3272 range = TAILQ_FIRST(&ch->locked_ranges); 3273 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3274 free(range); 3275 } 3276 3277 spdk_put_io_channel(ch->channel); 3278 3279 shared_resource = ch->shared_resource; 3280 3281 assert(TAILQ_EMPTY(&ch->io_locked)); 3282 assert(TAILQ_EMPTY(&ch->io_submitted)); 3283 assert(ch->io_outstanding == 0); 3284 assert(shared_resource->ref > 0); 3285 shared_resource->ref--; 3286 if (shared_resource->ref == 0) { 3287 assert(shared_resource->io_outstanding == 0); 3288 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3289 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3290 free(shared_resource); 3291 } 3292 } 3293 3294 static void 3295 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3296 { 3297 struct spdk_bdev_qos *qos = bdev->internal.qos; 3298 int i; 3299 3300 assert(spdk_spin_held(&bdev->internal.spinlock)); 3301 3302 /* Rate limiting on this bdev enabled */ 3303 if (qos) { 3304 if (qos->ch == NULL) { 3305 struct spdk_io_channel *io_ch; 3306 3307 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3308 bdev->name, spdk_get_thread()); 3309 3310 /* No qos channel has been selected, so set one up */ 3311 3312 /* Take another reference to ch */ 3313 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3314 assert(io_ch != NULL); 3315 qos->ch = ch; 3316 3317 qos->thread = spdk_io_channel_get_thread(io_ch); 3318 3319 TAILQ_INIT(&qos->queued); 3320 3321 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3322 if (bdev_qos_is_iops_rate_limit(i) == true) { 3323 qos->rate_limits[i].min_per_timeslice = 3324 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3325 } else { 3326 qos->rate_limits[i].min_per_timeslice = 3327 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3328 } 3329 3330 if (qos->rate_limits[i].limit == 0) { 3331 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3332 } 3333 } 3334 bdev_qos_update_max_quota_per_timeslice(qos); 3335 qos->timeslice_size = 3336 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3337 qos->last_timeslice = spdk_get_ticks(); 3338 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3339 qos, 3340 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3341 } 3342 3343 ch->flags |= BDEV_CH_QOS_ENABLED; 3344 } 3345 } 3346 3347 struct poll_timeout_ctx { 3348 struct spdk_bdev_desc *desc; 3349 uint64_t timeout_in_sec; 3350 spdk_bdev_io_timeout_cb cb_fn; 3351 void *cb_arg; 3352 }; 3353 3354 static void 3355 bdev_desc_free(struct spdk_bdev_desc *desc) 3356 { 3357 spdk_spin_destroy(&desc->spinlock); 3358 free(desc->media_events_buffer); 3359 free(desc); 3360 } 3361 3362 static void 3363 
bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 3364 { 3365 struct poll_timeout_ctx *ctx = _ctx; 3366 struct spdk_bdev_desc *desc = ctx->desc; 3367 3368 free(ctx); 3369 3370 spdk_spin_lock(&desc->spinlock); 3371 desc->refs--; 3372 if (desc->closed == true && desc->refs == 0) { 3373 spdk_spin_unlock(&desc->spinlock); 3374 bdev_desc_free(desc); 3375 return; 3376 } 3377 spdk_spin_unlock(&desc->spinlock); 3378 } 3379 3380 static void 3381 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3382 struct spdk_io_channel *io_ch, void *_ctx) 3383 { 3384 struct poll_timeout_ctx *ctx = _ctx; 3385 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3386 struct spdk_bdev_desc *desc = ctx->desc; 3387 struct spdk_bdev_io *bdev_io; 3388 uint64_t now; 3389 3390 spdk_spin_lock(&desc->spinlock); 3391 if (desc->closed == true) { 3392 spdk_spin_unlock(&desc->spinlock); 3393 spdk_bdev_for_each_channel_continue(i, -1); 3394 return; 3395 } 3396 spdk_spin_unlock(&desc->spinlock); 3397 3398 now = spdk_get_ticks(); 3399 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3400 /* Exclude any I/O that are generated via splitting. */ 3401 if (bdev_io->internal.cb == bdev_io_split_done) { 3402 continue; 3403 } 3404 3405 /* Once we find an I/O that has not timed out, we can immediately 3406 * exit the loop. 3407 */ 3408 if (now < (bdev_io->internal.submit_tsc + 3409 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3410 goto end; 3411 } 3412 3413 if (bdev_io->internal.desc == desc) { 3414 ctx->cb_fn(ctx->cb_arg, bdev_io); 3415 } 3416 } 3417 3418 end: 3419 spdk_bdev_for_each_channel_continue(i, 0); 3420 } 3421 3422 static int 3423 bdev_poll_timeout_io(void *arg) 3424 { 3425 struct spdk_bdev_desc *desc = arg; 3426 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3427 struct poll_timeout_ctx *ctx; 3428 3429 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3430 if (!ctx) { 3431 SPDK_ERRLOG("failed to allocate memory\n"); 3432 return SPDK_POLLER_BUSY; 3433 } 3434 ctx->desc = desc; 3435 ctx->cb_arg = desc->cb_arg; 3436 ctx->cb_fn = desc->cb_fn; 3437 ctx->timeout_in_sec = desc->timeout_in_sec; 3438 3439 /* Take a ref on the descriptor in case it gets closed while we are checking 3440 * all of the channels. 
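 * The reference is dropped in bdev_channel_poll_timeout_io_done(); if the
 * descriptor was closed while the iteration was in flight, that is also where
 * it finally gets freed.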
3441 */ 3442 spdk_spin_lock(&desc->spinlock); 3443 desc->refs++; 3444 spdk_spin_unlock(&desc->spinlock); 3445 3446 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 3447 bdev_channel_poll_timeout_io_done); 3448 3449 return SPDK_POLLER_BUSY; 3450 } 3451 3452 int 3453 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3454 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3455 { 3456 assert(desc->thread == spdk_get_thread()); 3457 3458 spdk_poller_unregister(&desc->io_timeout_poller); 3459 3460 if (timeout_in_sec) { 3461 assert(cb_fn != NULL); 3462 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3463 desc, 3464 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3465 1000); 3466 if (desc->io_timeout_poller == NULL) { 3467 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3468 return -1; 3469 } 3470 } 3471 3472 desc->cb_fn = cb_fn; 3473 desc->cb_arg = cb_arg; 3474 desc->timeout_in_sec = timeout_in_sec; 3475 3476 return 0; 3477 } 3478 3479 static int 3480 bdev_channel_create(void *io_device, void *ctx_buf) 3481 { 3482 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3483 struct spdk_bdev_channel *ch = ctx_buf; 3484 struct spdk_io_channel *mgmt_io_ch; 3485 struct spdk_bdev_mgmt_channel *mgmt_ch; 3486 struct spdk_bdev_shared_resource *shared_resource; 3487 struct lba_range *range; 3488 3489 ch->bdev = bdev; 3490 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3491 if (!ch->channel) { 3492 return -1; 3493 } 3494 3495 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3496 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3497 3498 assert(ch->histogram == NULL); 3499 if (bdev->internal.histogram_enabled) { 3500 ch->histogram = spdk_histogram_data_alloc(); 3501 if (ch->histogram == NULL) { 3502 SPDK_ERRLOG("Could not allocate histogram\n"); 3503 } 3504 } 3505 3506 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3507 if (!mgmt_io_ch) { 3508 spdk_put_io_channel(ch->channel); 3509 return -1; 3510 } 3511 3512 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 3513 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3514 if (shared_resource->shared_ch == ch->channel) { 3515 spdk_put_io_channel(mgmt_io_ch); 3516 shared_resource->ref++; 3517 break; 3518 } 3519 } 3520 3521 if (shared_resource == NULL) { 3522 shared_resource = calloc(1, sizeof(*shared_resource)); 3523 if (shared_resource == NULL) { 3524 spdk_put_io_channel(ch->channel); 3525 spdk_put_io_channel(mgmt_io_ch); 3526 return -1; 3527 } 3528 3529 shared_resource->mgmt_ch = mgmt_ch; 3530 shared_resource->io_outstanding = 0; 3531 TAILQ_INIT(&shared_resource->nomem_io); 3532 shared_resource->nomem_threshold = 0; 3533 shared_resource->shared_ch = ch->channel; 3534 shared_resource->ref = 1; 3535 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3536 } 3537 3538 ch->io_outstanding = 0; 3539 TAILQ_INIT(&ch->queued_resets); 3540 TAILQ_INIT(&ch->locked_ranges); 3541 ch->flags = 0; 3542 ch->shared_resource = shared_resource; 3543 3544 TAILQ_INIT(&ch->io_submitted); 3545 TAILQ_INIT(&ch->io_locked); 3546 3547 ch->stat = bdev_alloc_io_stat(false); 3548 if (ch->stat == NULL) { 3549 bdev_channel_destroy_resource(ch); 3550 return -1; 3551 } 3552 3553 ch->stat->ticks_rate = spdk_get_ticks_hz(); 3554 3555 #ifdef SPDK_CONFIG_VTUNE 3556 { 3557 char *name; 3558 __itt_init_ittlib(NULL, 0); 3559 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3560 if (!name) { 3561 bdev_channel_destroy_resource(ch); 3562 
return -1; 3563 } 3564 ch->handle = __itt_string_handle_create(name); 3565 free(name); 3566 ch->start_tsc = spdk_get_ticks(); 3567 ch->interval_tsc = spdk_get_ticks_hz() / 100; 3568 ch->prev_stat = bdev_alloc_io_stat(false); 3569 if (ch->prev_stat == NULL) { 3570 bdev_channel_destroy_resource(ch); 3571 return -1; 3572 } 3573 } 3574 #endif 3575 3576 spdk_spin_lock(&bdev->internal.spinlock); 3577 bdev_enable_qos(bdev, ch); 3578 3579 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 3580 struct lba_range *new_range; 3581 3582 new_range = calloc(1, sizeof(*new_range)); 3583 if (new_range == NULL) { 3584 spdk_spin_unlock(&bdev->internal.spinlock); 3585 bdev_channel_destroy_resource(ch); 3586 return -1; 3587 } 3588 new_range->length = range->length; 3589 new_range->offset = range->offset; 3590 new_range->locked_ctx = range->locked_ctx; 3591 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 3592 } 3593 3594 spdk_spin_unlock(&bdev->internal.spinlock); 3595 3596 return 0; 3597 } 3598 3599 static int 3600 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 3601 void *cb_ctx) 3602 { 3603 struct spdk_bdev_channel *bdev_ch = cb_ctx; 3604 struct spdk_bdev_io *bdev_io; 3605 uint64_t buf_len; 3606 3607 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 3608 if (bdev_io->internal.ch == bdev_ch) { 3609 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 3610 spdk_iobuf_entry_abort(ch, entry, buf_len); 3611 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3612 } 3613 3614 return 0; 3615 } 3616 3617 /* 3618 * Abort I/O that are waiting on a data buffer. 3619 */ 3620 static void 3621 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 3622 { 3623 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 3624 bdev_abort_all_buf_io_cb, ch); 3625 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 3626 bdev_abort_all_buf_io_cb, ch); 3627 } 3628 3629 /* 3630 * Abort I/O that are queued waiting for submission. These types of I/O are 3631 * linked using the spdk_bdev_io link TAILQ_ENTRY. 3632 */ 3633 static void 3634 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 3635 { 3636 struct spdk_bdev_io *bdev_io, *tmp; 3637 3638 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 3639 if (bdev_io->internal.ch == ch) { 3640 TAILQ_REMOVE(queue, bdev_io, internal.link); 3641 /* 3642 * spdk_bdev_io_complete() assumes that the completed I/O had 3643 * been submitted to the bdev module. Since in this case it 3644 * hadn't, bump io_outstanding to account for the decrement 3645 * that spdk_bdev_io_complete() will do. 
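 * (RESET I/O are excluded below because they do not go through this
 * io_outstanding accounting when submitted; see bdev_io_submit_reset().)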
3646 */ 3647 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 3648 ch->io_outstanding++; 3649 ch->shared_resource->io_outstanding++; 3650 } 3651 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3652 } 3653 } 3654 } 3655 3656 static bool 3657 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3658 { 3659 struct spdk_bdev_io *bdev_io; 3660 3661 TAILQ_FOREACH(bdev_io, queue, internal.link) { 3662 if (bdev_io == bio_to_abort) { 3663 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 3664 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3665 return true; 3666 } 3667 } 3668 3669 return false; 3670 } 3671 3672 static int 3673 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 3674 { 3675 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 3676 uint64_t buf_len; 3677 3678 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 3679 if (bdev_io == bio_to_abort) { 3680 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 3681 spdk_iobuf_entry_abort(ch, entry, buf_len); 3682 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3683 return 1; 3684 } 3685 3686 return 0; 3687 } 3688 3689 static bool 3690 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 3691 { 3692 int rc; 3693 3694 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 3695 bdev_abort_buf_io_cb, bio_to_abort); 3696 if (rc == 1) { 3697 return true; 3698 } 3699 3700 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 3701 bdev_abort_buf_io_cb, bio_to_abort); 3702 return rc == 1; 3703 } 3704 3705 static void 3706 bdev_qos_channel_destroy(void *cb_arg) 3707 { 3708 struct spdk_bdev_qos *qos = cb_arg; 3709 3710 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 3711 spdk_poller_unregister(&qos->poller); 3712 3713 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 3714 3715 free(qos); 3716 } 3717 3718 static int 3719 bdev_qos_destroy(struct spdk_bdev *bdev) 3720 { 3721 int i; 3722 3723 /* 3724 * Cleanly shutting down the QoS poller is tricky, because 3725 * during the asynchronous operation the user could open 3726 * a new descriptor and create a new channel, spawning 3727 * a new QoS poller. 3728 * 3729 * The strategy is to create a new QoS structure here and swap it 3730 * in. The shutdown path then continues to refer to the old one 3731 * until it completes and then releases it. 3732 */ 3733 struct spdk_bdev_qos *new_qos, *old_qos; 3734 3735 old_qos = bdev->internal.qos; 3736 3737 new_qos = calloc(1, sizeof(*new_qos)); 3738 if (!new_qos) { 3739 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 3740 return -ENOMEM; 3741 } 3742 3743 /* Copy the old QoS data into the newly allocated structure */ 3744 memcpy(new_qos, old_qos, sizeof(*new_qos)); 3745 3746 /* Zero out the key parts of the QoS structure */ 3747 new_qos->ch = NULL; 3748 new_qos->thread = NULL; 3749 new_qos->poller = NULL; 3750 TAILQ_INIT(&new_qos->queued); 3751 /* 3752 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 3753 * It will be used later for the new QoS structure. 
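 * The per-timeslice bookkeeping below is cleared instead; it gets recomputed
 * by bdev_qos_update_max_quota_per_timeslice() when a channel re-enables QoS
 * on this bdev.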
3754 */ 3755 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3756 new_qos->rate_limits[i].remaining_this_timeslice = 0; 3757 new_qos->rate_limits[i].min_per_timeslice = 0; 3758 new_qos->rate_limits[i].max_per_timeslice = 0; 3759 } 3760 3761 bdev->internal.qos = new_qos; 3762 3763 if (old_qos->thread == NULL) { 3764 free(old_qos); 3765 } else { 3766 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 3767 } 3768 3769 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 3770 * been destroyed yet. The destruction path will end up waiting for the final 3771 * channel to be put before it releases resources. */ 3772 3773 return 0; 3774 } 3775 3776 void 3777 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 3778 { 3779 total->bytes_read += add->bytes_read; 3780 total->num_read_ops += add->num_read_ops; 3781 total->bytes_written += add->bytes_written; 3782 total->num_write_ops += add->num_write_ops; 3783 total->bytes_unmapped += add->bytes_unmapped; 3784 total->num_unmap_ops += add->num_unmap_ops; 3785 total->bytes_copied += add->bytes_copied; 3786 total->num_copy_ops += add->num_copy_ops; 3787 total->read_latency_ticks += add->read_latency_ticks; 3788 total->write_latency_ticks += add->write_latency_ticks; 3789 total->unmap_latency_ticks += add->unmap_latency_ticks; 3790 total->copy_latency_ticks += add->copy_latency_ticks; 3791 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 3792 total->max_read_latency_ticks = add->max_read_latency_ticks; 3793 } 3794 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 3795 total->min_read_latency_ticks = add->min_read_latency_ticks; 3796 } 3797 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 3798 total->max_write_latency_ticks = add->max_write_latency_ticks; 3799 } 3800 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 3801 total->min_write_latency_ticks = add->min_write_latency_ticks; 3802 } 3803 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 3804 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 3805 } 3806 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 3807 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 3808 } 3809 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 3810 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 3811 } 3812 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 3813 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 3814 } 3815 } 3816 3817 static void 3818 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 3819 { 3820 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 3821 3822 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 3823 memcpy(to_stat->io_error, from_stat->io_error, 3824 sizeof(struct spdk_bdev_io_error_stat)); 3825 } 3826 } 3827 3828 void 3829 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 3830 { 3831 stat->max_read_latency_ticks = 0; 3832 stat->min_read_latency_ticks = UINT64_MAX; 3833 stat->max_write_latency_ticks = 0; 3834 stat->min_write_latency_ticks = UINT64_MAX; 3835 stat->max_unmap_latency_ticks = 0; 3836 stat->min_unmap_latency_ticks = UINT64_MAX; 3837 stat->max_copy_latency_ticks = 0; 3838 stat->min_copy_latency_ticks = UINT64_MAX; 3839 3840 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 3841 return; 3842 } 
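/* Modes other than SPDK_BDEV_RESET_STAT_ALL stop at the early return above, having reset
 * only the min/max latency trackers; SPDK_BDEV_RESET_STAT_ALL additionally clears the
 * cumulative byte/op counters, the summed latency totals and the per-status error counts
 * below. */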
3843 3844 stat->bytes_read = 0; 3845 stat->num_read_ops = 0; 3846 stat->bytes_written = 0; 3847 stat->num_write_ops = 0; 3848 stat->bytes_unmapped = 0; 3849 stat->num_unmap_ops = 0; 3850 stat->bytes_copied = 0; 3851 stat->num_copy_ops = 0; 3852 stat->read_latency_ticks = 0; 3853 stat->write_latency_ticks = 0; 3854 stat->unmap_latency_ticks = 0; 3855 stat->copy_latency_ticks = 0; 3856 3857 if (stat->io_error != NULL) { 3858 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 3859 } 3860 } 3861 3862 struct spdk_bdev_io_stat * 3863 bdev_alloc_io_stat(bool io_error_stat) 3864 { 3865 struct spdk_bdev_io_stat *stat; 3866 3867 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 3868 if (stat == NULL) { 3869 return NULL; 3870 } 3871 3872 if (io_error_stat) { 3873 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 3874 if (stat->io_error == NULL) { 3875 free(stat); 3876 return NULL; 3877 } 3878 } else { 3879 stat->io_error = NULL; 3880 } 3881 3882 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 3883 3884 return stat; 3885 } 3886 3887 void 3888 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 3889 { 3890 if (stat != NULL) { 3891 free(stat->io_error); 3892 free(stat); 3893 } 3894 } 3895 3896 void 3897 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 3898 { 3899 int i; 3900 3901 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 3902 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 3903 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 3904 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 3905 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 3906 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 3907 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 3908 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 3909 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 3910 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 3911 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 3912 stat->min_read_latency_ticks != UINT64_MAX ? 3913 stat->min_read_latency_ticks : 0); 3914 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 3915 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 3916 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 3917 stat->min_write_latency_ticks != UINT64_MAX ? 3918 stat->min_write_latency_ticks : 0); 3919 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 3920 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 3921 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 3922 stat->min_unmap_latency_ticks != UINT64_MAX ? 3923 stat->min_unmap_latency_ticks : 0); 3924 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 3925 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 3926 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 3927 stat->min_copy_latency_ticks != UINT64_MAX ? 
stat->min_copy_latency_ticks : 0); 3929 3930 if (stat->io_error != NULL) { 3931 spdk_json_write_named_object_begin(w, "io_error"); 3932 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 3933 if (stat->io_error->error_status[i] != 0) { 3934 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 3935 stat->io_error->error_status[i]); 3936 } 3937 } 3938 spdk_json_write_object_end(w); 3939 } 3940 } 3941 3942 static void 3943 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 3944 { 3945 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 3946 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 3947 3948 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 3949 bdev_abort_all_buf_io(mgmt_ch, ch); 3951 } 3952 3953 static void 3954 bdev_channel_destroy(void *io_device, void *ctx_buf) 3955 { 3956 struct spdk_bdev_channel *ch = ctx_buf; 3957 3958 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 3959 spdk_get_thread()); 3960 3961 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 3962 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3963 3964 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 3965 spdk_spin_lock(&ch->bdev->internal.spinlock); 3966 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 3967 spdk_spin_unlock(&ch->bdev->internal.spinlock); 3968 3969 bdev_abort_all_queued_io(&ch->queued_resets, ch); 3970 3971 bdev_channel_abort_queued_ios(ch); 3972 3973 if (ch->histogram) { 3974 spdk_histogram_data_free(ch->histogram); 3975 } 3976 3977 bdev_channel_destroy_resource(ch); 3978 } 3979 3980 /* 3981 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 3982 * to it. Hence we do not have to call bdev_get_by_name() when using this function.
3983 */ 3984 static int 3985 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 3986 { 3987 struct spdk_bdev_name *tmp; 3988 3989 bdev_name->name = strdup(name); 3990 if (bdev_name->name == NULL) { 3991 SPDK_ERRLOG("Unable to allocate bdev name\n"); 3992 return -ENOMEM; 3993 } 3994 3995 bdev_name->bdev = bdev; 3996 3997 spdk_spin_lock(&g_bdev_mgr.spinlock); 3998 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3999 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4000 4001 if (tmp != NULL) { 4002 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4003 free(bdev_name->name); 4004 return -EEXIST; 4005 } 4006 4007 return 0; 4008 } 4009 4010 static void 4011 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4012 { 4013 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4014 free(bdev_name->name); 4015 } 4016 4017 static void 4018 bdev_name_del(struct spdk_bdev_name *bdev_name) 4019 { 4020 spdk_spin_lock(&g_bdev_mgr.spinlock); 4021 bdev_name_del_unsafe(bdev_name); 4022 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4023 } 4024 4025 int 4026 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4027 { 4028 struct spdk_bdev_alias *tmp; 4029 int ret; 4030 4031 if (alias == NULL) { 4032 SPDK_ERRLOG("Empty alias passed\n"); 4033 return -EINVAL; 4034 } 4035 4036 tmp = calloc(1, sizeof(*tmp)); 4037 if (tmp == NULL) { 4038 SPDK_ERRLOG("Unable to allocate alias\n"); 4039 return -ENOMEM; 4040 } 4041 4042 ret = bdev_name_add(&tmp->alias, bdev, alias); 4043 if (ret != 0) { 4044 free(tmp); 4045 return ret; 4046 } 4047 4048 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4049 4050 return 0; 4051 } 4052 4053 static int 4054 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4055 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4056 { 4057 struct spdk_bdev_alias *tmp; 4058 4059 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4060 if (strcmp(alias, tmp->alias.name) == 0) { 4061 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4062 alias_del_fn(&tmp->alias); 4063 free(tmp); 4064 return 0; 4065 } 4066 } 4067 4068 return -ENOENT; 4069 } 4070 4071 int 4072 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4073 { 4074 int rc; 4075 4076 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4077 if (rc == -ENOENT) { 4078 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4079 } 4080 4081 return rc; 4082 } 4083 4084 void 4085 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4086 { 4087 struct spdk_bdev_alias *p, *tmp; 4088 4089 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4090 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4091 bdev_name_del(&p->alias); 4092 free(p); 4093 } 4094 } 4095 4096 struct spdk_io_channel * 4097 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4098 { 4099 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4100 } 4101 4102 void * 4103 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4104 { 4105 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4106 void *ctx = NULL; 4107 4108 if (bdev->fn_table->get_module_ctx) { 4109 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4110 } 4111 4112 return ctx; 4113 } 4114 4115 const char * 4116 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4117 { 4118 return bdev->module->name; 4119 } 4120 4121 const char * 4122 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4123 { 4124 return bdev->name; 4125 } 4126 4127 const char * 4128 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4129 { 4130 return bdev->product_name; 4131 } 4132 4133 
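/*
 * Illustrative sketch (not part of the upstream file): callers typically combine the
 * property getters below to size and align data buffers before issuing I/O, e.g.
 *
 *     uint32_t blocklen = spdk_bdev_get_block_size(bdev);
 *     size_t   align    = spdk_bdev_get_buf_align(bdev);
 *     void    *buf      = spdk_dma_zmalloc(blocklen, align, NULL);
 *
 * spdk_dma_zmalloc() is the env-layer allocator; any allocator that honors the reported
 * alignment works equally well.
 */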
const struct spdk_bdev_aliases_list * 4134 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4135 { 4136 return &bdev->aliases; 4137 } 4138 4139 uint32_t 4140 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4141 { 4142 return bdev->blocklen; 4143 } 4144 4145 uint32_t 4146 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4147 { 4148 return bdev->write_unit_size; 4149 } 4150 4151 uint64_t 4152 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4153 { 4154 return bdev->blockcnt; 4155 } 4156 4157 const char * 4158 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4159 { 4160 return qos_rpc_type[type]; 4161 } 4162 4163 void 4164 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4165 { 4166 int i; 4167 4168 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4169 4170 spdk_spin_lock(&bdev->internal.spinlock); 4171 if (bdev->internal.qos) { 4172 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4173 if (bdev->internal.qos->rate_limits[i].limit != 4174 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4175 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4176 if (bdev_qos_is_iops_rate_limit(i) == false) { 4177 /* Change from Byte to Megabyte which is user visible. */ 4178 limits[i] = limits[i] / 1024 / 1024; 4179 } 4180 } 4181 } 4182 } 4183 spdk_spin_unlock(&bdev->internal.spinlock); 4184 } 4185 4186 size_t 4187 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4188 { 4189 return 1 << bdev->required_alignment; 4190 } 4191 4192 uint32_t 4193 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4194 { 4195 return bdev->optimal_io_boundary; 4196 } 4197 4198 bool 4199 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4200 { 4201 return bdev->write_cache; 4202 } 4203 4204 const struct spdk_uuid * 4205 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4206 { 4207 return &bdev->uuid; 4208 } 4209 4210 uint16_t 4211 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4212 { 4213 return bdev->acwu; 4214 } 4215 4216 uint32_t 4217 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4218 { 4219 return bdev->md_len; 4220 } 4221 4222 bool 4223 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4224 { 4225 return (bdev->md_len != 0) && bdev->md_interleave; 4226 } 4227 4228 bool 4229 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4230 { 4231 return (bdev->md_len != 0) && !bdev->md_interleave; 4232 } 4233 4234 bool 4235 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4236 { 4237 return bdev->zoned; 4238 } 4239 4240 uint32_t 4241 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4242 { 4243 if (spdk_bdev_is_md_interleaved(bdev)) { 4244 return bdev->blocklen - bdev->md_len; 4245 } else { 4246 return bdev->blocklen; 4247 } 4248 } 4249 4250 uint32_t 4251 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4252 { 4253 return bdev->phys_blocklen; 4254 } 4255 4256 static uint32_t 4257 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4258 { 4259 if (!spdk_bdev_is_md_interleaved(bdev)) { 4260 return bdev->blocklen + bdev->md_len; 4261 } else { 4262 return bdev->blocklen; 4263 } 4264 } 4265 4266 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4267 typedef enum spdk_dif_type spdk_dif_type_t; 4268 4269 spdk_dif_type_t 4270 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4271 { 4272 if (bdev->md_len != 0) { 4273 return bdev->dif_type; 4274 } else { 4275 return SPDK_DIF_DISABLE; 4276 } 4277 } 4278 4279 bool 4280 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4281 { 4282 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4283 return bdev->dif_is_head_of_md; 4284 } else { 4285 return false; 4286 } 4287 } 4288 4289 bool 4290 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4291 enum spdk_dif_check_type check_type) 4292 { 4293 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4294 return false; 4295 } 4296 4297 switch (check_type) { 4298 case SPDK_DIF_CHECK_TYPE_REFTAG: 4299 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4300 case SPDK_DIF_CHECK_TYPE_APPTAG: 4301 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4302 case SPDK_DIF_CHECK_TYPE_GUARD: 4303 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4304 default: 4305 return false; 4306 } 4307 } 4308 4309 uint32_t 4310 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4311 { 4312 return bdev->max_copy; 4313 } 4314 4315 uint64_t 4316 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4317 { 4318 return bdev->internal.measured_queue_depth; 4319 } 4320 4321 uint64_t 4322 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 4323 { 4324 return bdev->internal.period; 4325 } 4326 4327 uint64_t 4328 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 4329 { 4330 return bdev->internal.weighted_io_time; 4331 } 4332 4333 uint64_t 4334 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 4335 { 4336 return bdev->internal.io_time; 4337 } 4338 4339 static void bdev_update_qd_sampling_period(void *ctx); 4340 4341 static void 4342 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 4343 { 4344 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 4345 4346 if (bdev->internal.measured_queue_depth) { 4347 bdev->internal.io_time += bdev->internal.period; 4348 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 4349 } 4350 4351 bdev->internal.qd_poll_in_progress = false; 4352 4353 bdev_update_qd_sampling_period(bdev); 4354 } 4355 4356 static void 4357 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4358 struct spdk_io_channel *io_ch, void *_ctx) 4359 { 4360 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 4361 4362 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4363 spdk_bdev_for_each_channel_continue(i, 0); 4364 } 4365 4366 static int 4367 bdev_calculate_measured_queue_depth(void *ctx) 4368 { 4369 struct spdk_bdev *bdev = ctx; 4370 4371 bdev->internal.qd_poll_in_progress = true; 4372 bdev->internal.temporary_queue_depth = 0; 4373 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 4374 return SPDK_POLLER_BUSY; 4375 } 4376 4377 static void 4378 bdev_update_qd_sampling_period(void *ctx) 4379 { 4380 struct spdk_bdev *bdev = ctx; 4381 4382 if (bdev->internal.period == bdev->internal.new_period) { 4383 return; 4384 } 4385 4386 if (bdev->internal.qd_poll_in_progress) { 4387 return; 4388 } 4389 4390 bdev->internal.period = bdev->internal.new_period; 4391 4392 spdk_poller_unregister(&bdev->internal.qd_poller); 4393 if (bdev->internal.period != 0) { 4394 bdev->internal.qd_poller = 
SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4395 bdev, bdev->internal.period); 4396 } else { 4397 spdk_bdev_close(bdev->internal.qd_desc); 4398 bdev->internal.qd_desc = NULL; 4399 } 4400 } 4401 4402 static void 4403 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4404 { 4405 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4406 } 4407 4408 void 4409 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4410 { 4411 int rc; 4412 4413 if (bdev->internal.new_period == period) { 4414 return; 4415 } 4416 4417 bdev->internal.new_period = period; 4418 4419 if (bdev->internal.qd_desc != NULL) { 4420 assert(bdev->internal.period != 0); 4421 4422 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4423 bdev_update_qd_sampling_period, bdev); 4424 return; 4425 } 4426 4427 assert(bdev->internal.period == 0); 4428 4429 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4430 NULL, &bdev->internal.qd_desc); 4431 if (rc != 0) { 4432 return; 4433 } 4434 4435 bdev->internal.period = period; 4436 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4437 bdev, period); 4438 } 4439 4440 struct bdev_get_current_qd_ctx { 4441 uint64_t current_qd; 4442 spdk_bdev_get_current_qd_cb cb_fn; 4443 void *cb_arg; 4444 }; 4445 4446 static void 4447 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 4448 { 4449 struct bdev_get_current_qd_ctx *ctx = _ctx; 4450 4451 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 4452 4453 free(ctx); 4454 } 4455 4456 static void 4457 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4458 struct spdk_io_channel *io_ch, void *_ctx) 4459 { 4460 struct bdev_get_current_qd_ctx *ctx = _ctx; 4461 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4462 4463 ctx->current_qd += bdev_ch->io_outstanding; 4464 4465 spdk_bdev_for_each_channel_continue(i, 0); 4466 } 4467 4468 void 4469 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 4470 void *cb_arg) 4471 { 4472 struct bdev_get_current_qd_ctx *ctx; 4473 4474 assert(cb_fn != NULL); 4475 4476 ctx = calloc(1, sizeof(*ctx)); 4477 if (ctx == NULL) { 4478 cb_fn(bdev, 0, cb_arg, -ENOMEM); 4479 return; 4480 } 4481 4482 ctx->cb_fn = cb_fn; 4483 ctx->cb_arg = cb_arg; 4484 4485 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 4486 } 4487 4488 static void 4489 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 4490 { 4491 assert(desc->thread == spdk_get_thread()); 4492 4493 spdk_spin_lock(&desc->spinlock); 4494 desc->refs--; 4495 if (!desc->closed) { 4496 spdk_spin_unlock(&desc->spinlock); 4497 desc->callback.event_fn(type, 4498 desc->bdev, 4499 desc->callback.ctx); 4500 return; 4501 } else if (desc->refs == 0) { 4502 /* This descriptor was closed after this event_notify message was sent. 4503 * spdk_bdev_close() could not free the descriptor since this message was 4504 * in flight, so we free it now using bdev_desc_free(). 
4505 */ 4506 spdk_spin_unlock(&desc->spinlock); 4507 bdev_desc_free(desc); 4508 return; 4509 } 4510 spdk_spin_unlock(&desc->spinlock); 4511 } 4512 4513 static void 4514 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 4515 { 4516 spdk_spin_lock(&desc->spinlock); 4517 desc->refs++; 4518 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 4519 spdk_spin_unlock(&desc->spinlock); 4520 } 4521 4522 static void 4523 _resize_notify(void *ctx) 4524 { 4525 struct spdk_bdev_desc *desc = ctx; 4526 4527 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 4528 } 4529 4530 int 4531 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 4532 { 4533 struct spdk_bdev_desc *desc; 4534 int ret; 4535 4536 if (size == bdev->blockcnt) { 4537 return 0; 4538 } 4539 4540 spdk_spin_lock(&bdev->internal.spinlock); 4541 4542 /* bdev has open descriptors */ 4543 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 4544 bdev->blockcnt > size) { 4545 ret = -EBUSY; 4546 } else { 4547 bdev->blockcnt = size; 4548 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 4549 event_notify(desc, _resize_notify); 4550 } 4551 ret = 0; 4552 } 4553 4554 spdk_spin_unlock(&bdev->internal.spinlock); 4555 4556 return ret; 4557 } 4558 4559 /* 4560 * Convert I/O offset and length from bytes to blocks. 4561 * 4562 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 4563 */ 4564 static uint64_t 4565 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 4566 uint64_t num_bytes, uint64_t *num_blocks) 4567 { 4568 uint32_t block_size = bdev->blocklen; 4569 uint8_t shift_cnt; 4570 4571 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 4572 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 4573 shift_cnt = spdk_u32log2(block_size); 4574 *offset_blocks = offset_bytes >> shift_cnt; 4575 *num_blocks = num_bytes >> shift_cnt; 4576 return (offset_bytes - (*offset_blocks << shift_cnt)) | 4577 (num_bytes - (*num_blocks << shift_cnt)); 4578 } else { 4579 *offset_blocks = offset_bytes / block_size; 4580 *num_blocks = num_bytes / block_size; 4581 return (offset_bytes % block_size) | (num_bytes % block_size); 4582 } 4583 } 4584 4585 static bool 4586 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 4587 { 4588 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 4589 * has been an overflow and hence the offset has been wrapped around */ 4590 if (offset_blocks + num_blocks < offset_blocks) { 4591 return false; 4592 } 4593 4594 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 4595 if (offset_blocks + num_blocks > bdev->blockcnt) { 4596 return false; 4597 } 4598 4599 return true; 4600 } 4601 4602 static void 4603 bdev_seek_complete_cb(void *ctx) 4604 { 4605 struct spdk_bdev_io *bdev_io = ctx; 4606 4607 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4608 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 4609 } 4610 4611 static int 4612 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4613 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 4614 spdk_bdev_io_completion_cb cb, void *cb_arg) 4615 { 4616 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4617 struct spdk_bdev_io *bdev_io; 4618 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4619 4620 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == 
SPDK_BDEV_IO_TYPE_SEEK_HOLE); 4621 4622 /* Check if offset_blocks is valid looking at the validity of one block */ 4623 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 4624 return -EINVAL; 4625 } 4626 4627 bdev_io = bdev_channel_get_io(channel); 4628 if (!bdev_io) { 4629 return -ENOMEM; 4630 } 4631 4632 bdev_io->internal.ch = channel; 4633 bdev_io->internal.desc = desc; 4634 bdev_io->type = io_type; 4635 bdev_io->u.bdev.offset_blocks = offset_blocks; 4636 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4637 4638 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 4639 /* In case bdev doesn't support seek to next data/hole offset, 4640 * it is assumed that only data and no holes are present */ 4641 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 4642 bdev_io->u.bdev.seek.offset = offset_blocks; 4643 } else { 4644 bdev_io->u.bdev.seek.offset = UINT64_MAX; 4645 } 4646 4647 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 4648 return 0; 4649 } 4650 4651 bdev_io_submit(bdev_io); 4652 return 0; 4653 } 4654 4655 int 4656 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4657 uint64_t offset_blocks, 4658 spdk_bdev_io_completion_cb cb, void *cb_arg) 4659 { 4660 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 4661 } 4662 4663 int 4664 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4665 uint64_t offset_blocks, 4666 spdk_bdev_io_completion_cb cb, void *cb_arg) 4667 { 4668 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 4669 } 4670 4671 uint64_t 4672 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io) 4673 { 4674 return bdev_io->u.bdev.seek.offset; 4675 } 4676 4677 static int 4678 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 4679 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4680 spdk_bdev_io_completion_cb cb, void *cb_arg) 4681 { 4682 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4683 struct spdk_bdev_io *bdev_io; 4684 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4685 4686 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4687 return -EINVAL; 4688 } 4689 4690 bdev_io = bdev_channel_get_io(channel); 4691 if (!bdev_io) { 4692 return -ENOMEM; 4693 } 4694 4695 bdev_io->internal.ch = channel; 4696 bdev_io->internal.desc = desc; 4697 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4698 bdev_io->u.bdev.iovs = &bdev_io->iov; 4699 bdev_io->u.bdev.iovs[0].iov_base = buf; 4700 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4701 bdev_io->u.bdev.iovcnt = 1; 4702 bdev_io->u.bdev.md_buf = md_buf; 4703 bdev_io->u.bdev.num_blocks = num_blocks; 4704 bdev_io->u.bdev.offset_blocks = offset_blocks; 4705 bdev_io->u.bdev.memory_domain = NULL; 4706 bdev_io->u.bdev.memory_domain_ctx = NULL; 4707 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4708 4709 bdev_io_submit(bdev_io); 4710 return 0; 4711 } 4712 4713 int 4714 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4715 void *buf, uint64_t offset, uint64_t nbytes, 4716 spdk_bdev_io_completion_cb cb, void *cb_arg) 4717 { 4718 uint64_t offset_blocks, num_blocks; 4719 4720 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4721 nbytes, &num_blocks) != 0) { 4722 return -EINVAL; 4723 } 4724 4725 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4726 } 4727 4728 int 4729 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct 
spdk_io_channel *ch, 4730 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4731 spdk_bdev_io_completion_cb cb, void *cb_arg) 4732 { 4733 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 4734 } 4735 4736 int 4737 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4738 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4739 spdk_bdev_io_completion_cb cb, void *cb_arg) 4740 { 4741 struct iovec iov = { 4742 .iov_base = buf, 4743 }; 4744 4745 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4746 return -EINVAL; 4747 } 4748 4749 if (md_buf && !_is_buf_allocated(&iov)) { 4750 return -EINVAL; 4751 } 4752 4753 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4754 cb, cb_arg); 4755 } 4756 4757 int 4758 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4759 struct iovec *iov, int iovcnt, 4760 uint64_t offset, uint64_t nbytes, 4761 spdk_bdev_io_completion_cb cb, void *cb_arg) 4762 { 4763 uint64_t offset_blocks, num_blocks; 4764 4765 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4766 nbytes, &num_blocks) != 0) { 4767 return -EINVAL; 4768 } 4769 4770 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4771 } 4772 4773 static int 4774 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4775 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 4776 uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx, 4777 spdk_bdev_io_completion_cb cb, void *cb_arg) 4778 { 4779 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4780 struct spdk_bdev_io *bdev_io; 4781 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4782 4783 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4784 return -EINVAL; 4785 } 4786 4787 bdev_io = bdev_channel_get_io(channel); 4788 if (!bdev_io) { 4789 return -ENOMEM; 4790 } 4791 4792 bdev_io->internal.ch = channel; 4793 bdev_io->internal.desc = desc; 4794 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4795 bdev_io->u.bdev.iovs = iov; 4796 bdev_io->u.bdev.iovcnt = iovcnt; 4797 bdev_io->u.bdev.md_buf = md_buf; 4798 bdev_io->u.bdev.num_blocks = num_blocks; 4799 bdev_io->u.bdev.offset_blocks = offset_blocks; 4800 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4801 bdev_io->internal.memory_domain = domain; 4802 bdev_io->internal.memory_domain_ctx = domain_ctx; 4803 bdev_io->u.bdev.memory_domain = domain; 4804 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 4805 4806 _bdev_io_submit_ext(desc, bdev_io); 4807 4808 return 0; 4809 } 4810 4811 int 4812 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4813 struct iovec *iov, int iovcnt, 4814 uint64_t offset_blocks, uint64_t num_blocks, 4815 spdk_bdev_io_completion_cb cb, void *cb_arg) 4816 { 4817 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4818 num_blocks, NULL, NULL, cb, cb_arg); 4819 } 4820 4821 int 4822 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4823 struct iovec *iov, int iovcnt, void *md_buf, 4824 uint64_t offset_blocks, uint64_t num_blocks, 4825 spdk_bdev_io_completion_cb cb, void *cb_arg) 4826 { 4827 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4828 return -EINVAL; 4829 } 4830 4831 if (md_buf && !_is_buf_allocated(iov)) { 4832 return -EINVAL; 4833 } 4834 4835 return 
bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4836 num_blocks, NULL, NULL, cb, cb_arg); 4837 } 4838 4839 static inline bool 4840 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 4841 { 4842 /* 4843 * We check if opts size is at least of size when we first introduced 4844 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 4845 * are not checked internal. 4846 */ 4847 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 4848 sizeof(opts->metadata) && 4849 opts->size <= sizeof(*opts) && 4850 /* When memory domain is used, the user must provide data buffers */ 4851 (!opts->memory_domain || (iov && iov[0].iov_base)); 4852 } 4853 4854 int 4855 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4856 struct iovec *iov, int iovcnt, 4857 uint64_t offset_blocks, uint64_t num_blocks, 4858 spdk_bdev_io_completion_cb cb, void *cb_arg, 4859 struct spdk_bdev_ext_io_opts *opts) 4860 { 4861 void *md = NULL; 4862 4863 if (opts) { 4864 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 4865 return -EINVAL; 4866 } 4867 md = opts->metadata; 4868 } 4869 4870 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4871 return -EINVAL; 4872 } 4873 4874 if (md && !_is_buf_allocated(iov)) { 4875 return -EINVAL; 4876 } 4877 4878 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4879 num_blocks, 4880 bdev_get_ext_io_opt(opts, memory_domain, NULL), 4881 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 4882 cb, cb_arg); 4883 } 4884 4885 static int 4886 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4887 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4888 spdk_bdev_io_completion_cb cb, void *cb_arg) 4889 { 4890 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4891 struct spdk_bdev_io *bdev_io; 4892 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4893 4894 if (!desc->write) { 4895 return -EBADF; 4896 } 4897 4898 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4899 return -EINVAL; 4900 } 4901 4902 bdev_io = bdev_channel_get_io(channel); 4903 if (!bdev_io) { 4904 return -ENOMEM; 4905 } 4906 4907 bdev_io->internal.ch = channel; 4908 bdev_io->internal.desc = desc; 4909 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4910 bdev_io->u.bdev.iovs = &bdev_io->iov; 4911 bdev_io->u.bdev.iovs[0].iov_base = buf; 4912 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4913 bdev_io->u.bdev.iovcnt = 1; 4914 bdev_io->u.bdev.md_buf = md_buf; 4915 bdev_io->u.bdev.num_blocks = num_blocks; 4916 bdev_io->u.bdev.offset_blocks = offset_blocks; 4917 bdev_io->u.bdev.memory_domain = NULL; 4918 bdev_io->u.bdev.memory_domain_ctx = NULL; 4919 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4920 4921 bdev_io_submit(bdev_io); 4922 return 0; 4923 } 4924 4925 int 4926 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4927 void *buf, uint64_t offset, uint64_t nbytes, 4928 spdk_bdev_io_completion_cb cb, void *cb_arg) 4929 { 4930 uint64_t offset_blocks, num_blocks; 4931 4932 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4933 nbytes, &num_blocks) != 0) { 4934 return -EINVAL; 4935 } 4936 4937 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4938 } 4939 4940 int 4941 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4942 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4943 
spdk_bdev_io_completion_cb cb, void *cb_arg) 4944 { 4945 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4946 cb, cb_arg); 4947 } 4948 4949 int 4950 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4951 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4952 spdk_bdev_io_completion_cb cb, void *cb_arg) 4953 { 4954 struct iovec iov = { 4955 .iov_base = buf, 4956 }; 4957 4958 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4959 return -EINVAL; 4960 } 4961 4962 if (md_buf && !_is_buf_allocated(&iov)) { 4963 return -EINVAL; 4964 } 4965 4966 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4967 cb, cb_arg); 4968 } 4969 4970 static int 4971 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4972 struct iovec *iov, int iovcnt, void *md_buf, 4973 uint64_t offset_blocks, uint64_t num_blocks, 4974 struct spdk_memory_domain *domain, void *domain_ctx, 4975 spdk_bdev_io_completion_cb cb, void *cb_arg) 4976 { 4977 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4978 struct spdk_bdev_io *bdev_io; 4979 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4980 4981 if (!desc->write) { 4982 return -EBADF; 4983 } 4984 4985 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4986 return -EINVAL; 4987 } 4988 4989 bdev_io = bdev_channel_get_io(channel); 4990 if (!bdev_io) { 4991 return -ENOMEM; 4992 } 4993 4994 bdev_io->internal.ch = channel; 4995 bdev_io->internal.desc = desc; 4996 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4997 bdev_io->u.bdev.iovs = iov; 4998 bdev_io->u.bdev.iovcnt = iovcnt; 4999 bdev_io->u.bdev.md_buf = md_buf; 5000 bdev_io->u.bdev.num_blocks = num_blocks; 5001 bdev_io->u.bdev.offset_blocks = offset_blocks; 5002 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5003 bdev_io->internal.memory_domain = domain; 5004 bdev_io->internal.memory_domain_ctx = domain_ctx; 5005 bdev_io->u.bdev.memory_domain = domain; 5006 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5007 5008 _bdev_io_submit_ext(desc, bdev_io); 5009 5010 return 0; 5011 } 5012 5013 int 5014 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5015 struct iovec *iov, int iovcnt, 5016 uint64_t offset, uint64_t len, 5017 spdk_bdev_io_completion_cb cb, void *cb_arg) 5018 { 5019 uint64_t offset_blocks, num_blocks; 5020 5021 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5022 len, &num_blocks) != 0) { 5023 return -EINVAL; 5024 } 5025 5026 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5027 } 5028 5029 int 5030 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5031 struct iovec *iov, int iovcnt, 5032 uint64_t offset_blocks, uint64_t num_blocks, 5033 spdk_bdev_io_completion_cb cb, void *cb_arg) 5034 { 5035 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5036 num_blocks, NULL, NULL, cb, cb_arg); 5037 } 5038 5039 int 5040 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5041 struct iovec *iov, int iovcnt, void *md_buf, 5042 uint64_t offset_blocks, uint64_t num_blocks, 5043 spdk_bdev_io_completion_cb cb, void *cb_arg) 5044 { 5045 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5046 return -EINVAL; 5047 } 5048 5049 if (md_buf && !_is_buf_allocated(iov)) { 5050 return -EINVAL; 5051 } 5052 5053 return 
bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5054 num_blocks, NULL, NULL, cb, cb_arg); 5055 } 5056 5057 int 5058 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5059 struct iovec *iov, int iovcnt, 5060 uint64_t offset_blocks, uint64_t num_blocks, 5061 spdk_bdev_io_completion_cb cb, void *cb_arg, 5062 struct spdk_bdev_ext_io_opts *opts) 5063 { 5064 void *md = NULL; 5065 5066 if (opts) { 5067 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5068 return -EINVAL; 5069 } 5070 md = opts->metadata; 5071 } 5072 5073 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5074 return -EINVAL; 5075 } 5076 5077 if (md && !_is_buf_allocated(iov)) { 5078 return -EINVAL; 5079 } 5080 5081 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5082 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5083 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5084 cb, cb_arg); 5085 } 5086 5087 static void 5088 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5089 { 5090 struct spdk_bdev_io *parent_io = cb_arg; 5091 struct spdk_bdev *bdev = parent_io->bdev; 5092 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5093 int i, rc = 0; 5094 5095 if (!success) { 5096 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5097 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5098 spdk_bdev_free_io(bdev_io); 5099 return; 5100 } 5101 5102 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5103 rc = memcmp(read_buf, 5104 parent_io->u.bdev.iovs[i].iov_base, 5105 parent_io->u.bdev.iovs[i].iov_len); 5106 if (rc) { 5107 break; 5108 } 5109 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5110 } 5111 5112 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 5113 rc = memcmp(bdev_io->u.bdev.md_buf, 5114 parent_io->u.bdev.md_buf, 5115 spdk_bdev_get_md_size(bdev)); 5116 } 5117 5118 spdk_bdev_free_io(bdev_io); 5119 5120 if (rc == 0) { 5121 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5122 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5123 } else { 5124 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5125 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5126 } 5127 } 5128 5129 static void 5130 bdev_compare_do_read(void *_bdev_io) 5131 { 5132 struct spdk_bdev_io *bdev_io = _bdev_io; 5133 int rc; 5134 5135 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5136 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5137 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5138 bdev_compare_do_read_done, bdev_io); 5139 5140 if (rc == -ENOMEM) { 5141 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5142 } else if (rc != 0) { 5143 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5144 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5145 } 5146 } 5147 5148 static int 5149 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5150 struct iovec *iov, int iovcnt, void *md_buf, 5151 uint64_t offset_blocks, uint64_t num_blocks, 5152 spdk_bdev_io_completion_cb cb, void *cb_arg) 5153 { 5154 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5155 struct spdk_bdev_io *bdev_io; 5156 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5157 5158 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5159 return -EINVAL; 5160 } 5161 5162 bdev_io = bdev_channel_get_io(channel); 5163 if 
(!bdev_io) { 5164 return -ENOMEM; 5165 } 5166 5167 bdev_io->internal.ch = channel; 5168 bdev_io->internal.desc = desc; 5169 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5170 bdev_io->u.bdev.iovs = iov; 5171 bdev_io->u.bdev.iovcnt = iovcnt; 5172 bdev_io->u.bdev.md_buf = md_buf; 5173 bdev_io->u.bdev.num_blocks = num_blocks; 5174 bdev_io->u.bdev.offset_blocks = offset_blocks; 5175 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5176 bdev_io->u.bdev.memory_domain = NULL; 5177 bdev_io->u.bdev.memory_domain_ctx = NULL; 5178 5179 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5180 bdev_io_submit(bdev_io); 5181 return 0; 5182 } 5183 5184 bdev_compare_do_read(bdev_io); 5185 5186 return 0; 5187 } 5188 5189 int 5190 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5191 struct iovec *iov, int iovcnt, 5192 uint64_t offset_blocks, uint64_t num_blocks, 5193 spdk_bdev_io_completion_cb cb, void *cb_arg) 5194 { 5195 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5196 num_blocks, cb, cb_arg); 5197 } 5198 5199 int 5200 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5201 struct iovec *iov, int iovcnt, void *md_buf, 5202 uint64_t offset_blocks, uint64_t num_blocks, 5203 spdk_bdev_io_completion_cb cb, void *cb_arg) 5204 { 5205 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5206 return -EINVAL; 5207 } 5208 5209 if (md_buf && !_is_buf_allocated(iov)) { 5210 return -EINVAL; 5211 } 5212 5213 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5214 num_blocks, cb, cb_arg); 5215 } 5216 5217 static int 5218 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5219 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5220 spdk_bdev_io_completion_cb cb, void *cb_arg) 5221 { 5222 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5223 struct spdk_bdev_io *bdev_io; 5224 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5225 5226 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5227 return -EINVAL; 5228 } 5229 5230 bdev_io = bdev_channel_get_io(channel); 5231 if (!bdev_io) { 5232 return -ENOMEM; 5233 } 5234 5235 bdev_io->internal.ch = channel; 5236 bdev_io->internal.desc = desc; 5237 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5238 bdev_io->u.bdev.iovs = &bdev_io->iov; 5239 bdev_io->u.bdev.iovs[0].iov_base = buf; 5240 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5241 bdev_io->u.bdev.iovcnt = 1; 5242 bdev_io->u.bdev.md_buf = md_buf; 5243 bdev_io->u.bdev.num_blocks = num_blocks; 5244 bdev_io->u.bdev.offset_blocks = offset_blocks; 5245 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5246 bdev_io->u.bdev.memory_domain = NULL; 5247 bdev_io->u.bdev.memory_domain_ctx = NULL; 5248 5249 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5250 bdev_io_submit(bdev_io); 5251 return 0; 5252 } 5253 5254 bdev_compare_do_read(bdev_io); 5255 5256 return 0; 5257 } 5258 5259 int 5260 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5261 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5262 spdk_bdev_io_completion_cb cb, void *cb_arg) 5263 { 5264 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5265 cb, cb_arg); 5266 } 5267 5268 int 5269 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5270 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 
5271 spdk_bdev_io_completion_cb cb, void *cb_arg) 5272 { 5273 struct iovec iov = { 5274 .iov_base = buf, 5275 }; 5276 5277 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5278 return -EINVAL; 5279 } 5280 5281 if (md_buf && !_is_buf_allocated(&iov)) { 5282 return -EINVAL; 5283 } 5284 5285 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5286 cb, cb_arg); 5287 } 5288 5289 static void 5290 bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) 5291 { 5292 struct spdk_bdev_io *bdev_io = ctx; 5293 5294 if (unlock_status) { 5295 SPDK_ERRLOG("LBA range unlock failed\n"); 5296 } 5297 5298 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 5299 false, bdev_io->internal.caller_ctx); 5300 } 5301 5302 static void 5303 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 5304 { 5305 bdev_io->internal.status = status; 5306 5307 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 5308 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5309 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 5310 } 5311 5312 static void 5313 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5314 { 5315 struct spdk_bdev_io *parent_io = cb_arg; 5316 5317 if (!success) { 5318 SPDK_ERRLOG("Compare and write operation failed\n"); 5319 } 5320 5321 spdk_bdev_free_io(bdev_io); 5322 5323 bdev_comparev_and_writev_blocks_unlock(parent_io, 5324 success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 5325 } 5326 5327 static void 5328 bdev_compare_and_write_do_write(void *_bdev_io) 5329 { 5330 struct spdk_bdev_io *bdev_io = _bdev_io; 5331 int rc; 5332 5333 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 5334 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5335 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 5336 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5337 bdev_compare_and_write_do_write_done, bdev_io); 5338 5339 5340 if (rc == -ENOMEM) { 5341 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 5342 } else if (rc != 0) { 5343 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5344 } 5345 } 5346 5347 static void 5348 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5349 { 5350 struct spdk_bdev_io *parent_io = cb_arg; 5351 5352 spdk_bdev_free_io(bdev_io); 5353 5354 if (!success) { 5355 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 5356 return; 5357 } 5358 5359 bdev_compare_and_write_do_write(parent_io); 5360 } 5361 5362 static void 5363 bdev_compare_and_write_do_compare(void *_bdev_io) 5364 { 5365 struct spdk_bdev_io *bdev_io = _bdev_io; 5366 int rc; 5367 5368 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 5369 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 5370 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5371 bdev_compare_and_write_do_compare_done, bdev_io); 5372 5373 if (rc == -ENOMEM) { 5374 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 5375 } else if (rc != 0) { 5376 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 5377 } 5378 } 5379 5380 static void 5381 bdev_comparev_and_writev_blocks_locked(void *ctx, int status) 5382 { 5383 struct spdk_bdev_io *bdev_io = ctx; 5384 5385 if (status) { 
5386 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 5387 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5388 return; 5389 } 5390 5391 bdev_compare_and_write_do_compare(bdev_io); 5392 } 5393 5394 int 5395 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5396 struct iovec *compare_iov, int compare_iovcnt, 5397 struct iovec *write_iov, int write_iovcnt, 5398 uint64_t offset_blocks, uint64_t num_blocks, 5399 spdk_bdev_io_completion_cb cb, void *cb_arg) 5400 { 5401 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5402 struct spdk_bdev_io *bdev_io; 5403 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5404 5405 if (!desc->write) { 5406 return -EBADF; 5407 } 5408 5409 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5410 return -EINVAL; 5411 } 5412 5413 if (num_blocks > bdev->acwu) { 5414 return -EINVAL; 5415 } 5416 5417 bdev_io = bdev_channel_get_io(channel); 5418 if (!bdev_io) { 5419 return -ENOMEM; 5420 } 5421 5422 bdev_io->internal.ch = channel; 5423 bdev_io->internal.desc = desc; 5424 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 5425 bdev_io->u.bdev.iovs = compare_iov; 5426 bdev_io->u.bdev.iovcnt = compare_iovcnt; 5427 bdev_io->u.bdev.fused_iovs = write_iov; 5428 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 5429 bdev_io->u.bdev.md_buf = NULL; 5430 bdev_io->u.bdev.num_blocks = num_blocks; 5431 bdev_io->u.bdev.offset_blocks = offset_blocks; 5432 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5433 bdev_io->u.bdev.memory_domain = NULL; 5434 bdev_io->u.bdev.memory_domain_ctx = NULL; 5435 5436 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 5437 bdev_io_submit(bdev_io); 5438 return 0; 5439 } 5440 5441 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 5442 bdev_comparev_and_writev_blocks_locked, bdev_io); 5443 } 5444 5445 int 5446 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5447 struct iovec *iov, int iovcnt, 5448 uint64_t offset_blocks, uint64_t num_blocks, 5449 bool populate, 5450 spdk_bdev_io_completion_cb cb, void *cb_arg) 5451 { 5452 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5453 struct spdk_bdev_io *bdev_io; 5454 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5455 5456 if (!desc->write) { 5457 return -EBADF; 5458 } 5459 5460 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5461 return -EINVAL; 5462 } 5463 5464 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 5465 return -ENOTSUP; 5466 } 5467 5468 bdev_io = bdev_channel_get_io(channel); 5469 if (!bdev_io) { 5470 return -ENOMEM; 5471 } 5472 5473 bdev_io->internal.ch = channel; 5474 bdev_io->internal.desc = desc; 5475 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 5476 bdev_io->u.bdev.num_blocks = num_blocks; 5477 bdev_io->u.bdev.offset_blocks = offset_blocks; 5478 bdev_io->u.bdev.iovs = iov; 5479 bdev_io->u.bdev.iovcnt = iovcnt; 5480 bdev_io->u.bdev.md_buf = NULL; 5481 bdev_io->u.bdev.zcopy.populate = populate ? 
1 : 0; 5482 bdev_io->u.bdev.zcopy.commit = 0; 5483 bdev_io->u.bdev.zcopy.start = 1; 5484 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5485 bdev_io->u.bdev.memory_domain = NULL; 5486 bdev_io->u.bdev.memory_domain_ctx = NULL; 5487 5488 bdev_io_submit(bdev_io); 5489 5490 return 0; 5491 } 5492 5493 int 5494 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 5495 spdk_bdev_io_completion_cb cb, void *cb_arg) 5496 { 5497 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 5498 return -EINVAL; 5499 } 5500 5501 bdev_io->u.bdev.zcopy.commit = commit ? 1 : 0; 5502 bdev_io->u.bdev.zcopy.start = 0; 5503 bdev_io->internal.caller_ctx = cb_arg; 5504 bdev_io->internal.cb = cb; 5505 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 5506 5507 bdev_io_submit(bdev_io); 5508 5509 return 0; 5510 } 5511 5512 int 5513 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5514 uint64_t offset, uint64_t len, 5515 spdk_bdev_io_completion_cb cb, void *cb_arg) 5516 { 5517 uint64_t offset_blocks, num_blocks; 5518 5519 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5520 len, &num_blocks) != 0) { 5521 return -EINVAL; 5522 } 5523 5524 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5525 } 5526 5527 int 5528 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5529 uint64_t offset_blocks, uint64_t num_blocks, 5530 spdk_bdev_io_completion_cb cb, void *cb_arg) 5531 { 5532 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5533 struct spdk_bdev_io *bdev_io; 5534 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5535 5536 if (!desc->write) { 5537 return -EBADF; 5538 } 5539 5540 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5541 return -EINVAL; 5542 } 5543 5544 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 5545 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 5546 return -ENOTSUP; 5547 } 5548 5549 bdev_io = bdev_channel_get_io(channel); 5550 5551 if (!bdev_io) { 5552 return -ENOMEM; 5553 } 5554 5555 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 5556 bdev_io->internal.ch = channel; 5557 bdev_io->internal.desc = desc; 5558 bdev_io->u.bdev.offset_blocks = offset_blocks; 5559 bdev_io->u.bdev.num_blocks = num_blocks; 5560 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5561 bdev_io->u.bdev.memory_domain = NULL; 5562 bdev_io->u.bdev.memory_domain_ctx = NULL; 5563 5564 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 5565 bdev_io_submit(bdev_io); 5566 return 0; 5567 } 5568 5569 assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); 5570 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 5571 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 5572 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 5573 bdev_write_zero_buffer_next(bdev_io); 5574 5575 return 0; 5576 } 5577 5578 int 5579 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5580 uint64_t offset, uint64_t nbytes, 5581 spdk_bdev_io_completion_cb cb, void *cb_arg) 5582 { 5583 uint64_t offset_blocks, num_blocks; 5584 5585 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5586 nbytes, &num_blocks) != 0) { 5587 return -EINVAL; 5588 } 5589 5590 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5591 } 5592 5593 int 5594 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5595 uint64_t offset_blocks, uint64_t num_blocks, 5596 
spdk_bdev_io_completion_cb cb, void *cb_arg) 5597 { 5598 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5599 struct spdk_bdev_io *bdev_io; 5600 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5601 5602 if (!desc->write) { 5603 return -EBADF; 5604 } 5605 5606 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5607 return -EINVAL; 5608 } 5609 5610 if (num_blocks == 0) { 5611 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 5612 return -EINVAL; 5613 } 5614 5615 bdev_io = bdev_channel_get_io(channel); 5616 if (!bdev_io) { 5617 return -ENOMEM; 5618 } 5619 5620 bdev_io->internal.ch = channel; 5621 bdev_io->internal.desc = desc; 5622 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 5623 5624 bdev_io->u.bdev.iovs = &bdev_io->iov; 5625 bdev_io->u.bdev.iovs[0].iov_base = NULL; 5626 bdev_io->u.bdev.iovs[0].iov_len = 0; 5627 bdev_io->u.bdev.iovcnt = 1; 5628 5629 bdev_io->u.bdev.offset_blocks = offset_blocks; 5630 bdev_io->u.bdev.num_blocks = num_blocks; 5631 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5632 bdev_io->u.bdev.memory_domain = NULL; 5633 bdev_io->u.bdev.memory_domain_ctx = NULL; 5634 5635 bdev_io_submit(bdev_io); 5636 return 0; 5637 } 5638 5639 int 5640 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5641 uint64_t offset, uint64_t length, 5642 spdk_bdev_io_completion_cb cb, void *cb_arg) 5643 { 5644 uint64_t offset_blocks, num_blocks; 5645 5646 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5647 length, &num_blocks) != 0) { 5648 return -EINVAL; 5649 } 5650 5651 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5652 } 5653 5654 int 5655 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5656 uint64_t offset_blocks, uint64_t num_blocks, 5657 spdk_bdev_io_completion_cb cb, void *cb_arg) 5658 { 5659 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5660 struct spdk_bdev_io *bdev_io; 5661 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5662 5663 if (!desc->write) { 5664 return -EBADF; 5665 } 5666 5667 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5668 return -EINVAL; 5669 } 5670 5671 bdev_io = bdev_channel_get_io(channel); 5672 if (!bdev_io) { 5673 return -ENOMEM; 5674 } 5675 5676 bdev_io->internal.ch = channel; 5677 bdev_io->internal.desc = desc; 5678 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 5679 bdev_io->u.bdev.iovs = NULL; 5680 bdev_io->u.bdev.iovcnt = 0; 5681 bdev_io->u.bdev.offset_blocks = offset_blocks; 5682 bdev_io->u.bdev.num_blocks = num_blocks; 5683 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5684 5685 bdev_io_submit(bdev_io); 5686 return 0; 5687 } 5688 5689 static int bdev_reset_poll_for_outstanding_io(void *ctx); 5690 5691 static void 5692 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 5693 { 5694 struct spdk_bdev_channel *ch = _ctx; 5695 struct spdk_bdev_io *bdev_io; 5696 5697 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5698 5699 if (status == -EBUSY) { 5700 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 5701 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 5702 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 5703 } else { 5704 /* If outstanding IOs are still present and reset_io_drain_timeout seconds passed, 5705 * start the reset. 
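* The queued reset is removed from the channel's queue and submitted to the underlying module even though some I/O is still outstanding.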
*/ 5706 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5707 bdev_io_submit_reset(bdev_io); 5708 } 5709 } else { 5710 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5711 SPDK_DEBUGLOG(bdev, 5712 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 5713 ch->bdev->name); 5714 /* Mark the completion status as a SUCCESS and complete the reset. */ 5715 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 5716 } 5717 } 5718 5719 static void 5720 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5721 struct spdk_io_channel *io_ch, void *_ctx) 5722 { 5723 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 5724 int status = 0; 5725 5726 if (cur_ch->io_outstanding > 0) { 5727 /* If a channel has outstanding IO, set status to -EBUSY code. This will stop 5728 * further iteration over the rest of the channels and pass non-zero status 5729 * to the callback function. */ 5730 status = -EBUSY; 5731 } 5732 spdk_bdev_for_each_channel_continue(i, status); 5733 } 5734 5735 static int 5736 bdev_reset_poll_for_outstanding_io(void *ctx) 5737 { 5738 struct spdk_bdev_channel *ch = ctx; 5739 struct spdk_bdev_io *bdev_io; 5740 5741 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5742 5743 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 5744 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 5745 bdev_reset_check_outstanding_io_done); 5746 5747 return SPDK_POLLER_BUSY; 5748 } 5749 5750 static void 5751 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 5752 { 5753 struct spdk_bdev_channel *ch = _ctx; 5754 struct spdk_bdev_io *bdev_io; 5755 5756 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5757 5758 if (bdev->reset_io_drain_timeout == 0) { 5759 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5760 5761 bdev_io_submit_reset(bdev_io); 5762 return; 5763 } 5764 5765 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 5766 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 5767 5768 /* In case bdev->reset_io_drain_timeout is not equal to zero, 5769 * submit the reset to the underlying module only if outstanding I/O 5770 * remain after reset_io_drain_timeout seconds have passed. */ 5771 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 5772 bdev_reset_check_outstanding_io_done); 5773 } 5774 5775 static void 5776 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5777 struct spdk_io_channel *ch, void *_ctx) 5778 { 5779 struct spdk_bdev_channel *channel; 5780 struct spdk_bdev_mgmt_channel *mgmt_channel; 5781 struct spdk_bdev_shared_resource *shared_resource; 5782 bdev_io_tailq_t tmp_queued; 5783 5784 TAILQ_INIT(&tmp_queued); 5785 5786 channel = __io_ch_to_bdev_ch(ch); 5787 shared_resource = channel->shared_resource; 5788 mgmt_channel = shared_resource->mgmt_ch; 5789 5790 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 5791 5792 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 5793 /* The QoS object is always valid and readable while 5794 * the channel flag is set, so the lock here should not 5795 * be necessary. We're not in the fast path though, so 5796 * just take it anyway. 
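* If this channel owns the QoS object, any I/O queued there is moved to tmp_queued and aborted below together with the nomem and buffer-wait queues.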
*/ 5797 spdk_spin_lock(&channel->bdev->internal.spinlock); 5798 if (channel->bdev->internal.qos->ch == channel) { 5799 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 5800 } 5801 spdk_spin_unlock(&channel->bdev->internal.spinlock); 5802 } 5803 5804 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 5805 bdev_abort_all_buf_io(mgmt_channel, channel); 5806 5807 bdev_abort_all_queued_io(&tmp_queued, channel); 5808 5809 spdk_bdev_for_each_channel_continue(i, 0); 5810 } 5811 5812 static void 5813 bdev_start_reset(void *ctx) 5814 { 5815 struct spdk_bdev_channel *ch = ctx; 5816 5817 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, 5818 bdev_reset_freeze_channel_done); 5819 } 5820 5821 static void 5822 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 5823 { 5824 struct spdk_bdev *bdev = ch->bdev; 5825 5826 assert(!TAILQ_EMPTY(&ch->queued_resets)); 5827 5828 spdk_spin_lock(&bdev->internal.spinlock); 5829 if (bdev->internal.reset_in_progress == NULL) { 5830 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 5831 /* 5832 * Take a channel reference for the target bdev for the life of this 5833 * reset. This guards against the channel getting destroyed while 5834 * spdk_bdev_for_each_channel() calls related to this reset IO are in 5835 * progress. We will release the reference when this reset is 5836 * completed. 5837 */ 5838 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 5839 bdev_start_reset(ch); 5840 } 5841 spdk_spin_unlock(&bdev->internal.spinlock); 5842 } 5843 5844 int 5845 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5846 spdk_bdev_io_completion_cb cb, void *cb_arg) 5847 { 5848 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5849 struct spdk_bdev_io *bdev_io; 5850 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5851 5852 bdev_io = bdev_channel_get_io(channel); 5853 if (!bdev_io) { 5854 return -ENOMEM; 5855 } 5856 5857 bdev_io->internal.ch = channel; 5858 bdev_io->internal.desc = desc; 5859 bdev_io->internal.submit_tsc = spdk_get_ticks(); 5860 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 5861 bdev_io->u.reset.ch_ref = NULL; 5862 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5863 5864 spdk_spin_lock(&bdev->internal.spinlock); 5865 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 5866 spdk_spin_unlock(&bdev->internal.spinlock); 5867 5868 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 5869 internal.ch_link); 5870 5871 bdev_channel_start_reset(channel); 5872 5873 return 0; 5874 } 5875 5876 void 5877 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 5878 struct spdk_bdev_io_stat *stat) 5879 { 5880 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5881 5882 bdev_get_io_stat(stat, channel->stat); 5883 } 5884 5885 static void 5886 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 5887 { 5888 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 5889 5890 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 5891 bdev_iostat_ctx->cb_arg, 0); 5892 free(bdev_iostat_ctx); 5893 } 5894 5895 static void 5896 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5897 struct spdk_io_channel *ch, void *_ctx) 5898 { 5899 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 5900 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5901 5902
spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 5903 spdk_bdev_for_each_channel_continue(i, 0); 5904 } 5905 5906 void 5907 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 5908 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 5909 { 5910 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 5911 5912 assert(bdev != NULL); 5913 assert(stat != NULL); 5914 assert(cb != NULL); 5915 5916 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 5917 if (bdev_iostat_ctx == NULL) { 5918 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 5919 cb(bdev, stat, cb_arg, -ENOMEM); 5920 return; 5921 } 5922 5923 bdev_iostat_ctx->stat = stat; 5924 bdev_iostat_ctx->cb = cb; 5925 bdev_iostat_ctx->cb_arg = cb_arg; 5926 5927 /* Start with the statistics from previously deleted channels. */ 5928 spdk_spin_lock(&bdev->internal.spinlock); 5929 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 5930 spdk_spin_unlock(&bdev->internal.spinlock); 5931 5932 /* Then iterate and add the statistics from each existing channel. */ 5933 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 5934 bdev_get_device_stat_done); 5935 } 5936 5937 struct bdev_iostat_reset_ctx { 5938 enum spdk_bdev_reset_stat_mode mode; 5939 bdev_reset_device_stat_cb cb; 5940 void *cb_arg; 5941 }; 5942 5943 static void 5944 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 5945 { 5946 struct bdev_iostat_reset_ctx *ctx = _ctx; 5947 5948 ctx->cb(bdev, ctx->cb_arg, 0); 5949 5950 free(ctx); 5951 } 5952 5953 static void 5954 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5955 struct spdk_io_channel *ch, void *_ctx) 5956 { 5957 struct bdev_iostat_reset_ctx *ctx = _ctx; 5958 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5959 5960 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 5961 5962 spdk_bdev_for_each_channel_continue(i, 0); 5963 } 5964 5965 void 5966 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 5967 bdev_reset_device_stat_cb cb, void *cb_arg) 5968 { 5969 struct bdev_iostat_reset_ctx *ctx; 5970 5971 assert(bdev != NULL); 5972 assert(cb != NULL); 5973 5974 ctx = calloc(1, sizeof(*ctx)); 5975 if (ctx == NULL) { 5976 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 5977 cb(bdev, cb_arg, -ENOMEM); 5978 return; 5979 } 5980 5981 ctx->mode = mode; 5982 ctx->cb = cb; 5983 ctx->cb_arg = cb_arg; 5984 5985 spdk_spin_lock(&bdev->internal.spinlock); 5986 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 5987 spdk_spin_unlock(&bdev->internal.spinlock); 5988 5989 spdk_bdev_for_each_channel(bdev, 5990 bdev_reset_each_channel_stat, 5991 ctx, 5992 bdev_reset_device_stat_done); 5993 } 5994 5995 int 5996 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5997 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 5998 spdk_bdev_io_completion_cb cb, void *cb_arg) 5999 { 6000 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6001 struct spdk_bdev_io *bdev_io; 6002 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6003 6004 if (!desc->write) { 6005 return -EBADF; 6006 } 6007 6008 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6009 return -ENOTSUP; 6010 } 6011 6012 bdev_io = bdev_channel_get_io(channel); 6013 if (!bdev_io) { 6014 return -ENOMEM; 6015 } 6016 6017 bdev_io->internal.ch = channel; 6018 bdev_io->internal.desc = desc; 6019 bdev_io->type = 
SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6020 bdev_io->u.nvme_passthru.cmd = *cmd; 6021 bdev_io->u.nvme_passthru.buf = buf; 6022 bdev_io->u.nvme_passthru.nbytes = nbytes; 6023 bdev_io->u.nvme_passthru.md_buf = NULL; 6024 bdev_io->u.nvme_passthru.md_len = 0; 6025 6026 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6027 6028 bdev_io_submit(bdev_io); 6029 return 0; 6030 } 6031 6032 int 6033 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6034 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6035 spdk_bdev_io_completion_cb cb, void *cb_arg) 6036 { 6037 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6038 struct spdk_bdev_io *bdev_io; 6039 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6040 6041 if (!desc->write) { 6042 /* 6043 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6044 * to easily determine if the command is a read or write, but for now just 6045 * do not allow io_passthru with a read-only descriptor. 6046 */ 6047 return -EBADF; 6048 } 6049 6050 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6051 return -ENOTSUP; 6052 } 6053 6054 bdev_io = bdev_channel_get_io(channel); 6055 if (!bdev_io) { 6056 return -ENOMEM; 6057 } 6058 6059 bdev_io->internal.ch = channel; 6060 bdev_io->internal.desc = desc; 6061 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6062 bdev_io->u.nvme_passthru.cmd = *cmd; 6063 bdev_io->u.nvme_passthru.buf = buf; 6064 bdev_io->u.nvme_passthru.nbytes = nbytes; 6065 bdev_io->u.nvme_passthru.md_buf = NULL; 6066 bdev_io->u.nvme_passthru.md_len = 0; 6067 6068 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6069 6070 bdev_io_submit(bdev_io); 6071 return 0; 6072 } 6073 6074 int 6075 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6076 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6077 spdk_bdev_io_completion_cb cb, void *cb_arg) 6078 { 6079 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6080 struct spdk_bdev_io *bdev_io; 6081 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6082 6083 if (!desc->write) { 6084 /* 6085 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6086 * to easily determine if the command is a read or write, but for now just 6087 * do not allow io_passthru with a read-only descriptor. 
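* A descriptor with write access is required, e.g. one opened via spdk_bdev_open_ext() with the write flag set to true.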
6088 */ 6089 return -EBADF; 6090 } 6091 6092 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6093 return -ENOTSUP; 6094 } 6095 6096 bdev_io = bdev_channel_get_io(channel); 6097 if (!bdev_io) { 6098 return -ENOMEM; 6099 } 6100 6101 bdev_io->internal.ch = channel; 6102 bdev_io->internal.desc = desc; 6103 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6104 bdev_io->u.nvme_passthru.cmd = *cmd; 6105 bdev_io->u.nvme_passthru.buf = buf; 6106 bdev_io->u.nvme_passthru.nbytes = nbytes; 6107 bdev_io->u.nvme_passthru.md_buf = md_buf; 6108 bdev_io->u.nvme_passthru.md_len = md_len; 6109 6110 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6111 6112 bdev_io_submit(bdev_io); 6113 return 0; 6114 } 6115 6116 static void bdev_abort_retry(void *ctx); 6117 static void bdev_abort(struct spdk_bdev_io *parent_io); 6118 6119 static void 6120 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6121 { 6122 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6123 struct spdk_bdev_io *parent_io = cb_arg; 6124 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6125 6126 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6127 6128 spdk_bdev_free_io(bdev_io); 6129 6130 if (!success) { 6131 /* Check if the target I/O completed in the meantime. */ 6132 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6133 if (tmp_io == bio_to_abort) { 6134 break; 6135 } 6136 } 6137 6138 /* If the target I/O still exists, set the parent to failed. */ 6139 if (tmp_io != NULL) { 6140 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6141 } 6142 } 6143 6144 parent_io->u.bdev.split_outstanding--; 6145 if (parent_io->u.bdev.split_outstanding == 0) { 6146 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6147 bdev_abort_retry(parent_io); 6148 } else { 6149 bdev_io_complete(parent_io); 6150 } 6151 } 6152 } 6153 6154 static int 6155 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6156 struct spdk_bdev_io *bio_to_abort, 6157 spdk_bdev_io_completion_cb cb, void *cb_arg) 6158 { 6159 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6160 struct spdk_bdev_io *bdev_io; 6161 6162 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 6163 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 6164 /* TODO: Abort reset or abort request. */ 6165 return -ENOTSUP; 6166 } 6167 6168 bdev_io = bdev_channel_get_io(channel); 6169 if (bdev_io == NULL) { 6170 return -ENOMEM; 6171 } 6172 6173 bdev_io->internal.ch = channel; 6174 bdev_io->internal.desc = desc; 6175 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6176 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6177 6178 if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bio_to_abort)) { 6179 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 6180 6181 /* Parent abort request is not submitted directly, but to manage its 6182 * execution add it to the submitted list here. 6183 */ 6184 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6185 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6186 6187 bdev_abort(bdev_io); 6188 6189 return 0; 6190 } 6191 6192 bdev_io->u.abort.bio_to_abort = bio_to_abort; 6193 6194 /* Submit the abort request to the underlying bdev module. 
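* The completion callback supplied by the caller reports the outcome; in the _bdev_abort() path that callback is bdev_abort_io_done().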
*/ 6195 bdev_io_submit(bdev_io); 6196 6197 return 0; 6198 } 6199 6200 static uint32_t 6201 _bdev_abort(struct spdk_bdev_io *parent_io) 6202 { 6203 struct spdk_bdev_desc *desc = parent_io->internal.desc; 6204 struct spdk_bdev_channel *channel = parent_io->internal.ch; 6205 void *bio_cb_arg; 6206 struct spdk_bdev_io *bio_to_abort; 6207 uint32_t matched_ios; 6208 int rc; 6209 6210 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 6211 6212 /* matched_ios is returned and will be kept by the caller. 6213 * 6214 * This function will be used for two cases, 1) the same cb_arg is used for 6215 * multiple I/Os, 2) a single large I/O is split into smaller ones. 6216 * Incrementing split_outstanding directly here may confuse readers especially 6217 * for the 1st case. 6218 * 6219 * Completion of I/O abort is processed after stack unwinding. Hence this trick 6220 * works as expected. 6221 */ 6222 matched_ios = 0; 6223 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6224 6225 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 6226 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 6227 continue; 6228 } 6229 6230 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 6231 /* Any I/O which was submitted after this abort command should be excluded. */ 6232 continue; 6233 } 6234 6235 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 6236 if (rc != 0) { 6237 if (rc == -ENOMEM) { 6238 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 6239 } else { 6240 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6241 } 6242 break; 6243 } 6244 matched_ios++; 6245 } 6246 6247 return matched_ios; 6248 } 6249 6250 static void 6251 bdev_abort_retry(void *ctx) 6252 { 6253 struct spdk_bdev_io *parent_io = ctx; 6254 uint32_t matched_ios; 6255 6256 matched_ios = _bdev_abort(parent_io); 6257 6258 if (matched_ios == 0) { 6259 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6260 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6261 } else { 6262 /* For retry, the case that no target I/O was found is success 6263 * because it means target I/Os completed in the meantime. 6264 */ 6265 bdev_io_complete(parent_io); 6266 } 6267 return; 6268 } 6269 6270 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6271 parent_io->u.bdev.split_outstanding = matched_ios; 6272 } 6273 6274 static void 6275 bdev_abort(struct spdk_bdev_io *parent_io) 6276 { 6277 uint32_t matched_ios; 6278 6279 matched_ios = _bdev_abort(parent_io); 6280 6281 if (matched_ios == 0) { 6282 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6283 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6284 } else { 6285 /* The case the no target I/O was found is failure. */ 6286 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6287 bdev_io_complete(parent_io); 6288 } 6289 return; 6290 } 6291 6292 /* Use split_outstanding to manage the progress of aborting I/Os. 
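* bdev_abort_io_done() decrements this count as each child abort completes; the parent abort finishes once it reaches zero.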
*/ 6293 parent_io->u.bdev.split_outstanding = matched_ios; 6294 } 6295 6296 int 6297 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6298 void *bio_cb_arg, 6299 spdk_bdev_io_completion_cb cb, void *cb_arg) 6300 { 6301 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6302 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6303 struct spdk_bdev_io *bdev_io; 6304 6305 if (bio_cb_arg == NULL) { 6306 return -EINVAL; 6307 } 6308 6309 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 6310 return -ENOTSUP; 6311 } 6312 6313 bdev_io = bdev_channel_get_io(channel); 6314 if (bdev_io == NULL) { 6315 return -ENOMEM; 6316 } 6317 6318 bdev_io->internal.ch = channel; 6319 bdev_io->internal.desc = desc; 6320 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6321 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6322 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6323 6324 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 6325 6326 /* Parent abort request is not submitted directly, but to manage its execution, 6327 * add it to the submitted list here. 6328 */ 6329 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6330 6331 bdev_abort(bdev_io); 6332 6333 return 0; 6334 } 6335 6336 int 6337 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6338 struct spdk_bdev_io_wait_entry *entry) 6339 { 6340 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6341 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 6342 6343 if (bdev != entry->bdev) { 6344 SPDK_ERRLOG("bdevs do not match\n"); 6345 return -EINVAL; 6346 } 6347 6348 if (mgmt_ch->per_thread_cache_count > 0) { 6349 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 6350 return -EINVAL; 6351 } 6352 6353 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 6354 return 0; 6355 } 6356 6357 static inline void 6358 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 6359 { 6360 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 6361 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 6362 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 6363 uint32_t blocklen = bdev_io->bdev->blocklen; 6364 6365 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 6366 switch (bdev_io->type) { 6367 case SPDK_BDEV_IO_TYPE_READ: 6368 io_stat->bytes_read += num_blocks * blocklen; 6369 io_stat->num_read_ops++; 6370 io_stat->read_latency_ticks += tsc_diff; 6371 if (io_stat->max_read_latency_ticks < tsc_diff) { 6372 io_stat->max_read_latency_ticks = tsc_diff; 6373 } 6374 if (io_stat->min_read_latency_ticks > tsc_diff) { 6375 io_stat->min_read_latency_ticks = tsc_diff; 6376 } 6377 break; 6378 case SPDK_BDEV_IO_TYPE_WRITE: 6379 io_stat->bytes_written += num_blocks * blocklen; 6380 io_stat->num_write_ops++; 6381 io_stat->write_latency_ticks += tsc_diff; 6382 if (io_stat->max_write_latency_ticks < tsc_diff) { 6383 io_stat->max_write_latency_ticks = tsc_diff; 6384 } 6385 if (io_stat->min_write_latency_ticks > tsc_diff) { 6386 io_stat->min_write_latency_ticks = tsc_diff; 6387 } 6388 break; 6389 case SPDK_BDEV_IO_TYPE_UNMAP: 6390 io_stat->bytes_unmapped += num_blocks * blocklen; 6391 io_stat->num_unmap_ops++; 6392 io_stat->unmap_latency_ticks += tsc_diff; 6393 if (io_stat->max_unmap_latency_ticks < tsc_diff) { 6394 io_stat->max_unmap_latency_ticks = tsc_diff; 6395 } 6396 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 6397 io_stat->min_unmap_latency_ticks = tsc_diff; 6398 } 6399 break; 6400 
case SPDK_BDEV_IO_TYPE_ZCOPY: 6401 /* Track the data in the start phase only */ 6402 if (bdev_io->u.bdev.zcopy.start) { 6403 if (bdev_io->u.bdev.zcopy.populate) { 6404 io_stat->bytes_read += num_blocks * blocklen; 6405 io_stat->num_read_ops++; 6406 io_stat->read_latency_ticks += tsc_diff; 6407 if (io_stat->max_read_latency_ticks < tsc_diff) { 6408 io_stat->max_read_latency_ticks = tsc_diff; 6409 } 6410 if (io_stat->min_read_latency_ticks > tsc_diff) { 6411 io_stat->min_read_latency_ticks = tsc_diff; 6412 } 6413 } else { 6414 io_stat->bytes_written += num_blocks * blocklen; 6415 io_stat->num_write_ops++; 6416 io_stat->write_latency_ticks += tsc_diff; 6417 if (io_stat->max_write_latency_ticks < tsc_diff) { 6418 io_stat->max_write_latency_ticks = tsc_diff; 6419 } 6420 if (io_stat->min_write_latency_ticks > tsc_diff) { 6421 io_stat->min_write_latency_ticks = tsc_diff; 6422 } 6423 } 6424 } 6425 break; 6426 case SPDK_BDEV_IO_TYPE_COPY: 6427 io_stat->bytes_copied += num_blocks * blocklen; 6428 io_stat->num_copy_ops++; 6429 bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; 6430 if (io_stat->max_copy_latency_ticks < tsc_diff) { 6431 io_stat->max_copy_latency_ticks = tsc_diff; 6432 } 6433 if (io_stat->min_copy_latency_ticks > tsc_diff) { 6434 io_stat->min_copy_latency_ticks = tsc_diff; 6435 } 6436 break; 6437 default: 6438 break; 6439 } 6440 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 6441 io_stat = bdev_io->bdev->internal.stat; 6442 assert(io_stat->io_error != NULL); 6443 6444 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 6445 io_stat->io_error->error_status[-io_status - 1]++; 6446 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 6447 } 6448 6449 #ifdef SPDK_CONFIG_VTUNE 6450 uint64_t now_tsc = spdk_get_ticks(); 6451 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 6452 uint64_t data[5]; 6453 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 6454 6455 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 6456 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 6457 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 6458 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 6459 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 6460 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 6461 6462 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 6463 __itt_metadata_u64, 5, data); 6464 6465 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 6466 bdev_io->internal.ch->start_tsc = now_tsc; 6467 } 6468 #endif 6469 } 6470 6471 static inline void 6472 bdev_io_complete(void *ctx) 6473 { 6474 struct spdk_bdev_io *bdev_io = ctx; 6475 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6476 uint64_t tsc, tsc_diff; 6477 6478 if (spdk_unlikely(bdev_io->internal.in_submit_request)) { 6479 /* 6480 * Defer completion to avoid potential infinite recursion if the 6481 * user's completion callback issues a new I/O. 
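* The completion is re-driven by sending a message to the I/O's originating thread once the submission path has unwound.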
6482 */ 6483 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6484 bdev_io_complete, bdev_io); 6485 return; 6486 } 6487 6488 tsc = spdk_get_ticks(); 6489 tsc_diff = tsc - bdev_io->internal.submit_tsc; 6490 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 6491 bdev_io->internal.caller_ctx); 6492 6493 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 6494 6495 if (bdev_io->internal.ch->histogram) { 6496 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 6497 } 6498 6499 bdev_io_update_io_stat(bdev_io, tsc_diff); 6500 6501 assert(bdev_io->internal.cb != NULL); 6502 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 6503 6504 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 6505 bdev_io->internal.caller_ctx); 6506 } 6507 6508 static void bdev_destroy_cb(void *io_device); 6509 6510 static void 6511 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 6512 { 6513 struct spdk_bdev_io *bdev_io = _ctx; 6514 6515 if (bdev_io->u.reset.ch_ref != NULL) { 6516 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 6517 bdev_io->u.reset.ch_ref = NULL; 6518 } 6519 6520 bdev_io_complete(bdev_io); 6521 6522 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 6523 TAILQ_EMPTY(&bdev->internal.open_descs)) { 6524 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6525 } 6526 } 6527 6528 static void 6529 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6530 struct spdk_io_channel *_ch, void *_ctx) 6531 { 6532 struct spdk_bdev_io *bdev_io = _ctx; 6533 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 6534 struct spdk_bdev_io *queued_reset; 6535 6536 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 6537 while (!TAILQ_EMPTY(&ch->queued_resets)) { 6538 queued_reset = TAILQ_FIRST(&ch->queued_resets); 6539 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 6540 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 6541 } 6542 6543 spdk_bdev_for_each_channel_continue(i, 0); 6544 } 6545 6546 void 6547 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 6548 { 6549 struct spdk_bdev *bdev = bdev_io->bdev; 6550 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6551 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 6552 6553 bdev_io->internal.status = status; 6554 6555 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 6556 bool unlock_channels = false; 6557 6558 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 6559 SPDK_ERRLOG("NOMEM returned for reset\n"); 6560 } 6561 spdk_spin_lock(&bdev->internal.spinlock); 6562 if (bdev_io == bdev->internal.reset_in_progress) { 6563 bdev->internal.reset_in_progress = NULL; 6564 unlock_channels = true; 6565 } 6566 spdk_spin_unlock(&bdev->internal.spinlock); 6567 6568 if (unlock_channels) { 6569 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 6570 bdev_reset_complete); 6571 return; 6572 } 6573 } else { 6574 if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0)) { 6575 _bdev_io_push_bounce_data_buffer(bdev_io, _bdev_io_complete_push_bounce_done); 6576 /* bdev IO will be completed in the callback */ 6577 return; 6578 } 6579 6580 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 6581 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 6582 return; 6583 } 6584 } 6585 6586 bdev_io_complete(bdev_io); 6587 } 6588 6589 void 6590 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum 
spdk_scsi_status sc, 6591 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 6592 { 6593 if (sc == SPDK_SCSI_STATUS_GOOD) { 6594 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6595 } else { 6596 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 6597 bdev_io->internal.error.scsi.sc = sc; 6598 bdev_io->internal.error.scsi.sk = sk; 6599 bdev_io->internal.error.scsi.asc = asc; 6600 bdev_io->internal.error.scsi.ascq = ascq; 6601 } 6602 6603 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6604 } 6605 6606 void 6607 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 6608 int *sc, int *sk, int *asc, int *ascq) 6609 { 6610 assert(sc != NULL); 6611 assert(sk != NULL); 6612 assert(asc != NULL); 6613 assert(ascq != NULL); 6614 6615 switch (bdev_io->internal.status) { 6616 case SPDK_BDEV_IO_STATUS_SUCCESS: 6617 *sc = SPDK_SCSI_STATUS_GOOD; 6618 *sk = SPDK_SCSI_SENSE_NO_SENSE; 6619 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 6620 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 6621 break; 6622 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 6623 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 6624 break; 6625 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 6626 *sc = bdev_io->internal.error.scsi.sc; 6627 *sk = bdev_io->internal.error.scsi.sk; 6628 *asc = bdev_io->internal.error.scsi.asc; 6629 *ascq = bdev_io->internal.error.scsi.ascq; 6630 break; 6631 default: 6632 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 6633 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 6634 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 6635 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 6636 break; 6637 } 6638 } 6639 6640 void 6641 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 6642 { 6643 if (aio_result == 0) { 6644 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6645 } else { 6646 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 6647 } 6648 6649 bdev_io->internal.error.aio_result = aio_result; 6650 6651 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6652 } 6653 6654 void 6655 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 6656 { 6657 assert(aio_result != NULL); 6658 6659 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 6660 *aio_result = bdev_io->internal.error.aio_result; 6661 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6662 *aio_result = 0; 6663 } else { 6664 *aio_result = -EIO; 6665 } 6666 } 6667 6668 void 6669 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 6670 { 6671 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 6672 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6673 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 6674 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 6675 } else { 6676 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 6677 } 6678 6679 bdev_io->internal.error.nvme.cdw0 = cdw0; 6680 bdev_io->internal.error.nvme.sct = sct; 6681 bdev_io->internal.error.nvme.sc = sc; 6682 6683 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6684 } 6685 6686 void 6687 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 6688 { 6689 assert(sct != NULL); 6690 assert(sc != NULL); 6691 assert(cdw0 != NULL); 6692 6693 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 6694 *sct = SPDK_NVME_SCT_GENERIC; 6695 *sc = SPDK_NVME_SC_SUCCESS; 6696 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) 
{ 6697 *cdw0 = 0; 6698 } else { 6699 *cdw0 = 1U; 6700 } 6701 return; 6702 } 6703 6704 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 6705 *sct = bdev_io->internal.error.nvme.sct; 6706 *sc = bdev_io->internal.error.nvme.sc; 6707 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6708 *sct = SPDK_NVME_SCT_GENERIC; 6709 *sc = SPDK_NVME_SC_SUCCESS; 6710 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 6711 *sct = SPDK_NVME_SCT_GENERIC; 6712 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6713 } else { 6714 *sct = SPDK_NVME_SCT_GENERIC; 6715 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6716 } 6717 6718 *cdw0 = bdev_io->internal.error.nvme.cdw0; 6719 } 6720 6721 void 6722 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 6723 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 6724 { 6725 assert(first_sct != NULL); 6726 assert(first_sc != NULL); 6727 assert(second_sct != NULL); 6728 assert(second_sc != NULL); 6729 assert(cdw0 != NULL); 6730 6731 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 6732 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 6733 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 6734 *first_sct = bdev_io->internal.error.nvme.sct; 6735 *first_sc = bdev_io->internal.error.nvme.sc; 6736 *second_sct = SPDK_NVME_SCT_GENERIC; 6737 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6738 } else { 6739 *first_sct = SPDK_NVME_SCT_GENERIC; 6740 *first_sc = SPDK_NVME_SC_SUCCESS; 6741 *second_sct = bdev_io->internal.error.nvme.sct; 6742 *second_sc = bdev_io->internal.error.nvme.sc; 6743 } 6744 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 6745 *first_sct = SPDK_NVME_SCT_GENERIC; 6746 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6747 *second_sct = SPDK_NVME_SCT_GENERIC; 6748 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6749 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6750 *first_sct = SPDK_NVME_SCT_GENERIC; 6751 *first_sc = SPDK_NVME_SC_SUCCESS; 6752 *second_sct = SPDK_NVME_SCT_GENERIC; 6753 *second_sc = SPDK_NVME_SC_SUCCESS; 6754 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 6755 *first_sct = SPDK_NVME_SCT_GENERIC; 6756 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6757 *second_sct = SPDK_NVME_SCT_GENERIC; 6758 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6759 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 6760 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 6761 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 6762 *second_sct = SPDK_NVME_SCT_GENERIC; 6763 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6764 } else { 6765 *first_sct = SPDK_NVME_SCT_GENERIC; 6766 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6767 *second_sct = SPDK_NVME_SCT_GENERIC; 6768 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6769 } 6770 6771 *cdw0 = bdev_io->internal.error.nvme.cdw0; 6772 } 6773 6774 struct spdk_thread * 6775 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 6776 { 6777 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 6778 } 6779 6780 struct spdk_io_channel * 6781 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 6782 { 6783 return bdev_io->internal.ch->channel; 6784 } 6785 6786 static int 6787 bdev_register(struct spdk_bdev *bdev) 6788 { 6789 char *bdev_name; 6790 char uuid[SPDK_UUID_STRING_LEN]; 6791 int ret; 6792 6793 assert(bdev->module != NULL); 6794 6795 if (!bdev->name) { 6796 SPDK_ERRLOG("Bdev name is 
NULL\n"); 6797 return -EINVAL; 6798 } 6799 6800 if (!strlen(bdev->name)) { 6801 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 6802 return -EINVAL; 6803 } 6804 6805 /* Users often register their own I/O devices using the bdev name. In 6806 * order to avoid conflicts, prepend bdev_. */ 6807 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 6808 if (!bdev_name) { 6809 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 6810 return -ENOMEM; 6811 } 6812 6813 bdev->internal.stat = bdev_alloc_io_stat(true); 6814 if (!bdev->internal.stat) { 6815 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 6816 free(bdev_name); 6817 return -ENOMEM; 6818 } 6819 6820 bdev->internal.status = SPDK_BDEV_STATUS_READY; 6821 bdev->internal.measured_queue_depth = UINT64_MAX; 6822 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 6823 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 6824 bdev->internal.qd_poller = NULL; 6825 bdev->internal.qos = NULL; 6826 6827 TAILQ_INIT(&bdev->internal.open_descs); 6828 TAILQ_INIT(&bdev->internal.locked_ranges); 6829 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 6830 TAILQ_INIT(&bdev->aliases); 6831 6832 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 6833 if (ret != 0) { 6834 bdev_free_io_stat(bdev->internal.stat); 6835 free(bdev_name); 6836 return ret; 6837 } 6838 6839 /* UUID has to be specified by the user or defined by bdev itself. 6840 * Otherwise this field must remain empty, to indicate that this 6841 * value cannot be depended upon. */ 6842 if (!spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { 6843 /* Add the UUID alias only if it's different than the name */ 6844 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 6845 if (strcmp(bdev->name, uuid) != 0) { 6846 ret = spdk_bdev_alias_add(bdev, uuid); 6847 if (ret != 0) { 6848 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 6849 bdev_name_del(&bdev->internal.bdev_name); 6850 bdev_free_io_stat(bdev->internal.stat); 6851 free(bdev_name); 6852 return ret; 6853 } 6854 } 6855 } 6856 6857 if (spdk_bdev_get_buf_align(bdev) > 1) { 6858 if (bdev->split_on_optimal_io_boundary) { 6859 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 6860 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 6861 } else { 6862 bdev->split_on_optimal_io_boundary = true; 6863 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 6864 } 6865 } 6866 6867 /* If the user didn't specify a write unit size, set it to one. 
*/ 6868 if (bdev->write_unit_size == 0) { 6869 bdev->write_unit_size = 1; 6870 } 6871 6872 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 6873 if (bdev->acwu == 0) { 6874 bdev->acwu = bdev->write_unit_size; 6875 } 6876 6877 if (bdev->phys_blocklen == 0) { 6878 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 6879 } 6880 6881 bdev->internal.reset_in_progress = NULL; 6882 bdev->internal.qd_poll_in_progress = false; 6883 bdev->internal.period = 0; 6884 bdev->internal.new_period = 0; 6885 6886 spdk_io_device_register(__bdev_to_io_dev(bdev), 6887 bdev_channel_create, bdev_channel_destroy, 6888 sizeof(struct spdk_bdev_channel), 6889 bdev_name); 6890 6891 free(bdev_name); 6892 6893 spdk_spin_init(&bdev->internal.spinlock); 6894 6895 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 6896 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 6897 6898 return 0; 6899 } 6900 6901 static void 6902 bdev_destroy_cb(void *io_device) 6903 { 6904 int rc; 6905 struct spdk_bdev *bdev; 6906 spdk_bdev_unregister_cb cb_fn; 6907 void *cb_arg; 6908 6909 bdev = __bdev_from_io_dev(io_device); 6910 6911 if (bdev->internal.unregister_td != spdk_get_thread()) { 6912 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 6913 return; 6914 } 6915 6916 cb_fn = bdev->internal.unregister_cb; 6917 cb_arg = bdev->internal.unregister_ctx; 6918 6919 spdk_spin_destroy(&bdev->internal.spinlock); 6920 free(bdev->internal.qos); 6921 bdev_free_io_stat(bdev->internal.stat); 6922 6923 rc = bdev->fn_table->destruct(bdev->ctxt); 6924 if (rc < 0) { 6925 SPDK_ERRLOG("destruct failed\n"); 6926 } 6927 if (rc <= 0 && cb_fn != NULL) { 6928 cb_fn(cb_arg, rc); 6929 } 6930 } 6931 6932 void 6933 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 6934 { 6935 if (bdev->internal.unregister_cb != NULL) { 6936 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 6937 } 6938 } 6939 6940 static void 6941 _remove_notify(void *arg) 6942 { 6943 struct spdk_bdev_desc *desc = arg; 6944 6945 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 6946 } 6947 6948 /* returns: 0 - bdev removed and ready to be destructed. 6949 * -EBUSY - bdev can't be destructed yet. */ 6950 static int 6951 bdev_unregister_unsafe(struct spdk_bdev *bdev) 6952 { 6953 struct spdk_bdev_desc *desc, *tmp; 6954 int rc = 0; 6955 char uuid[SPDK_UUID_STRING_LEN]; 6956 6957 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 6958 assert(spdk_spin_held(&bdev->internal.spinlock)); 6959 6960 /* Notify each descriptor about hotremoval */ 6961 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 6962 rc = -EBUSY; 6963 /* 6964 * Defer invocation of the event_cb to a separate message that will 6965 * run later on its thread. This ensures this context unwinds and 6966 * we don't recursively unregister this bdev again if the event_cb 6967 * immediately closes its descriptor. 
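* rc remains -EBUSY while any descriptor is open; the unregister completes later from bdev_close() once the last descriptor is released.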
6968 */ 6969 event_notify(desc, _remove_notify); 6970 } 6971 6972 /* If there are no descriptors, proceed removing the bdev */ 6973 if (rc == 0) { 6974 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 6975 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 6976 6977 /* Delete the name and the UUID alias */ 6978 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 6979 bdev_name_del_unsafe(&bdev->internal.bdev_name); 6980 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 6981 6982 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 6983 6984 if (bdev->internal.reset_in_progress != NULL) { 6985 /* If reset is in progress, let the completion callback for reset 6986 * unregister the bdev. 6987 */ 6988 rc = -EBUSY; 6989 } 6990 } 6991 6992 return rc; 6993 } 6994 6995 static void 6996 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6997 struct spdk_io_channel *io_ch, void *_ctx) 6998 { 6999 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 7000 7001 bdev_channel_abort_queued_ios(bdev_ch); 7002 spdk_bdev_for_each_channel_continue(i, 0); 7003 } 7004 7005 static void 7006 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 7007 { 7008 int rc; 7009 7010 spdk_spin_lock(&g_bdev_mgr.spinlock); 7011 spdk_spin_lock(&bdev->internal.spinlock); 7012 /* 7013 * Set the status to REMOVING after completing to abort channels. Otherwise, 7014 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 7015 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 7016 * may fail. 7017 */ 7018 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 7019 rc = bdev_unregister_unsafe(bdev); 7020 spdk_spin_unlock(&bdev->internal.spinlock); 7021 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7022 7023 if (rc == 0) { 7024 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7025 } 7026 } 7027 7028 void 7029 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7030 { 7031 struct spdk_thread *thread; 7032 7033 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 7034 7035 thread = spdk_get_thread(); 7036 if (!thread) { 7037 /* The user called this from a non-SPDK thread. 
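* Unregistration requires an SPDK thread, so report -ENOTSUP to the caller instead.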
*/ 7038 if (cb_fn != NULL) { 7039 cb_fn(cb_arg, -ENOTSUP); 7040 } 7041 return; 7042 } 7043 7044 spdk_spin_lock(&g_bdev_mgr.spinlock); 7045 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7046 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7047 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7048 if (cb_fn) { 7049 cb_fn(cb_arg, -EBUSY); 7050 } 7051 return; 7052 } 7053 7054 spdk_spin_lock(&bdev->internal.spinlock); 7055 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 7056 bdev->internal.unregister_cb = cb_fn; 7057 bdev->internal.unregister_ctx = cb_arg; 7058 bdev->internal.unregister_td = thread; 7059 spdk_spin_unlock(&bdev->internal.spinlock); 7060 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7061 7062 spdk_bdev_set_qd_sampling_period(bdev, 0); 7063 7064 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 7065 bdev_unregister); 7066 } 7067 7068 int 7069 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 7070 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7071 { 7072 struct spdk_bdev_desc *desc; 7073 struct spdk_bdev *bdev; 7074 int rc; 7075 7076 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 7077 if (rc != 0) { 7078 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 7079 return rc; 7080 } 7081 7082 bdev = spdk_bdev_desc_get_bdev(desc); 7083 7084 if (bdev->module != module) { 7085 spdk_bdev_close(desc); 7086 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 7087 bdev_name); 7088 return -ENODEV; 7089 } 7090 7091 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 7092 7093 spdk_bdev_close(desc); 7094 7095 return 0; 7096 } 7097 7098 static int 7099 bdev_start_qos(struct spdk_bdev *bdev) 7100 { 7101 struct set_qos_limit_ctx *ctx; 7102 7103 /* Enable QoS */ 7104 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 7105 ctx = calloc(1, sizeof(*ctx)); 7106 if (ctx == NULL) { 7107 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 7108 return -ENOMEM; 7109 } 7110 ctx->bdev = bdev; 7111 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 7112 } 7113 7114 return 0; 7115 } 7116 7117 static void 7118 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 7119 struct spdk_bdev *bdev) 7120 { 7121 enum spdk_bdev_claim_type type; 7122 const char *typename, *modname; 7123 extern struct spdk_log_flag SPDK_LOG_bdev; 7124 7125 assert(spdk_spin_held(&bdev->internal.spinlock)); 7126 7127 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 7128 return; 7129 } 7130 7131 type = bdev->internal.claim_type; 7132 typename = spdk_bdev_claim_get_name(type); 7133 7134 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 7135 modname = bdev->internal.claim.v1.module->name; 7136 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7137 bdev->name, detail, typename, modname); 7138 return; 7139 } 7140 7141 if (claim_type_is_v2(type)) { 7142 struct spdk_bdev_module_claim *claim; 7143 7144 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 7145 modname = claim->module->name; 7146 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7147 bdev->name, detail, typename, modname); 7148 } 7149 return; 7150 } 7151 7152 assert(false); 7153 } 7154 7155 static int 7156 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 7157 { 7158 struct spdk_thread *thread; 7159 int rc = 0; 7160 7161 thread = spdk_get_thread(); 7162 if (!thread) { 7163 
SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 7164 return -ENOTSUP; 7165 } 7166 7167 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7168 spdk_get_thread()); 7169 7170 desc->bdev = bdev; 7171 desc->thread = thread; 7172 desc->write = write; 7173 7174 spdk_spin_lock(&bdev->internal.spinlock); 7175 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7176 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7177 spdk_spin_unlock(&bdev->internal.spinlock); 7178 return -ENODEV; 7179 } 7180 7181 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7182 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7183 spdk_spin_unlock(&bdev->internal.spinlock); 7184 return -EPERM; 7185 } 7186 7187 rc = bdev_start_qos(bdev); 7188 if (rc != 0) { 7189 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 7190 spdk_spin_unlock(&bdev->internal.spinlock); 7191 return rc; 7192 } 7193 7194 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 7195 7196 spdk_spin_unlock(&bdev->internal.spinlock); 7197 7198 return 0; 7199 } 7200 7201 static int 7202 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 7203 struct spdk_bdev_desc **_desc) 7204 { 7205 struct spdk_bdev_desc *desc; 7206 unsigned int event_id; 7207 7208 desc = calloc(1, sizeof(*desc)); 7209 if (desc == NULL) { 7210 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 7211 return -ENOMEM; 7212 } 7213 7214 TAILQ_INIT(&desc->pending_media_events); 7215 TAILQ_INIT(&desc->free_media_events); 7216 7217 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 7218 desc->callback.event_fn = event_cb; 7219 desc->callback.ctx = event_ctx; 7220 spdk_spin_init(&desc->spinlock); 7221 7222 if (bdev->media_events) { 7223 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 7224 sizeof(*desc->media_events_buffer)); 7225 if (desc->media_events_buffer == NULL) { 7226 SPDK_ERRLOG("Failed to initialize media event pool\n"); 7227 bdev_desc_free(desc); 7228 return -ENOMEM; 7229 } 7230 7231 for (event_id = 0; event_id < MEDIA_EVENT_POOL_SIZE; ++event_id) { 7232 TAILQ_INSERT_TAIL(&desc->free_media_events, 7233 &desc->media_events_buffer[event_id], tailq); 7234 } 7235 } 7236 7237 *_desc = desc; 7238 7239 return 0; 7240 } 7241 7242 int 7243 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7244 void *event_ctx, struct spdk_bdev_desc **_desc) 7245 { 7246 struct spdk_bdev_desc *desc; 7247 struct spdk_bdev *bdev; 7248 int rc; 7249 7250 if (event_cb == NULL) { 7251 SPDK_ERRLOG("Missing event callback function\n"); 7252 return -EINVAL; 7253 } 7254 7255 spdk_spin_lock(&g_bdev_mgr.spinlock); 7256 7257 bdev = bdev_get_by_name(bdev_name); 7258 7259 if (bdev == NULL) { 7260 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 7261 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7262 return -ENODEV; 7263 } 7264 7265 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 7266 if (rc != 0) { 7267 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7268 return rc; 7269 } 7270 7271 rc = bdev_open(bdev, write, desc); 7272 if (rc != 0) { 7273 bdev_desc_free(desc); 7274 desc = NULL; 7275 } 7276 7277 *_desc = desc; 7278 7279 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7280 7281 return rc; 7282 } 7283 7284 static void 7285 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 7286 { 7287 int rc; 7288 7289 spdk_spin_lock(&bdev->internal.spinlock); 7290 spdk_spin_lock(&desc->spinlock); 7291 
7292 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 7293 7294 desc->closed = true; 7295 7296 if (desc->claim != NULL) { 7297 bdev_desc_release_claims(desc); 7298 } 7299 7300 if (0 == desc->refs) { 7301 spdk_spin_unlock(&desc->spinlock); 7302 bdev_desc_free(desc); 7303 } else { 7304 spdk_spin_unlock(&desc->spinlock); 7305 } 7306 7307 /* If no more descriptors, kill QoS channel */ 7308 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7309 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 7310 bdev->name, spdk_get_thread()); 7311 7312 if (bdev_qos_destroy(bdev)) { 7313 /* There isn't anything we can do to recover here. Just let the 7314 * old QoS poller keep running. The QoS handling won't change 7315 * cores when the user allocates a new channel, but it won't break. */ 7316 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 7317 } 7318 } 7319 7320 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7321 rc = bdev_unregister_unsafe(bdev); 7322 spdk_spin_unlock(&bdev->internal.spinlock); 7323 7324 if (rc == 0) { 7325 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7326 } 7327 } else { 7328 spdk_spin_unlock(&bdev->internal.spinlock); 7329 } 7330 } 7331 7332 void 7333 spdk_bdev_close(struct spdk_bdev_desc *desc) 7334 { 7335 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7336 7337 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7338 spdk_get_thread()); 7339 7340 assert(desc->thread == spdk_get_thread()); 7341 7342 spdk_poller_unregister(&desc->io_timeout_poller); 7343 7344 spdk_spin_lock(&g_bdev_mgr.spinlock); 7345 7346 bdev_close(bdev, desc); 7347 7348 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7349 } 7350 7351 static void 7352 bdev_register_finished(void *arg) 7353 { 7354 struct spdk_bdev_desc *desc = arg; 7355 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7356 7357 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 7358 7359 spdk_spin_lock(&g_bdev_mgr.spinlock); 7360 7361 bdev_close(bdev, desc); 7362 7363 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7364 } 7365 7366 int 7367 spdk_bdev_register(struct spdk_bdev *bdev) 7368 { 7369 struct spdk_bdev_desc *desc; 7370 struct spdk_thread *thread = spdk_get_thread(); 7371 int rc; 7372 7373 if (spdk_unlikely(spdk_thread_get_app_thread() != spdk_get_thread())) { 7374 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", bdev->name, thread, 7375 thread ? 
spdk_thread_get_name(thread) : "null"); 7376 return -EINVAL; 7377 } 7378 7379 rc = bdev_register(bdev); 7380 if (rc != 0) { 7381 return rc; 7382 } 7383 7384 /* A descriptor is opened to prevent bdev deletion during examination */ 7385 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7386 if (rc != 0) { 7387 spdk_bdev_unregister(bdev, NULL, NULL); 7388 return rc; 7389 } 7390 7391 rc = bdev_open(bdev, false, desc); 7392 if (rc != 0) { 7393 bdev_desc_free(desc); 7394 spdk_bdev_unregister(bdev, NULL, NULL); 7395 return rc; 7396 } 7397 7398 /* Examine configuration before initializing I/O */ 7399 bdev_examine(bdev); 7400 7401 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 7402 if (rc != 0) { 7403 bdev_close(bdev, desc); 7404 spdk_bdev_unregister(bdev, NULL, NULL); 7405 } 7406 7407 return rc; 7408 } 7409 7410 int 7411 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 7412 struct spdk_bdev_module *module) 7413 { 7414 spdk_spin_lock(&bdev->internal.spinlock); 7415 7416 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7417 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7418 spdk_spin_unlock(&bdev->internal.spinlock); 7419 return -EPERM; 7420 } 7421 7422 if (desc && !desc->write) { 7423 desc->write = true; 7424 } 7425 7426 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 7427 bdev->internal.claim.v1.module = module; 7428 7429 spdk_spin_unlock(&bdev->internal.spinlock); 7430 return 0; 7431 } 7432 7433 void 7434 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 7435 { 7436 spdk_spin_lock(&bdev->internal.spinlock); 7437 7438 assert(bdev->internal.claim.v1.module != NULL); 7439 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 7440 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7441 bdev->internal.claim.v1.module = NULL; 7442 7443 spdk_spin_unlock(&bdev->internal.spinlock); 7444 } 7445 7446 /* 7447 * Start claims v2 7448 */ 7449 7450 const char * 7451 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 7452 { 7453 switch (type) { 7454 case SPDK_BDEV_CLAIM_NONE: 7455 return "not_claimed"; 7456 case SPDK_BDEV_CLAIM_EXCL_WRITE: 7457 return "exclusive_write"; 7458 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 7459 return "read_many_write_one"; 7460 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 7461 return "read_many_write_none"; 7462 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 7463 return "read_many_write_many"; 7464 default: 7465 break; 7466 } 7467 return "invalid_claim"; 7468 } 7469 7470 static bool 7471 claim_type_is_v2(enum spdk_bdev_claim_type type) 7472 { 7473 switch (type) { 7474 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 7475 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 7476 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 7477 return true; 7478 default: 7479 break; 7480 } 7481 return false; 7482 } 7483 7484 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
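 *
 * Only the claim types that let the claiming descriptor write to the bdev
 * (READ_MANY_WRITE_ONE and READ_MANY_WRITE_SHARED) promote the descriptor;
 * READ_MANY_WRITE_NONE leaves it read-only.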
*/ 7485 static bool 7486 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 7487 { 7488 switch (type) { 7489 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 7490 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 7491 return true; 7492 default: 7493 break; 7494 } 7495 return false; 7496 } 7497 7498 void 7499 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 7500 { 7501 if (opts == NULL) { 7502 SPDK_ERRLOG("opts should not be NULL\n"); 7503 assert(opts != NULL); 7504 return; 7505 } 7506 if (size == 0) { 7507 SPDK_ERRLOG("size should not be zero\n"); 7508 assert(size != 0); 7509 return; 7510 } 7511 7512 memset(opts, 0, size); 7513 opts->opts_size = size; 7514 7515 #define FIELD_OK(field) \ 7516 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 7517 7518 #define SET_FIELD(field, value) \ 7519 if (FIELD_OK(field)) { \ 7520 opts->field = value; \ 7521 } \ 7522 7523 SET_FIELD(shared_claim_key, 0); 7524 7525 #undef FIELD_OK 7526 #undef SET_FIELD 7527 } 7528 7529 static int 7530 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 7531 { 7532 if (src->opts_size == 0) { 7533 SPDK_ERRLOG("size should not be zero\n"); 7534 return -1; 7535 } 7536 7537 memset(dst, 0, sizeof(*dst)); 7538 dst->opts_size = src->opts_size; 7539 7540 #define FIELD_OK(field) \ 7541 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 7542 7543 #define SET_FIELD(field) \ 7544 if (FIELD_OK(field)) { \ 7545 dst->field = src->field; \ 7546 } \ 7547 7548 if (FIELD_OK(name)) { 7549 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 7550 } 7551 7552 SET_FIELD(shared_claim_key); 7553 7554 /* You should not remove this statement, but need to update the assert statement 7555 * if you add a new field, and also add a corresponding SET_FIELD statement */ 7556 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 7557 7558 #undef FIELD_OK 7559 #undef SET_FIELD 7560 return 0; 7561 } 7562 7563 /* Returns 0 if a read-write-once claim can be taken. */ 7564 static int 7565 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 7566 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 7567 { 7568 struct spdk_bdev *bdev = desc->bdev; 7569 struct spdk_bdev_desc *open_desc; 7570 7571 assert(spdk_spin_held(&bdev->internal.spinlock)); 7572 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 7573 7574 if (opts->shared_claim_key != 0) { 7575 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 7576 bdev->name); 7577 return -EINVAL; 7578 } 7579 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7580 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7581 return -EPERM; 7582 } 7583 if (desc->claim != NULL) { 7584 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 7585 bdev->name, desc->claim->module->name); 7586 return -EPERM; 7587 } 7588 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 7589 if (desc != open_desc && open_desc->write) { 7590 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 7591 "another descriptor is open for writing\n", 7592 bdev->name); 7593 return -EPERM; 7594 } 7595 } 7596 7597 return 0; 7598 } 7599 7600 /* Returns 0 if a read-only-many claim can be taken. 
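 *
 * Illustrative sketch (not part of this file): a module that has opened a
 * read-only descriptor 'desc' could request such a claim as follows, where
 * 'my_module' is a hypothetical struct spdk_bdev_module:
 *
 *   struct spdk_bdev_claim_opts opts;
 *   int rc;
 *
 *   spdk_bdev_claim_opts_init(&opts, sizeof(opts));
 *   rc = spdk_bdev_module_claim_bdev_desc(desc, SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE,
 *                                         &opts, &my_module);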
*/ 7601 static int 7602 claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 7603 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 7604 { 7605 struct spdk_bdev *bdev = desc->bdev; 7606 struct spdk_bdev_desc *open_desc; 7607 7608 assert(spdk_spin_held(&bdev->internal.spinlock)); 7609 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE); 7610 assert(desc->claim == NULL); 7611 7612 if (desc->write) { 7613 SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n", 7614 bdev->name); 7615 return -EINVAL; 7616 } 7617 if (opts->shared_claim_key != 0) { 7618 SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name); 7619 return -EINVAL; 7620 } 7621 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 7622 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 7623 if (open_desc->write) { 7624 SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while " 7625 "another descriptor is open for writing\n", 7626 bdev->name); 7627 return -EPERM; 7628 } 7629 } 7630 } 7631 7632 return 0; 7633 } 7634 7635 /* Returns 0 if a read-write-many claim can be taken. */ 7636 static int 7637 claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 7638 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 7639 { 7640 struct spdk_bdev *bdev = desc->bdev; 7641 struct spdk_bdev_desc *open_desc; 7642 7643 assert(spdk_spin_held(&bdev->internal.spinlock)); 7644 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED); 7645 assert(desc->claim == NULL); 7646 7647 if (opts->shared_claim_key == 0) { 7648 SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n", 7649 bdev->name); 7650 return -EINVAL; 7651 } 7652 switch (bdev->internal.claim_type) { 7653 case SPDK_BDEV_CLAIM_NONE: 7654 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 7655 if (open_desc == desc) { 7656 continue; 7657 } 7658 if (open_desc->write) { 7659 SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while " 7660 "another descriptor is open for writing without a " 7661 "claim\n", bdev->name); 7662 return -EPERM; 7663 } 7664 } 7665 break; 7666 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 7667 if (opts->shared_claim_key != bdev->internal.claim.v2.key) { 7668 LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev); 7669 return -EPERM; 7670 } 7671 break; 7672 default: 7673 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7674 return -EBUSY; 7675 } 7676 7677 return 0; 7678 } 7679 7680 /* Updates desc and its bdev with a v2 claim.
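 *
 * Specifically: allocates a struct spdk_bdev_module_claim recording the module,
 * descriptor and claim name, links it into bdev->internal.claim.v2.claims
 * (initializing the claim type and shared key when this is the bdev's first v2
 * claim), and promotes the descriptor to writable when the claim type allows writes.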
*/ 7681 static int 7682 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 7683 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 7684 { 7685 struct spdk_bdev *bdev = desc->bdev; 7686 struct spdk_bdev_module_claim *claim; 7687 7688 assert(spdk_spin_held(&bdev->internal.spinlock)); 7689 assert(claim_type_is_v2(type)); 7690 assert(desc->claim == NULL); 7691 7692 claim = calloc(1, sizeof(*desc->claim)); 7693 if (claim == NULL) { 7694 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 7695 return -ENOMEM; 7696 } 7697 claim->module = module; 7698 claim->desc = desc; 7699 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 7700 memcpy(claim->name, opts->name, sizeof(claim->name)); 7701 desc->claim = claim; 7702 7703 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 7704 bdev->internal.claim_type = type; 7705 TAILQ_INIT(&bdev->internal.claim.v2.claims); 7706 bdev->internal.claim.v2.key = opts->shared_claim_key; 7707 } 7708 assert(type == bdev->internal.claim_type); 7709 7710 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 7711 7712 if (!desc->write && claim_type_promotes_to_write(type)) { 7713 desc->write = true; 7714 } 7715 7716 return 0; 7717 } 7718 7719 int 7720 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 7721 struct spdk_bdev_claim_opts *_opts, 7722 struct spdk_bdev_module *module) 7723 { 7724 struct spdk_bdev *bdev; 7725 struct spdk_bdev_claim_opts opts; 7726 int rc = 0; 7727 7728 if (desc == NULL) { 7729 SPDK_ERRLOG("descriptor must not be NULL\n"); 7730 return -EINVAL; 7731 } 7732 bdev = desc->bdev; 7733 if (_opts == NULL) { 7734 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 7735 } else if (claim_opts_copy(_opts, &opts) != 0) { 7736 return -EINVAL; 7737 } 7738 7739 spdk_spin_lock(&bdev->internal.spinlock); 7740 7741 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 7742 bdev->internal.claim_type != type) { 7743 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7744 spdk_spin_unlock(&bdev->internal.spinlock); 7745 return -EPERM; 7746 } 7747 7748 if (claim_type_is_v2(type) && desc->claim != NULL) { 7749 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 7750 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 7751 spdk_spin_unlock(&bdev->internal.spinlock); 7752 return -EPERM; 7753 } 7754 7755 switch (type) { 7756 case SPDK_BDEV_CLAIM_EXCL_WRITE: 7757 spdk_spin_unlock(&bdev->internal.spinlock); 7758 return spdk_bdev_module_claim_bdev(bdev, desc, module); 7759 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 7760 rc = claim_verify_rwo(desc, type, &opts, module); 7761 break; 7762 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 7763 rc = claim_verify_rom(desc, type, &opts, module); 7764 break; 7765 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 7766 rc = claim_verify_rwm(desc, type, &opts, module); 7767 break; 7768 default: 7769 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 7770 rc = -ENOTSUP; 7771 } 7772 7773 if (rc == 0) { 7774 rc = claim_bdev(desc, type, &opts, module); 7775 } 7776 7777 spdk_spin_unlock(&bdev->internal.spinlock); 7778 return rc; 7779 } 7780 7781 static void 7782 claim_reset(struct spdk_bdev *bdev) 7783 { 7784 assert(spdk_spin_held(&bdev->internal.spinlock)); 7785 assert(claim_type_is_v2(bdev->internal.claim_type)); 7786 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 7787 7788 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7789
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7790 } 7791 7792 static void 7793 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 7794 { 7795 struct spdk_bdev *bdev = desc->bdev; 7796 7797 assert(spdk_spin_held(&bdev->internal.spinlock)); 7798 assert(claim_type_is_v2(bdev->internal.claim_type)); 7799 7800 if (bdev->internal.examine_in_progress == 0) { 7801 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 7802 free(desc->claim); 7803 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 7804 claim_reset(bdev); 7805 } 7806 } else { 7807 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 7808 desc->claim->module = NULL; 7809 desc->claim->desc = NULL; 7810 } 7811 desc->claim = NULL; 7812 } 7813 7814 /* 7815 * End claims v2 7816 */ 7817 7818 struct spdk_bdev * 7819 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 7820 { 7821 assert(desc != NULL); 7822 return desc->bdev; 7823 } 7824 7825 int 7826 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 7827 { 7828 struct spdk_bdev *bdev, *tmp; 7829 struct spdk_bdev_desc *desc; 7830 int rc = 0; 7831 7832 assert(fn != NULL); 7833 7834 spdk_spin_lock(&g_bdev_mgr.spinlock); 7835 bdev = spdk_bdev_first(); 7836 while (bdev != NULL) { 7837 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7838 if (rc != 0) { 7839 break; 7840 } 7841 rc = bdev_open(bdev, false, desc); 7842 if (rc != 0) { 7843 bdev_desc_free(desc); 7844 if (rc == -ENODEV) { 7845 /* Ignore the error and move to the next bdev. */ 7846 rc = 0; 7847 bdev = spdk_bdev_next(bdev); 7848 continue; 7849 } 7850 break; 7851 } 7852 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7853 7854 rc = fn(ctx, bdev); 7855 7856 spdk_spin_lock(&g_bdev_mgr.spinlock); 7857 tmp = spdk_bdev_next(bdev); 7858 bdev_close(bdev, desc); 7859 if (rc != 0) { 7860 break; 7861 } 7862 bdev = tmp; 7863 } 7864 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7865 7866 return rc; 7867 } 7868 7869 int 7870 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 7871 { 7872 struct spdk_bdev *bdev, *tmp; 7873 struct spdk_bdev_desc *desc; 7874 int rc = 0; 7875 7876 assert(fn != NULL); 7877 7878 spdk_spin_lock(&g_bdev_mgr.spinlock); 7879 bdev = spdk_bdev_first_leaf(); 7880 while (bdev != NULL) { 7881 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7882 if (rc != 0) { 7883 break; 7884 } 7885 rc = bdev_open(bdev, false, desc); 7886 if (rc != 0) { 7887 bdev_desc_free(desc); 7888 if (rc == -ENODEV) { 7889 /* Ignore the error and move to the next bdev. 
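 * (bdev_open() returns -ENODEV when the bdev is already being unregistered
 * or removed, so such a bdev is simply skipped.)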
*/ 7890 rc = 0; 7891 bdev = spdk_bdev_next_leaf(bdev); 7892 continue; 7893 } 7894 break; 7895 } 7896 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7897 7898 rc = fn(ctx, bdev); 7899 7900 spdk_spin_lock(&g_bdev_mgr.spinlock); 7901 tmp = spdk_bdev_next_leaf(bdev); 7902 bdev_close(bdev, desc); 7903 if (rc != 0) { 7904 break; 7905 } 7906 bdev = tmp; 7907 } 7908 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7909 7910 return rc; 7911 } 7912 7913 void 7914 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 7915 { 7916 struct iovec *iovs; 7917 int iovcnt; 7918 7919 if (bdev_io == NULL) { 7920 return; 7921 } 7922 7923 switch (bdev_io->type) { 7924 case SPDK_BDEV_IO_TYPE_READ: 7925 case SPDK_BDEV_IO_TYPE_WRITE: 7926 case SPDK_BDEV_IO_TYPE_ZCOPY: 7927 iovs = bdev_io->u.bdev.iovs; 7928 iovcnt = bdev_io->u.bdev.iovcnt; 7929 break; 7930 default: 7931 iovs = NULL; 7932 iovcnt = 0; 7933 break; 7934 } 7935 7936 if (iovp) { 7937 *iovp = iovs; 7938 } 7939 if (iovcntp) { 7940 *iovcntp = iovcnt; 7941 } 7942 } 7943 7944 void * 7945 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 7946 { 7947 if (bdev_io == NULL) { 7948 return NULL; 7949 } 7950 7951 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 7952 return NULL; 7953 } 7954 7955 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 7956 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 7957 return bdev_io->u.bdev.md_buf; 7958 } 7959 7960 return NULL; 7961 } 7962 7963 void * 7964 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 7965 { 7966 if (bdev_io == NULL) { 7967 assert(false); 7968 return NULL; 7969 } 7970 7971 return bdev_io->internal.caller_ctx; 7972 } 7973 7974 void 7975 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 7976 { 7977 7978 if (spdk_bdev_module_list_find(bdev_module->name)) { 7979 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 7980 assert(false); 7981 } 7982 7983 spdk_spin_init(&bdev_module->internal.spinlock); 7984 7985 /* 7986 * Modules with examine callbacks must be initialized first, so they are 7987 * ready to handle examine callbacks from later modules that will 7988 * register physical bdevs. 
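 * For example, a virtual bdev module that builds on other bdevs (such as RAID
 * or a logical volume store) typically provides examine callbacks and is
 * therefore inserted at the head of the list below, ahead of the modules that
 * create its base bdevs.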
7989 */ 7990 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 7991 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 7992 } else { 7993 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 7994 } 7995 } 7996 7997 struct spdk_bdev_module * 7998 spdk_bdev_module_list_find(const char *name) 7999 { 8000 struct spdk_bdev_module *bdev_module; 8001 8002 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 8003 if (strcmp(name, bdev_module->name) == 0) { 8004 break; 8005 } 8006 } 8007 8008 return bdev_module; 8009 } 8010 8011 static void 8012 bdev_write_zero_buffer_next(void *_bdev_io) 8013 { 8014 struct spdk_bdev_io *bdev_io = _bdev_io; 8015 uint64_t num_bytes, num_blocks; 8016 void *md_buf = NULL; 8017 int rc; 8018 8019 num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * 8020 bdev_io->u.bdev.split_remaining_num_blocks, 8021 ZERO_BUFFER_SIZE); 8022 num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); 8023 num_blocks -= num_blocks % bdev_io->bdev->write_unit_size; 8024 8025 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 8026 md_buf = (char *)g_bdev_mgr.zero_buffer + 8027 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 8028 } 8029 8030 rc = bdev_write_blocks_with_md(bdev_io->internal.desc, 8031 spdk_io_channel_from_ctx(bdev_io->internal.ch), 8032 g_bdev_mgr.zero_buffer, md_buf, 8033 bdev_io->u.bdev.split_current_offset_blocks, num_blocks, 8034 bdev_write_zero_buffer_done, bdev_io); 8035 if (rc == 0) { 8036 bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; 8037 bdev_io->u.bdev.split_current_offset_blocks += num_blocks; 8038 } else if (rc == -ENOMEM) { 8039 bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); 8040 } else { 8041 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 8042 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 8043 } 8044 } 8045 8046 static void 8047 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 8048 { 8049 struct spdk_bdev_io *parent_io = cb_arg; 8050 8051 spdk_bdev_free_io(bdev_io); 8052 8053 if (!success) { 8054 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 8055 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 8056 return; 8057 } 8058 8059 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 8060 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 8061 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 8062 return; 8063 } 8064 8065 bdev_write_zero_buffer_next(parent_io); 8066 } 8067 8068 static void 8069 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 8070 { 8071 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8072 ctx->bdev->internal.qos_mod_in_progress = false; 8073 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8074 8075 if (ctx->cb_fn) { 8076 ctx->cb_fn(ctx->cb_arg, status); 8077 } 8078 free(ctx); 8079 } 8080 8081 static void 8082 bdev_disable_qos_done(void *cb_arg) 8083 { 8084 struct set_qos_limit_ctx *ctx = cb_arg; 8085 struct spdk_bdev *bdev = ctx->bdev; 8086 struct spdk_bdev_io *bdev_io; 8087 struct spdk_bdev_qos *qos; 8088 8089 spdk_spin_lock(&bdev->internal.spinlock); 8090 qos = bdev->internal.qos; 8091 bdev->internal.qos = NULL; 8092 spdk_spin_unlock(&bdev->internal.spinlock); 8093 8094 while (!TAILQ_EMPTY(&qos->queued)) { 8095 /* Send queued I/O back to their original thread for resubmission. 
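 * The QoS object, its poller and its channel are torn down below, so anything
 * still sitting on the QoS queue must be re-driven through the normal
 * submission path on its original thread.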
*/ 8096 bdev_io = TAILQ_FIRST(&qos->queued); 8097 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 8098 8099 if (bdev_io->internal.io_submit_ch) { 8100 /* 8101 * Channel was changed when sending it to the QoS thread - change it back 8102 * before sending it back to the original thread. 8103 */ 8104 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 8105 bdev_io->internal.io_submit_ch = NULL; 8106 } 8107 8108 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 8109 _bdev_io_submit, bdev_io); 8110 } 8111 8112 if (qos->thread != NULL) { 8113 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 8114 spdk_poller_unregister(&qos->poller); 8115 } 8116 8117 free(qos); 8118 8119 bdev_set_qos_limit_done(ctx, 0); 8120 } 8121 8122 static void 8123 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 8124 { 8125 struct set_qos_limit_ctx *ctx = _ctx; 8126 struct spdk_thread *thread; 8127 8128 spdk_spin_lock(&bdev->internal.spinlock); 8129 thread = bdev->internal.qos->thread; 8130 spdk_spin_unlock(&bdev->internal.spinlock); 8131 8132 if (thread != NULL) { 8133 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 8134 } else { 8135 bdev_disable_qos_done(ctx); 8136 } 8137 } 8138 8139 static void 8140 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8141 struct spdk_io_channel *ch, void *_ctx) 8142 { 8143 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8144 8145 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 8146 8147 spdk_bdev_for_each_channel_continue(i, 0); 8148 } 8149 8150 static void 8151 bdev_update_qos_rate_limit_msg(void *cb_arg) 8152 { 8153 struct set_qos_limit_ctx *ctx = cb_arg; 8154 struct spdk_bdev *bdev = ctx->bdev; 8155 8156 spdk_spin_lock(&bdev->internal.spinlock); 8157 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 8158 spdk_spin_unlock(&bdev->internal.spinlock); 8159 8160 bdev_set_qos_limit_done(ctx, 0); 8161 } 8162 8163 static void 8164 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8165 struct spdk_io_channel *ch, void *_ctx) 8166 { 8167 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8168 8169 spdk_spin_lock(&bdev->internal.spinlock); 8170 bdev_enable_qos(bdev, bdev_ch); 8171 spdk_spin_unlock(&bdev->internal.spinlock); 8172 spdk_bdev_for_each_channel_continue(i, 0); 8173 } 8174 8175 static void 8176 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 8177 { 8178 struct set_qos_limit_ctx *ctx = _ctx; 8179 8180 bdev_set_qos_limit_done(ctx, status); 8181 } 8182 8183 static void 8184 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 8185 { 8186 int i; 8187 8188 assert(bdev->internal.qos != NULL); 8189 8190 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8191 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 8192 bdev->internal.qos->rate_limits[i].limit = limits[i]; 8193 8194 if (limits[i] == 0) { 8195 bdev->internal.qos->rate_limits[i].limit = 8196 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 8197 } 8198 } 8199 } 8200 } 8201 8202 void 8203 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 8204 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 8205 { 8206 struct set_qos_limit_ctx *ctx; 8207 uint32_t limit_set_complement; 8208 uint64_t min_limit_per_sec; 8209 int i; 8210 bool disable_rate_limit = true; 8211 8212 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8213 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 8214 continue; 8215 } 8216 8217 if (limits[i] > 0) { 8218 disable_rate_limit = 
false; 8219 } 8220 8221 if (bdev_qos_is_iops_rate_limit(i) == true) { 8222 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 8223 } else { 8224 /* Change from megabyte to byte rate limit */ 8225 limits[i] = limits[i] * 1024 * 1024; 8226 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 8227 } 8228 8229 limit_set_complement = limits[i] % min_limit_per_sec; 8230 if (limit_set_complement) { 8231 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 8232 limits[i], min_limit_per_sec); 8233 limits[i] += min_limit_per_sec - limit_set_complement; 8234 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 8235 } 8236 } 8237 8238 ctx = calloc(1, sizeof(*ctx)); 8239 if (ctx == NULL) { 8240 cb_fn(cb_arg, -ENOMEM); 8241 return; 8242 } 8243 8244 ctx->cb_fn = cb_fn; 8245 ctx->cb_arg = cb_arg; 8246 ctx->bdev = bdev; 8247 8248 spdk_spin_lock(&bdev->internal.spinlock); 8249 if (bdev->internal.qos_mod_in_progress) { 8250 spdk_spin_unlock(&bdev->internal.spinlock); 8251 free(ctx); 8252 cb_fn(cb_arg, -EAGAIN); 8253 return; 8254 } 8255 bdev->internal.qos_mod_in_progress = true; 8256 8257 if (disable_rate_limit == true && bdev->internal.qos) { 8258 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8259 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 8260 (bdev->internal.qos->rate_limits[i].limit > 0 && 8261 bdev->internal.qos->rate_limits[i].limit != 8262 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 8263 disable_rate_limit = false; 8264 break; 8265 } 8266 } 8267 } 8268 8269 if (disable_rate_limit == false) { 8270 if (bdev->internal.qos == NULL) { 8271 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 8272 if (!bdev->internal.qos) { 8273 spdk_spin_unlock(&bdev->internal.spinlock); 8274 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 8275 bdev_set_qos_limit_done(ctx, -ENOMEM); 8276 return; 8277 } 8278 } 8279 8280 if (bdev->internal.qos->thread == NULL) { 8281 /* Enabling */ 8282 bdev_set_qos_rate_limits(bdev, limits); 8283 8284 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 8285 bdev_enable_qos_done); 8286 } else { 8287 /* Updating */ 8288 bdev_set_qos_rate_limits(bdev, limits); 8289 8290 spdk_thread_send_msg(bdev->internal.qos->thread, 8291 bdev_update_qos_rate_limit_msg, ctx); 8292 } 8293 } else { 8294 if (bdev->internal.qos != NULL) { 8295 bdev_set_qos_rate_limits(bdev, limits); 8296 8297 /* Disabling */ 8298 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 8299 bdev_disable_qos_msg_done); 8300 } else { 8301 spdk_spin_unlock(&bdev->internal.spinlock); 8302 bdev_set_qos_limit_done(ctx, 0); 8303 return; 8304 } 8305 } 8306 8307 spdk_spin_unlock(&bdev->internal.spinlock); 8308 } 8309 8310 struct spdk_bdev_histogram_ctx { 8311 spdk_bdev_histogram_status_cb cb_fn; 8312 void *cb_arg; 8313 struct spdk_bdev *bdev; 8314 int status; 8315 }; 8316 8317 static void 8318 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8319 { 8320 struct spdk_bdev_histogram_ctx *ctx = _ctx; 8321 8322 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8323 ctx->bdev->internal.histogram_in_progress = false; 8324 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8325 ctx->cb_fn(ctx->cb_arg, ctx->status); 8326 free(ctx); 8327 } 8328 8329 static void 8330 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8331 struct spdk_io_channel *_ch, void *_ctx) 8332 { 8333 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8334 8335 if (ch->histogram != NULL) { 8336 
spdk_histogram_data_free(ch->histogram); 8337 ch->histogram = NULL; 8338 } 8339 spdk_bdev_for_each_channel_continue(i, 0); 8340 } 8341 8342 static void 8343 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8344 { 8345 struct spdk_bdev_histogram_ctx *ctx = _ctx; 8346 8347 if (status != 0) { 8348 ctx->status = status; 8349 ctx->bdev->internal.histogram_enabled = false; 8350 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 8351 bdev_histogram_disable_channel_cb); 8352 } else { 8353 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8354 ctx->bdev->internal.histogram_in_progress = false; 8355 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8356 ctx->cb_fn(ctx->cb_arg, ctx->status); 8357 free(ctx); 8358 } 8359 } 8360 8361 static void 8362 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8363 struct spdk_io_channel *_ch, void *_ctx) 8364 { 8365 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8366 int status = 0; 8367 8368 if (ch->histogram == NULL) { 8369 ch->histogram = spdk_histogram_data_alloc(); 8370 if (ch->histogram == NULL) { 8371 status = -ENOMEM; 8372 } 8373 } 8374 8375 spdk_bdev_for_each_channel_continue(i, status); 8376 } 8377 8378 void 8379 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 8380 void *cb_arg, bool enable) 8381 { 8382 struct spdk_bdev_histogram_ctx *ctx; 8383 8384 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 8385 if (ctx == NULL) { 8386 cb_fn(cb_arg, -ENOMEM); 8387 return; 8388 } 8389 8390 ctx->bdev = bdev; 8391 ctx->status = 0; 8392 ctx->cb_fn = cb_fn; 8393 ctx->cb_arg = cb_arg; 8394 8395 spdk_spin_lock(&bdev->internal.spinlock); 8396 if (bdev->internal.histogram_in_progress) { 8397 spdk_spin_unlock(&bdev->internal.spinlock); 8398 free(ctx); 8399 cb_fn(cb_arg, -EAGAIN); 8400 return; 8401 } 8402 8403 bdev->internal.histogram_in_progress = true; 8404 spdk_spin_unlock(&bdev->internal.spinlock); 8405 8406 bdev->internal.histogram_enabled = enable; 8407 8408 if (enable) { 8409 /* Allocate histogram for each channel */ 8410 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 8411 bdev_histogram_enable_channel_cb); 8412 } else { 8413 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 8414 bdev_histogram_disable_channel_cb); 8415 } 8416 } 8417 8418 struct spdk_bdev_histogram_data_ctx { 8419 spdk_bdev_histogram_data_cb cb_fn; 8420 void *cb_arg; 8421 struct spdk_bdev *bdev; 8422 /** merged histogram data from all channels */ 8423 struct spdk_histogram_data *histogram; 8424 }; 8425 8426 static void 8427 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8428 { 8429 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 8430 8431 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 8432 free(ctx); 8433 } 8434 8435 static void 8436 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8437 struct spdk_io_channel *_ch, void *_ctx) 8438 { 8439 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8440 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 8441 int status = 0; 8442 8443 if (ch->histogram == NULL) { 8444 status = -EFAULT; 8445 } else { 8446 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 8447 } 8448 8449 spdk_bdev_for_each_channel_continue(i, status); 8450 } 8451 8452 void 8453 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 8454 spdk_bdev_histogram_data_cb cb_fn, 8455 void *cb_arg) 8456 { 
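/* Merges the per-channel histograms into the caller-provided 'histogram' and
 * reports the result through cb_fn once every channel has been visited.
 */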
8457 struct spdk_bdev_histogram_data_ctx *ctx; 8458 8459 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 8460 if (ctx == NULL) { 8461 cb_fn(cb_arg, -ENOMEM, NULL); 8462 return; 8463 } 8464 8465 ctx->bdev = bdev; 8466 ctx->cb_fn = cb_fn; 8467 ctx->cb_arg = cb_arg; 8468 8469 ctx->histogram = histogram; 8470 8471 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 8472 bdev_histogram_get_channel_cb); 8473 } 8474 8475 void 8476 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 8477 void *cb_arg) 8478 { 8479 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8480 int status = 0; 8481 8482 assert(cb_fn != NULL); 8483 8484 if (bdev_ch->histogram == NULL) { 8485 status = -EFAULT; 8486 } 8487 cb_fn(cb_arg, status, bdev_ch->histogram); 8488 } 8489 8490 size_t 8491 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 8492 size_t max_events) 8493 { 8494 struct media_event_entry *entry; 8495 size_t num_events = 0; 8496 8497 for (; num_events < max_events; ++num_events) { 8498 entry = TAILQ_FIRST(&desc->pending_media_events); 8499 if (entry == NULL) { 8500 break; 8501 } 8502 8503 events[num_events] = entry->event; 8504 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 8505 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 8506 } 8507 8508 return num_events; 8509 } 8510 8511 int 8512 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 8513 size_t num_events) 8514 { 8515 struct spdk_bdev_desc *desc; 8516 struct media_event_entry *entry; 8517 size_t event_id; 8518 int rc = 0; 8519 8520 assert(bdev->media_events); 8521 8522 spdk_spin_lock(&bdev->internal.spinlock); 8523 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 8524 if (desc->write) { 8525 break; 8526 } 8527 } 8528 8529 if (desc == NULL || desc->media_events_buffer == NULL) { 8530 rc = -ENODEV; 8531 goto out; 8532 } 8533 8534 for (event_id = 0; event_id < num_events; ++event_id) { 8535 entry = TAILQ_FIRST(&desc->free_media_events); 8536 if (entry == NULL) { 8537 break; 8538 } 8539 8540 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 8541 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 8542 entry->event = events[event_id]; 8543 } 8544 8545 rc = event_id; 8546 out: 8547 spdk_spin_unlock(&bdev->internal.spinlock); 8548 return rc; 8549 } 8550 8551 static void 8552 _media_management_notify(void *arg) 8553 { 8554 struct spdk_bdev_desc *desc = arg; 8555 8556 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 8557 } 8558 8559 void 8560 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 8561 { 8562 struct spdk_bdev_desc *desc; 8563 8564 spdk_spin_lock(&bdev->internal.spinlock); 8565 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 8566 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 8567 event_notify(desc, _media_management_notify); 8568 } 8569 } 8570 spdk_spin_unlock(&bdev->internal.spinlock); 8571 } 8572 8573 struct locked_lba_range_ctx { 8574 struct lba_range range; 8575 struct spdk_bdev *bdev; 8576 struct lba_range *current_range; 8577 struct lba_range *owner_range; 8578 struct spdk_poller *poller; 8579 lock_range_cb cb_fn; 8580 void *cb_arg; 8581 }; 8582 8583 static void 8584 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8585 { 8586 struct locked_lba_range_ctx *ctx = _ctx; 8587 8588 ctx->cb_fn(ctx->cb_arg, -ENOMEM); 8589 free(ctx); 8590 } 8591 8592 static void bdev_unlock_lba_range_get_channel(struct 
spdk_bdev_channel_iter *i, 8593 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 8594 8595 static void 8596 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8597 { 8598 struct locked_lba_range_ctx *ctx = _ctx; 8599 8600 if (status == -ENOMEM) { 8601 /* One of the channels could not allocate a range object. 8602 * So we have to go back and clean up any ranges that were 8603 * allocated successfully before we return error status to 8604 * the caller. We can reuse the unlock function to do that 8605 * clean up. 8606 */ 8607 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 8608 bdev_lock_error_cleanup_cb); 8609 return; 8610 } 8611 8612 /* All channels have locked this range and no I/O overlapping the range 8613 * are outstanding! Set the owner_ch for the range object for the 8614 * locking channel, so that this channel will know that it is allowed 8615 * to write to this range. 8616 */ 8617 ctx->owner_range->owner_ch = ctx->range.owner_ch; 8618 ctx->cb_fn(ctx->cb_arg, status); 8619 8620 /* Don't free the ctx here. Its range is in the bdev's global list of 8621 * locked ranges still, and will be removed and freed when this range 8622 * is later unlocked. 8623 */ 8624 } 8625 8626 static int 8627 bdev_lock_lba_range_check_io(void *_i) 8628 { 8629 struct spdk_bdev_channel_iter *i = _i; 8630 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 8631 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8632 struct locked_lba_range_ctx *ctx = i->ctx; 8633 struct lba_range *range = ctx->current_range; 8634 struct spdk_bdev_io *bdev_io; 8635 8636 spdk_poller_unregister(&ctx->poller); 8637 8638 /* The range is now in the locked_ranges, so no new IO can be submitted to this 8639 * range. But we need to wait until any outstanding IO overlapping with this range 8640 * are completed. 8641 */ 8642 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 8643 if (bdev_io_range_is_locked(bdev_io, range)) { 8644 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 8645 return SPDK_POLLER_BUSY; 8646 } 8647 } 8648 8649 spdk_bdev_for_each_channel_continue(i, 0); 8650 return SPDK_POLLER_BUSY; 8651 } 8652 8653 static void 8654 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8655 struct spdk_io_channel *_ch, void *_ctx) 8656 { 8657 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8658 struct locked_lba_range_ctx *ctx = _ctx; 8659 struct lba_range *range; 8660 8661 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 8662 if (range->length == ctx->range.length && 8663 range->offset == ctx->range.offset && 8664 range->locked_ctx == ctx->range.locked_ctx) { 8665 /* This range already exists on this channel, so don't add 8666 * it again. This can happen when a new channel is created 8667 * while the for_each_channel operation is in progress. 8668 * Do not check for outstanding I/O in that case, since the 8669 * range was locked before any I/O could be submitted to the 8670 * new channel. 
8671 */ 8672 spdk_bdev_for_each_channel_continue(i, 0); 8673 return; 8674 } 8675 } 8676 8677 range = calloc(1, sizeof(*range)); 8678 if (range == NULL) { 8679 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 8680 return; 8681 } 8682 8683 range->length = ctx->range.length; 8684 range->offset = ctx->range.offset; 8685 range->locked_ctx = ctx->range.locked_ctx; 8686 ctx->current_range = range; 8687 if (ctx->range.owner_ch == ch) { 8688 /* This is the range object for the channel that will hold 8689 * the lock. Store it in the ctx object so that we can easily 8690 * set its owner_ch after the lock is finally acquired. 8691 */ 8692 ctx->owner_range = range; 8693 } 8694 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 8695 bdev_lock_lba_range_check_io(i); 8696 } 8697 8698 static void 8699 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 8700 { 8701 assert(spdk_get_thread() == spdk_io_channel_get_thread(ctx->range.owner_ch->channel)); 8702 8703 /* We will add a copy of this range to each channel now. */ 8704 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 8705 bdev_lock_lba_range_cb); 8706 } 8707 8708 static bool 8709 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 8710 { 8711 struct lba_range *r; 8712 8713 TAILQ_FOREACH(r, tailq, tailq) { 8714 if (bdev_lba_range_overlapped(range, r)) { 8715 return true; 8716 } 8717 } 8718 return false; 8719 } 8720 8721 static int 8722 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 8723 uint64_t offset, uint64_t length, 8724 lock_range_cb cb_fn, void *cb_arg) 8725 { 8726 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8727 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8728 struct locked_lba_range_ctx *ctx; 8729 8730 if (cb_arg == NULL) { 8731 SPDK_ERRLOG("cb_arg must not be NULL\n"); 8732 return -EINVAL; 8733 } 8734 8735 ctx = calloc(1, sizeof(*ctx)); 8736 if (ctx == NULL) { 8737 return -ENOMEM; 8738 } 8739 8740 ctx->range.offset = offset; 8741 ctx->range.length = length; 8742 ctx->range.owner_ch = ch; 8743 ctx->range.locked_ctx = cb_arg; 8744 ctx->bdev = bdev; 8745 ctx->cb_fn = cb_fn; 8746 ctx->cb_arg = cb_arg; 8747 8748 spdk_spin_lock(&bdev->internal.spinlock); 8749 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 8750 /* There is an active lock overlapping with this range. 8751 * Put it on the pending list until this range no 8752 * longer overlaps with another. 8753 */ 8754 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 8755 } else { 8756 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 8757 bdev_lock_lba_range_ctx(bdev, ctx); 8758 } 8759 spdk_spin_unlock(&bdev->internal.spinlock); 8760 return 0; 8761 } 8762 8763 static void 8764 bdev_lock_lba_range_ctx_msg(void *_ctx) 8765 { 8766 struct locked_lba_range_ctx *ctx = _ctx; 8767 8768 bdev_lock_lba_range_ctx(ctx->bdev, ctx); 8769 } 8770 8771 static void 8772 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8773 { 8774 struct locked_lba_range_ctx *ctx = _ctx; 8775 struct locked_lba_range_ctx *pending_ctx; 8776 struct lba_range *range, *tmp; 8777 8778 spdk_spin_lock(&bdev->internal.spinlock); 8779 /* Check if there are any pending locked ranges that overlap with this range 8780 * that was just unlocked. If there are, check that it doesn't overlap with any 8781 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 8782 * the lock process. 
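 * Each pending range that can now proceed is moved onto locked_ranges first; its
 * lock sequence is then started on the thread that owns its locking channel via
 * spdk_thread_send_msg(), since bdev_lock_lba_range_ctx() must run on that thread.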
8783 */ 8784 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 8785 if (bdev_lba_range_overlapped(range, &ctx->range) && 8786 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 8787 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 8788 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 8789 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 8790 spdk_thread_send_msg(spdk_io_channel_get_thread(pending_ctx->range.owner_ch->channel), 8791 bdev_lock_lba_range_ctx_msg, pending_ctx); 8792 } 8793 } 8794 spdk_spin_unlock(&bdev->internal.spinlock); 8795 8796 ctx->cb_fn(ctx->cb_arg, status); 8797 free(ctx); 8798 } 8799 8800 static void 8801 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8802 struct spdk_io_channel *_ch, void *_ctx) 8803 { 8804 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8805 struct locked_lba_range_ctx *ctx = _ctx; 8806 TAILQ_HEAD(, spdk_bdev_io) io_locked; 8807 struct spdk_bdev_io *bdev_io; 8808 struct lba_range *range; 8809 8810 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 8811 if (ctx->range.offset == range->offset && 8812 ctx->range.length == range->length && 8813 ctx->range.locked_ctx == range->locked_ctx) { 8814 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 8815 free(range); 8816 break; 8817 } 8818 } 8819 8820 /* Note: we should almost always be able to assert that the range specified 8821 * was found. But there are some very rare corner cases where a new channel 8822 * gets created simultaneously with a range unlock, where this function 8823 * would execute on that new channel and wouldn't have the range. 8824 * We also use this to clean up range allocations when a later allocation 8825 * fails in the locking path. 8826 * So we can't actually assert() here. 8827 */ 8828 8829 /* Swap the locked IO into a temporary list, and then try to submit them again. 8830 * We could hyper-optimize this to only resubmit locked I/O that overlap 8831 * with the range that was just unlocked, but this isn't a performance path so 8832 * we go for simplicity here. 8833 */ 8834 TAILQ_INIT(&io_locked); 8835 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 8836 while (!TAILQ_EMPTY(&io_locked)) { 8837 bdev_io = TAILQ_FIRST(&io_locked); 8838 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 8839 bdev_io_submit(bdev_io); 8840 } 8841 8842 spdk_bdev_for_each_channel_continue(i, 0); 8843 } 8844 8845 static int 8846 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 8847 uint64_t offset, uint64_t length, 8848 lock_range_cb cb_fn, void *cb_arg) 8849 { 8850 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8851 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8852 struct locked_lba_range_ctx *ctx; 8853 struct lba_range *range; 8854 bool range_found = false; 8855 8856 /* Let's make sure the specified channel actually has a lock on 8857 * the specified range. Note that the range must match exactly. 8858 */ 8859 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 8860 if (range->offset == offset && range->length == length && 8861 range->owner_ch == ch && range->locked_ctx == cb_arg) { 8862 range_found = true; 8863 break; 8864 } 8865 } 8866 8867 if (!range_found) { 8868 return -EINVAL; 8869 } 8870 8871 spdk_spin_lock(&bdev->internal.spinlock); 8872 /* We confirmed that this channel has locked the specified range. 
To 8873 * start the unlock the process, we find the range in the bdev's locked_ranges 8874 * and remove it. This ensures new channels don't inherit the locked range. 8875 * Then we will send a message to each channel (including the one specified 8876 * here) to remove the range from its per-channel list. 8877 */ 8878 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 8879 if (range->offset == offset && range->length == length && 8880 range->locked_ctx == cb_arg) { 8881 break; 8882 } 8883 } 8884 if (range == NULL) { 8885 assert(false); 8886 spdk_spin_unlock(&bdev->internal.spinlock); 8887 return -EINVAL; 8888 } 8889 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 8890 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 8891 spdk_spin_unlock(&bdev->internal.spinlock); 8892 8893 ctx->cb_fn = cb_fn; 8894 ctx->cb_arg = cb_arg; 8895 8896 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 8897 bdev_unlock_lba_range_cb); 8898 return 0; 8899 } 8900 8901 int 8902 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 8903 int array_size) 8904 { 8905 if (!bdev) { 8906 return -EINVAL; 8907 } 8908 8909 if (bdev->fn_table->get_memory_domains) { 8910 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 8911 } 8912 8913 return 0; 8914 } 8915 8916 struct spdk_bdev_for_each_io_ctx { 8917 void *ctx; 8918 spdk_bdev_io_fn fn; 8919 spdk_bdev_for_each_io_cb cb; 8920 }; 8921 8922 static void 8923 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8924 struct spdk_io_channel *io_ch, void *_ctx) 8925 { 8926 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 8927 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 8928 struct spdk_bdev_io *bdev_io; 8929 int rc = 0; 8930 8931 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 8932 rc = ctx->fn(ctx->ctx, bdev_io); 8933 if (rc != 0) { 8934 break; 8935 } 8936 } 8937 8938 spdk_bdev_for_each_channel_continue(i, rc); 8939 } 8940 8941 static void 8942 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 8943 { 8944 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 8945 8946 ctx->cb(ctx->ctx, status); 8947 8948 free(ctx); 8949 } 8950 8951 void 8952 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 8953 spdk_bdev_for_each_io_cb cb) 8954 { 8955 struct spdk_bdev_for_each_io_ctx *ctx; 8956 8957 assert(fn != NULL && cb != NULL); 8958 8959 ctx = calloc(1, sizeof(*ctx)); 8960 if (ctx == NULL) { 8961 SPDK_ERRLOG("Failed to allocate context.\n"); 8962 cb(_ctx, -ENOMEM); 8963 return; 8964 } 8965 8966 ctx->ctx = _ctx; 8967 ctx->fn = fn; 8968 ctx->cb = cb; 8969 8970 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 8971 bdev_for_each_io_done); 8972 } 8973 8974 void 8975 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 8976 { 8977 spdk_for_each_channel_continue(iter->i, status); 8978 } 8979 8980 static struct spdk_bdev * 8981 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 8982 { 8983 void *io_device = spdk_io_channel_iter_get_io_device(i); 8984 8985 return __bdev_from_io_dev(io_device); 8986 } 8987 8988 static void 8989 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 8990 { 8991 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 8992 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 8993 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 8994 8995 iter->i = i; 8996 
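/* Dispatch to the per-channel callback that was passed to spdk_bdev_for_each_channel(). */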
iter->fn(iter, bdev, ch, iter->ctx); 8997 } 8998 8999 static void 9000 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 9001 { 9002 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 9003 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 9004 9005 iter->i = i; 9006 iter->cpl(bdev, iter->ctx, status); 9007 9008 free(iter); 9009 } 9010 9011 void 9012 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 9013 void *ctx, spdk_bdev_for_each_channel_done cpl) 9014 { 9015 struct spdk_bdev_channel_iter *iter; 9016 9017 assert(bdev != NULL && fn != NULL && ctx != NULL); 9018 9019 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 9020 if (iter == NULL) { 9021 SPDK_ERRLOG("Unable to allocate iterator\n"); 9022 assert(false); 9023 return; 9024 } 9025 9026 iter->fn = fn; 9027 iter->cpl = cpl; 9028 iter->ctx = ctx; 9029 9030 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 9031 iter, bdev_each_channel_cpl); 9032 } 9033 9034 int 9035 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 9036 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 9037 spdk_bdev_io_completion_cb cb, void *cb_arg) 9038 { 9039 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9040 struct spdk_bdev_io *bdev_io; 9041 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 9042 9043 if (!desc->write) { 9044 return -EBADF; 9045 } 9046 9047 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY))) { 9048 SPDK_DEBUGLOG(bdev, "Copy IO type is not supported\n"); 9049 return -ENOTSUP; 9050 } 9051 9052 if (num_blocks == 0) { 9053 SPDK_ERRLOG("Can't copy 0 blocks\n"); 9054 return -EINVAL; 9055 } 9056 9057 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 9058 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 9059 SPDK_DEBUGLOG(bdev, 9060 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 9061 dst_offset_blocks, src_offset_blocks, num_blocks); 9062 return -EINVAL; 9063 } 9064 9065 bdev_io = bdev_channel_get_io(channel); 9066 if (!bdev_io) { 9067 return -ENOMEM; 9068 } 9069 9070 bdev_io->internal.ch = channel; 9071 bdev_io->internal.desc = desc; 9072 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 9073 9074 bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 9075 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 9076 bdev_io->u.bdev.num_blocks = num_blocks; 9077 bdev_io->u.bdev.memory_domain = NULL; 9078 bdev_io->u.bdev.memory_domain_ctx = NULL; 9079 bdev_io_init(bdev_io, bdev, cb_arg, cb); 9080 9081 bdev_io_submit(bdev_io); 9082 return 0; 9083 } 9084 9085 SPDK_LOG_REGISTER_COMPONENT(bdev) 9086 9087 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 9088 { 9089 struct spdk_trace_tpoint_opts opts[] = { 9090 { 9091 "BDEV_IO_START", TRACE_BDEV_IO_START, 9092 OWNER_BDEV, OBJECT_BDEV_IO, 1, 9093 { 9094 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 9095 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 9096 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 9097 { "len", SPDK_TRACE_ARG_TYPE_INT, 8 }, 9098 { "name", SPDK_TRACE_ARG_TYPE_STR, 40} 9099 } 9100 }, 9101 { 9102 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 9103 OWNER_BDEV, OBJECT_BDEV_IO, 0, 9104 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 9105 }, 9106 { 9107 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 9108 OWNER_BDEV, OBJECT_NONE, 1, 9109 { 9110 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 9111 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 9112 } 9113 }, 9114 { 9115 "BDEV_IOCH_DESTROY", 
TRACE_BDEV_IOCH_DESTROY, 9116 OWNER_BDEV, OBJECT_NONE, 0, 9117 { 9118 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 9119 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 9120 } 9121 }, 9122 }; 9123 9124 9125 spdk_trace_register_owner(OWNER_BDEV, 'b'); 9126 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 9127 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 9128 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 9129 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 9130 } 9131