/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"
#include "spdk/dma.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"
#include "spdk_internal/assert.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define SPDK_BDEV_AUTO_EXAMINE			true
#define BUF_SMALL_POOL_SIZE			8191
#define BUF_LARGE_POOL_SIZE			1023
#define BUF_SMALL_CACHE_SIZE			128
#define BUF_LARGE_CACHE_SIZE			16
#define NOMEM_THRESHOLD_COUNT			8

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000

/* The maximum number of child requests generated at a time when splitting
 * a UNMAP or WRITE ZEROES command.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000

/* The maximum number of child requests generated at a time when splitting
 * a COPY command.
 */
#define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8)

#define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \
	log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev)
#ifdef DEBUG
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \
	log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev)
#else
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0)
#endif

static void log_already_claimed(enum spdk_log_level level, const int line, const char *func,
				const char *detail, struct spdk_bdev *bdev);

SPDK_LOG_DEPRECATION_REGISTER(vtune_support, "Intel(R) VTune integration", "SPDK 23.05", 0);

static const char *qos_rpc_type[] = {"rw_ios_per_sec",
				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
				    };

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

RB_HEAD(bdev_name_tree, spdk_bdev_name);

static int
bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
{
	return strcmp(name1->name, name2->name);
}

RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	void *zero_buffer;

	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

	struct spdk_bdev_list bdevs;
	struct bdev_name_tree bdev_names;

	bool init_complete;
	bool module_init_complete;

	struct spdk_spinlock spinlock;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain *domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
	.init_complete = false,
	.module_init_complete = false,
};

static void
__attribute__((constructor))
_bdev_init(void)
{
	spdk_spin_init(&g_bdev_mgr.spinlock);
}

typedef void (*lock_range_cb)(void *ctx, int status);

typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);

struct lba_range {
	uint64_t			offset;
	uint64_t			length;
	void				*locked_ctx;
	struct spdk_bdev_channel	*owner_ch;
	TAILQ_ENTRY(lba_range)		tailq;
};

static struct spdk_bdev_opts g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
	.small_buf_pool_size = BUF_SMALL_POOL_SIZE,
	.large_buf_pool_size = BUF_LARGE_POOL_SIZE,
};

static spdk_bdev_init_cb	g_init_cb_fn = NULL;
static void			*g_init_cb_arg = NULL;

static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
static void			*g_fini_cb_arg = NULL;
static struct spdk_thread	*g_fini_thread = NULL;

struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
	 *  For remaining bytes, allowed to run negative if an I/O is submitted when
	 *  some bytes are remaining, but the I/O is bigger than that amount. The
	 *  excess will be deducted from the next timeslice.
	 */
	int64_t remaining_this_timeslice;

	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO. */
	bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to update for the submitted IO. */
	void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

struct spdk_bdev_qos {
	/** Types of structure of rate limits. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Queue of I/O waiting to be issued. */
	bdev_io_tailq_t queued;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache.  Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t	per_thread_cache_count;
	uint32_t	bdev_io_cache_size;

	struct spdk_iobuf_channel iobuf;

	TAILQ_HEAD(, spdk_bdev_shared_resource)	shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry)	io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * queue their I/O awaiting retry here, which makes it possible to retry sending
 * I/O to one bdev after I/O from another bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t		io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t		nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t		nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel	*shared_ch;

	/* Refcount of bdev channels using this resource */
	uint32_t		ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev	*bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel	*channel;

	/* Accel channel */
	struct spdk_io_channel	*accel_channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat *stat;

	/*
	 * Count of I/O submitted to the underlying dev module through this channel
	 * and waiting for completion.
	 */
	uint64_t		io_outstanding;

	/*
	 * List of all submitted I/Os including I/O that are generated via splitting.
	 */
	bdev_io_tailq_t		io_submitted;

	/*
	 * List of spdk_bdev_io that are currently queued because they write to a locked
	 * LBA range.
	 */
	bdev_io_tailq_t		io_locked;

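	/* Bitwise OR of the BDEV_CH_* flags defined above. */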
	uint32_t		flags;

	struct spdk_histogram_data *histogram;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t		start_tsc;
	uint64_t		interval_tsc;
	__itt_string_handle	*handle;
	struct spdk_bdev_io_stat *prev_stat;
#endif

	bdev_io_tailq_t		queued_resets;

	lba_range_tailq_t	locked_ranges;
};

struct media_event_entry {
	struct spdk_bdev_media_event	event;
	TAILQ_ENTRY(media_event_entry)	tailq;
};

#define MEDIA_EVENT_POOL_SIZE 64

struct spdk_bdev_desc {
	struct spdk_bdev		*bdev;
	struct spdk_thread		*thread;
	struct {
		spdk_bdev_event_cb_t event_fn;
		void *ctx;
	}				callback;
	bool				closed;
	bool				write;
	bool				memory_domains_supported;
	bool				accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES];
	struct spdk_spinlock		spinlock;
	uint32_t			refs;
	TAILQ_HEAD(, media_event_entry)	pending_media_events;
	TAILQ_HEAD(, media_event_entry)	free_media_events;
	struct media_event_entry	*media_events_buffer;
	TAILQ_ENTRY(spdk_bdev_desc)	link;

	uint64_t		timeout_in_sec;
	spdk_bdev_io_timeout_cb	cb_fn;
	void			*cb_arg;
	struct spdk_poller	*io_timeout_poller;
	struct spdk_bdev_module_claim	*claim;
};

struct spdk_bdev_iostat_ctx {
	struct spdk_bdev_io_stat *stat;
	spdk_bdev_get_device_stat_cb cb;
	void *cb_arg;
};

struct set_qos_limit_ctx {
	void (*cb_fn)(void *cb_arg, int status);
	void *cb_arg;
	struct spdk_bdev *bdev;
};

struct spdk_bdev_channel_iter {
	spdk_bdev_for_each_channel_msg fn;
	spdk_bdev_for_each_channel_done cpl;
	struct spdk_io_channel_iter *i;
	void *ctx;
};

struct spdk_bdev_io_error_stat {
	uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS];
};

#define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))
#define __io_ch_to_bdev_ch(io_ch)	((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch))
#define __io_ch_to_bdev_mgmt_ch(io_ch)	((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch))

static inline void bdev_io_complete(void *ctx);
static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io);

static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
static void bdev_write_zero_buffer_next(void *_bdev_io);

static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				struct spdk_io_channel *ch, void *_ctx);
static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status);

static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				     struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
				     uint64_t num_blocks,
				     struct spdk_memory_domain *domain, void *domain_ctx,
				     spdk_bdev_io_completion_cb cb, void *cb_arg);
static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				      struct iovec *iov, int iovcnt, void *md_buf,
				      uint64_t offset_blocks, uint64_t num_blocks,
				      struct spdk_memory_domain *domain, void *domain_ctx,
				      spdk_bdev_io_completion_cb cb, void *cb_arg);

static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
			       uint64_t offset, uint64_t length,
			       lock_range_cb cb_fn, void *cb_arg);

static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
				 uint64_t offset, uint64_t length,
				 lock_range_cb cb_fn, void *cb_arg);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);

static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
static void claim_reset(struct spdk_bdev *bdev);

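/* Read 'field' from caller-provided ext I/O opts when opts is non-NULL and the field
 * lies within the structure; otherwise fall back to 'defval'.
 */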
#define bdev_get_ext_io_opt(opts, field, defval) \
	(((opts) != NULL && offsetof(struct spdk_bdev_ext_io_opts, field) + \
	  sizeof((opts)->field) <= sizeof(*(opts))) ? (opts)->field : (defval))

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(small_buf_pool_size);
	SET_FIELD(large_buf_pool_size);

	/* Do not remove this statement. You should always update this statement when
	 * adding a new field, and do not forget to add the SET_FIELD statement for
	 * your new field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}
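/*
 * Typical usage of spdk_bdev_get_opts() above and spdk_bdev_set_opts() below
 * (illustrative sketch, not part of the original file): fetch the current defaults,
 * tweak a field, then apply the options before the bdev layer is initialized.
 *
 *	struct spdk_bdev_opts opts = {};
 *
 *	spdk_bdev_get_opts(&opts, sizeof(opts));
 *	opts.bdev_io_pool_size = 128 * 1024;
 *	if (spdk_bdev_set_opts(&opts) != 0) {
 *		SPDK_ERRLOG("failed to set bdev options\n");
 *	}
 */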
SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_small_buf_pool_size, "spdk_bdev_opts.small_buf_pool_size",
			      "v23.05", 0);
SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_large_buf_pool_size, "spdk_bdev_opts.large_buf_pool_size",
			      "v23.05", 0);
int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	struct spdk_iobuf_opts iobuf_opts;
	uint32_t min_pool_size;
	int rc;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization.  A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
	 */
	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
	if (opts->bdev_io_pool_size < min_pool_size) {
		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
			    spdk_thread_get_count());
		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
		return -1;
	}

	if (opts->small_buf_pool_size != BUF_SMALL_POOL_SIZE) {
		SPDK_LOG_DEPRECATED(bdev_opts_small_buf_pool_size);
	}
	if (opts->large_buf_pool_size != BUF_LARGE_POOL_SIZE) {
		SPDK_LOG_DEPRECATED(bdev_opts_large_buf_pool_size);
	}

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
		g_bdev_opts.field = opts->field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(small_buf_pool_size);
	SET_FIELD(large_buf_pool_size);

	spdk_iobuf_get_opts(&iobuf_opts);
	iobuf_opts.small_pool_count = opts->small_buf_pool_size;
	iobuf_opts.large_pool_count = opts->large_buf_pool_size;

	rc = spdk_iobuf_set_opts(&iobuf_opts);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to set iobuf opts\n");
		return -1;
	}

	g_bdev_opts.opts_size = opts->opts_size;

#undef SET_FIELD

	return 0;
}

static struct spdk_bdev *
bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev_name find;
	struct spdk_bdev_name *res;

	find.name = (char *)bdev_name;
	res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find);
	if (res != NULL) {
		return res->bdev;
	}

	return NULL;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev *bdev;

	spdk_spin_lock(&g_bdev_mgr.spinlock);
	bdev = bdev_get_by_name(bdev_name);
	spdk_spin_unlock(&g_bdev_mgr.spinlock);

	return bdev;
}

struct bdev_io_status_string {
	enum spdk_bdev_io_status status;
	const char *str;
};

static const struct bdev_io_status_string bdev_io_status_strings[] = {
	{ SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" },
	{ SPDK_BDEV_IO_STATUS_ABORTED, "aborted" },
	{ SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" },
	{ SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" },
	{ SPDK_BDEV_IO_STATUS_NOMEM, "nomem" },
	{ SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" },
	{ SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" },
	{ SPDK_BDEV_IO_STATUS_FAILED, "failed" },
	{ SPDK_BDEV_IO_STATUS_PENDING, "pending" },
	{ SPDK_BDEV_IO_STATUS_SUCCESS, "success" },
};

static const char *
bdev_io_status_get_string(enum spdk_bdev_io_status status)
{
	uint32_t i;

	for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) {
		if (bdev_io_status_strings[i].status == status) {
			return bdev_io_status_strings[i].str;
		}
	}

	return "reserved";
}

struct spdk_bdev_wait_for_examine_ctx {
	struct spdk_poller		*poller;
	spdk_bdev_wait_for_examine_cb	cb_fn;
	void				*cb_arg;
};

static bool bdev_module_all_actions_completed(void);

static int
bdev_wait_for_examine_cb(void *arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx = arg;

	if (!bdev_module_all_actions_completed()) {
		return SPDK_POLLER_IDLE;
	}

	spdk_poller_unregister(&ctx->poller);
	ctx->cb_fn(ctx->cb_arg);
	free(ctx);

	return SPDK_POLLER_BUSY;
}

int
spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0);

	return 0;
}
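/*
 * Illustrative note: spdk_bdev_wait_for_examine() registers a 0-period poller that
 * simply re-checks bdev_module_all_actions_completed() until every module init and
 * examine action has finished, then invokes the caller's callback once.  A
 * hypothetical caller might look like:
 *
 *	static void
 *	examine_done(void *ctx)
 *	{
 *		SPDK_NOTICELOG("all bdev examine actions finished\n");
 *	}
 *
 *	...
 *	rc = spdk_bdev_wait_for_examine(examine_done, NULL);
 */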
struct spdk_bdev_examine_item {
	char *name;
	TAILQ_ENTRY(spdk_bdev_examine_item) link;
};

TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);

struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
			g_bdev_examine_allowlist);

static inline bool
bdev_examine_allowlist_check(const char *name)
{
	struct spdk_bdev_examine_item *item;

	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		if (strcmp(name, item->name) == 0) {
			return true;
		}
	}
	return false;
}

static inline void
bdev_examine_allowlist_free(void)
{
	struct spdk_bdev_examine_item *item;

	while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
		item = TAILQ_FIRST(&g_bdev_examine_allowlist);
		TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
		free(item->name);
		free(item);
	}
}

static inline bool
bdev_in_examine_allowlist(struct spdk_bdev *bdev)
{
	struct spdk_bdev_alias *tmp;

	if (bdev_examine_allowlist_check(bdev->name)) {
		return true;
	}
	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
		if (bdev_examine_allowlist_check(tmp->alias.name)) {
			return true;
		}
	}
	return false;
}

static inline bool
bdev_ok_to_examine(struct spdk_bdev *bdev)
{
	if (g_bdev_opts.bdev_auto_examine) {
		return true;
	} else {
		return bdev_in_examine_allowlist(bdev);
	}
}

static void
bdev_examine(struct spdk_bdev *bdev)
{
	struct spdk_bdev_module *module;
	struct spdk_bdev_module_claim *claim, *tmpclaim;
	uint32_t action;

	if (!bdev_ok_to_examine(bdev)) {
		return;
	}

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (module->examine_config) {
			spdk_spin_lock(&module->internal.spinlock);
			action = module->internal.action_in_progress;
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			module->examine_config(bdev);
			if (action != module->internal.action_in_progress) {
				SPDK_ERRLOG("examine_config for module %s did not call "
					    "spdk_bdev_module_examine_done()\n", module->name);
			}
		}
	}

	spdk_spin_lock(&bdev->internal.spinlock);

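	/*
	 * Dispatch examine_disk based on how the bdev is claimed: an unclaimed bdev is
	 * examined by every module, a v1 (EXCL_WRITE) claim restricts examination to the
	 * single claiming module, and a v2 claim lets each module holding a claim examine
	 * the bdev.
	 */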
	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		/* Examine by all bdev modules */
		TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (module->examine_disk) {
				spdk_spin_lock(&module->internal.spinlock);
				module->internal.action_in_progress++;
				spdk_spin_unlock(&module->internal.spinlock);
				spdk_spin_unlock(&bdev->internal.spinlock);
				module->examine_disk(bdev);
				spdk_spin_lock(&bdev->internal.spinlock);
			}
		}
		break;
	case SPDK_BDEV_CLAIM_EXCL_WRITE:
		/* Examine by the one bdev module with a v1 claim */
		module = bdev->internal.claim.v1.module;
		if (module->examine_disk) {
			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			return;
		}
		break;
	default:
		/* Examine by all bdev modules with a v2 claim */
		assert(claim_type_is_v2(bdev->internal.claim_type));
		/*
		 * Removal of tailq nodes while iterating can cause the iteration to jump out of the
		 * list, perhaps accessing freed memory.  Without protection, this could happen
		 * while the lock is dropped during the examine callback.
		 */
		bdev->internal.examine_in_progress++;

		TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
			module = claim->module;

			if (module == NULL) {
				/* This is a vestigial claim, held by examine_count */
				continue;
			}

			if (module->examine_disk == NULL) {
				continue;
			}

			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);

			/* Call examine_disk without holding internal.spinlock. */
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			spdk_spin_lock(&bdev->internal.spinlock);
		}

		assert(bdev->internal.examine_in_progress > 0);
		bdev->internal.examine_in_progress--;
		if (bdev->internal.examine_in_progress == 0) {
			/* Remove any claims that were released during examine_disk */
			TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) {
				if (claim->desc != NULL) {
					continue;
				}

				TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link);
				free(claim);
			}
			if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
				claim_reset(bdev);
			}
		}
	}

	spdk_spin_unlock(&bdev->internal.spinlock);
}

int
spdk_bdev_examine(const char *name)
{
	struct spdk_bdev *bdev;
	struct spdk_bdev_examine_item *item;
	struct spdk_thread *thread = spdk_get_thread();

	if (spdk_unlikely(spdk_thread_get_app_thread() != thread)) {
		SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread,
			    thread ? spdk_thread_get_name(thread) : "null");
		return -EINVAL;
	}

	if (g_bdev_opts.bdev_auto_examine) {
		SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n");
		return -EINVAL;
	}

	if (bdev_examine_allowlist_check(name)) {
		SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
		return -EEXIST;
	}

	item = calloc(1, sizeof(*item));
	if (!item) {
		return -ENOMEM;
	}
	item->name = strdup(name);
	if (!item->name) {
		free(item);
		return -ENOMEM;
	}
	TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);

	bdev = spdk_bdev_get_by_name(name);
	if (bdev) {
		bdev_examine(bdev);
	}
	return 0;
}

static inline void
bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_examine_item *item;

	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "method", "bdev_examine");
		spdk_json_write_named_object_begin(w, "params");
		spdk_json_write_named_string(w, "name", item->name);
		spdk_json_write_object_end(w);
		spdk_json_write_object_end(w);
	}
}

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, internal.link);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, internal.link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static inline bool
bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.memory_domain;
}

void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	struct iovec *iovs;

	if (bdev_io->u.bdev.iovs == NULL) {
		bdev_io->u.bdev.iovs = &bdev_io->iov;
		bdev_io->u.bdev.iovcnt = 1;
	}

	iovs = bdev_io->u.bdev.iovs;

	assert(iovs != NULL);
	assert(bdev_io->u.bdev.iovcnt >= 1);

	iovs[0].iov_base = buf;
	iovs[0].iov_len = len;
}

void
spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
	bdev_io->u.bdev.md_buf = md_buf;
}

static bool
_is_buf_allocated(const struct iovec *iovs)
{
	if (iovs == NULL) {
		return false;
	}

	return iovs[0].iov_base != NULL;
}
static bool
_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
{
	int i;
	uintptr_t iov_base;

	if (spdk_likely(alignment == 1)) {
		return true;
	}

	for (i = 0; i < iovcnt; i++) {
		iov_base = (uintptr_t)iovs[i].iov_base;
		if ((iov_base & (alignment - 1)) != 0) {
			return false;
		}
	}

	return true;
}

static void
bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status)
{
	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
	void *buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		buf = bdev_io->internal.buf;
		bdev_io->internal.buf = NULL;
		bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
		bdev_io->internal.get_aux_buf_cb = NULL;
	} else {
		assert(bdev_io->internal.get_buf_cb != NULL);
		bdev_io->internal.get_buf_cb(ch, bdev_io, status);
		bdev_io->internal.get_buf_cb = NULL;
	}
}

static void
_bdev_io_pull_buffer_cpl(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;

	if (rc) {
		SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	bdev_io_get_buf_complete(bdev_io, !rc);
}

static void
_bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	int rc = 0;

	/* save original md_buf */
	bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf;
	bdev_io->internal.orig_md_iov.iov_len = len;
	bdev_io->internal.bounce_md_iov.iov_base = md_buf;
	bdev_io->internal.bounce_md_iov.iov_len = len;
	/* set bounce md_buf */
	bdev_io->u.bdev.md_buf = md_buf;

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		if (bdev_io_use_memory_domain(bdev_io)) {
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  &bdev_io->internal.orig_md_iov, 1,
							  &bdev_io->internal.bounce_md_iov, 1,
							  bdev_io->internal.data_transfer_cpl,
							  bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n",
				    spdk_memory_domain_get_dma_device_id(bdev_io->internal.memory_domain), rc);
		} else {
			memcpy(md_buf, bdev_io->internal.orig_md_iov.iov_base, bdev_io->internal.orig_md_iov.iov_len);
		}
	}

	assert(bdev_io->internal.data_transfer_cpl);
	bdev_io->internal.data_transfer_cpl(bdev_io, rc);
}

static void
_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len;
	void *buf;

	if (spdk_bdev_is_md_separate(bdev)) {
		buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len;
		md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;

		assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0);

		if (bdev_io->u.bdev.md_buf != NULL) {
			_bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len);
			return;
		} else {
			spdk_bdev_io_set_md_buf(bdev_io, buf, md_len);
		}
	}

	bdev_io_get_buf_complete(bdev_io, true);
}

static void
_bdev_io_pull_bounce_data_buf_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;

	if (rc) {
		SPDK_ERRLOG("Failed to get data buffer\n");
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
		return;
	}

	_bdev_io_set_md_buf(bdev_io);
}
static void
_bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len,
			      bdev_copy_bounce_buffer_cpl cpl_cb)
{
	int rc = 0;

	bdev_io->internal.data_transfer_cpl = cpl_cb;
	/* save original iovec */
	bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs;
	bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt;
	/* set bounce iov */
	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov;
	bdev_io->u.bdev.iovcnt = 1;
	/* set bounce buffer for this operation */
	bdev_io->u.bdev.iovs[0].iov_base = buf;
	bdev_io->u.bdev.iovs[0].iov_len = len;
	/* if this is write path, copy data from original buffer to bounce buffer */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		if (bdev_io_use_memory_domain(bdev_io)) {
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  bdev_io->internal.orig_iovs,
							  (uint32_t) bdev_io->internal.orig_iovcnt,
							  bdev_io->u.bdev.iovs, 1,
							  _bdev_io_pull_bounce_data_buf_done,
							  bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			SPDK_ERRLOG("Failed to pull data from memory domain %s\n",
				    spdk_memory_domain_get_dma_device_id(bdev_io->internal.memory_domain));
		} else {
			spdk_copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt);
		}
	}

	_bdev_io_pull_bounce_data_buf_done(bdev_io, rc);
}

static void
_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	bool buf_allocated;
	uint64_t alignment;
	void *aligned_buf;

	bdev_io->internal.buf = buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		bdev_io_get_buf_complete(bdev_io, true);
		return;
	}

	alignment = spdk_bdev_get_buf_align(bdev);
	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
	aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));

	if (buf_allocated) {
		_bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl);
		/* Continue in completion callback */
		return;
	} else {
		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
	}

	_bdev_io_set_md_buf(bdev_io);
}

static inline uint64_t
bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len, alignment;

	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;
	alignment = spdk_bdev_get_buf_align(bdev);

	return len + alignment + md_len;
}
static void
_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
{
	struct spdk_bdev_mgmt_channel *ch;

	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
	spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len));
}

static void
bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	assert(bdev_io->internal.buf != NULL);
	_bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len);
	bdev_io->internal.buf = NULL;
}

void
spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(buf != NULL);
	_bdev_io_put_buf(bdev_io, buf, len);
}

static void
bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	struct spdk_bdev *bdev = bdev_ch->bdev;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
	struct spdk_bdev_io *bdev_io;

	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 * Some drivers (such as nvme) cannot immediately take a new I/O in
		 * the context of a completion, because the resources for the I/O are
		 * not released until control returns to the bdev poller.  Also, we
		 * may require several small I/O to complete before a larger I/O
		 * (that requires splitting) can be submitted.
		 */
		return;
	}

	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);
		bdev_io->internal.ch->io_outstanding++;
		shared_resource->io_outstanding++;
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
		bdev_io->internal.error.nvme.cdw0 = 0;
		bdev_io->num_retries++;
		bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
		if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
			break;
		}
	}
}

static inline void
_bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
			       struct spdk_bdev_shared_resource *shared_resource)
{
	assert(bdev_ch->io_outstanding > 0);
	assert(shared_resource->io_outstanding > 0);
	bdev_ch->io_outstanding--;
	shared_resource->io_outstanding--;
}

static inline bool
_bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
		TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
		/*
		 * Wait for some of the outstanding I/O to complete before we
		 * retry any of the nomem_io.  Normally we will wait for
		 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue
		 * depth channels we will instead wait for half to complete.
		 */
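		/*
		 * Worked example (illustrative): with NOMEM_THRESHOLD_COUNT == 8 and 100 I/O
		 * outstanding, the threshold below becomes max(50, 92) == 92, i.e. wait for
		 * 8 completions.  With only 8 outstanding it becomes max(4, 0) == 4, i.e.
		 * wait for half of them.
		 */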
		shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
						   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);
		return true;
	}

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_ch_retry_io(bdev_ch);
	}

	return false;
}

static void
_bdev_io_complete_push_bounce_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	if (rc) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	/* We want to free the bounce buffer here since we know we're done with it (as opposed
	 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()).
	 */
	bdev_io_put_buf(bdev_io);

	/* Continue with IO completion flow */
	_bdev_io_decrement_outstanding(bdev_ch, shared_resource);
	if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) {
		return;
	}

	bdev_io_complete(bdev_io);
}

static inline void
_bdev_io_push_bounce_md_buffer(struct spdk_bdev_io *bdev_io)
{
	int rc = 0;

	/* do the same for metadata buffer */
	if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) {
		assert(spdk_bdev_is_md_separate(bdev_io->bdev));

		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
		    bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
			if (bdev_io_use_memory_domain(bdev_io)) {
				/* If memory domain is used then we need to call async push function */
				rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
								  bdev_io->internal.memory_domain_ctx,
								  &bdev_io->internal.orig_md_iov,
								  (uint32_t)bdev_io->internal.orig_iovcnt,
								  &bdev_io->internal.bounce_md_iov, 1,
								  bdev_io->internal.data_transfer_cpl,
								  bdev_io);
				if (rc == 0) {
					/* Continue IO completion in async callback */
					return;
				}
				SPDK_ERRLOG("Failed to push md to memory domain %s\n",
					    spdk_memory_domain_get_dma_device_id(bdev_io->internal.memory_domain));
			} else {
				memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf,
				       bdev_io->internal.orig_md_iov.iov_len);
			}
		}
	}

	assert(bdev_io->internal.data_transfer_cpl);
	bdev_io->internal.data_transfer_cpl(bdev_io, rc);
}

static void
_bdev_io_push_bounce_data_buffer_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;

	assert(bdev_io->internal.data_transfer_cpl);

	if (rc) {
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
		return;
	}

	/* set original buffer for this io */
	bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt;
	bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs;
	/* disable bouncing buffer for this io */
	bdev_io->internal.orig_iovcnt = 0;
	bdev_io->internal.orig_iovs = NULL;

	_bdev_io_push_bounce_md_buffer(bdev_io);
}

static inline void
_bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb)
{
	int rc = 0;

	bdev_io->internal.data_transfer_cpl = cpl_cb;

	/* if this is read path, copy data from bounce buffer to original buffer */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
	    bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		if (bdev_io_use_memory_domain(bdev_io)) {
			/* If memory domain is used then we need to call async push function */
			rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  bdev_io->internal.orig_iovs,
							  (uint32_t)bdev_io->internal.orig_iovcnt,
							  &bdev_io->internal.bounce_iov, 1,
							  _bdev_io_push_bounce_data_buffer_done,
							  bdev_io);
			if (rc == 0) {
				/* Continue IO completion in async callback */
				return;
			}
			SPDK_ERRLOG("Failed to push data to memory domain %s\n",
				    spdk_memory_domain_get_dma_device_id(bdev_io->internal.memory_domain));
		} else {
			spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs,
					      bdev_io->internal.orig_iovcnt,
					      bdev_io->internal.bounce_iov.iov_base,
					      bdev_io->internal.bounce_iov.iov_len);
		}
	}

	_bdev_io_push_bounce_data_buffer_done(bdev_io, rc);
}

static void
bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf)
{
	struct spdk_bdev_io *bdev_io;

	bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf);
	_bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len);
}

static void
bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev_mgmt_channel *mgmt_ch;
	uint64_t max_len;
	void *buf;

	assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread());
	mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
	max_len = bdev_io_get_max_buf_len(bdev_io, len);

	if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) {
		SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len);
		bdev_io_get_buf_complete(bdev_io, false);
		return;
	}

	bdev_io->internal.buf_len = len;
	buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf,
			     bdev_io_get_iobuf_cb);
	if (buf != NULL) {
		_bdev_io_set_buf(bdev_io, buf, len);
	}
}

void
spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t alignment;

	assert(cb != NULL);
	bdev_io->internal.get_buf_cb = cb;

	alignment = spdk_bdev_get_buf_align(bdev);

	if (_is_buf_allocated(bdev_io->u.bdev.iovs) &&
	    _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) {
		/* Buffer already present and aligned */
		cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true);
		return;
	}

	bdev_io_get_buf(bdev_io, len);
}

static void
_bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
			      bool success)
{
	if (!success) {
		SPDK_ERRLOG("Failed to get data buffer, completing IO\n");
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io_complete_unsubmitted(bdev_io);
	} else {
		bdev_io_submit(bdev_io);
	}
}

static void
_bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb,
			       uint64_t len)
{
	assert(cb != NULL);
	bdev_io->internal.get_buf_cb = cb;

	bdev_io_get_buf(bdev_io, len);
}

void
spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(cb != NULL);
	assert(bdev_io->internal.get_aux_buf_cb == NULL);
	bdev_io->internal.get_aux_buf_cb = cb;
	bdev_io_get_buf(bdev_io, len);
}
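/*
 * Note: an aux buffer obtained through spdk_bdev_io_get_aux_buf() is sized for the
 * full I/O payload (num_blocks * blocklen) and is typically returned via
 * spdk_bdev_io_put_aux_buf(), defined earlier in this file, once the caller is done
 * with it.
 */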
static int
bdev_module_get_max_ctx_size(void)
{
	struct spdk_bdev_module *bdev_module;
	int max_bdev_module_size = 0;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
			max_bdev_module_size = bdev_module->get_ctx_size();
		}
	}

	return max_bdev_module_size;
}

static void
bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	int i;
	struct spdk_bdev_qos *qos = bdev->internal.qos;
	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	if (!qos) {
		return;
	}

	spdk_bdev_get_qos_rate_limits(bdev, limits);

	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_set_qos_limit");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", bdev->name);
	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		if (limits[i] > 0) {
			spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]);
		}
	}
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

void
spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_module *bdev_module;
	struct spdk_bdev *bdev;

	assert(w != NULL);

	spdk_json_write_array_begin(w);

	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_set_options");
	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size);
	spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size);
	spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine);
	spdk_json_write_object_end(w);
	spdk_json_write_object_end(w);

	bdev_examine_allowlist_config_json(w);

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (bdev_module->config_json) {
			bdev_module->config_json(w);
		}
	}

	spdk_spin_lock(&g_bdev_mgr.spinlock);

	TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) {
		if (bdev->fn_table->write_config_json) {
			bdev->fn_table->write_config_json(bdev, w);
		}

		bdev_qos_config_json(bdev, w);
	}

	spdk_spin_unlock(&g_bdev_mgr.spinlock);

	/* This has to be last RPC in array to make sure all bdevs finished examine */
	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_wait_for_examine");
	spdk_json_write_object_end(w);

	spdk_json_write_array_end(w);
}

static void
bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;

	spdk_iobuf_channel_fini(&ch->iobuf);

	while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
		ch->per_thread_cache_count--;
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}

	assert(ch->per_thread_cache_count == 0);
}

static int
bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;
	uint32_t i;
	int rc;

	rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", BUF_SMALL_CACHE_SIZE, BUF_LARGE_CACHE_SIZE);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc));
		return -1;
	}

	STAILQ_INIT(&ch->per_thread_cache);
	ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;

	/* Pre-populate bdev_io cache to ensure this thread cannot be starved. */
	ch->per_thread_cache_count = 0;
	for (i = 0; i < ch->bdev_io_cache_size; i++) {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
		if (bdev_io == NULL) {
			SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n");
			assert(false);
			bdev_mgmt_channel_destroy(io_device, ctx_buf);
			return -1;
		}
		ch->per_thread_cache_count++;
		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
	}

	TAILQ_INIT(&ch->shared_resources);
	TAILQ_INIT(&ch->io_wait_queue);

	return 0;
}

static void
bdev_init_complete(int rc)
{
	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
	void *cb_arg = g_init_cb_arg;
	struct spdk_bdev_module *m;

	g_bdev_mgr.init_complete = true;
	g_init_cb_fn = NULL;
	g_init_cb_arg = NULL;

	/*
	 * For modules that need to know when subsystem init is complete,
	 * inform them now.
	 */
	if (rc == 0) {
		TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (m->init_complete) {
				m->init_complete();
			}
		}
	}

	cb_fn(cb_arg, rc);
}

static bool
bdev_module_all_actions_completed(void)
{
	struct spdk_bdev_module *m;

	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (m->internal.action_in_progress > 0) {
			return false;
		}
	}
	return true;
}

static void
bdev_module_action_complete(void)
{
	/*
	 * Don't finish bdev subsystem initialization if
	 * module pre-initialization is still in progress, or
	 * the subsystem has already been initialized.
	 */
	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
		return;
	}

	/*
	 * Check all bdev modules for inits/examinations in progress. If any
	 * exist, return immediately since we cannot finish bdev subsystem
	 * initialization until all are completed.
	 */
	if (!bdev_module_all_actions_completed()) {
		return;
	}

	/*
	 * Modules already finished initialization - now that all
	 * the bdev modules have finished their asynchronous I/O
	 * processing, the entire bdev layer can be marked as complete.
	 */
	bdev_init_complete(0);
}

static void
bdev_module_action_done(struct spdk_bdev_module *module)
{
	spdk_spin_lock(&module->internal.spinlock);
	assert(module->internal.action_in_progress > 0);
	module->internal.action_in_progress--;
	spdk_spin_unlock(&module->internal.spinlock);
	bdev_module_action_complete();
}

void
spdk_bdev_module_init_done(struct spdk_bdev_module *module)
{
	assert(module->async_init);
	bdev_module_action_done(module);
}

void
spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
{
	bdev_module_action_done(module);
}

/** The last initialized bdev module */
static struct spdk_bdev_module *g_resume_bdev_module = NULL;

static void
bdev_init_failed(void *cb_arg)
{
	struct spdk_bdev_module *module = cb_arg;

	spdk_spin_lock(&module->internal.spinlock);
	assert(module->internal.action_in_progress > 0);
	module->internal.action_in_progress--;
	spdk_spin_unlock(&module->internal.spinlock);
	bdev_init_complete(-1);
}

static int
bdev_modules_init(void)
{
	struct spdk_bdev_module *module;
	int rc = 0;

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		g_resume_bdev_module = module;
		if (module->async_init) {
			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress = 1;
			spdk_spin_unlock(&module->internal.spinlock);
		}
		rc = module->module_init();
		if (rc != 0) {
			/* Bump action_in_progress to prevent other modules from completion of modules_init
			 * Send message to defer application shutdown until resources are cleaned up */
			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress = 1;
			spdk_spin_unlock(&module->internal.spinlock);
			spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module);
			return rc;
		}
	}

	g_resume_bdev_module = NULL;
	return 0;
}

void
spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
{
	int rc = 0;
	char mempool_name[32];

	assert(cb_fn != NULL);

	g_init_cb_fn = cb_fn;
	g_init_cb_arg = cb_arg;

	spdk_notify_type_register("bdev_register");
	spdk_notify_type_register("bdev_unregister");

	snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());

	rc = spdk_iobuf_register_module("bdev");
	if (rc != 0) {
		SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc));
		bdev_init_complete(-1);
		return;
	}

	g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
				  g_bdev_opts.bdev_io_pool_size,
				  sizeof(struct spdk_bdev_io) +
				  bdev_module_get_max_ctx_size(),
				  0,
				  SPDK_ENV_SOCKET_ID_ANY);

	if (g_bdev_mgr.bdev_io_pool == NULL) {
		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
		bdev_init_complete(-1);
		return;
	}

	g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE,
					      NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
	if (!g_bdev_mgr.zero_buffer) {
		SPDK_ERRLOG("create bdev zero buffer failed\n");
		bdev_init_complete(-1);
		return;
	}

#ifdef SPDK_CONFIG_VTUNE
	SPDK_LOG_DEPRECATED(vtune_support);
	g_bdev_mgr.domain = __itt_domain_create("spdk_bdev");
#endif

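	/*
	 * Register the bdev manager itself as an io_device so that every SPDK thread
	 * gets its own spdk_bdev_mgmt_channel (per-thread bdev_io cache plus iobuf
	 * channel), created via bdev_mgmt_channel_create().
	 */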
	spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create,
				bdev_mgmt_channel_destroy,
				sizeof(struct spdk_bdev_mgmt_channel),
				"bdev_mgr");

	rc = bdev_modules_init();
	g_bdev_mgr.module_init_complete = true;
	if (rc != 0) {
		SPDK_ERRLOG("bdev modules init failed\n");
		return;
	}

	bdev_module_action_complete();
}

static void
bdev_mgr_unregister_cb(void *io_device)
{
	spdk_bdev_fini_cb cb_fn = g_fini_cb_fn;

	if (g_bdev_mgr.bdev_io_pool) {
		if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) {
			SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n",
				    spdk_mempool_count(g_bdev_mgr.bdev_io_pool),
				    g_bdev_opts.bdev_io_pool_size);
		}

		spdk_mempool_free(g_bdev_mgr.bdev_io_pool);
	}

	spdk_free(g_bdev_mgr.zero_buffer);

	bdev_examine_allowlist_free();

	cb_fn(g_fini_cb_arg);
	g_fini_cb_fn = NULL;
	g_fini_cb_arg = NULL;
	g_bdev_mgr.init_complete = false;
	g_bdev_mgr.module_init_complete = false;
}

static void
bdev_module_fini_iter(void *arg)
{
	struct spdk_bdev_module *bdev_module;

	/* FIXME: Handling initialization failures is broken now,
	 * so we won't even try cleaning up after successfully
	 * initialized modules. if module_init_complete is false,
	 * just call spdk_bdev_mgr_unregister_cb
	 */
	if (!g_bdev_mgr.module_init_complete) {
		bdev_mgr_unregister_cb(NULL);
		return;
	}

	/* Start iterating from the last touched module */
	if (!g_resume_bdev_module) {
		bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
	} else {
		bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list,
					 internal.tailq);
	}

	while (bdev_module) {
		if (bdev_module->async_fini) {
			/* Save our place so we can resume later. We must
			 * save the variable here, before calling module_fini()
			 * below, because in some cases the module may immediately
			 * call spdk_bdev_module_fini_done() and re-enter
			 * this function to continue iterating. */
			g_resume_bdev_module = bdev_module;
		}

		if (bdev_module->module_fini) {
			bdev_module->module_fini();
		}

		if (bdev_module->async_fini) {
			return;
		}

		bdev_module = TAILQ_PREV(bdev_module, bdev_module_list,
					 internal.tailq);
	}

	g_resume_bdev_module = NULL;
	spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb);
}

void
spdk_bdev_module_fini_done(void)
{
	if (spdk_get_thread() != g_fini_thread) {
		spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL);
	} else {
		bdev_module_fini_iter(NULL);
	}
}

static void
bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
{
	struct spdk_bdev *bdev = cb_arg;

	if (bdeverrno && bdev) {
		SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
			     bdev->name);

		/*
		 * Since the call to spdk_bdev_unregister() failed, we have no way to free this
		 * bdev; try to continue by manually removing this bdev from the list and continue
		 * with the next bdev in the list.
		 */
1893 */
1894 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
1895 }
1896
1897 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
1898 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n");
1899 /*
1900 * Bdev module finish needs to be deferred as we might be in the middle of some context
1901 * (like bdev part free) that will use this bdev (or private bdev driver ctx data)
1902 * after returning.
1903 */
1904 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL);
1905 return;
1906 }
1907
1908 /*
1909 * Unregister the last unclaimed bdev in the list, to ensure that bdev subsystem
1910 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity
1911 * to detect clean shutdown as opposed to run-time hot removal of the underlying
1912 * base bdevs.
1913 *
1914 * Also, walk the list in reverse order.
1915 */
1916 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
1917 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
1918 spdk_spin_lock(&bdev->internal.spinlock);
1919 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
1920 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev);
1921 spdk_spin_unlock(&bdev->internal.spinlock);
1922 continue;
1923 }
1924 spdk_spin_unlock(&bdev->internal.spinlock);
1925
1926 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name);
1927 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
1928 return;
1929 }
1930
1931 /*
1932 * If any bdev fails to unclaim its underlying bdev properly, we may face the
1933 * case of a bdev list consisting of claimed bdevs only (if claims are managed
1934 * correctly, this would mean there's a loop in the claims graph which is
1935 * clearly impossible). In that case, warn and unregister the last bdev on the list.
1936 */
1937 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
1938 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
1939 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name);
1940 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
1941 return;
1942 }
1943 }
1944
1945 static void
1946 bdev_module_fini_start_iter(void *arg)
1947 {
1948 struct spdk_bdev_module *bdev_module;
1949
1950 if (!g_resume_bdev_module) {
1951 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
1952 } else {
1953 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq);
1954 }
1955
1956 while (bdev_module) {
1957 if (bdev_module->async_fini_start) {
1958 /* Save our place so we can resume later. We must
1959 * save the variable here, before calling fini_start()
1960 * below, because in some cases the module may immediately
1961 * call spdk_bdev_module_fini_start_done() and re-enter
1962 * this function to continue iterating.
*/ 1963 g_resume_bdev_module = bdev_module; 1964 } 1965 1966 if (bdev_module->fini_start) { 1967 bdev_module->fini_start(); 1968 } 1969 1970 if (bdev_module->async_fini_start) { 1971 return; 1972 } 1973 1974 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 1975 } 1976 1977 g_resume_bdev_module = NULL; 1978 1979 bdev_finish_unregister_bdevs_iter(NULL, 0); 1980 } 1981 1982 void 1983 spdk_bdev_module_fini_start_done(void) 1984 { 1985 if (spdk_get_thread() != g_fini_thread) { 1986 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 1987 } else { 1988 bdev_module_fini_start_iter(NULL); 1989 } 1990 } 1991 1992 static void 1993 bdev_finish_wait_for_examine_done(void *cb_arg) 1994 { 1995 bdev_module_fini_start_iter(NULL); 1996 } 1997 1998 void 1999 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2000 { 2001 int rc; 2002 2003 assert(cb_fn != NULL); 2004 2005 g_fini_thread = spdk_get_thread(); 2006 2007 g_fini_cb_fn = cb_fn; 2008 g_fini_cb_arg = cb_arg; 2009 2010 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2011 if (rc != 0) { 2012 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2013 bdev_finish_wait_for_examine_done(NULL); 2014 } 2015 } 2016 2017 struct spdk_bdev_io * 2018 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2019 { 2020 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2021 struct spdk_bdev_io *bdev_io; 2022 2023 if (ch->per_thread_cache_count > 0) { 2024 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2025 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2026 ch->per_thread_cache_count--; 2027 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2028 /* 2029 * Don't try to look for bdev_ios in the global pool if there are 2030 * waiters on bdev_ios - we don't want this caller to jump the line. 2031 */ 2032 bdev_io = NULL; 2033 } else { 2034 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2035 } 2036 2037 return bdev_io; 2038 } 2039 2040 void 2041 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2042 { 2043 struct spdk_bdev_mgmt_channel *ch; 2044 2045 assert(bdev_io != NULL); 2046 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2047 2048 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2049 2050 if (bdev_io->internal.buf != NULL) { 2051 bdev_io_put_buf(bdev_io); 2052 } 2053 2054 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2055 ch->per_thread_cache_count++; 2056 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2057 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2058 struct spdk_bdev_io_wait_entry *entry; 2059 2060 entry = TAILQ_FIRST(&ch->io_wait_queue); 2061 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2062 entry->cb_fn(entry->cb_arg); 2063 } 2064 } else { 2065 /* We should never have a full cache with entries on the io wait queue. 
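 * Whenever a freed bdev_io lands in the per-thread cache (the branch above), any
 * waiters are serviced immediately, so the cache can only become full once the
 * wait queue is empty.
 *
 * The entries drained above were registered by submitters that hit -ENOMEM. A
 * typical consumer-side pattern looks roughly like the following sketch
 * (retry_read(), read_done() and ctx are hypothetical caller-side names, not
 * part of this file):
 *
 *     rc = spdk_bdev_read_blocks(desc, io_ch, buf, offset, num_blocks, read_done, ctx);
 *     if (rc == -ENOMEM) {
 *             ctx->wait_entry.bdev = bdev;
 *             ctx->wait_entry.cb_fn = retry_read;
 *             ctx->wait_entry.cb_arg = ctx;
 *             spdk_bdev_queue_io_wait(bdev, io_ch, &ctx->wait_entry);
 *     }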
*/ 2066 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2067 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2068 } 2069 } 2070 2071 static bool 2072 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2073 { 2074 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2075 2076 switch (limit) { 2077 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2078 return true; 2079 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2080 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2081 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2082 return false; 2083 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2084 default: 2085 return false; 2086 } 2087 } 2088 2089 static bool 2090 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2091 { 2092 switch (bdev_io->type) { 2093 case SPDK_BDEV_IO_TYPE_NVME_IO: 2094 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2095 case SPDK_BDEV_IO_TYPE_READ: 2096 case SPDK_BDEV_IO_TYPE_WRITE: 2097 return true; 2098 case SPDK_BDEV_IO_TYPE_ZCOPY: 2099 if (bdev_io->u.bdev.zcopy.start) { 2100 return true; 2101 } else { 2102 return false; 2103 } 2104 default: 2105 return false; 2106 } 2107 } 2108 2109 static bool 2110 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2111 { 2112 switch (bdev_io->type) { 2113 case SPDK_BDEV_IO_TYPE_NVME_IO: 2114 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2115 /* Bit 1 (0x2) set for read operation */ 2116 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2117 return true; 2118 } else { 2119 return false; 2120 } 2121 case SPDK_BDEV_IO_TYPE_READ: 2122 return true; 2123 case SPDK_BDEV_IO_TYPE_ZCOPY: 2124 /* Populate to read from disk */ 2125 if (bdev_io->u.bdev.zcopy.populate) { 2126 return true; 2127 } else { 2128 return false; 2129 } 2130 default: 2131 return false; 2132 } 2133 } 2134 2135 static uint64_t 2136 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2137 { 2138 struct spdk_bdev *bdev = bdev_io->bdev; 2139 2140 switch (bdev_io->type) { 2141 case SPDK_BDEV_IO_TYPE_NVME_IO: 2142 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2143 return bdev_io->u.nvme_passthru.nbytes; 2144 case SPDK_BDEV_IO_TYPE_READ: 2145 case SPDK_BDEV_IO_TYPE_WRITE: 2146 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2147 case SPDK_BDEV_IO_TYPE_ZCOPY: 2148 /* Track the data in the start phase only */ 2149 if (bdev_io->u.bdev.zcopy.start) { 2150 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2151 } else { 2152 return 0; 2153 } 2154 default: 2155 return 0; 2156 } 2157 } 2158 2159 static bool 2160 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2161 { 2162 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 2163 return true; 2164 } else { 2165 return false; 2166 } 2167 } 2168 2169 static bool 2170 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2171 { 2172 if (bdev_is_read_io(io) == false) { 2173 return false; 2174 } 2175 2176 return bdev_qos_rw_queue_io(limit, io); 2177 } 2178 2179 static bool 2180 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2181 { 2182 if (bdev_is_read_io(io) == true) { 2183 return false; 2184 } 2185 2186 return bdev_qos_rw_queue_io(limit, io); 2187 } 2188 2189 static void 2190 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2191 { 2192 limit->remaining_this_timeslice--; 2193 } 2194 2195 static void 2196 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2197 { 2198 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 2199 } 2200 2201 static void 2202 
bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2203 { 2204 if (bdev_is_read_io(io) == false) { 2205 return; 2206 } 2207 2208 return bdev_qos_rw_bps_update_quota(limit, io); 2209 } 2210 2211 static void 2212 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2213 { 2214 if (bdev_is_read_io(io) == true) { 2215 return; 2216 } 2217 2218 return bdev_qos_rw_bps_update_quota(limit, io); 2219 } 2220 2221 static void 2222 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2223 { 2224 int i; 2225 2226 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2227 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2228 qos->rate_limits[i].queue_io = NULL; 2229 qos->rate_limits[i].update_quota = NULL; 2230 continue; 2231 } 2232 2233 switch (i) { 2234 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2235 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2236 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2237 break; 2238 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2239 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2240 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2241 break; 2242 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2243 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2244 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2245 break; 2246 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2247 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2248 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2249 break; 2250 default: 2251 break; 2252 } 2253 } 2254 } 2255 2256 static void 2257 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2258 struct spdk_bdev_io *bdev_io, 2259 enum spdk_bdev_io_status status) 2260 { 2261 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2262 2263 bdev_io->internal.in_submit_request = true; 2264 bdev_ch->io_outstanding++; 2265 shared_resource->io_outstanding++; 2266 spdk_bdev_io_complete(bdev_io, status); 2267 bdev_io->internal.in_submit_request = false; 2268 } 2269 2270 static inline void 2271 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2272 { 2273 struct spdk_bdev *bdev = bdev_io->bdev; 2274 struct spdk_io_channel *ch = bdev_ch->channel; 2275 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2276 2277 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2278 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2279 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2280 2281 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2282 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2283 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2284 SPDK_BDEV_IO_STATUS_SUCCESS); 2285 return; 2286 } 2287 } 2288 2289 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2290 bdev_io->bdev->split_on_write_unit && 2291 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2292 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2293 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2294 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2295 return; 2296 } 2297 2298 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2299 bdev_ch->io_outstanding++; 2300 shared_resource->io_outstanding++; 2301 bdev_io->internal.in_submit_request = true; 2302 bdev->fn_table->submit_request(ch, bdev_io); 2303 
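/* The bdev module may complete this I/O synchronously from inside submit_request().
 * in_submit_request lets the completion path detect that case and defer the user
 * callback rather than invoking it re-entrantly from the submit path; the flag is
 * cleared once submission has returned. */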
bdev_io->internal.in_submit_request = false; 2304 } else { 2305 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 2306 } 2307 } 2308 2309 static bool 2310 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2311 { 2312 int i; 2313 2314 if (bdev_qos_io_to_limit(bdev_io) == true) { 2315 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2316 if (!qos->rate_limits[i].queue_io) { 2317 continue; 2318 } 2319 2320 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2321 bdev_io) == true) { 2322 return true; 2323 } 2324 } 2325 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2326 if (!qos->rate_limits[i].update_quota) { 2327 continue; 2328 } 2329 2330 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2331 } 2332 } 2333 2334 return false; 2335 } 2336 2337 static inline void 2338 _bdev_io_do_submit(void *ctx) 2339 { 2340 struct spdk_bdev_io *bdev_io = ctx; 2341 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2342 2343 bdev_io_do_submit(ch, bdev_io); 2344 } 2345 2346 static int 2347 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2348 { 2349 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2350 int submitted_ios = 0; 2351 2352 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2353 if (!bdev_qos_queue_io(qos, bdev_io)) { 2354 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2355 2356 if (bdev_io->internal.io_submit_ch) { 2357 /* Send back the IO to the original thread for the actual processing. */ 2358 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 2359 bdev_io->internal.io_submit_ch = NULL; 2360 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 2361 _bdev_io_do_submit, bdev_io); 2362 } else { 2363 bdev_io_do_submit(ch, bdev_io); 2364 } 2365 2366 submitted_ios++; 2367 } 2368 } 2369 2370 return submitted_ios; 2371 } 2372 2373 static void 2374 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2375 { 2376 int rc; 2377 2378 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2379 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2380 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2381 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2382 &bdev_io->internal.waitq_entry); 2383 if (rc != 0) { 2384 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2385 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2386 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2387 } 2388 } 2389 2390 static bool 2391 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2392 { 2393 uint32_t io_boundary; 2394 struct spdk_bdev *bdev = bdev_io->bdev; 2395 uint32_t max_size = bdev->max_segment_size; 2396 int max_segs = bdev->max_num_segments; 2397 2398 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2399 io_boundary = bdev->write_unit_size; 2400 } else if (bdev->split_on_optimal_io_boundary) { 2401 io_boundary = bdev->optimal_io_boundary; 2402 } else { 2403 io_boundary = 0; 2404 } 2405 2406 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2407 return false; 2408 } 2409 2410 if (io_boundary) { 2411 uint64_t start_stripe, end_stripe; 2412 2413 start_stripe = bdev_io->u.bdev.offset_blocks; 2414 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2415 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
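 * (For example, with io_boundary = 8 an I/O covering blocks 6..9 falls in stripes 0
 * and 1, so start_stripe != end_stripe and the request must be split.)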
*/ 2416 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2417 start_stripe >>= spdk_u32log2(io_boundary); 2418 end_stripe >>= spdk_u32log2(io_boundary); 2419 } else { 2420 start_stripe /= io_boundary; 2421 end_stripe /= io_boundary; 2422 } 2423 2424 if (start_stripe != end_stripe) { 2425 return true; 2426 } 2427 } 2428 2429 if (max_segs) { 2430 if (bdev_io->u.bdev.iovcnt > max_segs) { 2431 return true; 2432 } 2433 } 2434 2435 if (max_size) { 2436 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2437 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2438 return true; 2439 } 2440 } 2441 } 2442 2443 return false; 2444 } 2445 2446 static bool 2447 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2448 { 2449 uint32_t num_unmap_segments; 2450 2451 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2452 return false; 2453 } 2454 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2455 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2456 return true; 2457 } 2458 2459 return false; 2460 } 2461 2462 static bool 2463 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2464 { 2465 if (!bdev_io->bdev->max_write_zeroes) { 2466 return false; 2467 } 2468 2469 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2470 return true; 2471 } 2472 2473 return false; 2474 } 2475 2476 static bool 2477 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2478 { 2479 if (bdev_io->bdev->max_copy != 0 && 2480 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2481 return true; 2482 } 2483 2484 return false; 2485 } 2486 2487 static bool 2488 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2489 { 2490 switch (bdev_io->type) { 2491 case SPDK_BDEV_IO_TYPE_READ: 2492 case SPDK_BDEV_IO_TYPE_WRITE: 2493 return bdev_rw_should_split(bdev_io); 2494 case SPDK_BDEV_IO_TYPE_UNMAP: 2495 return bdev_unmap_should_split(bdev_io); 2496 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2497 return bdev_write_zeroes_should_split(bdev_io); 2498 case SPDK_BDEV_IO_TYPE_COPY: 2499 return bdev_copy_should_split(bdev_io); 2500 default: 2501 return false; 2502 } 2503 } 2504 2505 static uint32_t 2506 _to_next_boundary(uint64_t offset, uint32_t boundary) 2507 { 2508 return (boundary - (offset % boundary)); 2509 } 2510 2511 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2512 2513 static void _bdev_rw_split(void *_bdev_io); 2514 2515 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2516 2517 static void 2518 _bdev_unmap_split(void *_bdev_io) 2519 { 2520 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2521 } 2522 2523 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2524 2525 static void 2526 _bdev_write_zeroes_split(void *_bdev_io) 2527 { 2528 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2529 } 2530 2531 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 2532 2533 static void 2534 _bdev_copy_split(void *_bdev_io) 2535 { 2536 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 2537 } 2538 2539 static int 2540 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2541 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2542 { 2543 int rc; 2544 uint64_t current_offset, current_remaining, current_src_offset; 2545 spdk_bdev_io_wait_cb io_wait_fn; 2546 2547 current_offset = *offset; 2548 current_remaining = *remaining; 2549 2550 bdev_io->u.bdev.split_outstanding++; 2551 2552 io_wait_fn = 
_bdev_rw_split; 2553 switch (bdev_io->type) { 2554 case SPDK_BDEV_IO_TYPE_READ: 2555 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2556 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2557 iov, iovcnt, md_buf, current_offset, 2558 num_blocks, bdev_io->internal.memory_domain, 2559 bdev_io->internal.memory_domain_ctx, 2560 bdev_io_split_done, bdev_io); 2561 break; 2562 case SPDK_BDEV_IO_TYPE_WRITE: 2563 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2564 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2565 iov, iovcnt, md_buf, current_offset, 2566 num_blocks, bdev_io->internal.memory_domain, 2567 bdev_io->internal.memory_domain_ctx, 2568 bdev_io_split_done, bdev_io); 2569 break; 2570 case SPDK_BDEV_IO_TYPE_UNMAP: 2571 io_wait_fn = _bdev_unmap_split; 2572 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2573 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2574 current_offset, num_blocks, 2575 bdev_io_split_done, bdev_io); 2576 break; 2577 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2578 io_wait_fn = _bdev_write_zeroes_split; 2579 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2580 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2581 current_offset, num_blocks, 2582 bdev_io_split_done, bdev_io); 2583 break; 2584 case SPDK_BDEV_IO_TYPE_COPY: 2585 io_wait_fn = _bdev_copy_split; 2586 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 2587 (current_offset - bdev_io->u.bdev.offset_blocks); 2588 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 2589 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2590 current_offset, current_src_offset, num_blocks, 2591 bdev_io_split_done, bdev_io); 2592 break; 2593 default: 2594 assert(false); 2595 rc = -EINVAL; 2596 break; 2597 } 2598 2599 if (rc == 0) { 2600 current_offset += num_blocks; 2601 current_remaining -= num_blocks; 2602 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2603 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2604 *offset = current_offset; 2605 *remaining = current_remaining; 2606 } else { 2607 bdev_io->u.bdev.split_outstanding--; 2608 if (rc == -ENOMEM) { 2609 if (bdev_io->u.bdev.split_outstanding == 0) { 2610 /* No I/O is outstanding. Hence we should wait here. */ 2611 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2612 } 2613 } else { 2614 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2615 if (bdev_io->u.bdev.split_outstanding == 0) { 2616 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2617 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2618 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2619 } 2620 } 2621 } 2622 2623 return rc; 2624 } 2625 2626 static void 2627 _bdev_rw_split(void *_bdev_io) 2628 { 2629 struct iovec *parent_iov, *iov; 2630 struct spdk_bdev_io *bdev_io = _bdev_io; 2631 struct spdk_bdev *bdev = bdev_io->bdev; 2632 uint64_t parent_offset, current_offset, remaining; 2633 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2634 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2635 uint32_t iovcnt, iov_len, child_iovsize; 2636 uint32_t blocklen = bdev->blocklen; 2637 uint32_t io_boundary; 2638 uint32_t max_segment_size = bdev->max_segment_size; 2639 uint32_t max_child_iovcnt = bdev->max_num_segments; 2640 void *md_buf = NULL; 2641 int rc; 2642 2643 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 2644 max_child_iovcnt = max_child_iovcnt ? 
spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 2645 SPDK_BDEV_IO_NUM_CHILD_IOV; 2646 2647 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2648 io_boundary = bdev->write_unit_size; 2649 } else if (bdev->split_on_optimal_io_boundary) { 2650 io_boundary = bdev->optimal_io_boundary; 2651 } else { 2652 io_boundary = UINT32_MAX; 2653 } 2654 2655 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2656 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 2657 parent_offset = bdev_io->u.bdev.offset_blocks; 2658 parent_iov_offset = (current_offset - parent_offset) * blocklen; 2659 parent_iovcnt = bdev_io->u.bdev.iovcnt; 2660 2661 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 2662 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2663 if (parent_iov_offset < parent_iov->iov_len) { 2664 break; 2665 } 2666 parent_iov_offset -= parent_iov->iov_len; 2667 } 2668 2669 child_iovcnt = 0; 2670 while (remaining > 0 && parent_iovpos < parent_iovcnt && 2671 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 2672 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 2673 to_next_boundary = spdk_min(remaining, to_next_boundary); 2674 to_next_boundary_bytes = to_next_boundary * blocklen; 2675 2676 iov = &bdev_io->child_iov[child_iovcnt]; 2677 iovcnt = 0; 2678 2679 if (bdev_io->u.bdev.md_buf) { 2680 md_buf = (char *)bdev_io->u.bdev.md_buf + 2681 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 2682 } 2683 2684 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 2685 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 2686 iovcnt < child_iovsize) { 2687 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2688 iov_len = parent_iov->iov_len - parent_iov_offset; 2689 2690 iov_len = spdk_min(iov_len, max_segment_size); 2691 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 2692 to_next_boundary_bytes -= iov_len; 2693 2694 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 2695 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 2696 2697 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 2698 parent_iov_offset += iov_len; 2699 } else { 2700 parent_iovpos++; 2701 parent_iov_offset = 0; 2702 } 2703 child_iovcnt++; 2704 iovcnt++; 2705 } 2706 2707 if (to_next_boundary_bytes > 0) { 2708 /* We had to stop this child I/O early because we ran out of 2709 * child_iov space or were limited by max_num_segments. 2710 * Ensure the iovs to be aligned with block size and 2711 * then adjust to_next_boundary before starting the 2712 * child I/O. 2713 */ 2714 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || 2715 iovcnt == child_iovsize); 2716 to_last_block_bytes = to_next_boundary_bytes % blocklen; 2717 if (to_last_block_bytes != 0) { 2718 uint32_t child_iovpos = child_iovcnt - 1; 2719 /* don't decrease child_iovcnt when it equals to SPDK_BDEV_IO_NUM_CHILD_IOV 2720 * so the loop will naturally end 2721 */ 2722 2723 to_last_block_bytes = blocklen - to_last_block_bytes; 2724 to_next_boundary_bytes += to_last_block_bytes; 2725 while (to_last_block_bytes > 0 && iovcnt > 0) { 2726 iov_len = spdk_min(to_last_block_bytes, 2727 bdev_io->child_iov[child_iovpos].iov_len); 2728 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 2729 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 2730 child_iovpos--; 2731 if (--iovcnt == 0) { 2732 /* If the child IO is less than a block size just return. 
2733 * If the first child IO of any split round is less than
2734 * a block size, fail the whole parent I/O with an error.
2735 */
2736 if (bdev_io->u.bdev.split_outstanding == 0) {
2737 SPDK_ERRLOG("The first child io was less than a block size\n");
2738 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
2739 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx);
2740 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
2741 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
2742 }
2743
2744 return;
2745 }
2746 }
2747
2748 to_last_block_bytes -= iov_len;
2749
2750 if (parent_iov_offset == 0) {
2751 parent_iovpos--;
2752 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len;
2753 }
2754 parent_iov_offset -= iov_len;
2755 }
2756
2757 assert(to_last_block_bytes == 0);
2758 }
2759 to_next_boundary -= to_next_boundary_bytes / blocklen;
2760 }
2761
2762 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary,
2763 &current_offset, &remaining);
2764 if (spdk_unlikely(rc)) {
2765 return;
2766 }
2767 }
2768 }
2769
2770 static void
2771 bdev_unmap_split(struct spdk_bdev_io *bdev_io)
2772 {
2773 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks;
2774 uint32_t num_children_reqs = 0;
2775 int rc;
2776
2777 offset = bdev_io->u.bdev.split_current_offset_blocks;
2778 remaining = bdev_io->u.bdev.split_remaining_num_blocks;
2779 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments;
2780
2781 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
2782 unmap_blocks = spdk_min(remaining, max_unmap_blocks);
2783
2784 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks,
2785 &offset, &remaining);
2786 if (spdk_likely(rc == 0)) {
2787 num_children_reqs++;
2788 } else {
2789 return;
2790 }
2791 }
2792 }
2793
2794 static void
2795 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io)
2796 {
2797 uint64_t offset, write_zeroes_blocks, remaining;
2798 uint32_t num_children_reqs = 0;
2799 int rc;
2800
2801 offset = bdev_io->u.bdev.split_current_offset_blocks;
2802 remaining = bdev_io->u.bdev.split_remaining_num_blocks;
2803
2804 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
2805 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes);
2806
2807 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks,
2808 &offset, &remaining);
2809 if (spdk_likely(rc == 0)) {
2810 num_children_reqs++;
2811 } else {
2812 return;
2813 }
2814 }
2815 }
2816
2817 static void
2818 bdev_copy_split(struct spdk_bdev_io *bdev_io)
2819 {
2820 uint64_t offset, copy_blocks, remaining;
2821 uint32_t num_children_reqs = 0;
2822 int rc;
2823
2824 offset = bdev_io->u.bdev.split_current_offset_blocks;
2825 remaining = bdev_io->u.bdev.split_remaining_num_blocks;
2826
2827 assert(bdev_io->bdev->max_copy != 0);
2828 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) {
2829 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy);
2830
2831 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks,
2832 &offset, &remaining);
2833 if (spdk_likely(rc == 0)) {
2834 num_children_reqs++;
2835 } else {
2836 return;
2837 }
2838 }
2839 }
2840
2841 static void
2842 parent_bdev_io_complete(void *ctx, int rc)
2843 {
2844 struct spdk_bdev_io *parent_io = ctx;
2845
2846 if (rc) {
2847 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
2848 }
2849
2850 parent_io->internal.cb(parent_io,
parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 2851 parent_io->internal.caller_ctx); 2852 } 2853 2854 static void 2855 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 2856 { 2857 struct spdk_bdev_io *parent_io = cb_arg; 2858 2859 spdk_bdev_free_io(bdev_io); 2860 2861 if (!success) { 2862 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2863 /* If any child I/O failed, stop further splitting process. */ 2864 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 2865 parent_io->u.bdev.split_remaining_num_blocks = 0; 2866 } 2867 parent_io->u.bdev.split_outstanding--; 2868 if (parent_io->u.bdev.split_outstanding != 0) { 2869 return; 2870 } 2871 2872 /* 2873 * Parent I/O finishes when all blocks are consumed. 2874 */ 2875 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 2876 assert(parent_io->internal.cb != bdev_io_split_done); 2877 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 2878 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 2879 2880 if (parent_io->internal.orig_iovcnt != 0) { 2881 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 2882 /* bdev IO will be completed in the callback */ 2883 } else { 2884 parent_bdev_io_complete(parent_io, 0); 2885 } 2886 return; 2887 } 2888 2889 /* 2890 * Continue with the splitting process. This function will complete the parent I/O if the 2891 * splitting is done. 2892 */ 2893 switch (parent_io->type) { 2894 case SPDK_BDEV_IO_TYPE_READ: 2895 case SPDK_BDEV_IO_TYPE_WRITE: 2896 _bdev_rw_split(parent_io); 2897 break; 2898 case SPDK_BDEV_IO_TYPE_UNMAP: 2899 bdev_unmap_split(parent_io); 2900 break; 2901 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2902 bdev_write_zeroes_split(parent_io); 2903 break; 2904 case SPDK_BDEV_IO_TYPE_COPY: 2905 bdev_copy_split(parent_io); 2906 break; 2907 default: 2908 assert(false); 2909 break; 2910 } 2911 } 2912 2913 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2914 bool success); 2915 2916 static void 2917 bdev_io_split(struct spdk_bdev_io *bdev_io) 2918 { 2919 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 2920 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 2921 bdev_io->u.bdev.split_outstanding = 0; 2922 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 2923 2924 switch (bdev_io->type) { 2925 case SPDK_BDEV_IO_TYPE_READ: 2926 case SPDK_BDEV_IO_TYPE_WRITE: 2927 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 2928 _bdev_rw_split(bdev_io); 2929 } else { 2930 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 2931 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 2932 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 2933 } 2934 break; 2935 case SPDK_BDEV_IO_TYPE_UNMAP: 2936 bdev_unmap_split(bdev_io); 2937 break; 2938 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2939 bdev_write_zeroes_split(bdev_io); 2940 break; 2941 case SPDK_BDEV_IO_TYPE_COPY: 2942 bdev_copy_split(bdev_io); 2943 break; 2944 default: 2945 assert(false); 2946 break; 2947 } 2948 } 2949 2950 static void 2951 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 2952 { 2953 if (!success) { 2954 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2955 return; 2956 } 2957 2958 _bdev_rw_split(bdev_io); 2959 } 2960 2961 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 2962 * be 
inlined, at least on some compilers. 2963 */ 2964 static inline void 2965 _bdev_io_submit(void *ctx) 2966 { 2967 struct spdk_bdev_io *bdev_io = ctx; 2968 struct spdk_bdev *bdev = bdev_io->bdev; 2969 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2970 2971 if (spdk_likely(bdev_ch->flags == 0)) { 2972 bdev_io_do_submit(bdev_ch, bdev_io); 2973 return; 2974 } 2975 2976 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 2977 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 2978 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 2979 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 2980 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 2981 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2982 } else { 2983 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 2984 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 2985 } 2986 } else { 2987 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 2988 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2989 } 2990 } 2991 2992 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 2993 2994 bool 2995 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 2996 { 2997 if (range1->length == 0 || range2->length == 0) { 2998 return false; 2999 } 3000 3001 if (range1->offset + range1->length <= range2->offset) { 3002 return false; 3003 } 3004 3005 if (range2->offset + range2->length <= range1->offset) { 3006 return false; 3007 } 3008 3009 return true; 3010 } 3011 3012 static bool 3013 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3014 { 3015 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3016 struct lba_range r; 3017 3018 switch (bdev_io->type) { 3019 case SPDK_BDEV_IO_TYPE_NVME_IO: 3020 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3021 /* Don't try to decode the NVMe command - just assume worst-case and that 3022 * it overlaps a locked range. 3023 */ 3024 return true; 3025 case SPDK_BDEV_IO_TYPE_WRITE: 3026 case SPDK_BDEV_IO_TYPE_UNMAP: 3027 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3028 case SPDK_BDEV_IO_TYPE_ZCOPY: 3029 case SPDK_BDEV_IO_TYPE_COPY: 3030 r.offset = bdev_io->u.bdev.offset_blocks; 3031 r.length = bdev_io->u.bdev.num_blocks; 3032 if (!bdev_lba_range_overlapped(range, &r)) { 3033 /* This I/O doesn't overlap the specified LBA range. */ 3034 return false; 3035 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3036 /* This I/O overlaps, but the I/O is on the same channel that locked this 3037 * range, and the caller_ctx is the same as the locked_ctx. This means 3038 * that this I/O is associated with the lock, and is allowed to execute. 
3039 */ 3040 return false; 3041 } else { 3042 return true; 3043 } 3044 default: 3045 return false; 3046 } 3047 } 3048 3049 void 3050 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3051 { 3052 struct spdk_bdev *bdev = bdev_io->bdev; 3053 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 3054 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3055 3056 assert(thread != NULL); 3057 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3058 3059 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3060 struct lba_range *range; 3061 3062 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3063 if (bdev_io_range_is_locked(bdev_io, range)) { 3064 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3065 return; 3066 } 3067 } 3068 } 3069 3070 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 3071 3072 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3073 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 3074 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3075 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3076 spdk_bdev_get_name(bdev)); 3077 3078 if (bdev_io_should_split(bdev_io)) { 3079 bdev_io_split(bdev_io); 3080 return; 3081 } 3082 3083 if (ch->flags & BDEV_CH_QOS_ENABLED) { 3084 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 3085 _bdev_io_submit(bdev_io); 3086 } else { 3087 bdev_io->internal.io_submit_ch = ch; 3088 bdev_io->internal.ch = bdev->internal.qos->ch; 3089 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 3090 } 3091 } else { 3092 _bdev_io_submit(bdev_io); 3093 } 3094 } 3095 3096 static inline void 3097 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3098 { 3099 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 3100 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3101 * For write operation we need to pull buffers from memory domain before submitting IO. 
3102 * Once read operation completes, we need to use memory_domain push functionality to 3103 * update data in original memory domain IO buffer 3104 * This IO request will go through a regular IO flow, so clear memory domains pointers */ 3105 bdev_io->u.bdev.memory_domain = NULL; 3106 bdev_io->u.bdev.memory_domain_ctx = NULL; 3107 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3108 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3109 } 3110 3111 static inline void 3112 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3113 { 3114 if (bdev_io->internal.memory_domain && !desc->memory_domains_supported) { 3115 _bdev_io_ext_use_bounce_buffer(bdev_io); 3116 return; 3117 } 3118 3119 bdev_io_submit(bdev_io); 3120 } 3121 3122 static void 3123 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3124 { 3125 struct spdk_bdev *bdev = bdev_io->bdev; 3126 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3127 struct spdk_io_channel *ch = bdev_ch->channel; 3128 3129 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3130 3131 bdev_io->internal.in_submit_request = true; 3132 bdev->fn_table->submit_request(ch, bdev_io); 3133 bdev_io->internal.in_submit_request = false; 3134 } 3135 3136 void 3137 bdev_io_init(struct spdk_bdev_io *bdev_io, 3138 struct spdk_bdev *bdev, void *cb_arg, 3139 spdk_bdev_io_completion_cb cb) 3140 { 3141 bdev_io->bdev = bdev; 3142 bdev_io->internal.caller_ctx = cb_arg; 3143 bdev_io->internal.cb = cb; 3144 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3145 bdev_io->internal.in_submit_request = false; 3146 bdev_io->internal.buf = NULL; 3147 bdev_io->internal.io_submit_ch = NULL; 3148 bdev_io->internal.orig_iovs = NULL; 3149 bdev_io->internal.orig_iovcnt = 0; 3150 bdev_io->internal.orig_md_iov.iov_base = NULL; 3151 bdev_io->internal.error.nvme.cdw0 = 0; 3152 bdev_io->num_retries = 0; 3153 bdev_io->internal.get_buf_cb = NULL; 3154 bdev_io->internal.get_aux_buf_cb = NULL; 3155 bdev_io->internal.memory_domain = NULL; 3156 bdev_io->internal.memory_domain_ctx = NULL; 3157 bdev_io->internal.data_transfer_cpl = NULL; 3158 } 3159 3160 static bool 3161 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3162 { 3163 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3164 } 3165 3166 bool 3167 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3168 { 3169 bool supported; 3170 3171 supported = bdev_io_type_supported(bdev, io_type); 3172 3173 if (!supported) { 3174 switch (io_type) { 3175 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3176 /* The bdev layer will emulate write zeroes as long as write is supported. 
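 * (The emulation elsewhere in this file is expected to satisfy such requests by
 * issuing regular writes of the zero buffer allocated in spdk_bdev_initialize().)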
*/ 3177 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3178 break; 3179 default: 3180 break; 3181 } 3182 } 3183 3184 return supported; 3185 } 3186 3187 uint64_t 3188 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3189 { 3190 return bdev_io->internal.submit_tsc; 3191 } 3192 3193 int 3194 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3195 { 3196 if (bdev->fn_table->dump_info_json) { 3197 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3198 } 3199 3200 return 0; 3201 } 3202 3203 static void 3204 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3205 { 3206 uint32_t max_per_timeslice = 0; 3207 int i; 3208 3209 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3210 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3211 qos->rate_limits[i].max_per_timeslice = 0; 3212 continue; 3213 } 3214 3215 max_per_timeslice = qos->rate_limits[i].limit * 3216 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3217 3218 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3219 qos->rate_limits[i].min_per_timeslice); 3220 3221 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 3222 } 3223 3224 bdev_qos_set_ops(qos); 3225 } 3226 3227 static int 3228 bdev_channel_poll_qos(void *arg) 3229 { 3230 struct spdk_bdev_qos *qos = arg; 3231 uint64_t now = spdk_get_ticks(); 3232 int i; 3233 3234 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3235 /* We received our callback earlier than expected - return 3236 * immediately and wait to do accounting until at least one 3237 * timeslice has actually expired. This should never happen 3238 * with a well-behaved timer implementation. 3239 */ 3240 return SPDK_POLLER_IDLE; 3241 } 3242 3243 /* Reset for next round of rate limiting */ 3244 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3245 /* We may have allowed the IOs or bytes to slightly overrun in the last 3246 * timeslice. remaining_this_timeslice is signed, so if it's negative 3247 * here, we'll account for the overrun so that the next timeslice will 3248 * be appropriately reduced. 
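 * For example, if max_per_timeslice is 1000 bytes and a 1500-byte I/O was admitted
 * while the full quota remained, remaining_this_timeslice is now -500; the refill
 * below then leaves only 500 bytes of quota for the next timeslice.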
3249 */ 3250 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 3251 qos->rate_limits[i].remaining_this_timeslice = 0; 3252 } 3253 } 3254 3255 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3256 qos->last_timeslice += qos->timeslice_size; 3257 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3258 qos->rate_limits[i].remaining_this_timeslice += 3259 qos->rate_limits[i].max_per_timeslice; 3260 } 3261 } 3262 3263 return bdev_qos_io_submit(qos->ch, qos); 3264 } 3265 3266 static void 3267 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3268 { 3269 struct spdk_bdev_shared_resource *shared_resource; 3270 struct lba_range *range; 3271 3272 bdev_free_io_stat(ch->stat); 3273 #ifdef SPDK_CONFIG_VTUNE 3274 bdev_free_io_stat(ch->prev_stat); 3275 #endif 3276 3277 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3278 range = TAILQ_FIRST(&ch->locked_ranges); 3279 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3280 free(range); 3281 } 3282 3283 spdk_put_io_channel(ch->channel); 3284 spdk_put_io_channel(ch->accel_channel); 3285 3286 shared_resource = ch->shared_resource; 3287 3288 assert(TAILQ_EMPTY(&ch->io_locked)); 3289 assert(TAILQ_EMPTY(&ch->io_submitted)); 3290 assert(ch->io_outstanding == 0); 3291 assert(shared_resource->ref > 0); 3292 shared_resource->ref--; 3293 if (shared_resource->ref == 0) { 3294 assert(shared_resource->io_outstanding == 0); 3295 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3296 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3297 free(shared_resource); 3298 } 3299 } 3300 3301 static void 3302 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3303 { 3304 struct spdk_bdev_qos *qos = bdev->internal.qos; 3305 int i; 3306 3307 assert(spdk_spin_held(&bdev->internal.spinlock)); 3308 3309 /* Rate limiting on this bdev enabled */ 3310 if (qos) { 3311 if (qos->ch == NULL) { 3312 struct spdk_io_channel *io_ch; 3313 3314 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3315 bdev->name, spdk_get_thread()); 3316 3317 /* No qos channel has been selected, so set one up */ 3318 3319 /* Take another reference to ch */ 3320 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3321 assert(io_ch != NULL); 3322 qos->ch = ch; 3323 3324 qos->thread = spdk_io_channel_get_thread(io_ch); 3325 3326 TAILQ_INIT(&qos->queued); 3327 3328 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3329 if (bdev_qos_is_iops_rate_limit(i) == true) { 3330 qos->rate_limits[i].min_per_timeslice = 3331 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3332 } else { 3333 qos->rate_limits[i].min_per_timeslice = 3334 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3335 } 3336 3337 if (qos->rate_limits[i].limit == 0) { 3338 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3339 } 3340 } 3341 bdev_qos_update_max_quota_per_timeslice(qos); 3342 qos->timeslice_size = 3343 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3344 qos->last_timeslice = spdk_get_ticks(); 3345 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3346 qos, 3347 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3348 } 3349 3350 ch->flags |= BDEV_CH_QOS_ENABLED; 3351 } 3352 } 3353 3354 struct poll_timeout_ctx { 3355 struct spdk_bdev_desc *desc; 3356 uint64_t timeout_in_sec; 3357 spdk_bdev_io_timeout_cb cb_fn; 3358 void *cb_arg; 3359 }; 3360 3361 static void 3362 bdev_desc_free(struct spdk_bdev_desc *desc) 3363 { 3364 spdk_spin_destroy(&desc->spinlock); 3365 free(desc->media_events_buffer); 3366 
free(desc); 3367 } 3368 3369 static void 3370 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 3371 { 3372 struct poll_timeout_ctx *ctx = _ctx; 3373 struct spdk_bdev_desc *desc = ctx->desc; 3374 3375 free(ctx); 3376 3377 spdk_spin_lock(&desc->spinlock); 3378 desc->refs--; 3379 if (desc->closed == true && desc->refs == 0) { 3380 spdk_spin_unlock(&desc->spinlock); 3381 bdev_desc_free(desc); 3382 return; 3383 } 3384 spdk_spin_unlock(&desc->spinlock); 3385 } 3386 3387 static void 3388 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3389 struct spdk_io_channel *io_ch, void *_ctx) 3390 { 3391 struct poll_timeout_ctx *ctx = _ctx; 3392 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3393 struct spdk_bdev_desc *desc = ctx->desc; 3394 struct spdk_bdev_io *bdev_io; 3395 uint64_t now; 3396 3397 spdk_spin_lock(&desc->spinlock); 3398 if (desc->closed == true) { 3399 spdk_spin_unlock(&desc->spinlock); 3400 spdk_bdev_for_each_channel_continue(i, -1); 3401 return; 3402 } 3403 spdk_spin_unlock(&desc->spinlock); 3404 3405 now = spdk_get_ticks(); 3406 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3407 /* Exclude any I/O that are generated via splitting. */ 3408 if (bdev_io->internal.cb == bdev_io_split_done) { 3409 continue; 3410 } 3411 3412 /* Once we find an I/O that has not timed out, we can immediately 3413 * exit the loop. 3414 */ 3415 if (now < (bdev_io->internal.submit_tsc + 3416 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3417 goto end; 3418 } 3419 3420 if (bdev_io->internal.desc == desc) { 3421 ctx->cb_fn(ctx->cb_arg, bdev_io); 3422 } 3423 } 3424 3425 end: 3426 spdk_bdev_for_each_channel_continue(i, 0); 3427 } 3428 3429 static int 3430 bdev_poll_timeout_io(void *arg) 3431 { 3432 struct spdk_bdev_desc *desc = arg; 3433 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3434 struct poll_timeout_ctx *ctx; 3435 3436 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3437 if (!ctx) { 3438 SPDK_ERRLOG("failed to allocate memory\n"); 3439 return SPDK_POLLER_BUSY; 3440 } 3441 ctx->desc = desc; 3442 ctx->cb_arg = desc->cb_arg; 3443 ctx->cb_fn = desc->cb_fn; 3444 ctx->timeout_in_sec = desc->timeout_in_sec; 3445 3446 /* Take a ref on the descriptor in case it gets closed while we are checking 3447 * all of the channels. 
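 * The ref is dropped in bdev_channel_poll_timeout_io_done(), which also frees the
 * descriptor if it was closed while the per-channel walk was in progress.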
3448 */ 3449 spdk_spin_lock(&desc->spinlock); 3450 desc->refs++; 3451 spdk_spin_unlock(&desc->spinlock); 3452 3453 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 3454 bdev_channel_poll_timeout_io_done); 3455 3456 return SPDK_POLLER_BUSY; 3457 } 3458 3459 int 3460 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3461 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3462 { 3463 assert(desc->thread == spdk_get_thread()); 3464 3465 spdk_poller_unregister(&desc->io_timeout_poller); 3466 3467 if (timeout_in_sec) { 3468 assert(cb_fn != NULL); 3469 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3470 desc, 3471 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3472 1000); 3473 if (desc->io_timeout_poller == NULL) { 3474 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3475 return -1; 3476 } 3477 } 3478 3479 desc->cb_fn = cb_fn; 3480 desc->cb_arg = cb_arg; 3481 desc->timeout_in_sec = timeout_in_sec; 3482 3483 return 0; 3484 } 3485 3486 static int 3487 bdev_channel_create(void *io_device, void *ctx_buf) 3488 { 3489 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3490 struct spdk_bdev_channel *ch = ctx_buf; 3491 struct spdk_io_channel *mgmt_io_ch; 3492 struct spdk_bdev_mgmt_channel *mgmt_ch; 3493 struct spdk_bdev_shared_resource *shared_resource; 3494 struct lba_range *range; 3495 3496 ch->bdev = bdev; 3497 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3498 if (!ch->channel) { 3499 return -1; 3500 } 3501 3502 ch->accel_channel = spdk_accel_get_io_channel(); 3503 if (!ch->accel_channel) { 3504 spdk_put_io_channel(ch->channel); 3505 return -1; 3506 } 3507 3508 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3509 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3510 3511 assert(ch->histogram == NULL); 3512 if (bdev->internal.histogram_enabled) { 3513 ch->histogram = spdk_histogram_data_alloc(); 3514 if (ch->histogram == NULL) { 3515 SPDK_ERRLOG("Could not allocate histogram\n"); 3516 } 3517 } 3518 3519 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3520 if (!mgmt_io_ch) { 3521 spdk_put_io_channel(ch->channel); 3522 spdk_put_io_channel(ch->accel_channel); 3523 return -1; 3524 } 3525 3526 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 3527 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3528 if (shared_resource->shared_ch == ch->channel) { 3529 spdk_put_io_channel(mgmt_io_ch); 3530 shared_resource->ref++; 3531 break; 3532 } 3533 } 3534 3535 if (shared_resource == NULL) { 3536 shared_resource = calloc(1, sizeof(*shared_resource)); 3537 if (shared_resource == NULL) { 3538 spdk_put_io_channel(ch->channel); 3539 spdk_put_io_channel(ch->accel_channel); 3540 spdk_put_io_channel(mgmt_io_ch); 3541 return -1; 3542 } 3543 3544 shared_resource->mgmt_ch = mgmt_ch; 3545 shared_resource->io_outstanding = 0; 3546 TAILQ_INIT(&shared_resource->nomem_io); 3547 shared_resource->nomem_threshold = 0; 3548 shared_resource->shared_ch = ch->channel; 3549 shared_resource->ref = 1; 3550 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3551 } 3552 3553 ch->io_outstanding = 0; 3554 TAILQ_INIT(&ch->queued_resets); 3555 TAILQ_INIT(&ch->locked_ranges); 3556 ch->flags = 0; 3557 ch->shared_resource = shared_resource; 3558 3559 TAILQ_INIT(&ch->io_submitted); 3560 TAILQ_INIT(&ch->io_locked); 3561 3562 ch->stat = bdev_alloc_io_stat(false); 3563 if (ch->stat == NULL) { 3564 bdev_channel_destroy_resource(ch); 3565 return -1; 3566 } 3567 3568 ch->stat->ticks_rate = 
spdk_get_ticks_hz(); 3569 3570 #ifdef SPDK_CONFIG_VTUNE 3571 { 3572 char *name; 3573 __itt_init_ittlib(NULL, 0); 3574 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3575 if (!name) { 3576 bdev_channel_destroy_resource(ch); 3577 return -1; 3578 } 3579 ch->handle = __itt_string_handle_create(name); 3580 free(name); 3581 ch->start_tsc = spdk_get_ticks(); 3582 ch->interval_tsc = spdk_get_ticks_hz() / 100; 3583 ch->prev_stat = bdev_alloc_io_stat(false); 3584 if (ch->prev_stat == NULL) { 3585 bdev_channel_destroy_resource(ch); 3586 return -1; 3587 } 3588 } 3589 #endif 3590 3591 spdk_spin_lock(&bdev->internal.spinlock); 3592 bdev_enable_qos(bdev, ch); 3593 3594 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 3595 struct lba_range *new_range; 3596 3597 new_range = calloc(1, sizeof(*new_range)); 3598 if (new_range == NULL) { 3599 spdk_spin_unlock(&bdev->internal.spinlock); 3600 bdev_channel_destroy_resource(ch); 3601 return -1; 3602 } 3603 new_range->length = range->length; 3604 new_range->offset = range->offset; 3605 new_range->locked_ctx = range->locked_ctx; 3606 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 3607 } 3608 3609 spdk_spin_unlock(&bdev->internal.spinlock); 3610 3611 return 0; 3612 } 3613 3614 static int 3615 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 3616 void *cb_ctx) 3617 { 3618 struct spdk_bdev_channel *bdev_ch = cb_ctx; 3619 struct spdk_bdev_io *bdev_io; 3620 uint64_t buf_len; 3621 3622 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 3623 if (bdev_io->internal.ch == bdev_ch) { 3624 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 3625 spdk_iobuf_entry_abort(ch, entry, buf_len); 3626 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3627 } 3628 3629 return 0; 3630 } 3631 3632 /* 3633 * Abort I/O that are waiting on a data buffer. 3634 */ 3635 static void 3636 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 3637 { 3638 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 3639 bdev_abort_all_buf_io_cb, ch); 3640 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 3641 bdev_abort_all_buf_io_cb, ch); 3642 } 3643 3644 /* 3645 * Abort I/O that are queued waiting for submission. These types of I/O are 3646 * linked using the spdk_bdev_io link TAILQ_ENTRY. 3647 */ 3648 static void 3649 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 3650 { 3651 struct spdk_bdev_io *bdev_io, *tmp; 3652 3653 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 3654 if (bdev_io->internal.ch == ch) { 3655 TAILQ_REMOVE(queue, bdev_io, internal.link); 3656 /* 3657 * spdk_bdev_io_complete() assumes that the completed I/O had 3658 * been submitted to the bdev module. Since in this case it 3659 * hadn't, bump io_outstanding to account for the decrement 3660 * that spdk_bdev_io_complete() will do. 
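 * The bump is skipped for reset I/O below because resets are never counted in
 * io_outstanding in the first place (see bdev_io_submit_reset()).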
3661 */ 3662 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 3663 ch->io_outstanding++; 3664 ch->shared_resource->io_outstanding++; 3665 } 3666 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3667 } 3668 } 3669 } 3670 3671 static bool 3672 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3673 { 3674 struct spdk_bdev_io *bdev_io; 3675 3676 TAILQ_FOREACH(bdev_io, queue, internal.link) { 3677 if (bdev_io == bio_to_abort) { 3678 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 3679 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3680 return true; 3681 } 3682 } 3683 3684 return false; 3685 } 3686 3687 static int 3688 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 3689 { 3690 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 3691 uint64_t buf_len; 3692 3693 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 3694 if (bdev_io == bio_to_abort) { 3695 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 3696 spdk_iobuf_entry_abort(ch, entry, buf_len); 3697 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3698 return 1; 3699 } 3700 3701 return 0; 3702 } 3703 3704 static bool 3705 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 3706 { 3707 int rc; 3708 3709 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 3710 bdev_abort_buf_io_cb, bio_to_abort); 3711 if (rc == 1) { 3712 return true; 3713 } 3714 3715 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 3716 bdev_abort_buf_io_cb, bio_to_abort); 3717 return rc == 1; 3718 } 3719 3720 static void 3721 bdev_qos_channel_destroy(void *cb_arg) 3722 { 3723 struct spdk_bdev_qos *qos = cb_arg; 3724 3725 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 3726 spdk_poller_unregister(&qos->poller); 3727 3728 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 3729 3730 free(qos); 3731 } 3732 3733 static int 3734 bdev_qos_destroy(struct spdk_bdev *bdev) 3735 { 3736 int i; 3737 3738 /* 3739 * Cleanly shutting down the QoS poller is tricky, because 3740 * during the asynchronous operation the user could open 3741 * a new descriptor and create a new channel, spawning 3742 * a new QoS poller. 3743 * 3744 * The strategy is to create a new QoS structure here and swap it 3745 * in. The shutdown path then continues to refer to the old one 3746 * until it completes and then releases it. 3747 */ 3748 struct spdk_bdev_qos *new_qos, *old_qos; 3749 3750 old_qos = bdev->internal.qos; 3751 3752 new_qos = calloc(1, sizeof(*new_qos)); 3753 if (!new_qos) { 3754 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 3755 return -ENOMEM; 3756 } 3757 3758 /* Copy the old QoS data into the newly allocated structure */ 3759 memcpy(new_qos, old_qos, sizeof(*new_qos)); 3760 3761 /* Zero out the key parts of the QoS structure */ 3762 new_qos->ch = NULL; 3763 new_qos->thread = NULL; 3764 new_qos->poller = NULL; 3765 TAILQ_INIT(&new_qos->queued); 3766 /* 3767 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 3768 * It will be used later for the new QoS structure. 
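 * Only the per-timeslice bookkeeping is cleared in the loop below; the configured
 * rate limits carry over to the new structure unchanged.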
3769 */ 3770 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3771 new_qos->rate_limits[i].remaining_this_timeslice = 0; 3772 new_qos->rate_limits[i].min_per_timeslice = 0; 3773 new_qos->rate_limits[i].max_per_timeslice = 0; 3774 } 3775 3776 bdev->internal.qos = new_qos; 3777 3778 if (old_qos->thread == NULL) { 3779 free(old_qos); 3780 } else { 3781 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 3782 } 3783 3784 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 3785 * been destroyed yet. The destruction path will end up waiting for the final 3786 * channel to be put before it releases resources. */ 3787 3788 return 0; 3789 } 3790 3791 void 3792 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 3793 { 3794 total->bytes_read += add->bytes_read; 3795 total->num_read_ops += add->num_read_ops; 3796 total->bytes_written += add->bytes_written; 3797 total->num_write_ops += add->num_write_ops; 3798 total->bytes_unmapped += add->bytes_unmapped; 3799 total->num_unmap_ops += add->num_unmap_ops; 3800 total->bytes_copied += add->bytes_copied; 3801 total->num_copy_ops += add->num_copy_ops; 3802 total->read_latency_ticks += add->read_latency_ticks; 3803 total->write_latency_ticks += add->write_latency_ticks; 3804 total->unmap_latency_ticks += add->unmap_latency_ticks; 3805 total->copy_latency_ticks += add->copy_latency_ticks; 3806 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 3807 total->max_read_latency_ticks = add->max_read_latency_ticks; 3808 } 3809 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 3810 total->min_read_latency_ticks = add->min_read_latency_ticks; 3811 } 3812 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 3813 total->max_write_latency_ticks = add->max_write_latency_ticks; 3814 } 3815 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 3816 total->min_write_latency_ticks = add->min_write_latency_ticks; 3817 } 3818 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 3819 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 3820 } 3821 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 3822 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 3823 } 3824 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 3825 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 3826 } 3827 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 3828 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 3829 } 3830 } 3831 3832 static void 3833 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 3834 { 3835 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 3836 3837 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 3838 memcpy(to_stat->io_error, from_stat->io_error, 3839 sizeof(struct spdk_bdev_io_error_stat)); 3840 } 3841 } 3842 3843 void 3844 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 3845 { 3846 stat->max_read_latency_ticks = 0; 3847 stat->min_read_latency_ticks = UINT64_MAX; 3848 stat->max_write_latency_ticks = 0; 3849 stat->min_write_latency_ticks = UINT64_MAX; 3850 stat->max_unmap_latency_ticks = 0; 3851 stat->min_unmap_latency_ticks = UINT64_MAX; 3852 stat->max_copy_latency_ticks = 0; 3853 stat->min_copy_latency_ticks = UINT64_MAX; 3854 3855 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 3856 return; 3857 } 
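/* SPDK_BDEV_RESET_STAT_ALL also clears the cumulative byte/operation
 * counters and latency totals below, in addition to the min/max latency
 * watermarks reset above. */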
3858 3859 stat->bytes_read = 0; 3860 stat->num_read_ops = 0; 3861 stat->bytes_written = 0; 3862 stat->num_write_ops = 0; 3863 stat->bytes_unmapped = 0; 3864 stat->num_unmap_ops = 0; 3865 stat->bytes_copied = 0; 3866 stat->num_copy_ops = 0; 3867 stat->read_latency_ticks = 0; 3868 stat->write_latency_ticks = 0; 3869 stat->unmap_latency_ticks = 0; 3870 stat->copy_latency_ticks = 0; 3871 3872 if (stat->io_error != NULL) { 3873 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 3874 } 3875 } 3876 3877 struct spdk_bdev_io_stat * 3878 bdev_alloc_io_stat(bool io_error_stat) 3879 { 3880 struct spdk_bdev_io_stat *stat; 3881 3882 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 3883 if (stat == NULL) { 3884 return NULL; 3885 } 3886 3887 if (io_error_stat) { 3888 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 3889 if (stat->io_error == NULL) { 3890 free(stat); 3891 return NULL; 3892 } 3893 } else { 3894 stat->io_error = NULL; 3895 } 3896 3897 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 3898 3899 return stat; 3900 } 3901 3902 void 3903 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 3904 { 3905 if (stat != NULL) { 3906 free(stat->io_error); 3907 free(stat); 3908 } 3909 } 3910 3911 void 3912 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 3913 { 3914 int i; 3915 3916 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 3917 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 3918 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 3919 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 3920 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 3921 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 3922 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 3923 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 3924 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 3925 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 3926 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 3927 stat->min_read_latency_ticks != UINT64_MAX ? 3928 stat->min_read_latency_ticks : 0); 3929 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 3930 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 3931 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 3932 stat->min_write_latency_ticks != UINT64_MAX ? 3933 stat->min_write_latency_ticks : 0); 3934 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 3935 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 3936 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 3937 stat->min_unmap_latency_ticks != UINT64_MAX ? 3938 stat->min_unmap_latency_ticks : 0); 3939 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 3940 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 3941 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 3942 stat->min_copy_latency_ticks != UINT64_MAX ? 
stat->min_copy_latency_ticks : 0); 3944 3945 if (stat->io_error != NULL) { 3946 spdk_json_write_named_object_begin(w, "io_error"); 3947 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 3948 if (stat->io_error->error_status[i] != 0) { 3949 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 3950 stat->io_error->error_status[i]); 3951 } 3952 } 3953 spdk_json_write_object_end(w); 3954 } 3955 } 3956 3957 static void 3958 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 3959 { 3960 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 3961 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 3962 3963 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 3964 bdev_abort_all_buf_io(mgmt_ch, ch); 3966 } 3967 3968 static void 3969 bdev_channel_destroy(void *io_device, void *ctx_buf) 3970 { 3971 struct spdk_bdev_channel *ch = ctx_buf; 3972 3973 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 3974 spdk_get_thread()); 3975 3976 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 3977 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3978 3979 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 3980 spdk_spin_lock(&ch->bdev->internal.spinlock); 3981 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 3982 spdk_spin_unlock(&ch->bdev->internal.spinlock); 3983 3984 bdev_abort_all_queued_io(&ch->queued_resets, ch); 3985 3986 bdev_channel_abort_queued_ios(ch); 3987 3988 if (ch->histogram) { 3989 spdk_histogram_data_free(ch->histogram); 3990 } 3991 3992 bdev_channel_destroy_resource(ch); 3993 } 3994 3995 /* 3996 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 3997 * to it. Hence we do not have to call bdev_get_by_name() when using this function.
3998 */ 3999 static int 4000 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4001 { 4002 struct spdk_bdev_name *tmp; 4003 4004 bdev_name->name = strdup(name); 4005 if (bdev_name->name == NULL) { 4006 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4007 return -ENOMEM; 4008 } 4009 4010 bdev_name->bdev = bdev; 4011 4012 spdk_spin_lock(&g_bdev_mgr.spinlock); 4013 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4014 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4015 4016 if (tmp != NULL) { 4017 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4018 free(bdev_name->name); 4019 return -EEXIST; 4020 } 4021 4022 return 0; 4023 } 4024 4025 static void 4026 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4027 { 4028 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4029 free(bdev_name->name); 4030 } 4031 4032 static void 4033 bdev_name_del(struct spdk_bdev_name *bdev_name) 4034 { 4035 spdk_spin_lock(&g_bdev_mgr.spinlock); 4036 bdev_name_del_unsafe(bdev_name); 4037 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4038 } 4039 4040 int 4041 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4042 { 4043 struct spdk_bdev_alias *tmp; 4044 int ret; 4045 4046 if (alias == NULL) { 4047 SPDK_ERRLOG("Empty alias passed\n"); 4048 return -EINVAL; 4049 } 4050 4051 tmp = calloc(1, sizeof(*tmp)); 4052 if (tmp == NULL) { 4053 SPDK_ERRLOG("Unable to allocate alias\n"); 4054 return -ENOMEM; 4055 } 4056 4057 ret = bdev_name_add(&tmp->alias, bdev, alias); 4058 if (ret != 0) { 4059 free(tmp); 4060 return ret; 4061 } 4062 4063 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4064 4065 return 0; 4066 } 4067 4068 static int 4069 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4070 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4071 { 4072 struct spdk_bdev_alias *tmp; 4073 4074 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4075 if (strcmp(alias, tmp->alias.name) == 0) { 4076 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4077 alias_del_fn(&tmp->alias); 4078 free(tmp); 4079 return 0; 4080 } 4081 } 4082 4083 return -ENOENT; 4084 } 4085 4086 int 4087 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4088 { 4089 int rc; 4090 4091 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4092 if (rc == -ENOENT) { 4093 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4094 } 4095 4096 return rc; 4097 } 4098 4099 void 4100 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4101 { 4102 struct spdk_bdev_alias *p, *tmp; 4103 4104 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4105 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4106 bdev_name_del(&p->alias); 4107 free(p); 4108 } 4109 } 4110 4111 struct spdk_io_channel * 4112 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4113 { 4114 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4115 } 4116 4117 void * 4118 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4119 { 4120 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4121 void *ctx = NULL; 4122 4123 if (bdev->fn_table->get_module_ctx) { 4124 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4125 } 4126 4127 return ctx; 4128 } 4129 4130 const char * 4131 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4132 { 4133 return bdev->module->name; 4134 } 4135 4136 const char * 4137 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4138 { 4139 return bdev->name; 4140 } 4141 4142 const char * 4143 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4144 { 4145 return bdev->product_name; 4146 } 4147 4148 
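/*
 * Illustrative sketch only (not part of this file): how a caller might use
 * the simple per-bdev accessors defined around here. It assumes "desc" is a
 * descriptor the caller has already obtained from spdk_bdev_open_ext().
 *
 *	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
 *
 *	SPDK_NOTICELOG("bdev %s (%s): %" PRIu64 " blocks of %" PRIu32 " bytes\n",
 *		       spdk_bdev_get_name(bdev),
 *		       spdk_bdev_get_product_name(bdev),
 *		       spdk_bdev_get_num_blocks(bdev),
 *		       spdk_bdev_get_block_size(bdev));
 */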
const struct spdk_bdev_aliases_list * 4149 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4150 { 4151 return &bdev->aliases; 4152 } 4153 4154 uint32_t 4155 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4156 { 4157 return bdev->blocklen; 4158 } 4159 4160 uint32_t 4161 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4162 { 4163 return bdev->write_unit_size; 4164 } 4165 4166 uint64_t 4167 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4168 { 4169 return bdev->blockcnt; 4170 } 4171 4172 const char * 4173 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4174 { 4175 return qos_rpc_type[type]; 4176 } 4177 4178 void 4179 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4180 { 4181 int i; 4182 4183 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4184 4185 spdk_spin_lock(&bdev->internal.spinlock); 4186 if (bdev->internal.qos) { 4187 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4188 if (bdev->internal.qos->rate_limits[i].limit != 4189 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4190 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4191 if (bdev_qos_is_iops_rate_limit(i) == false) { 4192 /* Change from Byte to Megabyte which is user visible. */ 4193 limits[i] = limits[i] / 1024 / 1024; 4194 } 4195 } 4196 } 4197 } 4198 spdk_spin_unlock(&bdev->internal.spinlock); 4199 } 4200 4201 size_t 4202 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4203 { 4204 return 1 << bdev->required_alignment; 4205 } 4206 4207 uint32_t 4208 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4209 { 4210 return bdev->optimal_io_boundary; 4211 } 4212 4213 bool 4214 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4215 { 4216 return bdev->write_cache; 4217 } 4218 4219 const struct spdk_uuid * 4220 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4221 { 4222 return &bdev->uuid; 4223 } 4224 4225 uint16_t 4226 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4227 { 4228 return bdev->acwu; 4229 } 4230 4231 uint32_t 4232 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4233 { 4234 return bdev->md_len; 4235 } 4236 4237 bool 4238 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4239 { 4240 return (bdev->md_len != 0) && bdev->md_interleave; 4241 } 4242 4243 bool 4244 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4245 { 4246 return (bdev->md_len != 0) && !bdev->md_interleave; 4247 } 4248 4249 bool 4250 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4251 { 4252 return bdev->zoned; 4253 } 4254 4255 uint32_t 4256 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4257 { 4258 if (spdk_bdev_is_md_interleaved(bdev)) { 4259 return bdev->blocklen - bdev->md_len; 4260 } else { 4261 return bdev->blocklen; 4262 } 4263 } 4264 4265 uint32_t 4266 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4267 { 4268 return bdev->phys_blocklen; 4269 } 4270 4271 static uint32_t 4272 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4273 { 4274 if (!spdk_bdev_is_md_interleaved(bdev)) { 4275 return bdev->blocklen + bdev->md_len; 4276 } else { 4277 return bdev->blocklen; 4278 } 4279 } 4280 4281 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4282 typedef enum spdk_dif_type spdk_dif_type_t; 4283 4284 spdk_dif_type_t 4285 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4286 { 4287 if (bdev->md_len != 0) { 4288 return bdev->dif_type; 4289 } else { 4290 return SPDK_DIF_DISABLE; 4291 } 4292 } 4293 4294 bool 4295 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4296 { 4297 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4298 return bdev->dif_is_head_of_md; 4299 } else { 4300 return false; 4301 } 4302 } 4303 4304 bool 4305 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4306 enum spdk_dif_check_type check_type) 4307 { 4308 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4309 return false; 4310 } 4311 4312 switch (check_type) { 4313 case SPDK_DIF_CHECK_TYPE_REFTAG: 4314 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4315 case SPDK_DIF_CHECK_TYPE_APPTAG: 4316 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4317 case SPDK_DIF_CHECK_TYPE_GUARD: 4318 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4319 default: 4320 return false; 4321 } 4322 } 4323 4324 uint32_t 4325 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4326 { 4327 uint64_t aligned_length; 4328 uint64_t max_copy_blocks; 4329 uint64_t temp_max_copy_blocks; 4330 struct spdk_iobuf_opts opts; 4331 4332 if (spdk_bdev_io_type_supported((struct spdk_bdev *)bdev, SPDK_BDEV_IO_TYPE_COPY)) { 4333 return bdev->max_copy; 4334 } else { 4335 spdk_iobuf_get_opts(&opts); 4336 aligned_length = opts.large_bufsize - spdk_bdev_get_buf_align(bdev); 4337 temp_max_copy_blocks = spdk_bdev_is_md_separate(bdev) ? 4338 aligned_length / (bdev->blocklen + bdev->md_len) : 4339 aligned_length / bdev->blocklen; 4340 max_copy_blocks = 1 << spdk_u64log2(temp_max_copy_blocks); 4341 return max_copy_blocks; 4342 } 4343 } 4344 4345 uint64_t 4346 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4347 { 4348 return bdev->internal.measured_queue_depth; 4349 } 4350 4351 uint64_t 4352 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 4353 { 4354 return bdev->internal.period; 4355 } 4356 4357 uint64_t 4358 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 4359 { 4360 return bdev->internal.weighted_io_time; 4361 } 4362 4363 uint64_t 4364 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 4365 { 4366 return bdev->internal.io_time; 4367 } 4368 4369 static void bdev_update_qd_sampling_period(void *ctx); 4370 4371 static void 4372 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 4373 { 4374 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 4375 4376 if (bdev->internal.measured_queue_depth) { 4377 bdev->internal.io_time += bdev->internal.period; 4378 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 4379 } 4380 4381 bdev->internal.qd_poll_in_progress = false; 4382 4383 bdev_update_qd_sampling_period(bdev); 4384 } 4385 4386 static void 4387 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4388 struct spdk_io_channel *io_ch, void *_ctx) 4389 { 4390 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 4391 4392 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4393 spdk_bdev_for_each_channel_continue(i, 0); 4394 } 4395 4396 static int 4397 bdev_calculate_measured_queue_depth(void *ctx) 4398 { 4399 struct spdk_bdev *bdev = ctx; 4400 4401 bdev->internal.qd_poll_in_progress = true; 4402 bdev->internal.temporary_queue_depth = 0; 4403
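/* Walk every channel to sum its io_outstanding count; the completion
 * callback publishes the measured depth and applies any pending change
 * to the sampling period. */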
spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 4404 return SPDK_POLLER_BUSY; 4405 } 4406 4407 static void 4408 bdev_update_qd_sampling_period(void *ctx) 4409 { 4410 struct spdk_bdev *bdev = ctx; 4411 4412 if (bdev->internal.period == bdev->internal.new_period) { 4413 return; 4414 } 4415 4416 if (bdev->internal.qd_poll_in_progress) { 4417 return; 4418 } 4419 4420 bdev->internal.period = bdev->internal.new_period; 4421 4422 spdk_poller_unregister(&bdev->internal.qd_poller); 4423 if (bdev->internal.period != 0) { 4424 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4425 bdev, bdev->internal.period); 4426 } else { 4427 spdk_bdev_close(bdev->internal.qd_desc); 4428 bdev->internal.qd_desc = NULL; 4429 } 4430 } 4431 4432 static void 4433 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4434 { 4435 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4436 } 4437 4438 void 4439 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4440 { 4441 int rc; 4442 4443 if (bdev->internal.new_period == period) { 4444 return; 4445 } 4446 4447 bdev->internal.new_period = period; 4448 4449 if (bdev->internal.qd_desc != NULL) { 4450 assert(bdev->internal.period != 0); 4451 4452 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4453 bdev_update_qd_sampling_period, bdev); 4454 return; 4455 } 4456 4457 assert(bdev->internal.period == 0); 4458 4459 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4460 NULL, &bdev->internal.qd_desc); 4461 if (rc != 0) { 4462 return; 4463 } 4464 4465 bdev->internal.period = period; 4466 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4467 bdev, period); 4468 } 4469 4470 struct bdev_get_current_qd_ctx { 4471 uint64_t current_qd; 4472 spdk_bdev_get_current_qd_cb cb_fn; 4473 void *cb_arg; 4474 }; 4475 4476 static void 4477 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 4478 { 4479 struct bdev_get_current_qd_ctx *ctx = _ctx; 4480 4481 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 4482 4483 free(ctx); 4484 } 4485 4486 static void 4487 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4488 struct spdk_io_channel *io_ch, void *_ctx) 4489 { 4490 struct bdev_get_current_qd_ctx *ctx = _ctx; 4491 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4492 4493 ctx->current_qd += bdev_ch->io_outstanding; 4494 4495 spdk_bdev_for_each_channel_continue(i, 0); 4496 } 4497 4498 void 4499 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 4500 void *cb_arg) 4501 { 4502 struct bdev_get_current_qd_ctx *ctx; 4503 4504 assert(cb_fn != NULL); 4505 4506 ctx = calloc(1, sizeof(*ctx)); 4507 if (ctx == NULL) { 4508 cb_fn(bdev, 0, cb_arg, -ENOMEM); 4509 return; 4510 } 4511 4512 ctx->cb_fn = cb_fn; 4513 ctx->cb_arg = cb_arg; 4514 4515 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 4516 } 4517 4518 static void 4519 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 4520 { 4521 assert(desc->thread == spdk_get_thread()); 4522 4523 spdk_spin_lock(&desc->spinlock); 4524 desc->refs--; 4525 if (!desc->closed) { 4526 spdk_spin_unlock(&desc->spinlock); 4527 desc->callback.event_fn(type, 4528 desc->bdev, 4529 desc->callback.ctx); 4530 return; 4531 } else if (desc->refs == 0) { 4532 /* This descriptor was closed after this event_notify message was sent. 
4533 * spdk_bdev_close() could not free the descriptor since this message was 4534 * in flight, so we free it now using bdev_desc_free(). 4535 */ 4536 spdk_spin_unlock(&desc->spinlock); 4537 bdev_desc_free(desc); 4538 return; 4539 } 4540 spdk_spin_unlock(&desc->spinlock); 4541 } 4542 4543 static void 4544 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 4545 { 4546 spdk_spin_lock(&desc->spinlock); 4547 desc->refs++; 4548 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 4549 spdk_spin_unlock(&desc->spinlock); 4550 } 4551 4552 static void 4553 _resize_notify(void *ctx) 4554 { 4555 struct spdk_bdev_desc *desc = ctx; 4556 4557 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 4558 } 4559 4560 int 4561 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 4562 { 4563 struct spdk_bdev_desc *desc; 4564 int ret; 4565 4566 if (size == bdev->blockcnt) { 4567 return 0; 4568 } 4569 4570 spdk_spin_lock(&bdev->internal.spinlock); 4571 4572 /* bdev has open descriptors */ 4573 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 4574 bdev->blockcnt > size) { 4575 ret = -EBUSY; 4576 } else { 4577 bdev->blockcnt = size; 4578 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 4579 event_notify(desc, _resize_notify); 4580 } 4581 ret = 0; 4582 } 4583 4584 spdk_spin_unlock(&bdev->internal.spinlock); 4585 4586 return ret; 4587 } 4588 4589 /* 4590 * Convert I/O offset and length from bytes to blocks. 4591 * 4592 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 4593 */ 4594 static uint64_t 4595 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 4596 uint64_t num_bytes, uint64_t *num_blocks) 4597 { 4598 uint32_t block_size = bdev->blocklen; 4599 uint8_t shift_cnt; 4600 4601 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
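 * For example, with a 4096-byte block size (a power of two) the conversion
 * below is a shift by 12, and the divisibility check comes from the
 * subtraction, with no 64-bit division at all.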
*/ 4602 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 4603 shift_cnt = spdk_u32log2(block_size); 4604 *offset_blocks = offset_bytes >> shift_cnt; 4605 *num_blocks = num_bytes >> shift_cnt; 4606 return (offset_bytes - (*offset_blocks << shift_cnt)) | 4607 (num_bytes - (*num_blocks << shift_cnt)); 4608 } else { 4609 *offset_blocks = offset_bytes / block_size; 4610 *num_blocks = num_bytes / block_size; 4611 return (offset_bytes % block_size) | (num_bytes % block_size); 4612 } 4613 } 4614 4615 static bool 4616 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 4617 { 4618 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 4619 * has been an overflow and hence the offset has been wrapped around */ 4620 if (offset_blocks + num_blocks < offset_blocks) { 4621 return false; 4622 } 4623 4624 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 4625 if (offset_blocks + num_blocks > bdev->blockcnt) { 4626 return false; 4627 } 4628 4629 return true; 4630 } 4631 4632 static void 4633 bdev_seek_complete_cb(void *ctx) 4634 { 4635 struct spdk_bdev_io *bdev_io = ctx; 4636 4637 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4638 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 4639 } 4640 4641 static int 4642 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4643 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 4644 spdk_bdev_io_completion_cb cb, void *cb_arg) 4645 { 4646 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4647 struct spdk_bdev_io *bdev_io; 4648 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4649 4650 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 4651 4652 /* Check if offset_blocks is valid looking at the validity of one block */ 4653 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 4654 return -EINVAL; 4655 } 4656 4657 bdev_io = bdev_channel_get_io(channel); 4658 if (!bdev_io) { 4659 return -ENOMEM; 4660 } 4661 4662 bdev_io->internal.ch = channel; 4663 bdev_io->internal.desc = desc; 4664 bdev_io->type = io_type; 4665 bdev_io->u.bdev.offset_blocks = offset_blocks; 4666 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4667 4668 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 4669 /* In case bdev doesn't support seek to next data/hole offset, 4670 * it is assumed that only data and no holes are present */ 4671 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 4672 bdev_io->u.bdev.seek.offset = offset_blocks; 4673 } else { 4674 bdev_io->u.bdev.seek.offset = UINT64_MAX; 4675 } 4676 4677 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 4678 return 0; 4679 } 4680 4681 bdev_io_submit(bdev_io); 4682 return 0; 4683 } 4684 4685 int 4686 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4687 uint64_t offset_blocks, 4688 spdk_bdev_io_completion_cb cb, void *cb_arg) 4689 { 4690 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 4691 } 4692 4693 int 4694 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4695 uint64_t offset_blocks, 4696 spdk_bdev_io_completion_cb cb, void *cb_arg) 4697 { 4698 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 4699 } 4700 4701 uint64_t 4702 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io) 4703 { 4704 return bdev_io->u.bdev.seek.offset; 4705 } 4706 4707 static int 4708 bdev_read_blocks_with_md(struct 
spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 4709 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4710 spdk_bdev_io_completion_cb cb, void *cb_arg) 4711 { 4712 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4713 struct spdk_bdev_io *bdev_io; 4714 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4715 4716 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4717 return -EINVAL; 4718 } 4719 4720 bdev_io = bdev_channel_get_io(channel); 4721 if (!bdev_io) { 4722 return -ENOMEM; 4723 } 4724 4725 bdev_io->internal.ch = channel; 4726 bdev_io->internal.desc = desc; 4727 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4728 bdev_io->u.bdev.iovs = &bdev_io->iov; 4729 bdev_io->u.bdev.iovs[0].iov_base = buf; 4730 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4731 bdev_io->u.bdev.iovcnt = 1; 4732 bdev_io->u.bdev.md_buf = md_buf; 4733 bdev_io->u.bdev.num_blocks = num_blocks; 4734 bdev_io->u.bdev.offset_blocks = offset_blocks; 4735 bdev_io->u.bdev.memory_domain = NULL; 4736 bdev_io->u.bdev.memory_domain_ctx = NULL; 4737 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4738 4739 bdev_io_submit(bdev_io); 4740 return 0; 4741 } 4742 4743 int 4744 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4745 void *buf, uint64_t offset, uint64_t nbytes, 4746 spdk_bdev_io_completion_cb cb, void *cb_arg) 4747 { 4748 uint64_t offset_blocks, num_blocks; 4749 4750 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4751 nbytes, &num_blocks) != 0) { 4752 return -EINVAL; 4753 } 4754 4755 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4756 } 4757 4758 int 4759 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4760 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4761 spdk_bdev_io_completion_cb cb, void *cb_arg) 4762 { 4763 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 4764 } 4765 4766 int 4767 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4768 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4769 spdk_bdev_io_completion_cb cb, void *cb_arg) 4770 { 4771 struct iovec iov = { 4772 .iov_base = buf, 4773 }; 4774 4775 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4776 return -EINVAL; 4777 } 4778 4779 if (md_buf && !_is_buf_allocated(&iov)) { 4780 return -EINVAL; 4781 } 4782 4783 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4784 cb, cb_arg); 4785 } 4786 4787 int 4788 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4789 struct iovec *iov, int iovcnt, 4790 uint64_t offset, uint64_t nbytes, 4791 spdk_bdev_io_completion_cb cb, void *cb_arg) 4792 { 4793 uint64_t offset_blocks, num_blocks; 4794 4795 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4796 nbytes, &num_blocks) != 0) { 4797 return -EINVAL; 4798 } 4799 4800 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4801 } 4802 4803 static int 4804 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4805 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 4806 uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx, 4807 spdk_bdev_io_completion_cb cb, void *cb_arg) 4808 { 4809 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4810 struct spdk_bdev_io *bdev_io; 4811 struct 
spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4812 4813 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4814 return -EINVAL; 4815 } 4816 4817 bdev_io = bdev_channel_get_io(channel); 4818 if (!bdev_io) { 4819 return -ENOMEM; 4820 } 4821 4822 bdev_io->internal.ch = channel; 4823 bdev_io->internal.desc = desc; 4824 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4825 bdev_io->u.bdev.iovs = iov; 4826 bdev_io->u.bdev.iovcnt = iovcnt; 4827 bdev_io->u.bdev.md_buf = md_buf; 4828 bdev_io->u.bdev.num_blocks = num_blocks; 4829 bdev_io->u.bdev.offset_blocks = offset_blocks; 4830 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4831 bdev_io->internal.memory_domain = domain; 4832 bdev_io->internal.memory_domain_ctx = domain_ctx; 4833 bdev_io->u.bdev.memory_domain = domain; 4834 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 4835 4836 _bdev_io_submit_ext(desc, bdev_io); 4837 4838 return 0; 4839 } 4840 4841 int 4842 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4843 struct iovec *iov, int iovcnt, 4844 uint64_t offset_blocks, uint64_t num_blocks, 4845 spdk_bdev_io_completion_cb cb, void *cb_arg) 4846 { 4847 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4848 num_blocks, NULL, NULL, cb, cb_arg); 4849 } 4850 4851 int 4852 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4853 struct iovec *iov, int iovcnt, void *md_buf, 4854 uint64_t offset_blocks, uint64_t num_blocks, 4855 spdk_bdev_io_completion_cb cb, void *cb_arg) 4856 { 4857 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4858 return -EINVAL; 4859 } 4860 4861 if (md_buf && !_is_buf_allocated(iov)) { 4862 return -EINVAL; 4863 } 4864 4865 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4866 num_blocks, NULL, NULL, cb, cb_arg); 4867 } 4868 4869 static inline bool 4870 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 4871 { 4872 /* 4873 * We check if opts size is at least of size when we first introduced 4874 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 4875 * are not checked internal. 
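 * In practice: an opts structure smaller than that first public version,
 * or larger than the current definition, is rejected, and when a memory
 * domain is supplied the caller must also supply allocated data buffers.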
4876 */ 4877 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 4878 sizeof(opts->metadata) && 4879 opts->size <= sizeof(*opts) && 4880 /* When memory domain is used, the user must provide data buffers */ 4881 (!opts->memory_domain || (iov && iov[0].iov_base)); 4882 } 4883 4884 int 4885 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4886 struct iovec *iov, int iovcnt, 4887 uint64_t offset_blocks, uint64_t num_blocks, 4888 spdk_bdev_io_completion_cb cb, void *cb_arg, 4889 struct spdk_bdev_ext_io_opts *opts) 4890 { 4891 void *md = NULL; 4892 4893 if (opts) { 4894 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 4895 return -EINVAL; 4896 } 4897 md = opts->metadata; 4898 } 4899 4900 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4901 return -EINVAL; 4902 } 4903 4904 if (md && !_is_buf_allocated(iov)) { 4905 return -EINVAL; 4906 } 4907 4908 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4909 num_blocks, 4910 bdev_get_ext_io_opt(opts, memory_domain, NULL), 4911 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 4912 cb, cb_arg); 4913 } 4914 4915 static int 4916 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4917 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4918 spdk_bdev_io_completion_cb cb, void *cb_arg) 4919 { 4920 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4921 struct spdk_bdev_io *bdev_io; 4922 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4923 4924 if (!desc->write) { 4925 return -EBADF; 4926 } 4927 4928 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4929 return -EINVAL; 4930 } 4931 4932 bdev_io = bdev_channel_get_io(channel); 4933 if (!bdev_io) { 4934 return -ENOMEM; 4935 } 4936 4937 bdev_io->internal.ch = channel; 4938 bdev_io->internal.desc = desc; 4939 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4940 bdev_io->u.bdev.iovs = &bdev_io->iov; 4941 bdev_io->u.bdev.iovs[0].iov_base = buf; 4942 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4943 bdev_io->u.bdev.iovcnt = 1; 4944 bdev_io->u.bdev.md_buf = md_buf; 4945 bdev_io->u.bdev.num_blocks = num_blocks; 4946 bdev_io->u.bdev.offset_blocks = offset_blocks; 4947 bdev_io->u.bdev.memory_domain = NULL; 4948 bdev_io->u.bdev.memory_domain_ctx = NULL; 4949 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4950 4951 bdev_io_submit(bdev_io); 4952 return 0; 4953 } 4954 4955 int 4956 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4957 void *buf, uint64_t offset, uint64_t nbytes, 4958 spdk_bdev_io_completion_cb cb, void *cb_arg) 4959 { 4960 uint64_t offset_blocks, num_blocks; 4961 4962 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4963 nbytes, &num_blocks) != 0) { 4964 return -EINVAL; 4965 } 4966 4967 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4968 } 4969 4970 int 4971 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4972 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4973 spdk_bdev_io_completion_cb cb, void *cb_arg) 4974 { 4975 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4976 cb, cb_arg); 4977 } 4978 4979 int 4980 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4981 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4982 spdk_bdev_io_completion_cb cb, void *cb_arg) 4983 { 4984 struct iovec iov = { 4985 
.iov_base = buf, 4986 }; 4987 4988 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4989 return -EINVAL; 4990 } 4991 4992 if (md_buf && !_is_buf_allocated(&iov)) { 4993 return -EINVAL; 4994 } 4995 4996 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4997 cb, cb_arg); 4998 } 4999 5000 static int 5001 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5002 struct iovec *iov, int iovcnt, void *md_buf, 5003 uint64_t offset_blocks, uint64_t num_blocks, 5004 struct spdk_memory_domain *domain, void *domain_ctx, 5005 spdk_bdev_io_completion_cb cb, void *cb_arg) 5006 { 5007 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5008 struct spdk_bdev_io *bdev_io; 5009 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5010 5011 if (!desc->write) { 5012 return -EBADF; 5013 } 5014 5015 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5016 return -EINVAL; 5017 } 5018 5019 bdev_io = bdev_channel_get_io(channel); 5020 if (!bdev_io) { 5021 return -ENOMEM; 5022 } 5023 5024 bdev_io->internal.ch = channel; 5025 bdev_io->internal.desc = desc; 5026 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5027 bdev_io->u.bdev.iovs = iov; 5028 bdev_io->u.bdev.iovcnt = iovcnt; 5029 bdev_io->u.bdev.md_buf = md_buf; 5030 bdev_io->u.bdev.num_blocks = num_blocks; 5031 bdev_io->u.bdev.offset_blocks = offset_blocks; 5032 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5033 bdev_io->internal.memory_domain = domain; 5034 bdev_io->internal.memory_domain_ctx = domain_ctx; 5035 bdev_io->u.bdev.memory_domain = domain; 5036 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5037 5038 _bdev_io_submit_ext(desc, bdev_io); 5039 5040 return 0; 5041 } 5042 5043 int 5044 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5045 struct iovec *iov, int iovcnt, 5046 uint64_t offset, uint64_t len, 5047 spdk_bdev_io_completion_cb cb, void *cb_arg) 5048 { 5049 uint64_t offset_blocks, num_blocks; 5050 5051 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5052 len, &num_blocks) != 0) { 5053 return -EINVAL; 5054 } 5055 5056 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5057 } 5058 5059 int 5060 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5061 struct iovec *iov, int iovcnt, 5062 uint64_t offset_blocks, uint64_t num_blocks, 5063 spdk_bdev_io_completion_cb cb, void *cb_arg) 5064 { 5065 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5066 num_blocks, NULL, NULL, cb, cb_arg); 5067 } 5068 5069 int 5070 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5071 struct iovec *iov, int iovcnt, void *md_buf, 5072 uint64_t offset_blocks, uint64_t num_blocks, 5073 spdk_bdev_io_completion_cb cb, void *cb_arg) 5074 { 5075 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5076 return -EINVAL; 5077 } 5078 5079 if (md_buf && !_is_buf_allocated(iov)) { 5080 return -EINVAL; 5081 } 5082 5083 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5084 num_blocks, NULL, NULL, cb, cb_arg); 5085 } 5086 5087 int 5088 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5089 struct iovec *iov, int iovcnt, 5090 uint64_t offset_blocks, uint64_t num_blocks, 5091 spdk_bdev_io_completion_cb cb, void *cb_arg, 5092 struct spdk_bdev_ext_io_opts *opts) 5093 { 5094 void *md = NULL; 5095 5096 if (opts) { 5097 if 
(spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5098 return -EINVAL; 5099 } 5100 md = opts->metadata; 5101 } 5102 5103 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5104 return -EINVAL; 5105 } 5106 5107 if (md && !_is_buf_allocated(iov)) { 5108 return -EINVAL; 5109 } 5110 5111 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5112 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5113 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5114 cb, cb_arg); 5115 } 5116 5117 static void 5118 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5119 { 5120 struct spdk_bdev_io *parent_io = cb_arg; 5121 struct spdk_bdev *bdev = parent_io->bdev; 5122 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5123 int i, rc = 0; 5124 5125 if (!success) { 5126 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5127 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5128 spdk_bdev_free_io(bdev_io); 5129 return; 5130 } 5131 5132 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5133 rc = memcmp(read_buf, 5134 parent_io->u.bdev.iovs[i].iov_base, 5135 parent_io->u.bdev.iovs[i].iov_len); 5136 if (rc) { 5137 break; 5138 } 5139 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5140 } 5141 5142 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 5143 rc = memcmp(bdev_io->u.bdev.md_buf, 5144 parent_io->u.bdev.md_buf, 5145 spdk_bdev_get_md_size(bdev)); 5146 } 5147 5148 spdk_bdev_free_io(bdev_io); 5149 5150 if (rc == 0) { 5151 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5152 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5153 } else { 5154 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5155 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5156 } 5157 } 5158 5159 static void 5160 bdev_compare_do_read(void *_bdev_io) 5161 { 5162 struct spdk_bdev_io *bdev_io = _bdev_io; 5163 int rc; 5164 5165 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5166 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5167 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5168 bdev_compare_do_read_done, bdev_io); 5169 5170 if (rc == -ENOMEM) { 5171 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5172 } else if (rc != 0) { 5173 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5174 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5175 } 5176 } 5177 5178 static int 5179 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5180 struct iovec *iov, int iovcnt, void *md_buf, 5181 uint64_t offset_blocks, uint64_t num_blocks, 5182 spdk_bdev_io_completion_cb cb, void *cb_arg) 5183 { 5184 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5185 struct spdk_bdev_io *bdev_io; 5186 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5187 5188 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5189 return -EINVAL; 5190 } 5191 5192 bdev_io = bdev_channel_get_io(channel); 5193 if (!bdev_io) { 5194 return -ENOMEM; 5195 } 5196 5197 bdev_io->internal.ch = channel; 5198 bdev_io->internal.desc = desc; 5199 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5200 bdev_io->u.bdev.iovs = iov; 5201 bdev_io->u.bdev.iovcnt = iovcnt; 5202 bdev_io->u.bdev.md_buf = md_buf; 5203 bdev_io->u.bdev.num_blocks = num_blocks; 5204 bdev_io->u.bdev.offset_blocks = offset_blocks; 5205 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5206 bdev_io->u.bdev.memory_domain = NULL; 
5207 bdev_io->u.bdev.memory_domain_ctx = NULL; 5208 5209 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5210 bdev_io_submit(bdev_io); 5211 return 0; 5212 } 5213 5214 bdev_compare_do_read(bdev_io); 5215 5216 return 0; 5217 } 5218 5219 int 5220 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5221 struct iovec *iov, int iovcnt, 5222 uint64_t offset_blocks, uint64_t num_blocks, 5223 spdk_bdev_io_completion_cb cb, void *cb_arg) 5224 { 5225 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5226 num_blocks, cb, cb_arg); 5227 } 5228 5229 int 5230 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5231 struct iovec *iov, int iovcnt, void *md_buf, 5232 uint64_t offset_blocks, uint64_t num_blocks, 5233 spdk_bdev_io_completion_cb cb, void *cb_arg) 5234 { 5235 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5236 return -EINVAL; 5237 } 5238 5239 if (md_buf && !_is_buf_allocated(iov)) { 5240 return -EINVAL; 5241 } 5242 5243 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5244 num_blocks, cb, cb_arg); 5245 } 5246 5247 static int 5248 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5249 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5250 spdk_bdev_io_completion_cb cb, void *cb_arg) 5251 { 5252 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5253 struct spdk_bdev_io *bdev_io; 5254 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5255 5256 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5257 return -EINVAL; 5258 } 5259 5260 bdev_io = bdev_channel_get_io(channel); 5261 if (!bdev_io) { 5262 return -ENOMEM; 5263 } 5264 5265 bdev_io->internal.ch = channel; 5266 bdev_io->internal.desc = desc; 5267 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5268 bdev_io->u.bdev.iovs = &bdev_io->iov; 5269 bdev_io->u.bdev.iovs[0].iov_base = buf; 5270 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5271 bdev_io->u.bdev.iovcnt = 1; 5272 bdev_io->u.bdev.md_buf = md_buf; 5273 bdev_io->u.bdev.num_blocks = num_blocks; 5274 bdev_io->u.bdev.offset_blocks = offset_blocks; 5275 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5276 bdev_io->u.bdev.memory_domain = NULL; 5277 bdev_io->u.bdev.memory_domain_ctx = NULL; 5278 5279 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5280 bdev_io_submit(bdev_io); 5281 return 0; 5282 } 5283 5284 bdev_compare_do_read(bdev_io); 5285 5286 return 0; 5287 } 5288 5289 int 5290 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5291 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5292 spdk_bdev_io_completion_cb cb, void *cb_arg) 5293 { 5294 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5295 cb, cb_arg); 5296 } 5297 5298 int 5299 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5300 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5301 spdk_bdev_io_completion_cb cb, void *cb_arg) 5302 { 5303 struct iovec iov = { 5304 .iov_base = buf, 5305 }; 5306 5307 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5308 return -EINVAL; 5309 } 5310 5311 if (md_buf && !_is_buf_allocated(&iov)) { 5312 return -EINVAL; 5313 } 5314 5315 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5316 cb, cb_arg); 5317 } 5318 5319 static void 5320 
bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) 5321 { 5322 struct spdk_bdev_io *bdev_io = ctx; 5323 5324 if (unlock_status) { 5325 SPDK_ERRLOG("LBA range unlock failed\n"); 5326 } 5327 5328 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 5329 false, bdev_io->internal.caller_ctx); 5330 } 5331 5332 static void 5333 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 5334 { 5335 bdev_io->internal.status = status; 5336 5337 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 5338 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5339 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 5340 } 5341 5342 static void 5343 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5344 { 5345 struct spdk_bdev_io *parent_io = cb_arg; 5346 5347 if (!success) { 5348 SPDK_ERRLOG("Compare and write operation failed\n"); 5349 } 5350 5351 spdk_bdev_free_io(bdev_io); 5352 5353 bdev_comparev_and_writev_blocks_unlock(parent_io, 5354 success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 5355 } 5356 5357 static void 5358 bdev_compare_and_write_do_write(void *_bdev_io) 5359 { 5360 struct spdk_bdev_io *bdev_io = _bdev_io; 5361 int rc; 5362 5363 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 5364 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5365 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 5366 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5367 bdev_compare_and_write_do_write_done, bdev_io); 5368 5369 5370 if (rc == -ENOMEM) { 5371 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 5372 } else if (rc != 0) { 5373 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5374 } 5375 } 5376 5377 static void 5378 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5379 { 5380 struct spdk_bdev_io *parent_io = cb_arg; 5381 5382 spdk_bdev_free_io(bdev_io); 5383 5384 if (!success) { 5385 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 5386 return; 5387 } 5388 5389 bdev_compare_and_write_do_write(parent_io); 5390 } 5391 5392 static void 5393 bdev_compare_and_write_do_compare(void *_bdev_io) 5394 { 5395 struct spdk_bdev_io *bdev_io = _bdev_io; 5396 int rc; 5397 5398 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 5399 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 5400 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5401 bdev_compare_and_write_do_compare_done, bdev_io); 5402 5403 if (rc == -ENOMEM) { 5404 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 5405 } else if (rc != 0) { 5406 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 5407 } 5408 } 5409 5410 static void 5411 bdev_comparev_and_writev_blocks_locked(void *ctx, int status) 5412 { 5413 struct spdk_bdev_io *bdev_io = ctx; 5414 5415 if (status) { 5416 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 5417 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5418 return; 5419 } 5420 5421 bdev_compare_and_write_do_compare(bdev_io); 5422 } 5423 5424 int 5425 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5426 struct iovec *compare_iov, int compare_iovcnt, 5427 struct iovec *write_iov, int write_iovcnt, 5428 uint64_t 
offset_blocks, uint64_t num_blocks, 5429 spdk_bdev_io_completion_cb cb, void *cb_arg) 5430 { 5431 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5432 struct spdk_bdev_io *bdev_io; 5433 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5434 5435 if (!desc->write) { 5436 return -EBADF; 5437 } 5438 5439 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5440 return -EINVAL; 5441 } 5442 5443 if (num_blocks > bdev->acwu) { 5444 return -EINVAL; 5445 } 5446 5447 bdev_io = bdev_channel_get_io(channel); 5448 if (!bdev_io) { 5449 return -ENOMEM; 5450 } 5451 5452 bdev_io->internal.ch = channel; 5453 bdev_io->internal.desc = desc; 5454 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 5455 bdev_io->u.bdev.iovs = compare_iov; 5456 bdev_io->u.bdev.iovcnt = compare_iovcnt; 5457 bdev_io->u.bdev.fused_iovs = write_iov; 5458 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 5459 bdev_io->u.bdev.md_buf = NULL; 5460 bdev_io->u.bdev.num_blocks = num_blocks; 5461 bdev_io->u.bdev.offset_blocks = offset_blocks; 5462 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5463 bdev_io->u.bdev.memory_domain = NULL; 5464 bdev_io->u.bdev.memory_domain_ctx = NULL; 5465 5466 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 5467 bdev_io_submit(bdev_io); 5468 return 0; 5469 } 5470 5471 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 5472 bdev_comparev_and_writev_blocks_locked, bdev_io); 5473 } 5474 5475 int 5476 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5477 struct iovec *iov, int iovcnt, 5478 uint64_t offset_blocks, uint64_t num_blocks, 5479 bool populate, 5480 spdk_bdev_io_completion_cb cb, void *cb_arg) 5481 { 5482 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5483 struct spdk_bdev_io *bdev_io; 5484 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5485 5486 if (!desc->write) { 5487 return -EBADF; 5488 } 5489 5490 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5491 return -EINVAL; 5492 } 5493 5494 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 5495 return -ENOTSUP; 5496 } 5497 5498 bdev_io = bdev_channel_get_io(channel); 5499 if (!bdev_io) { 5500 return -ENOMEM; 5501 } 5502 5503 bdev_io->internal.ch = channel; 5504 bdev_io->internal.desc = desc; 5505 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 5506 bdev_io->u.bdev.num_blocks = num_blocks; 5507 bdev_io->u.bdev.offset_blocks = offset_blocks; 5508 bdev_io->u.bdev.iovs = iov; 5509 bdev_io->u.bdev.iovcnt = iovcnt; 5510 bdev_io->u.bdev.md_buf = NULL; 5511 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 5512 bdev_io->u.bdev.zcopy.commit = 0; 5513 bdev_io->u.bdev.zcopy.start = 1; 5514 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5515 bdev_io->u.bdev.memory_domain = NULL; 5516 bdev_io->u.bdev.memory_domain_ctx = NULL; 5517 5518 bdev_io_submit(bdev_io); 5519 5520 return 0; 5521 } 5522 5523 int 5524 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 5525 spdk_bdev_io_completion_cb cb, void *cb_arg) 5526 { 5527 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 5528 return -EINVAL; 5529 } 5530 5531 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 5532 bdev_io->u.bdev.zcopy.start = 0; 5533 bdev_io->internal.caller_ctx = cb_arg; 5534 bdev_io->internal.cb = cb; 5535 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 5536 5537 bdev_io_submit(bdev_io); 5538 5539 return 0; 5540 } 5541 5542 int 5543 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5544 uint64_t offset, uint64_t len, 5545 spdk_bdev_io_completion_cb cb, void *cb_arg) 5546 { 5547 uint64_t offset_blocks, num_blocks; 5548 5549 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5550 len, &num_blocks) != 0) { 5551 return -EINVAL; 5552 } 5553 5554 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5555 } 5556 5557 int 5558 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5559 uint64_t offset_blocks, uint64_t num_blocks, 5560 spdk_bdev_io_completion_cb cb, void *cb_arg) 5561 { 5562 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5563 struct spdk_bdev_io *bdev_io; 5564 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5565 5566 if (!desc->write) { 5567 return -EBADF; 5568 } 5569 5570 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5571 return -EINVAL; 5572 } 5573 5574 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 5575 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 5576 return -ENOTSUP; 5577 } 5578 5579 bdev_io = bdev_channel_get_io(channel); 5580 5581 if (!bdev_io) { 5582 return -ENOMEM; 5583 } 5584 5585 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 5586 bdev_io->internal.ch = channel; 5587 bdev_io->internal.desc = desc; 5588 bdev_io->u.bdev.offset_blocks = offset_blocks; 5589 bdev_io->u.bdev.num_blocks = num_blocks; 5590 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5591 bdev_io->u.bdev.memory_domain = NULL; 5592 bdev_io->u.bdev.memory_domain_ctx = NULL; 5593 5594 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 5595 bdev_io_submit(bdev_io); 5596 return 0; 5597 } 5598 5599 assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); 5600 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 5601 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 5602 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 5603 bdev_write_zero_buffer_next(bdev_io); 5604 5605 return 0; 5606 } 5607 5608 int 5609 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5610 uint64_t offset, uint64_t nbytes, 5611 spdk_bdev_io_completion_cb cb, void *cb_arg) 5612 { 5613 uint64_t offset_blocks, num_blocks; 5614 5615 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5616 nbytes, &num_blocks) != 0) { 5617 return -EINVAL; 5618 } 5619 5620 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5621 } 5622 5623 int 5624 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5625 uint64_t offset_blocks, uint64_t num_blocks, 5626 spdk_bdev_io_completion_cb cb, void *cb_arg) 5627 { 5628 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5629 struct spdk_bdev_io *bdev_io; 5630 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5631 5632 if (!desc->write) { 5633 return -EBADF; 5634 } 5635 5636 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5637 return -EINVAL; 5638 } 5639 5640 if (num_blocks == 0) { 5641 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 5642 return -EINVAL; 5643 } 5644 5645 bdev_io = bdev_channel_get_io(channel); 5646 if (!bdev_io) { 5647 
return -ENOMEM; 5648 } 5649 5650 bdev_io->internal.ch = channel; 5651 bdev_io->internal.desc = desc; 5652 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 5653 5654 bdev_io->u.bdev.iovs = &bdev_io->iov; 5655 bdev_io->u.bdev.iovs[0].iov_base = NULL; 5656 bdev_io->u.bdev.iovs[0].iov_len = 0; 5657 bdev_io->u.bdev.iovcnt = 1; 5658 5659 bdev_io->u.bdev.offset_blocks = offset_blocks; 5660 bdev_io->u.bdev.num_blocks = num_blocks; 5661 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5662 bdev_io->u.bdev.memory_domain = NULL; 5663 bdev_io->u.bdev.memory_domain_ctx = NULL; 5664 5665 bdev_io_submit(bdev_io); 5666 return 0; 5667 } 5668 5669 int 5670 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5671 uint64_t offset, uint64_t length, 5672 spdk_bdev_io_completion_cb cb, void *cb_arg) 5673 { 5674 uint64_t offset_blocks, num_blocks; 5675 5676 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5677 length, &num_blocks) != 0) { 5678 return -EINVAL; 5679 } 5680 5681 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5682 } 5683 5684 int 5685 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5686 uint64_t offset_blocks, uint64_t num_blocks, 5687 spdk_bdev_io_completion_cb cb, void *cb_arg) 5688 { 5689 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5690 struct spdk_bdev_io *bdev_io; 5691 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5692 5693 if (!desc->write) { 5694 return -EBADF; 5695 } 5696 5697 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5698 return -EINVAL; 5699 } 5700 5701 bdev_io = bdev_channel_get_io(channel); 5702 if (!bdev_io) { 5703 return -ENOMEM; 5704 } 5705 5706 bdev_io->internal.ch = channel; 5707 bdev_io->internal.desc = desc; 5708 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 5709 bdev_io->u.bdev.iovs = NULL; 5710 bdev_io->u.bdev.iovcnt = 0; 5711 bdev_io->u.bdev.offset_blocks = offset_blocks; 5712 bdev_io->u.bdev.num_blocks = num_blocks; 5713 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5714 5715 bdev_io_submit(bdev_io); 5716 return 0; 5717 } 5718 5719 static int bdev_reset_poll_for_outstanding_io(void *ctx); 5720 5721 static void 5722 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 5723 { 5724 struct spdk_bdev_channel *ch = _ctx; 5725 struct spdk_bdev_io *bdev_io; 5726 5727 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5728 5729 if (status == -EBUSY) { 5730 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 5731 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 5732 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 5733 } else { 5734 /* If outstanding IOs are still present and reset_io_drain_timeout seconds passed, 5735 * start the reset. */ 5736 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5737 bdev_io_submit_reset(bdev_io); 5738 } 5739 } else { 5740 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5741 SPDK_DEBUGLOG(bdev, 5742 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 5743 ch->bdev->name); 5744 /* Mark the completion status as a SUCCESS and complete the reset. 
*/ 5745 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 5746 } 5747 } 5748 5749 static void 5750 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5751 struct spdk_io_channel *io_ch, void *_ctx) 5752 { 5753 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 5754 int status = 0; 5755 5756 if (cur_ch->io_outstanding > 0) { 5757 /* If a channel has outstanding IO, set status to -EBUSY code. This will stop 5758 * further iteration over the rest of the channels and pass non-zero status 5759 * to the callback function. */ 5760 status = -EBUSY; 5761 } 5762 spdk_bdev_for_each_channel_continue(i, status); 5763 } 5764 5765 static int 5766 bdev_reset_poll_for_outstanding_io(void *ctx) 5767 { 5768 struct spdk_bdev_channel *ch = ctx; 5769 struct spdk_bdev_io *bdev_io; 5770 5771 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5772 5773 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 5774 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 5775 bdev_reset_check_outstanding_io_done); 5776 5777 return SPDK_POLLER_BUSY; 5778 } 5779 5780 static void 5781 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 5782 { 5783 struct spdk_bdev_channel *ch = _ctx; 5784 struct spdk_bdev_io *bdev_io; 5785 5786 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5787 5788 if (bdev->reset_io_drain_timeout == 0) { 5789 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5790 5791 bdev_io_submit_reset(bdev_io); 5792 return; 5793 } 5794 5795 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 5796 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 5797 5798 /* In case bdev->reset_io_drain_timeout is not equal to zero, 5799 * submit the reset to the underlying module only if outstanding I/O 5800 * remain after reset_io_drain_timeout seconds have passed. */ 5801 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 5802 bdev_reset_check_outstanding_io_done); 5803 } 5804 5805 static void 5806 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5807 struct spdk_io_channel *ch, void *_ctx) 5808 { 5809 struct spdk_bdev_channel *channel; 5810 struct spdk_bdev_mgmt_channel *mgmt_channel; 5811 struct spdk_bdev_shared_resource *shared_resource; 5812 bdev_io_tailq_t tmp_queued; 5813 5814 TAILQ_INIT(&tmp_queued); 5815 5816 channel = __io_ch_to_bdev_ch(ch); 5817 shared_resource = channel->shared_resource; 5818 mgmt_channel = shared_resource->mgmt_ch; 5819 5820 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 5821 5822 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 5823 /* The QoS object is always valid and readable while 5824 * the channel flag is set, so the lock here should not 5825 * be necessary. We're not in the fast path though, so 5826 * just take it anyway. 
*/ 5827 spdk_spin_lock(&channel->bdev->internal.spinlock); 5828 if (channel->bdev->internal.qos->ch == channel) { 5829 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 5830 } 5831 spdk_spin_unlock(&channel->bdev->internal.spinlock); 5832 } 5833 5834 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 5835 bdev_abort_all_buf_io(mgmt_channel, channel); 5837 bdev_abort_all_queued_io(&tmp_queued, channel); 5838 5839 spdk_bdev_for_each_channel_continue(i, 0); 5840 } 5841 5842 static void 5843 bdev_start_reset(void *ctx) 5844 { 5845 struct spdk_bdev_channel *ch = ctx; 5846 5847 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, 5848 bdev_reset_freeze_channel_done); 5849 } 5850 5851 static void 5852 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 5853 { 5854 struct spdk_bdev *bdev = ch->bdev; 5855 5856 assert(!TAILQ_EMPTY(&ch->queued_resets)); 5857 5858 spdk_spin_lock(&bdev->internal.spinlock); 5859 if (bdev->internal.reset_in_progress == NULL) { 5860 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 5861 /* 5862 * Take a channel reference for the target bdev for the life of this 5863 * reset. This guards against the channel getting destroyed while 5864 * spdk_bdev_for_each_channel() calls related to this reset IO are in 5865 * progress. We will release the reference when this reset is 5866 * completed. 5867 */ 5868 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 5869 bdev_start_reset(ch); 5870 } 5871 spdk_spin_unlock(&bdev->internal.spinlock); 5872 } 5873 5874 int 5875 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5876 spdk_bdev_io_completion_cb cb, void *cb_arg) 5877 { 5878 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5879 struct spdk_bdev_io *bdev_io; 5880 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5881 5882 bdev_io = bdev_channel_get_io(channel); 5883 if (!bdev_io) { 5884 return -ENOMEM; 5885 } 5886 5887 bdev_io->internal.ch = channel; 5888 bdev_io->internal.desc = desc; 5889 bdev_io->internal.submit_tsc = spdk_get_ticks(); 5890 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 5891 bdev_io->u.reset.ch_ref = NULL; 5892 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5893 5894 spdk_spin_lock(&bdev->internal.spinlock); 5895 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 5896 spdk_spin_unlock(&bdev->internal.spinlock); 5897 5898 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 5899 internal.ch_link); 5900 5901 bdev_channel_start_reset(channel); 5902 5903 return 0; 5904 } 5905 5906 void 5907 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 5908 struct spdk_bdev_io_stat *stat) 5909 { 5910 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5911 5912 bdev_get_io_stat(stat, channel->stat); 5913 } 5914 5915 static void 5916 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 5917 { 5918 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 5919 5920 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 5921 bdev_iostat_ctx->cb_arg, 0); 5922 free(bdev_iostat_ctx); 5923 } 5924 5925 static void 5926 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5927 struct spdk_io_channel *ch, void *_ctx) 5928 { 5929 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 5930 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5931 5932
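/* Fold this channel's counters into the aggregate that the caller supplied;
 * the combined totals are reported by bdev_get_device_stat_done() once every
 * channel has been visited.
 */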
spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 5933 spdk_bdev_for_each_channel_continue(i, 0); 5934 } 5935 5936 void 5937 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 5938 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 5939 { 5940 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 5941 5942 assert(bdev != NULL); 5943 assert(stat != NULL); 5944 assert(cb != NULL); 5945 5946 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 5947 if (bdev_iostat_ctx == NULL) { 5948 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 5949 cb(bdev, stat, cb_arg, -ENOMEM); 5950 return; 5951 } 5952 5953 bdev_iostat_ctx->stat = stat; 5954 bdev_iostat_ctx->cb = cb; 5955 bdev_iostat_ctx->cb_arg = cb_arg; 5956 5957 /* Start with the statistics from previously deleted channels. */ 5958 spdk_spin_lock(&bdev->internal.spinlock); 5959 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 5960 spdk_spin_unlock(&bdev->internal.spinlock); 5961 5962 /* Then iterate and add the statistics from each existing channel. */ 5963 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 5964 bdev_get_device_stat_done); 5965 } 5966 5967 struct bdev_iostat_reset_ctx { 5968 enum spdk_bdev_reset_stat_mode mode; 5969 bdev_reset_device_stat_cb cb; 5970 void *cb_arg; 5971 }; 5972 5973 static void 5974 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 5975 { 5976 struct bdev_iostat_reset_ctx *ctx = _ctx; 5977 5978 ctx->cb(bdev, ctx->cb_arg, 0); 5979 5980 free(ctx); 5981 } 5982 5983 static void 5984 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5985 struct spdk_io_channel *ch, void *_ctx) 5986 { 5987 struct bdev_iostat_reset_ctx *ctx = _ctx; 5988 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5989 5990 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 5991 5992 spdk_bdev_for_each_channel_continue(i, 0); 5993 } 5994 5995 void 5996 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 5997 bdev_reset_device_stat_cb cb, void *cb_arg) 5998 { 5999 struct bdev_iostat_reset_ctx *ctx; 6000 6001 assert(bdev != NULL); 6002 assert(cb != NULL); 6003 6004 ctx = calloc(1, sizeof(*ctx)); 6005 if (ctx == NULL) { 6006 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6007 cb(bdev, cb_arg, -ENOMEM); 6008 return; 6009 } 6010 6011 ctx->mode = mode; 6012 ctx->cb = cb; 6013 ctx->cb_arg = cb_arg; 6014 6015 spdk_spin_lock(&bdev->internal.spinlock); 6016 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6017 spdk_spin_unlock(&bdev->internal.spinlock); 6018 6019 spdk_bdev_for_each_channel(bdev, 6020 bdev_reset_each_channel_stat, 6021 ctx, 6022 bdev_reset_device_stat_done); 6023 } 6024 6025 int 6026 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6027 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6028 spdk_bdev_io_completion_cb cb, void *cb_arg) 6029 { 6030 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6031 struct spdk_bdev_io *bdev_io; 6032 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6033 6034 if (!desc->write) { 6035 return -EBADF; 6036 } 6037 6038 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6039 return -ENOTSUP; 6040 } 6041 6042 bdev_io = bdev_channel_get_io(channel); 6043 if (!bdev_io) { 6044 return -ENOMEM; 6045 } 6046 6047 bdev_io->internal.ch = channel; 6048 bdev_io->internal.desc = desc; 6049 bdev_io->type = 
SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6050 bdev_io->u.nvme_passthru.cmd = *cmd; 6051 bdev_io->u.nvme_passthru.buf = buf; 6052 bdev_io->u.nvme_passthru.nbytes = nbytes; 6053 bdev_io->u.nvme_passthru.md_buf = NULL; 6054 bdev_io->u.nvme_passthru.md_len = 0; 6055 6056 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6057 6058 bdev_io_submit(bdev_io); 6059 return 0; 6060 } 6061 6062 int 6063 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6064 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6065 spdk_bdev_io_completion_cb cb, void *cb_arg) 6066 { 6067 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6068 struct spdk_bdev_io *bdev_io; 6069 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6070 6071 if (!desc->write) { 6072 /* 6073 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6074 * to easily determine if the command is a read or write, but for now just 6075 * do not allow io_passthru with a read-only descriptor. 6076 */ 6077 return -EBADF; 6078 } 6079 6080 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6081 return -ENOTSUP; 6082 } 6083 6084 bdev_io = bdev_channel_get_io(channel); 6085 if (!bdev_io) { 6086 return -ENOMEM; 6087 } 6088 6089 bdev_io->internal.ch = channel; 6090 bdev_io->internal.desc = desc; 6091 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6092 bdev_io->u.nvme_passthru.cmd = *cmd; 6093 bdev_io->u.nvme_passthru.buf = buf; 6094 bdev_io->u.nvme_passthru.nbytes = nbytes; 6095 bdev_io->u.nvme_passthru.md_buf = NULL; 6096 bdev_io->u.nvme_passthru.md_len = 0; 6097 6098 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6099 6100 bdev_io_submit(bdev_io); 6101 return 0; 6102 } 6103 6104 int 6105 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6106 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6107 spdk_bdev_io_completion_cb cb, void *cb_arg) 6108 { 6109 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6110 struct spdk_bdev_io *bdev_io; 6111 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6112 6113 if (!desc->write) { 6114 /* 6115 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6116 * to easily determine if the command is a read or write, but for now just 6117 * do not allow io_passthru with a read-only descriptor. 
6118 */ 6119 return -EBADF; 6120 } 6121 6122 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6123 return -ENOTSUP; 6124 } 6125 6126 bdev_io = bdev_channel_get_io(channel); 6127 if (!bdev_io) { 6128 return -ENOMEM; 6129 } 6130 6131 bdev_io->internal.ch = channel; 6132 bdev_io->internal.desc = desc; 6133 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6134 bdev_io->u.nvme_passthru.cmd = *cmd; 6135 bdev_io->u.nvme_passthru.buf = buf; 6136 bdev_io->u.nvme_passthru.nbytes = nbytes; 6137 bdev_io->u.nvme_passthru.md_buf = md_buf; 6138 bdev_io->u.nvme_passthru.md_len = md_len; 6139 6140 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6141 6142 bdev_io_submit(bdev_io); 6143 return 0; 6144 } 6145 6146 static void bdev_abort_retry(void *ctx); 6147 static void bdev_abort(struct spdk_bdev_io *parent_io); 6148 6149 static void 6150 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6151 { 6152 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6153 struct spdk_bdev_io *parent_io = cb_arg; 6154 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6155 6156 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6157 6158 spdk_bdev_free_io(bdev_io); 6159 6160 if (!success) { 6161 /* Check if the target I/O completed in the meantime. */ 6162 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6163 if (tmp_io == bio_to_abort) { 6164 break; 6165 } 6166 } 6167 6168 /* If the target I/O still exists, set the parent to failed. */ 6169 if (tmp_io != NULL) { 6170 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6171 } 6172 } 6173 6174 parent_io->u.bdev.split_outstanding--; 6175 if (parent_io->u.bdev.split_outstanding == 0) { 6176 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6177 bdev_abort_retry(parent_io); 6178 } else { 6179 bdev_io_complete(parent_io); 6180 } 6181 } 6182 } 6183 6184 static int 6185 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6186 struct spdk_bdev_io *bio_to_abort, 6187 spdk_bdev_io_completion_cb cb, void *cb_arg) 6188 { 6189 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6190 struct spdk_bdev_io *bdev_io; 6191 6192 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 6193 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 6194 /* TODO: Abort reset or abort request. */ 6195 return -ENOTSUP; 6196 } 6197 6198 bdev_io = bdev_channel_get_io(channel); 6199 if (bdev_io == NULL) { 6200 return -ENOMEM; 6201 } 6202 6203 bdev_io->internal.ch = channel; 6204 bdev_io->internal.desc = desc; 6205 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6206 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6207 6208 if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bio_to_abort)) { 6209 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 6210 6211 /* Parent abort request is not submitted directly, but to manage its 6212 * execution add it to the submitted list here. 6213 */ 6214 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6215 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6216 6217 bdev_abort(bdev_io); 6218 6219 return 0; 6220 } 6221 6222 bdev_io->u.abort.bio_to_abort = bio_to_abort; 6223 6224 /* Submit the abort request to the underlying bdev module. 
*/ 6225 bdev_io_submit(bdev_io); 6226 6227 return 0; 6228 } 6229 6230 static uint32_t 6231 _bdev_abort(struct spdk_bdev_io *parent_io) 6232 { 6233 struct spdk_bdev_desc *desc = parent_io->internal.desc; 6234 struct spdk_bdev_channel *channel = parent_io->internal.ch; 6235 void *bio_cb_arg; 6236 struct spdk_bdev_io *bio_to_abort; 6237 uint32_t matched_ios; 6238 int rc; 6239 6240 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 6241 6242 /* matched_ios is returned and will be kept by the caller. 6243 * 6244 * This function is used for two cases: 1) the same cb_arg is used for 6245 * multiple I/Os, 2) a single large I/O is split into smaller ones. 6246 * Incrementing split_outstanding directly here may confuse readers, especially 6247 * for the 1st case. 6248 * 6249 * Completion of I/O abort is processed after stack unwinding. Hence this trick 6250 * works as expected. 6251 */ 6252 matched_ios = 0; 6253 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6254 6255 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 6256 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 6257 continue; 6258 } 6259 6260 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 6261 /* Any I/O which was submitted after this abort command should be excluded. */ 6262 continue; 6263 } 6264 6265 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 6266 if (rc != 0) { 6267 if (rc == -ENOMEM) { 6268 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 6269 } else { 6270 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6271 } 6272 break; 6273 } 6274 matched_ios++; 6275 } 6276 6277 return matched_ios; 6278 } 6279 6280 static void 6281 bdev_abort_retry(void *ctx) 6282 { 6283 struct spdk_bdev_io *parent_io = ctx; 6284 uint32_t matched_ios; 6285 6286 matched_ios = _bdev_abort(parent_io); 6287 6288 if (matched_ios == 0) { 6289 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6290 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6291 } else { 6292 /* For retry, the case that no target I/O was found is a success 6293 * because it means target I/Os completed in the meantime. 6294 */ 6295 bdev_io_complete(parent_io); 6296 } 6297 return; 6298 } 6299 6300 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6301 parent_io->u.bdev.split_outstanding = matched_ios; 6302 } 6303 6304 static void 6305 bdev_abort(struct spdk_bdev_io *parent_io) 6306 { 6307 uint32_t matched_ios; 6308 6309 matched_ios = _bdev_abort(parent_io); 6310 6311 if (matched_ios == 0) { 6312 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6313 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6314 } else { 6315 /* The case where no target I/O was found is a failure. */ 6316 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6317 bdev_io_complete(parent_io); 6318 } 6319 return; 6320 } 6321 6322 /* Use split_outstanding to manage the progress of aborting I/Os.
*/ 6323 parent_io->u.bdev.split_outstanding = matched_ios; 6324 } 6325 6326 int 6327 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6328 void *bio_cb_arg, 6329 spdk_bdev_io_completion_cb cb, void *cb_arg) 6330 { 6331 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6332 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6333 struct spdk_bdev_io *bdev_io; 6334 6335 if (bio_cb_arg == NULL) { 6336 return -EINVAL; 6337 } 6338 6339 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 6340 return -ENOTSUP; 6341 } 6342 6343 bdev_io = bdev_channel_get_io(channel); 6344 if (bdev_io == NULL) { 6345 return -ENOMEM; 6346 } 6347 6348 bdev_io->internal.ch = channel; 6349 bdev_io->internal.desc = desc; 6350 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6351 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6352 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6353 6354 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 6355 6356 /* Parent abort request is not submitted directly, but to manage its execution, 6357 * add it to the submitted list here. 6358 */ 6359 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6360 6361 bdev_abort(bdev_io); 6362 6363 return 0; 6364 } 6365 6366 int 6367 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6368 struct spdk_bdev_io_wait_entry *entry) 6369 { 6370 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6371 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 6372 6373 if (bdev != entry->bdev) { 6374 SPDK_ERRLOG("bdevs do not match\n"); 6375 return -EINVAL; 6376 } 6377 6378 if (mgmt_ch->per_thread_cache_count > 0) { 6379 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 6380 return -EINVAL; 6381 } 6382 6383 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 6384 return 0; 6385 } 6386 6387 static inline void 6388 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 6389 { 6390 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 6391 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 6392 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 6393 uint32_t blocklen = bdev_io->bdev->blocklen; 6394 6395 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 6396 switch (bdev_io->type) { 6397 case SPDK_BDEV_IO_TYPE_READ: 6398 io_stat->bytes_read += num_blocks * blocklen; 6399 io_stat->num_read_ops++; 6400 io_stat->read_latency_ticks += tsc_diff; 6401 if (io_stat->max_read_latency_ticks < tsc_diff) { 6402 io_stat->max_read_latency_ticks = tsc_diff; 6403 } 6404 if (io_stat->min_read_latency_ticks > tsc_diff) { 6405 io_stat->min_read_latency_ticks = tsc_diff; 6406 } 6407 break; 6408 case SPDK_BDEV_IO_TYPE_WRITE: 6409 io_stat->bytes_written += num_blocks * blocklen; 6410 io_stat->num_write_ops++; 6411 io_stat->write_latency_ticks += tsc_diff; 6412 if (io_stat->max_write_latency_ticks < tsc_diff) { 6413 io_stat->max_write_latency_ticks = tsc_diff; 6414 } 6415 if (io_stat->min_write_latency_ticks > tsc_diff) { 6416 io_stat->min_write_latency_ticks = tsc_diff; 6417 } 6418 break; 6419 case SPDK_BDEV_IO_TYPE_UNMAP: 6420 io_stat->bytes_unmapped += num_blocks * blocklen; 6421 io_stat->num_unmap_ops++; 6422 io_stat->unmap_latency_ticks += tsc_diff; 6423 if (io_stat->max_unmap_latency_ticks < tsc_diff) { 6424 io_stat->max_unmap_latency_ticks = tsc_diff; 6425 } 6426 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 6427 io_stat->min_unmap_latency_ticks = tsc_diff; 6428 } 6429 break; 6430 
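/* Zero-copy I/O is charged to the statistics only when its start phase
 * completes: a populating start counts toward the read statistics, while a
 * non-populating start counts toward the write statistics.
 */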
case SPDK_BDEV_IO_TYPE_ZCOPY: 6431 /* Track the data in the start phase only */ 6432 if (bdev_io->u.bdev.zcopy.start) { 6433 if (bdev_io->u.bdev.zcopy.populate) { 6434 io_stat->bytes_read += num_blocks * blocklen; 6435 io_stat->num_read_ops++; 6436 io_stat->read_latency_ticks += tsc_diff; 6437 if (io_stat->max_read_latency_ticks < tsc_diff) { 6438 io_stat->max_read_latency_ticks = tsc_diff; 6439 } 6440 if (io_stat->min_read_latency_ticks > tsc_diff) { 6441 io_stat->min_read_latency_ticks = tsc_diff; 6442 } 6443 } else { 6444 io_stat->bytes_written += num_blocks * blocklen; 6445 io_stat->num_write_ops++; 6446 io_stat->write_latency_ticks += tsc_diff; 6447 if (io_stat->max_write_latency_ticks < tsc_diff) { 6448 io_stat->max_write_latency_ticks = tsc_diff; 6449 } 6450 if (io_stat->min_write_latency_ticks > tsc_diff) { 6451 io_stat->min_write_latency_ticks = tsc_diff; 6452 } 6453 } 6454 } 6455 break; 6456 case SPDK_BDEV_IO_TYPE_COPY: 6457 io_stat->bytes_copied += num_blocks * blocklen; 6458 io_stat->num_copy_ops++; 6459 bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; 6460 if (io_stat->max_copy_latency_ticks < tsc_diff) { 6461 io_stat->max_copy_latency_ticks = tsc_diff; 6462 } 6463 if (io_stat->min_copy_latency_ticks > tsc_diff) { 6464 io_stat->min_copy_latency_ticks = tsc_diff; 6465 } 6466 break; 6467 default: 6468 break; 6469 } 6470 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 6471 io_stat = bdev_io->bdev->internal.stat; 6472 assert(io_stat->io_error != NULL); 6473 6474 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 6475 io_stat->io_error->error_status[-io_status - 1]++; 6476 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 6477 } 6478 6479 #ifdef SPDK_CONFIG_VTUNE 6480 uint64_t now_tsc = spdk_get_ticks(); 6481 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 6482 uint64_t data[5]; 6483 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 6484 6485 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 6486 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 6487 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 6488 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 6489 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 6490 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 6491 6492 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 6493 __itt_metadata_u64, 5, data); 6494 6495 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 6496 bdev_io->internal.ch->start_tsc = now_tsc; 6497 } 6498 #endif 6499 } 6500 6501 static inline void 6502 _bdev_io_complete(void *ctx) 6503 { 6504 struct spdk_bdev_io *bdev_io = ctx; 6505 6506 assert(bdev_io->internal.cb != NULL); 6507 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 6508 6509 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 6510 bdev_io->internal.caller_ctx); 6511 } 6512 6513 static inline void 6514 bdev_io_complete(void *ctx) 6515 { 6516 struct spdk_bdev_io *bdev_io = ctx; 6517 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6518 uint64_t tsc, tsc_diff; 6519 6520 if (spdk_unlikely(bdev_io->internal.in_submit_request)) { 6521 /* 6522 * Defer completion to avoid potential infinite recursion if the 6523 * user's completion callback issues a new I/O. 
6524 */ 6525 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6526 bdev_io_complete, bdev_io); 6527 return; 6528 } 6529 6530 tsc = spdk_get_ticks(); 6531 tsc_diff = tsc - bdev_io->internal.submit_tsc; 6532 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 6533 bdev_io->internal.caller_ctx); 6534 6535 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 6536 6537 if (bdev_io->internal.ch->histogram) { 6538 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 6539 } 6540 6541 bdev_io_update_io_stat(bdev_io, tsc_diff); 6542 _bdev_io_complete(bdev_io); 6543 } 6544 6545 /* The difference between this function and bdev_io_complete() is that this should be called to 6546 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 6547 * io_submitted list and don't have submit_tsc updated. 6548 */ 6549 static inline void 6550 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 6551 { 6552 /* Since the IO hasn't been submitted it's bound to be failed */ 6553 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 6554 6555 /* At this point we don't know if the IO is completed from submission context or not, but, 6556 * since this is an error path, we can always do an spdk_thread_send_msg(). */ 6557 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6558 _bdev_io_complete, bdev_io); 6559 } 6560 6561 static void bdev_destroy_cb(void *io_device); 6562 6563 static void 6564 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 6565 { 6566 struct spdk_bdev_io *bdev_io = _ctx; 6567 6568 if (bdev_io->u.reset.ch_ref != NULL) { 6569 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 6570 bdev_io->u.reset.ch_ref = NULL; 6571 } 6572 6573 bdev_io_complete(bdev_io); 6574 6575 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 6576 TAILQ_EMPTY(&bdev->internal.open_descs)) { 6577 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6578 } 6579 } 6580 6581 static void 6582 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6583 struct spdk_io_channel *_ch, void *_ctx) 6584 { 6585 struct spdk_bdev_io *bdev_io = _ctx; 6586 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 6587 struct spdk_bdev_io *queued_reset; 6588 6589 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 6590 while (!TAILQ_EMPTY(&ch->queued_resets)) { 6591 queued_reset = TAILQ_FIRST(&ch->queued_resets); 6592 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 6593 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 6594 } 6595 6596 spdk_bdev_for_each_channel_continue(i, 0); 6597 } 6598 6599 void 6600 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 6601 { 6602 struct spdk_bdev *bdev = bdev_io->bdev; 6603 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6604 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 6605 6606 bdev_io->internal.status = status; 6607 6608 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 6609 bool unlock_channels = false; 6610 6611 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 6612 SPDK_ERRLOG("NOMEM returned for reset\n"); 6613 } 6614 spdk_spin_lock(&bdev->internal.spinlock); 6615 if (bdev_io == bdev->internal.reset_in_progress) { 6616 bdev->internal.reset_in_progress = NULL; 6617 unlock_channels = true; 6618 } 6619 spdk_spin_unlock(&bdev->internal.spinlock); 6620 6621 if (unlock_channels) { 6622 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, 
bdev_io, 6623 bdev_reset_complete); 6624 return; 6625 } 6626 } else { 6627 if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0)) { 6628 _bdev_io_push_bounce_data_buffer(bdev_io, _bdev_io_complete_push_bounce_done); 6629 /* bdev IO will be completed in the callback */ 6630 return; 6631 } 6632 6633 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 6634 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 6635 return; 6636 } 6637 } 6638 6639 bdev_io_complete(bdev_io); 6640 } 6641 6642 void 6643 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 6644 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 6645 { 6646 if (sc == SPDK_SCSI_STATUS_GOOD) { 6647 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6648 } else { 6649 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 6650 bdev_io->internal.error.scsi.sc = sc; 6651 bdev_io->internal.error.scsi.sk = sk; 6652 bdev_io->internal.error.scsi.asc = asc; 6653 bdev_io->internal.error.scsi.ascq = ascq; 6654 } 6655 6656 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6657 } 6658 6659 void 6660 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 6661 int *sc, int *sk, int *asc, int *ascq) 6662 { 6663 assert(sc != NULL); 6664 assert(sk != NULL); 6665 assert(asc != NULL); 6666 assert(ascq != NULL); 6667 6668 switch (bdev_io->internal.status) { 6669 case SPDK_BDEV_IO_STATUS_SUCCESS: 6670 *sc = SPDK_SCSI_STATUS_GOOD; 6671 *sk = SPDK_SCSI_SENSE_NO_SENSE; 6672 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 6673 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 6674 break; 6675 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 6676 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 6677 break; 6678 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 6679 *sc = bdev_io->internal.error.scsi.sc; 6680 *sk = bdev_io->internal.error.scsi.sk; 6681 *asc = bdev_io->internal.error.scsi.asc; 6682 *ascq = bdev_io->internal.error.scsi.ascq; 6683 break; 6684 default: 6685 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 6686 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 6687 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 6688 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 6689 break; 6690 } 6691 } 6692 6693 void 6694 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 6695 { 6696 if (aio_result == 0) { 6697 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6698 } else { 6699 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 6700 } 6701 6702 bdev_io->internal.error.aio_result = aio_result; 6703 6704 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6705 } 6706 6707 void 6708 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 6709 { 6710 assert(aio_result != NULL); 6711 6712 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 6713 *aio_result = bdev_io->internal.error.aio_result; 6714 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6715 *aio_result = 0; 6716 } else { 6717 *aio_result = -EIO; 6718 } 6719 } 6720 6721 void 6722 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 6723 { 6724 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 6725 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6726 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 6727 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 6728 } else { 6729 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 6730 } 6731 6732 bdev_io->internal.error.nvme.cdw0 = cdw0; 
6733 bdev_io->internal.error.nvme.sct = sct; 6734 bdev_io->internal.error.nvme.sc = sc; 6735 6736 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6737 } 6738 6739 void 6740 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 6741 { 6742 assert(sct != NULL); 6743 assert(sc != NULL); 6744 assert(cdw0 != NULL); 6745 6746 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 6747 *sct = SPDK_NVME_SCT_GENERIC; 6748 *sc = SPDK_NVME_SC_SUCCESS; 6749 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6750 *cdw0 = 0; 6751 } else { 6752 *cdw0 = 1U; 6753 } 6754 return; 6755 } 6756 6757 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 6758 *sct = bdev_io->internal.error.nvme.sct; 6759 *sc = bdev_io->internal.error.nvme.sc; 6760 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6761 *sct = SPDK_NVME_SCT_GENERIC; 6762 *sc = SPDK_NVME_SC_SUCCESS; 6763 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 6764 *sct = SPDK_NVME_SCT_GENERIC; 6765 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6766 } else { 6767 *sct = SPDK_NVME_SCT_GENERIC; 6768 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6769 } 6770 6771 *cdw0 = bdev_io->internal.error.nvme.cdw0; 6772 } 6773 6774 void 6775 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 6776 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 6777 { 6778 assert(first_sct != NULL); 6779 assert(first_sc != NULL); 6780 assert(second_sct != NULL); 6781 assert(second_sc != NULL); 6782 assert(cdw0 != NULL); 6783 6784 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 6785 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 6786 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 6787 *first_sct = bdev_io->internal.error.nvme.sct; 6788 *first_sc = bdev_io->internal.error.nvme.sc; 6789 *second_sct = SPDK_NVME_SCT_GENERIC; 6790 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6791 } else { 6792 *first_sct = SPDK_NVME_SCT_GENERIC; 6793 *first_sc = SPDK_NVME_SC_SUCCESS; 6794 *second_sct = bdev_io->internal.error.nvme.sct; 6795 *second_sc = bdev_io->internal.error.nvme.sc; 6796 } 6797 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 6798 *first_sct = SPDK_NVME_SCT_GENERIC; 6799 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6800 *second_sct = SPDK_NVME_SCT_GENERIC; 6801 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6802 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6803 *first_sct = SPDK_NVME_SCT_GENERIC; 6804 *first_sc = SPDK_NVME_SC_SUCCESS; 6805 *second_sct = SPDK_NVME_SCT_GENERIC; 6806 *second_sc = SPDK_NVME_SC_SUCCESS; 6807 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 6808 *first_sct = SPDK_NVME_SCT_GENERIC; 6809 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6810 *second_sct = SPDK_NVME_SCT_GENERIC; 6811 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6812 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 6813 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 6814 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 6815 *second_sct = SPDK_NVME_SCT_GENERIC; 6816 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6817 } else { 6818 *first_sct = SPDK_NVME_SCT_GENERIC; 6819 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6820 *second_sct = SPDK_NVME_SCT_GENERIC; 6821 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6822 } 6823 6824 *cdw0 = bdev_io->internal.error.nvme.cdw0; 6825 } 6826 
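/* A minimal usage sketch, assuming a hypothetical frontend completion callback
 * that reports NVMe status back to an initiator; the names frontend_read_done,
 * my_request and its cpl field are illustrative assumptions, not SPDK APIs:
 *
 *   static void
 *   frontend_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *   {
 *           struct my_request *req = cb_arg;
 *           uint32_t cdw0;
 *           int sct, sc;
 *
 *           // Translate the bdev completion into NVMe status fields.
 *           spdk_bdev_io_get_nvme_status(bdev_io, &cdw0, &sct, &sc);
 *           req->cpl.cdw0 = cdw0;
 *           req->cpl.status.sct = sct;
 *           req->cpl.status.sc = sc;
 *
 *           // The callback owns the bdev_io and must release it when done.
 *           spdk_bdev_free_io(bdev_io);
 *   }
 *
 * spdk_bdev_io_get_nvme_status() maps failure statuses that carry no
 * NVMe-specific codes to generic NVMe status values, so the callback does not
 * need to special-case them.
 */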
6827 struct spdk_thread * 6828 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 6829 { 6830 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 6831 } 6832 6833 struct spdk_io_channel * 6834 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 6835 { 6836 return bdev_io->internal.ch->channel; 6837 } 6838 6839 static int 6840 bdev_register(struct spdk_bdev *bdev) 6841 { 6842 char *bdev_name; 6843 char uuid[SPDK_UUID_STRING_LEN]; 6844 int ret, i; 6845 6846 assert(bdev->module != NULL); 6847 6848 if (!bdev->name) { 6849 SPDK_ERRLOG("Bdev name is NULL\n"); 6850 return -EINVAL; 6851 } 6852 6853 if (!strlen(bdev->name)) { 6854 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 6855 return -EINVAL; 6856 } 6857 6858 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 6859 if (bdev->fn_table->accel_sequence_supported == NULL) { 6860 continue; 6861 } 6862 if (!bdev->fn_table->accel_sequence_supported(bdev->ctxt, 6863 (enum spdk_bdev_io_type)i)) { 6864 continue; 6865 } 6866 6867 if (spdk_bdev_get_memory_domains(bdev, NULL, 0) <= 0) { 6868 SPDK_ERRLOG("bdev supporting accel sequence is required to support " 6869 "memory domains\n"); 6870 return -EINVAL; 6871 } 6872 6873 if (spdk_bdev_is_md_separate(bdev)) { 6874 SPDK_ERRLOG("Separate metadata is currently unsupported for bdevs with " 6875 "accel sequence support\n"); 6876 return -EINVAL; 6877 } 6878 } 6879 6880 /* Users often register their own I/O devices using the bdev name. In 6881 * order to avoid conflicts, prepend bdev_. */ 6882 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 6883 if (!bdev_name) { 6884 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 6885 return -ENOMEM; 6886 } 6887 6888 bdev->internal.stat = bdev_alloc_io_stat(true); 6889 if (!bdev->internal.stat) { 6890 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 6891 free(bdev_name); 6892 return -ENOMEM; 6893 } 6894 6895 bdev->internal.status = SPDK_BDEV_STATUS_READY; 6896 bdev->internal.measured_queue_depth = UINT64_MAX; 6897 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 6898 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 6899 bdev->internal.qd_poller = NULL; 6900 bdev->internal.qos = NULL; 6901 6902 TAILQ_INIT(&bdev->internal.open_descs); 6903 TAILQ_INIT(&bdev->internal.locked_ranges); 6904 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 6905 TAILQ_INIT(&bdev->aliases); 6906 6907 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 6908 if (ret != 0) { 6909 bdev_free_io_stat(bdev->internal.stat); 6910 free(bdev_name); 6911 return ret; 6912 } 6913 6914 /* UUID has to be specified by the user or defined by bdev itself. 6915 * Otherwise this field must remain empty, to indicate that this 6916 * value cannot be depended upon. 
*/ 6917 if (!spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { 6918 /* Add the UUID alias only if it's different than the name */ 6919 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 6920 if (strcmp(bdev->name, uuid) != 0) { 6921 ret = spdk_bdev_alias_add(bdev, uuid); 6922 if (ret != 0) { 6923 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 6924 bdev_name_del(&bdev->internal.bdev_name); 6925 bdev_free_io_stat(bdev->internal.stat); 6926 free(bdev_name); 6927 return ret; 6928 } 6929 } 6930 } 6931 6932 if (spdk_bdev_get_buf_align(bdev) > 1) { 6933 if (bdev->split_on_optimal_io_boundary) { 6934 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 6935 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 6936 } else { 6937 bdev->split_on_optimal_io_boundary = true; 6938 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 6939 } 6940 } 6941 6942 /* If the user didn't specify a write unit size, set it to one. */ 6943 if (bdev->write_unit_size == 0) { 6944 bdev->write_unit_size = 1; 6945 } 6946 6947 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 6948 if (bdev->acwu == 0) { 6949 bdev->acwu = bdev->write_unit_size; 6950 } 6951 6952 if (bdev->phys_blocklen == 0) { 6953 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 6954 } 6955 6956 bdev->internal.reset_in_progress = NULL; 6957 bdev->internal.qd_poll_in_progress = false; 6958 bdev->internal.period = 0; 6959 bdev->internal.new_period = 0; 6960 6961 spdk_io_device_register(__bdev_to_io_dev(bdev), 6962 bdev_channel_create, bdev_channel_destroy, 6963 sizeof(struct spdk_bdev_channel), 6964 bdev_name); 6965 6966 free(bdev_name); 6967 6968 spdk_spin_init(&bdev->internal.spinlock); 6969 6970 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 6971 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 6972 6973 return 0; 6974 } 6975 6976 static void 6977 bdev_destroy_cb(void *io_device) 6978 { 6979 int rc; 6980 struct spdk_bdev *bdev; 6981 spdk_bdev_unregister_cb cb_fn; 6982 void *cb_arg; 6983 6984 bdev = __bdev_from_io_dev(io_device); 6985 6986 if (bdev->internal.unregister_td != spdk_get_thread()) { 6987 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 6988 return; 6989 } 6990 6991 cb_fn = bdev->internal.unregister_cb; 6992 cb_arg = bdev->internal.unregister_ctx; 6993 6994 spdk_spin_destroy(&bdev->internal.spinlock); 6995 free(bdev->internal.qos); 6996 bdev_free_io_stat(bdev->internal.stat); 6997 6998 rc = bdev->fn_table->destruct(bdev->ctxt); 6999 if (rc < 0) { 7000 SPDK_ERRLOG("destruct failed\n"); 7001 } 7002 if (rc <= 0 && cb_fn != NULL) { 7003 cb_fn(cb_arg, rc); 7004 } 7005 } 7006 7007 void 7008 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 7009 { 7010 if (bdev->internal.unregister_cb != NULL) { 7011 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 7012 } 7013 } 7014 7015 static void 7016 _remove_notify(void *arg) 7017 { 7018 struct spdk_bdev_desc *desc = arg; 7019 7020 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 7021 } 7022 7023 /* returns: 0 - bdev removed and ready to be destructed. 7024 * -EBUSY - bdev can't be destructed yet. 
*/ 7025 static int 7026 bdev_unregister_unsafe(struct spdk_bdev *bdev) 7027 { 7028 struct spdk_bdev_desc *desc, *tmp; 7029 int rc = 0; 7030 char uuid[SPDK_UUID_STRING_LEN]; 7031 7032 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 7033 assert(spdk_spin_held(&bdev->internal.spinlock)); 7034 7035 /* Notify each descriptor about hotremoval */ 7036 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 7037 rc = -EBUSY; 7038 /* 7039 * Defer invocation of the event_cb to a separate message that will 7040 * run later on its thread. This ensures this context unwinds and 7041 * we don't recursively unregister this bdev again if the event_cb 7042 * immediately closes its descriptor. 7043 */ 7044 event_notify(desc, _remove_notify); 7045 } 7046 7047 /* If there are no descriptors, proceed removing the bdev */ 7048 if (rc == 0) { 7049 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 7050 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 7051 7052 /* Delete the name and the UUID alias */ 7053 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7054 bdev_name_del_unsafe(&bdev->internal.bdev_name); 7055 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 7056 7057 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 7058 7059 if (bdev->internal.reset_in_progress != NULL) { 7060 /* If reset is in progress, let the completion callback for reset 7061 * unregister the bdev. 7062 */ 7063 rc = -EBUSY; 7064 } 7065 } 7066 7067 return rc; 7068 } 7069 7070 static void 7071 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7072 struct spdk_io_channel *io_ch, void *_ctx) 7073 { 7074 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 7075 7076 bdev_channel_abort_queued_ios(bdev_ch); 7077 spdk_bdev_for_each_channel_continue(i, 0); 7078 } 7079 7080 static void 7081 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 7082 { 7083 int rc; 7084 7085 spdk_spin_lock(&g_bdev_mgr.spinlock); 7086 spdk_spin_lock(&bdev->internal.spinlock); 7087 /* 7088 * Set the status to REMOVING after completing to abort channels. Otherwise, 7089 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 7090 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 7091 * may fail. 7092 */ 7093 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 7094 rc = bdev_unregister_unsafe(bdev); 7095 spdk_spin_unlock(&bdev->internal.spinlock); 7096 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7097 7098 if (rc == 0) { 7099 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7100 } 7101 } 7102 7103 void 7104 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7105 { 7106 struct spdk_thread *thread; 7107 7108 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 7109 7110 thread = spdk_get_thread(); 7111 if (!thread) { 7112 /* The user called this from a non-SPDK thread. 
*/ 7113 if (cb_fn != NULL) { 7114 cb_fn(cb_arg, -ENOTSUP); 7115 } 7116 return; 7117 } 7118 7119 spdk_spin_lock(&g_bdev_mgr.spinlock); 7120 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7121 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7122 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7123 if (cb_fn) { 7124 cb_fn(cb_arg, -EBUSY); 7125 } 7126 return; 7127 } 7128 7129 spdk_spin_lock(&bdev->internal.spinlock); 7130 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 7131 bdev->internal.unregister_cb = cb_fn; 7132 bdev->internal.unregister_ctx = cb_arg; 7133 bdev->internal.unregister_td = thread; 7134 spdk_spin_unlock(&bdev->internal.spinlock); 7135 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7136 7137 spdk_bdev_set_qd_sampling_period(bdev, 0); 7138 7139 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 7140 bdev_unregister); 7141 } 7142 7143 int 7144 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 7145 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7146 { 7147 struct spdk_bdev_desc *desc; 7148 struct spdk_bdev *bdev; 7149 int rc; 7150 7151 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 7152 if (rc != 0) { 7153 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 7154 return rc; 7155 } 7156 7157 bdev = spdk_bdev_desc_get_bdev(desc); 7158 7159 if (bdev->module != module) { 7160 spdk_bdev_close(desc); 7161 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 7162 bdev_name); 7163 return -ENODEV; 7164 } 7165 7166 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 7167 7168 spdk_bdev_close(desc); 7169 7170 return 0; 7171 } 7172 7173 static int 7174 bdev_start_qos(struct spdk_bdev *bdev) 7175 { 7176 struct set_qos_limit_ctx *ctx; 7177 7178 /* Enable QoS */ 7179 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 7180 ctx = calloc(1, sizeof(*ctx)); 7181 if (ctx == NULL) { 7182 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 7183 return -ENOMEM; 7184 } 7185 ctx->bdev = bdev; 7186 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 7187 } 7188 7189 return 0; 7190 } 7191 7192 static void 7193 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 7194 struct spdk_bdev *bdev) 7195 { 7196 enum spdk_bdev_claim_type type; 7197 const char *typename, *modname; 7198 extern struct spdk_log_flag SPDK_LOG_bdev; 7199 7200 assert(spdk_spin_held(&bdev->internal.spinlock)); 7201 7202 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 7203 return; 7204 } 7205 7206 type = bdev->internal.claim_type; 7207 typename = spdk_bdev_claim_get_name(type); 7208 7209 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 7210 modname = bdev->internal.claim.v1.module->name; 7211 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7212 bdev->name, detail, typename, modname); 7213 return; 7214 } 7215 7216 if (claim_type_is_v2(type)) { 7217 struct spdk_bdev_module_claim *claim; 7218 7219 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 7220 modname = claim->module->name; 7221 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7222 bdev->name, detail, typename, modname); 7223 } 7224 return; 7225 } 7226 7227 assert(false); 7228 } 7229 7230 static int 7231 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 7232 { 7233 struct spdk_thread *thread; 7234 int rc = 0; 7235 7236 thread = spdk_get_thread(); 7237 if (!thread) { 7238 
SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 7239 return -ENOTSUP; 7240 } 7241 7242 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7243 spdk_get_thread()); 7244 7245 desc->bdev = bdev; 7246 desc->thread = thread; 7247 desc->write = write; 7248 7249 spdk_spin_lock(&bdev->internal.spinlock); 7250 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7251 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7252 spdk_spin_unlock(&bdev->internal.spinlock); 7253 return -ENODEV; 7254 } 7255 7256 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7257 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7258 spdk_spin_unlock(&bdev->internal.spinlock); 7259 return -EPERM; 7260 } 7261 7262 rc = bdev_start_qos(bdev); 7263 if (rc != 0) { 7264 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 7265 spdk_spin_unlock(&bdev->internal.spinlock); 7266 return rc; 7267 } 7268 7269 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 7270 7271 spdk_spin_unlock(&bdev->internal.spinlock); 7272 7273 return 0; 7274 } 7275 7276 static int 7277 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 7278 struct spdk_bdev_desc **_desc) 7279 { 7280 struct spdk_bdev_desc *desc; 7281 unsigned int i; 7282 7283 desc = calloc(1, sizeof(*desc)); 7284 if (desc == NULL) { 7285 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 7286 return -ENOMEM; 7287 } 7288 7289 TAILQ_INIT(&desc->pending_media_events); 7290 TAILQ_INIT(&desc->free_media_events); 7291 7292 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 7293 desc->callback.event_fn = event_cb; 7294 desc->callback.ctx = event_ctx; 7295 spdk_spin_init(&desc->spinlock); 7296 7297 if (bdev->media_events) { 7298 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 7299 sizeof(*desc->media_events_buffer)); 7300 if (desc->media_events_buffer == NULL) { 7301 SPDK_ERRLOG("Failed to initialize media event pool\n"); 7302 bdev_desc_free(desc); 7303 return -ENOMEM; 7304 } 7305 7306 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 7307 TAILQ_INSERT_TAIL(&desc->free_media_events, 7308 &desc->media_events_buffer[i], tailq); 7309 } 7310 } 7311 7312 if (bdev->fn_table->accel_sequence_supported != NULL) { 7313 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7314 desc->accel_sequence_supported[i] = 7315 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7316 (enum spdk_bdev_io_type)i); 7317 } 7318 } 7319 7320 *_desc = desc; 7321 7322 return 0; 7323 } 7324 7325 int 7326 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7327 void *event_ctx, struct spdk_bdev_desc **_desc) 7328 { 7329 struct spdk_bdev_desc *desc; 7330 struct spdk_bdev *bdev; 7331 int rc; 7332 7333 if (event_cb == NULL) { 7334 SPDK_ERRLOG("Missing event callback function\n"); 7335 return -EINVAL; 7336 } 7337 7338 spdk_spin_lock(&g_bdev_mgr.spinlock); 7339 7340 bdev = bdev_get_by_name(bdev_name); 7341 7342 if (bdev == NULL) { 7343 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 7344 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7345 return -ENODEV; 7346 } 7347 7348 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 7349 if (rc != 0) { 7350 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7351 return rc; 7352 } 7353 7354 rc = bdev_open(bdev, write, desc); 7355 if (rc != 0) { 7356 bdev_desc_free(desc); 7357 desc = NULL; 7358 } 7359 7360 *_desc = desc; 7361 7362 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7363 
7364 return rc; 7365 } 7366 7367 static void 7368 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 7369 { 7370 int rc; 7371 7372 spdk_spin_lock(&bdev->internal.spinlock); 7373 spdk_spin_lock(&desc->spinlock); 7374 7375 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 7376 7377 desc->closed = true; 7378 7379 if (desc->claim != NULL) { 7380 bdev_desc_release_claims(desc); 7381 } 7382 7383 if (0 == desc->refs) { 7384 spdk_spin_unlock(&desc->spinlock); 7385 bdev_desc_free(desc); 7386 } else { 7387 spdk_spin_unlock(&desc->spinlock); 7388 } 7389 7390 /* If no more descriptors, kill QoS channel */ 7391 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7392 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 7393 bdev->name, spdk_get_thread()); 7394 7395 if (bdev_qos_destroy(bdev)) { 7396 /* There isn't anything we can do to recover here. Just let the 7397 * old QoS poller keep running. The QoS handling won't change 7398 * cores when the user allocates a new channel, but it won't break. */ 7399 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 7400 } 7401 } 7402 7403 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7404 rc = bdev_unregister_unsafe(bdev); 7405 spdk_spin_unlock(&bdev->internal.spinlock); 7406 7407 if (rc == 0) { 7408 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7409 } 7410 } else { 7411 spdk_spin_unlock(&bdev->internal.spinlock); 7412 } 7413 } 7414 7415 void 7416 spdk_bdev_close(struct spdk_bdev_desc *desc) 7417 { 7418 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7419 7420 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7421 spdk_get_thread()); 7422 7423 assert(desc->thread == spdk_get_thread()); 7424 7425 spdk_poller_unregister(&desc->io_timeout_poller); 7426 7427 spdk_spin_lock(&g_bdev_mgr.spinlock); 7428 7429 bdev_close(bdev, desc); 7430 7431 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7432 } 7433 7434 static void 7435 bdev_register_finished(void *arg) 7436 { 7437 struct spdk_bdev_desc *desc = arg; 7438 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7439 7440 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 7441 7442 spdk_spin_lock(&g_bdev_mgr.spinlock); 7443 7444 bdev_close(bdev, desc); 7445 7446 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7447 } 7448 7449 int 7450 spdk_bdev_register(struct spdk_bdev *bdev) 7451 { 7452 struct spdk_bdev_desc *desc; 7453 struct spdk_thread *thread = spdk_get_thread(); 7454 int rc; 7455 7456 if (spdk_unlikely(spdk_thread_get_app_thread() != spdk_get_thread())) { 7457 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", bdev->name, thread, 7458 thread ? 
spdk_thread_get_name(thread) : "null"); 7459 return -EINVAL; 7460 } 7461 7462 rc = bdev_register(bdev); 7463 if (rc != 0) { 7464 return rc; 7465 } 7466 7467 /* A descriptor is opened to prevent bdev deletion during examination */ 7468 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7469 if (rc != 0) { 7470 spdk_bdev_unregister(bdev, NULL, NULL); 7471 return rc; 7472 } 7473 7474 rc = bdev_open(bdev, false, desc); 7475 if (rc != 0) { 7476 bdev_desc_free(desc); 7477 spdk_bdev_unregister(bdev, NULL, NULL); 7478 return rc; 7479 } 7480 7481 /* Examine configuration before initializing I/O */ 7482 bdev_examine(bdev); 7483 7484 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 7485 if (rc != 0) { 7486 bdev_close(bdev, desc); 7487 spdk_bdev_unregister(bdev, NULL, NULL); 7488 } 7489 7490 return rc; 7491 } 7492 7493 int 7494 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 7495 struct spdk_bdev_module *module) 7496 { 7497 spdk_spin_lock(&bdev->internal.spinlock); 7498 7499 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7500 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7501 spdk_spin_unlock(&bdev->internal.spinlock); 7502 return -EPERM; 7503 } 7504 7505 if (desc && !desc->write) { 7506 desc->write = true; 7507 } 7508 7509 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 7510 bdev->internal.claim.v1.module = module; 7511 7512 spdk_spin_unlock(&bdev->internal.spinlock); 7513 return 0; 7514 } 7515 7516 void 7517 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 7518 { 7519 spdk_spin_lock(&bdev->internal.spinlock); 7520 7521 assert(bdev->internal.claim.v1.module != NULL); 7522 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 7523 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7524 bdev->internal.claim.v1.module = NULL; 7525 7526 spdk_spin_unlock(&bdev->internal.spinlock); 7527 } 7528 7529 /* 7530 * Start claims v2 7531 */ 7532 7533 const char * 7534 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 7535 { 7536 switch (type) { 7537 case SPDK_BDEV_CLAIM_NONE: 7538 return "not_claimed"; 7539 case SPDK_BDEV_CLAIM_EXCL_WRITE: 7540 return "exclusive_write"; 7541 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 7542 return "read_many_write_one"; 7543 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 7544 return "read_many_write_none"; 7545 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 7546 return "read_many_write_many"; 7547 default: 7548 break; 7549 } 7550 return "invalid_claim"; 7551 } 7552 7553 static bool 7554 claim_type_is_v2(enum spdk_bdev_claim_type type) 7555 { 7556 switch (type) { 7557 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 7558 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 7559 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 7560 return true; 7561 default: 7562 break; 7563 } 7564 return false; 7565 } 7566 7567 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
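* A read-many-write-one or read-many-write-shared claim implies the claiming descriptor will be used for writes, so claim_bdev() promotes desc->write to true for those types; a read-many-write-none claim leaves the descriptor read-only.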
*/ 7568 static bool 7569 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 7570 { 7571 switch (type) { 7572 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 7573 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 7574 return true; 7575 default: 7576 break; 7577 } 7578 return false; 7579 } 7580 7581 void 7582 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 7583 { 7584 if (opts == NULL) { 7585 SPDK_ERRLOG("opts should not be NULL\n"); 7586 assert(opts != NULL); 7587 return; 7588 } 7589 if (size == 0) { 7590 SPDK_ERRLOG("size should not be zero\n"); 7591 assert(size != 0); 7592 return; 7593 } 7594 7595 memset(opts, 0, size); 7596 opts->opts_size = size; 7597 7598 #define FIELD_OK(field) \ 7599 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 7600 7601 #define SET_FIELD(field, value) \ 7602 if (FIELD_OK(field)) { \ 7603 opts->field = value; \ 7604 } \ 7605 7606 SET_FIELD(shared_claim_key, 0); 7607 7608 #undef FIELD_OK 7609 #undef SET_FIELD 7610 } 7611 7612 static int 7613 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 7614 { 7615 if (src->opts_size == 0) { 7616 SPDK_ERRLOG("size should not be zero\n"); 7617 return -1; 7618 } 7619 7620 memset(dst, 0, sizeof(*dst)); 7621 dst->opts_size = src->opts_size; 7622 7623 #define FIELD_OK(field) \ 7624 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 7625 7626 #define SET_FIELD(field) \ 7627 if (FIELD_OK(field)) { \ 7628 dst->field = src->field; \ 7629 } \ 7630 7631 if (FIELD_OK(name)) { 7632 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 7633 } 7634 7635 SET_FIELD(shared_claim_key); 7636 7637 /* You should not remove this statement, but need to update the assert statement 7638 * if you add a new field, and also add a corresponding SET_FIELD statement */ 7639 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 7640 7641 #undef FIELD_OK 7642 #undef SET_FIELD 7643 return 0; 7644 } 7645 7646 /* Returns 0 if a read-write-once claim can be taken. */ 7647 static int 7648 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 7649 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 7650 { 7651 struct spdk_bdev *bdev = desc->bdev; 7652 struct spdk_bdev_desc *open_desc; 7653 7654 assert(spdk_spin_held(&bdev->internal.spinlock)); 7655 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 7656 7657 if (opts->shared_claim_key != 0) { 7658 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 7659 bdev->name); 7660 return -EINVAL; 7661 } 7662 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7663 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7664 return -EPERM; 7665 } 7666 if (desc->claim != NULL) { 7667 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 7668 bdev->name, desc->claim->module->name); 7669 return -EPERM; 7670 } 7671 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 7672 if (desc != open_desc && open_desc->write) { 7673 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 7674 "another descriptor is open for writing\n", 7675 bdev->name); 7676 return -EPERM; 7677 } 7678 } 7679 7680 return 0; 7681 } 7682 7683 /* Returns 0 if a read-only-many claim can be taken. 
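* The descriptor must be read-only and no shared_claim_key may be supplied; when the bdev is not claimed yet, every other open descriptor must also be read-only.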
*/ 7684 static int 7685 claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 7686 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 7687 { 7688 struct spdk_bdev *bdev = desc->bdev; 7689 struct spdk_bdev_desc *open_desc; 7690 7691 assert(spdk_spin_held(&bdev->internal.spinlock)); 7692 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE); 7693 assert(desc->claim == NULL); 7694 7695 if (desc->write) { 7696 SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n", 7697 bdev->name); 7698 return -EINVAL; 7699 } 7700 if (opts->shared_claim_key != 0) { 7701 SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name); 7702 return -EINVAL; 7703 } 7704 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 7705 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 7706 if (open_desc->write) { 7707 SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while " 7708 "another descriptor is open for writing\n", 7709 bdev->name); 7710 return -EPERM; 7711 } 7712 } 7713 } 7714 7715 return 0; 7716 } 7717 7718 /* Returns 0 if a read-write-many claim can be taken. */ 7719 static int 7720 claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 7721 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 7722 { 7723 struct spdk_bdev *bdev = desc->bdev; 7724 struct spdk_bdev_desc *open_desc; 7725 7726 assert(spdk_spin_held(&bdev->internal.spinlock)); 7727 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED); 7728 assert(desc->claim == NULL); 7729 7730 if (opts->shared_claim_key == 0) { 7731 SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n", 7732 bdev->name); 7733 return -EINVAL; 7734 } 7735 switch (bdev->internal.claim_type) { 7736 case SPDK_BDEV_CLAIM_NONE: 7737 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 7738 if (open_desc == desc) { 7739 continue; 7740 } 7741 if (open_desc->write) { 7742 SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while " 7743 "another descriptor is open for writing without a " 7744 "claim\n", bdev->name); 7745 return -EPERM; 7746 } 7747 } 7748 break; 7749 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 7750 if (opts->shared_claim_key != bdev->internal.claim.v2.key) { 7751 LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev); 7752 return -EPERM; 7753 } 7754 break; 7755 default: 7756 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7757 return -EBUSY; 7758 } 7759 7760 return 0; 7761 } 7762 7763 /* Updates desc and its bdev with a v2 claim.
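* The claim is allocated, linked into bdev->internal.claim.v2.claims, and, for the first v2 claim, the claim type and shared key are recorded on the bdev; desc->write is promoted when the claim type calls for it.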
*/ 7764 static int 7765 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 7766 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 7767 { 7768 struct spdk_bdev *bdev = desc->bdev; 7769 struct spdk_bdev_module_claim *claim; 7770 7771 assert(spdk_spin_held(&bdev->internal.spinlock)); 7772 assert(claim_type_is_v2(type)); 7773 assert(desc->claim == NULL); 7774 7775 claim = calloc(1, sizeof(*desc->claim)); 7776 if (claim == NULL) { 7777 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 7778 return -ENOMEM; 7779 } 7780 claim->module = module; 7781 claim->desc = desc; 7782 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 7783 memcpy(claim->name, opts->name, sizeof(claim->name)); 7784 desc->claim = claim; 7785 7786 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 7787 bdev->internal.claim_type = type; 7788 TAILQ_INIT(&bdev->internal.claim.v2.claims); 7789 bdev->internal.claim.v2.key = opts->shared_claim_key; 7790 } 7791 assert(type == bdev->internal.claim_type); 7792 7793 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 7794 7795 if (!desc->write && claim_type_promotes_to_write(type)) { 7796 desc->write = true; 7797 } 7798 7799 return 0; 7800 } 7801 7802 int 7803 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 7804 struct spdk_bdev_claim_opts *_opts, 7805 struct spdk_bdev_module *module) 7806 { 7807 struct spdk_bdev *bdev; 7808 struct spdk_bdev_claim_opts opts; 7809 int rc = 0; 7810 7811 if (desc == NULL) { 7812 SPDK_ERRLOG("descriptor must not be NULL\n"); 7813 return -EINVAL; 7814 } 7815 bdev = desc->bdev; 7816 if (_opts == NULL) { 7817 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 7818 } else if (claim_opts_copy(_opts, &opts) != 0) { 7819 return -EINVAL; 7820 } 7821 7822 spdk_spin_lock(&bdev->internal.spinlock); 7823 7824 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 7825 bdev->internal.claim_type != type) { 7826 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7827 spdk_spin_unlock(&bdev->internal.spinlock); 7828 return -EPERM; 7829 } 7830 7831 if (claim_type_is_v2(type) && desc->claim != NULL) { 7832 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 7833 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 7834 spdk_spin_unlock(&bdev->internal.spinlock); 7835 return -EPERM; 7836 } 7837 7838 switch (type) { 7839 case SPDK_BDEV_CLAIM_EXCL_WRITE: 7840 spdk_spin_unlock(&bdev->internal.spinlock); 7841 return spdk_bdev_module_claim_bdev(bdev, desc, module); 7842 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 7843 rc = claim_verify_rwo(desc, type, &opts, module); 7844 break; 7845 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 7846 rc = claim_verify_rom(desc, type, &opts, module); 7847 break; 7848 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 7849 rc = claim_verify_rwm(desc, type, &opts, module); 7850 break; 7851 default: 7852 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 7853 rc = -ENOTSUP; 7854 } 7855 7856 if (rc == 0) { 7857 rc = claim_bdev(desc, type, &opts, module); 7858 } 7859 7860 spdk_spin_unlock(&bdev->internal.spinlock); 7861 return rc; 7862 } 7863 7864 static void 7865 claim_reset(struct spdk_bdev *bdev) 7866 { 7867 assert(spdk_spin_held(&bdev->internal.spinlock)); 7868 assert(claim_type_is_v2(bdev->internal.claim_type)); 7869 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 7870 7871 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7872
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7873 } 7874 7875 static void 7876 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 7877 { 7878 struct spdk_bdev *bdev = desc->bdev; 7879 7880 assert(spdk_spin_held(&bdev->internal.spinlock)); 7881 assert(claim_type_is_v2(bdev->internal.claim_type)); 7882 7883 if (bdev->internal.examine_in_progress == 0) { 7884 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 7885 free(desc->claim); 7886 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 7887 claim_reset(bdev); 7888 } 7889 } else { 7890 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 7891 desc->claim->module = NULL; 7892 desc->claim->desc = NULL; 7893 } 7894 desc->claim = NULL; 7895 } 7896 7897 /* 7898 * End claims v2 7899 */ 7900 7901 struct spdk_bdev * 7902 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 7903 { 7904 assert(desc != NULL); 7905 return desc->bdev; 7906 } 7907 7908 int 7909 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 7910 { 7911 struct spdk_bdev *bdev, *tmp; 7912 struct spdk_bdev_desc *desc; 7913 int rc = 0; 7914 7915 assert(fn != NULL); 7916 7917 spdk_spin_lock(&g_bdev_mgr.spinlock); 7918 bdev = spdk_bdev_first(); 7919 while (bdev != NULL) { 7920 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7921 if (rc != 0) { 7922 break; 7923 } 7924 rc = bdev_open(bdev, false, desc); 7925 if (rc != 0) { 7926 bdev_desc_free(desc); 7927 if (rc == -ENODEV) { 7928 /* Ignore the error and move to the next bdev. */ 7929 rc = 0; 7930 bdev = spdk_bdev_next(bdev); 7931 continue; 7932 } 7933 break; 7934 } 7935 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7936 7937 rc = fn(ctx, bdev); 7938 7939 spdk_spin_lock(&g_bdev_mgr.spinlock); 7940 tmp = spdk_bdev_next(bdev); 7941 bdev_close(bdev, desc); 7942 if (rc != 0) { 7943 break; 7944 } 7945 bdev = tmp; 7946 } 7947 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7948 7949 return rc; 7950 } 7951 7952 int 7953 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 7954 { 7955 struct spdk_bdev *bdev, *tmp; 7956 struct spdk_bdev_desc *desc; 7957 int rc = 0; 7958 7959 assert(fn != NULL); 7960 7961 spdk_spin_lock(&g_bdev_mgr.spinlock); 7962 bdev = spdk_bdev_first_leaf(); 7963 while (bdev != NULL) { 7964 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7965 if (rc != 0) { 7966 break; 7967 } 7968 rc = bdev_open(bdev, false, desc); 7969 if (rc != 0) { 7970 bdev_desc_free(desc); 7971 if (rc == -ENODEV) { 7972 /* Ignore the error and move to the next bdev. 
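* bdev_open() returns -ENODEV when the bdev is being unregistered or removed, so such a bdev is skipped rather than aborting the iteration.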
*/ 7973 rc = 0; 7974 bdev = spdk_bdev_next_leaf(bdev); 7975 continue; 7976 } 7977 break; 7978 } 7979 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7980 7981 rc = fn(ctx, bdev); 7982 7983 spdk_spin_lock(&g_bdev_mgr.spinlock); 7984 tmp = spdk_bdev_next_leaf(bdev); 7985 bdev_close(bdev, desc); 7986 if (rc != 0) { 7987 break; 7988 } 7989 bdev = tmp; 7990 } 7991 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7992 7993 return rc; 7994 } 7995 7996 void 7997 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 7998 { 7999 struct iovec *iovs; 8000 int iovcnt; 8001 8002 if (bdev_io == NULL) { 8003 return; 8004 } 8005 8006 switch (bdev_io->type) { 8007 case SPDK_BDEV_IO_TYPE_READ: 8008 case SPDK_BDEV_IO_TYPE_WRITE: 8009 case SPDK_BDEV_IO_TYPE_ZCOPY: 8010 iovs = bdev_io->u.bdev.iovs; 8011 iovcnt = bdev_io->u.bdev.iovcnt; 8012 break; 8013 default: 8014 iovs = NULL; 8015 iovcnt = 0; 8016 break; 8017 } 8018 8019 if (iovp) { 8020 *iovp = iovs; 8021 } 8022 if (iovcntp) { 8023 *iovcntp = iovcnt; 8024 } 8025 } 8026 8027 void * 8028 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 8029 { 8030 if (bdev_io == NULL) { 8031 return NULL; 8032 } 8033 8034 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 8035 return NULL; 8036 } 8037 8038 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 8039 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 8040 return bdev_io->u.bdev.md_buf; 8041 } 8042 8043 return NULL; 8044 } 8045 8046 void * 8047 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 8048 { 8049 if (bdev_io == NULL) { 8050 assert(false); 8051 return NULL; 8052 } 8053 8054 return bdev_io->internal.caller_ctx; 8055 } 8056 8057 void 8058 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 8059 { 8060 8061 if (spdk_bdev_module_list_find(bdev_module->name)) { 8062 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 8063 assert(false); 8064 } 8065 8066 spdk_spin_init(&bdev_module->internal.spinlock); 8067 8068 /* 8069 * Modules with examine callbacks must be initialized first, so they are 8070 * ready to handle examine callbacks from later modules that will 8071 * register physical bdevs. 
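* Such modules are therefore inserted at the head of g_bdev_mgr.bdev_modules below, while modules without examine callbacks are appended at the tail.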
8072 */ 8073 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 8074 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8075 } else { 8076 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8077 } 8078 } 8079 8080 struct spdk_bdev_module * 8081 spdk_bdev_module_list_find(const char *name) 8082 { 8083 struct spdk_bdev_module *bdev_module; 8084 8085 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 8086 if (strcmp(name, bdev_module->name) == 0) { 8087 break; 8088 } 8089 } 8090 8091 return bdev_module; 8092 } 8093 8094 static void 8095 bdev_write_zero_buffer_next(void *_bdev_io) 8096 { 8097 struct spdk_bdev_io *bdev_io = _bdev_io; 8098 uint64_t num_bytes, num_blocks; 8099 void *md_buf = NULL; 8100 int rc; 8101 8102 num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * 8103 bdev_io->u.bdev.split_remaining_num_blocks, 8104 ZERO_BUFFER_SIZE); 8105 num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); 8106 num_blocks -= num_blocks % bdev_io->bdev->write_unit_size; 8107 8108 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 8109 md_buf = (char *)g_bdev_mgr.zero_buffer + 8110 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 8111 } 8112 8113 rc = bdev_write_blocks_with_md(bdev_io->internal.desc, 8114 spdk_io_channel_from_ctx(bdev_io->internal.ch), 8115 g_bdev_mgr.zero_buffer, md_buf, 8116 bdev_io->u.bdev.split_current_offset_blocks, num_blocks, 8117 bdev_write_zero_buffer_done, bdev_io); 8118 if (rc == 0) { 8119 bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; 8120 bdev_io->u.bdev.split_current_offset_blocks += num_blocks; 8121 } else if (rc == -ENOMEM) { 8122 bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); 8123 } else { 8124 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 8125 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 8126 } 8127 } 8128 8129 static void 8130 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 8131 { 8132 struct spdk_bdev_io *parent_io = cb_arg; 8133 8134 spdk_bdev_free_io(bdev_io); 8135 8136 if (!success) { 8137 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 8138 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 8139 return; 8140 } 8141 8142 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 8143 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 8144 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 8145 return; 8146 } 8147 8148 bdev_write_zero_buffer_next(parent_io); 8149 } 8150 8151 static void 8152 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 8153 { 8154 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8155 ctx->bdev->internal.qos_mod_in_progress = false; 8156 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8157 8158 if (ctx->cb_fn) { 8159 ctx->cb_fn(ctx->cb_arg, status); 8160 } 8161 free(ctx); 8162 } 8163 8164 static void 8165 bdev_disable_qos_done(void *cb_arg) 8166 { 8167 struct set_qos_limit_ctx *ctx = cb_arg; 8168 struct spdk_bdev *bdev = ctx->bdev; 8169 struct spdk_bdev_io *bdev_io; 8170 struct spdk_bdev_qos *qos; 8171 8172 spdk_spin_lock(&bdev->internal.spinlock); 8173 qos = bdev->internal.qos; 8174 bdev->internal.qos = NULL; 8175 spdk_spin_unlock(&bdev->internal.spinlock); 8176 8177 while (!TAILQ_EMPTY(&qos->queued)) { 8178 /* Send queued I/O back to their original thread for resubmission. 
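* Each bdev_io is removed from qos->queued, its channel is restored if it had been redirected to the QoS channel, and it is resubmitted via _bdev_io_submit on the thread that originally issued it.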
*/ 8179 bdev_io = TAILQ_FIRST(&qos->queued); 8180 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 8181 8182 if (bdev_io->internal.io_submit_ch) { 8183 /* 8184 * Channel was changed when sending it to the QoS thread - change it back 8185 * before sending it back to the original thread. 8186 */ 8187 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 8188 bdev_io->internal.io_submit_ch = NULL; 8189 } 8190 8191 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 8192 _bdev_io_submit, bdev_io); 8193 } 8194 8195 if (qos->thread != NULL) { 8196 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 8197 spdk_poller_unregister(&qos->poller); 8198 } 8199 8200 free(qos); 8201 8202 bdev_set_qos_limit_done(ctx, 0); 8203 } 8204 8205 static void 8206 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 8207 { 8208 struct set_qos_limit_ctx *ctx = _ctx; 8209 struct spdk_thread *thread; 8210 8211 spdk_spin_lock(&bdev->internal.spinlock); 8212 thread = bdev->internal.qos->thread; 8213 spdk_spin_unlock(&bdev->internal.spinlock); 8214 8215 if (thread != NULL) { 8216 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 8217 } else { 8218 bdev_disable_qos_done(ctx); 8219 } 8220 } 8221 8222 static void 8223 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8224 struct spdk_io_channel *ch, void *_ctx) 8225 { 8226 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8227 8228 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 8229 8230 spdk_bdev_for_each_channel_continue(i, 0); 8231 } 8232 8233 static void 8234 bdev_update_qos_rate_limit_msg(void *cb_arg) 8235 { 8236 struct set_qos_limit_ctx *ctx = cb_arg; 8237 struct spdk_bdev *bdev = ctx->bdev; 8238 8239 spdk_spin_lock(&bdev->internal.spinlock); 8240 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 8241 spdk_spin_unlock(&bdev->internal.spinlock); 8242 8243 bdev_set_qos_limit_done(ctx, 0); 8244 } 8245 8246 static void 8247 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8248 struct spdk_io_channel *ch, void *_ctx) 8249 { 8250 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8251 8252 spdk_spin_lock(&bdev->internal.spinlock); 8253 bdev_enable_qos(bdev, bdev_ch); 8254 spdk_spin_unlock(&bdev->internal.spinlock); 8255 spdk_bdev_for_each_channel_continue(i, 0); 8256 } 8257 8258 static void 8259 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 8260 { 8261 struct set_qos_limit_ctx *ctx = _ctx; 8262 8263 bdev_set_qos_limit_done(ctx, status); 8264 } 8265 8266 static void 8267 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 8268 { 8269 int i; 8270 8271 assert(bdev->internal.qos != NULL); 8272 8273 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8274 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 8275 bdev->internal.qos->rate_limits[i].limit = limits[i]; 8276 8277 if (limits[i] == 0) { 8278 bdev->internal.qos->rate_limits[i].limit = 8279 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 8280 } 8281 } 8282 } 8283 } 8284 8285 void 8286 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 8287 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 8288 { 8289 struct set_qos_limit_ctx *ctx; 8290 uint32_t limit_set_complement; 8291 uint64_t min_limit_per_sec; 8292 int i; 8293 bool disable_rate_limit = true; 8294 8295 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8296 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 8297 continue; 8298 } 8299 8300 if (limits[i] > 0) { 8301 disable_rate_limit = 
false; 8302 } 8303 8304 if (bdev_qos_is_iops_rate_limit(i) == true) { 8305 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 8306 } else { 8307 /* Change from megabyte to byte rate limit */ 8308 limits[i] = limits[i] * 1024 * 1024; 8309 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 8310 } 8311 8312 limit_set_complement = limits[i] % min_limit_per_sec; 8313 if (limit_set_complement) { 8314 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 8315 limits[i], min_limit_per_sec); 8316 limits[i] += min_limit_per_sec - limit_set_complement; 8317 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 8318 } 8319 } 8320 8321 ctx = calloc(1, sizeof(*ctx)); 8322 if (ctx == NULL) { 8323 cb_fn(cb_arg, -ENOMEM); 8324 return; 8325 } 8326 8327 ctx->cb_fn = cb_fn; 8328 ctx->cb_arg = cb_arg; 8329 ctx->bdev = bdev; 8330 8331 spdk_spin_lock(&bdev->internal.spinlock); 8332 if (bdev->internal.qos_mod_in_progress) { 8333 spdk_spin_unlock(&bdev->internal.spinlock); 8334 free(ctx); 8335 cb_fn(cb_arg, -EAGAIN); 8336 return; 8337 } 8338 bdev->internal.qos_mod_in_progress = true; 8339 8340 if (disable_rate_limit == true && bdev->internal.qos) { 8341 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8342 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 8343 (bdev->internal.qos->rate_limits[i].limit > 0 && 8344 bdev->internal.qos->rate_limits[i].limit != 8345 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 8346 disable_rate_limit = false; 8347 break; 8348 } 8349 } 8350 } 8351 8352 if (disable_rate_limit == false) { 8353 if (bdev->internal.qos == NULL) { 8354 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 8355 if (!bdev->internal.qos) { 8356 spdk_spin_unlock(&bdev->internal.spinlock); 8357 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 8358 bdev_set_qos_limit_done(ctx, -ENOMEM); 8359 return; 8360 } 8361 } 8362 8363 if (bdev->internal.qos->thread == NULL) { 8364 /* Enabling */ 8365 bdev_set_qos_rate_limits(bdev, limits); 8366 8367 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 8368 bdev_enable_qos_done); 8369 } else { 8370 /* Updating */ 8371 bdev_set_qos_rate_limits(bdev, limits); 8372 8373 spdk_thread_send_msg(bdev->internal.qos->thread, 8374 bdev_update_qos_rate_limit_msg, ctx); 8375 } 8376 } else { 8377 if (bdev->internal.qos != NULL) { 8378 bdev_set_qos_rate_limits(bdev, limits); 8379 8380 /* Disabling */ 8381 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 8382 bdev_disable_qos_msg_done); 8383 } else { 8384 spdk_spin_unlock(&bdev->internal.spinlock); 8385 bdev_set_qos_limit_done(ctx, 0); 8386 return; 8387 } 8388 } 8389 8390 spdk_spin_unlock(&bdev->internal.spinlock); 8391 } 8392 8393 struct spdk_bdev_histogram_ctx { 8394 spdk_bdev_histogram_status_cb cb_fn; 8395 void *cb_arg; 8396 struct spdk_bdev *bdev; 8397 int status; 8398 }; 8399 8400 static void 8401 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8402 { 8403 struct spdk_bdev_histogram_ctx *ctx = _ctx; 8404 8405 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8406 ctx->bdev->internal.histogram_in_progress = false; 8407 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8408 ctx->cb_fn(ctx->cb_arg, ctx->status); 8409 free(ctx); 8410 } 8411 8412 static void 8413 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8414 struct spdk_io_channel *_ch, void *_ctx) 8415 { 8416 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8417 8418 if (ch->histogram != NULL) { 8419 
spdk_histogram_data_free(ch->histogram); 8420 ch->histogram = NULL; 8421 } 8422 spdk_bdev_for_each_channel_continue(i, 0); 8423 } 8424 8425 static void 8426 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8427 { 8428 struct spdk_bdev_histogram_ctx *ctx = _ctx; 8429 8430 if (status != 0) { 8431 ctx->status = status; 8432 ctx->bdev->internal.histogram_enabled = false; 8433 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 8434 bdev_histogram_disable_channel_cb); 8435 } else { 8436 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8437 ctx->bdev->internal.histogram_in_progress = false; 8438 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8439 ctx->cb_fn(ctx->cb_arg, ctx->status); 8440 free(ctx); 8441 } 8442 } 8443 8444 static void 8445 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8446 struct spdk_io_channel *_ch, void *_ctx) 8447 { 8448 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8449 int status = 0; 8450 8451 if (ch->histogram == NULL) { 8452 ch->histogram = spdk_histogram_data_alloc(); 8453 if (ch->histogram == NULL) { 8454 status = -ENOMEM; 8455 } 8456 } 8457 8458 spdk_bdev_for_each_channel_continue(i, status); 8459 } 8460 8461 void 8462 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 8463 void *cb_arg, bool enable) 8464 { 8465 struct spdk_bdev_histogram_ctx *ctx; 8466 8467 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 8468 if (ctx == NULL) { 8469 cb_fn(cb_arg, -ENOMEM); 8470 return; 8471 } 8472 8473 ctx->bdev = bdev; 8474 ctx->status = 0; 8475 ctx->cb_fn = cb_fn; 8476 ctx->cb_arg = cb_arg; 8477 8478 spdk_spin_lock(&bdev->internal.spinlock); 8479 if (bdev->internal.histogram_in_progress) { 8480 spdk_spin_unlock(&bdev->internal.spinlock); 8481 free(ctx); 8482 cb_fn(cb_arg, -EAGAIN); 8483 return; 8484 } 8485 8486 bdev->internal.histogram_in_progress = true; 8487 spdk_spin_unlock(&bdev->internal.spinlock); 8488 8489 bdev->internal.histogram_enabled = enable; 8490 8491 if (enable) { 8492 /* Allocate histogram for each channel */ 8493 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 8494 bdev_histogram_enable_channel_cb); 8495 } else { 8496 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 8497 bdev_histogram_disable_channel_cb); 8498 } 8499 } 8500 8501 struct spdk_bdev_histogram_data_ctx { 8502 spdk_bdev_histogram_data_cb cb_fn; 8503 void *cb_arg; 8504 struct spdk_bdev *bdev; 8505 /** merged histogram data from all channels */ 8506 struct spdk_histogram_data *histogram; 8507 }; 8508 8509 static void 8510 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8511 { 8512 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 8513 8514 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 8515 free(ctx); 8516 } 8517 8518 static void 8519 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8520 struct spdk_io_channel *_ch, void *_ctx) 8521 { 8522 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8523 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 8524 int status = 0; 8525 8526 if (ch->histogram == NULL) { 8527 status = -EFAULT; 8528 } else { 8529 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 8530 } 8531 8532 spdk_bdev_for_each_channel_continue(i, status); 8533 } 8534 8535 void 8536 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 8537 spdk_bdev_histogram_data_cb cb_fn, 8538 void *cb_arg) 8539 { 
8540 struct spdk_bdev_histogram_data_ctx *ctx; 8541 8542 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 8543 if (ctx == NULL) { 8544 cb_fn(cb_arg, -ENOMEM, NULL); 8545 return; 8546 } 8547 8548 ctx->bdev = bdev; 8549 ctx->cb_fn = cb_fn; 8550 ctx->cb_arg = cb_arg; 8551 8552 ctx->histogram = histogram; 8553 8554 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 8555 bdev_histogram_get_channel_cb); 8556 } 8557 8558 void 8559 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 8560 void *cb_arg) 8561 { 8562 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8563 int status = 0; 8564 8565 assert(cb_fn != NULL); 8566 8567 if (bdev_ch->histogram == NULL) { 8568 status = -EFAULT; 8569 } 8570 cb_fn(cb_arg, status, bdev_ch->histogram); 8571 } 8572 8573 size_t 8574 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 8575 size_t max_events) 8576 { 8577 struct media_event_entry *entry; 8578 size_t num_events = 0; 8579 8580 for (; num_events < max_events; ++num_events) { 8581 entry = TAILQ_FIRST(&desc->pending_media_events); 8582 if (entry == NULL) { 8583 break; 8584 } 8585 8586 events[num_events] = entry->event; 8587 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 8588 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 8589 } 8590 8591 return num_events; 8592 } 8593 8594 int 8595 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 8596 size_t num_events) 8597 { 8598 struct spdk_bdev_desc *desc; 8599 struct media_event_entry *entry; 8600 size_t event_id; 8601 int rc = 0; 8602 8603 assert(bdev->media_events); 8604 8605 spdk_spin_lock(&bdev->internal.spinlock); 8606 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 8607 if (desc->write) { 8608 break; 8609 } 8610 } 8611 8612 if (desc == NULL || desc->media_events_buffer == NULL) { 8613 rc = -ENODEV; 8614 goto out; 8615 } 8616 8617 for (event_id = 0; event_id < num_events; ++event_id) { 8618 entry = TAILQ_FIRST(&desc->free_media_events); 8619 if (entry == NULL) { 8620 break; 8621 } 8622 8623 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 8624 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 8625 entry->event = events[event_id]; 8626 } 8627 8628 rc = event_id; 8629 out: 8630 spdk_spin_unlock(&bdev->internal.spinlock); 8631 return rc; 8632 } 8633 8634 static void 8635 _media_management_notify(void *arg) 8636 { 8637 struct spdk_bdev_desc *desc = arg; 8638 8639 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 8640 } 8641 8642 void 8643 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 8644 { 8645 struct spdk_bdev_desc *desc; 8646 8647 spdk_spin_lock(&bdev->internal.spinlock); 8648 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 8649 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 8650 event_notify(desc, _media_management_notify); 8651 } 8652 } 8653 spdk_spin_unlock(&bdev->internal.spinlock); 8654 } 8655 8656 struct locked_lba_range_ctx { 8657 struct lba_range range; 8658 struct spdk_bdev *bdev; 8659 struct lba_range *current_range; 8660 struct lba_range *owner_range; 8661 struct spdk_poller *poller; 8662 lock_range_cb cb_fn; 8663 void *cb_arg; 8664 }; 8665 8666 static void 8667 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8668 { 8669 struct locked_lba_range_ctx *ctx = _ctx; 8670 8671 ctx->cb_fn(ctx->cb_arg, -ENOMEM); 8672 free(ctx); 8673 } 8674 8675 static void bdev_unlock_lba_range_get_channel(struct 
spdk_bdev_channel_iter *i, 8676 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 8677 8678 static void 8679 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8680 { 8681 struct locked_lba_range_ctx *ctx = _ctx; 8682 8683 if (status == -ENOMEM) { 8684 /* One of the channels could not allocate a range object. 8685 * So we have to go back and clean up any ranges that were 8686 * allocated successfully before we return error status to 8687 * the caller. We can reuse the unlock function to do that 8688 * clean up. 8689 */ 8690 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 8691 bdev_lock_error_cleanup_cb); 8692 return; 8693 } 8694 8695 /* All channels have locked this range and no I/O overlapping the range 8696 * are outstanding! Set the owner_ch for the range object for the 8697 * locking channel, so that this channel will know that it is allowed 8698 * to write to this range. 8699 */ 8700 ctx->owner_range->owner_ch = ctx->range.owner_ch; 8701 ctx->cb_fn(ctx->cb_arg, status); 8702 8703 /* Don't free the ctx here. Its range is in the bdev's global list of 8704 * locked ranges still, and will be removed and freed when this range 8705 * is later unlocked. 8706 */ 8707 } 8708 8709 static int 8710 bdev_lock_lba_range_check_io(void *_i) 8711 { 8712 struct spdk_bdev_channel_iter *i = _i; 8713 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 8714 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8715 struct locked_lba_range_ctx *ctx = i->ctx; 8716 struct lba_range *range = ctx->current_range; 8717 struct spdk_bdev_io *bdev_io; 8718 8719 spdk_poller_unregister(&ctx->poller); 8720 8721 /* The range is now in the locked_ranges, so no new IO can be submitted to this 8722 * range. But we need to wait until any outstanding IO overlapping with this range 8723 * are completed. 8724 */ 8725 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 8726 if (bdev_io_range_is_locked(bdev_io, range)) { 8727 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 8728 return SPDK_POLLER_BUSY; 8729 } 8730 } 8731 8732 spdk_bdev_for_each_channel_continue(i, 0); 8733 return SPDK_POLLER_BUSY; 8734 } 8735 8736 static void 8737 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8738 struct spdk_io_channel *_ch, void *_ctx) 8739 { 8740 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8741 struct locked_lba_range_ctx *ctx = _ctx; 8742 struct lba_range *range; 8743 8744 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 8745 if (range->length == ctx->range.length && 8746 range->offset == ctx->range.offset && 8747 range->locked_ctx == ctx->range.locked_ctx) { 8748 /* This range already exists on this channel, so don't add 8749 * it again. This can happen when a new channel is created 8750 * while the for_each_channel operation is in progress. 8751 * Do not check for outstanding I/O in that case, since the 8752 * range was locked before any I/O could be submitted to the 8753 * new channel. 
8754 */ 8755 spdk_bdev_for_each_channel_continue(i, 0); 8756 return; 8757 } 8758 } 8759 8760 range = calloc(1, sizeof(*range)); 8761 if (range == NULL) { 8762 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 8763 return; 8764 } 8765 8766 range->length = ctx->range.length; 8767 range->offset = ctx->range.offset; 8768 range->locked_ctx = ctx->range.locked_ctx; 8769 ctx->current_range = range; 8770 if (ctx->range.owner_ch == ch) { 8771 /* This is the range object for the channel that will hold 8772 * the lock. Store it in the ctx object so that we can easily 8773 * set its owner_ch after the lock is finally acquired. 8774 */ 8775 ctx->owner_range = range; 8776 } 8777 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 8778 bdev_lock_lba_range_check_io(i); 8779 } 8780 8781 static void 8782 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 8783 { 8784 assert(spdk_get_thread() == spdk_io_channel_get_thread(ctx->range.owner_ch->channel)); 8785 8786 /* We will add a copy of this range to each channel now. */ 8787 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 8788 bdev_lock_lba_range_cb); 8789 } 8790 8791 static bool 8792 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 8793 { 8794 struct lba_range *r; 8795 8796 TAILQ_FOREACH(r, tailq, tailq) { 8797 if (bdev_lba_range_overlapped(range, r)) { 8798 return true; 8799 } 8800 } 8801 return false; 8802 } 8803 8804 static int 8805 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 8806 uint64_t offset, uint64_t length, 8807 lock_range_cb cb_fn, void *cb_arg) 8808 { 8809 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8810 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8811 struct locked_lba_range_ctx *ctx; 8812 8813 if (cb_arg == NULL) { 8814 SPDK_ERRLOG("cb_arg must not be NULL\n"); 8815 return -EINVAL; 8816 } 8817 8818 ctx = calloc(1, sizeof(*ctx)); 8819 if (ctx == NULL) { 8820 return -ENOMEM; 8821 } 8822 8823 ctx->range.offset = offset; 8824 ctx->range.length = length; 8825 ctx->range.owner_ch = ch; 8826 ctx->range.locked_ctx = cb_arg; 8827 ctx->bdev = bdev; 8828 ctx->cb_fn = cb_fn; 8829 ctx->cb_arg = cb_arg; 8830 8831 spdk_spin_lock(&bdev->internal.spinlock); 8832 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 8833 /* There is an active lock overlapping with this range. 8834 * Put it on the pending list until this range no 8835 * longer overlaps with another. 8836 */ 8837 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 8838 } else { 8839 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 8840 bdev_lock_lba_range_ctx(bdev, ctx); 8841 } 8842 spdk_spin_unlock(&bdev->internal.spinlock); 8843 return 0; 8844 } 8845 8846 static void 8847 bdev_lock_lba_range_ctx_msg(void *_ctx) 8848 { 8849 struct locked_lba_range_ctx *ctx = _ctx; 8850 8851 bdev_lock_lba_range_ctx(ctx->bdev, ctx); 8852 } 8853 8854 static void 8855 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8856 { 8857 struct locked_lba_range_ctx *ctx = _ctx; 8858 struct locked_lba_range_ctx *pending_ctx; 8859 struct lba_range *range, *tmp; 8860 8861 spdk_spin_lock(&bdev->internal.spinlock); 8862 /* Check if there are any pending locked ranges that overlap with this range 8863 * that was just unlocked. If there are, check that it doesn't overlap with any 8864 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 8865 * the lock process. 
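* A pending range that becomes eligible is moved onto locked_ranges here, and its lock is then started on the owning channel's thread via bdev_lock_lba_range_ctx_msg.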
8866 */ 8867 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 8868 if (bdev_lba_range_overlapped(range, &ctx->range) && 8869 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 8870 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 8871 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 8872 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 8873 spdk_thread_send_msg(spdk_io_channel_get_thread(pending_ctx->range.owner_ch->channel), 8874 bdev_lock_lba_range_ctx_msg, pending_ctx); 8875 } 8876 } 8877 spdk_spin_unlock(&bdev->internal.spinlock); 8878 8879 ctx->cb_fn(ctx->cb_arg, status); 8880 free(ctx); 8881 } 8882 8883 static void 8884 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8885 struct spdk_io_channel *_ch, void *_ctx) 8886 { 8887 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8888 struct locked_lba_range_ctx *ctx = _ctx; 8889 TAILQ_HEAD(, spdk_bdev_io) io_locked; 8890 struct spdk_bdev_io *bdev_io; 8891 struct lba_range *range; 8892 8893 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 8894 if (ctx->range.offset == range->offset && 8895 ctx->range.length == range->length && 8896 ctx->range.locked_ctx == range->locked_ctx) { 8897 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 8898 free(range); 8899 break; 8900 } 8901 } 8902 8903 /* Note: we should almost always be able to assert that the range specified 8904 * was found. But there are some very rare corner cases where a new channel 8905 * gets created simultaneously with a range unlock, where this function 8906 * would execute on that new channel and wouldn't have the range. 8907 * We also use this to clean up range allocations when a later allocation 8908 * fails in the locking path. 8909 * So we can't actually assert() here. 8910 */ 8911 8912 /* Swap the locked IO into a temporary list, and then try to submit them again. 8913 * We could hyper-optimize this to only resubmit locked I/O that overlap 8914 * with the range that was just unlocked, but this isn't a performance path so 8915 * we go for simplicity here. 8916 */ 8917 TAILQ_INIT(&io_locked); 8918 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 8919 while (!TAILQ_EMPTY(&io_locked)) { 8920 bdev_io = TAILQ_FIRST(&io_locked); 8921 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 8922 bdev_io_submit(bdev_io); 8923 } 8924 8925 spdk_bdev_for_each_channel_continue(i, 0); 8926 } 8927 8928 static int 8929 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 8930 uint64_t offset, uint64_t length, 8931 lock_range_cb cb_fn, void *cb_arg) 8932 { 8933 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8934 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8935 struct locked_lba_range_ctx *ctx; 8936 struct lba_range *range; 8937 bool range_found = false; 8938 8939 /* Let's make sure the specified channel actually has a lock on 8940 * the specified range. Note that the range must match exactly. 8941 */ 8942 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 8943 if (range->offset == offset && range->length == length && 8944 range->owner_ch == ch && range->locked_ctx == cb_arg) { 8945 range_found = true; 8946 break; 8947 } 8948 } 8949 8950 if (!range_found) { 8951 return -EINVAL; 8952 } 8953 8954 spdk_spin_lock(&bdev->internal.spinlock); 8955 /* We confirmed that this channel has locked the specified range. 
To 8956 * start the unlock the process, we find the range in the bdev's locked_ranges 8957 * and remove it. This ensures new channels don't inherit the locked range. 8958 * Then we will send a message to each channel (including the one specified 8959 * here) to remove the range from its per-channel list. 8960 */ 8961 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 8962 if (range->offset == offset && range->length == length && 8963 range->locked_ctx == cb_arg) { 8964 break; 8965 } 8966 } 8967 if (range == NULL) { 8968 assert(false); 8969 spdk_spin_unlock(&bdev->internal.spinlock); 8970 return -EINVAL; 8971 } 8972 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 8973 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 8974 spdk_spin_unlock(&bdev->internal.spinlock); 8975 8976 ctx->cb_fn = cb_fn; 8977 ctx->cb_arg = cb_arg; 8978 8979 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 8980 bdev_unlock_lba_range_cb); 8981 return 0; 8982 } 8983 8984 int 8985 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 8986 int array_size) 8987 { 8988 if (!bdev) { 8989 return -EINVAL; 8990 } 8991 8992 if (bdev->fn_table->get_memory_domains) { 8993 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 8994 } 8995 8996 return 0; 8997 } 8998 8999 struct spdk_bdev_for_each_io_ctx { 9000 void *ctx; 9001 spdk_bdev_io_fn fn; 9002 spdk_bdev_for_each_io_cb cb; 9003 }; 9004 9005 static void 9006 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9007 struct spdk_io_channel *io_ch, void *_ctx) 9008 { 9009 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 9010 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 9011 struct spdk_bdev_io *bdev_io; 9012 int rc = 0; 9013 9014 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 9015 rc = ctx->fn(ctx->ctx, bdev_io); 9016 if (rc != 0) { 9017 break; 9018 } 9019 } 9020 9021 spdk_bdev_for_each_channel_continue(i, rc); 9022 } 9023 9024 static void 9025 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 9026 { 9027 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 9028 9029 ctx->cb(ctx->ctx, status); 9030 9031 free(ctx); 9032 } 9033 9034 void 9035 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 9036 spdk_bdev_for_each_io_cb cb) 9037 { 9038 struct spdk_bdev_for_each_io_ctx *ctx; 9039 9040 assert(fn != NULL && cb != NULL); 9041 9042 ctx = calloc(1, sizeof(*ctx)); 9043 if (ctx == NULL) { 9044 SPDK_ERRLOG("Failed to allocate context.\n"); 9045 cb(_ctx, -ENOMEM); 9046 return; 9047 } 9048 9049 ctx->ctx = _ctx; 9050 ctx->fn = fn; 9051 ctx->cb = cb; 9052 9053 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 9054 bdev_for_each_io_done); 9055 } 9056 9057 void 9058 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 9059 { 9060 spdk_for_each_channel_continue(iter->i, status); 9061 } 9062 9063 static struct spdk_bdev * 9064 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 9065 { 9066 void *io_device = spdk_io_channel_iter_get_io_device(i); 9067 9068 return __bdev_from_io_dev(io_device); 9069 } 9070 9071 static void 9072 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 9073 { 9074 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 9075 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 9076 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 9077 9078 iter->i = i; 9079 
iter->fn(iter, bdev, ch, iter->ctx); 9080 } 9081 9082 static void 9083 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 9084 { 9085 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 9086 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 9087 9088 iter->i = i; 9089 iter->cpl(bdev, iter->ctx, status); 9090 9091 free(iter); 9092 } 9093 9094 void 9095 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 9096 void *ctx, spdk_bdev_for_each_channel_done cpl) 9097 { 9098 struct spdk_bdev_channel_iter *iter; 9099 9100 assert(bdev != NULL && fn != NULL && ctx != NULL); 9101 9102 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 9103 if (iter == NULL) { 9104 SPDK_ERRLOG("Unable to allocate iterator\n"); 9105 assert(false); 9106 return; 9107 } 9108 9109 iter->fn = fn; 9110 iter->cpl = cpl; 9111 iter->ctx = ctx; 9112 9113 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 9114 iter, bdev_each_channel_cpl); 9115 } 9116 9117 static void 9118 bdev_copy_do_write_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9119 { 9120 struct spdk_bdev_io *parent_io = cb_arg; 9121 9122 /* Check return status of write */ 9123 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 9124 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 9125 spdk_bdev_free_io(bdev_io); 9126 } 9127 9128 static void 9129 bdev_copy_do_write(void *_bdev_io) 9130 { 9131 struct spdk_bdev_io *bdev_io = _bdev_io; 9132 int rc; 9133 9134 /* Write blocks */ 9135 rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc, 9136 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs[0].iov_base, 9137 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks, 9138 bdev_io->u.bdev.num_blocks, bdev_copy_do_write_complete, bdev_io); 9139 9140 if (rc == -ENOMEM) { 9141 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write); 9142 } else if (rc != 0) { 9143 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 9144 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 9145 } 9146 } 9147 9148 static void 9149 bdev_copy_do_read_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9150 { 9151 struct spdk_bdev_io *parent_io = cb_arg; 9152 9153 /* Check return status of read */ 9154 if (!success) { 9155 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 9156 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 9157 spdk_bdev_free_io(bdev_io); 9158 return; 9159 } 9160 9161 spdk_bdev_free_io(bdev_io); 9162 9163 /* Do write */ 9164 bdev_copy_do_write(parent_io); 9165 } 9166 9167 static void 9168 bdev_copy_do_read(void *_bdev_io) 9169 { 9170 struct spdk_bdev_io *bdev_io = _bdev_io; 9171 int rc; 9172 9173 /* Read blocks */ 9174 rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc, 9175 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs[0].iov_base, 9176 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks, 9177 bdev_io->u.bdev.num_blocks, bdev_copy_do_read_complete, bdev_io); 9178 9179 if (rc == -ENOMEM) { 9180 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read); 9181 } else if (rc != 0) { 9182 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 9183 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 9184 } 9185 } 9186 9187 static void 9188 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 9189 { 9190 if (!success) { 9191 
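/* The data buffer for the emulated copy could not be obtained; complete the copy request as failed. */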
bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 9192 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 9193 return; 9194 } 9195 9196 bdev_copy_do_read(bdev_io); 9197 } 9198 9199 int 9200 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 9201 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 9202 spdk_bdev_io_completion_cb cb, void *cb_arg) 9203 { 9204 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9205 struct spdk_bdev_io *bdev_io; 9206 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 9207 9208 if (!desc->write) { 9209 return -EBADF; 9210 } 9211 9212 if (num_blocks == 0) { 9213 SPDK_ERRLOG("Can't copy 0 blocks\n"); 9214 return -EINVAL; 9215 } 9216 9217 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 9218 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 9219 SPDK_DEBUGLOG(bdev, 9220 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 9221 dst_offset_blocks, src_offset_blocks, num_blocks); 9222 return -EINVAL; 9223 } 9224 9225 bdev_io = bdev_channel_get_io(channel); 9226 if (!bdev_io) { 9227 return -ENOMEM; 9228 } 9229 9230 bdev_io->internal.ch = channel; 9231 bdev_io->internal.desc = desc; 9232 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 9233 9234 bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 9235 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 9236 bdev_io->u.bdev.num_blocks = num_blocks; 9237 bdev_io->u.bdev.memory_domain = NULL; 9238 bdev_io->u.bdev.memory_domain_ctx = NULL; 9239 bdev_io->u.bdev.iovs = NULL; 9240 bdev_io->u.bdev.iovcnt = 0; 9241 bdev_io->u.bdev.md_buf = NULL; 9242 bdev_io_init(bdev_io, bdev, cb_arg, cb); 9243 9244 if (dst_offset_blocks == src_offset_blocks) { 9245 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 9246 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 9247 9248 return 0; 9249 } 9250 9251 /* If the bdev backing device support copy directly, pass to it to process. 9252 * Else do general processing from bdev layer. 
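* The emulated path below obtains a buffer of num_blocks * block size bytes, reads from src_offset_blocks, and then writes the data to the destination offset (see bdev_copy_do_read and bdev_copy_do_write).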
9253 */ 9254 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 9255 bdev_io_submit(bdev_io); 9256 return 0; 9257 } 9258 9259 spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev)); 9260 9261 return 0; 9262 } 9263 9264 SPDK_LOG_REGISTER_COMPONENT(bdev) 9265 9266 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 9267 { 9268 struct spdk_trace_tpoint_opts opts[] = { 9269 { 9270 "BDEV_IO_START", TRACE_BDEV_IO_START, 9271 OWNER_BDEV, OBJECT_BDEV_IO, 1, 9272 { 9273 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 9274 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 9275 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 9276 { "len", SPDK_TRACE_ARG_TYPE_INT, 8 }, 9277 { "name", SPDK_TRACE_ARG_TYPE_STR, 40} 9278 } 9279 }, 9280 { 9281 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 9282 OWNER_BDEV, OBJECT_BDEV_IO, 0, 9283 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 9284 }, 9285 { 9286 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 9287 OWNER_BDEV, OBJECT_NONE, 1, 9288 { 9289 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 9290 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 9291 } 9292 }, 9293 { 9294 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 9295 OWNER_BDEV, OBJECT_NONE, 0, 9296 { 9297 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 9298 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 9299 } 9300 }, 9301 }; 9302 9303 9304 spdk_trace_register_owner(OWNER_BDEV, 'b'); 9305 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 9306 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 9307 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 9308 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 9309 } 9310
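/*
 * Usage sketch (illustrative only, not part of this library): one way a caller might drive the
 * descriptor and copy APIs defined above. It assumes an SPDK application thread, the usual
 * includes (spdk/bdev.h, spdk/thread.h, spdk/log.h), and an already-created bdev; the name
 * "Malloc0", the offsets, and the block count are hypothetical. In a real application the
 * descriptor would also be closed later with spdk_bdev_close() on the thread that opened it,
 * typically from the event callback on SPDK_BDEV_EVENT_REMOVE or during shutdown.
 *
 *	static void
 *	copy_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		struct spdk_io_channel *ch = cb_arg;
 *
 *		SPDK_NOTICELOG("copy %s\n", success ? "succeeded" : "failed");
 *		spdk_bdev_free_io(bdev_io);
 *		spdk_put_io_channel(ch);
 *	}
 *
 *	static void
 *	example_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx)
 *	{
 *		// React to SPDK_BDEV_EVENT_REMOVE and friends; a real caller closes the descriptor here.
 *	}
 *
 *	static int
 *	copy_example(void)
 *	{
 *		struct spdk_bdev_desc *desc;
 *		struct spdk_io_channel *ch;
 *		int rc;
 *
 *		// Copy requires a writable descriptor, so open with write = true.
 *		rc = spdk_bdev_open_ext("Malloc0", true, example_event_cb, NULL, &desc);
 *		if (rc != 0) {
 *			return rc;
 *		}
 *
 *		ch = spdk_bdev_get_io_channel(desc);
 *		if (ch == NULL) {
 *			spdk_bdev_close(desc);
 *			return -ENOMEM;
 *		}
 *
 *		// Copy 8 blocks from block 0 to block 1024 on the same bdev.
 *		rc = spdk_bdev_copy_blocks(desc, ch, 1024, 0, 8, copy_done, ch);
 *		if (rc != 0) {
 *			spdk_put_io_channel(ch);
 *			spdk_bdev_close(desc);
 *		}
 *		return rc;
 *	}
 */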