/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"
#include "spdk/dma.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE 256
#define SPDK_BDEV_AUTO_EXAMINE true
#define BUF_SMALL_POOL_SIZE 8191
#define BUF_LARGE_POOL_SIZE 1023
#define BUF_SMALL_CACHE_SIZE 128
#define BUF_LARGE_CACHE_SIZE 16
#define NOMEM_THRESHOLD_COUNT 8

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024)
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000

/* The maximum number of children requests for a UNMAP or WRITE ZEROES command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000

/* The maximum number of children requests for a COPY command
 * when splitting into children requests at a time.
61 */ 62 #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8) 63 64 #define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \ 65 log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev) 66 #ifdef DEBUG 67 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \ 68 log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev) 69 #else 70 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0) 71 #endif 72 73 static void log_already_claimed(enum spdk_log_level level, const int line, const char *func, 74 const char *detail, struct spdk_bdev *bdev); 75 76 SPDK_LOG_DEPRECATION_REGISTER(vtune_support, "Intel(R) VTune integration", "SPDK 23.05", 0); 77 78 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 79 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 80 }; 81 82 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 83 84 RB_HEAD(bdev_name_tree, spdk_bdev_name); 85 86 static int 87 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2) 88 { 89 return strcmp(name1->name, name2->name); 90 } 91 92 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp); 93 94 struct spdk_bdev_mgr { 95 struct spdk_mempool *bdev_io_pool; 96 97 void *zero_buffer; 98 99 TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 100 101 struct spdk_bdev_list bdevs; 102 struct bdev_name_tree bdev_names; 103 104 bool init_complete; 105 bool module_init_complete; 106 107 struct spdk_spinlock spinlock; 108 109 #ifdef SPDK_CONFIG_VTUNE 110 __itt_domain *domain; 111 #endif 112 }; 113 114 static struct spdk_bdev_mgr g_bdev_mgr = { 115 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 116 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 117 .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names), 118 .init_complete = false, 119 .module_init_complete = false, 120 }; 121 122 static void 123 __attribute__((constructor)) 124 _bdev_init(void) 125 { 126 spdk_spin_init(&g_bdev_mgr.spinlock); 127 } 128 129 typedef void (*lock_range_cb)(void *ctx, int status); 130 131 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc); 132 133 struct lba_range { 134 uint64_t offset; 135 uint64_t length; 136 void *locked_ctx; 137 struct spdk_bdev_channel *owner_ch; 138 TAILQ_ENTRY(lba_range) tailq; 139 }; 140 141 static struct spdk_bdev_opts g_bdev_opts = { 142 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 143 .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, 144 .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, 145 .small_buf_pool_size = BUF_SMALL_POOL_SIZE, 146 .large_buf_pool_size = BUF_LARGE_POOL_SIZE, 147 }; 148 149 static spdk_bdev_init_cb g_init_cb_fn = NULL; 150 static void *g_init_cb_arg = NULL; 151 152 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 153 static void *g_fini_cb_arg = NULL; 154 static struct spdk_thread *g_fini_thread = NULL; 155 156 struct spdk_bdev_qos_limit { 157 /** IOs or bytes allowed per second (i.e., 1s). */ 158 uint64_t limit; 159 160 /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). 161 * For remaining bytes, allowed to run negative if an I/O is submitted when 162 * some bytes are remaining, but the I/O is bigger than that amount. The 163 * excess will be deducted from the next timeslice. 164 */ 165 int64_t remaining_this_timeslice; 166 167 /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 168 uint32_t min_per_timeslice; 169 170 /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 171 uint32_t max_per_timeslice; 172 173 /** Function to check whether to queue the IO. 
*/ 174 bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 175 176 /** Function to update for the submitted IO. */ 177 void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 178 }; 179 180 struct spdk_bdev_qos { 181 /** Types of structure of rate limits. */ 182 struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 183 184 /** The channel that all I/O are funneled through. */ 185 struct spdk_bdev_channel *ch; 186 187 /** The thread on which the poller is running. */ 188 struct spdk_thread *thread; 189 190 /** Queue of I/O waiting to be issued. */ 191 bdev_io_tailq_t queued; 192 193 /** Size of a timeslice in tsc ticks. */ 194 uint64_t timeslice_size; 195 196 /** Timestamp of start of last timeslice. */ 197 uint64_t last_timeslice; 198 199 /** Poller that processes queued I/O commands each time slice. */ 200 struct spdk_poller *poller; 201 }; 202 203 struct spdk_bdev_mgmt_channel { 204 /* 205 * Each thread keeps a cache of bdev_io - this allows 206 * bdev threads which are *not* DPDK threads to still 207 * benefit from a per-thread bdev_io cache. Without 208 * this, non-DPDK threads fetching from the mempool 209 * incur a cmpxchg on get and put. 210 */ 211 bdev_io_stailq_t per_thread_cache; 212 uint32_t per_thread_cache_count; 213 uint32_t bdev_io_cache_size; 214 215 struct spdk_iobuf_channel iobuf; 216 217 TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; 218 TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; 219 }; 220 221 /* 222 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device 223 * will queue here their IO that awaits retry. It makes it possible to retry sending 224 * IO to one bdev after IO from other bdev completes. 225 */ 226 struct spdk_bdev_shared_resource { 227 /* The bdev management channel */ 228 struct spdk_bdev_mgmt_channel *mgmt_ch; 229 230 /* 231 * Count of I/O submitted to bdev module and waiting for completion. 232 * Incremented before submit_request() is called on an spdk_bdev_io. 233 */ 234 uint64_t io_outstanding; 235 236 /* 237 * Queue of IO awaiting retry because of a previous NOMEM status returned 238 * on this channel. 239 */ 240 bdev_io_tailq_t nomem_io; 241 242 /* 243 * Threshold which io_outstanding must drop to before retrying nomem_io. 244 */ 245 uint64_t nomem_threshold; 246 247 /* I/O channel allocated by a bdev module */ 248 struct spdk_io_channel *shared_ch; 249 250 /* Refcount of bdev channels using this resource */ 251 uint32_t ref; 252 253 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 254 }; 255 256 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 257 #define BDEV_CH_QOS_ENABLED (1 << 1) 258 259 struct spdk_bdev_channel { 260 struct spdk_bdev *bdev; 261 262 /* The channel for the underlying device */ 263 struct spdk_io_channel *channel; 264 265 /* Per io_device per thread data */ 266 struct spdk_bdev_shared_resource *shared_resource; 267 268 struct spdk_bdev_io_stat *stat; 269 270 /* 271 * Count of I/O submitted to the underlying dev module through this channel 272 * and waiting for completion. 273 */ 274 uint64_t io_outstanding; 275 276 /* 277 * List of all submitted I/Os including I/O that are generated via splitting. 278 */ 279 bdev_io_tailq_t io_submitted; 280 281 /* 282 * List of spdk_bdev_io that are currently queued because they write to a locked 283 * LBA range. 
284 */ 285 bdev_io_tailq_t io_locked; 286 287 uint32_t flags; 288 289 struct spdk_histogram_data *histogram; 290 291 #ifdef SPDK_CONFIG_VTUNE 292 uint64_t start_tsc; 293 uint64_t interval_tsc; 294 __itt_string_handle *handle; 295 struct spdk_bdev_io_stat *prev_stat; 296 #endif 297 298 bdev_io_tailq_t queued_resets; 299 300 lba_range_tailq_t locked_ranges; 301 }; 302 303 struct media_event_entry { 304 struct spdk_bdev_media_event event; 305 TAILQ_ENTRY(media_event_entry) tailq; 306 }; 307 308 #define MEDIA_EVENT_POOL_SIZE 64 309 310 struct spdk_bdev_desc { 311 struct spdk_bdev *bdev; 312 struct spdk_thread *thread; 313 struct { 314 spdk_bdev_event_cb_t event_fn; 315 void *ctx; 316 } callback; 317 bool closed; 318 bool write; 319 bool memory_domains_supported; 320 struct spdk_spinlock spinlock; 321 uint32_t refs; 322 TAILQ_HEAD(, media_event_entry) pending_media_events; 323 TAILQ_HEAD(, media_event_entry) free_media_events; 324 struct media_event_entry *media_events_buffer; 325 TAILQ_ENTRY(spdk_bdev_desc) link; 326 327 uint64_t timeout_in_sec; 328 spdk_bdev_io_timeout_cb cb_fn; 329 void *cb_arg; 330 struct spdk_poller *io_timeout_poller; 331 struct spdk_bdev_module_claim *claim; 332 }; 333 334 struct spdk_bdev_iostat_ctx { 335 struct spdk_bdev_io_stat *stat; 336 spdk_bdev_get_device_stat_cb cb; 337 void *cb_arg; 338 }; 339 340 struct set_qos_limit_ctx { 341 void (*cb_fn)(void *cb_arg, int status); 342 void *cb_arg; 343 struct spdk_bdev *bdev; 344 }; 345 346 struct spdk_bdev_channel_iter { 347 spdk_bdev_for_each_channel_msg fn; 348 spdk_bdev_for_each_channel_done cpl; 349 struct spdk_io_channel_iter *i; 350 void *ctx; 351 }; 352 353 struct spdk_bdev_io_error_stat { 354 uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS]; 355 }; 356 357 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 358 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 359 #define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch)) 360 #define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch)) 361 362 static inline void bdev_io_complete(void *ctx); 363 364 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 365 static void bdev_write_zero_buffer_next(void *_bdev_io); 366 367 static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 368 struct spdk_io_channel *ch, void *_ctx); 369 static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status); 370 371 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 372 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 373 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 374 struct spdk_bdev_ext_io_opts *opts, bool copy_opts); 375 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 376 struct iovec *iov, int iovcnt, void *md_buf, 377 uint64_t offset_blocks, uint64_t num_blocks, 378 spdk_bdev_io_completion_cb cb, void *cb_arg, 379 struct spdk_bdev_ext_io_opts *opts, bool copy_opts); 380 381 static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 382 uint64_t offset, uint64_t length, 383 lock_range_cb cb_fn, void *cb_arg); 384 385 static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 386 uint64_t offset, uint64_t length, 387 lock_range_cb cb_fn, void *cb_arg); 388 389 static inline void 
bdev_io_complete(void *ctx);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);

static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
static void claim_reset(struct spdk_bdev *bdev);

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(small_buf_pool_size);
	SET_FIELD(large_buf_pool_size);

	/* Do not remove this statement. Always update it when you add a new field,
	 * and do not forget to add the SET_FIELD statement for your added field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}

SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_small_buf_pool_size, "spdk_bdev_opts.small_buf_pool_size",
			      "v23.05", 0);
SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_large_buf_pool_size, "spdk_bdev_opts.large_buf_pool_size",
			      "v23.05", 0);
int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	struct spdk_iobuf_opts iobuf_opts;
	uint32_t min_pool_size;
	int rc;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
456 */ 457 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 458 if (opts->bdev_io_pool_size < min_pool_size) { 459 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 460 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 461 spdk_thread_get_count()); 462 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 463 return -1; 464 } 465 466 if (opts->small_buf_pool_size != BUF_SMALL_POOL_SIZE) { 467 SPDK_LOG_DEPRECATED(bdev_opts_small_buf_pool_size); 468 } 469 if (opts->large_buf_pool_size != BUF_LARGE_POOL_SIZE) { 470 SPDK_LOG_DEPRECATED(bdev_opts_large_buf_pool_size); 471 } 472 473 #define SET_FIELD(field) \ 474 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 475 g_bdev_opts.field = opts->field; \ 476 } \ 477 478 SET_FIELD(bdev_io_pool_size); 479 SET_FIELD(bdev_io_cache_size); 480 SET_FIELD(bdev_auto_examine); 481 SET_FIELD(small_buf_pool_size); 482 SET_FIELD(large_buf_pool_size); 483 484 spdk_iobuf_get_opts(&iobuf_opts); 485 iobuf_opts.small_pool_count = opts->small_buf_pool_size; 486 iobuf_opts.large_pool_count = opts->large_buf_pool_size; 487 488 rc = spdk_iobuf_set_opts(&iobuf_opts); 489 if (rc != 0) { 490 SPDK_ERRLOG("Failed to set iobuf opts\n"); 491 return -1; 492 } 493 494 g_bdev_opts.opts_size = opts->opts_size; 495 496 #undef SET_FIELD 497 498 return 0; 499 } 500 501 static struct spdk_bdev * 502 bdev_get_by_name(const char *bdev_name) 503 { 504 struct spdk_bdev_name find; 505 struct spdk_bdev_name *res; 506 507 find.name = (char *)bdev_name; 508 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 509 if (res != NULL) { 510 return res->bdev; 511 } 512 513 return NULL; 514 } 515 516 struct spdk_bdev * 517 spdk_bdev_get_by_name(const char *bdev_name) 518 { 519 struct spdk_bdev *bdev; 520 521 spdk_spin_lock(&g_bdev_mgr.spinlock); 522 bdev = bdev_get_by_name(bdev_name); 523 spdk_spin_unlock(&g_bdev_mgr.spinlock); 524 525 return bdev; 526 } 527 528 struct bdev_io_status_string { 529 enum spdk_bdev_io_status status; 530 const char *str; 531 }; 532 533 static const struct bdev_io_status_string bdev_io_status_strings[] = { 534 { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" }, 535 { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" }, 536 { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" }, 537 { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" }, 538 { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" }, 539 { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" }, 540 { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" }, 541 { SPDK_BDEV_IO_STATUS_FAILED, "failed" }, 542 { SPDK_BDEV_IO_STATUS_PENDING, "pending" }, 543 { SPDK_BDEV_IO_STATUS_SUCCESS, "success" }, 544 }; 545 546 static const char * 547 bdev_io_status_get_string(enum spdk_bdev_io_status status) 548 { 549 uint32_t i; 550 551 for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) { 552 if (bdev_io_status_strings[i].status == status) { 553 return bdev_io_status_strings[i].str; 554 } 555 } 556 557 return "reserved"; 558 } 559 560 struct spdk_bdev_wait_for_examine_ctx { 561 struct spdk_poller *poller; 562 spdk_bdev_wait_for_examine_cb cb_fn; 563 void *cb_arg; 564 }; 565 566 static bool bdev_module_all_actions_completed(void); 567 568 static int 569 bdev_wait_for_examine_cb(void *arg) 570 { 571 struct spdk_bdev_wait_for_examine_ctx *ctx = arg; 572 573 if (!bdev_module_all_actions_completed()) { 574 return SPDK_POLLER_IDLE; 575 } 576 577 spdk_poller_unregister(&ctx->poller); 578 
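	/* All registered bdev modules have completed their init/examine actions, so this
	 * one-shot poller unregisters itself (above), notifies the caller and frees the
	 * context allocated in spdk_bdev_wait_for_examine().
	 */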
ctx->cb_fn(ctx->cb_arg); 579 free(ctx); 580 581 return SPDK_POLLER_BUSY; 582 } 583 584 int 585 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 586 { 587 struct spdk_bdev_wait_for_examine_ctx *ctx; 588 589 ctx = calloc(1, sizeof(*ctx)); 590 if (ctx == NULL) { 591 return -ENOMEM; 592 } 593 ctx->cb_fn = cb_fn; 594 ctx->cb_arg = cb_arg; 595 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 596 597 return 0; 598 } 599 600 struct spdk_bdev_examine_item { 601 char *name; 602 TAILQ_ENTRY(spdk_bdev_examine_item) link; 603 }; 604 605 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); 606 607 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 608 g_bdev_examine_allowlist); 609 610 static inline bool 611 bdev_examine_allowlist_check(const char *name) 612 { 613 struct spdk_bdev_examine_item *item; 614 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 615 if (strcmp(name, item->name) == 0) { 616 return true; 617 } 618 } 619 return false; 620 } 621 622 static inline void 623 bdev_examine_allowlist_free(void) 624 { 625 struct spdk_bdev_examine_item *item; 626 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 627 item = TAILQ_FIRST(&g_bdev_examine_allowlist); 628 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 629 free(item->name); 630 free(item); 631 } 632 } 633 634 static inline bool 635 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 636 { 637 struct spdk_bdev_alias *tmp; 638 if (bdev_examine_allowlist_check(bdev->name)) { 639 return true; 640 } 641 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 642 if (bdev_examine_allowlist_check(tmp->alias.name)) { 643 return true; 644 } 645 } 646 return false; 647 } 648 649 static inline bool 650 bdev_ok_to_examine(struct spdk_bdev *bdev) 651 { 652 if (g_bdev_opts.bdev_auto_examine) { 653 return true; 654 } else { 655 return bdev_in_examine_allowlist(bdev); 656 } 657 } 658 659 static void 660 bdev_examine(struct spdk_bdev *bdev) 661 { 662 struct spdk_bdev_module *module; 663 struct spdk_bdev_module_claim *claim, *tmpclaim; 664 uint32_t action; 665 666 if (!bdev_ok_to_examine(bdev)) { 667 return; 668 } 669 670 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 671 if (module->examine_config) { 672 spdk_spin_lock(&module->internal.spinlock); 673 action = module->internal.action_in_progress; 674 module->internal.action_in_progress++; 675 spdk_spin_unlock(&module->internal.spinlock); 676 module->examine_config(bdev); 677 if (action != module->internal.action_in_progress) { 678 SPDK_ERRLOG("examine_config for module %s did not call " 679 "spdk_bdev_module_examine_done()\n", module->name); 680 } 681 } 682 } 683 684 spdk_spin_lock(&bdev->internal.spinlock); 685 686 switch (bdev->internal.claim_type) { 687 case SPDK_BDEV_CLAIM_NONE: 688 /* Examine by all bdev modules */ 689 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 690 if (module->examine_disk) { 691 spdk_spin_lock(&module->internal.spinlock); 692 module->internal.action_in_progress++; 693 spdk_spin_unlock(&module->internal.spinlock); 694 spdk_spin_unlock(&bdev->internal.spinlock); 695 module->examine_disk(bdev); 696 spdk_spin_lock(&bdev->internal.spinlock); 697 } 698 } 699 break; 700 case SPDK_BDEV_CLAIM_EXCL_WRITE: 701 /* Examine by the one bdev module with a v1 claim */ 702 module = bdev->internal.claim.v1.module; 703 if (module->examine_disk) { 704 spdk_spin_lock(&module->internal.spinlock); 705 module->internal.action_in_progress++; 706 
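			/* The action_in_progress count taken here is dropped when the module
			 * calls spdk_bdev_module_examine_done(), which keeps subsystem
			 * initialization from completing while examine_disk() is still running
			 * asynchronously.
			 */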
spdk_spin_unlock(&module->internal.spinlock); 707 spdk_spin_unlock(&bdev->internal.spinlock); 708 module->examine_disk(bdev); 709 return; 710 } 711 break; 712 default: 713 /* Examine by all bdev modules with a v2 claim */ 714 assert(claim_type_is_v2(bdev->internal.claim_type)); 715 /* 716 * Removal of tailq nodes while iterating can cause the iteration to jump out of the 717 * list, perhaps accessing freed memory. Without protection, this could happen 718 * while the lock is dropped during the examine callback. 719 */ 720 bdev->internal.examine_in_progress++; 721 722 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 723 module = claim->module; 724 725 if (module == NULL) { 726 /* This is a vestigial claim, held by examine_count */ 727 continue; 728 } 729 730 if (module->examine_disk == NULL) { 731 continue; 732 } 733 734 spdk_spin_lock(&module->internal.spinlock); 735 module->internal.action_in_progress++; 736 spdk_spin_unlock(&module->internal.spinlock); 737 738 /* Call examine_disk without holding internal.spinlock. */ 739 spdk_spin_unlock(&bdev->internal.spinlock); 740 module->examine_disk(bdev); 741 spdk_spin_lock(&bdev->internal.spinlock); 742 } 743 744 assert(bdev->internal.examine_in_progress > 0); 745 bdev->internal.examine_in_progress--; 746 if (bdev->internal.examine_in_progress == 0) { 747 /* Remove any claims that were released during examine_disk */ 748 TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) { 749 if (claim->desc != NULL) { 750 continue; 751 } 752 753 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link); 754 free(claim); 755 } 756 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 757 claim_reset(bdev); 758 } 759 } 760 } 761 762 spdk_spin_unlock(&bdev->internal.spinlock); 763 } 764 765 int 766 spdk_bdev_examine(const char *name) 767 { 768 struct spdk_bdev *bdev; 769 struct spdk_bdev_examine_item *item; 770 struct spdk_thread *thread = spdk_get_thread(); 771 772 if (spdk_unlikely(spdk_thread_get_app_thread() != thread)) { 773 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread, 774 thread ? 
spdk_thread_get_name(thread) : "null"); 775 return -EINVAL; 776 } 777 778 if (g_bdev_opts.bdev_auto_examine) { 779 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); 780 return -EINVAL; 781 } 782 783 if (bdev_examine_allowlist_check(name)) { 784 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 785 return -EEXIST; 786 } 787 788 item = calloc(1, sizeof(*item)); 789 if (!item) { 790 return -ENOMEM; 791 } 792 item->name = strdup(name); 793 if (!item->name) { 794 free(item); 795 return -ENOMEM; 796 } 797 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 798 799 bdev = spdk_bdev_get_by_name(name); 800 if (bdev) { 801 bdev_examine(bdev); 802 } 803 return 0; 804 } 805 806 static inline void 807 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 808 { 809 struct spdk_bdev_examine_item *item; 810 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 811 spdk_json_write_object_begin(w); 812 spdk_json_write_named_string(w, "method", "bdev_examine"); 813 spdk_json_write_named_object_begin(w, "params"); 814 spdk_json_write_named_string(w, "name", item->name); 815 spdk_json_write_object_end(w); 816 spdk_json_write_object_end(w); 817 } 818 } 819 820 struct spdk_bdev * 821 spdk_bdev_first(void) 822 { 823 struct spdk_bdev *bdev; 824 825 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 826 if (bdev) { 827 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 828 } 829 830 return bdev; 831 } 832 833 struct spdk_bdev * 834 spdk_bdev_next(struct spdk_bdev *prev) 835 { 836 struct spdk_bdev *bdev; 837 838 bdev = TAILQ_NEXT(prev, internal.link); 839 if (bdev) { 840 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 841 } 842 843 return bdev; 844 } 845 846 static struct spdk_bdev * 847 _bdev_next_leaf(struct spdk_bdev *bdev) 848 { 849 while (bdev != NULL) { 850 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 851 return bdev; 852 } else { 853 bdev = TAILQ_NEXT(bdev, internal.link); 854 } 855 } 856 857 return bdev; 858 } 859 860 struct spdk_bdev * 861 spdk_bdev_first_leaf(void) 862 { 863 struct spdk_bdev *bdev; 864 865 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 866 867 if (bdev) { 868 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 869 } 870 871 return bdev; 872 } 873 874 struct spdk_bdev * 875 spdk_bdev_next_leaf(struct spdk_bdev *prev) 876 { 877 struct spdk_bdev *bdev; 878 879 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 880 881 if (bdev) { 882 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 883 } 884 885 return bdev; 886 } 887 888 static inline bool 889 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 890 { 891 return bdev_io->internal.ext_opts && bdev_io->internal.ext_opts->memory_domain; 892 } 893 894 void 895 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 896 { 897 struct iovec *iovs; 898 899 if (bdev_io->u.bdev.iovs == NULL) { 900 bdev_io->u.bdev.iovs = &bdev_io->iov; 901 bdev_io->u.bdev.iovcnt = 1; 902 } 903 904 iovs = bdev_io->u.bdev.iovs; 905 906 assert(iovs != NULL); 907 assert(bdev_io->u.bdev.iovcnt >= 1); 908 909 iovs[0].iov_base = buf; 910 iovs[0].iov_len = len; 911 } 912 913 void 914 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 915 { 916 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 917 bdev_io->u.bdev.md_buf = md_buf; 918 } 919 920 static bool 921 _is_buf_allocated(const struct iovec *iovs) 922 { 923 if (iovs == NULL) { 924 return 
false; 925 } 926 927 return iovs[0].iov_base != NULL; 928 } 929 930 static bool 931 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 932 { 933 int i; 934 uintptr_t iov_base; 935 936 if (spdk_likely(alignment == 1)) { 937 return true; 938 } 939 940 for (i = 0; i < iovcnt; i++) { 941 iov_base = (uintptr_t)iovs[i].iov_base; 942 if ((iov_base & (alignment - 1)) != 0) { 943 return false; 944 } 945 } 946 947 return true; 948 } 949 950 static void 951 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 952 { 953 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 954 void *buf; 955 956 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 957 buf = bdev_io->internal.buf; 958 bdev_io->internal.buf = NULL; 959 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 960 bdev_io->internal.get_aux_buf_cb = NULL; 961 } else { 962 assert(bdev_io->internal.get_buf_cb != NULL); 963 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 964 bdev_io->internal.get_buf_cb = NULL; 965 } 966 } 967 968 static void 969 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 970 { 971 struct spdk_bdev_io *bdev_io = ctx; 972 973 if (rc) { 974 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 975 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 976 } 977 bdev_io_get_buf_complete(bdev_io, !rc); 978 } 979 980 static void 981 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 982 { 983 int rc = 0; 984 985 /* save original md_buf */ 986 bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 987 bdev_io->internal.orig_md_iov.iov_len = len; 988 bdev_io->internal.bounce_md_iov.iov_base = md_buf; 989 bdev_io->internal.bounce_md_iov.iov_len = len; 990 /* set bounce md_buf */ 991 bdev_io->u.bdev.md_buf = md_buf; 992 993 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 994 if (bdev_io_use_memory_domain(bdev_io)) { 995 rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain, 996 bdev_io->internal.ext_opts->memory_domain_ctx, 997 &bdev_io->internal.orig_md_iov, 1, 998 &bdev_io->internal.bounce_md_iov, 1, 999 bdev_io->internal.data_transfer_cpl, 1000 bdev_io); 1001 if (rc == 0) { 1002 /* Continue to submit IO in completion callback */ 1003 return; 1004 } 1005 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 1006 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain), rc); 1007 } else { 1008 memcpy(md_buf, bdev_io->internal.orig_md_iov.iov_base, bdev_io->internal.orig_md_iov.iov_len); 1009 } 1010 } 1011 1012 assert(bdev_io->internal.data_transfer_cpl); 1013 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1014 } 1015 1016 static void 1017 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 1018 { 1019 struct spdk_bdev *bdev = bdev_io->bdev; 1020 uint64_t md_len; 1021 void *buf; 1022 1023 if (spdk_bdev_is_md_separate(bdev)) { 1024 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 1025 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 1026 1027 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 1028 1029 if (bdev_io->u.bdev.md_buf != NULL) { 1030 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 1031 return; 1032 } else { 1033 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 1034 } 1035 } 1036 1037 bdev_io_get_buf_complete(bdev_io, true); 1038 } 1039 1040 static void 1041 _bdev_io_pull_bounce_data_buf_done(void *ctx, int rc) 1042 { 1043 struct spdk_bdev_io *bdev_io = ctx; 1044 1045 if (rc) { 1046 SPDK_ERRLOG("Failed to get data 
buffer\n"); 1047 assert(bdev_io->internal.data_transfer_cpl); 1048 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1049 return; 1050 } 1051 1052 _bdev_io_set_md_buf(bdev_io); 1053 } 1054 1055 static void 1056 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 1057 bdev_copy_bounce_buffer_cpl cpl_cb) 1058 { 1059 int rc = 0; 1060 1061 bdev_io->internal.data_transfer_cpl = cpl_cb; 1062 /* save original iovec */ 1063 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 1064 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 1065 /* set bounce iov */ 1066 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 1067 bdev_io->u.bdev.iovcnt = 1; 1068 /* set bounce buffer for this operation */ 1069 bdev_io->u.bdev.iovs[0].iov_base = buf; 1070 bdev_io->u.bdev.iovs[0].iov_len = len; 1071 /* if this is write path, copy data from original buffer to bounce buffer */ 1072 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1073 if (bdev_io_use_memory_domain(bdev_io)) { 1074 rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain, 1075 bdev_io->internal.ext_opts->memory_domain_ctx, 1076 bdev_io->internal.orig_iovs, 1077 (uint32_t) bdev_io->internal.orig_iovcnt, 1078 bdev_io->u.bdev.iovs, 1, 1079 _bdev_io_pull_bounce_data_buf_done, 1080 bdev_io); 1081 if (rc == 0) { 1082 /* Continue to submit IO in completion callback */ 1083 return; 1084 } 1085 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 1086 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 1087 } else { 1088 spdk_copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt); 1089 } 1090 } 1091 1092 _bdev_io_pull_bounce_data_buf_done(bdev_io, rc); 1093 } 1094 1095 static void 1096 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) 1097 { 1098 struct spdk_bdev *bdev = bdev_io->bdev; 1099 bool buf_allocated; 1100 uint64_t alignment; 1101 void *aligned_buf; 1102 1103 bdev_io->internal.buf = buf; 1104 1105 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1106 bdev_io_get_buf_complete(bdev_io, true); 1107 return; 1108 } 1109 1110 alignment = spdk_bdev_get_buf_align(bdev); 1111 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 1112 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 1113 1114 if (buf_allocated) { 1115 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 1116 /* Continue in completion callback */ 1117 return; 1118 } else { 1119 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 1120 } 1121 1122 _bdev_io_set_md_buf(bdev_io); 1123 } 1124 1125 static inline uint64_t 1126 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len) 1127 { 1128 struct spdk_bdev *bdev = bdev_io->bdev; 1129 uint64_t md_len, alignment; 1130 1131 md_len = spdk_bdev_is_md_separate(bdev) ? 
bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1132 alignment = spdk_bdev_get_buf_align(bdev); 1133 1134 return len + alignment + md_len; 1135 } 1136 1137 static void 1138 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 1139 { 1140 struct spdk_bdev_mgmt_channel *ch; 1141 1142 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1143 spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len)); 1144 } 1145 1146 static void 1147 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 1148 { 1149 assert(bdev_io->internal.buf != NULL); 1150 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len); 1151 bdev_io->internal.buf = NULL; 1152 } 1153 1154 void 1155 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1156 { 1157 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1158 1159 assert(buf != NULL); 1160 _bdev_io_put_buf(bdev_io, buf, len); 1161 } 1162 1163 static void 1164 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 1165 { 1166 struct spdk_bdev *bdev = bdev_ch->bdev; 1167 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1168 struct spdk_bdev_io *bdev_io; 1169 1170 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 1171 /* 1172 * Allow some more I/O to complete before retrying the nomem_io queue. 1173 * Some drivers (such as nvme) cannot immediately take a new I/O in 1174 * the context of a completion, because the resources for the I/O are 1175 * not released until control returns to the bdev poller. Also, we 1176 * may require several small I/O to complete before a larger I/O 1177 * (that requires splitting) can be submitted. 1178 */ 1179 return; 1180 } 1181 1182 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1183 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 1184 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 1185 bdev_io->internal.ch->io_outstanding++; 1186 shared_resource->io_outstanding++; 1187 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1188 bdev_io->internal.error.nvme.cdw0 = 0; 1189 bdev_io->num_retries++; 1190 bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 1191 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 1192 break; 1193 } 1194 } 1195 } 1196 1197 static inline void 1198 _bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1199 struct spdk_bdev_shared_resource *shared_resource) 1200 { 1201 assert(bdev_ch->io_outstanding > 0); 1202 assert(shared_resource->io_outstanding > 0); 1203 bdev_ch->io_outstanding--; 1204 shared_resource->io_outstanding--; 1205 } 1206 1207 static inline bool 1208 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io) 1209 { 1210 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1211 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1212 1213 if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) { 1214 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 1215 /* 1216 * Wait for some of the outstanding I/O to complete before we 1217 * retry any of the nomem_io. Normally we will wait for 1218 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 1219 * depth channels we will instead wait for half to complete. 
1220 */ 1221 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 1222 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 1223 return true; 1224 } 1225 1226 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1227 bdev_ch_retry_io(bdev_ch); 1228 } 1229 1230 return false; 1231 } 1232 1233 static void 1234 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1235 { 1236 struct spdk_bdev_io *bdev_io = ctx; 1237 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1238 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1239 1240 if (rc) { 1241 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1242 } 1243 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1244 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). 1245 */ 1246 bdev_io_put_buf(bdev_io); 1247 1248 /* Continue with IO completion flow */ 1249 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 1250 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 1251 return; 1252 } 1253 1254 bdev_io_complete(bdev_io); 1255 } 1256 1257 static inline void 1258 _bdev_io_push_bounce_md_buffer(struct spdk_bdev_io *bdev_io) 1259 { 1260 int rc = 0; 1261 1262 /* do the same for metadata buffer */ 1263 if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) { 1264 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1265 1266 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1267 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 1268 if (bdev_io_use_memory_domain(bdev_io)) { 1269 /* If memory domain is used then we need to call async push function */ 1270 rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain, 1271 bdev_io->internal.ext_opts->memory_domain_ctx, 1272 &bdev_io->internal.orig_md_iov, 1273 (uint32_t)bdev_io->internal.orig_iovcnt, 1274 &bdev_io->internal.bounce_md_iov, 1, 1275 bdev_io->internal.data_transfer_cpl, 1276 bdev_io); 1277 if (rc == 0) { 1278 /* Continue IO completion in async callback */ 1279 return; 1280 } 1281 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1282 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 1283 } else { 1284 memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1285 bdev_io->internal.orig_md_iov.iov_len); 1286 } 1287 } 1288 } 1289 1290 assert(bdev_io->internal.data_transfer_cpl); 1291 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1292 } 1293 1294 static void 1295 _bdev_io_push_bounce_data_buffer_done(void *ctx, int rc) 1296 { 1297 struct spdk_bdev_io *bdev_io = ctx; 1298 1299 assert(bdev_io->internal.data_transfer_cpl); 1300 1301 if (rc) { 1302 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1303 return; 1304 } 1305 1306 /* set original buffer for this io */ 1307 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 1308 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 1309 /* disable bouncing buffer for this io */ 1310 bdev_io->internal.orig_iovcnt = 0; 1311 bdev_io->internal.orig_iovs = NULL; 1312 1313 _bdev_io_push_bounce_md_buffer(bdev_io); 1314 } 1315 1316 static inline void 1317 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1318 { 1319 int rc = 0; 1320 1321 bdev_io->internal.data_transfer_cpl = cpl_cb; 1322 1323 /* if this is read path, copy data from bounce buffer to original buffer */ 1324 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1325 bdev_io->internal.status == 
SPDK_BDEV_IO_STATUS_SUCCESS) { 1326 if (bdev_io_use_memory_domain(bdev_io)) { 1327 /* If memory domain is used then we need to call async push function */ 1328 rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain, 1329 bdev_io->internal.ext_opts->memory_domain_ctx, 1330 bdev_io->internal.orig_iovs, 1331 (uint32_t)bdev_io->internal.orig_iovcnt, 1332 &bdev_io->internal.bounce_iov, 1, 1333 _bdev_io_push_bounce_data_buffer_done, 1334 bdev_io); 1335 if (rc == 0) { 1336 /* Continue IO completion in async callback */ 1337 return; 1338 } 1339 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1340 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 1341 } else { 1342 spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs, 1343 bdev_io->internal.orig_iovcnt, 1344 bdev_io->internal.bounce_iov.iov_base, 1345 bdev_io->internal.bounce_iov.iov_len); 1346 } 1347 } 1348 1349 _bdev_io_push_bounce_data_buffer_done(bdev_io, rc); 1350 } 1351 1352 static void 1353 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1354 { 1355 struct spdk_bdev_io *bdev_io; 1356 1357 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1358 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len); 1359 } 1360 1361 static void 1362 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1363 { 1364 struct spdk_bdev_mgmt_channel *mgmt_ch; 1365 uint64_t max_len; 1366 void *buf; 1367 1368 assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1369 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1370 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1371 1372 if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) { 1373 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1374 bdev_io_get_buf_complete(bdev_io, false); 1375 return; 1376 } 1377 1378 bdev_io->internal.buf_len = len; 1379 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1380 bdev_io_get_iobuf_cb); 1381 if (buf != NULL) { 1382 _bdev_io_set_buf(bdev_io, buf, len); 1383 } 1384 } 1385 1386 void 1387 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1388 { 1389 struct spdk_bdev *bdev = bdev_io->bdev; 1390 uint64_t alignment; 1391 1392 assert(cb != NULL); 1393 bdev_io->internal.get_buf_cb = cb; 1394 1395 alignment = spdk_bdev_get_buf_align(bdev); 1396 1397 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1398 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1399 /* Buffer already present and aligned */ 1400 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1401 return; 1402 } 1403 1404 bdev_io_get_buf(bdev_io, len); 1405 } 1406 1407 static void 1408 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1409 bool success) 1410 { 1411 if (!success) { 1412 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1413 bdev_io_complete(bdev_io); 1414 } else { 1415 bdev_io_submit(bdev_io); 1416 } 1417 } 1418 1419 static void 1420 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1421 uint64_t len) 1422 { 1423 assert(cb != NULL); 1424 bdev_io->internal.get_buf_cb = cb; 1425 1426 bdev_io_get_buf(bdev_io, len); 1427 } 1428 1429 void 1430 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1431 { 1432 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1433 1434 assert(cb != NULL); 1435 assert(bdev_io->internal.get_aux_buf_cb == NULL); 
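	/* Only one auxiliary buffer request may be outstanding per bdev_io (see the assert
	 * above). Record the callback and reuse the regular get-buf path; the buffer is
	 * handed back through get_aux_buf_cb in bdev_io_get_buf_complete().
	 */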
1436 bdev_io->internal.get_aux_buf_cb = cb; 1437 bdev_io_get_buf(bdev_io, len); 1438 } 1439 1440 static int 1441 bdev_module_get_max_ctx_size(void) 1442 { 1443 struct spdk_bdev_module *bdev_module; 1444 int max_bdev_module_size = 0; 1445 1446 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1447 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1448 max_bdev_module_size = bdev_module->get_ctx_size(); 1449 } 1450 } 1451 1452 return max_bdev_module_size; 1453 } 1454 1455 static void 1456 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1457 { 1458 int i; 1459 struct spdk_bdev_qos *qos = bdev->internal.qos; 1460 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1461 1462 if (!qos) { 1463 return; 1464 } 1465 1466 spdk_bdev_get_qos_rate_limits(bdev, limits); 1467 1468 spdk_json_write_object_begin(w); 1469 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1470 1471 spdk_json_write_named_object_begin(w, "params"); 1472 spdk_json_write_named_string(w, "name", bdev->name); 1473 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1474 if (limits[i] > 0) { 1475 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1476 } 1477 } 1478 spdk_json_write_object_end(w); 1479 1480 spdk_json_write_object_end(w); 1481 } 1482 1483 void 1484 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1485 { 1486 struct spdk_bdev_module *bdev_module; 1487 struct spdk_bdev *bdev; 1488 1489 assert(w != NULL); 1490 1491 spdk_json_write_array_begin(w); 1492 1493 spdk_json_write_object_begin(w); 1494 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1495 spdk_json_write_named_object_begin(w, "params"); 1496 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1497 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1498 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1499 spdk_json_write_object_end(w); 1500 spdk_json_write_object_end(w); 1501 1502 bdev_examine_allowlist_config_json(w); 1503 1504 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1505 if (bdev_module->config_json) { 1506 bdev_module->config_json(w); 1507 } 1508 } 1509 1510 spdk_spin_lock(&g_bdev_mgr.spinlock); 1511 1512 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1513 if (bdev->fn_table->write_config_json) { 1514 bdev->fn_table->write_config_json(bdev, w); 1515 } 1516 1517 bdev_qos_config_json(bdev, w); 1518 } 1519 1520 spdk_spin_unlock(&g_bdev_mgr.spinlock); 1521 1522 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1523 spdk_json_write_object_begin(w); 1524 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1525 spdk_json_write_object_end(w); 1526 1527 spdk_json_write_array_end(w); 1528 } 1529 1530 static void 1531 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1532 { 1533 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1534 struct spdk_bdev_io *bdev_io; 1535 1536 spdk_iobuf_channel_fini(&ch->iobuf); 1537 1538 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1539 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1540 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1541 ch->per_thread_cache_count--; 1542 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1543 } 1544 1545 assert(ch->per_thread_cache_count == 0); 1546 } 1547 1548 static int 1549 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1550 { 1551 
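	/* Per-thread management channel setup: create the iobuf channel used for data
	 * buffers and pre-populate the bdev_io cache from the global pool so this thread
	 * cannot be starved (see the per_thread_cache comment on
	 * struct spdk_bdev_mgmt_channel).
	 */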
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;
	uint32_t i;
	int rc;

	rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", BUF_SMALL_CACHE_SIZE, BUF_LARGE_CACHE_SIZE);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc));
		return -1;
	}

	STAILQ_INIT(&ch->per_thread_cache);
	ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;

	/* Pre-populate bdev_io cache to ensure this thread cannot be starved. */
	ch->per_thread_cache_count = 0;
	for (i = 0; i < ch->bdev_io_cache_size; i++) {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
		if (bdev_io == NULL) {
			SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n");
			assert(false);
			bdev_mgmt_channel_destroy(io_device, ctx_buf);
			return -1;
		}
		ch->per_thread_cache_count++;
		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
	}

	TAILQ_INIT(&ch->shared_resources);
	TAILQ_INIT(&ch->io_wait_queue);

	return 0;
}

static void
bdev_init_complete(int rc)
{
	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
	void *cb_arg = g_init_cb_arg;
	struct spdk_bdev_module *m;

	g_bdev_mgr.init_complete = true;
	g_init_cb_fn = NULL;
	g_init_cb_arg = NULL;

	/*
	 * For modules that need to know when subsystem init is complete,
	 * inform them now.
	 */
	if (rc == 0) {
		TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (m->init_complete) {
				m->init_complete();
			}
		}
	}

	cb_fn(cb_arg, rc);
}

static bool
bdev_module_all_actions_completed(void)
{
	struct spdk_bdev_module *m;

	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (m->internal.action_in_progress > 0) {
			return false;
		}
	}
	return true;
}

static void
bdev_module_action_complete(void)
{
	/*
	 * Don't finish bdev subsystem initialization if
	 * module pre-initialization is still in progress, or
	 * the subsystem has already been initialized.
	 */
	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
		return;
	}

	/*
	 * Check all bdev modules for inits/examinations in progress. If any
	 * exist, return immediately since we cannot finish bdev subsystem
	 * initialization until all are completed.
	 */
	if (!bdev_module_all_actions_completed()) {
		return;
	}

	/*
	 * Modules already finished initialization - now that all
	 * the bdev modules have finished their asynchronous I/O
	 * processing, the entire bdev layer can be marked as complete.
1649 */ 1650 bdev_init_complete(0); 1651 } 1652 1653 static void 1654 bdev_module_action_done(struct spdk_bdev_module *module) 1655 { 1656 spdk_spin_lock(&module->internal.spinlock); 1657 assert(module->internal.action_in_progress > 0); 1658 module->internal.action_in_progress--; 1659 spdk_spin_unlock(&module->internal.spinlock); 1660 bdev_module_action_complete(); 1661 } 1662 1663 void 1664 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 1665 { 1666 assert(module->async_init); 1667 bdev_module_action_done(module); 1668 } 1669 1670 void 1671 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 1672 { 1673 bdev_module_action_done(module); 1674 } 1675 1676 /** The last initialized bdev module */ 1677 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 1678 1679 static void 1680 bdev_init_failed(void *cb_arg) 1681 { 1682 struct spdk_bdev_module *module = cb_arg; 1683 1684 spdk_spin_lock(&module->internal.spinlock); 1685 assert(module->internal.action_in_progress > 0); 1686 module->internal.action_in_progress--; 1687 spdk_spin_unlock(&module->internal.spinlock); 1688 bdev_init_complete(-1); 1689 } 1690 1691 static int 1692 bdev_modules_init(void) 1693 { 1694 struct spdk_bdev_module *module; 1695 int rc = 0; 1696 1697 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1698 g_resume_bdev_module = module; 1699 if (module->async_init) { 1700 spdk_spin_lock(&module->internal.spinlock); 1701 module->internal.action_in_progress = 1; 1702 spdk_spin_unlock(&module->internal.spinlock); 1703 } 1704 rc = module->module_init(); 1705 if (rc != 0) { 1706 /* Bump action_in_progress to prevent other modules from completion of modules_init 1707 * Send message to defer application shutdown until resources are cleaned up */ 1708 spdk_spin_lock(&module->internal.spinlock); 1709 module->internal.action_in_progress = 1; 1710 spdk_spin_unlock(&module->internal.spinlock); 1711 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 1712 return rc; 1713 } 1714 } 1715 1716 g_resume_bdev_module = NULL; 1717 return 0; 1718 } 1719 1720 void 1721 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 1722 { 1723 int rc = 0; 1724 char mempool_name[32]; 1725 1726 assert(cb_fn != NULL); 1727 1728 g_init_cb_fn = cb_fn; 1729 g_init_cb_arg = cb_arg; 1730 1731 spdk_notify_type_register("bdev_register"); 1732 spdk_notify_type_register("bdev_unregister"); 1733 1734 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 1735 1736 rc = spdk_iobuf_register_module("bdev"); 1737 if (rc != 0) { 1738 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 1739 bdev_init_complete(-1); 1740 return; 1741 } 1742 1743 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 1744 g_bdev_opts.bdev_io_pool_size, 1745 sizeof(struct spdk_bdev_io) + 1746 bdev_module_get_max_ctx_size(), 1747 0, 1748 SPDK_ENV_SOCKET_ID_ANY); 1749 1750 if (g_bdev_mgr.bdev_io_pool == NULL) { 1751 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 1752 bdev_init_complete(-1); 1753 return; 1754 } 1755 1756 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 1757 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 1758 if (!g_bdev_mgr.zero_buffer) { 1759 SPDK_ERRLOG("create bdev zero buffer failed\n"); 1760 bdev_init_complete(-1); 1761 return; 1762 } 1763 1764 #ifdef SPDK_CONFIG_VTUNE 1765 SPDK_LOG_DEPRECATED(vtune_support); 1766 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 1767 #endif 1768 1769 spdk_io_device_register(&g_bdev_mgr, 
bdev_mgmt_channel_create, 1770 bdev_mgmt_channel_destroy, 1771 sizeof(struct spdk_bdev_mgmt_channel), 1772 "bdev_mgr"); 1773 1774 rc = bdev_modules_init(); 1775 g_bdev_mgr.module_init_complete = true; 1776 if (rc != 0) { 1777 SPDK_ERRLOG("bdev modules init failed\n"); 1778 return; 1779 } 1780 1781 bdev_module_action_complete(); 1782 } 1783 1784 static void 1785 bdev_mgr_unregister_cb(void *io_device) 1786 { 1787 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 1788 1789 if (g_bdev_mgr.bdev_io_pool) { 1790 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 1791 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 1792 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 1793 g_bdev_opts.bdev_io_pool_size); 1794 } 1795 1796 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 1797 } 1798 1799 spdk_free(g_bdev_mgr.zero_buffer); 1800 1801 bdev_examine_allowlist_free(); 1802 1803 cb_fn(g_fini_cb_arg); 1804 g_fini_cb_fn = NULL; 1805 g_fini_cb_arg = NULL; 1806 g_bdev_mgr.init_complete = false; 1807 g_bdev_mgr.module_init_complete = false; 1808 } 1809 1810 static void 1811 bdev_module_fini_iter(void *arg) 1812 { 1813 struct spdk_bdev_module *bdev_module; 1814 1815 /* FIXME: Handling initialization failures is broken now, 1816 * so we won't even try cleaning up after successfully 1817 * initialized modules. if module_init_complete is false, 1818 * just call spdk_bdev_mgr_unregister_cb 1819 */ 1820 if (!g_bdev_mgr.module_init_complete) { 1821 bdev_mgr_unregister_cb(NULL); 1822 return; 1823 } 1824 1825 /* Start iterating from the last touched module */ 1826 if (!g_resume_bdev_module) { 1827 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1828 } else { 1829 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 1830 internal.tailq); 1831 } 1832 1833 while (bdev_module) { 1834 if (bdev_module->async_fini) { 1835 /* Save our place so we can resume later. We must 1836 * save the variable here, before calling module_fini() 1837 * below, because in some cases the module may immediately 1838 * call spdk_bdev_module_fini_done() and re-enter 1839 * this function to continue iterating. */ 1840 g_resume_bdev_module = bdev_module; 1841 } 1842 1843 if (bdev_module->module_fini) { 1844 bdev_module->module_fini(); 1845 } 1846 1847 if (bdev_module->async_fini) { 1848 return; 1849 } 1850 1851 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 1852 internal.tailq); 1853 } 1854 1855 g_resume_bdev_module = NULL; 1856 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 1857 } 1858 1859 void 1860 spdk_bdev_module_fini_done(void) 1861 { 1862 if (spdk_get_thread() != g_fini_thread) { 1863 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 1864 } else { 1865 bdev_module_fini_iter(NULL); 1866 } 1867 } 1868 1869 static void 1870 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 1871 { 1872 struct spdk_bdev *bdev = cb_arg; 1873 1874 if (bdeverrno && bdev) { 1875 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 1876 bdev->name); 1877 1878 /* 1879 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 1880 * bdev; try to continue by manually removing this bdev from the list and continue 1881 * with the next bdev in the list. 
1882 */
1883 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
1884 }
1885
1886 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
1887 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n");
1888 /*
1889 * Bdev module finish needs to be deferred as we might be in the middle of some context
1890 * (like bdev part free) that will use this bdev (or private bdev driver ctx data)
1891 * after returning.
1892 */
1893 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL);
1894 return;
1895 }
1896
1897 /*
1898 * Unregister the last unclaimed bdev in the list to ensure that bdev subsystem
1899 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity
1900 * to detect clean shutdown as opposed to run-time hot removal of the underlying
1901 * base bdevs.
1902 *
1903 * Also, walk the list in reverse order.
1904 */
1905 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
1906 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
1907 spdk_spin_lock(&bdev->internal.spinlock);
1908 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
1909 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev);
1910 spdk_spin_unlock(&bdev->internal.spinlock);
1911 continue;
1912 }
1913 spdk_spin_unlock(&bdev->internal.spinlock);
1914
1915 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name);
1916 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
1917 return;
1918 }
1919
1920 /*
1921 * If any bdev fails to unclaim its underlying bdev properly, we may end up
1922 * with a bdev list consisting only of claimed bdevs (if claims are managed
1923 * correctly, this would mean there's a loop in the claims graph, which is
1924 * clearly impossible). In that case, warn and unregister the last bdev on the list.
1925 */
1926 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
1927 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
1928 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name);
1929 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
1930 return;
1931 }
1932 }
1933
1934 static void
1935 bdev_module_fini_start_iter(void *arg)
1936 {
1937 struct spdk_bdev_module *bdev_module;
1938
1939 if (!g_resume_bdev_module) {
1940 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
1941 } else {
1942 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq);
1943 }
1944
1945 while (bdev_module) {
1946 if (bdev_module->async_fini_start) {
1947 /* Save our place so we can resume later. We must
1948 * save the variable here, before calling fini_start()
1949 * below, because in some cases the module may immediately
1950 * call spdk_bdev_module_fini_start_done() and re-enter
1951 * this function to continue iterating.
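 *
 * A hedged sketch of the pattern an asynchronous module follows (the function
 * names here are hypothetical, not part of this file): the module sets
 * .async_fini_start to true and its fini_start callback only kicks off the
 * teardown, signalling completion later, possibly from another thread:
 *
 *   static void my_teardown_done_cb(void *ctx)
 *   {
 *           spdk_bdev_module_fini_start_done();
 *   }
 *
 *   static void my_fini_start(void)
 *   {
 *           my_start_async_teardown(my_teardown_done_cb, NULL);
 *   }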
*/ 1952 g_resume_bdev_module = bdev_module; 1953 } 1954 1955 if (bdev_module->fini_start) { 1956 bdev_module->fini_start(); 1957 } 1958 1959 if (bdev_module->async_fini_start) { 1960 return; 1961 } 1962 1963 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 1964 } 1965 1966 g_resume_bdev_module = NULL; 1967 1968 bdev_finish_unregister_bdevs_iter(NULL, 0); 1969 } 1970 1971 void 1972 spdk_bdev_module_fini_start_done(void) 1973 { 1974 if (spdk_get_thread() != g_fini_thread) { 1975 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 1976 } else { 1977 bdev_module_fini_start_iter(NULL); 1978 } 1979 } 1980 1981 static void 1982 bdev_finish_wait_for_examine_done(void *cb_arg) 1983 { 1984 bdev_module_fini_start_iter(NULL); 1985 } 1986 1987 void 1988 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 1989 { 1990 int rc; 1991 1992 assert(cb_fn != NULL); 1993 1994 g_fini_thread = spdk_get_thread(); 1995 1996 g_fini_cb_fn = cb_fn; 1997 g_fini_cb_arg = cb_arg; 1998 1999 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2000 if (rc != 0) { 2001 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2002 bdev_finish_wait_for_examine_done(NULL); 2003 } 2004 } 2005 2006 struct spdk_bdev_io * 2007 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2008 { 2009 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2010 struct spdk_bdev_io *bdev_io; 2011 2012 if (ch->per_thread_cache_count > 0) { 2013 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2014 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2015 ch->per_thread_cache_count--; 2016 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2017 /* 2018 * Don't try to look for bdev_ios in the global pool if there are 2019 * waiters on bdev_ios - we don't want this caller to jump the line. 2020 */ 2021 bdev_io = NULL; 2022 } else { 2023 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2024 } 2025 2026 return bdev_io; 2027 } 2028 2029 void 2030 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2031 { 2032 struct spdk_bdev_mgmt_channel *ch; 2033 2034 assert(bdev_io != NULL); 2035 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2036 2037 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2038 2039 if (bdev_io->internal.buf != NULL) { 2040 bdev_io_put_buf(bdev_io); 2041 } 2042 2043 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2044 ch->per_thread_cache_count++; 2045 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2046 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2047 struct spdk_bdev_io_wait_entry *entry; 2048 2049 entry = TAILQ_FIRST(&ch->io_wait_queue); 2050 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2051 entry->cb_fn(entry->cb_arg); 2052 } 2053 } else { 2054 /* We should never have a full cache with entries on the io wait queue. 
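 *
 * For context, a submitter that got -ENOMEM from a bdev call typically parks
 * itself on this wait queue; a hedged sketch of that pattern follows, where
 * ctx, its embedded wait_entry field and retry_submit() are hypothetical (the
 * entry must stay valid until cb_fn runs, hence no stack storage):
 *
 *   ctx->wait_entry.bdev = bdev;
 *   ctx->wait_entry.cb_fn = retry_submit;
 *   ctx->wait_entry.cb_arg = ctx;
 *   spdk_bdev_queue_io_wait(bdev, io_ch, &ctx->wait_entry);
 *
 * Waiters are serviced from the head of the queue as bdev_ios are returned to
 * the per-thread cache (see the loop above).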
*/ 2055 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2056 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2057 } 2058 } 2059 2060 static bool 2061 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2062 { 2063 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2064 2065 switch (limit) { 2066 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2067 return true; 2068 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2069 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2070 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2071 return false; 2072 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2073 default: 2074 return false; 2075 } 2076 } 2077 2078 static bool 2079 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2080 { 2081 switch (bdev_io->type) { 2082 case SPDK_BDEV_IO_TYPE_NVME_IO: 2083 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2084 case SPDK_BDEV_IO_TYPE_READ: 2085 case SPDK_BDEV_IO_TYPE_WRITE: 2086 return true; 2087 case SPDK_BDEV_IO_TYPE_ZCOPY: 2088 if (bdev_io->u.bdev.zcopy.start) { 2089 return true; 2090 } else { 2091 return false; 2092 } 2093 default: 2094 return false; 2095 } 2096 } 2097 2098 static bool 2099 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2100 { 2101 switch (bdev_io->type) { 2102 case SPDK_BDEV_IO_TYPE_NVME_IO: 2103 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2104 /* Bit 1 (0x2) set for read operation */ 2105 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2106 return true; 2107 } else { 2108 return false; 2109 } 2110 case SPDK_BDEV_IO_TYPE_READ: 2111 return true; 2112 case SPDK_BDEV_IO_TYPE_ZCOPY: 2113 /* Populate to read from disk */ 2114 if (bdev_io->u.bdev.zcopy.populate) { 2115 return true; 2116 } else { 2117 return false; 2118 } 2119 default: 2120 return false; 2121 } 2122 } 2123 2124 static uint64_t 2125 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2126 { 2127 struct spdk_bdev *bdev = bdev_io->bdev; 2128 2129 switch (bdev_io->type) { 2130 case SPDK_BDEV_IO_TYPE_NVME_IO: 2131 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2132 return bdev_io->u.nvme_passthru.nbytes; 2133 case SPDK_BDEV_IO_TYPE_READ: 2134 case SPDK_BDEV_IO_TYPE_WRITE: 2135 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2136 case SPDK_BDEV_IO_TYPE_ZCOPY: 2137 /* Track the data in the start phase only */ 2138 if (bdev_io->u.bdev.zcopy.start) { 2139 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2140 } else { 2141 return 0; 2142 } 2143 default: 2144 return 0; 2145 } 2146 } 2147 2148 static bool 2149 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2150 { 2151 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 2152 return true; 2153 } else { 2154 return false; 2155 } 2156 } 2157 2158 static bool 2159 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2160 { 2161 if (bdev_is_read_io(io) == false) { 2162 return false; 2163 } 2164 2165 return bdev_qos_rw_queue_io(limit, io); 2166 } 2167 2168 static bool 2169 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2170 { 2171 if (bdev_is_read_io(io) == true) { 2172 return false; 2173 } 2174 2175 return bdev_qos_rw_queue_io(limit, io); 2176 } 2177 2178 static void 2179 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2180 { 2181 limit->remaining_this_timeslice--; 2182 } 2183 2184 static void 2185 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2186 { 2187 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 2188 } 2189 2190 static void 2191 
bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2192 { 2193 if (bdev_is_read_io(io) == false) { 2194 return; 2195 } 2196 2197 return bdev_qos_rw_bps_update_quota(limit, io); 2198 } 2199 2200 static void 2201 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2202 { 2203 if (bdev_is_read_io(io) == true) { 2204 return; 2205 } 2206 2207 return bdev_qos_rw_bps_update_quota(limit, io); 2208 } 2209 2210 static void 2211 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2212 { 2213 int i; 2214 2215 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2216 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2217 qos->rate_limits[i].queue_io = NULL; 2218 qos->rate_limits[i].update_quota = NULL; 2219 continue; 2220 } 2221 2222 switch (i) { 2223 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2224 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2225 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2226 break; 2227 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2228 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2229 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2230 break; 2231 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2232 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2233 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2234 break; 2235 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2236 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2237 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2238 break; 2239 default: 2240 break; 2241 } 2242 } 2243 } 2244 2245 static void 2246 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2247 struct spdk_bdev_io *bdev_io, 2248 enum spdk_bdev_io_status status) 2249 { 2250 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2251 2252 bdev_io->internal.in_submit_request = true; 2253 bdev_ch->io_outstanding++; 2254 shared_resource->io_outstanding++; 2255 spdk_bdev_io_complete(bdev_io, status); 2256 bdev_io->internal.in_submit_request = false; 2257 } 2258 2259 static inline void 2260 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2261 { 2262 struct spdk_bdev *bdev = bdev_io->bdev; 2263 struct spdk_io_channel *ch = bdev_ch->channel; 2264 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2265 2266 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2267 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2268 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2269 2270 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2271 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2272 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2273 SPDK_BDEV_IO_STATUS_SUCCESS); 2274 return; 2275 } 2276 } 2277 2278 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2279 bdev_io->bdev->split_on_write_unit && 2280 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2281 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2282 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2283 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2284 return; 2285 } 2286 2287 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2288 bdev_ch->io_outstanding++; 2289 shared_resource->io_outstanding++; 2290 bdev_io->internal.in_submit_request = true; 2291 bdev->fn_table->submit_request(ch, bdev_io); 2292 
bdev_io->internal.in_submit_request = false; 2293 } else { 2294 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 2295 } 2296 } 2297 2298 static bool 2299 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2300 { 2301 int i; 2302 2303 if (bdev_qos_io_to_limit(bdev_io) == true) { 2304 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2305 if (!qos->rate_limits[i].queue_io) { 2306 continue; 2307 } 2308 2309 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2310 bdev_io) == true) { 2311 return true; 2312 } 2313 } 2314 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2315 if (!qos->rate_limits[i].update_quota) { 2316 continue; 2317 } 2318 2319 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2320 } 2321 } 2322 2323 return false; 2324 } 2325 2326 static inline void 2327 _bdev_io_do_submit(void *ctx) 2328 { 2329 struct spdk_bdev_io *bdev_io = ctx; 2330 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2331 2332 bdev_io_do_submit(ch, bdev_io); 2333 } 2334 2335 static int 2336 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2337 { 2338 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2339 int submitted_ios = 0; 2340 2341 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2342 if (!bdev_qos_queue_io(qos, bdev_io)) { 2343 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2344 2345 if (bdev_io->internal.io_submit_ch) { 2346 /* Send back the IO to the original thread for the actual processing. */ 2347 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 2348 bdev_io->internal.io_submit_ch = NULL; 2349 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 2350 _bdev_io_do_submit, bdev_io); 2351 } else { 2352 bdev_io_do_submit(ch, bdev_io); 2353 } 2354 2355 submitted_ios++; 2356 } 2357 } 2358 2359 return submitted_ios; 2360 } 2361 2362 static void 2363 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2364 { 2365 int rc; 2366 2367 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2368 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2369 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2370 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2371 &bdev_io->internal.waitq_entry); 2372 if (rc != 0) { 2373 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2374 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2375 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2376 } 2377 } 2378 2379 static bool 2380 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2381 { 2382 uint32_t io_boundary; 2383 struct spdk_bdev *bdev = bdev_io->bdev; 2384 uint32_t max_size = bdev->max_segment_size; 2385 int max_segs = bdev->max_num_segments; 2386 2387 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2388 io_boundary = bdev->write_unit_size; 2389 } else if (bdev->split_on_optimal_io_boundary) { 2390 io_boundary = bdev->optimal_io_boundary; 2391 } else { 2392 io_boundary = 0; 2393 } 2394 2395 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2396 return false; 2397 } 2398 2399 if (io_boundary) { 2400 uint64_t start_stripe, end_stripe; 2401 2402 start_stripe = bdev_io->u.bdev.offset_blocks; 2403 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2404 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
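 *
 * Worked example (illustrative): with io_boundary = 8 blocks, an I/O at
 * offset_blocks = 6 spanning num_blocks = 4 touches blocks 6..9, so
 * start_stripe = 6 >> 3 = 0 and end_stripe = 9 >> 3 = 1; the stripes differ,
 * meaning the request crosses a boundary and must be split.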
*/ 2405 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2406 start_stripe >>= spdk_u32log2(io_boundary); 2407 end_stripe >>= spdk_u32log2(io_boundary); 2408 } else { 2409 start_stripe /= io_boundary; 2410 end_stripe /= io_boundary; 2411 } 2412 2413 if (start_stripe != end_stripe) { 2414 return true; 2415 } 2416 } 2417 2418 if (max_segs) { 2419 if (bdev_io->u.bdev.iovcnt > max_segs) { 2420 return true; 2421 } 2422 } 2423 2424 if (max_size) { 2425 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2426 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2427 return true; 2428 } 2429 } 2430 } 2431 2432 return false; 2433 } 2434 2435 static bool 2436 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2437 { 2438 uint32_t num_unmap_segments; 2439 2440 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2441 return false; 2442 } 2443 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2444 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2445 return true; 2446 } 2447 2448 return false; 2449 } 2450 2451 static bool 2452 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2453 { 2454 if (!bdev_io->bdev->max_write_zeroes) { 2455 return false; 2456 } 2457 2458 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2459 return true; 2460 } 2461 2462 return false; 2463 } 2464 2465 static bool 2466 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2467 { 2468 if (bdev_io->bdev->max_copy != 0 && 2469 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2470 return true; 2471 } 2472 2473 return false; 2474 } 2475 2476 static bool 2477 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2478 { 2479 switch (bdev_io->type) { 2480 case SPDK_BDEV_IO_TYPE_READ: 2481 case SPDK_BDEV_IO_TYPE_WRITE: 2482 return bdev_rw_should_split(bdev_io); 2483 case SPDK_BDEV_IO_TYPE_UNMAP: 2484 return bdev_unmap_should_split(bdev_io); 2485 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2486 return bdev_write_zeroes_should_split(bdev_io); 2487 case SPDK_BDEV_IO_TYPE_COPY: 2488 return bdev_copy_should_split(bdev_io); 2489 default: 2490 return false; 2491 } 2492 } 2493 2494 static uint32_t 2495 _to_next_boundary(uint64_t offset, uint32_t boundary) 2496 { 2497 return (boundary - (offset % boundary)); 2498 } 2499 2500 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2501 2502 static void _bdev_rw_split(void *_bdev_io); 2503 2504 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2505 2506 static void 2507 _bdev_unmap_split(void *_bdev_io) 2508 { 2509 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2510 } 2511 2512 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2513 2514 static void 2515 _bdev_write_zeroes_split(void *_bdev_io) 2516 { 2517 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2518 } 2519 2520 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 2521 2522 static void 2523 _bdev_copy_split(void *_bdev_io) 2524 { 2525 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 2526 } 2527 2528 static int 2529 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2530 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2531 { 2532 int rc; 2533 uint64_t current_offset, current_remaining, current_src_offset; 2534 spdk_bdev_io_wait_cb io_wait_fn; 2535 2536 current_offset = *offset; 2537 current_remaining = *remaining; 2538 2539 bdev_io->u.bdev.split_outstanding++; 2540 2541 io_wait_fn = 
_bdev_rw_split; 2542 switch (bdev_io->type) { 2543 case SPDK_BDEV_IO_TYPE_READ: 2544 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2545 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2546 iov, iovcnt, md_buf, current_offset, 2547 num_blocks, 2548 bdev_io_split_done, bdev_io, 2549 bdev_io->internal.ext_opts, true); 2550 break; 2551 case SPDK_BDEV_IO_TYPE_WRITE: 2552 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2553 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2554 iov, iovcnt, md_buf, current_offset, 2555 num_blocks, 2556 bdev_io_split_done, bdev_io, 2557 bdev_io->internal.ext_opts, true); 2558 break; 2559 case SPDK_BDEV_IO_TYPE_UNMAP: 2560 io_wait_fn = _bdev_unmap_split; 2561 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2562 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2563 current_offset, num_blocks, 2564 bdev_io_split_done, bdev_io); 2565 break; 2566 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2567 io_wait_fn = _bdev_write_zeroes_split; 2568 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2569 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2570 current_offset, num_blocks, 2571 bdev_io_split_done, bdev_io); 2572 break; 2573 case SPDK_BDEV_IO_TYPE_COPY: 2574 io_wait_fn = _bdev_copy_split; 2575 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 2576 (current_offset - bdev_io->u.bdev.offset_blocks); 2577 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 2578 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2579 current_offset, current_src_offset, num_blocks, 2580 bdev_io_split_done, bdev_io); 2581 break; 2582 default: 2583 assert(false); 2584 rc = -EINVAL; 2585 break; 2586 } 2587 2588 if (rc == 0) { 2589 current_offset += num_blocks; 2590 current_remaining -= num_blocks; 2591 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2592 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2593 *offset = current_offset; 2594 *remaining = current_remaining; 2595 } else { 2596 bdev_io->u.bdev.split_outstanding--; 2597 if (rc == -ENOMEM) { 2598 if (bdev_io->u.bdev.split_outstanding == 0) { 2599 /* No I/O is outstanding. Hence we should wait here. */ 2600 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2601 } 2602 } else { 2603 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2604 if (bdev_io->u.bdev.split_outstanding == 0) { 2605 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2606 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2607 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2608 } 2609 } 2610 } 2611 2612 return rc; 2613 } 2614 2615 static void 2616 _bdev_rw_split(void *_bdev_io) 2617 { 2618 struct iovec *parent_iov, *iov; 2619 struct spdk_bdev_io *bdev_io = _bdev_io; 2620 struct spdk_bdev *bdev = bdev_io->bdev; 2621 uint64_t parent_offset, current_offset, remaining; 2622 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2623 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2624 uint32_t iovcnt, iov_len, child_iovsize; 2625 uint32_t blocklen = bdev->blocklen; 2626 uint32_t io_boundary; 2627 uint32_t max_segment_size = bdev->max_segment_size; 2628 uint32_t max_child_iovcnt = bdev->max_num_segments; 2629 void *md_buf = NULL; 2630 int rc; 2631 2632 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 2633 max_child_iovcnt = max_child_iovcnt ? 
spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 2634 SPDK_BDEV_IO_NUM_CHILD_IOV; 2635 2636 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2637 io_boundary = bdev->write_unit_size; 2638 } else if (bdev->split_on_optimal_io_boundary) { 2639 io_boundary = bdev->optimal_io_boundary; 2640 } else { 2641 io_boundary = UINT32_MAX; 2642 } 2643 2644 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2645 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 2646 parent_offset = bdev_io->u.bdev.offset_blocks; 2647 parent_iov_offset = (current_offset - parent_offset) * blocklen; 2648 parent_iovcnt = bdev_io->u.bdev.iovcnt; 2649 2650 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 2651 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2652 if (parent_iov_offset < parent_iov->iov_len) { 2653 break; 2654 } 2655 parent_iov_offset -= parent_iov->iov_len; 2656 } 2657 2658 child_iovcnt = 0; 2659 while (remaining > 0 && parent_iovpos < parent_iovcnt && 2660 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 2661 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 2662 to_next_boundary = spdk_min(remaining, to_next_boundary); 2663 to_next_boundary_bytes = to_next_boundary * blocklen; 2664 2665 iov = &bdev_io->child_iov[child_iovcnt]; 2666 iovcnt = 0; 2667 2668 if (bdev_io->u.bdev.md_buf) { 2669 md_buf = (char *)bdev_io->u.bdev.md_buf + 2670 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 2671 } 2672 2673 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 2674 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 2675 iovcnt < child_iovsize) { 2676 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2677 iov_len = parent_iov->iov_len - parent_iov_offset; 2678 2679 iov_len = spdk_min(iov_len, max_segment_size); 2680 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 2681 to_next_boundary_bytes -= iov_len; 2682 2683 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 2684 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 2685 2686 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 2687 parent_iov_offset += iov_len; 2688 } else { 2689 parent_iovpos++; 2690 parent_iov_offset = 0; 2691 } 2692 child_iovcnt++; 2693 iovcnt++; 2694 } 2695 2696 if (to_next_boundary_bytes > 0) { 2697 /* We had to stop this child I/O early because we ran out of 2698 * child_iov space or were limited by max_num_segments. 2699 * Ensure the iovs to be aligned with block size and 2700 * then adjust to_next_boundary before starting the 2701 * child I/O. 2702 */ 2703 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || 2704 iovcnt == child_iovsize); 2705 to_last_block_bytes = to_next_boundary_bytes % blocklen; 2706 if (to_last_block_bytes != 0) { 2707 uint32_t child_iovpos = child_iovcnt - 1; 2708 /* don't decrease child_iovcnt when it equals to SPDK_BDEV_IO_NUM_CHILD_IOV 2709 * so the loop will naturally end 2710 */ 2711 2712 to_last_block_bytes = blocklen - to_last_block_bytes; 2713 to_next_boundary_bytes += to_last_block_bytes; 2714 while (to_last_block_bytes > 0 && iovcnt > 0) { 2715 iov_len = spdk_min(to_last_block_bytes, 2716 bdev_io->child_iov[child_iovpos].iov_len); 2717 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 2718 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 2719 child_iovpos--; 2720 if (--iovcnt == 0) { 2721 /* If the child IO is less than a block size just return. 
2722 * If the first child I/O of any split round is less than
2723 * a block size, exit with an error.
2724 */
2725 if (bdev_io->u.bdev.split_outstanding == 0) {
2726 SPDK_ERRLOG("The first child io was less than a block size\n");
2727 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
2728 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx);
2729 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
2730 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
2731 }
2732
2733 return;
2734 }
2735 }
2736
2737 to_last_block_bytes -= iov_len;
2738
2739 if (parent_iov_offset == 0) {
2740 parent_iovpos--;
2741 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len;
2742 }
2743 parent_iov_offset -= iov_len;
2744 }
2745
2746 assert(to_last_block_bytes == 0);
2747 }
2748 to_next_boundary -= to_next_boundary_bytes / blocklen;
2749 }
2750
2751 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary,
2752 &current_offset, &remaining);
2753 if (spdk_unlikely(rc)) {
2754 return;
2755 }
2756 }
2757 }
2758
2759 static void
2760 bdev_unmap_split(struct spdk_bdev_io *bdev_io)
2761 {
2762 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks;
2763 uint32_t num_children_reqs = 0;
2764 int rc;
2765
2766 offset = bdev_io->u.bdev.split_current_offset_blocks;
2767 remaining = bdev_io->u.bdev.split_remaining_num_blocks;
2768 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments;
2769
2770 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
2771 unmap_blocks = spdk_min(remaining, max_unmap_blocks);
2772
2773 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks,
2774 &offset, &remaining);
2775 if (spdk_likely(rc == 0)) {
2776 num_children_reqs++;
2777 } else {
2778 return;
2779 }
2780 }
2781 }
2782
2783 static void
2784 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io)
2785 {
2786 uint64_t offset, write_zeroes_blocks, remaining;
2787 uint32_t num_children_reqs = 0;
2788 int rc;
2789
2790 offset = bdev_io->u.bdev.split_current_offset_blocks;
2791 remaining = bdev_io->u.bdev.split_remaining_num_blocks;
2792
2793 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
2794 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes);
2795
2796 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks,
2797 &offset, &remaining);
2798 if (spdk_likely(rc == 0)) {
2799 num_children_reqs++;
2800 } else {
2801 return;
2802 }
2803 }
2804 }
2805
2806 static void
2807 bdev_copy_split(struct spdk_bdev_io *bdev_io)
2808 {
2809 uint64_t offset, copy_blocks, remaining;
2810 uint32_t num_children_reqs = 0;
2811 int rc;
2812
2813 offset = bdev_io->u.bdev.split_current_offset_blocks;
2814 remaining = bdev_io->u.bdev.split_remaining_num_blocks;
2815
2816 assert(bdev_io->bdev->max_copy != 0);
2817 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) {
2818 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy);
2819
2820 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks,
2821 &offset, &remaining);
2822 if (spdk_likely(rc == 0)) {
2823 num_children_reqs++;
2824 } else {
2825 return;
2826 }
2827 }
2828 }
2829
2830 static void
2831 parent_bdev_io_complete(void *ctx, int rc)
2832 {
2833 struct spdk_bdev_io *parent_io = ctx;
2834
2835 if (rc) {
2836 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
2837 }
2838
2839 parent_io->internal.cb(parent_io,
parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 2840 parent_io->internal.caller_ctx); 2841 } 2842 2843 static void 2844 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 2845 { 2846 struct spdk_bdev_io *parent_io = cb_arg; 2847 2848 spdk_bdev_free_io(bdev_io); 2849 2850 if (!success) { 2851 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2852 /* If any child I/O failed, stop further splitting process. */ 2853 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 2854 parent_io->u.bdev.split_remaining_num_blocks = 0; 2855 } 2856 parent_io->u.bdev.split_outstanding--; 2857 if (parent_io->u.bdev.split_outstanding != 0) { 2858 return; 2859 } 2860 2861 /* 2862 * Parent I/O finishes when all blocks are consumed. 2863 */ 2864 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 2865 assert(parent_io->internal.cb != bdev_io_split_done); 2866 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 2867 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 2868 2869 if (parent_io->internal.orig_iovcnt != 0) { 2870 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 2871 /* bdev IO will be completed in the callback */ 2872 } else { 2873 parent_bdev_io_complete(parent_io, 0); 2874 } 2875 return; 2876 } 2877 2878 /* 2879 * Continue with the splitting process. This function will complete the parent I/O if the 2880 * splitting is done. 2881 */ 2882 switch (parent_io->type) { 2883 case SPDK_BDEV_IO_TYPE_READ: 2884 case SPDK_BDEV_IO_TYPE_WRITE: 2885 _bdev_rw_split(parent_io); 2886 break; 2887 case SPDK_BDEV_IO_TYPE_UNMAP: 2888 bdev_unmap_split(parent_io); 2889 break; 2890 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2891 bdev_write_zeroes_split(parent_io); 2892 break; 2893 case SPDK_BDEV_IO_TYPE_COPY: 2894 bdev_copy_split(parent_io); 2895 break; 2896 default: 2897 assert(false); 2898 break; 2899 } 2900 } 2901 2902 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2903 bool success); 2904 2905 static void 2906 bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 2907 { 2908 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 2909 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 2910 bdev_io->u.bdev.split_outstanding = 0; 2911 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 2912 2913 switch (bdev_io->type) { 2914 case SPDK_BDEV_IO_TYPE_READ: 2915 case SPDK_BDEV_IO_TYPE_WRITE: 2916 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 2917 _bdev_rw_split(bdev_io); 2918 } else { 2919 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 2920 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 2921 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 2922 } 2923 break; 2924 case SPDK_BDEV_IO_TYPE_UNMAP: 2925 bdev_unmap_split(bdev_io); 2926 break; 2927 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2928 bdev_write_zeroes_split(bdev_io); 2929 break; 2930 case SPDK_BDEV_IO_TYPE_COPY: 2931 bdev_copy_split(bdev_io); 2932 break; 2933 default: 2934 assert(false); 2935 break; 2936 } 2937 } 2938 2939 static void 2940 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 2941 { 2942 if (!success) { 2943 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2944 return; 2945 } 2946 2947 _bdev_rw_split(bdev_io); 2948 } 2949 2950 /* Explicitly mark this inline, since it's used as a function pointer and 
otherwise won't 2951 * be inlined, at least on some compilers. 2952 */ 2953 static inline void 2954 _bdev_io_submit(void *ctx) 2955 { 2956 struct spdk_bdev_io *bdev_io = ctx; 2957 struct spdk_bdev *bdev = bdev_io->bdev; 2958 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2959 2960 if (spdk_likely(bdev_ch->flags == 0)) { 2961 bdev_io_do_submit(bdev_ch, bdev_io); 2962 return; 2963 } 2964 2965 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 2966 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 2967 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 2968 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 2969 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 2970 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2971 } else { 2972 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 2973 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 2974 } 2975 } else { 2976 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 2977 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2978 } 2979 } 2980 2981 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 2982 2983 bool 2984 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 2985 { 2986 if (range1->length == 0 || range2->length == 0) { 2987 return false; 2988 } 2989 2990 if (range1->offset + range1->length <= range2->offset) { 2991 return false; 2992 } 2993 2994 if (range2->offset + range2->length <= range1->offset) { 2995 return false; 2996 } 2997 2998 return true; 2999 } 3000 3001 static bool 3002 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3003 { 3004 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3005 struct lba_range r; 3006 3007 switch (bdev_io->type) { 3008 case SPDK_BDEV_IO_TYPE_NVME_IO: 3009 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3010 /* Don't try to decode the NVMe command - just assume worst-case and that 3011 * it overlaps a locked range. 3012 */ 3013 return true; 3014 case SPDK_BDEV_IO_TYPE_WRITE: 3015 case SPDK_BDEV_IO_TYPE_UNMAP: 3016 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3017 case SPDK_BDEV_IO_TYPE_ZCOPY: 3018 case SPDK_BDEV_IO_TYPE_COPY: 3019 r.offset = bdev_io->u.bdev.offset_blocks; 3020 r.length = bdev_io->u.bdev.num_blocks; 3021 if (!bdev_lba_range_overlapped(range, &r)) { 3022 /* This I/O doesn't overlap the specified LBA range. */ 3023 return false; 3024 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3025 /* This I/O overlaps, but the I/O is on the same channel that locked this 3026 * range, and the caller_ctx is the same as the locked_ctx. This means 3027 * that this I/O is associated with the lock, and is allowed to execute. 
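 *
 * Worked example (illustrative): a locked range {offset = 100, length = 50}
 * covers blocks 100..149, and a write with offset_blocks = 140, num_blocks = 20
 * covers 140..159. Neither 100 + 50 <= 140 nor 140 + 20 <= 100 holds, so the
 * two ranges overlap and the write is held on io_locked, unless, as checked
 * here, it was submitted by the lock owner itself.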
3028 */ 3029 return false; 3030 } else { 3031 return true; 3032 } 3033 default: 3034 return false; 3035 } 3036 } 3037 3038 void 3039 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3040 { 3041 struct spdk_bdev *bdev = bdev_io->bdev; 3042 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 3043 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3044 3045 assert(thread != NULL); 3046 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3047 3048 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3049 struct lba_range *range; 3050 3051 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3052 if (bdev_io_range_is_locked(bdev_io, range)) { 3053 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3054 return; 3055 } 3056 } 3057 } 3058 3059 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 3060 3061 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3062 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 3063 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3064 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3065 spdk_bdev_get_name(bdev)); 3066 3067 if (bdev_io_should_split(bdev_io)) { 3068 bdev_io_split(NULL, bdev_io); 3069 return; 3070 } 3071 3072 if (ch->flags & BDEV_CH_QOS_ENABLED) { 3073 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 3074 _bdev_io_submit(bdev_io); 3075 } else { 3076 bdev_io->internal.io_submit_ch = ch; 3077 bdev_io->internal.ch = bdev->internal.qos->ch; 3078 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 3079 } 3080 } else { 3081 _bdev_io_submit(bdev_io); 3082 } 3083 } 3084 3085 static inline void 3086 _bdev_io_copy_ext_opts(struct spdk_bdev_io *bdev_io, struct spdk_bdev_ext_io_opts *opts) 3087 { 3088 struct spdk_bdev_ext_io_opts *opts_copy = &bdev_io->internal.ext_opts_copy; 3089 3090 /* Zero part we don't copy */ 3091 memset(((char *)opts_copy) + opts->size, 0, sizeof(*opts) - opts->size); 3092 memcpy(opts_copy, opts, opts->size); 3093 opts_copy->size = sizeof(*opts_copy); 3094 opts_copy->metadata = bdev_io->u.bdev.md_buf; 3095 /* Save pointer to the copied ext_opts which will be used by bdev modules */ 3096 bdev_io->u.bdev.ext_opts = opts_copy; 3097 } 3098 3099 static inline void 3100 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3101 { 3102 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 3103 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3104 * For write operation we need to pull buffers from memory domain before submitting IO. 
3105 * Once read operation completes, we need to use memory_domain push functionality to 3106 * update data in original memory domain IO buffer 3107 * This IO request will go through a regular IO flow, so clear memory domains pointers in 3108 * the copied ext_opts */ 3109 bdev_io->internal.ext_opts_copy.memory_domain = NULL; 3110 bdev_io->internal.ext_opts_copy.memory_domain_ctx = NULL; 3111 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3112 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3113 } 3114 3115 static inline void 3116 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io, 3117 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 3118 { 3119 if (opts) { 3120 bool use_pull_push = opts->memory_domain && !desc->memory_domains_supported; 3121 assert(opts->size <= sizeof(*opts)); 3122 /* 3123 * copy if size is smaller than opts struct to avoid having to check size 3124 * on every access to bdev_io->u.bdev.ext_opts 3125 */ 3126 if (copy_opts || use_pull_push || opts->size < sizeof(*opts)) { 3127 _bdev_io_copy_ext_opts(bdev_io, opts); 3128 if (use_pull_push) { 3129 _bdev_io_ext_use_bounce_buffer(bdev_io); 3130 return; 3131 } 3132 } 3133 } 3134 bdev_io_submit(bdev_io); 3135 } 3136 3137 static void 3138 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3139 { 3140 struct spdk_bdev *bdev = bdev_io->bdev; 3141 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3142 struct spdk_io_channel *ch = bdev_ch->channel; 3143 3144 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3145 3146 bdev_io->internal.in_submit_request = true; 3147 bdev->fn_table->submit_request(ch, bdev_io); 3148 bdev_io->internal.in_submit_request = false; 3149 } 3150 3151 void 3152 bdev_io_init(struct spdk_bdev_io *bdev_io, 3153 struct spdk_bdev *bdev, void *cb_arg, 3154 spdk_bdev_io_completion_cb cb) 3155 { 3156 bdev_io->bdev = bdev; 3157 bdev_io->internal.caller_ctx = cb_arg; 3158 bdev_io->internal.cb = cb; 3159 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3160 bdev_io->internal.in_submit_request = false; 3161 bdev_io->internal.buf = NULL; 3162 bdev_io->internal.io_submit_ch = NULL; 3163 bdev_io->internal.orig_iovs = NULL; 3164 bdev_io->internal.orig_iovcnt = 0; 3165 bdev_io->internal.orig_md_iov.iov_base = NULL; 3166 bdev_io->internal.error.nvme.cdw0 = 0; 3167 bdev_io->num_retries = 0; 3168 bdev_io->internal.get_buf_cb = NULL; 3169 bdev_io->internal.get_aux_buf_cb = NULL; 3170 bdev_io->internal.ext_opts = NULL; 3171 bdev_io->internal.data_transfer_cpl = NULL; 3172 } 3173 3174 static bool 3175 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3176 { 3177 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3178 } 3179 3180 bool 3181 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3182 { 3183 bool supported; 3184 3185 supported = bdev_io_type_supported(bdev, io_type); 3186 3187 if (!supported) { 3188 switch (io_type) { 3189 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3190 /* The bdev layer will emulate write zeroes as long as write is supported. 
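 *
 * A typical caller-side probe looks like the sketch below (the fallback is
 * hypothetical and shown only for illustration):
 *
 *   if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
 *           return -ENOTSUP;        (or take a non-unmap fallback path)
 *   }
 *
 * Note that this emulation applies only to WRITE_ZEROES; every other type is
 * reported exactly as the module advertises it.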
*/ 3191 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3192 break; 3193 default: 3194 break; 3195 } 3196 } 3197 3198 return supported; 3199 } 3200 3201 uint64_t 3202 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3203 { 3204 return bdev_io->internal.submit_tsc; 3205 } 3206 3207 int 3208 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3209 { 3210 if (bdev->fn_table->dump_info_json) { 3211 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3212 } 3213 3214 return 0; 3215 } 3216 3217 static void 3218 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3219 { 3220 uint32_t max_per_timeslice = 0; 3221 int i; 3222 3223 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3224 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3225 qos->rate_limits[i].max_per_timeslice = 0; 3226 continue; 3227 } 3228 3229 max_per_timeslice = qos->rate_limits[i].limit * 3230 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3231 3232 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3233 qos->rate_limits[i].min_per_timeslice); 3234 3235 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 3236 } 3237 3238 bdev_qos_set_ops(qos); 3239 } 3240 3241 static int 3242 bdev_channel_poll_qos(void *arg) 3243 { 3244 struct spdk_bdev_qos *qos = arg; 3245 uint64_t now = spdk_get_ticks(); 3246 int i; 3247 3248 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3249 /* We received our callback earlier than expected - return 3250 * immediately and wait to do accounting until at least one 3251 * timeslice has actually expired. This should never happen 3252 * with a well-behaved timer implementation. 3253 */ 3254 return SPDK_POLLER_IDLE; 3255 } 3256 3257 /* Reset for next round of rate limiting */ 3258 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3259 /* We may have allowed the IOs or bytes to slightly overrun in the last 3260 * timeslice. remaining_this_timeslice is signed, so if it's negative 3261 * here, we'll account for the overrun so that the next timeslice will 3262 * be appropriately reduced. 
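 *
 * Worked example (illustrative): with a 10 MB/s byte limit and the 1 ms
 * timeslice, max_per_timeslice is roughly 10 KiB. If a 64 KiB write is
 * admitted while about 4 KiB of quota remains, remaining_this_timeslice drops
 * to roughly -60 KiB, and the next several timeslices only bring it back
 * toward zero before new I/O is admitted, so the long-run average stays at
 * the configured limit.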
3263 */ 3264 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 3265 qos->rate_limits[i].remaining_this_timeslice = 0; 3266 } 3267 } 3268 3269 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3270 qos->last_timeslice += qos->timeslice_size; 3271 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3272 qos->rate_limits[i].remaining_this_timeslice += 3273 qos->rate_limits[i].max_per_timeslice; 3274 } 3275 } 3276 3277 return bdev_qos_io_submit(qos->ch, qos); 3278 } 3279 3280 static void 3281 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3282 { 3283 struct spdk_bdev_shared_resource *shared_resource; 3284 struct lba_range *range; 3285 3286 bdev_free_io_stat(ch->stat); 3287 #ifdef SPDK_CONFIG_VTUNE 3288 bdev_free_io_stat(ch->prev_stat); 3289 #endif 3290 3291 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3292 range = TAILQ_FIRST(&ch->locked_ranges); 3293 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3294 free(range); 3295 } 3296 3297 spdk_put_io_channel(ch->channel); 3298 3299 shared_resource = ch->shared_resource; 3300 3301 assert(TAILQ_EMPTY(&ch->io_locked)); 3302 assert(TAILQ_EMPTY(&ch->io_submitted)); 3303 assert(ch->io_outstanding == 0); 3304 assert(shared_resource->ref > 0); 3305 shared_resource->ref--; 3306 if (shared_resource->ref == 0) { 3307 assert(shared_resource->io_outstanding == 0); 3308 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3309 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3310 free(shared_resource); 3311 } 3312 } 3313 3314 static void 3315 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3316 { 3317 struct spdk_bdev_qos *qos = bdev->internal.qos; 3318 int i; 3319 3320 assert(spdk_spin_held(&bdev->internal.spinlock)); 3321 3322 /* Rate limiting on this bdev enabled */ 3323 if (qos) { 3324 if (qos->ch == NULL) { 3325 struct spdk_io_channel *io_ch; 3326 3327 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3328 bdev->name, spdk_get_thread()); 3329 3330 /* No qos channel has been selected, so set one up */ 3331 3332 /* Take another reference to ch */ 3333 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3334 assert(io_ch != NULL); 3335 qos->ch = ch; 3336 3337 qos->thread = spdk_io_channel_get_thread(io_ch); 3338 3339 TAILQ_INIT(&qos->queued); 3340 3341 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3342 if (bdev_qos_is_iops_rate_limit(i) == true) { 3343 qos->rate_limits[i].min_per_timeslice = 3344 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3345 } else { 3346 qos->rate_limits[i].min_per_timeslice = 3347 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3348 } 3349 3350 if (qos->rate_limits[i].limit == 0) { 3351 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3352 } 3353 } 3354 bdev_qos_update_max_quota_per_timeslice(qos); 3355 qos->timeslice_size = 3356 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3357 qos->last_timeslice = spdk_get_ticks(); 3358 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3359 qos, 3360 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3361 } 3362 3363 ch->flags |= BDEV_CH_QOS_ENABLED; 3364 } 3365 } 3366 3367 struct poll_timeout_ctx { 3368 struct spdk_bdev_desc *desc; 3369 uint64_t timeout_in_sec; 3370 spdk_bdev_io_timeout_cb cb_fn; 3371 void *cb_arg; 3372 }; 3373 3374 static void 3375 bdev_desc_free(struct spdk_bdev_desc *desc) 3376 { 3377 spdk_spin_destroy(&desc->spinlock); 3378 free(desc->media_events_buffer); 3379 free(desc); 3380 } 3381 3382 static void 3383 
bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 3384 { 3385 struct poll_timeout_ctx *ctx = _ctx; 3386 struct spdk_bdev_desc *desc = ctx->desc; 3387 3388 free(ctx); 3389 3390 spdk_spin_lock(&desc->spinlock); 3391 desc->refs--; 3392 if (desc->closed == true && desc->refs == 0) { 3393 spdk_spin_unlock(&desc->spinlock); 3394 bdev_desc_free(desc); 3395 return; 3396 } 3397 spdk_spin_unlock(&desc->spinlock); 3398 } 3399 3400 static void 3401 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3402 struct spdk_io_channel *io_ch, void *_ctx) 3403 { 3404 struct poll_timeout_ctx *ctx = _ctx; 3405 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3406 struct spdk_bdev_desc *desc = ctx->desc; 3407 struct spdk_bdev_io *bdev_io; 3408 uint64_t now; 3409 3410 spdk_spin_lock(&desc->spinlock); 3411 if (desc->closed == true) { 3412 spdk_spin_unlock(&desc->spinlock); 3413 spdk_bdev_for_each_channel_continue(i, -1); 3414 return; 3415 } 3416 spdk_spin_unlock(&desc->spinlock); 3417 3418 now = spdk_get_ticks(); 3419 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3420 /* Exclude any I/O that are generated via splitting. */ 3421 if (bdev_io->internal.cb == bdev_io_split_done) { 3422 continue; 3423 } 3424 3425 /* Once we find an I/O that has not timed out, we can immediately 3426 * exit the loop. 3427 */ 3428 if (now < (bdev_io->internal.submit_tsc + 3429 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3430 goto end; 3431 } 3432 3433 if (bdev_io->internal.desc == desc) { 3434 ctx->cb_fn(ctx->cb_arg, bdev_io); 3435 } 3436 } 3437 3438 end: 3439 spdk_bdev_for_each_channel_continue(i, 0); 3440 } 3441 3442 static int 3443 bdev_poll_timeout_io(void *arg) 3444 { 3445 struct spdk_bdev_desc *desc = arg; 3446 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3447 struct poll_timeout_ctx *ctx; 3448 3449 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3450 if (!ctx) { 3451 SPDK_ERRLOG("failed to allocate memory\n"); 3452 return SPDK_POLLER_BUSY; 3453 } 3454 ctx->desc = desc; 3455 ctx->cb_arg = desc->cb_arg; 3456 ctx->cb_fn = desc->cb_fn; 3457 ctx->timeout_in_sec = desc->timeout_in_sec; 3458 3459 /* Take a ref on the descriptor in case it gets closed while we are checking 3460 * all of the channels. 
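 *
 * For context, this poller only exists after the application arms it; a hedged
 * usage sketch (the callback name and the 30 second budget are hypothetical):
 *
 *   static void io_timeout_cb(void *cb_arg, struct spdk_bdev_io *bdev_io)
 *   {
 *           ... log or abort the slow I/O here ...
 *   }
 *
 *   spdk_bdev_set_timeout(desc, 30, io_timeout_cb, cb_arg);
 *
 * Calling spdk_bdev_set_timeout() with timeout_in_sec == 0 unregisters the
 * poller again (see below).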
3461 */ 3462 spdk_spin_lock(&desc->spinlock); 3463 desc->refs++; 3464 spdk_spin_unlock(&desc->spinlock); 3465 3466 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 3467 bdev_channel_poll_timeout_io_done); 3468 3469 return SPDK_POLLER_BUSY; 3470 } 3471 3472 int 3473 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3474 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3475 { 3476 assert(desc->thread == spdk_get_thread()); 3477 3478 spdk_poller_unregister(&desc->io_timeout_poller); 3479 3480 if (timeout_in_sec) { 3481 assert(cb_fn != NULL); 3482 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3483 desc, 3484 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3485 1000); 3486 if (desc->io_timeout_poller == NULL) { 3487 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3488 return -1; 3489 } 3490 } 3491 3492 desc->cb_fn = cb_fn; 3493 desc->cb_arg = cb_arg; 3494 desc->timeout_in_sec = timeout_in_sec; 3495 3496 return 0; 3497 } 3498 3499 static int 3500 bdev_channel_create(void *io_device, void *ctx_buf) 3501 { 3502 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3503 struct spdk_bdev_channel *ch = ctx_buf; 3504 struct spdk_io_channel *mgmt_io_ch; 3505 struct spdk_bdev_mgmt_channel *mgmt_ch; 3506 struct spdk_bdev_shared_resource *shared_resource; 3507 struct lba_range *range; 3508 3509 ch->bdev = bdev; 3510 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3511 if (!ch->channel) { 3512 return -1; 3513 } 3514 3515 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3516 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3517 3518 assert(ch->histogram == NULL); 3519 if (bdev->internal.histogram_enabled) { 3520 ch->histogram = spdk_histogram_data_alloc(); 3521 if (ch->histogram == NULL) { 3522 SPDK_ERRLOG("Could not allocate histogram\n"); 3523 } 3524 } 3525 3526 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3527 if (!mgmt_io_ch) { 3528 spdk_put_io_channel(ch->channel); 3529 return -1; 3530 } 3531 3532 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 3533 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3534 if (shared_resource->shared_ch == ch->channel) { 3535 spdk_put_io_channel(mgmt_io_ch); 3536 shared_resource->ref++; 3537 break; 3538 } 3539 } 3540 3541 if (shared_resource == NULL) { 3542 shared_resource = calloc(1, sizeof(*shared_resource)); 3543 if (shared_resource == NULL) { 3544 spdk_put_io_channel(ch->channel); 3545 spdk_put_io_channel(mgmt_io_ch); 3546 return -1; 3547 } 3548 3549 shared_resource->mgmt_ch = mgmt_ch; 3550 shared_resource->io_outstanding = 0; 3551 TAILQ_INIT(&shared_resource->nomem_io); 3552 shared_resource->nomem_threshold = 0; 3553 shared_resource->shared_ch = ch->channel; 3554 shared_resource->ref = 1; 3555 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3556 } 3557 3558 ch->io_outstanding = 0; 3559 TAILQ_INIT(&ch->queued_resets); 3560 TAILQ_INIT(&ch->locked_ranges); 3561 ch->flags = 0; 3562 ch->shared_resource = shared_resource; 3563 3564 TAILQ_INIT(&ch->io_submitted); 3565 TAILQ_INIT(&ch->io_locked); 3566 3567 ch->stat = bdev_alloc_io_stat(false); 3568 if (ch->stat == NULL) { 3569 bdev_channel_destroy_resource(ch); 3570 return -1; 3571 } 3572 3573 ch->stat->ticks_rate = spdk_get_ticks_hz(); 3574 3575 #ifdef SPDK_CONFIG_VTUNE 3576 { 3577 char *name; 3578 __itt_init_ittlib(NULL, 0); 3579 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3580 if (!name) { 3581 bdev_channel_destroy_resource(ch); 3582 
return -1; 3583 } 3584 ch->handle = __itt_string_handle_create(name); 3585 free(name); 3586 ch->start_tsc = spdk_get_ticks(); 3587 ch->interval_tsc = spdk_get_ticks_hz() / 100; 3588 ch->prev_stat = bdev_alloc_io_stat(false); 3589 if (ch->prev_stat == NULL) { 3590 bdev_channel_destroy_resource(ch); 3591 return -1; 3592 } 3593 } 3594 #endif 3595 3596 spdk_spin_lock(&bdev->internal.spinlock); 3597 bdev_enable_qos(bdev, ch); 3598 3599 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 3600 struct lba_range *new_range; 3601 3602 new_range = calloc(1, sizeof(*new_range)); 3603 if (new_range == NULL) { 3604 spdk_spin_unlock(&bdev->internal.spinlock); 3605 bdev_channel_destroy_resource(ch); 3606 return -1; 3607 } 3608 new_range->length = range->length; 3609 new_range->offset = range->offset; 3610 new_range->locked_ctx = range->locked_ctx; 3611 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 3612 } 3613 3614 spdk_spin_unlock(&bdev->internal.spinlock); 3615 3616 return 0; 3617 } 3618 3619 static int 3620 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 3621 void *cb_ctx) 3622 { 3623 struct spdk_bdev_channel *bdev_ch = cb_ctx; 3624 struct spdk_bdev_io *bdev_io; 3625 uint64_t buf_len; 3626 3627 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 3628 if (bdev_io->internal.ch == bdev_ch) { 3629 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 3630 spdk_iobuf_entry_abort(ch, entry, buf_len); 3631 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3632 } 3633 3634 return 0; 3635 } 3636 3637 /* 3638 * Abort I/O that are waiting on a data buffer. 3639 */ 3640 static void 3641 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 3642 { 3643 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 3644 bdev_abort_all_buf_io_cb, ch); 3645 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 3646 bdev_abort_all_buf_io_cb, ch); 3647 } 3648 3649 /* 3650 * Abort I/O that are queued waiting for submission. These types of I/O are 3651 * linked using the spdk_bdev_io link TAILQ_ENTRY. 3652 */ 3653 static void 3654 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 3655 { 3656 struct spdk_bdev_io *bdev_io, *tmp; 3657 3658 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 3659 if (bdev_io->internal.ch == ch) { 3660 TAILQ_REMOVE(queue, bdev_io, internal.link); 3661 /* 3662 * spdk_bdev_io_complete() assumes that the completed I/O had 3663 * been submitted to the bdev module. Since in this case it 3664 * hadn't, bump io_outstanding to account for the decrement 3665 * that spdk_bdev_io_complete() will do. 
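 *
 * In other words, the invariant being preserved is (sketch):
 *
 *   submit path:      ch->io_outstanding++;     on hand-off to the module
 *   completion path:  ch->io_outstanding--;     inside spdk_bdev_io_complete()
 *
 * A queued I/O never took the first step, so the increment below fakes it just
 * before the completion runs. RESET is excluded because reset I/O is not
 * accounted in io_outstanding on submission either.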
3666 */ 3667 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 3668 ch->io_outstanding++; 3669 ch->shared_resource->io_outstanding++; 3670 } 3671 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3672 } 3673 } 3674 } 3675 3676 static bool 3677 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3678 { 3679 struct spdk_bdev_io *bdev_io; 3680 3681 TAILQ_FOREACH(bdev_io, queue, internal.link) { 3682 if (bdev_io == bio_to_abort) { 3683 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 3684 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3685 return true; 3686 } 3687 } 3688 3689 return false; 3690 } 3691 3692 static int 3693 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 3694 { 3695 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 3696 uint64_t buf_len; 3697 3698 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 3699 if (bdev_io == bio_to_abort) { 3700 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 3701 spdk_iobuf_entry_abort(ch, entry, buf_len); 3702 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3703 return 1; 3704 } 3705 3706 return 0; 3707 } 3708 3709 static bool 3710 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 3711 { 3712 int rc; 3713 3714 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 3715 bdev_abort_buf_io_cb, bio_to_abort); 3716 if (rc == 1) { 3717 return true; 3718 } 3719 3720 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 3721 bdev_abort_buf_io_cb, bio_to_abort); 3722 return rc == 1; 3723 } 3724 3725 static void 3726 bdev_qos_channel_destroy(void *cb_arg) 3727 { 3728 struct spdk_bdev_qos *qos = cb_arg; 3729 3730 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 3731 spdk_poller_unregister(&qos->poller); 3732 3733 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 3734 3735 free(qos); 3736 } 3737 3738 static int 3739 bdev_qos_destroy(struct spdk_bdev *bdev) 3740 { 3741 int i; 3742 3743 /* 3744 * Cleanly shutting down the QoS poller is tricky, because 3745 * during the asynchronous operation the user could open 3746 * a new descriptor and create a new channel, spawning 3747 * a new QoS poller. 3748 * 3749 * The strategy is to create a new QoS structure here and swap it 3750 * in. The shutdown path then continues to refer to the old one 3751 * until it completes and then releases it. 3752 */ 3753 struct spdk_bdev_qos *new_qos, *old_qos; 3754 3755 old_qos = bdev->internal.qos; 3756 3757 new_qos = calloc(1, sizeof(*new_qos)); 3758 if (!new_qos) { 3759 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 3760 return -ENOMEM; 3761 } 3762 3763 /* Copy the old QoS data into the newly allocated structure */ 3764 memcpy(new_qos, old_qos, sizeof(*new_qos)); 3765 3766 /* Zero out the key parts of the QoS structure */ 3767 new_qos->ch = NULL; 3768 new_qos->thread = NULL; 3769 new_qos->poller = NULL; 3770 TAILQ_INIT(&new_qos->queued); 3771 /* 3772 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 3773 * It will be used later for the new QoS structure. 
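 *
 * Net effect of the swap, as a sketch:
 *
 *   memcpy(new_qos, old_qos, ...);        limits carried over unchanged
 *   new_qos->ch = NULL;                   re-armed by the next channel via
 *   new_qos->thread = NULL;               bdev_enable_qos()
 *   bdev->internal.qos = new_qos;
 *   old_qos                               torn down on its own thread in
 *                                         bdev_qos_channel_destroy()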
3774 */ 3775 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3776 new_qos->rate_limits[i].remaining_this_timeslice = 0; 3777 new_qos->rate_limits[i].min_per_timeslice = 0; 3778 new_qos->rate_limits[i].max_per_timeslice = 0; 3779 } 3780 3781 bdev->internal.qos = new_qos; 3782 3783 if (old_qos->thread == NULL) { 3784 free(old_qos); 3785 } else { 3786 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 3787 } 3788 3789 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 3790 * been destroyed yet. The destruction path will end up waiting for the final 3791 * channel to be put before it releases resources. */ 3792 3793 return 0; 3794 } 3795 3796 void 3797 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 3798 { 3799 total->bytes_read += add->bytes_read; 3800 total->num_read_ops += add->num_read_ops; 3801 total->bytes_written += add->bytes_written; 3802 total->num_write_ops += add->num_write_ops; 3803 total->bytes_unmapped += add->bytes_unmapped; 3804 total->num_unmap_ops += add->num_unmap_ops; 3805 total->bytes_copied += add->bytes_copied; 3806 total->num_copy_ops += add->num_copy_ops; 3807 total->read_latency_ticks += add->read_latency_ticks; 3808 total->write_latency_ticks += add->write_latency_ticks; 3809 total->unmap_latency_ticks += add->unmap_latency_ticks; 3810 total->copy_latency_ticks += add->copy_latency_ticks; 3811 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 3812 total->max_read_latency_ticks = add->max_read_latency_ticks; 3813 } 3814 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 3815 total->min_read_latency_ticks = add->min_read_latency_ticks; 3816 } 3817 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 3818 total->max_write_latency_ticks = add->max_write_latency_ticks; 3819 } 3820 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 3821 total->min_write_latency_ticks = add->min_write_latency_ticks; 3822 } 3823 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 3824 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 3825 } 3826 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 3827 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 3828 } 3829 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 3830 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 3831 } 3832 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 3833 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 3834 } 3835 } 3836 3837 static void 3838 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 3839 { 3840 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 3841 3842 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 3843 memcpy(to_stat->io_error, from_stat->io_error, 3844 sizeof(struct spdk_bdev_io_error_stat)); 3845 } 3846 } 3847 3848 void 3849 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 3850 { 3851 stat->max_read_latency_ticks = 0; 3852 stat->min_read_latency_ticks = UINT64_MAX; 3853 stat->max_write_latency_ticks = 0; 3854 stat->min_write_latency_ticks = UINT64_MAX; 3855 stat->max_unmap_latency_ticks = 0; 3856 stat->min_unmap_latency_ticks = UINT64_MAX; 3857 stat->max_copy_latency_ticks = 0; 3858 stat->min_copy_latency_ticks = UINT64_MAX; 3859 3860 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 3861 return; 3862 } 
3863 3864 stat->bytes_read = 0; 3865 stat->num_read_ops = 0; 3866 stat->bytes_written = 0; 3867 stat->num_write_ops = 0; 3868 stat->bytes_unmapped = 0; 3869 stat->num_unmap_ops = 0; 3870 stat->bytes_copied = 0; 3871 stat->num_copy_ops = 0; 3872 stat->read_latency_ticks = 0; 3873 stat->write_latency_ticks = 0; 3874 stat->unmap_latency_ticks = 0; 3875 stat->copy_latency_ticks = 0; 3876 3877 if (stat->io_error != NULL) { 3878 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 3879 } 3880 } 3881 3882 struct spdk_bdev_io_stat * 3883 bdev_alloc_io_stat(bool io_error_stat) 3884 { 3885 struct spdk_bdev_io_stat *stat; 3886 3887 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 3888 if (stat == NULL) { 3889 return NULL; 3890 } 3891 3892 if (io_error_stat) { 3893 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 3894 if (stat->io_error == NULL) { 3895 free(stat); 3896 return NULL; 3897 } 3898 } else { 3899 stat->io_error = NULL; 3900 } 3901 3902 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 3903 3904 return stat; 3905 } 3906 3907 void 3908 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 3909 { 3910 if (stat != NULL) { 3911 free(stat->io_error); 3912 free(stat); 3913 } 3914 } 3915 3916 void 3917 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 3918 { 3919 int i; 3920 3921 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 3922 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 3923 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 3924 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 3925 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 3926 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 3927 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 3928 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 3929 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 3930 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 3931 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 3932 stat->min_read_latency_ticks != UINT64_MAX ? 3933 stat->min_read_latency_ticks : 0); 3934 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 3935 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 3936 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 3937 stat->min_write_latency_ticks != UINT64_MAX ? 3938 stat->min_write_latency_ticks : 0); 3939 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 3940 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 3941 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 3942 stat->min_unmap_latency_ticks != UINT64_MAX ? 3943 stat->min_unmap_latency_ticks : 0); 3944 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 3945 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 3946 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 3947 stat->min_copy_latency_ticks != UINT64_MAX ? 
stat->min_copy_latency_ticks : 0); 3949 3950 if (stat->io_error != NULL) { 3951 spdk_json_write_named_object_begin(w, "io_error"); 3952 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 3953 if (stat->io_error->error_status[i] != 0) { 3954 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 3955 stat->io_error->error_status[i]); 3956 } 3957 } 3958 spdk_json_write_object_end(w); 3959 } 3960 } 3961 3962 static void 3963 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 3964 { 3965 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 3966 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 3967 3968 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 3969 bdev_abort_all_buf_io(mgmt_ch, ch); 3971 } 3972 3973 static void 3974 bdev_channel_destroy(void *io_device, void *ctx_buf) 3975 { 3976 struct spdk_bdev_channel *ch = ctx_buf; 3977 3978 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 3979 spdk_get_thread()); 3980 3981 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 3982 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3983 3984 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 3985 spdk_spin_lock(&ch->bdev->internal.spinlock); 3986 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 3987 spdk_spin_unlock(&ch->bdev->internal.spinlock); 3988 3989 bdev_abort_all_queued_io(&ch->queued_resets, ch); 3990 3991 bdev_channel_abort_queued_ios(ch); 3992 3993 if (ch->histogram) { 3994 spdk_histogram_data_free(ch->histogram); 3995 } 3996 3997 bdev_channel_destroy_resource(ch); 3998 } 3999 4000 /* 4001 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4002 * to it. Hence we do not have to call bdev_get_by_name() when using this function.
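 * bdev_name_add() relies on this: the insert and the duplicate check happen
 * atomically under g_bdev_mgr.spinlock.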
4003 */ 4004 static int 4005 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4006 { 4007 struct spdk_bdev_name *tmp; 4008 4009 bdev_name->name = strdup(name); 4010 if (bdev_name->name == NULL) { 4011 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4012 return -ENOMEM; 4013 } 4014 4015 bdev_name->bdev = bdev; 4016 4017 spdk_spin_lock(&g_bdev_mgr.spinlock); 4018 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4019 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4020 4021 if (tmp != NULL) { 4022 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4023 free(bdev_name->name); 4024 return -EEXIST; 4025 } 4026 4027 return 0; 4028 } 4029 4030 static void 4031 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4032 { 4033 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4034 free(bdev_name->name); 4035 } 4036 4037 static void 4038 bdev_name_del(struct spdk_bdev_name *bdev_name) 4039 { 4040 spdk_spin_lock(&g_bdev_mgr.spinlock); 4041 bdev_name_del_unsafe(bdev_name); 4042 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4043 } 4044 4045 int 4046 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4047 { 4048 struct spdk_bdev_alias *tmp; 4049 int ret; 4050 4051 if (alias == NULL) { 4052 SPDK_ERRLOG("Empty alias passed\n"); 4053 return -EINVAL; 4054 } 4055 4056 tmp = calloc(1, sizeof(*tmp)); 4057 if (tmp == NULL) { 4058 SPDK_ERRLOG("Unable to allocate alias\n"); 4059 return -ENOMEM; 4060 } 4061 4062 ret = bdev_name_add(&tmp->alias, bdev, alias); 4063 if (ret != 0) { 4064 free(tmp); 4065 return ret; 4066 } 4067 4068 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4069 4070 return 0; 4071 } 4072 4073 static int 4074 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4075 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4076 { 4077 struct spdk_bdev_alias *tmp; 4078 4079 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4080 if (strcmp(alias, tmp->alias.name) == 0) { 4081 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4082 alias_del_fn(&tmp->alias); 4083 free(tmp); 4084 return 0; 4085 } 4086 } 4087 4088 return -ENOENT; 4089 } 4090 4091 int 4092 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4093 { 4094 int rc; 4095 4096 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4097 if (rc == -ENOENT) { 4098 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4099 } 4100 4101 return rc; 4102 } 4103 4104 void 4105 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4106 { 4107 struct spdk_bdev_alias *p, *tmp; 4108 4109 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4110 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4111 bdev_name_del(&p->alias); 4112 free(p); 4113 } 4114 } 4115 4116 struct spdk_io_channel * 4117 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4118 { 4119 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4120 } 4121 4122 void * 4123 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4124 { 4125 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4126 void *ctx = NULL; 4127 4128 if (bdev->fn_table->get_module_ctx) { 4129 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4130 } 4131 4132 return ctx; 4133 } 4134 4135 const char * 4136 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4137 { 4138 return bdev->module->name; 4139 } 4140 4141 const char * 4142 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4143 { 4144 return bdev->name; 4145 } 4146 4147 const char * 4148 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4149 { 4150 return bdev->product_name; 4151 } 4152 4153 
const struct spdk_bdev_aliases_list * 4154 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4155 { 4156 return &bdev->aliases; 4157 } 4158 4159 uint32_t 4160 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4161 { 4162 return bdev->blocklen; 4163 } 4164 4165 uint32_t 4166 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4167 { 4168 return bdev->write_unit_size; 4169 } 4170 4171 uint64_t 4172 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4173 { 4174 return bdev->blockcnt; 4175 } 4176 4177 const char * 4178 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4179 { 4180 return qos_rpc_type[type]; 4181 } 4182 4183 void 4184 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4185 { 4186 int i; 4187 4188 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4189 4190 spdk_spin_lock(&bdev->internal.spinlock); 4191 if (bdev->internal.qos) { 4192 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4193 if (bdev->internal.qos->rate_limits[i].limit != 4194 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4195 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4196 if (bdev_qos_is_iops_rate_limit(i) == false) { 4197 /* Change from Byte to Megabyte which is user visible. */ 4198 limits[i] = limits[i] / 1024 / 1024; 4199 } 4200 } 4201 } 4202 } 4203 spdk_spin_unlock(&bdev->internal.spinlock); 4204 } 4205 4206 size_t 4207 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4208 { 4209 return 1 << bdev->required_alignment; 4210 } 4211 4212 uint32_t 4213 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4214 { 4215 return bdev->optimal_io_boundary; 4216 } 4217 4218 bool 4219 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4220 { 4221 return bdev->write_cache; 4222 } 4223 4224 const struct spdk_uuid * 4225 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4226 { 4227 return &bdev->uuid; 4228 } 4229 4230 uint16_t 4231 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4232 { 4233 return bdev->acwu; 4234 } 4235 4236 uint32_t 4237 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4238 { 4239 return bdev->md_len; 4240 } 4241 4242 bool 4243 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4244 { 4245 return (bdev->md_len != 0) && bdev->md_interleave; 4246 } 4247 4248 bool 4249 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4250 { 4251 return (bdev->md_len != 0) && !bdev->md_interleave; 4252 } 4253 4254 bool 4255 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4256 { 4257 return bdev->zoned; 4258 } 4259 4260 uint32_t 4261 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4262 { 4263 if (spdk_bdev_is_md_interleaved(bdev)) { 4264 return bdev->blocklen - bdev->md_len; 4265 } else { 4266 return bdev->blocklen; 4267 } 4268 } 4269 4270 uint32_t 4271 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4272 { 4273 return bdev->phys_blocklen; 4274 } 4275 4276 static uint32_t 4277 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4278 { 4279 if (!spdk_bdev_is_md_interleaved(bdev)) { 4280 return bdev->blocklen + bdev->md_len; 4281 } else { 4282 return bdev->blocklen; 4283 } 4284 } 4285 4286 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4287 typedef enum spdk_dif_type spdk_dif_type_t; 4288 4289 spdk_dif_type_t 4290 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4291 { 4292 if (bdev->md_len != 0) { 4293 return bdev->dif_type; 4294 } else { 4295 return SPDK_DIF_DISABLE; 4296 } 4297 } 4298 4299 bool 4300 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4301 { 4302 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4303 return bdev->dif_is_head_of_md; 4304 } else { 4305 return false; 4306 } 4307 } 4308 4309 bool 4310 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4311 enum spdk_dif_check_type check_type) 4312 { 4313 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4314 return false; 4315 } 4316 4317 switch (check_type) { 4318 case SPDK_DIF_CHECK_TYPE_REFTAG: 4319 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4320 case SPDK_DIF_CHECK_TYPE_APPTAG: 4321 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4322 case SPDK_DIF_CHECK_TYPE_GUARD: 4323 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4324 default: 4325 return false; 4326 } 4327 } 4328 4329 uint32_t 4330 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4331 { 4332 return bdev->max_copy; 4333 } 4334 4335 uint64_t 4336 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4337 { 4338 return bdev->internal.measured_queue_depth; 4339 } 4340 4341 uint64_t 4342 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 4343 { 4344 return bdev->internal.period; 4345 } 4346 4347 uint64_t 4348 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 4349 { 4350 return bdev->internal.weighted_io_time; 4351 } 4352 4353 uint64_t 4354 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 4355 { 4356 return bdev->internal.io_time; 4357 } 4358 4359 static void bdev_update_qd_sampling_period(void *ctx); 4360 4361 static void 4362 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 4363 { 4364 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 4365 4366 if (bdev->internal.measured_queue_depth) { 4367 bdev->internal.io_time += bdev->internal.period; 4368 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 4369 } 4370 4371 bdev->internal.qd_poll_in_progress = false; 4372 4373 bdev_update_qd_sampling_period(bdev); 4374 } 4375 4376 static void 4377 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4378 struct spdk_io_channel *io_ch, void *_ctx) 4379 { 4380 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 4381 4382 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4383 spdk_bdev_for_each_channel_continue(i, 0); 4384 } 4385 4386 static int 4387 bdev_calculate_measured_queue_depth(void *ctx) 4388 { 4389 struct spdk_bdev *bdev = ctx; 4390 4391 bdev->internal.qd_poll_in_progress = true; 4392 bdev->internal.temporary_queue_depth = 0; 4393 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 4394 return SPDK_POLLER_BUSY; 4395 } 4396 4397 static void 4398 bdev_update_qd_sampling_period(void *ctx) 4399 { 4400 struct spdk_bdev *bdev = ctx; 4401 4402 if (bdev->internal.period == bdev->internal.new_period) { 4403 return; 4404 } 4405 4406 if (bdev->internal.qd_poll_in_progress) { 4407 return; 4408 } 4409 4410 bdev->internal.period = bdev->internal.new_period; 4411 4412 spdk_poller_unregister(&bdev->internal.qd_poller); 4413 if (bdev->internal.period != 0) { 4414 bdev->internal.qd_poller = 
SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4415 bdev, bdev->internal.period); 4416 } else { 4417 spdk_bdev_close(bdev->internal.qd_desc); 4418 bdev->internal.qd_desc = NULL; 4419 } 4420 } 4421 4422 static void 4423 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4424 { 4425 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4426 } 4427 4428 void 4429 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4430 { 4431 int rc; 4432 4433 if (bdev->internal.new_period == period) { 4434 return; 4435 } 4436 4437 bdev->internal.new_period = period; 4438 4439 if (bdev->internal.qd_desc != NULL) { 4440 assert(bdev->internal.period != 0); 4441 4442 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4443 bdev_update_qd_sampling_period, bdev); 4444 return; 4445 } 4446 4447 assert(bdev->internal.period == 0); 4448 4449 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4450 NULL, &bdev->internal.qd_desc); 4451 if (rc != 0) { 4452 return; 4453 } 4454 4455 bdev->internal.period = period; 4456 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4457 bdev, period); 4458 } 4459 4460 struct bdev_get_current_qd_ctx { 4461 uint64_t current_qd; 4462 spdk_bdev_get_current_qd_cb cb_fn; 4463 void *cb_arg; 4464 }; 4465 4466 static void 4467 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 4468 { 4469 struct bdev_get_current_qd_ctx *ctx = _ctx; 4470 4471 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 4472 4473 free(ctx); 4474 } 4475 4476 static void 4477 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4478 struct spdk_io_channel *io_ch, void *_ctx) 4479 { 4480 struct bdev_get_current_qd_ctx *ctx = _ctx; 4481 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4482 4483 ctx->current_qd += bdev_ch->io_outstanding; 4484 4485 spdk_bdev_for_each_channel_continue(i, 0); 4486 } 4487 4488 void 4489 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 4490 void *cb_arg) 4491 { 4492 struct bdev_get_current_qd_ctx *ctx; 4493 4494 assert(cb_fn != NULL); 4495 4496 ctx = calloc(1, sizeof(*ctx)); 4497 if (ctx == NULL) { 4498 cb_fn(bdev, 0, cb_arg, -ENOMEM); 4499 return; 4500 } 4501 4502 ctx->cb_fn = cb_fn; 4503 ctx->cb_arg = cb_arg; 4504 4505 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 4506 } 4507 4508 static void 4509 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 4510 { 4511 assert(desc->thread == spdk_get_thread()); 4512 4513 spdk_spin_lock(&desc->spinlock); 4514 desc->refs--; 4515 if (!desc->closed) { 4516 spdk_spin_unlock(&desc->spinlock); 4517 desc->callback.event_fn(type, 4518 desc->bdev, 4519 desc->callback.ctx); 4520 return; 4521 } else if (desc->refs == 0) { 4522 /* This descriptor was closed after this event_notify message was sent. 4523 * spdk_bdev_close() could not free the descriptor since this message was 4524 * in flight, so we free it now using bdev_desc_free(). 
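 * The reference taken in event_notify() before sending this message is what
 * kept the descriptor alive until this point.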
4525 */ 4526 spdk_spin_unlock(&desc->spinlock); 4527 bdev_desc_free(desc); 4528 return; 4529 } 4530 spdk_spin_unlock(&desc->spinlock); 4531 } 4532 4533 static void 4534 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 4535 { 4536 spdk_spin_lock(&desc->spinlock); 4537 desc->refs++; 4538 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 4539 spdk_spin_unlock(&desc->spinlock); 4540 } 4541 4542 static void 4543 _resize_notify(void *ctx) 4544 { 4545 struct spdk_bdev_desc *desc = ctx; 4546 4547 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 4548 } 4549 4550 int 4551 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 4552 { 4553 struct spdk_bdev_desc *desc; 4554 int ret; 4555 4556 if (size == bdev->blockcnt) { 4557 return 0; 4558 } 4559 4560 spdk_spin_lock(&bdev->internal.spinlock); 4561 4562 /* bdev has open descriptors */ 4563 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 4564 bdev->blockcnt > size) { 4565 ret = -EBUSY; 4566 } else { 4567 bdev->blockcnt = size; 4568 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 4569 event_notify(desc, _resize_notify); 4570 } 4571 ret = 0; 4572 } 4573 4574 spdk_spin_unlock(&bdev->internal.spinlock); 4575 4576 return ret; 4577 } 4578 4579 /* 4580 * Convert I/O offset and length from bytes to blocks. 4581 * 4582 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 4583 */ 4584 static uint64_t 4585 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 4586 uint64_t num_bytes, uint64_t *num_blocks) 4587 { 4588 uint32_t block_size = bdev->blocklen; 4589 uint8_t shift_cnt; 4590 4591 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 4592 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 4593 shift_cnt = spdk_u32log2(block_size); 4594 *offset_blocks = offset_bytes >> shift_cnt; 4595 *num_blocks = num_bytes >> shift_cnt; 4596 return (offset_bytes - (*offset_blocks << shift_cnt)) | 4597 (num_bytes - (*num_blocks << shift_cnt)); 4598 } else { 4599 *offset_blocks = offset_bytes / block_size; 4600 *num_blocks = num_bytes / block_size; 4601 return (offset_bytes % block_size) | (num_bytes % block_size); 4602 } 4603 } 4604 4605 static bool 4606 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 4607 { 4608 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 4609 * has been an overflow and hence the offset has been wrapped around */ 4610 if (offset_blocks + num_blocks < offset_blocks) { 4611 return false; 4612 } 4613 4614 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 4615 if (offset_blocks + num_blocks > bdev->blockcnt) { 4616 return false; 4617 } 4618 4619 return true; 4620 } 4621 4622 static void 4623 bdev_seek_complete_cb(void *ctx) 4624 { 4625 struct spdk_bdev_io *bdev_io = ctx; 4626 4627 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4628 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 4629 } 4630 4631 static int 4632 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4633 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 4634 spdk_bdev_io_completion_cb cb, void *cb_arg) 4635 { 4636 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4637 struct spdk_bdev_io *bdev_io; 4638 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4639 4640 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == 
SPDK_BDEV_IO_TYPE_SEEK_HOLE); 4641 4642 /* Check if offset_blocks is valid looking at the validity of one block */ 4643 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 4644 return -EINVAL; 4645 } 4646 4647 bdev_io = bdev_channel_get_io(channel); 4648 if (!bdev_io) { 4649 return -ENOMEM; 4650 } 4651 4652 bdev_io->internal.ch = channel; 4653 bdev_io->internal.desc = desc; 4654 bdev_io->type = io_type; 4655 bdev_io->u.bdev.offset_blocks = offset_blocks; 4656 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4657 4658 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 4659 /* In case bdev doesn't support seek to next data/hole offset, 4660 * it is assumed that only data and no holes are present */ 4661 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 4662 bdev_io->u.bdev.seek.offset = offset_blocks; 4663 } else { 4664 bdev_io->u.bdev.seek.offset = UINT64_MAX; 4665 } 4666 4667 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 4668 return 0; 4669 } 4670 4671 bdev_io_submit(bdev_io); 4672 return 0; 4673 } 4674 4675 int 4676 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4677 uint64_t offset_blocks, 4678 spdk_bdev_io_completion_cb cb, void *cb_arg) 4679 { 4680 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 4681 } 4682 4683 int 4684 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4685 uint64_t offset_blocks, 4686 spdk_bdev_io_completion_cb cb, void *cb_arg) 4687 { 4688 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 4689 } 4690 4691 uint64_t 4692 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io) 4693 { 4694 return bdev_io->u.bdev.seek.offset; 4695 } 4696 4697 static int 4698 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 4699 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4700 spdk_bdev_io_completion_cb cb, void *cb_arg) 4701 { 4702 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4703 struct spdk_bdev_io *bdev_io; 4704 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4705 4706 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4707 return -EINVAL; 4708 } 4709 4710 bdev_io = bdev_channel_get_io(channel); 4711 if (!bdev_io) { 4712 return -ENOMEM; 4713 } 4714 4715 bdev_io->internal.ch = channel; 4716 bdev_io->internal.desc = desc; 4717 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4718 bdev_io->u.bdev.iovs = &bdev_io->iov; 4719 bdev_io->u.bdev.iovs[0].iov_base = buf; 4720 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4721 bdev_io->u.bdev.iovcnt = 1; 4722 bdev_io->u.bdev.md_buf = md_buf; 4723 bdev_io->u.bdev.num_blocks = num_blocks; 4724 bdev_io->u.bdev.offset_blocks = offset_blocks; 4725 bdev_io->u.bdev.ext_opts = NULL; 4726 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4727 4728 bdev_io_submit(bdev_io); 4729 return 0; 4730 } 4731 4732 int 4733 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4734 void *buf, uint64_t offset, uint64_t nbytes, 4735 spdk_bdev_io_completion_cb cb, void *cb_arg) 4736 { 4737 uint64_t offset_blocks, num_blocks; 4738 4739 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4740 nbytes, &num_blocks) != 0) { 4741 return -EINVAL; 4742 } 4743 4744 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4745 } 4746 4747 int 4748 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4749 void *buf, uint64_t offset_blocks, 
uint64_t num_blocks, 4750 spdk_bdev_io_completion_cb cb, void *cb_arg) 4751 { 4752 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 4753 } 4754 4755 int 4756 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4757 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4758 spdk_bdev_io_completion_cb cb, void *cb_arg) 4759 { 4760 struct iovec iov = { 4761 .iov_base = buf, 4762 }; 4763 4764 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4765 return -EINVAL; 4766 } 4767 4768 if (md_buf && !_is_buf_allocated(&iov)) { 4769 return -EINVAL; 4770 } 4771 4772 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4773 cb, cb_arg); 4774 } 4775 4776 int 4777 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4778 struct iovec *iov, int iovcnt, 4779 uint64_t offset, uint64_t nbytes, 4780 spdk_bdev_io_completion_cb cb, void *cb_arg) 4781 { 4782 uint64_t offset_blocks, num_blocks; 4783 4784 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4785 nbytes, &num_blocks) != 0) { 4786 return -EINVAL; 4787 } 4788 4789 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4790 } 4791 4792 static int 4793 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4794 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 4795 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 4796 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 4797 { 4798 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4799 struct spdk_bdev_io *bdev_io; 4800 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4801 4802 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4803 return -EINVAL; 4804 } 4805 4806 bdev_io = bdev_channel_get_io(channel); 4807 if (!bdev_io) { 4808 return -ENOMEM; 4809 } 4810 4811 bdev_io->internal.ch = channel; 4812 bdev_io->internal.desc = desc; 4813 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4814 bdev_io->u.bdev.iovs = iov; 4815 bdev_io->u.bdev.iovcnt = iovcnt; 4816 bdev_io->u.bdev.md_buf = md_buf; 4817 bdev_io->u.bdev.num_blocks = num_blocks; 4818 bdev_io->u.bdev.offset_blocks = offset_blocks; 4819 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4820 bdev_io->internal.ext_opts = opts; 4821 bdev_io->u.bdev.ext_opts = opts; 4822 4823 _bdev_io_submit_ext(desc, bdev_io, opts, copy_opts); 4824 4825 return 0; 4826 } 4827 4828 int 4829 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4830 struct iovec *iov, int iovcnt, 4831 uint64_t offset_blocks, uint64_t num_blocks, 4832 spdk_bdev_io_completion_cb cb, void *cb_arg) 4833 { 4834 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4835 num_blocks, cb, cb_arg, NULL, false); 4836 } 4837 4838 int 4839 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4840 struct iovec *iov, int iovcnt, void *md_buf, 4841 uint64_t offset_blocks, uint64_t num_blocks, 4842 spdk_bdev_io_completion_cb cb, void *cb_arg) 4843 { 4844 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4845 return -EINVAL; 4846 } 4847 4848 if (md_buf && !_is_buf_allocated(iov)) { 4849 return -EINVAL; 4850 } 4851 4852 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4853 num_blocks, cb, cb_arg, NULL, false); 4854 } 4855 4856 static inline bool 4857 
_bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 4858 { 4859 /* 4860 * We check if opts size is at least of size when we first introduced 4861 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 4862 * are not checked internal. 4863 */ 4864 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 4865 sizeof(opts->metadata) && 4866 opts->size <= sizeof(*opts) && 4867 /* When memory domain is used, the user must provide data buffers */ 4868 (!opts->memory_domain || (iov && iov[0].iov_base)); 4869 } 4870 4871 int 4872 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4873 struct iovec *iov, int iovcnt, 4874 uint64_t offset_blocks, uint64_t num_blocks, 4875 spdk_bdev_io_completion_cb cb, void *cb_arg, 4876 struct spdk_bdev_ext_io_opts *opts) 4877 { 4878 void *md = NULL; 4879 4880 if (opts) { 4881 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 4882 return -EINVAL; 4883 } 4884 md = opts->metadata; 4885 } 4886 4887 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4888 return -EINVAL; 4889 } 4890 4891 if (md && !_is_buf_allocated(iov)) { 4892 return -EINVAL; 4893 } 4894 4895 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4896 num_blocks, cb, cb_arg, opts, false); 4897 } 4898 4899 static int 4900 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4901 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4902 spdk_bdev_io_completion_cb cb, void *cb_arg) 4903 { 4904 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4905 struct spdk_bdev_io *bdev_io; 4906 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4907 4908 if (!desc->write) { 4909 return -EBADF; 4910 } 4911 4912 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4913 return -EINVAL; 4914 } 4915 4916 bdev_io = bdev_channel_get_io(channel); 4917 if (!bdev_io) { 4918 return -ENOMEM; 4919 } 4920 4921 bdev_io->internal.ch = channel; 4922 bdev_io->internal.desc = desc; 4923 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4924 bdev_io->u.bdev.iovs = &bdev_io->iov; 4925 bdev_io->u.bdev.iovs[0].iov_base = buf; 4926 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4927 bdev_io->u.bdev.iovcnt = 1; 4928 bdev_io->u.bdev.md_buf = md_buf; 4929 bdev_io->u.bdev.num_blocks = num_blocks; 4930 bdev_io->u.bdev.offset_blocks = offset_blocks; 4931 bdev_io->u.bdev.ext_opts = NULL; 4932 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4933 4934 bdev_io_submit(bdev_io); 4935 return 0; 4936 } 4937 4938 int 4939 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4940 void *buf, uint64_t offset, uint64_t nbytes, 4941 spdk_bdev_io_completion_cb cb, void *cb_arg) 4942 { 4943 uint64_t offset_blocks, num_blocks; 4944 4945 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4946 nbytes, &num_blocks) != 0) { 4947 return -EINVAL; 4948 } 4949 4950 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4951 } 4952 4953 int 4954 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4955 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4956 spdk_bdev_io_completion_cb cb, void *cb_arg) 4957 { 4958 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4959 cb, cb_arg); 4960 } 4961 4962 int 4963 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4964 void *buf, void *md_buf, uint64_t 
offset_blocks, uint64_t num_blocks, 4965 spdk_bdev_io_completion_cb cb, void *cb_arg) 4966 { 4967 struct iovec iov = { 4968 .iov_base = buf, 4969 }; 4970 4971 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4972 return -EINVAL; 4973 } 4974 4975 if (md_buf && !_is_buf_allocated(&iov)) { 4976 return -EINVAL; 4977 } 4978 4979 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4980 cb, cb_arg); 4981 } 4982 4983 static int 4984 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4985 struct iovec *iov, int iovcnt, void *md_buf, 4986 uint64_t offset_blocks, uint64_t num_blocks, 4987 spdk_bdev_io_completion_cb cb, void *cb_arg, 4988 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 4989 { 4990 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4991 struct spdk_bdev_io *bdev_io; 4992 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4993 4994 if (!desc->write) { 4995 return -EBADF; 4996 } 4997 4998 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4999 return -EINVAL; 5000 } 5001 5002 bdev_io = bdev_channel_get_io(channel); 5003 if (!bdev_io) { 5004 return -ENOMEM; 5005 } 5006 5007 bdev_io->internal.ch = channel; 5008 bdev_io->internal.desc = desc; 5009 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5010 bdev_io->u.bdev.iovs = iov; 5011 bdev_io->u.bdev.iovcnt = iovcnt; 5012 bdev_io->u.bdev.md_buf = md_buf; 5013 bdev_io->u.bdev.num_blocks = num_blocks; 5014 bdev_io->u.bdev.offset_blocks = offset_blocks; 5015 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5016 bdev_io->internal.ext_opts = opts; 5017 bdev_io->u.bdev.ext_opts = opts; 5018 5019 _bdev_io_submit_ext(desc, bdev_io, opts, copy_opts); 5020 5021 return 0; 5022 } 5023 5024 int 5025 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5026 struct iovec *iov, int iovcnt, 5027 uint64_t offset, uint64_t len, 5028 spdk_bdev_io_completion_cb cb, void *cb_arg) 5029 { 5030 uint64_t offset_blocks, num_blocks; 5031 5032 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5033 len, &num_blocks) != 0) { 5034 return -EINVAL; 5035 } 5036 5037 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5038 } 5039 5040 int 5041 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5042 struct iovec *iov, int iovcnt, 5043 uint64_t offset_blocks, uint64_t num_blocks, 5044 spdk_bdev_io_completion_cb cb, void *cb_arg) 5045 { 5046 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5047 num_blocks, cb, cb_arg, NULL, false); 5048 } 5049 5050 int 5051 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5052 struct iovec *iov, int iovcnt, void *md_buf, 5053 uint64_t offset_blocks, uint64_t num_blocks, 5054 spdk_bdev_io_completion_cb cb, void *cb_arg) 5055 { 5056 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5057 return -EINVAL; 5058 } 5059 5060 if (md_buf && !_is_buf_allocated(iov)) { 5061 return -EINVAL; 5062 } 5063 5064 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5065 num_blocks, cb, cb_arg, NULL, false); 5066 } 5067 5068 int 5069 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5070 struct iovec *iov, int iovcnt, 5071 uint64_t offset_blocks, uint64_t num_blocks, 5072 spdk_bdev_io_completion_cb cb, void *cb_arg, 5073 struct spdk_bdev_ext_io_opts *opts) 5074 { 5075 void *md = NULL; 5076 5077 
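	/* The ext variant carries the optional metadata pointer inside opts. Validate
	 * the opts size and memory-domain constraints first, then apply the same
	 * metadata checks as the *_with_md variants above. */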
if (opts) { 5078 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5079 return -EINVAL; 5080 } 5081 md = opts->metadata; 5082 } 5083 5084 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5085 return -EINVAL; 5086 } 5087 5088 if (md && !_is_buf_allocated(iov)) { 5089 return -EINVAL; 5090 } 5091 5092 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5093 num_blocks, cb, cb_arg, opts, false); 5094 } 5095 5096 static void 5097 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5098 { 5099 struct spdk_bdev_io *parent_io = cb_arg; 5100 struct spdk_bdev *bdev = parent_io->bdev; 5101 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5102 int i, rc = 0; 5103 5104 if (!success) { 5105 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5106 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5107 spdk_bdev_free_io(bdev_io); 5108 return; 5109 } 5110 5111 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5112 rc = memcmp(read_buf, 5113 parent_io->u.bdev.iovs[i].iov_base, 5114 parent_io->u.bdev.iovs[i].iov_len); 5115 if (rc) { 5116 break; 5117 } 5118 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5119 } 5120 5121 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 5122 rc = memcmp(bdev_io->u.bdev.md_buf, 5123 parent_io->u.bdev.md_buf, 5124 spdk_bdev_get_md_size(bdev)); 5125 } 5126 5127 spdk_bdev_free_io(bdev_io); 5128 5129 if (rc == 0) { 5130 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5131 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5132 } else { 5133 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5134 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5135 } 5136 } 5137 5138 static void 5139 bdev_compare_do_read(void *_bdev_io) 5140 { 5141 struct spdk_bdev_io *bdev_io = _bdev_io; 5142 int rc; 5143 5144 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5145 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5146 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5147 bdev_compare_do_read_done, bdev_io); 5148 5149 if (rc == -ENOMEM) { 5150 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5151 } else if (rc != 0) { 5152 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5153 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5154 } 5155 } 5156 5157 static int 5158 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5159 struct iovec *iov, int iovcnt, void *md_buf, 5160 uint64_t offset_blocks, uint64_t num_blocks, 5161 spdk_bdev_io_completion_cb cb, void *cb_arg) 5162 { 5163 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5164 struct spdk_bdev_io *bdev_io; 5165 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5166 5167 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5168 return -EINVAL; 5169 } 5170 5171 bdev_io = bdev_channel_get_io(channel); 5172 if (!bdev_io) { 5173 return -ENOMEM; 5174 } 5175 5176 bdev_io->internal.ch = channel; 5177 bdev_io->internal.desc = desc; 5178 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5179 bdev_io->u.bdev.iovs = iov; 5180 bdev_io->u.bdev.iovcnt = iovcnt; 5181 bdev_io->u.bdev.md_buf = md_buf; 5182 bdev_io->u.bdev.num_blocks = num_blocks; 5183 bdev_io->u.bdev.offset_blocks = offset_blocks; 5184 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5185 bdev_io->u.bdev.ext_opts = NULL; 5186 5187 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5188 
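		/* Native COMPARE support: submit the I/O to the module as-is. Otherwise
		 * the compare is emulated below by reading the LBA range and memcmp()ing
		 * it against the caller's buffers (see bdev_compare_do_read()). */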
bdev_io_submit(bdev_io); 5189 return 0; 5190 } 5191 5192 bdev_compare_do_read(bdev_io); 5193 5194 return 0; 5195 } 5196 5197 int 5198 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5199 struct iovec *iov, int iovcnt, 5200 uint64_t offset_blocks, uint64_t num_blocks, 5201 spdk_bdev_io_completion_cb cb, void *cb_arg) 5202 { 5203 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5204 num_blocks, cb, cb_arg); 5205 } 5206 5207 int 5208 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5209 struct iovec *iov, int iovcnt, void *md_buf, 5210 uint64_t offset_blocks, uint64_t num_blocks, 5211 spdk_bdev_io_completion_cb cb, void *cb_arg) 5212 { 5213 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5214 return -EINVAL; 5215 } 5216 5217 if (md_buf && !_is_buf_allocated(iov)) { 5218 return -EINVAL; 5219 } 5220 5221 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5222 num_blocks, cb, cb_arg); 5223 } 5224 5225 static int 5226 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5227 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5228 spdk_bdev_io_completion_cb cb, void *cb_arg) 5229 { 5230 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5231 struct spdk_bdev_io *bdev_io; 5232 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5233 5234 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5235 return -EINVAL; 5236 } 5237 5238 bdev_io = bdev_channel_get_io(channel); 5239 if (!bdev_io) { 5240 return -ENOMEM; 5241 } 5242 5243 bdev_io->internal.ch = channel; 5244 bdev_io->internal.desc = desc; 5245 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5246 bdev_io->u.bdev.iovs = &bdev_io->iov; 5247 bdev_io->u.bdev.iovs[0].iov_base = buf; 5248 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5249 bdev_io->u.bdev.iovcnt = 1; 5250 bdev_io->u.bdev.md_buf = md_buf; 5251 bdev_io->u.bdev.num_blocks = num_blocks; 5252 bdev_io->u.bdev.offset_blocks = offset_blocks; 5253 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5254 bdev_io->u.bdev.ext_opts = NULL; 5255 5256 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5257 bdev_io_submit(bdev_io); 5258 return 0; 5259 } 5260 5261 bdev_compare_do_read(bdev_io); 5262 5263 return 0; 5264 } 5265 5266 int 5267 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5268 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5269 spdk_bdev_io_completion_cb cb, void *cb_arg) 5270 { 5271 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5272 cb, cb_arg); 5273 } 5274 5275 int 5276 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5277 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5278 spdk_bdev_io_completion_cb cb, void *cb_arg) 5279 { 5280 struct iovec iov = { 5281 .iov_base = buf, 5282 }; 5283 5284 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5285 return -EINVAL; 5286 } 5287 5288 if (md_buf && !_is_buf_allocated(&iov)) { 5289 return -EINVAL; 5290 } 5291 5292 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5293 cb, cb_arg); 5294 } 5295 5296 static void 5297 bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) 5298 { 5299 struct spdk_bdev_io *bdev_io = ctx; 5300 5301 if (unlock_status) { 5302 SPDK_ERRLOG("LBA range unlock 
failed\n"); 5303 } 5304 5305 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 5306 false, bdev_io->internal.caller_ctx); 5307 } 5308 5309 static void 5310 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 5311 { 5312 bdev_io->internal.status = status; 5313 5314 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 5315 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5316 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 5317 } 5318 5319 static void 5320 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5321 { 5322 struct spdk_bdev_io *parent_io = cb_arg; 5323 5324 if (!success) { 5325 SPDK_ERRLOG("Compare and write operation failed\n"); 5326 } 5327 5328 spdk_bdev_free_io(bdev_io); 5329 5330 bdev_comparev_and_writev_blocks_unlock(parent_io, 5331 success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 5332 } 5333 5334 static void 5335 bdev_compare_and_write_do_write(void *_bdev_io) 5336 { 5337 struct spdk_bdev_io *bdev_io = _bdev_io; 5338 int rc; 5339 5340 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 5341 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5342 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 5343 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5344 bdev_compare_and_write_do_write_done, bdev_io); 5345 5346 5347 if (rc == -ENOMEM) { 5348 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 5349 } else if (rc != 0) { 5350 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5351 } 5352 } 5353 5354 static void 5355 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5356 { 5357 struct spdk_bdev_io *parent_io = cb_arg; 5358 5359 spdk_bdev_free_io(bdev_io); 5360 5361 if (!success) { 5362 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 5363 return; 5364 } 5365 5366 bdev_compare_and_write_do_write(parent_io); 5367 } 5368 5369 static void 5370 bdev_compare_and_write_do_compare(void *_bdev_io) 5371 { 5372 struct spdk_bdev_io *bdev_io = _bdev_io; 5373 int rc; 5374 5375 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 5376 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 5377 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5378 bdev_compare_and_write_do_compare_done, bdev_io); 5379 5380 if (rc == -ENOMEM) { 5381 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 5382 } else if (rc != 0) { 5383 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 5384 } 5385 } 5386 5387 static void 5388 bdev_comparev_and_writev_blocks_locked(void *ctx, int status) 5389 { 5390 struct spdk_bdev_io *bdev_io = ctx; 5391 5392 if (status) { 5393 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 5394 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5395 return; 5396 } 5397 5398 bdev_compare_and_write_do_compare(bdev_io); 5399 } 5400 5401 int 5402 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5403 struct iovec *compare_iov, int compare_iovcnt, 5404 struct iovec *write_iov, int write_iovcnt, 5405 uint64_t offset_blocks, uint64_t num_blocks, 5406 spdk_bdev_io_completion_cb cb, void *cb_arg) 5407 { 5408 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5409 struct spdk_bdev_io 
*bdev_io; 5410 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5411 5412 if (!desc->write) { 5413 return -EBADF; 5414 } 5415 5416 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5417 return -EINVAL; 5418 } 5419 5420 if (num_blocks > bdev->acwu) { 5421 return -EINVAL; 5422 } 5423 5424 bdev_io = bdev_channel_get_io(channel); 5425 if (!bdev_io) { 5426 return -ENOMEM; 5427 } 5428 5429 bdev_io->internal.ch = channel; 5430 bdev_io->internal.desc = desc; 5431 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 5432 bdev_io->u.bdev.iovs = compare_iov; 5433 bdev_io->u.bdev.iovcnt = compare_iovcnt; 5434 bdev_io->u.bdev.fused_iovs = write_iov; 5435 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 5436 bdev_io->u.bdev.md_buf = NULL; 5437 bdev_io->u.bdev.num_blocks = num_blocks; 5438 bdev_io->u.bdev.offset_blocks = offset_blocks; 5439 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5440 bdev_io->u.bdev.ext_opts = NULL; 5441 5442 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 5443 bdev_io_submit(bdev_io); 5444 return 0; 5445 } 5446 5447 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 5448 bdev_comparev_and_writev_blocks_locked, bdev_io); 5449 } 5450 5451 int 5452 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5453 struct iovec *iov, int iovcnt, 5454 uint64_t offset_blocks, uint64_t num_blocks, 5455 bool populate, 5456 spdk_bdev_io_completion_cb cb, void *cb_arg) 5457 { 5458 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5459 struct spdk_bdev_io *bdev_io; 5460 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5461 5462 if (!desc->write) { 5463 return -EBADF; 5464 } 5465 5466 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5467 return -EINVAL; 5468 } 5469 5470 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 5471 return -ENOTSUP; 5472 } 5473 5474 bdev_io = bdev_channel_get_io(channel); 5475 if (!bdev_io) { 5476 return -ENOMEM; 5477 } 5478 5479 bdev_io->internal.ch = channel; 5480 bdev_io->internal.desc = desc; 5481 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 5482 bdev_io->u.bdev.num_blocks = num_blocks; 5483 bdev_io->u.bdev.offset_blocks = offset_blocks; 5484 bdev_io->u.bdev.iovs = iov; 5485 bdev_io->u.bdev.iovcnt = iovcnt; 5486 bdev_io->u.bdev.md_buf = NULL; 5487 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 5488 bdev_io->u.bdev.zcopy.commit = 0; 5489 bdev_io->u.bdev.zcopy.start = 1; 5490 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5491 bdev_io->u.bdev.ext_opts = NULL; 5492 5493 bdev_io_submit(bdev_io); 5494 5495 return 0; 5496 } 5497 5498 int 5499 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 5500 spdk_bdev_io_completion_cb cb, void *cb_arg) 5501 { 5502 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 5503 return -EINVAL; 5504 } 5505 5506 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 5507 bdev_io->u.bdev.zcopy.start = 0; 5508 bdev_io->internal.caller_ctx = cb_arg; 5509 bdev_io->internal.cb = cb; 5510 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 5511 5512 bdev_io_submit(bdev_io); 5513 5514 return 0; 5515 } 5516 5517 int 5518 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5519 uint64_t offset, uint64_t len, 5520 spdk_bdev_io_completion_cb cb, void *cb_arg) 5521 { 5522 uint64_t offset_blocks, num_blocks; 5523 5524 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5525 len, &num_blocks) != 0) { 5526 return -EINVAL; 5527 } 5528 5529 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5530 } 5531 5532 int 5533 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5534 uint64_t offset_blocks, uint64_t num_blocks, 5535 spdk_bdev_io_completion_cb cb, void *cb_arg) 5536 { 5537 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5538 struct spdk_bdev_io *bdev_io; 5539 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5540 5541 if (!desc->write) { 5542 return -EBADF; 5543 } 5544 5545 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5546 return -EINVAL; 5547 } 5548 5549 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 5550 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 5551 return -ENOTSUP; 5552 } 5553 5554 bdev_io = bdev_channel_get_io(channel); 5555 5556 if (!bdev_io) { 5557 return -ENOMEM; 5558 } 5559 5560 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 5561 bdev_io->internal.ch = channel; 5562 bdev_io->internal.desc = desc; 5563 bdev_io->u.bdev.offset_blocks = offset_blocks; 5564 bdev_io->u.bdev.num_blocks = num_blocks; 5565 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5566 bdev_io->u.bdev.ext_opts = NULL; 5567 5568 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 5569 bdev_io_submit(bdev_io); 5570 return 0; 5571 } 5572 5573 assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); 5574 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 5575 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 5576 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 5577 bdev_write_zero_buffer_next(bdev_io); 5578 5579 return 0; 5580 } 5581 5582 int 5583 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5584 uint64_t offset, uint64_t nbytes, 5585 spdk_bdev_io_completion_cb cb, void *cb_arg) 5586 { 5587 uint64_t offset_blocks, num_blocks; 5588 5589 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5590 nbytes, &num_blocks) != 0) { 5591 return -EINVAL; 5592 } 5593 5594 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5595 } 5596 5597 int 5598 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5599 uint64_t offset_blocks, uint64_t num_blocks, 5600 spdk_bdev_io_completion_cb cb, void *cb_arg) 5601 { 5602 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5603 struct spdk_bdev_io *bdev_io; 5604 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5605 5606 if (!desc->write) { 5607 return -EBADF; 5608 } 5609 5610 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5611 return -EINVAL; 5612 } 5613 5614 if (num_blocks == 0) { 5615 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 5616 return -EINVAL; 5617 } 5618 5619 bdev_io = bdev_channel_get_io(channel); 5620 if (!bdev_io) { 5621 return -ENOMEM; 5622 } 5623 5624 bdev_io->internal.ch 
= channel; 5625 bdev_io->internal.desc = desc; 5626 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 5627 5628 bdev_io->u.bdev.iovs = &bdev_io->iov; 5629 bdev_io->u.bdev.iovs[0].iov_base = NULL; 5630 bdev_io->u.bdev.iovs[0].iov_len = 0; 5631 bdev_io->u.bdev.iovcnt = 1; 5632 5633 bdev_io->u.bdev.offset_blocks = offset_blocks; 5634 bdev_io->u.bdev.num_blocks = num_blocks; 5635 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5636 bdev_io->u.bdev.ext_opts = NULL; 5637 5638 bdev_io_submit(bdev_io); 5639 return 0; 5640 } 5641 5642 int 5643 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5644 uint64_t offset, uint64_t length, 5645 spdk_bdev_io_completion_cb cb, void *cb_arg) 5646 { 5647 uint64_t offset_blocks, num_blocks; 5648 5649 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5650 length, &num_blocks) != 0) { 5651 return -EINVAL; 5652 } 5653 5654 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5655 } 5656 5657 int 5658 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5659 uint64_t offset_blocks, uint64_t num_blocks, 5660 spdk_bdev_io_completion_cb cb, void *cb_arg) 5661 { 5662 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5663 struct spdk_bdev_io *bdev_io; 5664 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5665 5666 if (!desc->write) { 5667 return -EBADF; 5668 } 5669 5670 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5671 return -EINVAL; 5672 } 5673 5674 bdev_io = bdev_channel_get_io(channel); 5675 if (!bdev_io) { 5676 return -ENOMEM; 5677 } 5678 5679 bdev_io->internal.ch = channel; 5680 bdev_io->internal.desc = desc; 5681 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 5682 bdev_io->u.bdev.iovs = NULL; 5683 bdev_io->u.bdev.iovcnt = 0; 5684 bdev_io->u.bdev.offset_blocks = offset_blocks; 5685 bdev_io->u.bdev.num_blocks = num_blocks; 5686 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5687 5688 bdev_io_submit(bdev_io); 5689 return 0; 5690 } 5691 5692 static int bdev_reset_poll_for_outstanding_io(void *ctx); 5693 5694 static void 5695 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 5696 { 5697 struct spdk_bdev_channel *ch = _ctx; 5698 struct spdk_bdev_io *bdev_io; 5699 5700 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5701 5702 if (status == -EBUSY) { 5703 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 5704 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 5705 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 5706 } else { 5707 /* If outstanding IOs are still present and reset_io_drain_timeout seconds passed, 5708 * start the reset. */ 5709 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5710 bdev_io_submit_reset(bdev_io); 5711 } 5712 } else { 5713 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5714 SPDK_DEBUGLOG(bdev, 5715 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 5716 ch->bdev->name); 5717 /* Mark the completion status as a SUCCESS and complete the reset. */ 5718 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 5719 } 5720 } 5721 5722 static void 5723 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5724 struct spdk_io_channel *io_ch, void *_ctx) 5725 { 5726 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 5727 int status = 0; 5728 5729 if (cur_ch->io_outstanding > 0) { 5730 /* If a channel has outstanding IO, set status to -EBUSY code. 
This will stop 5731 * further iteration over the rest of the channels and pass non-zero status 5732 * to the callback function. */ 5733 status = -EBUSY; 5734 } 5735 spdk_bdev_for_each_channel_continue(i, status); 5736 } 5737 5738 static int 5739 bdev_reset_poll_for_outstanding_io(void *ctx) 5740 { 5741 struct spdk_bdev_channel *ch = ctx; 5742 struct spdk_bdev_io *bdev_io; 5743 5744 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5745 5746 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 5747 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 5748 bdev_reset_check_outstanding_io_done); 5749 5750 return SPDK_POLLER_BUSY; 5751 } 5752 5753 static void 5754 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 5755 { 5756 struct spdk_bdev_channel *ch = _ctx; 5757 struct spdk_bdev_io *bdev_io; 5758 5759 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5760 5761 if (bdev->reset_io_drain_timeout == 0) { 5762 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5763 5764 bdev_io_submit_reset(bdev_io); 5765 return; 5766 } 5767 5768 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 5769 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 5770 5771 /* In case bdev->reset_io_drain_timeout is not equal to zero, 5772 * submit the reset to the underlying module only if outstanding I/O 5773 * remain after reset_io_drain_timeout seconds have passed. */ 5774 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 5775 bdev_reset_check_outstanding_io_done); 5776 } 5777 5778 static void 5779 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5780 struct spdk_io_channel *ch, void *_ctx) 5781 { 5782 struct spdk_bdev_channel *channel; 5783 struct spdk_bdev_mgmt_channel *mgmt_channel; 5784 struct spdk_bdev_shared_resource *shared_resource; 5785 bdev_io_tailq_t tmp_queued; 5786 5787 TAILQ_INIT(&tmp_queued); 5788 5789 channel = __io_ch_to_bdev_ch(ch); 5790 shared_resource = channel->shared_resource; 5791 mgmt_channel = shared_resource->mgmt_ch; 5792 5793 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 5794 5795 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 5796 /* The QoS object is always valid and readable while 5797 * the channel flag is set, so the lock here should not 5798 * be necessary. We're not in the fast path though, so 5799 * just take it anyway. 
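 * Swapping the QoS queue into tmp_queued while holding the lock also moves any
 * I/O that QoS has queued for this channel onto the local list, so the
 * bdev_abort_all_queued_io(&tmp_queued, channel) call below can fail them as
 * part of freezing the channel.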
*/ 5800 spdk_spin_lock(&channel->bdev->internal.spinlock); 5801 if (channel->bdev->internal.qos->ch == channel) { 5802 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 5803 } 5804 spdk_spin_unlock(&channel->bdev->internal.spinlock); 5805 } 5806 5807 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 5808 bdev_abort_all_buf_io(mgmt_channel, channel); 5810 bdev_abort_all_queued_io(&tmp_queued, channel); 5811 5812 spdk_bdev_for_each_channel_continue(i, 0); 5813 } 5814 5815 static void 5816 bdev_start_reset(void *ctx) 5817 { 5818 struct spdk_bdev_channel *ch = ctx; 5819 5820 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, 5821 bdev_reset_freeze_channel_done); 5822 } 5823 5824 static void 5825 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 5826 { 5827 struct spdk_bdev *bdev = ch->bdev; 5828 5829 assert(!TAILQ_EMPTY(&ch->queued_resets)); 5830 5831 spdk_spin_lock(&bdev->internal.spinlock); 5832 if (bdev->internal.reset_in_progress == NULL) { 5833 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 5834 /* 5835 * Take a channel reference for the target bdev for the life of this 5836 * reset. This guards against the channel getting destroyed while 5837 * spdk_bdev_for_each_channel() calls related to this reset IO are in 5838 * progress. We will release the reference when this reset is 5839 * completed. 5840 */ 5841 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 5842 bdev_start_reset(ch); 5843 } 5844 spdk_spin_unlock(&bdev->internal.spinlock); 5845 } 5846 5847 int 5848 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5849 spdk_bdev_io_completion_cb cb, void *cb_arg) 5850 { 5851 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5852 struct spdk_bdev_io *bdev_io; 5853 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5854 5855 bdev_io = bdev_channel_get_io(channel); 5856 if (!bdev_io) { 5857 return -ENOMEM; 5858 } 5859 5860 bdev_io->internal.ch = channel; 5861 bdev_io->internal.desc = desc; 5862 bdev_io->internal.submit_tsc = spdk_get_ticks(); 5863 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 5864 bdev_io->u.reset.ch_ref = NULL; 5865 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5866 5867 spdk_spin_lock(&bdev->internal.spinlock); 5868 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 5869 spdk_spin_unlock(&bdev->internal.spinlock); 5870 5871 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 5872 internal.ch_link); 5873 5874 bdev_channel_start_reset(channel); 5875 5876 return 0; 5877 } 5878 5879 void 5880 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 5881 struct spdk_bdev_io_stat *stat) 5882 { 5883 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5884 5885 bdev_get_io_stat(stat, channel->stat); 5886 } 5887 5888 static void 5889 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 5890 { 5891 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 5892 5893 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 5894 bdev_iostat_ctx->cb_arg, 0); 5895 free(bdev_iostat_ctx); 5896 } 5897 5898 static void 5899 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5900 struct spdk_io_channel *ch, void *_ctx) 5901 { 5902 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 5903 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5904 5905
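	/* This callback runs once per channel, on that channel's thread, and folds the
	 * per-channel counters into the shared bdev_iostat_ctx; completion of the walk
	 * is reported through bdev_get_device_stat_done() above.
	 *
	 * A minimal caller-side sketch of the asynchronous contract of
	 * spdk_bdev_get_device_stat(); the callback name is illustrative only and
	 * stat is a caller-owned struct spdk_bdev_io_stat that must stay valid until
	 * the callback fires:
	 *
	 *   static void
	 *   my_stat_done(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
	 *                void *cb_arg, int rc)
	 *   {
	 *           if (rc == 0) {
	 *                   SPDK_NOTICELOG("%s: read %" PRIu64 " bytes\n",
	 *                                  spdk_bdev_get_name(bdev), stat->bytes_read);
	 *           }
	 *   }
	 *
	 *   spdk_bdev_get_device_stat(bdev, stat, my_stat_done, NULL);
	 */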
spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 5906 spdk_bdev_for_each_channel_continue(i, 0); 5907 } 5908 5909 void 5910 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 5911 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 5912 { 5913 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 5914 5915 assert(bdev != NULL); 5916 assert(stat != NULL); 5917 assert(cb != NULL); 5918 5919 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 5920 if (bdev_iostat_ctx == NULL) { 5921 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 5922 cb(bdev, stat, cb_arg, -ENOMEM); 5923 return; 5924 } 5925 5926 bdev_iostat_ctx->stat = stat; 5927 bdev_iostat_ctx->cb = cb; 5928 bdev_iostat_ctx->cb_arg = cb_arg; 5929 5930 /* Start with the statistics from previously deleted channels. */ 5931 spdk_spin_lock(&bdev->internal.spinlock); 5932 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 5933 spdk_spin_unlock(&bdev->internal.spinlock); 5934 5935 /* Then iterate and add the statistics from each existing channel. */ 5936 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 5937 bdev_get_device_stat_done); 5938 } 5939 5940 struct bdev_iostat_reset_ctx { 5941 enum spdk_bdev_reset_stat_mode mode; 5942 bdev_reset_device_stat_cb cb; 5943 void *cb_arg; 5944 }; 5945 5946 static void 5947 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 5948 { 5949 struct bdev_iostat_reset_ctx *ctx = _ctx; 5950 5951 ctx->cb(bdev, ctx->cb_arg, 0); 5952 5953 free(ctx); 5954 } 5955 5956 static void 5957 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5958 struct spdk_io_channel *ch, void *_ctx) 5959 { 5960 struct bdev_iostat_reset_ctx *ctx = _ctx; 5961 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5962 5963 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 5964 5965 spdk_bdev_for_each_channel_continue(i, 0); 5966 } 5967 5968 void 5969 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 5970 bdev_reset_device_stat_cb cb, void *cb_arg) 5971 { 5972 struct bdev_iostat_reset_ctx *ctx; 5973 5974 assert(bdev != NULL); 5975 assert(cb != NULL); 5976 5977 ctx = calloc(1, sizeof(*ctx)); 5978 if (ctx == NULL) { 5979 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 5980 cb(bdev, cb_arg, -ENOMEM); 5981 return; 5982 } 5983 5984 ctx->mode = mode; 5985 ctx->cb = cb; 5986 ctx->cb_arg = cb_arg; 5987 5988 spdk_spin_lock(&bdev->internal.spinlock); 5989 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 5990 spdk_spin_unlock(&bdev->internal.spinlock); 5991 5992 spdk_bdev_for_each_channel(bdev, 5993 bdev_reset_each_channel_stat, 5994 ctx, 5995 bdev_reset_device_stat_done); 5996 } 5997 5998 int 5999 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6000 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6001 spdk_bdev_io_completion_cb cb, void *cb_arg) 6002 { 6003 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6004 struct spdk_bdev_io *bdev_io; 6005 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6006 6007 if (!desc->write) { 6008 return -EBADF; 6009 } 6010 6011 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6012 return -ENOTSUP; 6013 } 6014 6015 bdev_io = bdev_channel_get_io(channel); 6016 if (!bdev_io) { 6017 return -ENOMEM; 6018 } 6019 6020 bdev_io->internal.ch = channel; 6021 bdev_io->internal.desc = desc; 6022 bdev_io->type = 
SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6023 bdev_io->u.nvme_passthru.cmd = *cmd; 6024 bdev_io->u.nvme_passthru.buf = buf; 6025 bdev_io->u.nvme_passthru.nbytes = nbytes; 6026 bdev_io->u.nvme_passthru.md_buf = NULL; 6027 bdev_io->u.nvme_passthru.md_len = 0; 6028 6029 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6030 6031 bdev_io_submit(bdev_io); 6032 return 0; 6033 } 6034 6035 int 6036 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6037 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6038 spdk_bdev_io_completion_cb cb, void *cb_arg) 6039 { 6040 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6041 struct spdk_bdev_io *bdev_io; 6042 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6043 6044 if (!desc->write) { 6045 /* 6046 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6047 * to easily determine if the command is a read or write, but for now just 6048 * do not allow io_passthru with a read-only descriptor. 6049 */ 6050 return -EBADF; 6051 } 6052 6053 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6054 return -ENOTSUP; 6055 } 6056 6057 bdev_io = bdev_channel_get_io(channel); 6058 if (!bdev_io) { 6059 return -ENOMEM; 6060 } 6061 6062 bdev_io->internal.ch = channel; 6063 bdev_io->internal.desc = desc; 6064 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6065 bdev_io->u.nvme_passthru.cmd = *cmd; 6066 bdev_io->u.nvme_passthru.buf = buf; 6067 bdev_io->u.nvme_passthru.nbytes = nbytes; 6068 bdev_io->u.nvme_passthru.md_buf = NULL; 6069 bdev_io->u.nvme_passthru.md_len = 0; 6070 6071 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6072 6073 bdev_io_submit(bdev_io); 6074 return 0; 6075 } 6076 6077 int 6078 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6079 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6080 spdk_bdev_io_completion_cb cb, void *cb_arg) 6081 { 6082 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6083 struct spdk_bdev_io *bdev_io; 6084 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6085 6086 if (!desc->write) { 6087 /* 6088 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6089 * to easily determine if the command is a read or write, but for now just 6090 * do not allow io_passthru with a read-only descriptor. 
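 *
 * For reference, a minimal sketch of issuing a raw NVMe command through the
 * non-metadata variant above; the opcode, nsid and callback are illustrative
 * only, and the caller owns cmd/buf and must size buf to match nbytes:
 *
 *   struct spdk_nvme_cmd cmd = {0};
 *
 *   cmd.opc = SPDK_NVME_OPC_FLUSH;
 *   cmd.nsid = 1;
 *   rc = spdk_bdev_nvme_io_passthru(desc, io_ch, &cmd, NULL, 0, io_done, ctx);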
6091 */ 6092 return -EBADF; 6093 } 6094 6095 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6096 return -ENOTSUP; 6097 } 6098 6099 bdev_io = bdev_channel_get_io(channel); 6100 if (!bdev_io) { 6101 return -ENOMEM; 6102 } 6103 6104 bdev_io->internal.ch = channel; 6105 bdev_io->internal.desc = desc; 6106 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6107 bdev_io->u.nvme_passthru.cmd = *cmd; 6108 bdev_io->u.nvme_passthru.buf = buf; 6109 bdev_io->u.nvme_passthru.nbytes = nbytes; 6110 bdev_io->u.nvme_passthru.md_buf = md_buf; 6111 bdev_io->u.nvme_passthru.md_len = md_len; 6112 6113 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6114 6115 bdev_io_submit(bdev_io); 6116 return 0; 6117 } 6118 6119 static void bdev_abort_retry(void *ctx); 6120 static void bdev_abort(struct spdk_bdev_io *parent_io); 6121 6122 static void 6123 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6124 { 6125 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6126 struct spdk_bdev_io *parent_io = cb_arg; 6127 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6128 6129 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6130 6131 spdk_bdev_free_io(bdev_io); 6132 6133 if (!success) { 6134 /* Check if the target I/O completed in the meantime. */ 6135 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6136 if (tmp_io == bio_to_abort) { 6137 break; 6138 } 6139 } 6140 6141 /* If the target I/O still exists, set the parent to failed. */ 6142 if (tmp_io != NULL) { 6143 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6144 } 6145 } 6146 6147 parent_io->u.bdev.split_outstanding--; 6148 if (parent_io->u.bdev.split_outstanding == 0) { 6149 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6150 bdev_abort_retry(parent_io); 6151 } else { 6152 bdev_io_complete(parent_io); 6153 } 6154 } 6155 } 6156 6157 static int 6158 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6159 struct spdk_bdev_io *bio_to_abort, 6160 spdk_bdev_io_completion_cb cb, void *cb_arg) 6161 { 6162 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6163 struct spdk_bdev_io *bdev_io; 6164 6165 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 6166 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 6167 /* TODO: Abort reset or abort request. */ 6168 return -ENOTSUP; 6169 } 6170 6171 bdev_io = bdev_channel_get_io(channel); 6172 if (bdev_io == NULL) { 6173 return -ENOMEM; 6174 } 6175 6176 bdev_io->internal.ch = channel; 6177 bdev_io->internal.desc = desc; 6178 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6179 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6180 6181 if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bio_to_abort)) { 6182 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 6183 6184 /* Parent abort request is not submitted directly, but to manage its 6185 * execution add it to the submitted list here. 6186 */ 6187 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6188 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6189 6190 bdev_abort(bdev_io); 6191 6192 return 0; 6193 } 6194 6195 bdev_io->u.abort.bio_to_abort = bio_to_abort; 6196 6197 /* Submit the abort request to the underlying bdev module. 
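 * The module receives a bdev_io of type SPDK_BDEV_IO_TYPE_ABORT whose
 * u.abort.bio_to_abort points at the target I/O; the outcome is reported back
 * through the completion callback supplied by the caller (bdev_abort_io_done()
 * when invoked from _bdev_abort()).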
*/ 6198 bdev_io_submit(bdev_io); 6199 6200 return 0; 6201 } 6202 6203 static uint32_t 6204 _bdev_abort(struct spdk_bdev_io *parent_io) 6205 { 6206 struct spdk_bdev_desc *desc = parent_io->internal.desc; 6207 struct spdk_bdev_channel *channel = parent_io->internal.ch; 6208 void *bio_cb_arg; 6209 struct spdk_bdev_io *bio_to_abort; 6210 uint32_t matched_ios; 6211 int rc; 6212 6213 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 6214 6215 /* matched_ios is returned and will be kept by the caller. 6216 * 6217 * This function will be used for two cases, 1) the same cb_arg is used for 6218 * multiple I/Os, 2) a single large I/O is split into smaller ones. 6219 * Incrementing split_outstanding directly here may confuse readers especially 6220 * for the 1st case. 6221 * 6222 * Completion of I/O abort is processed after stack unwinding. Hence this trick 6223 * works as expected. 6224 */ 6225 matched_ios = 0; 6226 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6227 6228 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 6229 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 6230 continue; 6231 } 6232 6233 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 6234 /* Any I/O which was submitted after this abort command should be excluded. */ 6235 continue; 6236 } 6237 6238 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 6239 if (rc != 0) { 6240 if (rc == -ENOMEM) { 6241 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 6242 } else { 6243 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6244 } 6245 break; 6246 } 6247 matched_ios++; 6248 } 6249 6250 return matched_ios; 6251 } 6252 6253 static void 6254 bdev_abort_retry(void *ctx) 6255 { 6256 struct spdk_bdev_io *parent_io = ctx; 6257 uint32_t matched_ios; 6258 6259 matched_ios = _bdev_abort(parent_io); 6260 6261 if (matched_ios == 0) { 6262 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6263 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6264 } else { 6265 /* For retry, the case that no target I/O was found is success 6266 * because it means target I/Os completed in the meantime. 6267 */ 6268 bdev_io_complete(parent_io); 6269 } 6270 return; 6271 } 6272 6273 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6274 parent_io->u.bdev.split_outstanding = matched_ios; 6275 } 6276 6277 static void 6278 bdev_abort(struct spdk_bdev_io *parent_io) 6279 { 6280 uint32_t matched_ios; 6281 6282 matched_ios = _bdev_abort(parent_io); 6283 6284 if (matched_ios == 0) { 6285 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6286 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6287 } else { 6288 /* The case the no target I/O was found is failure. */ 6289 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6290 bdev_io_complete(parent_io); 6291 } 6292 return; 6293 } 6294 6295 /* Use split_outstanding to manage the progress of aborting I/Os. 
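 * Each child abort issued by _bdev_abort() decrements this counter in
 * bdev_abort_io_done(), and the parent completes (or is retried on NOMEM) once
 * it reaches zero. Assigning it only after _bdev_abort() returns is safe
 * because abort completions are processed after this stack unwinds, as noted
 * in _bdev_abort().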
*/ 6296 parent_io->u.bdev.split_outstanding = matched_ios; 6297 } 6298 6299 int 6300 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6301 void *bio_cb_arg, 6302 spdk_bdev_io_completion_cb cb, void *cb_arg) 6303 { 6304 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6305 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6306 struct spdk_bdev_io *bdev_io; 6307 6308 if (bio_cb_arg == NULL) { 6309 return -EINVAL; 6310 } 6311 6312 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 6313 return -ENOTSUP; 6314 } 6315 6316 bdev_io = bdev_channel_get_io(channel); 6317 if (bdev_io == NULL) { 6318 return -ENOMEM; 6319 } 6320 6321 bdev_io->internal.ch = channel; 6322 bdev_io->internal.desc = desc; 6323 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6324 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6325 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6326 6327 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 6328 6329 /* Parent abort request is not submitted directly, but to manage its execution, 6330 * add it to the submitted list here. 6331 */ 6332 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6333 6334 bdev_abort(bdev_io); 6335 6336 return 0; 6337 } 6338 6339 int 6340 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6341 struct spdk_bdev_io_wait_entry *entry) 6342 { 6343 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6344 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 6345 6346 if (bdev != entry->bdev) { 6347 SPDK_ERRLOG("bdevs do not match\n"); 6348 return -EINVAL; 6349 } 6350 6351 if (mgmt_ch->per_thread_cache_count > 0) { 6352 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 6353 return -EINVAL; 6354 } 6355 6356 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 6357 return 0; 6358 } 6359 6360 static inline void 6361 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 6362 { 6363 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 6364 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 6365 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 6366 uint32_t blocklen = bdev_io->bdev->blocklen; 6367 6368 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 6369 switch (bdev_io->type) { 6370 case SPDK_BDEV_IO_TYPE_READ: 6371 io_stat->bytes_read += num_blocks * blocklen; 6372 io_stat->num_read_ops++; 6373 io_stat->read_latency_ticks += tsc_diff; 6374 if (io_stat->max_read_latency_ticks < tsc_diff) { 6375 io_stat->max_read_latency_ticks = tsc_diff; 6376 } 6377 if (io_stat->min_read_latency_ticks > tsc_diff) { 6378 io_stat->min_read_latency_ticks = tsc_diff; 6379 } 6380 break; 6381 case SPDK_BDEV_IO_TYPE_WRITE: 6382 io_stat->bytes_written += num_blocks * blocklen; 6383 io_stat->num_write_ops++; 6384 io_stat->write_latency_ticks += tsc_diff; 6385 if (io_stat->max_write_latency_ticks < tsc_diff) { 6386 io_stat->max_write_latency_ticks = tsc_diff; 6387 } 6388 if (io_stat->min_write_latency_ticks > tsc_diff) { 6389 io_stat->min_write_latency_ticks = tsc_diff; 6390 } 6391 break; 6392 case SPDK_BDEV_IO_TYPE_UNMAP: 6393 io_stat->bytes_unmapped += num_blocks * blocklen; 6394 io_stat->num_unmap_ops++; 6395 io_stat->unmap_latency_ticks += tsc_diff; 6396 if (io_stat->max_unmap_latency_ticks < tsc_diff) { 6397 io_stat->max_unmap_latency_ticks = tsc_diff; 6398 } 6399 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 6400 io_stat->min_unmap_latency_ticks = tsc_diff; 6401 } 6402 break; 6403 
case SPDK_BDEV_IO_TYPE_ZCOPY: 6404 /* Track the data in the start phase only */ 6405 if (bdev_io->u.bdev.zcopy.start) { 6406 if (bdev_io->u.bdev.zcopy.populate) { 6407 io_stat->bytes_read += num_blocks * blocklen; 6408 io_stat->num_read_ops++; 6409 io_stat->read_latency_ticks += tsc_diff; 6410 if (io_stat->max_read_latency_ticks < tsc_diff) { 6411 io_stat->max_read_latency_ticks = tsc_diff; 6412 } 6413 if (io_stat->min_read_latency_ticks > tsc_diff) { 6414 io_stat->min_read_latency_ticks = tsc_diff; 6415 } 6416 } else { 6417 io_stat->bytes_written += num_blocks * blocklen; 6418 io_stat->num_write_ops++; 6419 io_stat->write_latency_ticks += tsc_diff; 6420 if (io_stat->max_write_latency_ticks < tsc_diff) { 6421 io_stat->max_write_latency_ticks = tsc_diff; 6422 } 6423 if (io_stat->min_write_latency_ticks > tsc_diff) { 6424 io_stat->min_write_latency_ticks = tsc_diff; 6425 } 6426 } 6427 } 6428 break; 6429 case SPDK_BDEV_IO_TYPE_COPY: 6430 io_stat->bytes_copied += num_blocks * blocklen; 6431 io_stat->num_copy_ops++; 6432 bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; 6433 if (io_stat->max_copy_latency_ticks < tsc_diff) { 6434 io_stat->max_copy_latency_ticks = tsc_diff; 6435 } 6436 if (io_stat->min_copy_latency_ticks > tsc_diff) { 6437 io_stat->min_copy_latency_ticks = tsc_diff; 6438 } 6439 break; 6440 default: 6441 break; 6442 } 6443 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 6444 io_stat = bdev_io->bdev->internal.stat; 6445 assert(io_stat->io_error != NULL); 6446 6447 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 6448 io_stat->io_error->error_status[-io_status - 1]++; 6449 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 6450 } 6451 6452 #ifdef SPDK_CONFIG_VTUNE 6453 uint64_t now_tsc = spdk_get_ticks(); 6454 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 6455 uint64_t data[5]; 6456 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 6457 6458 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 6459 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 6460 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 6461 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 6462 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 6463 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 6464 6465 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 6466 __itt_metadata_u64, 5, data); 6467 6468 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 6469 bdev_io->internal.ch->start_tsc = now_tsc; 6470 } 6471 #endif 6472 } 6473 6474 static inline void 6475 bdev_io_complete(void *ctx) 6476 { 6477 struct spdk_bdev_io *bdev_io = ctx; 6478 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6479 uint64_t tsc, tsc_diff; 6480 6481 if (spdk_unlikely(bdev_io->internal.in_submit_request)) { 6482 /* 6483 * Defer completion to avoid potential infinite recursion if the 6484 * user's completion callback issues a new I/O. 
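 * in_submit_request is set around the call into the module's submit path, so
 * deferring through spdk_thread_send_msg() guarantees the user callback only
 * runs after that call has returned.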
6485 */ 6486 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6487 bdev_io_complete, bdev_io); 6488 return; 6489 } 6490 6491 tsc = spdk_get_ticks(); 6492 tsc_diff = tsc - bdev_io->internal.submit_tsc; 6493 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 6494 bdev_io->internal.caller_ctx); 6495 6496 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 6497 6498 if (bdev_io->internal.ch->histogram) { 6499 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 6500 } 6501 6502 bdev_io_update_io_stat(bdev_io, tsc_diff); 6503 6504 assert(bdev_io->internal.cb != NULL); 6505 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 6506 6507 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 6508 bdev_io->internal.caller_ctx); 6509 } 6510 6511 static void bdev_destroy_cb(void *io_device); 6512 6513 static void 6514 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 6515 { 6516 struct spdk_bdev_io *bdev_io = _ctx; 6517 6518 if (bdev_io->u.reset.ch_ref != NULL) { 6519 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 6520 bdev_io->u.reset.ch_ref = NULL; 6521 } 6522 6523 bdev_io_complete(bdev_io); 6524 6525 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 6526 TAILQ_EMPTY(&bdev->internal.open_descs)) { 6527 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6528 } 6529 } 6530 6531 static void 6532 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6533 struct spdk_io_channel *_ch, void *_ctx) 6534 { 6535 struct spdk_bdev_io *bdev_io = _ctx; 6536 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 6537 struct spdk_bdev_io *queued_reset; 6538 6539 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 6540 while (!TAILQ_EMPTY(&ch->queued_resets)) { 6541 queued_reset = TAILQ_FIRST(&ch->queued_resets); 6542 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 6543 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 6544 } 6545 6546 spdk_bdev_for_each_channel_continue(i, 0); 6547 } 6548 6549 void 6550 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 6551 { 6552 struct spdk_bdev *bdev = bdev_io->bdev; 6553 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6554 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 6555 6556 bdev_io->internal.status = status; 6557 6558 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 6559 bool unlock_channels = false; 6560 6561 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 6562 SPDK_ERRLOG("NOMEM returned for reset\n"); 6563 } 6564 spdk_spin_lock(&bdev->internal.spinlock); 6565 if (bdev_io == bdev->internal.reset_in_progress) { 6566 bdev->internal.reset_in_progress = NULL; 6567 unlock_channels = true; 6568 } 6569 spdk_spin_unlock(&bdev->internal.spinlock); 6570 6571 if (unlock_channels) { 6572 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 6573 bdev_reset_complete); 6574 return; 6575 } 6576 } else { 6577 if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0)) { 6578 _bdev_io_push_bounce_data_buffer(bdev_io, _bdev_io_complete_push_bounce_done); 6579 /* bdev IO will be completed in the callback */ 6580 return; 6581 } 6582 6583 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 6584 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 6585 return; 6586 } 6587 } 6588 6589 bdev_io_complete(bdev_io); 6590 } 6591 6592 void 6593 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum 
spdk_scsi_status sc, 6594 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 6595 { 6596 if (sc == SPDK_SCSI_STATUS_GOOD) { 6597 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6598 } else { 6599 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 6600 bdev_io->internal.error.scsi.sc = sc; 6601 bdev_io->internal.error.scsi.sk = sk; 6602 bdev_io->internal.error.scsi.asc = asc; 6603 bdev_io->internal.error.scsi.ascq = ascq; 6604 } 6605 6606 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6607 } 6608 6609 void 6610 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 6611 int *sc, int *sk, int *asc, int *ascq) 6612 { 6613 assert(sc != NULL); 6614 assert(sk != NULL); 6615 assert(asc != NULL); 6616 assert(ascq != NULL); 6617 6618 switch (bdev_io->internal.status) { 6619 case SPDK_BDEV_IO_STATUS_SUCCESS: 6620 *sc = SPDK_SCSI_STATUS_GOOD; 6621 *sk = SPDK_SCSI_SENSE_NO_SENSE; 6622 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 6623 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 6624 break; 6625 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 6626 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 6627 break; 6628 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 6629 *sc = bdev_io->internal.error.scsi.sc; 6630 *sk = bdev_io->internal.error.scsi.sk; 6631 *asc = bdev_io->internal.error.scsi.asc; 6632 *ascq = bdev_io->internal.error.scsi.ascq; 6633 break; 6634 default: 6635 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 6636 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 6637 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 6638 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 6639 break; 6640 } 6641 } 6642 6643 void 6644 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 6645 { 6646 if (aio_result == 0) { 6647 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6648 } else { 6649 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 6650 } 6651 6652 bdev_io->internal.error.aio_result = aio_result; 6653 6654 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6655 } 6656 6657 void 6658 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 6659 { 6660 assert(aio_result != NULL); 6661 6662 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 6663 *aio_result = bdev_io->internal.error.aio_result; 6664 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6665 *aio_result = 0; 6666 } else { 6667 *aio_result = -EIO; 6668 } 6669 } 6670 6671 void 6672 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 6673 { 6674 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 6675 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6676 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 6677 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 6678 } else { 6679 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 6680 } 6681 6682 bdev_io->internal.error.nvme.cdw0 = cdw0; 6683 bdev_io->internal.error.nvme.sct = sct; 6684 bdev_io->internal.error.nvme.sc = sc; 6685 6686 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6687 } 6688 6689 void 6690 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 6691 { 6692 assert(sct != NULL); 6693 assert(sc != NULL); 6694 assert(cdw0 != NULL); 6695 6696 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 6697 *sct = SPDK_NVME_SCT_GENERIC; 6698 *sc = SPDK_NVME_SC_SUCCESS; 6699 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) 
{ 6700 *cdw0 = 0; 6701 } else { 6702 *cdw0 = 1U; 6703 } 6704 return; 6705 } 6706 6707 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 6708 *sct = bdev_io->internal.error.nvme.sct; 6709 *sc = bdev_io->internal.error.nvme.sc; 6710 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6711 *sct = SPDK_NVME_SCT_GENERIC; 6712 *sc = SPDK_NVME_SC_SUCCESS; 6713 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 6714 *sct = SPDK_NVME_SCT_GENERIC; 6715 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6716 } else { 6717 *sct = SPDK_NVME_SCT_GENERIC; 6718 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6719 } 6720 6721 *cdw0 = bdev_io->internal.error.nvme.cdw0; 6722 } 6723 6724 void 6725 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 6726 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 6727 { 6728 assert(first_sct != NULL); 6729 assert(first_sc != NULL); 6730 assert(second_sct != NULL); 6731 assert(second_sc != NULL); 6732 assert(cdw0 != NULL); 6733 6734 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 6735 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 6736 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 6737 *first_sct = bdev_io->internal.error.nvme.sct; 6738 *first_sc = bdev_io->internal.error.nvme.sc; 6739 *second_sct = SPDK_NVME_SCT_GENERIC; 6740 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6741 } else { 6742 *first_sct = SPDK_NVME_SCT_GENERIC; 6743 *first_sc = SPDK_NVME_SC_SUCCESS; 6744 *second_sct = bdev_io->internal.error.nvme.sct; 6745 *second_sc = bdev_io->internal.error.nvme.sc; 6746 } 6747 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 6748 *first_sct = SPDK_NVME_SCT_GENERIC; 6749 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6750 *second_sct = SPDK_NVME_SCT_GENERIC; 6751 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6752 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6753 *first_sct = SPDK_NVME_SCT_GENERIC; 6754 *first_sc = SPDK_NVME_SC_SUCCESS; 6755 *second_sct = SPDK_NVME_SCT_GENERIC; 6756 *second_sc = SPDK_NVME_SC_SUCCESS; 6757 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 6758 *first_sct = SPDK_NVME_SCT_GENERIC; 6759 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6760 *second_sct = SPDK_NVME_SCT_GENERIC; 6761 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6762 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 6763 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 6764 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 6765 *second_sct = SPDK_NVME_SCT_GENERIC; 6766 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6767 } else { 6768 *first_sct = SPDK_NVME_SCT_GENERIC; 6769 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6770 *second_sct = SPDK_NVME_SCT_GENERIC; 6771 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6772 } 6773 6774 *cdw0 = bdev_io->internal.error.nvme.cdw0; 6775 } 6776 6777 struct spdk_thread * 6778 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 6779 { 6780 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 6781 } 6782 6783 struct spdk_io_channel * 6784 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 6785 { 6786 return bdev_io->internal.ch->channel; 6787 } 6788 6789 static int 6790 bdev_register(struct spdk_bdev *bdev) 6791 { 6792 char *bdev_name; 6793 char uuid[SPDK_UUID_STRING_LEN]; 6794 int ret; 6795 6796 assert(bdev->module != NULL); 6797 6798 if (!bdev->name) { 6799 SPDK_ERRLOG("Bdev name is 
NULL\n"); 6800 return -EINVAL; 6801 } 6802 6803 if (!strlen(bdev->name)) { 6804 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 6805 return -EINVAL; 6806 } 6807 6808 /* Users often register their own I/O devices using the bdev name. In 6809 * order to avoid conflicts, prepend bdev_. */ 6810 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 6811 if (!bdev_name) { 6812 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 6813 return -ENOMEM; 6814 } 6815 6816 bdev->internal.stat = bdev_alloc_io_stat(true); 6817 if (!bdev->internal.stat) { 6818 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 6819 free(bdev_name); 6820 return -ENOMEM; 6821 } 6822 6823 bdev->internal.status = SPDK_BDEV_STATUS_READY; 6824 bdev->internal.measured_queue_depth = UINT64_MAX; 6825 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 6826 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 6827 bdev->internal.qd_poller = NULL; 6828 bdev->internal.qos = NULL; 6829 6830 TAILQ_INIT(&bdev->internal.open_descs); 6831 TAILQ_INIT(&bdev->internal.locked_ranges); 6832 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 6833 TAILQ_INIT(&bdev->aliases); 6834 6835 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 6836 if (ret != 0) { 6837 bdev_free_io_stat(bdev->internal.stat); 6838 free(bdev_name); 6839 return ret; 6840 } 6841 6842 /* UUID has to be specified by the user or defined by bdev itself. 6843 * Otherwise this field must remain empty, to indicate that this 6844 * value cannot be depended upon. */ 6845 if (!spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { 6846 /* Add the UUID alias only if it's different than the name */ 6847 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 6848 if (strcmp(bdev->name, uuid) != 0) { 6849 ret = spdk_bdev_alias_add(bdev, uuid); 6850 if (ret != 0) { 6851 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 6852 bdev_name_del(&bdev->internal.bdev_name); 6853 bdev_free_io_stat(bdev->internal.stat); 6854 free(bdev_name); 6855 return ret; 6856 } 6857 } 6858 } 6859 6860 if (spdk_bdev_get_buf_align(bdev) > 1) { 6861 if (bdev->split_on_optimal_io_boundary) { 6862 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 6863 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 6864 } else { 6865 bdev->split_on_optimal_io_boundary = true; 6866 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 6867 } 6868 } 6869 6870 /* If the user didn't specify a write unit size, set it to one. 
*/ 6871 if (bdev->write_unit_size == 0) { 6872 bdev->write_unit_size = 1; 6873 } 6874 6875 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 6876 if (bdev->acwu == 0) { 6877 bdev->acwu = bdev->write_unit_size; 6878 } 6879 6880 if (bdev->phys_blocklen == 0) { 6881 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 6882 } 6883 6884 bdev->internal.reset_in_progress = NULL; 6885 bdev->internal.qd_poll_in_progress = false; 6886 bdev->internal.period = 0; 6887 bdev->internal.new_period = 0; 6888 6889 spdk_io_device_register(__bdev_to_io_dev(bdev), 6890 bdev_channel_create, bdev_channel_destroy, 6891 sizeof(struct spdk_bdev_channel), 6892 bdev_name); 6893 6894 free(bdev_name); 6895 6896 spdk_spin_init(&bdev->internal.spinlock); 6897 6898 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 6899 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 6900 6901 return 0; 6902 } 6903 6904 static void 6905 bdev_destroy_cb(void *io_device) 6906 { 6907 int rc; 6908 struct spdk_bdev *bdev; 6909 spdk_bdev_unregister_cb cb_fn; 6910 void *cb_arg; 6911 6912 bdev = __bdev_from_io_dev(io_device); 6913 6914 if (bdev->internal.unregister_td != spdk_get_thread()) { 6915 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 6916 return; 6917 } 6918 6919 cb_fn = bdev->internal.unregister_cb; 6920 cb_arg = bdev->internal.unregister_ctx; 6921 6922 spdk_spin_destroy(&bdev->internal.spinlock); 6923 free(bdev->internal.qos); 6924 bdev_free_io_stat(bdev->internal.stat); 6925 6926 rc = bdev->fn_table->destruct(bdev->ctxt); 6927 if (rc < 0) { 6928 SPDK_ERRLOG("destruct failed\n"); 6929 } 6930 if (rc <= 0 && cb_fn != NULL) { 6931 cb_fn(cb_arg, rc); 6932 } 6933 } 6934 6935 void 6936 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 6937 { 6938 if (bdev->internal.unregister_cb != NULL) { 6939 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 6940 } 6941 } 6942 6943 static void 6944 _remove_notify(void *arg) 6945 { 6946 struct spdk_bdev_desc *desc = arg; 6947 6948 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 6949 } 6950 6951 /* returns: 0 - bdev removed and ready to be destructed. 6952 * -EBUSY - bdev can't be destructed yet. */ 6953 static int 6954 bdev_unregister_unsafe(struct spdk_bdev *bdev) 6955 { 6956 struct spdk_bdev_desc *desc, *tmp; 6957 int rc = 0; 6958 char uuid[SPDK_UUID_STRING_LEN]; 6959 6960 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 6961 assert(spdk_spin_held(&bdev->internal.spinlock)); 6962 6963 /* Notify each descriptor about hotremoval */ 6964 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 6965 rc = -EBUSY; 6966 /* 6967 * Defer invocation of the event_cb to a separate message that will 6968 * run later on its thread. This ensures this context unwinds and 6969 * we don't recursively unregister this bdev again if the event_cb 6970 * immediately closes its descriptor. 
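 * While at least one descriptor remains open, rc stays -EBUSY and the actual
 * teardown is retried from bdev_close() once the final descriptor goes away.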
6971 */ 6972 event_notify(desc, _remove_notify); 6973 } 6974 6975 /* If there are no descriptors, proceed removing the bdev */ 6976 if (rc == 0) { 6977 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 6978 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 6979 6980 /* Delete the name and the UUID alias */ 6981 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 6982 bdev_name_del_unsafe(&bdev->internal.bdev_name); 6983 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 6984 6985 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 6986 6987 if (bdev->internal.reset_in_progress != NULL) { 6988 /* If reset is in progress, let the completion callback for reset 6989 * unregister the bdev. 6990 */ 6991 rc = -EBUSY; 6992 } 6993 } 6994 6995 return rc; 6996 } 6997 6998 static void 6999 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7000 struct spdk_io_channel *io_ch, void *_ctx) 7001 { 7002 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 7003 7004 bdev_channel_abort_queued_ios(bdev_ch); 7005 spdk_bdev_for_each_channel_continue(i, 0); 7006 } 7007 7008 static void 7009 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 7010 { 7011 int rc; 7012 7013 spdk_spin_lock(&g_bdev_mgr.spinlock); 7014 spdk_spin_lock(&bdev->internal.spinlock); 7015 /* 7016 * Set the status to REMOVING after completing to abort channels. Otherwise, 7017 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 7018 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 7019 * may fail. 7020 */ 7021 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 7022 rc = bdev_unregister_unsafe(bdev); 7023 spdk_spin_unlock(&bdev->internal.spinlock); 7024 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7025 7026 if (rc == 0) { 7027 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7028 } 7029 } 7030 7031 void 7032 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7033 { 7034 struct spdk_thread *thread; 7035 7036 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 7037 7038 thread = spdk_get_thread(); 7039 if (!thread) { 7040 /* The user called this from a non-SPDK thread. 
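 *
 * For reference, the expected call pattern from an SPDK thread looks like the
 * following; the callback name is illustrative and its signature matches
 * spdk_bdev_unregister_cb:
 *
 *   static void
 *   unregister_done(void *cb_arg, int rc)
 *   {
 *           SPDK_NOTICELOG("unregister completed: %d\n", rc);
 *   }
 *
 *   spdk_bdev_unregister(bdev, unregister_done, NULL);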
*/ 7041 if (cb_fn != NULL) { 7042 cb_fn(cb_arg, -ENOTSUP); 7043 } 7044 return; 7045 } 7046 7047 spdk_spin_lock(&g_bdev_mgr.spinlock); 7048 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7049 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7050 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7051 if (cb_fn) { 7052 cb_fn(cb_arg, -EBUSY); 7053 } 7054 return; 7055 } 7056 7057 spdk_spin_lock(&bdev->internal.spinlock); 7058 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 7059 bdev->internal.unregister_cb = cb_fn; 7060 bdev->internal.unregister_ctx = cb_arg; 7061 bdev->internal.unregister_td = thread; 7062 spdk_spin_unlock(&bdev->internal.spinlock); 7063 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7064 7065 spdk_bdev_set_qd_sampling_period(bdev, 0); 7066 7067 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 7068 bdev_unregister); 7069 } 7070 7071 int 7072 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 7073 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7074 { 7075 struct spdk_bdev_desc *desc; 7076 struct spdk_bdev *bdev; 7077 int rc; 7078 7079 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 7080 if (rc != 0) { 7081 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 7082 return rc; 7083 } 7084 7085 bdev = spdk_bdev_desc_get_bdev(desc); 7086 7087 if (bdev->module != module) { 7088 spdk_bdev_close(desc); 7089 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 7090 bdev_name); 7091 return -ENODEV; 7092 } 7093 7094 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 7095 7096 spdk_bdev_close(desc); 7097 7098 return 0; 7099 } 7100 7101 static int 7102 bdev_start_qos(struct spdk_bdev *bdev) 7103 { 7104 struct set_qos_limit_ctx *ctx; 7105 7106 /* Enable QoS */ 7107 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 7108 ctx = calloc(1, sizeof(*ctx)); 7109 if (ctx == NULL) { 7110 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 7111 return -ENOMEM; 7112 } 7113 ctx->bdev = bdev; 7114 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 7115 } 7116 7117 return 0; 7118 } 7119 7120 static void 7121 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 7122 struct spdk_bdev *bdev) 7123 { 7124 enum spdk_bdev_claim_type type; 7125 const char *typename, *modname; 7126 extern struct spdk_log_flag SPDK_LOG_bdev; 7127 7128 assert(spdk_spin_held(&bdev->internal.spinlock)); 7129 7130 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 7131 return; 7132 } 7133 7134 type = bdev->internal.claim_type; 7135 typename = spdk_bdev_claim_get_name(type); 7136 7137 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 7138 modname = bdev->internal.claim.v1.module->name; 7139 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7140 bdev->name, detail, typename, modname); 7141 return; 7142 } 7143 7144 if (claim_type_is_v2(type)) { 7145 struct spdk_bdev_module_claim *claim; 7146 7147 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 7148 modname = claim->module->name; 7149 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7150 bdev->name, detail, typename, modname); 7151 } 7152 return; 7153 } 7154 7155 assert(false); 7156 } 7157 7158 static int 7159 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 7160 { 7161 struct spdk_thread *thread; 7162 int rc = 0; 7163 7164 thread = spdk_get_thread(); 7165 if (!thread) { 7166 
SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 7167 return -ENOTSUP; 7168 } 7169 7170 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7171 spdk_get_thread()); 7172 7173 desc->bdev = bdev; 7174 desc->thread = thread; 7175 desc->write = write; 7176 7177 spdk_spin_lock(&bdev->internal.spinlock); 7178 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7179 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7180 spdk_spin_unlock(&bdev->internal.spinlock); 7181 return -ENODEV; 7182 } 7183 7184 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7185 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7186 spdk_spin_unlock(&bdev->internal.spinlock); 7187 return -EPERM; 7188 } 7189 7190 rc = bdev_start_qos(bdev); 7191 if (rc != 0) { 7192 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 7193 spdk_spin_unlock(&bdev->internal.spinlock); 7194 return rc; 7195 } 7196 7197 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 7198 7199 spdk_spin_unlock(&bdev->internal.spinlock); 7200 7201 return 0; 7202 } 7203 7204 static int 7205 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 7206 struct spdk_bdev_desc **_desc) 7207 { 7208 struct spdk_bdev_desc *desc; 7209 unsigned int event_id; 7210 7211 desc = calloc(1, sizeof(*desc)); 7212 if (desc == NULL) { 7213 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 7214 return -ENOMEM; 7215 } 7216 7217 TAILQ_INIT(&desc->pending_media_events); 7218 TAILQ_INIT(&desc->free_media_events); 7219 7220 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 7221 desc->callback.event_fn = event_cb; 7222 desc->callback.ctx = event_ctx; 7223 spdk_spin_init(&desc->spinlock); 7224 7225 if (bdev->media_events) { 7226 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 7227 sizeof(*desc->media_events_buffer)); 7228 if (desc->media_events_buffer == NULL) { 7229 SPDK_ERRLOG("Failed to initialize media event pool\n"); 7230 bdev_desc_free(desc); 7231 return -ENOMEM; 7232 } 7233 7234 for (event_id = 0; event_id < MEDIA_EVENT_POOL_SIZE; ++event_id) { 7235 TAILQ_INSERT_TAIL(&desc->free_media_events, 7236 &desc->media_events_buffer[event_id], tailq); 7237 } 7238 } 7239 7240 *_desc = desc; 7241 7242 return 0; 7243 } 7244 7245 int 7246 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7247 void *event_ctx, struct spdk_bdev_desc **_desc) 7248 { 7249 struct spdk_bdev_desc *desc; 7250 struct spdk_bdev *bdev; 7251 int rc; 7252 7253 if (event_cb == NULL) { 7254 SPDK_ERRLOG("Missing event callback function\n"); 7255 return -EINVAL; 7256 } 7257 7258 spdk_spin_lock(&g_bdev_mgr.spinlock); 7259 7260 bdev = bdev_get_by_name(bdev_name); 7261 7262 if (bdev == NULL) { 7263 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 7264 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7265 return -ENODEV; 7266 } 7267 7268 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 7269 if (rc != 0) { 7270 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7271 return rc; 7272 } 7273 7274 rc = bdev_open(bdev, write, desc); 7275 if (rc != 0) { 7276 bdev_desc_free(desc); 7277 desc = NULL; 7278 } 7279 7280 *_desc = desc; 7281 7282 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7283 7284 return rc; 7285 } 7286 7287 static void 7288 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 7289 { 7290 int rc; 7291 7292 spdk_spin_lock(&bdev->internal.spinlock); 7293 spdk_spin_lock(&desc->spinlock); 7294 
7295 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 7296 7297 desc->closed = true; 7298 7299 if (desc->claim != NULL) { 7300 bdev_desc_release_claims(desc); 7301 } 7302 7303 if (0 == desc->refs) { 7304 spdk_spin_unlock(&desc->spinlock); 7305 bdev_desc_free(desc); 7306 } else { 7307 spdk_spin_unlock(&desc->spinlock); 7308 } 7309 7310 /* If no more descriptors, kill QoS channel */ 7311 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7312 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 7313 bdev->name, spdk_get_thread()); 7314 7315 if (bdev_qos_destroy(bdev)) { 7316 /* There isn't anything we can do to recover here. Just let the 7317 * old QoS poller keep running. The QoS handling won't change 7318 * cores when the user allocates a new channel, but it won't break. */ 7319 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 7320 } 7321 } 7322 7323 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7324 rc = bdev_unregister_unsafe(bdev); 7325 spdk_spin_unlock(&bdev->internal.spinlock); 7326 7327 if (rc == 0) { 7328 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7329 } 7330 } else { 7331 spdk_spin_unlock(&bdev->internal.spinlock); 7332 } 7333 } 7334 7335 void 7336 spdk_bdev_close(struct spdk_bdev_desc *desc) 7337 { 7338 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7339 7340 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7341 spdk_get_thread()); 7342 7343 assert(desc->thread == spdk_get_thread()); 7344 7345 spdk_poller_unregister(&desc->io_timeout_poller); 7346 7347 spdk_spin_lock(&g_bdev_mgr.spinlock); 7348 7349 bdev_close(bdev, desc); 7350 7351 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7352 } 7353 7354 static void 7355 bdev_register_finished(void *arg) 7356 { 7357 struct spdk_bdev_desc *desc = arg; 7358 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7359 7360 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 7361 7362 spdk_spin_lock(&g_bdev_mgr.spinlock); 7363 7364 bdev_close(bdev, desc); 7365 7366 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7367 } 7368 7369 int 7370 spdk_bdev_register(struct spdk_bdev *bdev) 7371 { 7372 struct spdk_bdev_desc *desc; 7373 struct spdk_thread *thread = spdk_get_thread(); 7374 int rc; 7375 7376 if (spdk_unlikely(spdk_thread_get_app_thread() != spdk_get_thread())) { 7377 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", bdev->name, thread, 7378 thread ? 
spdk_thread_get_name(thread) : "null"); 7379 return -EINVAL; 7380 } 7381 7382 rc = bdev_register(bdev); 7383 if (rc != 0) { 7384 return rc; 7385 } 7386 7387 /* A descriptor is opened to prevent bdev deletion during examination */ 7388 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7389 if (rc != 0) { 7390 spdk_bdev_unregister(bdev, NULL, NULL); 7391 return rc; 7392 } 7393 7394 rc = bdev_open(bdev, false, desc); 7395 if (rc != 0) { 7396 bdev_desc_free(desc); 7397 spdk_bdev_unregister(bdev, NULL, NULL); 7398 return rc; 7399 } 7400 7401 /* Examine configuration before initializing I/O */ 7402 bdev_examine(bdev); 7403 7404 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 7405 if (rc != 0) { 7406 bdev_close(bdev, desc); 7407 spdk_bdev_unregister(bdev, NULL, NULL); 7408 } 7409 7410 return rc; 7411 } 7412 7413 int 7414 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 7415 struct spdk_bdev_module *module) 7416 { 7417 spdk_spin_lock(&bdev->internal.spinlock); 7418 7419 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7420 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7421 spdk_spin_unlock(&bdev->internal.spinlock); 7422 return -EPERM; 7423 } 7424 7425 if (desc && !desc->write) { 7426 desc->write = true; 7427 } 7428 7429 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 7430 bdev->internal.claim.v1.module = module; 7431 7432 spdk_spin_unlock(&bdev->internal.spinlock); 7433 return 0; 7434 } 7435 7436 void 7437 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 7438 { 7439 spdk_spin_lock(&bdev->internal.spinlock); 7440 7441 assert(bdev->internal.claim.v1.module != NULL); 7442 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 7443 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7444 bdev->internal.claim.v1.module = NULL; 7445 7446 spdk_spin_unlock(&bdev->internal.spinlock); 7447 } 7448 7449 /* 7450 * Start claims v2 7451 */ 7452 7453 const char * 7454 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 7455 { 7456 switch (type) { 7457 case SPDK_BDEV_CLAIM_NONE: 7458 return "not_claimed"; 7459 case SPDK_BDEV_CLAIM_EXCL_WRITE: 7460 return "exclusive_write"; 7461 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 7462 return "read_many_write_one"; 7463 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 7464 return "read_many_write_none"; 7465 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 7466 return "read_many_write_many"; 7467 default: 7468 break; 7469 } 7470 return "invalid_claim"; 7471 } 7472 7473 static bool 7474 claim_type_is_v2(enum spdk_bdev_claim_type type) 7475 { 7476 switch (type) { 7477 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 7478 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 7479 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 7480 return true; 7481 default: 7482 break; 7483 } 7484 return false; 7485 } 7486 7487 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
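 * Claim types that imply write intent (read_many_write_one and
 * read_many_write_shared) promote a read-only descriptor, mirroring how
 * spdk_bdev_module_claim_bdev() promotes the descriptor for v1 claims;
 * read_many_write_none never does.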
*/ 7488 static bool 7489 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 7490 { 7491 switch (type) { 7492 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 7493 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 7494 return true; 7495 default: 7496 break; 7497 } 7498 return false; 7499 } 7500 7501 void 7502 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 7503 { 7504 if (opts == NULL) { 7505 SPDK_ERRLOG("opts should not be NULL\n"); 7506 assert(opts != NULL); 7507 return; 7508 } 7509 if (size == 0) { 7510 SPDK_ERRLOG("size should not be zero\n"); 7511 assert(size != 0); 7512 return; 7513 } 7514 7515 memset(opts, 0, size); 7516 opts->opts_size = size; 7517 7518 #define FIELD_OK(field) \ 7519 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 7520 7521 #define SET_FIELD(field, value) \ 7522 if (FIELD_OK(field)) { \ 7523 opts->field = value; \ 7524 } \ 7525 7526 SET_FIELD(shared_claim_key, 0); 7527 7528 #undef FIELD_OK 7529 #undef SET_FIELD 7530 } 7531 7532 static int 7533 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 7534 { 7535 if (src->opts_size == 0) { 7536 SPDK_ERRLOG("size should not be zero\n"); 7537 return -1; 7538 } 7539 7540 memset(dst, 0, sizeof(*dst)); 7541 dst->opts_size = src->opts_size; 7542 7543 #define FIELD_OK(field) \ 7544 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 7545 7546 #define SET_FIELD(field) \ 7547 if (FIELD_OK(field)) { \ 7548 dst->field = src->field; \ 7549 } \ 7550 7551 if (FIELD_OK(name)) { 7552 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 7553 } 7554 7555 SET_FIELD(shared_claim_key); 7556 7557 /* You should not remove this statement, but need to update the assert statement 7558 * if you add a new field, and also add a corresponding SET_FIELD statement */ 7559 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 7560 7561 #undef FIELD_OK 7562 #undef SET_FIELD 7563 return 0; 7564 } 7565 7566 /* Returns 0 if a read-write-once claim can be taken. */ 7567 static int 7568 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 7569 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 7570 { 7571 struct spdk_bdev *bdev = desc->bdev; 7572 struct spdk_bdev_desc *open_desc; 7573 7574 assert(spdk_spin_held(&bdev->internal.spinlock)); 7575 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 7576 7577 if (opts->shared_claim_key != 0) { 7578 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 7579 bdev->name); 7580 return -EINVAL; 7581 } 7582 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7583 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7584 return -EPERM; 7585 } 7586 if (desc->claim != NULL) { 7587 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 7588 bdev->name, desc->claim->module->name); 7589 return -EPERM; 7590 } 7591 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 7592 if (desc != open_desc && open_desc->write) { 7593 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 7594 "another descriptor is open for writing\n", 7595 bdev->name); 7596 return -EPERM; 7597 } 7598 } 7599 7600 return 0; 7601 } 7602 7603 /* Returns 0 if a read-only-many claim can be taken. 
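 * For example, a module holding a read-only descriptor would typically request such a
 * claim via spdk_bdev_module_claim_bdev_desc(desc, SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE,
 * NULL, module).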
 */
static int
claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_desc *open_desc;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE);
	assert(desc->claim == NULL);

	if (desc->write) {
		SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n",
			    bdev->name);
		return -EINVAL;
	}
	if (opts->shared_claim_key != 0) {
		SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name);
		return -EINVAL;
	}
	if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
			if (open_desc->write) {
				SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while "
					       "another descriptor is open for writing\n",
					       bdev->name);
				return -EPERM;
			}
		}
	}

	return 0;
}

/* Returns 0 if a read-write-many claim can be taken. */
static int
claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_desc *open_desc;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED);
	assert(desc->claim == NULL);

	if (opts->shared_claim_key == 0) {
		SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n",
			    bdev->name);
		return -EINVAL;
	}
	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
			if (open_desc == desc) {
				continue;
			}
			if (open_desc->write) {
				SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while "
					       "another descriptor is open for writing without a "
					       "claim\n", bdev->name);
				return -EPERM;
			}
		}
		break;
	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
		if (opts->shared_claim_key != bdev->internal.claim.v2.key) {
			LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev);
			return -EPERM;
		}
		break;
	default:
		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
		return -EBUSY;
	}

	return 0;
}

/* Updates desc and its bdev with a v2 claim.
 */
static int
claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
	   struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_module_claim *claim;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(claim_type_is_v2(type));
	assert(desc->claim == NULL);

	claim = calloc(1, sizeof(*desc->claim));
	if (claim == NULL) {
		SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name);
		return -ENOMEM;
	}
	claim->module = module;
	claim->desc = desc;
	SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match");
	memcpy(claim->name, opts->name, sizeof(claim->name));
	desc->claim = claim;

	if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
		bdev->internal.claim_type = type;
		TAILQ_INIT(&bdev->internal.claim.v2.claims);
		bdev->internal.claim.v2.key = opts->shared_claim_key;
	}
	assert(type == bdev->internal.claim_type);

	TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link);

	if (!desc->write && claim_type_promotes_to_write(type)) {
		desc->write = true;
	}

	return 0;
}

int
spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
				 struct spdk_bdev_claim_opts *_opts,
				 struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev;
	struct spdk_bdev_claim_opts opts;
	int rc = 0;

	if (desc == NULL) {
		SPDK_ERRLOG("descriptor must not be NULL\n");
		return -EINVAL;
	}

	/* Only dereference desc after the NULL check above. */
	bdev = desc->bdev;

	if (_opts == NULL) {
		spdk_bdev_claim_opts_init(&opts, sizeof(opts));
	} else if (claim_opts_copy(_opts, &opts) != 0) {
		return -EINVAL;
	}

	spdk_spin_lock(&bdev->internal.spinlock);

	if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE &&
	    bdev->internal.claim_type != type) {
		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
		spdk_spin_unlock(&bdev->internal.spinlock);
		return -EPERM;
	}

	if (claim_type_is_v2(type) && desc->claim != NULL) {
		SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n",
			    bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name);
		spdk_spin_unlock(&bdev->internal.spinlock);
		return -EPERM;
	}

	switch (type) {
	case SPDK_BDEV_CLAIM_EXCL_WRITE:
		spdk_spin_unlock(&bdev->internal.spinlock);
		return spdk_bdev_module_claim_bdev(bdev, desc, module);
	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
		rc = claim_verify_rwo(desc, type, &opts, module);
		break;
	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE:
		rc = claim_verify_rom(desc, type, &opts, module);
		break;
	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
		rc = claim_verify_rwm(desc, type, &opts, module);
		break;
	default:
		SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type);
		rc = -ENOTSUP;
	}

	if (rc == 0) {
		rc = claim_bdev(desc, type, &opts, module);
	}

	spdk_spin_unlock(&bdev->internal.spinlock);
	return rc;
}

static void
claim_reset(struct spdk_bdev *bdev)
{
	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(claim_type_is_v2(bdev->internal.claim_type));
	assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims));

	memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim));
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7793 } 7794 7795 static void 7796 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 7797 { 7798 struct spdk_bdev *bdev = desc->bdev; 7799 7800 assert(spdk_spin_held(&bdev->internal.spinlock)); 7801 assert(claim_type_is_v2(bdev->internal.claim_type)); 7802 7803 if (bdev->internal.examine_in_progress == 0) { 7804 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 7805 free(desc->claim); 7806 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 7807 claim_reset(bdev); 7808 } 7809 } else { 7810 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 7811 desc->claim->module = NULL; 7812 desc->claim->desc = NULL; 7813 } 7814 desc->claim = NULL; 7815 } 7816 7817 /* 7818 * End claims v2 7819 */ 7820 7821 struct spdk_bdev * 7822 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 7823 { 7824 assert(desc != NULL); 7825 return desc->bdev; 7826 } 7827 7828 int 7829 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 7830 { 7831 struct spdk_bdev *bdev, *tmp; 7832 struct spdk_bdev_desc *desc; 7833 int rc = 0; 7834 7835 assert(fn != NULL); 7836 7837 spdk_spin_lock(&g_bdev_mgr.spinlock); 7838 bdev = spdk_bdev_first(); 7839 while (bdev != NULL) { 7840 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7841 if (rc != 0) { 7842 break; 7843 } 7844 rc = bdev_open(bdev, false, desc); 7845 if (rc != 0) { 7846 bdev_desc_free(desc); 7847 if (rc == -ENODEV) { 7848 /* Ignore the error and move to the next bdev. */ 7849 rc = 0; 7850 bdev = spdk_bdev_next(bdev); 7851 continue; 7852 } 7853 break; 7854 } 7855 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7856 7857 rc = fn(ctx, bdev); 7858 7859 spdk_spin_lock(&g_bdev_mgr.spinlock); 7860 tmp = spdk_bdev_next(bdev); 7861 bdev_close(bdev, desc); 7862 if (rc != 0) { 7863 break; 7864 } 7865 bdev = tmp; 7866 } 7867 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7868 7869 return rc; 7870 } 7871 7872 int 7873 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 7874 { 7875 struct spdk_bdev *bdev, *tmp; 7876 struct spdk_bdev_desc *desc; 7877 int rc = 0; 7878 7879 assert(fn != NULL); 7880 7881 spdk_spin_lock(&g_bdev_mgr.spinlock); 7882 bdev = spdk_bdev_first_leaf(); 7883 while (bdev != NULL) { 7884 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7885 if (rc != 0) { 7886 break; 7887 } 7888 rc = bdev_open(bdev, false, desc); 7889 if (rc != 0) { 7890 bdev_desc_free(desc); 7891 if (rc == -ENODEV) { 7892 /* Ignore the error and move to the next bdev. 
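 * bdev_open() returns -ENODEV when the bdev is already being unregistered, which can
 * race with this iteration.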
*/ 7893 rc = 0; 7894 bdev = spdk_bdev_next_leaf(bdev); 7895 continue; 7896 } 7897 break; 7898 } 7899 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7900 7901 rc = fn(ctx, bdev); 7902 7903 spdk_spin_lock(&g_bdev_mgr.spinlock); 7904 tmp = spdk_bdev_next_leaf(bdev); 7905 bdev_close(bdev, desc); 7906 if (rc != 0) { 7907 break; 7908 } 7909 bdev = tmp; 7910 } 7911 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7912 7913 return rc; 7914 } 7915 7916 void 7917 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 7918 { 7919 struct iovec *iovs; 7920 int iovcnt; 7921 7922 if (bdev_io == NULL) { 7923 return; 7924 } 7925 7926 switch (bdev_io->type) { 7927 case SPDK_BDEV_IO_TYPE_READ: 7928 case SPDK_BDEV_IO_TYPE_WRITE: 7929 case SPDK_BDEV_IO_TYPE_ZCOPY: 7930 iovs = bdev_io->u.bdev.iovs; 7931 iovcnt = bdev_io->u.bdev.iovcnt; 7932 break; 7933 default: 7934 iovs = NULL; 7935 iovcnt = 0; 7936 break; 7937 } 7938 7939 if (iovp) { 7940 *iovp = iovs; 7941 } 7942 if (iovcntp) { 7943 *iovcntp = iovcnt; 7944 } 7945 } 7946 7947 void * 7948 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 7949 { 7950 if (bdev_io == NULL) { 7951 return NULL; 7952 } 7953 7954 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 7955 return NULL; 7956 } 7957 7958 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 7959 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 7960 return bdev_io->u.bdev.md_buf; 7961 } 7962 7963 return NULL; 7964 } 7965 7966 void * 7967 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 7968 { 7969 if (bdev_io == NULL) { 7970 assert(false); 7971 return NULL; 7972 } 7973 7974 return bdev_io->internal.caller_ctx; 7975 } 7976 7977 void 7978 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 7979 { 7980 7981 if (spdk_bdev_module_list_find(bdev_module->name)) { 7982 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 7983 assert(false); 7984 } 7985 7986 spdk_spin_init(&bdev_module->internal.spinlock); 7987 7988 /* 7989 * Modules with examine callbacks must be initialized first, so they are 7990 * ready to handle examine callbacks from later modules that will 7991 * register physical bdevs. 
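 * Such modules are therefore inserted at the head of the module list below.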
7992 */ 7993 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 7994 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 7995 } else { 7996 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 7997 } 7998 } 7999 8000 struct spdk_bdev_module * 8001 spdk_bdev_module_list_find(const char *name) 8002 { 8003 struct spdk_bdev_module *bdev_module; 8004 8005 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 8006 if (strcmp(name, bdev_module->name) == 0) { 8007 break; 8008 } 8009 } 8010 8011 return bdev_module; 8012 } 8013 8014 static void 8015 bdev_write_zero_buffer_next(void *_bdev_io) 8016 { 8017 struct spdk_bdev_io *bdev_io = _bdev_io; 8018 uint64_t num_bytes, num_blocks; 8019 void *md_buf = NULL; 8020 int rc; 8021 8022 num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * 8023 bdev_io->u.bdev.split_remaining_num_blocks, 8024 ZERO_BUFFER_SIZE); 8025 num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); 8026 num_blocks -= num_blocks % bdev_io->bdev->write_unit_size; 8027 8028 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 8029 md_buf = (char *)g_bdev_mgr.zero_buffer + 8030 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 8031 } 8032 8033 rc = bdev_write_blocks_with_md(bdev_io->internal.desc, 8034 spdk_io_channel_from_ctx(bdev_io->internal.ch), 8035 g_bdev_mgr.zero_buffer, md_buf, 8036 bdev_io->u.bdev.split_current_offset_blocks, num_blocks, 8037 bdev_write_zero_buffer_done, bdev_io); 8038 if (rc == 0) { 8039 bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; 8040 bdev_io->u.bdev.split_current_offset_blocks += num_blocks; 8041 } else if (rc == -ENOMEM) { 8042 bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); 8043 } else { 8044 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 8045 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 8046 } 8047 } 8048 8049 static void 8050 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 8051 { 8052 struct spdk_bdev_io *parent_io = cb_arg; 8053 8054 spdk_bdev_free_io(bdev_io); 8055 8056 if (!success) { 8057 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 8058 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 8059 return; 8060 } 8061 8062 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 8063 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 8064 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 8065 return; 8066 } 8067 8068 bdev_write_zero_buffer_next(parent_io); 8069 } 8070 8071 static void 8072 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 8073 { 8074 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8075 ctx->bdev->internal.qos_mod_in_progress = false; 8076 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8077 8078 if (ctx->cb_fn) { 8079 ctx->cb_fn(ctx->cb_arg, status); 8080 } 8081 free(ctx); 8082 } 8083 8084 static void 8085 bdev_disable_qos_done(void *cb_arg) 8086 { 8087 struct set_qos_limit_ctx *ctx = cb_arg; 8088 struct spdk_bdev *bdev = ctx->bdev; 8089 struct spdk_bdev_io *bdev_io; 8090 struct spdk_bdev_qos *qos; 8091 8092 spdk_spin_lock(&bdev->internal.spinlock); 8093 qos = bdev->internal.qos; 8094 bdev->internal.qos = NULL; 8095 spdk_spin_unlock(&bdev->internal.spinlock); 8096 8097 while (!TAILQ_EMPTY(&qos->queued)) { 8098 /* Send queued I/O back to their original thread for resubmission. 
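 * The submitting channel is restored first so that spdk_bdev_io_get_thread() resolves
 * the thread each I/O was originally submitted from.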
*/ 8099 bdev_io = TAILQ_FIRST(&qos->queued); 8100 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 8101 8102 if (bdev_io->internal.io_submit_ch) { 8103 /* 8104 * Channel was changed when sending it to the QoS thread - change it back 8105 * before sending it back to the original thread. 8106 */ 8107 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 8108 bdev_io->internal.io_submit_ch = NULL; 8109 } 8110 8111 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 8112 _bdev_io_submit, bdev_io); 8113 } 8114 8115 if (qos->thread != NULL) { 8116 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 8117 spdk_poller_unregister(&qos->poller); 8118 } 8119 8120 free(qos); 8121 8122 bdev_set_qos_limit_done(ctx, 0); 8123 } 8124 8125 static void 8126 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 8127 { 8128 struct set_qos_limit_ctx *ctx = _ctx; 8129 struct spdk_thread *thread; 8130 8131 spdk_spin_lock(&bdev->internal.spinlock); 8132 thread = bdev->internal.qos->thread; 8133 spdk_spin_unlock(&bdev->internal.spinlock); 8134 8135 if (thread != NULL) { 8136 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 8137 } else { 8138 bdev_disable_qos_done(ctx); 8139 } 8140 } 8141 8142 static void 8143 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8144 struct spdk_io_channel *ch, void *_ctx) 8145 { 8146 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8147 8148 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 8149 8150 spdk_bdev_for_each_channel_continue(i, 0); 8151 } 8152 8153 static void 8154 bdev_update_qos_rate_limit_msg(void *cb_arg) 8155 { 8156 struct set_qos_limit_ctx *ctx = cb_arg; 8157 struct spdk_bdev *bdev = ctx->bdev; 8158 8159 spdk_spin_lock(&bdev->internal.spinlock); 8160 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 8161 spdk_spin_unlock(&bdev->internal.spinlock); 8162 8163 bdev_set_qos_limit_done(ctx, 0); 8164 } 8165 8166 static void 8167 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8168 struct spdk_io_channel *ch, void *_ctx) 8169 { 8170 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8171 8172 spdk_spin_lock(&bdev->internal.spinlock); 8173 bdev_enable_qos(bdev, bdev_ch); 8174 spdk_spin_unlock(&bdev->internal.spinlock); 8175 spdk_bdev_for_each_channel_continue(i, 0); 8176 } 8177 8178 static void 8179 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 8180 { 8181 struct set_qos_limit_ctx *ctx = _ctx; 8182 8183 bdev_set_qos_limit_done(ctx, status); 8184 } 8185 8186 static void 8187 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 8188 { 8189 int i; 8190 8191 assert(bdev->internal.qos != NULL); 8192 8193 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8194 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 8195 bdev->internal.qos->rate_limits[i].limit = limits[i]; 8196 8197 if (limits[i] == 0) { 8198 bdev->internal.qos->rate_limits[i].limit = 8199 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 8200 } 8201 } 8202 } 8203 } 8204 8205 void 8206 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 8207 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 8208 { 8209 struct set_qos_limit_ctx *ctx; 8210 uint32_t limit_set_complement; 8211 uint64_t min_limit_per_sec; 8212 int i; 8213 bool disable_rate_limit = true; 8214 8215 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8216 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 8217 continue; 8218 } 8219 8220 if (limits[i] > 0) { 8221 disable_rate_limit = 
false; 8222 } 8223 8224 if (bdev_qos_is_iops_rate_limit(i) == true) { 8225 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 8226 } else { 8227 /* Change from megabyte to byte rate limit */ 8228 limits[i] = limits[i] * 1024 * 1024; 8229 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 8230 } 8231 8232 limit_set_complement = limits[i] % min_limit_per_sec; 8233 if (limit_set_complement) { 8234 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 8235 limits[i], min_limit_per_sec); 8236 limits[i] += min_limit_per_sec - limit_set_complement; 8237 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 8238 } 8239 } 8240 8241 ctx = calloc(1, sizeof(*ctx)); 8242 if (ctx == NULL) { 8243 cb_fn(cb_arg, -ENOMEM); 8244 return; 8245 } 8246 8247 ctx->cb_fn = cb_fn; 8248 ctx->cb_arg = cb_arg; 8249 ctx->bdev = bdev; 8250 8251 spdk_spin_lock(&bdev->internal.spinlock); 8252 if (bdev->internal.qos_mod_in_progress) { 8253 spdk_spin_unlock(&bdev->internal.spinlock); 8254 free(ctx); 8255 cb_fn(cb_arg, -EAGAIN); 8256 return; 8257 } 8258 bdev->internal.qos_mod_in_progress = true; 8259 8260 if (disable_rate_limit == true && bdev->internal.qos) { 8261 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8262 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 8263 (bdev->internal.qos->rate_limits[i].limit > 0 && 8264 bdev->internal.qos->rate_limits[i].limit != 8265 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 8266 disable_rate_limit = false; 8267 break; 8268 } 8269 } 8270 } 8271 8272 if (disable_rate_limit == false) { 8273 if (bdev->internal.qos == NULL) { 8274 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 8275 if (!bdev->internal.qos) { 8276 spdk_spin_unlock(&bdev->internal.spinlock); 8277 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 8278 bdev_set_qos_limit_done(ctx, -ENOMEM); 8279 return; 8280 } 8281 } 8282 8283 if (bdev->internal.qos->thread == NULL) { 8284 /* Enabling */ 8285 bdev_set_qos_rate_limits(bdev, limits); 8286 8287 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 8288 bdev_enable_qos_done); 8289 } else { 8290 /* Updating */ 8291 bdev_set_qos_rate_limits(bdev, limits); 8292 8293 spdk_thread_send_msg(bdev->internal.qos->thread, 8294 bdev_update_qos_rate_limit_msg, ctx); 8295 } 8296 } else { 8297 if (bdev->internal.qos != NULL) { 8298 bdev_set_qos_rate_limits(bdev, limits); 8299 8300 /* Disabling */ 8301 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 8302 bdev_disable_qos_msg_done); 8303 } else { 8304 spdk_spin_unlock(&bdev->internal.spinlock); 8305 bdev_set_qos_limit_done(ctx, 0); 8306 return; 8307 } 8308 } 8309 8310 spdk_spin_unlock(&bdev->internal.spinlock); 8311 } 8312 8313 struct spdk_bdev_histogram_ctx { 8314 spdk_bdev_histogram_status_cb cb_fn; 8315 void *cb_arg; 8316 struct spdk_bdev *bdev; 8317 int status; 8318 }; 8319 8320 static void 8321 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8322 { 8323 struct spdk_bdev_histogram_ctx *ctx = _ctx; 8324 8325 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8326 ctx->bdev->internal.histogram_in_progress = false; 8327 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8328 ctx->cb_fn(ctx->cb_arg, ctx->status); 8329 free(ctx); 8330 } 8331 8332 static void 8333 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8334 struct spdk_io_channel *_ch, void *_ctx) 8335 { 8336 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8337 8338 if (ch->histogram != NULL) { 8339 
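		/* Drop this channel's histogram data; this path runs both when histograms
		 * are disabled and when rolling back a failed enable. */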
spdk_histogram_data_free(ch->histogram); 8340 ch->histogram = NULL; 8341 } 8342 spdk_bdev_for_each_channel_continue(i, 0); 8343 } 8344 8345 static void 8346 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8347 { 8348 struct spdk_bdev_histogram_ctx *ctx = _ctx; 8349 8350 if (status != 0) { 8351 ctx->status = status; 8352 ctx->bdev->internal.histogram_enabled = false; 8353 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 8354 bdev_histogram_disable_channel_cb); 8355 } else { 8356 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8357 ctx->bdev->internal.histogram_in_progress = false; 8358 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8359 ctx->cb_fn(ctx->cb_arg, ctx->status); 8360 free(ctx); 8361 } 8362 } 8363 8364 static void 8365 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8366 struct spdk_io_channel *_ch, void *_ctx) 8367 { 8368 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8369 int status = 0; 8370 8371 if (ch->histogram == NULL) { 8372 ch->histogram = spdk_histogram_data_alloc(); 8373 if (ch->histogram == NULL) { 8374 status = -ENOMEM; 8375 } 8376 } 8377 8378 spdk_bdev_for_each_channel_continue(i, status); 8379 } 8380 8381 void 8382 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 8383 void *cb_arg, bool enable) 8384 { 8385 struct spdk_bdev_histogram_ctx *ctx; 8386 8387 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 8388 if (ctx == NULL) { 8389 cb_fn(cb_arg, -ENOMEM); 8390 return; 8391 } 8392 8393 ctx->bdev = bdev; 8394 ctx->status = 0; 8395 ctx->cb_fn = cb_fn; 8396 ctx->cb_arg = cb_arg; 8397 8398 spdk_spin_lock(&bdev->internal.spinlock); 8399 if (bdev->internal.histogram_in_progress) { 8400 spdk_spin_unlock(&bdev->internal.spinlock); 8401 free(ctx); 8402 cb_fn(cb_arg, -EAGAIN); 8403 return; 8404 } 8405 8406 bdev->internal.histogram_in_progress = true; 8407 spdk_spin_unlock(&bdev->internal.spinlock); 8408 8409 bdev->internal.histogram_enabled = enable; 8410 8411 if (enable) { 8412 /* Allocate histogram for each channel */ 8413 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 8414 bdev_histogram_enable_channel_cb); 8415 } else { 8416 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 8417 bdev_histogram_disable_channel_cb); 8418 } 8419 } 8420 8421 struct spdk_bdev_histogram_data_ctx { 8422 spdk_bdev_histogram_data_cb cb_fn; 8423 void *cb_arg; 8424 struct spdk_bdev *bdev; 8425 /** merged histogram data from all channels */ 8426 struct spdk_histogram_data *histogram; 8427 }; 8428 8429 static void 8430 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8431 { 8432 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 8433 8434 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 8435 free(ctx); 8436 } 8437 8438 static void 8439 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8440 struct spdk_io_channel *_ch, void *_ctx) 8441 { 8442 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8443 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 8444 int status = 0; 8445 8446 if (ch->histogram == NULL) { 8447 status = -EFAULT; 8448 } else { 8449 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 8450 } 8451 8452 spdk_bdev_for_each_channel_continue(i, status); 8453 } 8454 8455 void 8456 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 8457 spdk_bdev_histogram_data_cb cb_fn, 8458 void *cb_arg) 8459 { 
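	/* The caller-provided histogram (typically allocated with spdk_histogram_data_alloc())
	 * acts as the accumulator: each channel's data is merged into it before cb_fn runs. */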
8460 struct spdk_bdev_histogram_data_ctx *ctx; 8461 8462 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 8463 if (ctx == NULL) { 8464 cb_fn(cb_arg, -ENOMEM, NULL); 8465 return; 8466 } 8467 8468 ctx->bdev = bdev; 8469 ctx->cb_fn = cb_fn; 8470 ctx->cb_arg = cb_arg; 8471 8472 ctx->histogram = histogram; 8473 8474 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 8475 bdev_histogram_get_channel_cb); 8476 } 8477 8478 void 8479 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 8480 void *cb_arg) 8481 { 8482 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8483 int status = 0; 8484 8485 assert(cb_fn != NULL); 8486 8487 if (bdev_ch->histogram == NULL) { 8488 status = -EFAULT; 8489 } 8490 cb_fn(cb_arg, status, bdev_ch->histogram); 8491 } 8492 8493 size_t 8494 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 8495 size_t max_events) 8496 { 8497 struct media_event_entry *entry; 8498 size_t num_events = 0; 8499 8500 for (; num_events < max_events; ++num_events) { 8501 entry = TAILQ_FIRST(&desc->pending_media_events); 8502 if (entry == NULL) { 8503 break; 8504 } 8505 8506 events[num_events] = entry->event; 8507 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 8508 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 8509 } 8510 8511 return num_events; 8512 } 8513 8514 int 8515 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 8516 size_t num_events) 8517 { 8518 struct spdk_bdev_desc *desc; 8519 struct media_event_entry *entry; 8520 size_t event_id; 8521 int rc = 0; 8522 8523 assert(bdev->media_events); 8524 8525 spdk_spin_lock(&bdev->internal.spinlock); 8526 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 8527 if (desc->write) { 8528 break; 8529 } 8530 } 8531 8532 if (desc == NULL || desc->media_events_buffer == NULL) { 8533 rc = -ENODEV; 8534 goto out; 8535 } 8536 8537 for (event_id = 0; event_id < num_events; ++event_id) { 8538 entry = TAILQ_FIRST(&desc->free_media_events); 8539 if (entry == NULL) { 8540 break; 8541 } 8542 8543 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 8544 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 8545 entry->event = events[event_id]; 8546 } 8547 8548 rc = event_id; 8549 out: 8550 spdk_spin_unlock(&bdev->internal.spinlock); 8551 return rc; 8552 } 8553 8554 static void 8555 _media_management_notify(void *arg) 8556 { 8557 struct spdk_bdev_desc *desc = arg; 8558 8559 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 8560 } 8561 8562 void 8563 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 8564 { 8565 struct spdk_bdev_desc *desc; 8566 8567 spdk_spin_lock(&bdev->internal.spinlock); 8568 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 8569 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 8570 event_notify(desc, _media_management_notify); 8571 } 8572 } 8573 spdk_spin_unlock(&bdev->internal.spinlock); 8574 } 8575 8576 struct locked_lba_range_ctx { 8577 struct lba_range range; 8578 struct spdk_bdev *bdev; 8579 struct lba_range *current_range; 8580 struct lba_range *owner_range; 8581 struct spdk_poller *poller; 8582 lock_range_cb cb_fn; 8583 void *cb_arg; 8584 }; 8585 8586 static void 8587 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8588 { 8589 struct locked_lba_range_ctx *ctx = _ctx; 8590 8591 ctx->cb_fn(ctx->cb_arg, -ENOMEM); 8592 free(ctx); 8593 } 8594 8595 static void bdev_unlock_lba_range_get_channel(struct 
spdk_bdev_channel_iter *i, 8596 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 8597 8598 static void 8599 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8600 { 8601 struct locked_lba_range_ctx *ctx = _ctx; 8602 8603 if (status == -ENOMEM) { 8604 /* One of the channels could not allocate a range object. 8605 * So we have to go back and clean up any ranges that were 8606 * allocated successfully before we return error status to 8607 * the caller. We can reuse the unlock function to do that 8608 * clean up. 8609 */ 8610 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 8611 bdev_lock_error_cleanup_cb); 8612 return; 8613 } 8614 8615 /* All channels have locked this range and no I/O overlapping the range 8616 * are outstanding! Set the owner_ch for the range object for the 8617 * locking channel, so that this channel will know that it is allowed 8618 * to write to this range. 8619 */ 8620 ctx->owner_range->owner_ch = ctx->range.owner_ch; 8621 ctx->cb_fn(ctx->cb_arg, status); 8622 8623 /* Don't free the ctx here. Its range is in the bdev's global list of 8624 * locked ranges still, and will be removed and freed when this range 8625 * is later unlocked. 8626 */ 8627 } 8628 8629 static int 8630 bdev_lock_lba_range_check_io(void *_i) 8631 { 8632 struct spdk_bdev_channel_iter *i = _i; 8633 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 8634 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8635 struct locked_lba_range_ctx *ctx = i->ctx; 8636 struct lba_range *range = ctx->current_range; 8637 struct spdk_bdev_io *bdev_io; 8638 8639 spdk_poller_unregister(&ctx->poller); 8640 8641 /* The range is now in the locked_ranges, so no new IO can be submitted to this 8642 * range. But we need to wait until any outstanding IO overlapping with this range 8643 * are completed. 8644 */ 8645 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 8646 if (bdev_io_range_is_locked(bdev_io, range)) { 8647 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 8648 return SPDK_POLLER_BUSY; 8649 } 8650 } 8651 8652 spdk_bdev_for_each_channel_continue(i, 0); 8653 return SPDK_POLLER_BUSY; 8654 } 8655 8656 static void 8657 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8658 struct spdk_io_channel *_ch, void *_ctx) 8659 { 8660 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8661 struct locked_lba_range_ctx *ctx = _ctx; 8662 struct lba_range *range; 8663 8664 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 8665 if (range->length == ctx->range.length && 8666 range->offset == ctx->range.offset && 8667 range->locked_ctx == ctx->range.locked_ctx) { 8668 /* This range already exists on this channel, so don't add 8669 * it again. This can happen when a new channel is created 8670 * while the for_each_channel operation is in progress. 8671 * Do not check for outstanding I/O in that case, since the 8672 * range was locked before any I/O could be submitted to the 8673 * new channel. 
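 * (Channels created after the lock was taken copy the bdev's locked ranges at
 * channel creation time, so they are covered as well.)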
8674 */ 8675 spdk_bdev_for_each_channel_continue(i, 0); 8676 return; 8677 } 8678 } 8679 8680 range = calloc(1, sizeof(*range)); 8681 if (range == NULL) { 8682 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 8683 return; 8684 } 8685 8686 range->length = ctx->range.length; 8687 range->offset = ctx->range.offset; 8688 range->locked_ctx = ctx->range.locked_ctx; 8689 ctx->current_range = range; 8690 if (ctx->range.owner_ch == ch) { 8691 /* This is the range object for the channel that will hold 8692 * the lock. Store it in the ctx object so that we can easily 8693 * set its owner_ch after the lock is finally acquired. 8694 */ 8695 ctx->owner_range = range; 8696 } 8697 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 8698 bdev_lock_lba_range_check_io(i); 8699 } 8700 8701 static void 8702 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 8703 { 8704 assert(spdk_get_thread() == spdk_io_channel_get_thread(ctx->range.owner_ch->channel)); 8705 8706 /* We will add a copy of this range to each channel now. */ 8707 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 8708 bdev_lock_lba_range_cb); 8709 } 8710 8711 static bool 8712 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 8713 { 8714 struct lba_range *r; 8715 8716 TAILQ_FOREACH(r, tailq, tailq) { 8717 if (bdev_lba_range_overlapped(range, r)) { 8718 return true; 8719 } 8720 } 8721 return false; 8722 } 8723 8724 static int 8725 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 8726 uint64_t offset, uint64_t length, 8727 lock_range_cb cb_fn, void *cb_arg) 8728 { 8729 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8730 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8731 struct locked_lba_range_ctx *ctx; 8732 8733 if (cb_arg == NULL) { 8734 SPDK_ERRLOG("cb_arg must not be NULL\n"); 8735 return -EINVAL; 8736 } 8737 8738 ctx = calloc(1, sizeof(*ctx)); 8739 if (ctx == NULL) { 8740 return -ENOMEM; 8741 } 8742 8743 ctx->range.offset = offset; 8744 ctx->range.length = length; 8745 ctx->range.owner_ch = ch; 8746 ctx->range.locked_ctx = cb_arg; 8747 ctx->bdev = bdev; 8748 ctx->cb_fn = cb_fn; 8749 ctx->cb_arg = cb_arg; 8750 8751 spdk_spin_lock(&bdev->internal.spinlock); 8752 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 8753 /* There is an active lock overlapping with this range. 8754 * Put it on the pending list until this range no 8755 * longer overlaps with another. 8756 */ 8757 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 8758 } else { 8759 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 8760 bdev_lock_lba_range_ctx(bdev, ctx); 8761 } 8762 spdk_spin_unlock(&bdev->internal.spinlock); 8763 return 0; 8764 } 8765 8766 static void 8767 bdev_lock_lba_range_ctx_msg(void *_ctx) 8768 { 8769 struct locked_lba_range_ctx *ctx = _ctx; 8770 8771 bdev_lock_lba_range_ctx(ctx->bdev, ctx); 8772 } 8773 8774 static void 8775 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8776 { 8777 struct locked_lba_range_ctx *ctx = _ctx; 8778 struct locked_lba_range_ctx *pending_ctx; 8779 struct lba_range *range, *tmp; 8780 8781 spdk_spin_lock(&bdev->internal.spinlock); 8782 /* Check if there are any pending locked ranges that overlap with this range 8783 * that was just unlocked. If there are, check that it doesn't overlap with any 8784 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 8785 * the lock process. 
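 * Each pending range that becomes eligible is moved to locked_ranges and its lock is
 * started on the owning channel's thread.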
8786 */ 8787 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 8788 if (bdev_lba_range_overlapped(range, &ctx->range) && 8789 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 8790 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 8791 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 8792 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 8793 spdk_thread_send_msg(spdk_io_channel_get_thread(pending_ctx->range.owner_ch->channel), 8794 bdev_lock_lba_range_ctx_msg, pending_ctx); 8795 } 8796 } 8797 spdk_spin_unlock(&bdev->internal.spinlock); 8798 8799 ctx->cb_fn(ctx->cb_arg, status); 8800 free(ctx); 8801 } 8802 8803 static void 8804 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8805 struct spdk_io_channel *_ch, void *_ctx) 8806 { 8807 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8808 struct locked_lba_range_ctx *ctx = _ctx; 8809 TAILQ_HEAD(, spdk_bdev_io) io_locked; 8810 struct spdk_bdev_io *bdev_io; 8811 struct lba_range *range; 8812 8813 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 8814 if (ctx->range.offset == range->offset && 8815 ctx->range.length == range->length && 8816 ctx->range.locked_ctx == range->locked_ctx) { 8817 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 8818 free(range); 8819 break; 8820 } 8821 } 8822 8823 /* Note: we should almost always be able to assert that the range specified 8824 * was found. But there are some very rare corner cases where a new channel 8825 * gets created simultaneously with a range unlock, where this function 8826 * would execute on that new channel and wouldn't have the range. 8827 * We also use this to clean up range allocations when a later allocation 8828 * fails in the locking path. 8829 * So we can't actually assert() here. 8830 */ 8831 8832 /* Swap the locked IO into a temporary list, and then try to submit them again. 8833 * We could hyper-optimize this to only resubmit locked I/O that overlap 8834 * with the range that was just unlocked, but this isn't a performance path so 8835 * we go for simplicity here. 8836 */ 8837 TAILQ_INIT(&io_locked); 8838 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 8839 while (!TAILQ_EMPTY(&io_locked)) { 8840 bdev_io = TAILQ_FIRST(&io_locked); 8841 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 8842 bdev_io_submit(bdev_io); 8843 } 8844 8845 spdk_bdev_for_each_channel_continue(i, 0); 8846 } 8847 8848 static int 8849 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 8850 uint64_t offset, uint64_t length, 8851 lock_range_cb cb_fn, void *cb_arg) 8852 { 8853 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8854 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8855 struct locked_lba_range_ctx *ctx; 8856 struct lba_range *range; 8857 bool range_found = false; 8858 8859 /* Let's make sure the specified channel actually has a lock on 8860 * the specified range. Note that the range must match exactly. 8861 */ 8862 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 8863 if (range->offset == offset && range->length == length && 8864 range->owner_ch == ch && range->locked_ctx == cb_arg) { 8865 range_found = true; 8866 break; 8867 } 8868 } 8869 8870 if (!range_found) { 8871 return -EINVAL; 8872 } 8873 8874 spdk_spin_lock(&bdev->internal.spinlock); 8875 /* We confirmed that this channel has locked the specified range. 
To 8876 * start the unlock the process, we find the range in the bdev's locked_ranges 8877 * and remove it. This ensures new channels don't inherit the locked range. 8878 * Then we will send a message to each channel (including the one specified 8879 * here) to remove the range from its per-channel list. 8880 */ 8881 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 8882 if (range->offset == offset && range->length == length && 8883 range->locked_ctx == cb_arg) { 8884 break; 8885 } 8886 } 8887 if (range == NULL) { 8888 assert(false); 8889 spdk_spin_unlock(&bdev->internal.spinlock); 8890 return -EINVAL; 8891 } 8892 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 8893 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 8894 spdk_spin_unlock(&bdev->internal.spinlock); 8895 8896 ctx->cb_fn = cb_fn; 8897 ctx->cb_arg = cb_arg; 8898 8899 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 8900 bdev_unlock_lba_range_cb); 8901 return 0; 8902 } 8903 8904 int 8905 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 8906 int array_size) 8907 { 8908 if (!bdev) { 8909 return -EINVAL; 8910 } 8911 8912 if (bdev->fn_table->get_memory_domains) { 8913 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 8914 } 8915 8916 return 0; 8917 } 8918 8919 struct spdk_bdev_for_each_io_ctx { 8920 void *ctx; 8921 spdk_bdev_io_fn fn; 8922 spdk_bdev_for_each_io_cb cb; 8923 }; 8924 8925 static void 8926 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8927 struct spdk_io_channel *io_ch, void *_ctx) 8928 { 8929 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 8930 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 8931 struct spdk_bdev_io *bdev_io; 8932 int rc = 0; 8933 8934 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 8935 rc = ctx->fn(ctx->ctx, bdev_io); 8936 if (rc != 0) { 8937 break; 8938 } 8939 } 8940 8941 spdk_bdev_for_each_channel_continue(i, rc); 8942 } 8943 8944 static void 8945 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 8946 { 8947 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 8948 8949 ctx->cb(ctx->ctx, status); 8950 8951 free(ctx); 8952 } 8953 8954 void 8955 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 8956 spdk_bdev_for_each_io_cb cb) 8957 { 8958 struct spdk_bdev_for_each_io_ctx *ctx; 8959 8960 assert(fn != NULL && cb != NULL); 8961 8962 ctx = calloc(1, sizeof(*ctx)); 8963 if (ctx == NULL) { 8964 SPDK_ERRLOG("Failed to allocate context.\n"); 8965 cb(_ctx, -ENOMEM); 8966 return; 8967 } 8968 8969 ctx->ctx = _ctx; 8970 ctx->fn = fn; 8971 ctx->cb = cb; 8972 8973 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 8974 bdev_for_each_io_done); 8975 } 8976 8977 void 8978 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 8979 { 8980 spdk_for_each_channel_continue(iter->i, status); 8981 } 8982 8983 static struct spdk_bdev * 8984 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 8985 { 8986 void *io_device = spdk_io_channel_iter_get_io_device(i); 8987 8988 return __bdev_from_io_dev(io_device); 8989 } 8990 8991 static void 8992 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 8993 { 8994 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 8995 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 8996 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 8997 8998 iter->i = i; 8999 
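	/* Save the underlying io_channel iterator so the callback can resume the iteration
	 * via spdk_bdev_for_each_channel_continue(). */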
iter->fn(iter, bdev, ch, iter->ctx); 9000 } 9001 9002 static void 9003 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 9004 { 9005 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 9006 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 9007 9008 iter->i = i; 9009 iter->cpl(bdev, iter->ctx, status); 9010 9011 free(iter); 9012 } 9013 9014 void 9015 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 9016 void *ctx, spdk_bdev_for_each_channel_done cpl) 9017 { 9018 struct spdk_bdev_channel_iter *iter; 9019 9020 assert(bdev != NULL && fn != NULL && ctx != NULL); 9021 9022 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 9023 if (iter == NULL) { 9024 SPDK_ERRLOG("Unable to allocate iterator\n"); 9025 assert(false); 9026 return; 9027 } 9028 9029 iter->fn = fn; 9030 iter->cpl = cpl; 9031 iter->ctx = ctx; 9032 9033 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 9034 iter, bdev_each_channel_cpl); 9035 } 9036 9037 int 9038 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 9039 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 9040 spdk_bdev_io_completion_cb cb, void *cb_arg) 9041 { 9042 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9043 struct spdk_bdev_io *bdev_io; 9044 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 9045 9046 if (!desc->write) { 9047 return -EBADF; 9048 } 9049 9050 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY))) { 9051 SPDK_DEBUGLOG(bdev, "Copy IO type is not supported\n"); 9052 return -ENOTSUP; 9053 } 9054 9055 if (num_blocks == 0) { 9056 SPDK_ERRLOG("Can't copy 0 blocks\n"); 9057 return -EINVAL; 9058 } 9059 9060 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 9061 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 9062 SPDK_DEBUGLOG(bdev, 9063 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 9064 dst_offset_blocks, src_offset_blocks, num_blocks); 9065 return -EINVAL; 9066 } 9067 9068 bdev_io = bdev_channel_get_io(channel); 9069 if (!bdev_io) { 9070 return -ENOMEM; 9071 } 9072 9073 bdev_io->internal.ch = channel; 9074 bdev_io->internal.desc = desc; 9075 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 9076 9077 bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 9078 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 9079 bdev_io->u.bdev.num_blocks = num_blocks; 9080 bdev_io->u.bdev.ext_opts = NULL; 9081 bdev_io_init(bdev_io, bdev, cb_arg, cb); 9082 9083 bdev_io_submit(bdev_io); 9084 return 0; 9085 } 9086 9087 SPDK_LOG_REGISTER_COMPONENT(bdev) 9088 9089 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 9090 { 9091 struct spdk_trace_tpoint_opts opts[] = { 9092 { 9093 "BDEV_IO_START", TRACE_BDEV_IO_START, 9094 OWNER_BDEV, OBJECT_BDEV_IO, 1, 9095 { 9096 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 9097 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 9098 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 9099 { "len", SPDK_TRACE_ARG_TYPE_INT, 8 }, 9100 { "name", SPDK_TRACE_ARG_TYPE_STR, 40} 9101 } 9102 }, 9103 { 9104 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 9105 OWNER_BDEV, OBJECT_BDEV_IO, 0, 9106 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 9107 }, 9108 { 9109 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 9110 OWNER_BDEV, OBJECT_NONE, 1, 9111 { 9112 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 9113 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 9114 } 9115 }, 9116 { 9117 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 9118 OWNER_BDEV, OBJECT_NONE, 0, 9119 { 
9120 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 9121 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 9122 } 9123 }, 9124 }; 9125 9126 9127 spdk_trace_register_owner(OWNER_BDEV, 'b'); 9128 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 9129 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 9130 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 9131 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 9132 } 9133