1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #include "spdk/bdev.h" 10 11 #include "spdk/accel.h" 12 #include "spdk/config.h" 13 #include "spdk/env.h" 14 #include "spdk/thread.h" 15 #include "spdk/likely.h" 16 #include "spdk/queue.h" 17 #include "spdk/nvme_spec.h" 18 #include "spdk/scsi_spec.h" 19 #include "spdk/notify.h" 20 #include "spdk/util.h" 21 #include "spdk/trace.h" 22 #include "spdk/dma.h" 23 24 #include "spdk/bdev_module.h" 25 #include "spdk/log.h" 26 #include "spdk/string.h" 27 28 #include "bdev_internal.h" 29 #include "spdk_internal/trace_defs.h" 30 #include "spdk_internal/assert.h" 31 32 #ifdef SPDK_CONFIG_VTUNE 33 #include "ittnotify.h" 34 #include "ittnotify_types.h" 35 int __itt_init_ittlib(const char *, __itt_group_id); 36 #endif 37 38 #define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1) 39 #define SPDK_BDEV_IO_CACHE_SIZE 256 40 #define SPDK_BDEV_AUTO_EXAMINE true 41 #define BUF_SMALL_POOL_SIZE 8191 42 #define BUF_LARGE_POOL_SIZE 1023 43 #define BUF_SMALL_CACHE_SIZE 128 44 #define BUF_LARGE_CACHE_SIZE 16 45 #define NOMEM_THRESHOLD_COUNT 8 46 47 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 48 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 49 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512 50 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000 51 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024) 52 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX 53 #define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000 54 55 /* The maximum number of children requests for a UNMAP or WRITE ZEROES command 56 * when splitting into children requests at a time. 57 */ 58 #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8) 59 #define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000 60 61 /* The maximum number of children requests for a COPY command 62 * when splitting into children requests at a time. 
63 */ 64 #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8) 65 66 #define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \ 67 log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev) 68 #ifdef DEBUG 69 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \ 70 log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev) 71 #else 72 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0) 73 #endif 74 75 static void log_already_claimed(enum spdk_log_level level, const int line, const char *func, 76 const char *detail, struct spdk_bdev *bdev); 77 78 SPDK_LOG_DEPRECATION_REGISTER(vtune_support, "Intel(R) VTune integration", "SPDK 23.05", 0); 79 80 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 81 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 82 }; 83 84 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 85 86 RB_HEAD(bdev_name_tree, spdk_bdev_name); 87 88 static int 89 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2) 90 { 91 return strcmp(name1->name, name2->name); 92 } 93 94 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp); 95 96 struct spdk_bdev_mgr { 97 struct spdk_mempool *bdev_io_pool; 98 99 void *zero_buffer; 100 101 TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 102 103 struct spdk_bdev_list bdevs; 104 struct bdev_name_tree bdev_names; 105 106 bool init_complete; 107 bool module_init_complete; 108 109 struct spdk_spinlock spinlock; 110 111 #ifdef SPDK_CONFIG_VTUNE 112 __itt_domain *domain; 113 #endif 114 }; 115 116 static struct spdk_bdev_mgr g_bdev_mgr = { 117 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 118 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 119 .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names), 120 .init_complete = false, 121 .module_init_complete = false, 122 }; 123 124 static void 125 __attribute__((constructor)) 126 _bdev_init(void) 127 { 128 spdk_spin_init(&g_bdev_mgr.spinlock); 129 } 130 131 typedef void (*lock_range_cb)(void *ctx, int status); 132 133 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc); 134 135 struct lba_range { 136 uint64_t offset; 137 uint64_t length; 138 void *locked_ctx; 139 struct spdk_bdev_channel *owner_ch; 140 TAILQ_ENTRY(lba_range) tailq; 141 }; 142 143 static struct spdk_bdev_opts g_bdev_opts = { 144 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 145 .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, 146 .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, 147 .small_buf_pool_size = BUF_SMALL_POOL_SIZE, 148 .large_buf_pool_size = BUF_LARGE_POOL_SIZE, 149 }; 150 151 static spdk_bdev_init_cb g_init_cb_fn = NULL; 152 static void *g_init_cb_arg = NULL; 153 154 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 155 static void *g_fini_cb_arg = NULL; 156 static struct spdk_thread *g_fini_thread = NULL; 157 158 struct spdk_bdev_qos_limit { 159 /** IOs or bytes allowed per second (i.e., 1s). */ 160 uint64_t limit; 161 162 /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). 163 * For remaining bytes, allowed to run negative if an I/O is submitted when 164 * some bytes are remaining, but the I/O is bigger than that amount. The 165 * excess will be deducted from the next timeslice. 166 */ 167 int64_t remaining_this_timeslice; 168 169 /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 170 uint32_t min_per_timeslice; 171 172 /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 173 uint32_t max_per_timeslice; 174 175 /** Function to check whether to queue the IO. 
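	 * Implementations are expected to return true when the I/O has to be queued because
	 * this limit's budget for the current timeslice does not allow it to run now.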
*/ 176 bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 177 178 /** Function to update for the submitted IO. */ 179 void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 180 }; 181 182 struct spdk_bdev_qos { 183 /** Types of structure of rate limits. */ 184 struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 185 186 /** The channel that all I/O are funneled through. */ 187 struct spdk_bdev_channel *ch; 188 189 /** The thread on which the poller is running. */ 190 struct spdk_thread *thread; 191 192 /** Queue of I/O waiting to be issued. */ 193 bdev_io_tailq_t queued; 194 195 /** Size of a timeslice in tsc ticks. */ 196 uint64_t timeslice_size; 197 198 /** Timestamp of start of last timeslice. */ 199 uint64_t last_timeslice; 200 201 /** Poller that processes queued I/O commands each time slice. */ 202 struct spdk_poller *poller; 203 }; 204 205 struct spdk_bdev_mgmt_channel { 206 /* 207 * Each thread keeps a cache of bdev_io - this allows 208 * bdev threads which are *not* DPDK threads to still 209 * benefit from a per-thread bdev_io cache. Without 210 * this, non-DPDK threads fetching from the mempool 211 * incur a cmpxchg on get and put. 212 */ 213 bdev_io_stailq_t per_thread_cache; 214 uint32_t per_thread_cache_count; 215 uint32_t bdev_io_cache_size; 216 217 struct spdk_iobuf_channel iobuf; 218 219 TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; 220 TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; 221 }; 222 223 /* 224 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device 225 * will queue here their IO that awaits retry. It makes it possible to retry sending 226 * IO to one bdev after IO from other bdev completes. 227 */ 228 struct spdk_bdev_shared_resource { 229 /* The bdev management channel */ 230 struct spdk_bdev_mgmt_channel *mgmt_ch; 231 232 /* 233 * Count of I/O submitted to bdev module and waiting for completion. 234 * Incremented before submit_request() is called on an spdk_bdev_io. 235 */ 236 uint64_t io_outstanding; 237 238 /* 239 * Queue of IO awaiting retry because of a previous NOMEM status returned 240 * on this channel. 241 */ 242 bdev_io_tailq_t nomem_io; 243 244 /* 245 * Threshold which io_outstanding must drop to before retrying nomem_io. 246 */ 247 uint64_t nomem_threshold; 248 249 /* I/O channel allocated by a bdev module */ 250 struct spdk_io_channel *shared_ch; 251 252 /* Refcount of bdev channels using this resource */ 253 uint32_t ref; 254 255 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 256 }; 257 258 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 259 #define BDEV_CH_QOS_ENABLED (1 << 1) 260 261 struct spdk_bdev_channel { 262 struct spdk_bdev *bdev; 263 264 /* The channel for the underlying device */ 265 struct spdk_io_channel *channel; 266 267 /* Accel channel */ 268 struct spdk_io_channel *accel_channel; 269 270 /* Per io_device per thread data */ 271 struct spdk_bdev_shared_resource *shared_resource; 272 273 struct spdk_bdev_io_stat *stat; 274 275 /* 276 * Count of I/O submitted to the underlying dev module through this channel 277 * and waiting for completion. 278 */ 279 uint64_t io_outstanding; 280 281 /* 282 * List of all submitted I/Os including I/O that are generated via splitting. 283 */ 284 bdev_io_tailq_t io_submitted; 285 286 /* 287 * List of spdk_bdev_io that are currently queued because they write to a locked 288 * LBA range. 
289 */ 290 bdev_io_tailq_t io_locked; 291 292 /* List of I/Os with accel sequence being currently executed */ 293 bdev_io_tailq_t io_accel_exec; 294 295 /* List of I/Os doing memory domain pull/push */ 296 bdev_io_tailq_t io_memory_domain; 297 298 uint32_t flags; 299 300 struct spdk_histogram_data *histogram; 301 302 #ifdef SPDK_CONFIG_VTUNE 303 uint64_t start_tsc; 304 uint64_t interval_tsc; 305 __itt_string_handle *handle; 306 struct spdk_bdev_io_stat *prev_stat; 307 #endif 308 309 bdev_io_tailq_t queued_resets; 310 311 lba_range_tailq_t locked_ranges; 312 }; 313 314 struct media_event_entry { 315 struct spdk_bdev_media_event event; 316 TAILQ_ENTRY(media_event_entry) tailq; 317 }; 318 319 #define MEDIA_EVENT_POOL_SIZE 64 320 321 struct spdk_bdev_desc { 322 struct spdk_bdev *bdev; 323 struct spdk_thread *thread; 324 struct { 325 spdk_bdev_event_cb_t event_fn; 326 void *ctx; 327 } callback; 328 bool closed; 329 bool write; 330 bool memory_domains_supported; 331 bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES]; 332 struct spdk_spinlock spinlock; 333 uint32_t refs; 334 TAILQ_HEAD(, media_event_entry) pending_media_events; 335 TAILQ_HEAD(, media_event_entry) free_media_events; 336 struct media_event_entry *media_events_buffer; 337 TAILQ_ENTRY(spdk_bdev_desc) link; 338 339 uint64_t timeout_in_sec; 340 spdk_bdev_io_timeout_cb cb_fn; 341 void *cb_arg; 342 struct spdk_poller *io_timeout_poller; 343 struct spdk_bdev_module_claim *claim; 344 }; 345 346 struct spdk_bdev_iostat_ctx { 347 struct spdk_bdev_io_stat *stat; 348 spdk_bdev_get_device_stat_cb cb; 349 void *cb_arg; 350 }; 351 352 struct set_qos_limit_ctx { 353 void (*cb_fn)(void *cb_arg, int status); 354 void *cb_arg; 355 struct spdk_bdev *bdev; 356 }; 357 358 struct spdk_bdev_channel_iter { 359 spdk_bdev_for_each_channel_msg fn; 360 spdk_bdev_for_each_channel_done cpl; 361 struct spdk_io_channel_iter *i; 362 void *ctx; 363 }; 364 365 struct spdk_bdev_io_error_stat { 366 uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS]; 367 }; 368 369 enum bdev_io_retry_state { 370 BDEV_IO_RETRY_STATE_INVALID, 371 BDEV_IO_RETRY_STATE_PULL, 372 BDEV_IO_RETRY_STATE_PULL_MD, 373 BDEV_IO_RETRY_STATE_SUBMIT, 374 }; 375 376 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 377 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 378 #define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch)) 379 #define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch)) 380 381 static inline void bdev_io_complete(void *ctx); 382 static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io); 383 384 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 385 static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io); 386 387 static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 388 struct spdk_io_channel *ch, void *_ctx); 389 static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status); 390 391 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 392 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 393 uint64_t num_blocks, 394 struct spdk_memory_domain *domain, void *domain_ctx, 395 struct spdk_accel_sequence *seq, 396 spdk_bdev_io_completion_cb cb, void *cb_arg); 397 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 398 struct iovec *iov, int 
iovcnt, void *md_buf,
				       uint64_t offset_blocks, uint64_t num_blocks,
				       struct spdk_memory_domain *domain, void *domain_ctx,
				       struct spdk_accel_sequence *seq,
				       spdk_bdev_io_completion_cb cb, void *cb_arg);

static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
			       uint64_t offset, uint64_t length,
			       lock_range_cb cb_fn, void *cb_arg);

static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
				 uint64_t offset, uint64_t length,
				 lock_range_cb cb_fn, void *cb_arg);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);

static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
static void claim_reset(struct spdk_bdev *bdev);

static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch);

#define bdev_get_ext_io_opt(opts, field, defval) \
	(((opts) != NULL && offsetof(struct spdk_bdev_ext_io_opts, field) + \
	  sizeof((opts)->field) <= sizeof(*(opts))) ? (opts)->field : (defval))

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(small_buf_pool_size);
	SET_FIELD(large_buf_pool_size);

	/* Do not remove this statement. Always update it when you add a new field,
	 * and do not forget to add the corresponding SET_FIELD statement for that field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}

SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_small_buf_pool_size, "spdk_bdev_opts.small_buf_pool_size",
			      "v23.05", 0);
SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_large_buf_pool_size, "spdk_bdev_opts.large_buf_pool_size",
			      "v23.05", 0);
int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	struct spdk_iobuf_opts iobuf_opts;
	uint32_t min_pool_size;
	int rc;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
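	 * For example, with the default bdev_io_cache_size of 256 and 4 SPDK threads, the pool
	 * must hold at least 256 * (4 + 1) = 1280 bdev_io, so a smaller bdev_io_pool_size is
	 * rejected below.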
483 */ 484 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 485 if (opts->bdev_io_pool_size < min_pool_size) { 486 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 487 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 488 spdk_thread_get_count()); 489 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 490 return -1; 491 } 492 493 if (opts->small_buf_pool_size != BUF_SMALL_POOL_SIZE) { 494 SPDK_LOG_DEPRECATED(bdev_opts_small_buf_pool_size); 495 } 496 if (opts->large_buf_pool_size != BUF_LARGE_POOL_SIZE) { 497 SPDK_LOG_DEPRECATED(bdev_opts_large_buf_pool_size); 498 } 499 500 #define SET_FIELD(field) \ 501 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 502 g_bdev_opts.field = opts->field; \ 503 } \ 504 505 SET_FIELD(bdev_io_pool_size); 506 SET_FIELD(bdev_io_cache_size); 507 SET_FIELD(bdev_auto_examine); 508 SET_FIELD(small_buf_pool_size); 509 SET_FIELD(large_buf_pool_size); 510 511 spdk_iobuf_get_opts(&iobuf_opts); 512 iobuf_opts.small_pool_count = opts->small_buf_pool_size; 513 iobuf_opts.large_pool_count = opts->large_buf_pool_size; 514 515 rc = spdk_iobuf_set_opts(&iobuf_opts); 516 if (rc != 0) { 517 SPDK_ERRLOG("Failed to set iobuf opts\n"); 518 return -1; 519 } 520 521 g_bdev_opts.opts_size = opts->opts_size; 522 523 #undef SET_FIELD 524 525 return 0; 526 } 527 528 static struct spdk_bdev * 529 bdev_get_by_name(const char *bdev_name) 530 { 531 struct spdk_bdev_name find; 532 struct spdk_bdev_name *res; 533 534 find.name = (char *)bdev_name; 535 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 536 if (res != NULL) { 537 return res->bdev; 538 } 539 540 return NULL; 541 } 542 543 struct spdk_bdev * 544 spdk_bdev_get_by_name(const char *bdev_name) 545 { 546 struct spdk_bdev *bdev; 547 548 spdk_spin_lock(&g_bdev_mgr.spinlock); 549 bdev = bdev_get_by_name(bdev_name); 550 spdk_spin_unlock(&g_bdev_mgr.spinlock); 551 552 return bdev; 553 } 554 555 struct bdev_io_status_string { 556 enum spdk_bdev_io_status status; 557 const char *str; 558 }; 559 560 static const struct bdev_io_status_string bdev_io_status_strings[] = { 561 { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" }, 562 { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" }, 563 { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" }, 564 { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" }, 565 { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" }, 566 { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" }, 567 { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" }, 568 { SPDK_BDEV_IO_STATUS_FAILED, "failed" }, 569 { SPDK_BDEV_IO_STATUS_PENDING, "pending" }, 570 { SPDK_BDEV_IO_STATUS_SUCCESS, "success" }, 571 }; 572 573 static const char * 574 bdev_io_status_get_string(enum spdk_bdev_io_status status) 575 { 576 uint32_t i; 577 578 for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) { 579 if (bdev_io_status_strings[i].status == status) { 580 return bdev_io_status_strings[i].str; 581 } 582 } 583 584 return "reserved"; 585 } 586 587 struct spdk_bdev_wait_for_examine_ctx { 588 struct spdk_poller *poller; 589 spdk_bdev_wait_for_examine_cb cb_fn; 590 void *cb_arg; 591 }; 592 593 static bool bdev_module_all_actions_completed(void); 594 595 static int 596 bdev_wait_for_examine_cb(void *arg) 597 { 598 struct spdk_bdev_wait_for_examine_ctx *ctx = arg; 599 600 if (!bdev_module_all_actions_completed()) { 601 return SPDK_POLLER_IDLE; 602 } 603 604 spdk_poller_unregister(&ctx->poller); 605 
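	/*
	 * At this point every module's init/examine actions have completed, so the caller's
	 * callback registered via spdk_bdev_wait_for_examine() is invoked. Illustrative usage
	 * sketch (assumed caller code, not part of this file):
	 *
	 *     static void
	 *     examine_done(void *ctx)
	 *     {
	 *             SPDK_NOTICELOG("all bdev examine callbacks have finished\n");
	 *     }
	 *
	 *     rc = spdk_bdev_wait_for_examine(examine_done, NULL);
	 *
	 * The only failure mode is -ENOMEM from the context allocation below.
	 */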
ctx->cb_fn(ctx->cb_arg); 606 free(ctx); 607 608 return SPDK_POLLER_BUSY; 609 } 610 611 int 612 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 613 { 614 struct spdk_bdev_wait_for_examine_ctx *ctx; 615 616 ctx = calloc(1, sizeof(*ctx)); 617 if (ctx == NULL) { 618 return -ENOMEM; 619 } 620 ctx->cb_fn = cb_fn; 621 ctx->cb_arg = cb_arg; 622 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 623 624 return 0; 625 } 626 627 struct spdk_bdev_examine_item { 628 char *name; 629 TAILQ_ENTRY(spdk_bdev_examine_item) link; 630 }; 631 632 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); 633 634 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 635 g_bdev_examine_allowlist); 636 637 static inline bool 638 bdev_examine_allowlist_check(const char *name) 639 { 640 struct spdk_bdev_examine_item *item; 641 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 642 if (strcmp(name, item->name) == 0) { 643 return true; 644 } 645 } 646 return false; 647 } 648 649 static inline void 650 bdev_examine_allowlist_free(void) 651 { 652 struct spdk_bdev_examine_item *item; 653 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 654 item = TAILQ_FIRST(&g_bdev_examine_allowlist); 655 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 656 free(item->name); 657 free(item); 658 } 659 } 660 661 static inline bool 662 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 663 { 664 struct spdk_bdev_alias *tmp; 665 if (bdev_examine_allowlist_check(bdev->name)) { 666 return true; 667 } 668 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 669 if (bdev_examine_allowlist_check(tmp->alias.name)) { 670 return true; 671 } 672 } 673 return false; 674 } 675 676 static inline bool 677 bdev_ok_to_examine(struct spdk_bdev *bdev) 678 { 679 if (g_bdev_opts.bdev_auto_examine) { 680 return true; 681 } else { 682 return bdev_in_examine_allowlist(bdev); 683 } 684 } 685 686 static void 687 bdev_examine(struct spdk_bdev *bdev) 688 { 689 struct spdk_bdev_module *module; 690 struct spdk_bdev_module_claim *claim, *tmpclaim; 691 uint32_t action; 692 693 if (!bdev_ok_to_examine(bdev)) { 694 return; 695 } 696 697 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 698 if (module->examine_config) { 699 spdk_spin_lock(&module->internal.spinlock); 700 action = module->internal.action_in_progress; 701 module->internal.action_in_progress++; 702 spdk_spin_unlock(&module->internal.spinlock); 703 module->examine_config(bdev); 704 if (action != module->internal.action_in_progress) { 705 SPDK_ERRLOG("examine_config for module %s did not call " 706 "spdk_bdev_module_examine_done()\n", module->name); 707 } 708 } 709 } 710 711 spdk_spin_lock(&bdev->internal.spinlock); 712 713 switch (bdev->internal.claim_type) { 714 case SPDK_BDEV_CLAIM_NONE: 715 /* Examine by all bdev modules */ 716 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 717 if (module->examine_disk) { 718 spdk_spin_lock(&module->internal.spinlock); 719 module->internal.action_in_progress++; 720 spdk_spin_unlock(&module->internal.spinlock); 721 spdk_spin_unlock(&bdev->internal.spinlock); 722 module->examine_disk(bdev); 723 spdk_spin_lock(&bdev->internal.spinlock); 724 } 725 } 726 break; 727 case SPDK_BDEV_CLAIM_EXCL_WRITE: 728 /* Examine by the one bdev module with a v1 claim */ 729 module = bdev->internal.claim.v1.module; 730 if (module->examine_disk) { 731 spdk_spin_lock(&module->internal.spinlock); 732 module->internal.action_in_progress++; 733 
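			/*
			 * The action_in_progress increment above must be balanced by the module
			 * calling spdk_bdev_module_examine_done(), which decrements the counter in
			 * bdev_module_action_done(). Subsystem initialization (and
			 * spdk_bdev_wait_for_examine()) cannot finish while any module still has an
			 * action in progress; see bdev_module_all_actions_completed().
			 */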
spdk_spin_unlock(&module->internal.spinlock); 734 spdk_spin_unlock(&bdev->internal.spinlock); 735 module->examine_disk(bdev); 736 return; 737 } 738 break; 739 default: 740 /* Examine by all bdev modules with a v2 claim */ 741 assert(claim_type_is_v2(bdev->internal.claim_type)); 742 /* 743 * Removal of tailq nodes while iterating can cause the iteration to jump out of the 744 * list, perhaps accessing freed memory. Without protection, this could happen 745 * while the lock is dropped during the examine callback. 746 */ 747 bdev->internal.examine_in_progress++; 748 749 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 750 module = claim->module; 751 752 if (module == NULL) { 753 /* This is a vestigial claim, held by examine_count */ 754 continue; 755 } 756 757 if (module->examine_disk == NULL) { 758 continue; 759 } 760 761 spdk_spin_lock(&module->internal.spinlock); 762 module->internal.action_in_progress++; 763 spdk_spin_unlock(&module->internal.spinlock); 764 765 /* Call examine_disk without holding internal.spinlock. */ 766 spdk_spin_unlock(&bdev->internal.spinlock); 767 module->examine_disk(bdev); 768 spdk_spin_lock(&bdev->internal.spinlock); 769 } 770 771 assert(bdev->internal.examine_in_progress > 0); 772 bdev->internal.examine_in_progress--; 773 if (bdev->internal.examine_in_progress == 0) { 774 /* Remove any claims that were released during examine_disk */ 775 TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) { 776 if (claim->desc != NULL) { 777 continue; 778 } 779 780 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link); 781 free(claim); 782 } 783 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 784 claim_reset(bdev); 785 } 786 } 787 } 788 789 spdk_spin_unlock(&bdev->internal.spinlock); 790 } 791 792 int 793 spdk_bdev_examine(const char *name) 794 { 795 struct spdk_bdev *bdev; 796 struct spdk_bdev_examine_item *item; 797 struct spdk_thread *thread = spdk_get_thread(); 798 799 if (spdk_unlikely(spdk_thread_get_app_thread() != thread)) { 800 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread, 801 thread ? 
spdk_thread_get_name(thread) : "null"); 802 return -EINVAL; 803 } 804 805 if (g_bdev_opts.bdev_auto_examine) { 806 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); 807 return -EINVAL; 808 } 809 810 if (bdev_examine_allowlist_check(name)) { 811 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 812 return -EEXIST; 813 } 814 815 item = calloc(1, sizeof(*item)); 816 if (!item) { 817 return -ENOMEM; 818 } 819 item->name = strdup(name); 820 if (!item->name) { 821 free(item); 822 return -ENOMEM; 823 } 824 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 825 826 bdev = spdk_bdev_get_by_name(name); 827 if (bdev) { 828 bdev_examine(bdev); 829 } 830 return 0; 831 } 832 833 static inline void 834 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 835 { 836 struct spdk_bdev_examine_item *item; 837 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 838 spdk_json_write_object_begin(w); 839 spdk_json_write_named_string(w, "method", "bdev_examine"); 840 spdk_json_write_named_object_begin(w, "params"); 841 spdk_json_write_named_string(w, "name", item->name); 842 spdk_json_write_object_end(w); 843 spdk_json_write_object_end(w); 844 } 845 } 846 847 struct spdk_bdev * 848 spdk_bdev_first(void) 849 { 850 struct spdk_bdev *bdev; 851 852 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 853 if (bdev) { 854 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 855 } 856 857 return bdev; 858 } 859 860 struct spdk_bdev * 861 spdk_bdev_next(struct spdk_bdev *prev) 862 { 863 struct spdk_bdev *bdev; 864 865 bdev = TAILQ_NEXT(prev, internal.link); 866 if (bdev) { 867 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 868 } 869 870 return bdev; 871 } 872 873 static struct spdk_bdev * 874 _bdev_next_leaf(struct spdk_bdev *bdev) 875 { 876 while (bdev != NULL) { 877 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 878 return bdev; 879 } else { 880 bdev = TAILQ_NEXT(bdev, internal.link); 881 } 882 } 883 884 return bdev; 885 } 886 887 struct spdk_bdev * 888 spdk_bdev_first_leaf(void) 889 { 890 struct spdk_bdev *bdev; 891 892 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 893 894 if (bdev) { 895 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 896 } 897 898 return bdev; 899 } 900 901 struct spdk_bdev * 902 spdk_bdev_next_leaf(struct spdk_bdev *prev) 903 { 904 struct spdk_bdev *bdev; 905 906 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 907 908 if (bdev) { 909 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 910 } 911 912 return bdev; 913 } 914 915 static inline bool 916 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 917 { 918 return bdev_io->internal.memory_domain; 919 } 920 921 static inline bool 922 bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io) 923 { 924 return bdev_io->internal.accel_sequence; 925 } 926 927 static inline void 928 bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource, 929 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 930 { 931 /* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io. 932 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth 933 * channels we will instead wait for half to complete. 
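	 * For example, with 100 I/O outstanding the threshold becomes spdk_max(50, 100 - 8) = 92,
	 * i.e. roughly NOMEM_THRESHOLD_COUNT completions must happen before a retry; with 10
	 * outstanding it becomes spdk_max(5, 2) = 5, i.e. half of the queue depth.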
934 */ 935 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 936 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 937 938 assert(state != BDEV_IO_RETRY_STATE_INVALID); 939 bdev_io->internal.retry_state = state; 940 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 941 } 942 943 static inline void 944 bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource, 945 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 946 { 947 /* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while 948 * the queue isn't empty, so we don't need to update the nomem_threshold here */ 949 assert(!TAILQ_EMPTY(&shared_resource->nomem_io)); 950 951 assert(state != BDEV_IO_RETRY_STATE_INVALID); 952 bdev_io->internal.retry_state = state; 953 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 954 } 955 956 void 957 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 958 { 959 struct iovec *iovs; 960 961 if (bdev_io->u.bdev.iovs == NULL) { 962 bdev_io->u.bdev.iovs = &bdev_io->iov; 963 bdev_io->u.bdev.iovcnt = 1; 964 } 965 966 iovs = bdev_io->u.bdev.iovs; 967 968 assert(iovs != NULL); 969 assert(bdev_io->u.bdev.iovcnt >= 1); 970 971 iovs[0].iov_base = buf; 972 iovs[0].iov_len = len; 973 } 974 975 void 976 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 977 { 978 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 979 bdev_io->u.bdev.md_buf = md_buf; 980 } 981 982 static bool 983 _is_buf_allocated(const struct iovec *iovs) 984 { 985 if (iovs == NULL) { 986 return false; 987 } 988 989 return iovs[0].iov_base != NULL; 990 } 991 992 static bool 993 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 994 { 995 int i; 996 uintptr_t iov_base; 997 998 if (spdk_likely(alignment == 1)) { 999 return true; 1000 } 1001 1002 for (i = 0; i < iovcnt; i++) { 1003 iov_base = (uintptr_t)iovs[i].iov_base; 1004 if ((iov_base & (alignment - 1)) != 0) { 1005 return false; 1006 } 1007 } 1008 1009 return true; 1010 } 1011 1012 static inline bool 1013 bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 1014 { 1015 if (!bdev_io_use_accel_sequence(bdev_io)) { 1016 return false; 1017 } 1018 1019 /* For now, we don't allow splitting IOs with an accel sequence and will treat them as if 1020 * bdev module didn't support accel sequences */ 1021 return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.split; 1022 } 1023 1024 static inline void 1025 bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch, 1026 struct spdk_bdev_shared_resource *shared_resource) 1027 { 1028 bdev_ch->io_outstanding++; 1029 shared_resource->io_outstanding++; 1030 } 1031 1032 static inline void 1033 bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1034 struct spdk_bdev_shared_resource *shared_resource) 1035 { 1036 assert(bdev_ch->io_outstanding > 0); 1037 assert(shared_resource->io_outstanding > 0); 1038 bdev_ch->io_outstanding--; 1039 shared_resource->io_outstanding--; 1040 } 1041 1042 static void 1043 bdev_io_submit_sequence_cb(void *ctx, int status) 1044 { 1045 struct spdk_bdev_io *bdev_io = ctx; 1046 1047 bdev_io->u.bdev.accel_sequence = NULL; 1048 bdev_io->internal.accel_sequence = NULL; 1049 1050 if (spdk_unlikely(status != 0)) { 1051 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 1052 bdev_io->internal.status = 
SPDK_BDEV_IO_STATUS_FAILED; 1053 bdev_io_complete_unsubmitted(bdev_io); 1054 return; 1055 } 1056 1057 bdev_io_submit(bdev_io); 1058 } 1059 1060 static void 1061 bdev_io_exec_sequence_cb(void *ctx, int status) 1062 { 1063 struct spdk_bdev_io *bdev_io = ctx; 1064 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1065 1066 TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1067 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1068 1069 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1070 bdev_ch_retry_io(ch); 1071 } 1072 1073 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1074 } 1075 1076 static void 1077 bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status)) 1078 { 1079 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1080 1081 assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1082 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1083 1084 /* Since the operations are appended during submission, they're in the opposite order than 1085 * how we want to execute them for reads (i.e. we need to execute the most recently added 1086 * operation first), so reverse the sequence before executing it. 1087 */ 1088 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1089 spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence); 1090 } 1091 1092 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1093 bdev_io_increment_outstanding(ch, ch->shared_resource); 1094 bdev_io->internal.data_transfer_cpl = cb_fn; 1095 1096 spdk_accel_sequence_finish(bdev_io->internal.accel_sequence, 1097 bdev_io_exec_sequence_cb, bdev_io); 1098 } 1099 1100 static void 1101 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 1102 { 1103 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 1104 void *buf; 1105 1106 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1107 buf = bdev_io->internal.buf; 1108 bdev_io->internal.buf = NULL; 1109 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 1110 bdev_io->internal.get_aux_buf_cb = NULL; 1111 } else { 1112 assert(bdev_io->internal.get_buf_cb != NULL); 1113 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 1114 bdev_io->internal.get_buf_cb = NULL; 1115 } 1116 } 1117 1118 static void 1119 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 1120 { 1121 struct spdk_bdev_io *bdev_io = ctx; 1122 1123 if (rc) { 1124 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 1125 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1126 } 1127 bdev_io_get_buf_complete(bdev_io, !rc); 1128 } 1129 1130 static void 1131 bdev_io_pull_md_buf_done(void *ctx, int status) 1132 { 1133 struct spdk_bdev_io *bdev_io = ctx; 1134 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1135 1136 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1137 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1138 1139 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1140 bdev_ch_retry_io(ch); 1141 } 1142 1143 assert(bdev_io->internal.data_transfer_cpl); 1144 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1145 } 1146 1147 static void 1148 bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io) 1149 { 1150 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1151 int rc = 0; 1152 1153 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1154 if (bdev_io_use_memory_domain(bdev_io)) { 1155 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1156 
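			/*
			 * For writes, the metadata is pulled from the caller's memory domain into the
			 * local bounce_md_iov before the request is handed to the bdev module. The pull
			 * below is asynchronous; on -ENOMEM the I/O is parked on the nomem_io queue in
			 * BDEV_IO_RETRY_STATE_PULL_MD and retried later from bdev_ch_retry_io().
			 */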
bdev_io_increment_outstanding(ch, ch->shared_resource); 1157 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1158 bdev_io->internal.memory_domain_ctx, 1159 &bdev_io->internal.orig_md_iov, 1, 1160 &bdev_io->internal.bounce_md_iov, 1, 1161 bdev_io_pull_md_buf_done, bdev_io); 1162 if (rc == 0) { 1163 /* Continue to submit IO in completion callback */ 1164 return; 1165 } 1166 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1167 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1168 if (rc != -ENOMEM) { 1169 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 1170 spdk_memory_domain_get_dma_device_id( 1171 bdev_io->internal.memory_domain), rc); 1172 } 1173 } else { 1174 memcpy(bdev_io->internal.bounce_md_iov.iov_base, 1175 bdev_io->internal.orig_md_iov.iov_base, 1176 bdev_io->internal.orig_md_iov.iov_len); 1177 } 1178 } 1179 1180 if (spdk_unlikely(rc == -ENOMEM)) { 1181 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD); 1182 } else { 1183 assert(bdev_io->internal.data_transfer_cpl); 1184 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1185 } 1186 } 1187 1188 static void 1189 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 1190 { 1191 /* save original md_buf */ 1192 bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 1193 bdev_io->internal.orig_md_iov.iov_len = len; 1194 bdev_io->internal.bounce_md_iov.iov_base = md_buf; 1195 bdev_io->internal.bounce_md_iov.iov_len = len; 1196 /* set bounce md_buf */ 1197 bdev_io->u.bdev.md_buf = md_buf; 1198 1199 bdev_io_pull_md_buf(bdev_io); 1200 } 1201 1202 static void 1203 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 1204 { 1205 struct spdk_bdev *bdev = bdev_io->bdev; 1206 uint64_t md_len; 1207 void *buf; 1208 1209 if (spdk_bdev_is_md_separate(bdev)) { 1210 assert(!bdev_io_use_accel_sequence(bdev_io)); 1211 1212 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 1213 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 1214 1215 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 1216 1217 if (bdev_io->u.bdev.md_buf != NULL) { 1218 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 1219 return; 1220 } else { 1221 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 1222 } 1223 } 1224 1225 bdev_io_get_buf_complete(bdev_io, true); 1226 } 1227 1228 static inline void 1229 bdev_io_pull_bounce_data_buf_done(void *ctx, int rc) 1230 { 1231 struct spdk_bdev_io *bdev_io = ctx; 1232 1233 if (rc) { 1234 SPDK_ERRLOG("Failed to get data buffer\n"); 1235 assert(bdev_io->internal.data_transfer_cpl); 1236 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1237 return; 1238 } 1239 1240 _bdev_io_set_md_buf(bdev_io); 1241 } 1242 1243 static void 1244 _bdev_io_pull_bounce_data_buf_done(void *ctx, int status) 1245 { 1246 struct spdk_bdev_io *bdev_io = ctx; 1247 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1248 1249 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1250 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1251 1252 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1253 bdev_ch_retry_io(ch); 1254 } 1255 1256 bdev_io_pull_bounce_data_buf_done(ctx, status); 1257 } 1258 1259 static void 1260 bdev_io_pull_data(struct spdk_bdev_io *bdev_io) 1261 { 1262 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1263 int rc = 0; 1264 1265 /* If we need to exec an accel sequence, append a copy operation making accel change the 1266 * src/dst buffers of the 
previous operation */ 1267 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 1268 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1269 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1270 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1271 NULL, NULL, 1272 bdev_io->internal.orig_iovs, 1273 bdev_io->internal.orig_iovcnt, 1274 bdev_io->internal.memory_domain, 1275 bdev_io->internal.memory_domain_ctx, 1276 0, NULL, NULL); 1277 } else { 1278 /* We need to reverse the src/dst for reads */ 1279 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1280 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1281 bdev_io->internal.orig_iovs, 1282 bdev_io->internal.orig_iovcnt, 1283 bdev_io->internal.memory_domain, 1284 bdev_io->internal.memory_domain_ctx, 1285 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1286 NULL, NULL, 0, NULL, NULL); 1287 } 1288 1289 if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) { 1290 SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n", 1291 bdev_io->internal.accel_sequence); 1292 } 1293 } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1294 /* if this is write path, copy data from original buffer to bounce buffer */ 1295 if (bdev_io_use_memory_domain(bdev_io)) { 1296 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1297 bdev_io_increment_outstanding(ch, ch->shared_resource); 1298 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1299 bdev_io->internal.memory_domain_ctx, 1300 bdev_io->internal.orig_iovs, 1301 (uint32_t) bdev_io->internal.orig_iovcnt, 1302 bdev_io->u.bdev.iovs, 1, 1303 _bdev_io_pull_bounce_data_buf_done, 1304 bdev_io); 1305 if (rc == 0) { 1306 /* Continue to submit IO in completion callback */ 1307 return; 1308 } 1309 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1310 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1311 if (rc != -ENOMEM) { 1312 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 1313 spdk_memory_domain_get_dma_device_id( 1314 bdev_io->internal.memory_domain)); 1315 } 1316 } else { 1317 assert(bdev_io->u.bdev.iovcnt == 1); 1318 spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base, 1319 bdev_io->u.bdev.iovs[0].iov_len, 1320 bdev_io->internal.orig_iovs, 1321 bdev_io->internal.orig_iovcnt); 1322 } 1323 } 1324 1325 if (spdk_unlikely(rc == -ENOMEM)) { 1326 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1327 } else { 1328 bdev_io_pull_bounce_data_buf_done(bdev_io, rc); 1329 } 1330 } 1331 1332 static void 1333 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 1334 bdev_copy_bounce_buffer_cpl cpl_cb) 1335 { 1336 struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource; 1337 1338 bdev_io->internal.data_transfer_cpl = cpl_cb; 1339 /* save original iovec */ 1340 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 1341 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 1342 /* set bounce iov */ 1343 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 1344 bdev_io->u.bdev.iovcnt = 1; 1345 /* set bounce buffer for this operation */ 1346 bdev_io->u.bdev.iovs[0].iov_base = buf; 1347 bdev_io->u.bdev.iovs[0].iov_len = len; 1348 1349 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1350 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1351 } else { 1352 bdev_io_pull_data(bdev_io); 1353 } 1354 } 1355 1356 static void 1357 _bdev_io_set_buf(struct spdk_bdev_io 
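/*
 * (This function rounds the iobuf pointer up to the bdev's required alignment:
 *  aligned_buf = (buf + (alignment - 1)) & ~(alignment - 1). For example, with buf = 0x1001
 *  and a 512-byte alignment requirement the aligned address is 0x1200, which is why
 *  bdev_io_get_max_buf_len() reserves alignment - 1 extra bytes.)
 */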
*bdev_io, void *buf, uint64_t len) 1358 { 1359 struct spdk_bdev *bdev = bdev_io->bdev; 1360 bool buf_allocated; 1361 uint64_t alignment; 1362 void *aligned_buf; 1363 1364 bdev_io->internal.buf = buf; 1365 1366 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1367 bdev_io_get_buf_complete(bdev_io, true); 1368 return; 1369 } 1370 1371 alignment = spdk_bdev_get_buf_align(bdev); 1372 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 1373 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 1374 1375 if (buf_allocated) { 1376 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 1377 /* Continue in completion callback */ 1378 return; 1379 } else { 1380 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 1381 } 1382 1383 _bdev_io_set_md_buf(bdev_io); 1384 } 1385 1386 static inline uint64_t 1387 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len) 1388 { 1389 struct spdk_bdev *bdev = bdev_io->bdev; 1390 uint64_t md_len, alignment; 1391 1392 md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1393 1394 /* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */ 1395 alignment = spdk_bdev_get_buf_align(bdev) - 1; 1396 1397 return len + alignment + md_len; 1398 } 1399 1400 static void 1401 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 1402 { 1403 struct spdk_bdev_mgmt_channel *ch; 1404 1405 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1406 spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len)); 1407 } 1408 1409 static void 1410 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 1411 { 1412 assert(bdev_io->internal.buf != NULL); 1413 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len); 1414 bdev_io->internal.buf = NULL; 1415 } 1416 1417 void 1418 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1419 { 1420 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1421 1422 assert(buf != NULL); 1423 _bdev_io_put_buf(bdev_io, buf, len); 1424 } 1425 1426 static inline void 1427 bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch, 1428 struct spdk_bdev_io *bdev_io) 1429 { 1430 /* After a request is submitted to a bdev module, the ownership of an accel sequence 1431 * associated with that bdev_io is transferred to the bdev module. So, clear the internal 1432 * sequence pointer to make sure we won't touch it anymore. 
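	 * The one exception is a NOMEM completion: in that case ownership returns to the bdev
	 * layer and internal.accel_sequence is restored in _bdev_io_handle_no_mem().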
*/ 1433 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || 1434 bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) { 1435 assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1436 bdev_io->internal.accel_sequence = NULL; 1437 } 1438 1439 bdev->fn_table->submit_request(ioch, bdev_io); 1440 } 1441 1442 static inline void 1443 bdev_ch_resubmit_io(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 1444 { 1445 struct spdk_bdev *bdev = bdev_ch->bdev; 1446 1447 bdev_io_increment_outstanding(bdev_io->internal.ch, bdev_ch->shared_resource); 1448 bdev_io->internal.error.nvme.cdw0 = 0; 1449 bdev_io->num_retries++; 1450 bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 1451 } 1452 1453 static void 1454 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 1455 { 1456 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1457 struct spdk_bdev_io *bdev_io; 1458 1459 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 1460 /* 1461 * Allow some more I/O to complete before retrying the nomem_io queue. 1462 * Some drivers (such as nvme) cannot immediately take a new I/O in 1463 * the context of a completion, because the resources for the I/O are 1464 * not released until control returns to the bdev poller. Also, we 1465 * may require several small I/O to complete before a larger I/O 1466 * (that requires splitting) can be submitted. 1467 */ 1468 return; 1469 } 1470 1471 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1472 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 1473 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 1474 1475 switch (bdev_io->internal.retry_state) { 1476 case BDEV_IO_RETRY_STATE_SUBMIT: 1477 bdev_ch_resubmit_io(bdev_ch, bdev_io); 1478 break; 1479 case BDEV_IO_RETRY_STATE_PULL: 1480 bdev_io_pull_data(bdev_io); 1481 break; 1482 case BDEV_IO_RETRY_STATE_PULL_MD: 1483 bdev_io_pull_md_buf(bdev_io); 1484 break; 1485 default: 1486 assert(0 && "invalid retry state"); 1487 break; 1488 } 1489 1490 if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) { 1491 /* This IO completed again with NOMEM status, so break the loop and 1492 * don't try anymore. Note that a bdev_io that fails with NOMEM 1493 * always gets requeued at the front of the list, to maintain 1494 * ordering. 1495 */ 1496 break; 1497 } 1498 } 1499 } 1500 1501 static inline bool 1502 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 1503 { 1504 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1505 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1506 1507 if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) { 1508 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1509 bdev_queue_nomem_io_head(shared_resource, bdev_io, state); 1510 1511 /* If bdev module completed an I/O that has an accel sequence with NOMEM status, the 1512 * ownership of that sequence is transferred back to the bdev layer, so we need to 1513 * restore internal.accel_sequence to make sure that the sequence is handled 1514 * correctly in case the I/O is later aborted. 
*/ 1515 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 1516 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) { 1517 assert(bdev_io->internal.accel_sequence == NULL); 1518 bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence; 1519 } 1520 1521 return true; 1522 } 1523 1524 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1525 bdev_ch_retry_io(bdev_ch); 1526 } 1527 1528 return false; 1529 } 1530 1531 static void 1532 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1533 { 1534 struct spdk_bdev_io *bdev_io = ctx; 1535 1536 if (rc) { 1537 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1538 } 1539 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1540 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). 1541 */ 1542 bdev_io_put_buf(bdev_io); 1543 1544 /* Continue with IO completion flow */ 1545 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_INVALID))) { 1546 return; 1547 } 1548 1549 bdev_io_complete(bdev_io); 1550 } 1551 1552 static void 1553 _bdev_io_push_bounce_md_buf_done(void *ctx, int rc) 1554 { 1555 struct spdk_bdev_io *bdev_io = ctx; 1556 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1557 1558 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1559 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1560 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1561 } 1562 1563 static inline void 1564 bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io) 1565 { 1566 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1567 int rc = 0; 1568 1569 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1570 /* do the same for metadata buffer */ 1571 if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) { 1572 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1573 1574 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1575 if (bdev_io_use_memory_domain(bdev_io)) { 1576 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1577 bdev_io_increment_outstanding(ch, ch->shared_resource); 1578 /* If memory domain is used then we need to call async push function */ 1579 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1580 bdev_io->internal.memory_domain_ctx, 1581 &bdev_io->internal.orig_md_iov, 1582 (uint32_t)bdev_io->internal.orig_iovcnt, 1583 &bdev_io->internal.bounce_md_iov, 1, 1584 _bdev_io_push_bounce_md_buf_done, 1585 bdev_io); 1586 if (rc == 0) { 1587 /* Continue IO completion in async callback */ 1588 return; 1589 } 1590 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1591 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1592 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1593 spdk_memory_domain_get_dma_device_id(bdev_io->internal.memory_domain)); 1594 } else { 1595 memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1596 bdev_io->internal.orig_md_iov.iov_len); 1597 } 1598 } 1599 } 1600 1601 assert(bdev_io->internal.data_transfer_cpl); 1602 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1603 } 1604 1605 static void 1606 _bdev_io_push_bounce_data_buffer_done(void *ctx, int rc) 1607 { 1608 struct spdk_bdev_io *bdev_io = ctx; 1609 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1610 1611 assert(bdev_io->internal.data_transfer_cpl); 1612 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1613 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1614 1615 if (rc) { 1616 bdev_io->internal.data_transfer_cpl(bdev_io, 
rc); 1617 return; 1618 } 1619 1620 /* set original buffer for this io */ 1621 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 1622 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 1623 /* disable bouncing buffer for this io */ 1624 bdev_io->internal.orig_iovcnt = 0; 1625 bdev_io->internal.orig_iovs = NULL; 1626 1627 bdev_io_push_bounce_md_buf(bdev_io); 1628 } 1629 1630 static inline void 1631 bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io) 1632 { 1633 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1634 int rc = 0; 1635 1636 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1637 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1638 bdev_io_increment_outstanding(ch, ch->shared_resource); 1639 1640 /* if this is read path, copy data from bounce buffer to original buffer */ 1641 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1642 if (bdev_io_use_memory_domain(bdev_io)) { 1643 /* If memory domain is used then we need to call async push function */ 1644 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1645 bdev_io->internal.memory_domain_ctx, 1646 bdev_io->internal.orig_iovs, 1647 (uint32_t)bdev_io->internal.orig_iovcnt, 1648 &bdev_io->internal.bounce_iov, 1, 1649 _bdev_io_push_bounce_data_buffer_done, 1650 bdev_io); 1651 if (rc == 0) { 1652 /* Continue IO completion in async callback */ 1653 return; 1654 } 1655 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1656 spdk_memory_domain_get_dma_device_id(bdev_io->internal.memory_domain)); 1657 } else { 1658 spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs, 1659 bdev_io->internal.orig_iovcnt, 1660 bdev_io->internal.bounce_iov.iov_base, 1661 bdev_io->internal.bounce_iov.iov_len); 1662 } 1663 } 1664 1665 _bdev_io_push_bounce_data_buffer_done(bdev_io, rc); 1666 } 1667 1668 static inline void 1669 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1670 { 1671 bdev_io->internal.data_transfer_cpl = cpl_cb; 1672 bdev_io_push_bounce_data(bdev_io); 1673 } 1674 1675 static void 1676 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1677 { 1678 struct spdk_bdev_io *bdev_io; 1679 1680 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1681 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len); 1682 } 1683 1684 static void 1685 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1686 { 1687 struct spdk_bdev_mgmt_channel *mgmt_ch; 1688 uint64_t max_len; 1689 void *buf; 1690 1691 assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1692 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1693 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1694 1695 if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) { 1696 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1697 bdev_io_get_buf_complete(bdev_io, false); 1698 return; 1699 } 1700 1701 bdev_io->internal.buf_len = len; 1702 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1703 bdev_io_get_iobuf_cb); 1704 if (buf != NULL) { 1705 _bdev_io_set_buf(bdev_io, buf, len); 1706 } 1707 } 1708 1709 void 1710 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1711 { 1712 struct spdk_bdev *bdev = bdev_io->bdev; 1713 uint64_t alignment; 1714 1715 assert(cb != NULL); 1716 bdev_io->internal.get_buf_cb = cb; 1717 1718 alignment = spdk_bdev_get_buf_align(bdev); 1719 1720 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1721 
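	/*
	 * Illustrative usage sketch (assumed bdev module code, not part of this file): a module's
	 * read path typically requests a data buffer from the bdev layer like this:
	 *
	 *     static void
	 *     my_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
	 *     {
	 *             if (!success) {
	 *                     spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
	 *                     return;
	 *             }
	 *             ... bdev_io->u.bdev.iovs[0] now points at an aligned buffer of the requested length ...
	 *     }
	 *
	 *     spdk_bdev_io_get_buf(bdev_io, my_read_get_buf_cb,
	 *                          bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
	 */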
_are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1722 /* Buffer already present and aligned */ 1723 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1724 return; 1725 } 1726 1727 bdev_io_get_buf(bdev_io, len); 1728 } 1729 1730 static void 1731 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1732 bool success) 1733 { 1734 if (!success) { 1735 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1736 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1737 bdev_io_complete_unsubmitted(bdev_io); 1738 return; 1739 } 1740 1741 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 1742 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1743 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 1744 return; 1745 } 1746 /* For reads we'll execute the sequence after the data is read, so, for now, only 1747 * clear out accel_sequence pointer and submit the IO */ 1748 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1749 bdev_io->u.bdev.accel_sequence = NULL; 1750 } 1751 1752 bdev_io_submit(bdev_io); 1753 } 1754 1755 static void 1756 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1757 uint64_t len) 1758 { 1759 assert(cb != NULL); 1760 bdev_io->internal.get_buf_cb = cb; 1761 1762 bdev_io_get_buf(bdev_io, len); 1763 } 1764 1765 void 1766 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1767 { 1768 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1769 1770 assert(cb != NULL); 1771 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1772 bdev_io->internal.get_aux_buf_cb = cb; 1773 bdev_io_get_buf(bdev_io, len); 1774 } 1775 1776 static int 1777 bdev_module_get_max_ctx_size(void) 1778 { 1779 struct spdk_bdev_module *bdev_module; 1780 int max_bdev_module_size = 0; 1781 1782 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1783 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1784 max_bdev_module_size = bdev_module->get_ctx_size(); 1785 } 1786 } 1787 1788 return max_bdev_module_size; 1789 } 1790 1791 static void 1792 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1793 { 1794 int i; 1795 struct spdk_bdev_qos *qos = bdev->internal.qos; 1796 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1797 1798 if (!qos) { 1799 return; 1800 } 1801 1802 spdk_bdev_get_qos_rate_limits(bdev, limits); 1803 1804 spdk_json_write_object_begin(w); 1805 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1806 1807 spdk_json_write_named_object_begin(w, "params"); 1808 spdk_json_write_named_string(w, "name", bdev->name); 1809 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1810 if (limits[i] > 0) { 1811 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1812 } 1813 } 1814 spdk_json_write_object_end(w); 1815 1816 spdk_json_write_object_end(w); 1817 } 1818 1819 void 1820 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1821 { 1822 struct spdk_bdev_module *bdev_module; 1823 struct spdk_bdev *bdev; 1824 1825 assert(w != NULL); 1826 1827 spdk_json_write_array_begin(w); 1828 1829 spdk_json_write_object_begin(w); 1830 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1831 spdk_json_write_named_object_begin(w, "params"); 1832 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1833 spdk_json_write_named_uint32(w, "bdev_io_cache_size", 
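	/*
	 * An assumed example of the entry emitted by this block when the default options are in
	 * effect (illustrative only):
	 *
	 *     { "method": "bdev_set_options",
	 *       "params": { "bdev_io_pool_size": 65535, "bdev_io_cache_size": 256,
	 *                   "bdev_auto_examine": true } }
	 */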
g_bdev_opts.bdev_io_cache_size); 1834 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1835 spdk_json_write_object_end(w); 1836 spdk_json_write_object_end(w); 1837 1838 bdev_examine_allowlist_config_json(w); 1839 1840 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1841 if (bdev_module->config_json) { 1842 bdev_module->config_json(w); 1843 } 1844 } 1845 1846 spdk_spin_lock(&g_bdev_mgr.spinlock); 1847 1848 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1849 if (bdev->fn_table->write_config_json) { 1850 bdev->fn_table->write_config_json(bdev, w); 1851 } 1852 1853 bdev_qos_config_json(bdev, w); 1854 } 1855 1856 spdk_spin_unlock(&g_bdev_mgr.spinlock); 1857 1858 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1859 spdk_json_write_object_begin(w); 1860 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1861 spdk_json_write_object_end(w); 1862 1863 spdk_json_write_array_end(w); 1864 } 1865 1866 static void 1867 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1868 { 1869 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1870 struct spdk_bdev_io *bdev_io; 1871 1872 spdk_iobuf_channel_fini(&ch->iobuf); 1873 1874 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1875 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1876 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1877 ch->per_thread_cache_count--; 1878 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1879 } 1880 1881 assert(ch->per_thread_cache_count == 0); 1882 } 1883 1884 static int 1885 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1886 { 1887 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1888 struct spdk_bdev_io *bdev_io; 1889 uint32_t i; 1890 int rc; 1891 1892 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", BUF_SMALL_CACHE_SIZE, BUF_LARGE_CACHE_SIZE); 1893 if (rc != 0) { 1894 SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 1895 return -1; 1896 } 1897 1898 STAILQ_INIT(&ch->per_thread_cache); 1899 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 1900 1901 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 1902 ch->per_thread_cache_count = 0; 1903 for (i = 0; i < ch->bdev_io_cache_size; i++) { 1904 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1905 if (bdev_io == NULL) { 1906 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 1907 assert(false); 1908 bdev_mgmt_channel_destroy(io_device, ctx_buf); 1909 return -1; 1910 } 1911 ch->per_thread_cache_count++; 1912 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1913 } 1914 1915 TAILQ_INIT(&ch->shared_resources); 1916 TAILQ_INIT(&ch->io_wait_queue); 1917 1918 return 0; 1919 } 1920 1921 static void 1922 bdev_init_complete(int rc) 1923 { 1924 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 1925 void *cb_arg = g_init_cb_arg; 1926 struct spdk_bdev_module *m; 1927 1928 g_bdev_mgr.init_complete = true; 1929 g_init_cb_fn = NULL; 1930 g_init_cb_arg = NULL; 1931 1932 /* 1933 * For modules that need to know when subsystem init is complete, 1934 * inform them now. 
1935 */ 1936 if (rc == 0) { 1937 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1938 if (m->init_complete) { 1939 m->init_complete(); 1940 } 1941 } 1942 } 1943 1944 cb_fn(cb_arg, rc); 1945 } 1946 1947 static bool 1948 bdev_module_all_actions_completed(void) 1949 { 1950 struct spdk_bdev_module *m; 1951 1952 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1953 if (m->internal.action_in_progress > 0) { 1954 return false; 1955 } 1956 } 1957 return true; 1958 } 1959 1960 static void 1961 bdev_module_action_complete(void) 1962 { 1963 /* 1964 * Don't finish bdev subsystem initialization if 1965 * module pre-initialization is still in progress, or 1966 * the subsystem been already initialized. 1967 */ 1968 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 1969 return; 1970 } 1971 1972 /* 1973 * Check all bdev modules for inits/examinations in progress. If any 1974 * exist, return immediately since we cannot finish bdev subsystem 1975 * initialization until all are completed. 1976 */ 1977 if (!bdev_module_all_actions_completed()) { 1978 return; 1979 } 1980 1981 /* 1982 * Modules already finished initialization - now that all 1983 * the bdev modules have finished their asynchronous I/O 1984 * processing, the entire bdev layer can be marked as complete. 1985 */ 1986 bdev_init_complete(0); 1987 } 1988 1989 static void 1990 bdev_module_action_done(struct spdk_bdev_module *module) 1991 { 1992 spdk_spin_lock(&module->internal.spinlock); 1993 assert(module->internal.action_in_progress > 0); 1994 module->internal.action_in_progress--; 1995 spdk_spin_unlock(&module->internal.spinlock); 1996 bdev_module_action_complete(); 1997 } 1998 1999 void 2000 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 2001 { 2002 assert(module->async_init); 2003 bdev_module_action_done(module); 2004 } 2005 2006 void 2007 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 2008 { 2009 bdev_module_action_done(module); 2010 } 2011 2012 /** The last initialized bdev module */ 2013 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 2014 2015 static void 2016 bdev_init_failed(void *cb_arg) 2017 { 2018 struct spdk_bdev_module *module = cb_arg; 2019 2020 spdk_spin_lock(&module->internal.spinlock); 2021 assert(module->internal.action_in_progress > 0); 2022 module->internal.action_in_progress--; 2023 spdk_spin_unlock(&module->internal.spinlock); 2024 bdev_init_complete(-1); 2025 } 2026 2027 static int 2028 bdev_modules_init(void) 2029 { 2030 struct spdk_bdev_module *module; 2031 int rc = 0; 2032 2033 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2034 g_resume_bdev_module = module; 2035 if (module->async_init) { 2036 spdk_spin_lock(&module->internal.spinlock); 2037 module->internal.action_in_progress = 1; 2038 spdk_spin_unlock(&module->internal.spinlock); 2039 } 2040 rc = module->module_init(); 2041 if (rc != 0) { 2042 /* Bump action_in_progress to prevent other modules from completion of modules_init 2043 * Send message to defer application shutdown until resources are cleaned up */ 2044 spdk_spin_lock(&module->internal.spinlock); 2045 module->internal.action_in_progress = 1; 2046 spdk_spin_unlock(&module->internal.spinlock); 2047 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 2048 return rc; 2049 } 2050 } 2051 2052 g_resume_bdev_module = NULL; 2053 return 0; 2054 } 2055 2056 void 2057 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 2058 { 2059 int rc = 0; 2060 char mempool_name[32]; 2061 2062 assert(cb_fn != 
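	/* For illustration: a minimal sketch of the caller side of this API (normally
	 * the bdev subsystem glue), assuming a completion callback with the
	 * spdk_bdev_init_cb signature. The names below are hypothetical.
	 *
	 *   static void
	 *   my_bdev_init_done(void *cb_arg, int rc)
	 *   {
	 *           if (rc != 0) {
	 *                   SPDK_ERRLOG("bdev layer failed to initialize\n");
	 *           }
	 *           (continue start-up here)
	 *   }
	 *
	 *   spdk_bdev_initialize(my_bdev_init_done, init_ctx);
	 *
	 * The callback fires from bdev_init_complete() once every registered module
	 * has finished module_init() and any asynchronous init work it started.
	 */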
NULL); 2063 2064 g_init_cb_fn = cb_fn; 2065 g_init_cb_arg = cb_arg; 2066 2067 spdk_notify_type_register("bdev_register"); 2068 spdk_notify_type_register("bdev_unregister"); 2069 2070 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2071 2072 rc = spdk_iobuf_register_module("bdev"); 2073 if (rc != 0) { 2074 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2075 bdev_init_complete(-1); 2076 return; 2077 } 2078 2079 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2080 g_bdev_opts.bdev_io_pool_size, 2081 sizeof(struct spdk_bdev_io) + 2082 bdev_module_get_max_ctx_size(), 2083 0, 2084 SPDK_ENV_SOCKET_ID_ANY); 2085 2086 if (g_bdev_mgr.bdev_io_pool == NULL) { 2087 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2088 bdev_init_complete(-1); 2089 return; 2090 } 2091 2092 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2093 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2094 if (!g_bdev_mgr.zero_buffer) { 2095 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2096 bdev_init_complete(-1); 2097 return; 2098 } 2099 2100 #ifdef SPDK_CONFIG_VTUNE 2101 SPDK_LOG_DEPRECATED(vtune_support); 2102 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2103 #endif 2104 2105 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 2106 bdev_mgmt_channel_destroy, 2107 sizeof(struct spdk_bdev_mgmt_channel), 2108 "bdev_mgr"); 2109 2110 rc = bdev_modules_init(); 2111 g_bdev_mgr.module_init_complete = true; 2112 if (rc != 0) { 2113 SPDK_ERRLOG("bdev modules init failed\n"); 2114 return; 2115 } 2116 2117 bdev_module_action_complete(); 2118 } 2119 2120 static void 2121 bdev_mgr_unregister_cb(void *io_device) 2122 { 2123 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2124 2125 if (g_bdev_mgr.bdev_io_pool) { 2126 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2127 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2128 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2129 g_bdev_opts.bdev_io_pool_size); 2130 } 2131 2132 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2133 } 2134 2135 spdk_free(g_bdev_mgr.zero_buffer); 2136 2137 bdev_examine_allowlist_free(); 2138 2139 cb_fn(g_fini_cb_arg); 2140 g_fini_cb_fn = NULL; 2141 g_fini_cb_arg = NULL; 2142 g_bdev_mgr.init_complete = false; 2143 g_bdev_mgr.module_init_complete = false; 2144 } 2145 2146 static void 2147 bdev_module_fini_iter(void *arg) 2148 { 2149 struct spdk_bdev_module *bdev_module; 2150 2151 /* FIXME: Handling initialization failures is broken now, 2152 * so we won't even try cleaning up after successfully 2153 * initialized modules. if module_init_complete is false, 2154 * just call spdk_bdev_mgr_unregister_cb 2155 */ 2156 if (!g_bdev_mgr.module_init_complete) { 2157 bdev_mgr_unregister_cb(NULL); 2158 return; 2159 } 2160 2161 /* Start iterating from the last touched module */ 2162 if (!g_resume_bdev_module) { 2163 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2164 } else { 2165 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2166 internal.tailq); 2167 } 2168 2169 while (bdev_module) { 2170 if (bdev_module->async_fini) { 2171 /* Save our place so we can resume later. We must 2172 * save the variable here, before calling module_fini() 2173 * below, because in some cases the module may immediately 2174 * call spdk_bdev_module_fini_done() and re-enter 2175 * this function to continue iterating. 
*/ 2176 g_resume_bdev_module = bdev_module; 2177 } 2178 2179 if (bdev_module->module_fini) { 2180 bdev_module->module_fini(); 2181 } 2182 2183 if (bdev_module->async_fini) { 2184 return; 2185 } 2186 2187 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2188 internal.tailq); 2189 } 2190 2191 g_resume_bdev_module = NULL; 2192 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2193 } 2194 2195 void 2196 spdk_bdev_module_fini_done(void) 2197 { 2198 if (spdk_get_thread() != g_fini_thread) { 2199 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2200 } else { 2201 bdev_module_fini_iter(NULL); 2202 } 2203 } 2204 2205 static void 2206 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2207 { 2208 struct spdk_bdev *bdev = cb_arg; 2209 2210 if (bdeverrno && bdev) { 2211 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2212 bdev->name); 2213 2214 /* 2215 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2216 * bdev; try to continue by manually removing this bdev from the list and continue 2217 * with the next bdev in the list. 2218 */ 2219 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2220 } 2221 2222 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2223 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2224 /* 2225 * Bdev module finish need to be deferred as we might be in the middle of some context 2226 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2227 * after returning. 2228 */ 2229 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2230 return; 2231 } 2232 2233 /* 2234 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 2235 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2236 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2237 * base bdevs. 2238 * 2239 * Also, walk the list in the reverse order. 2240 */ 2241 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2242 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2243 spdk_spin_lock(&bdev->internal.spinlock); 2244 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2245 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2246 spdk_spin_unlock(&bdev->internal.spinlock); 2247 continue; 2248 } 2249 spdk_spin_unlock(&bdev->internal.spinlock); 2250 2251 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2252 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2253 return; 2254 } 2255 2256 /* 2257 * If any bdev fails to unclaim underlying bdev properly, we may face the 2258 * case of bdev list consisting of claimed bdevs only (if claims are managed 2259 * correctly, this would mean there's a loop in the claims graph which is 2260 * clearly impossible). Warn and unregister last bdev on the list then. 
2261 */ 2262 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2263 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2264 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2265 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2266 return; 2267 } 2268 } 2269 2270 static void 2271 bdev_module_fini_start_iter(void *arg) 2272 { 2273 struct spdk_bdev_module *bdev_module; 2274 2275 if (!g_resume_bdev_module) { 2276 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2277 } else { 2278 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2279 } 2280 2281 while (bdev_module) { 2282 if (bdev_module->async_fini_start) { 2283 /* Save our place so we can resume later. We must 2284 * save the variable here, before calling fini_start() 2285 * below, because in some cases the module may immediately 2286 * call spdk_bdev_module_fini_start_done() and re-enter 2287 * this function to continue iterating. */ 2288 g_resume_bdev_module = bdev_module; 2289 } 2290 2291 if (bdev_module->fini_start) { 2292 bdev_module->fini_start(); 2293 } 2294 2295 if (bdev_module->async_fini_start) { 2296 return; 2297 } 2298 2299 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2300 } 2301 2302 g_resume_bdev_module = NULL; 2303 2304 bdev_finish_unregister_bdevs_iter(NULL, 0); 2305 } 2306 2307 void 2308 spdk_bdev_module_fini_start_done(void) 2309 { 2310 if (spdk_get_thread() != g_fini_thread) { 2311 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2312 } else { 2313 bdev_module_fini_start_iter(NULL); 2314 } 2315 } 2316 2317 static void 2318 bdev_finish_wait_for_examine_done(void *cb_arg) 2319 { 2320 bdev_module_fini_start_iter(NULL); 2321 } 2322 2323 void 2324 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2325 { 2326 int rc; 2327 2328 assert(cb_fn != NULL); 2329 2330 g_fini_thread = spdk_get_thread(); 2331 2332 g_fini_cb_fn = cb_fn; 2333 g_fini_cb_arg = cb_arg; 2334 2335 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2336 if (rc != 0) { 2337 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2338 bdev_finish_wait_for_examine_done(NULL); 2339 } 2340 } 2341 2342 struct spdk_bdev_io * 2343 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2344 { 2345 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2346 struct spdk_bdev_io *bdev_io; 2347 2348 if (ch->per_thread_cache_count > 0) { 2349 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2350 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2351 ch->per_thread_cache_count--; 2352 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2353 /* 2354 * Don't try to look for bdev_ios in the global pool if there are 2355 * waiters on bdev_ios - we don't want this caller to jump the line. 
2356 */ 2357 bdev_io = NULL; 2358 } else { 2359 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2360 } 2361 2362 return bdev_io; 2363 } 2364 2365 void 2366 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2367 { 2368 struct spdk_bdev_mgmt_channel *ch; 2369 2370 assert(bdev_io != NULL); 2371 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2372 2373 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2374 2375 if (bdev_io->internal.buf != NULL) { 2376 bdev_io_put_buf(bdev_io); 2377 } 2378 2379 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2380 ch->per_thread_cache_count++; 2381 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2382 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2383 struct spdk_bdev_io_wait_entry *entry; 2384 2385 entry = TAILQ_FIRST(&ch->io_wait_queue); 2386 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2387 entry->cb_fn(entry->cb_arg); 2388 } 2389 } else { 2390 /* We should never have a full cache with entries on the io wait queue. */ 2391 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2392 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2393 } 2394 } 2395 2396 static bool 2397 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2398 { 2399 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2400 2401 switch (limit) { 2402 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2403 return true; 2404 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2405 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2406 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2407 return false; 2408 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2409 default: 2410 return false; 2411 } 2412 } 2413 2414 static bool 2415 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2416 { 2417 switch (bdev_io->type) { 2418 case SPDK_BDEV_IO_TYPE_NVME_IO: 2419 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2420 case SPDK_BDEV_IO_TYPE_READ: 2421 case SPDK_BDEV_IO_TYPE_WRITE: 2422 return true; 2423 case SPDK_BDEV_IO_TYPE_ZCOPY: 2424 if (bdev_io->u.bdev.zcopy.start) { 2425 return true; 2426 } else { 2427 return false; 2428 } 2429 default: 2430 return false; 2431 } 2432 } 2433 2434 static bool 2435 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2436 { 2437 switch (bdev_io->type) { 2438 case SPDK_BDEV_IO_TYPE_NVME_IO: 2439 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2440 /* Bit 1 (0x2) set for read operation */ 2441 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2442 return true; 2443 } else { 2444 return false; 2445 } 2446 case SPDK_BDEV_IO_TYPE_READ: 2447 return true; 2448 case SPDK_BDEV_IO_TYPE_ZCOPY: 2449 /* Populate to read from disk */ 2450 if (bdev_io->u.bdev.zcopy.populate) { 2451 return true; 2452 } else { 2453 return false; 2454 } 2455 default: 2456 return false; 2457 } 2458 } 2459 2460 static uint64_t 2461 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2462 { 2463 struct spdk_bdev *bdev = bdev_io->bdev; 2464 2465 switch (bdev_io->type) { 2466 case SPDK_BDEV_IO_TYPE_NVME_IO: 2467 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2468 return bdev_io->u.nvme_passthru.nbytes; 2469 case SPDK_BDEV_IO_TYPE_READ: 2470 case SPDK_BDEV_IO_TYPE_WRITE: 2471 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2472 case SPDK_BDEV_IO_TYPE_ZCOPY: 2473 /* Track the data in the start phase only */ 2474 if (bdev_io->u.bdev.zcopy.start) { 2475 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2476 } else { 2477 return 0; 2478 } 2479 default: 2480 return 0; 2481 } 2482 } 2483 2484 static bool 2485 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 
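/* A worked example of the queueing decision implemented below, using
 * illustrative numbers: with an rw_ios_per_sec limit of 10000 and the 1 ms
 * timeslice (SPDK_BDEV_QOS_TIMESLICE_IN_USEC), the refill logic computes
 *
 *   max_per_timeslice = 10000 * 1000 / 1000000 = 10 I/Os per timeslice.
 *
 * Each admitted I/O decrements remaining_this_timeslice; once it drops to 0,
 * this predicate returns true and further I/Os stay on the QoS queue until
 * bdev_channel_poll_qos() refills the quota for the next timeslice.
 */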
2486 { 2487 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 2488 return true; 2489 } else { 2490 return false; 2491 } 2492 } 2493 2494 static bool 2495 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2496 { 2497 if (bdev_is_read_io(io) == false) { 2498 return false; 2499 } 2500 2501 return bdev_qos_rw_queue_io(limit, io); 2502 } 2503 2504 static bool 2505 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2506 { 2507 if (bdev_is_read_io(io) == true) { 2508 return false; 2509 } 2510 2511 return bdev_qos_rw_queue_io(limit, io); 2512 } 2513 2514 static void 2515 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2516 { 2517 limit->remaining_this_timeslice--; 2518 } 2519 2520 static void 2521 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2522 { 2523 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 2524 } 2525 2526 static void 2527 bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2528 { 2529 if (bdev_is_read_io(io) == false) { 2530 return; 2531 } 2532 2533 return bdev_qos_rw_bps_update_quota(limit, io); 2534 } 2535 2536 static void 2537 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2538 { 2539 if (bdev_is_read_io(io) == true) { 2540 return; 2541 } 2542 2543 return bdev_qos_rw_bps_update_quota(limit, io); 2544 } 2545 2546 static void 2547 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2548 { 2549 int i; 2550 2551 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2552 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2553 qos->rate_limits[i].queue_io = NULL; 2554 qos->rate_limits[i].update_quota = NULL; 2555 continue; 2556 } 2557 2558 switch (i) { 2559 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2560 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2561 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2562 break; 2563 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2564 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2565 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2566 break; 2567 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2568 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2569 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2570 break; 2571 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2572 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2573 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2574 break; 2575 default: 2576 break; 2577 } 2578 } 2579 } 2580 2581 static void 2582 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2583 struct spdk_bdev_io *bdev_io, 2584 enum spdk_bdev_io_status status) 2585 { 2586 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2587 2588 bdev_io->internal.in_submit_request = true; 2589 bdev_ch->io_outstanding++; 2590 shared_resource->io_outstanding++; 2591 spdk_bdev_io_complete(bdev_io, status); 2592 bdev_io->internal.in_submit_request = false; 2593 } 2594 2595 static inline void 2596 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2597 { 2598 struct spdk_bdev *bdev = bdev_io->bdev; 2599 struct spdk_io_channel *ch = bdev_ch->channel; 2600 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2601 2602 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2603 struct spdk_bdev_mgmt_channel *mgmt_channel = 
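	/* Note on the ABORT fast path below: if the I/O to be aborted is still queued
	 * locally (sitting on the nomem_io retry list or waiting for a data buffer),
	 * it never has to reach the backing module at all; the target I/O is aborted
	 * right here and the abort itself completes with SPDK_BDEV_IO_STATUS_SUCCESS.
	 */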
shared_resource->mgmt_ch; 2604 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2605 2606 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2607 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2608 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2609 SPDK_BDEV_IO_STATUS_SUCCESS); 2610 return; 2611 } 2612 } 2613 2614 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2615 bdev_io->bdev->split_on_write_unit && 2616 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2617 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2618 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2619 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2620 return; 2621 } 2622 2623 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2624 bdev_ch->io_outstanding++; 2625 shared_resource->io_outstanding++; 2626 bdev_io->internal.in_submit_request = true; 2627 bdev_submit_request(bdev, ch, bdev_io); 2628 bdev_io->internal.in_submit_request = false; 2629 } else { 2630 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT); 2631 } 2632 } 2633 2634 static bool 2635 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2636 { 2637 int i; 2638 2639 if (bdev_qos_io_to_limit(bdev_io) == true) { 2640 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2641 if (!qos->rate_limits[i].queue_io) { 2642 continue; 2643 } 2644 2645 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2646 bdev_io) == true) { 2647 return true; 2648 } 2649 } 2650 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2651 if (!qos->rate_limits[i].update_quota) { 2652 continue; 2653 } 2654 2655 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2656 } 2657 } 2658 2659 return false; 2660 } 2661 2662 static inline void 2663 _bdev_io_do_submit(void *ctx) 2664 { 2665 struct spdk_bdev_io *bdev_io = ctx; 2666 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2667 2668 bdev_io_do_submit(ch, bdev_io); 2669 } 2670 2671 static int 2672 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2673 { 2674 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2675 int submitted_ios = 0; 2676 2677 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2678 if (!bdev_qos_queue_io(qos, bdev_io)) { 2679 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2680 2681 if (bdev_io->internal.io_submit_ch) { 2682 /* Send back the IO to the original thread for the actual processing. 
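				 * QoS-limited bdevs funnel submissions through the QoS thread (see
				 * _bdev_io_submit()); io_submit_ch remembers the channel the I/O
				 * originally arrived on, so once the rate limiter releases the I/O it
				 * is handed back to that thread before being submitted to the module.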
*/ 2683 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 2684 bdev_io->internal.io_submit_ch = NULL; 2685 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 2686 _bdev_io_do_submit, bdev_io); 2687 } else { 2688 bdev_io_do_submit(ch, bdev_io); 2689 } 2690 2691 submitted_ios++; 2692 } 2693 } 2694 2695 return submitted_ios; 2696 } 2697 2698 static void 2699 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2700 { 2701 int rc; 2702 2703 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2704 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2705 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2706 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2707 &bdev_io->internal.waitq_entry); 2708 if (rc != 0) { 2709 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2710 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2711 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2712 } 2713 } 2714 2715 static bool 2716 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2717 { 2718 uint32_t io_boundary; 2719 struct spdk_bdev *bdev = bdev_io->bdev; 2720 uint32_t max_size = bdev->max_segment_size; 2721 int max_segs = bdev->max_num_segments; 2722 2723 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2724 io_boundary = bdev->write_unit_size; 2725 } else if (bdev->split_on_optimal_io_boundary) { 2726 io_boundary = bdev->optimal_io_boundary; 2727 } else { 2728 io_boundary = 0; 2729 } 2730 2731 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2732 return false; 2733 } 2734 2735 if (io_boundary) { 2736 uint64_t start_stripe, end_stripe; 2737 2738 start_stripe = bdev_io->u.bdev.offset_blocks; 2739 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2740 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
*/ 2741 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2742 start_stripe >>= spdk_u32log2(io_boundary); 2743 end_stripe >>= spdk_u32log2(io_boundary); 2744 } else { 2745 start_stripe /= io_boundary; 2746 end_stripe /= io_boundary; 2747 } 2748 2749 if (start_stripe != end_stripe) { 2750 return true; 2751 } 2752 } 2753 2754 if (max_segs) { 2755 if (bdev_io->u.bdev.iovcnt > max_segs) { 2756 return true; 2757 } 2758 } 2759 2760 if (max_size) { 2761 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2762 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2763 return true; 2764 } 2765 } 2766 } 2767 2768 return false; 2769 } 2770 2771 static bool 2772 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2773 { 2774 uint32_t num_unmap_segments; 2775 2776 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2777 return false; 2778 } 2779 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2780 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2781 return true; 2782 } 2783 2784 return false; 2785 } 2786 2787 static bool 2788 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2789 { 2790 if (!bdev_io->bdev->max_write_zeroes) { 2791 return false; 2792 } 2793 2794 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2795 return true; 2796 } 2797 2798 return false; 2799 } 2800 2801 static bool 2802 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2803 { 2804 if (bdev_io->bdev->max_copy != 0 && 2805 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2806 return true; 2807 } 2808 2809 return false; 2810 } 2811 2812 static bool 2813 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2814 { 2815 switch (bdev_io->type) { 2816 case SPDK_BDEV_IO_TYPE_READ: 2817 case SPDK_BDEV_IO_TYPE_WRITE: 2818 return bdev_rw_should_split(bdev_io); 2819 case SPDK_BDEV_IO_TYPE_UNMAP: 2820 return bdev_unmap_should_split(bdev_io); 2821 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2822 return bdev_write_zeroes_should_split(bdev_io); 2823 case SPDK_BDEV_IO_TYPE_COPY: 2824 return bdev_copy_should_split(bdev_io); 2825 default: 2826 return false; 2827 } 2828 } 2829 2830 static uint32_t 2831 _to_next_boundary(uint64_t offset, uint32_t boundary) 2832 { 2833 return (boundary - (offset % boundary)); 2834 } 2835 2836 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2837 2838 static void _bdev_rw_split(void *_bdev_io); 2839 2840 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2841 2842 static void 2843 _bdev_unmap_split(void *_bdev_io) 2844 { 2845 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2846 } 2847 2848 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2849 2850 static void 2851 _bdev_write_zeroes_split(void *_bdev_io) 2852 { 2853 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2854 } 2855 2856 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 2857 2858 static void 2859 _bdev_copy_split(void *_bdev_io) 2860 { 2861 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 2862 } 2863 2864 static int 2865 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2866 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2867 { 2868 int rc; 2869 uint64_t current_offset, current_remaining, current_src_offset; 2870 spdk_bdev_io_wait_cb io_wait_fn; 2871 2872 current_offset = *offset; 2873 current_remaining = *remaining; 2874 2875 bdev_io->u.bdev.split_outstanding++; 2876 2877 io_wait_fn = 
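	/* io_wait_fn is the resume function used if a child submission below returns
	 * -ENOMEM while no other children are outstanding; bdev_queue_io_wait_with_cb()
	 * fills in a spdk_bdev_io_wait_entry and calls spdk_bdev_queue_io_wait().
	 * External callers follow the same pattern; a hedged sketch (names are
	 * hypothetical, and the entry must stay valid until the callback fires):
	 *
	 *   entry->bdev = bdev;
	 *   entry->cb_fn = my_resubmit_fn;
	 *   entry->cb_arg = my_ctx;
	 *   spdk_bdev_queue_io_wait(bdev, io_ch, entry);
	 */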
_bdev_rw_split; 2878 switch (bdev_io->type) { 2879 case SPDK_BDEV_IO_TYPE_READ: 2880 assert(bdev_io->u.bdev.accel_sequence == NULL); 2881 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2882 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2883 iov, iovcnt, md_buf, current_offset, 2884 num_blocks, bdev_io->internal.memory_domain, 2885 bdev_io->internal.memory_domain_ctx, NULL, 2886 bdev_io_split_done, bdev_io); 2887 break; 2888 case SPDK_BDEV_IO_TYPE_WRITE: 2889 assert(bdev_io->u.bdev.accel_sequence == NULL); 2890 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2891 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2892 iov, iovcnt, md_buf, current_offset, 2893 num_blocks, bdev_io->internal.memory_domain, 2894 bdev_io->internal.memory_domain_ctx, NULL, 2895 bdev_io_split_done, bdev_io); 2896 break; 2897 case SPDK_BDEV_IO_TYPE_UNMAP: 2898 io_wait_fn = _bdev_unmap_split; 2899 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2900 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2901 current_offset, num_blocks, 2902 bdev_io_split_done, bdev_io); 2903 break; 2904 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2905 io_wait_fn = _bdev_write_zeroes_split; 2906 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2907 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2908 current_offset, num_blocks, 2909 bdev_io_split_done, bdev_io); 2910 break; 2911 case SPDK_BDEV_IO_TYPE_COPY: 2912 io_wait_fn = _bdev_copy_split; 2913 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 2914 (current_offset - bdev_io->u.bdev.offset_blocks); 2915 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 2916 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2917 current_offset, current_src_offset, num_blocks, 2918 bdev_io_split_done, bdev_io); 2919 break; 2920 default: 2921 assert(false); 2922 rc = -EINVAL; 2923 break; 2924 } 2925 2926 if (rc == 0) { 2927 current_offset += num_blocks; 2928 current_remaining -= num_blocks; 2929 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2930 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2931 *offset = current_offset; 2932 *remaining = current_remaining; 2933 } else { 2934 bdev_io->u.bdev.split_outstanding--; 2935 if (rc == -ENOMEM) { 2936 if (bdev_io->u.bdev.split_outstanding == 0) { 2937 /* No I/O is outstanding. Hence we should wait here. */ 2938 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2939 } 2940 } else { 2941 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2942 if (bdev_io->u.bdev.split_outstanding == 0) { 2943 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2944 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2945 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2946 } 2947 } 2948 } 2949 2950 return rc; 2951 } 2952 2953 static void 2954 _bdev_rw_split(void *_bdev_io) 2955 { 2956 struct iovec *parent_iov, *iov; 2957 struct spdk_bdev_io *bdev_io = _bdev_io; 2958 struct spdk_bdev *bdev = bdev_io->bdev; 2959 uint64_t parent_offset, current_offset, remaining; 2960 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2961 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2962 uint32_t iovcnt, iov_len, child_iovsize; 2963 uint32_t blocklen = bdev->blocklen; 2964 uint32_t io_boundary; 2965 uint32_t max_segment_size = bdev->max_segment_size; 2966 uint32_t max_child_iovcnt = bdev->max_num_segments; 2967 void *md_buf = NULL; 2968 int rc; 2969 2970 max_segment_size = max_segment_size ? 
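	/* A small worked example of the boundary math used below, with illustrative
	 * values: for an I/O at offset_blocks = 6 spanning num_blocks = 10 on a bdev
	 * whose io_boundary is 8 blocks,
	 *
	 *   _to_next_boundary(6, 8) = 8 - (6 % 8) = 2   -> first child: blocks 6-7
	 *   _to_next_boundary(8, 8) = 8 - (8 % 8) = 8   -> second child: blocks 8-15
	 *
	 * so no child ever straddles a boundary; each child is then further clipped
	 * by max_segment_size and max_num_segments as its iovecs are built up.
	 */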
max_segment_size : UINT32_MAX; 2971 max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 2972 SPDK_BDEV_IO_NUM_CHILD_IOV; 2973 2974 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2975 io_boundary = bdev->write_unit_size; 2976 } else if (bdev->split_on_optimal_io_boundary) { 2977 io_boundary = bdev->optimal_io_boundary; 2978 } else { 2979 io_boundary = UINT32_MAX; 2980 } 2981 2982 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2983 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 2984 parent_offset = bdev_io->u.bdev.offset_blocks; 2985 parent_iov_offset = (current_offset - parent_offset) * blocklen; 2986 parent_iovcnt = bdev_io->u.bdev.iovcnt; 2987 2988 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 2989 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2990 if (parent_iov_offset < parent_iov->iov_len) { 2991 break; 2992 } 2993 parent_iov_offset -= parent_iov->iov_len; 2994 } 2995 2996 child_iovcnt = 0; 2997 while (remaining > 0 && parent_iovpos < parent_iovcnt && 2998 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 2999 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3000 to_next_boundary = spdk_min(remaining, to_next_boundary); 3001 to_next_boundary_bytes = to_next_boundary * blocklen; 3002 3003 iov = &bdev_io->child_iov[child_iovcnt]; 3004 iovcnt = 0; 3005 3006 if (bdev_io->u.bdev.md_buf) { 3007 md_buf = (char *)bdev_io->u.bdev.md_buf + 3008 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3009 } 3010 3011 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3012 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3013 iovcnt < child_iovsize) { 3014 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3015 iov_len = parent_iov->iov_len - parent_iov_offset; 3016 3017 iov_len = spdk_min(iov_len, max_segment_size); 3018 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 3019 to_next_boundary_bytes -= iov_len; 3020 3021 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 3022 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 3023 3024 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 3025 parent_iov_offset += iov_len; 3026 } else { 3027 parent_iovpos++; 3028 parent_iov_offset = 0; 3029 } 3030 child_iovcnt++; 3031 iovcnt++; 3032 } 3033 3034 if (to_next_boundary_bytes > 0) { 3035 /* We had to stop this child I/O early because we ran out of 3036 * child_iov space or were limited by max_num_segments. 3037 * Ensure the iovs to be aligned with block size and 3038 * then adjust to_next_boundary before starting the 3039 * child I/O. 
			 */
			assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV ||
			       iovcnt == child_iovsize);
			to_last_block_bytes = to_next_boundary_bytes % blocklen;
			if (to_last_block_bytes != 0) {
				uint32_t child_iovpos = child_iovcnt - 1;
				/* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV,
				 * so the loop will naturally end
				 */

				to_last_block_bytes = blocklen - to_last_block_bytes;
				to_next_boundary_bytes += to_last_block_bytes;
				while (to_last_block_bytes > 0 && iovcnt > 0) {
					iov_len = spdk_min(to_last_block_bytes,
							   bdev_io->child_iov[child_iovpos].iov_len);
					bdev_io->child_iov[child_iovpos].iov_len -= iov_len;
					if (bdev_io->child_iov[child_iovpos].iov_len == 0) {
						child_iovpos--;
						if (--iovcnt == 0) {
							/* If the child IO is less than a block size, just return.
							 * If this is the first child IO of the split round (no other
							 * children are outstanding), fail the parent I/O before returning.
							 */
							if (bdev_io->u.bdev.split_outstanding == 0) {
								SPDK_ERRLOG("The first child io was less than a block size\n");
								bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
								spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx);
								TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
								bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
							}

							return;
						}
					}

					to_last_block_bytes -= iov_len;

					if (parent_iov_offset == 0) {
						parent_iovpos--;
						parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len;
					}
					parent_iov_offset -= iov_len;
				}

				assert(to_last_block_bytes == 0);
			}
			to_next_boundary -= to_next_boundary_bytes / blocklen;
		}

		rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary,
					  &current_offset, &remaining);
		if (spdk_unlikely(rc)) {
			return;
		}
	}
}

static void
bdev_unmap_split(struct spdk_bdev_io *bdev_io)
{
	uint64_t offset, unmap_blocks, remaining, max_unmap_blocks;
	uint32_t num_children_reqs = 0;
	int rc;

	offset = bdev_io->u.bdev.split_current_offset_blocks;
	remaining = bdev_io->u.bdev.split_remaining_num_blocks;
	max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments;

	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
		unmap_blocks = spdk_min(remaining, max_unmap_blocks);

		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks,
					  &offset, &remaining);
		if (spdk_likely(rc == 0)) {
			num_children_reqs++;
		} else {
			return;
		}
	}
}

static void
bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io)
{
	uint64_t offset, write_zeroes_blocks, remaining;
	uint32_t num_children_reqs = 0;
	int rc;

	offset = bdev_io->u.bdev.split_current_offset_blocks;
	remaining = bdev_io->u.bdev.split_remaining_num_blocks;

	while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
		write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes);

		rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks,
					  &offset, &remaining);
		if (spdk_likely(rc == 0)) {
			num_children_reqs++;
		} else {
			return;
		}
	}
}

static void
bdev_copy_split(struct spdk_bdev_io *bdev_io)
{
	uint64_t offset,
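	/* Like the unmap and write-zeroes paths above, copy splitting fans the parent
	 * out into children of at most max_copy blocks, submitted in batches of up to
	 * SPDK_BDEV_MAX_CHILDREN_COPY_REQS; bdev_io_split_done() keeps calling back in
	 * until split_remaining_num_blocks reaches zero. For example (illustrative
	 * numbers), num_blocks = 100 with max_copy = 16 yields children of
	 * 16, 16, 16, 16, 16, 16 and 4 blocks.
	 */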
copy_blocks, remaining; 3148 uint32_t num_children_reqs = 0; 3149 int rc; 3150 3151 offset = bdev_io->u.bdev.split_current_offset_blocks; 3152 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3153 3154 assert(bdev_io->bdev->max_copy != 0); 3155 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3156 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3157 3158 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3159 &offset, &remaining); 3160 if (spdk_likely(rc == 0)) { 3161 num_children_reqs++; 3162 } else { 3163 return; 3164 } 3165 } 3166 } 3167 3168 static void 3169 parent_bdev_io_complete(void *ctx, int rc) 3170 { 3171 struct spdk_bdev_io *parent_io = ctx; 3172 3173 if (rc) { 3174 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3175 } 3176 3177 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3178 parent_io->internal.caller_ctx); 3179 } 3180 3181 static void 3182 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3183 { 3184 struct spdk_bdev_io *bdev_io = ctx; 3185 3186 /* u.bdev.accel_sequence should have already been cleared at this point */ 3187 assert(bdev_io->u.bdev.accel_sequence == NULL); 3188 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3189 bdev_io->internal.accel_sequence = NULL; 3190 3191 if (spdk_unlikely(status != 0)) { 3192 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3193 } 3194 3195 parent_bdev_io_complete(bdev_io, status); 3196 } 3197 3198 static void 3199 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3200 { 3201 struct spdk_bdev_io *parent_io = cb_arg; 3202 3203 spdk_bdev_free_io(bdev_io); 3204 3205 if (!success) { 3206 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3207 /* If any child I/O failed, stop further splitting process. */ 3208 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 3209 parent_io->u.bdev.split_remaining_num_blocks = 0; 3210 } 3211 parent_io->u.bdev.split_outstanding--; 3212 if (parent_io->u.bdev.split_outstanding != 0) { 3213 return; 3214 } 3215 3216 /* 3217 * Parent I/O finishes when all blocks are consumed. 3218 */ 3219 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 3220 assert(parent_io->internal.cb != bdev_io_split_done); 3221 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 3222 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 3223 3224 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3225 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3226 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3227 return; 3228 } else if (parent_io->internal.orig_iovcnt != 0) { 3229 /* bdev IO will be completed in the callback */ 3230 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3231 return; 3232 } 3233 } 3234 3235 parent_bdev_io_complete(parent_io, 0); 3236 return; 3237 } 3238 3239 /* 3240 * Continue with the splitting process. This function will complete the parent I/O if the 3241 * splitting is done. 
3242 */ 3243 switch (parent_io->type) { 3244 case SPDK_BDEV_IO_TYPE_READ: 3245 case SPDK_BDEV_IO_TYPE_WRITE: 3246 _bdev_rw_split(parent_io); 3247 break; 3248 case SPDK_BDEV_IO_TYPE_UNMAP: 3249 bdev_unmap_split(parent_io); 3250 break; 3251 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3252 bdev_write_zeroes_split(parent_io); 3253 break; 3254 case SPDK_BDEV_IO_TYPE_COPY: 3255 bdev_copy_split(parent_io); 3256 break; 3257 default: 3258 assert(false); 3259 break; 3260 } 3261 } 3262 3263 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3264 bool success); 3265 3266 static void 3267 bdev_io_split(struct spdk_bdev_io *bdev_io) 3268 { 3269 assert(bdev_io_should_split(bdev_io)); 3270 3271 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3272 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3273 bdev_io->u.bdev.split_outstanding = 0; 3274 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3275 3276 switch (bdev_io->type) { 3277 case SPDK_BDEV_IO_TYPE_READ: 3278 case SPDK_BDEV_IO_TYPE_WRITE: 3279 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3280 _bdev_rw_split(bdev_io); 3281 } else { 3282 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3283 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3284 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3285 } 3286 break; 3287 case SPDK_BDEV_IO_TYPE_UNMAP: 3288 bdev_unmap_split(bdev_io); 3289 break; 3290 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3291 bdev_write_zeroes_split(bdev_io); 3292 break; 3293 case SPDK_BDEV_IO_TYPE_COPY: 3294 bdev_copy_split(bdev_io); 3295 break; 3296 default: 3297 assert(false); 3298 break; 3299 } 3300 } 3301 3302 static void 3303 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3304 { 3305 if (!success) { 3306 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3307 return; 3308 } 3309 3310 _bdev_rw_split(bdev_io); 3311 } 3312 3313 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 3314 * be inlined, at least on some compilers. 
3315 */ 3316 static inline void 3317 _bdev_io_submit(void *ctx) 3318 { 3319 struct spdk_bdev_io *bdev_io = ctx; 3320 struct spdk_bdev *bdev = bdev_io->bdev; 3321 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3322 3323 if (spdk_likely(bdev_ch->flags == 0)) { 3324 bdev_io_do_submit(bdev_ch, bdev_io); 3325 return; 3326 } 3327 3328 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3329 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3330 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3331 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3332 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 3333 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3334 } else { 3335 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 3336 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3337 } 3338 } else { 3339 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3340 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3341 } 3342 } 3343 3344 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3345 3346 bool 3347 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3348 { 3349 if (range1->length == 0 || range2->length == 0) { 3350 return false; 3351 } 3352 3353 if (range1->offset + range1->length <= range2->offset) { 3354 return false; 3355 } 3356 3357 if (range2->offset + range2->length <= range1->offset) { 3358 return false; 3359 } 3360 3361 return true; 3362 } 3363 3364 static bool 3365 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3366 { 3367 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3368 struct lba_range r; 3369 3370 switch (bdev_io->type) { 3371 case SPDK_BDEV_IO_TYPE_NVME_IO: 3372 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3373 /* Don't try to decode the NVMe command - just assume worst-case and that 3374 * it overlaps a locked range. 3375 */ 3376 return true; 3377 case SPDK_BDEV_IO_TYPE_WRITE: 3378 case SPDK_BDEV_IO_TYPE_UNMAP: 3379 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3380 case SPDK_BDEV_IO_TYPE_ZCOPY: 3381 case SPDK_BDEV_IO_TYPE_COPY: 3382 r.offset = bdev_io->u.bdev.offset_blocks; 3383 r.length = bdev_io->u.bdev.num_blocks; 3384 if (!bdev_lba_range_overlapped(range, &r)) { 3385 /* This I/O doesn't overlap the specified LBA range. */ 3386 return false; 3387 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3388 /* This I/O overlaps, but the I/O is on the same channel that locked this 3389 * range, and the caller_ctx is the same as the locked_ctx. This means 3390 * that this I/O is associated with the lock, and is allowed to execute. 
3391 */ 3392 return false; 3393 } else { 3394 return true; 3395 } 3396 default: 3397 return false; 3398 } 3399 } 3400 3401 void 3402 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3403 { 3404 struct spdk_bdev *bdev = bdev_io->bdev; 3405 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 3406 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3407 3408 assert(thread != NULL); 3409 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3410 3411 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3412 struct lba_range *range; 3413 3414 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3415 if (bdev_io_range_is_locked(bdev_io, range)) { 3416 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3417 return; 3418 } 3419 } 3420 } 3421 3422 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 3423 3424 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3425 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 3426 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3427 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3428 spdk_bdev_get_name(bdev)); 3429 3430 if (bdev_io->internal.split) { 3431 bdev_io_split(bdev_io); 3432 return; 3433 } 3434 3435 if (ch->flags & BDEV_CH_QOS_ENABLED) { 3436 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 3437 _bdev_io_submit(bdev_io); 3438 } else { 3439 bdev_io->internal.io_submit_ch = ch; 3440 bdev_io->internal.ch = bdev->internal.qos->ch; 3441 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 3442 } 3443 } else { 3444 _bdev_io_submit(bdev_io); 3445 } 3446 } 3447 3448 static inline void 3449 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3450 { 3451 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 3452 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3453 * For write operation we need to pull buffers from memory domain before submitting IO. 3454 * Once read operation completes, we need to use memory_domain push functionality to 3455 * update data in original memory domain IO buffer 3456 * This IO request will go through a regular IO flow, so clear memory domains pointers */ 3457 bdev_io->u.bdev.memory_domain = NULL; 3458 bdev_io->u.bdev.memory_domain_ctx = NULL; 3459 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3460 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3461 } 3462 3463 static inline void 3464 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3465 { 3466 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3467 bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io); 3468 3469 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3470 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3471 bdev_io_complete_unsubmitted(bdev_io); 3472 return; 3473 } 3474 3475 /* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does 3476 * support them, but we need to execute an accel sequence and the data buffer is from accel 3477 * memory domain (to avoid doing a push/pull from that domain). 
3478 */ 3479 if ((bdev_io->internal.memory_domain && !desc->memory_domains_supported) || 3480 (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3481 _bdev_io_ext_use_bounce_buffer(bdev_io); 3482 return; 3483 } 3484 3485 if (needs_exec) { 3486 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3487 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3488 return; 3489 } 3490 /* For reads we'll execute the sequence after the data is read, so, for now, only 3491 * clear out accel_sequence pointer and submit the IO */ 3492 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3493 bdev_io->u.bdev.accel_sequence = NULL; 3494 } 3495 3496 bdev_io_submit(bdev_io); 3497 } 3498 3499 static void 3500 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3501 { 3502 struct spdk_bdev *bdev = bdev_io->bdev; 3503 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3504 struct spdk_io_channel *ch = bdev_ch->channel; 3505 3506 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3507 3508 bdev_io->internal.in_submit_request = true; 3509 bdev_submit_request(bdev, ch, bdev_io); 3510 bdev_io->internal.in_submit_request = false; 3511 } 3512 3513 void 3514 bdev_io_init(struct spdk_bdev_io *bdev_io, 3515 struct spdk_bdev *bdev, void *cb_arg, 3516 spdk_bdev_io_completion_cb cb) 3517 { 3518 bdev_io->bdev = bdev; 3519 bdev_io->internal.caller_ctx = cb_arg; 3520 bdev_io->internal.cb = cb; 3521 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3522 bdev_io->internal.in_submit_request = false; 3523 bdev_io->internal.buf = NULL; 3524 bdev_io->internal.io_submit_ch = NULL; 3525 bdev_io->internal.orig_iovs = NULL; 3526 bdev_io->internal.orig_iovcnt = 0; 3527 bdev_io->internal.orig_md_iov.iov_base = NULL; 3528 bdev_io->internal.error.nvme.cdw0 = 0; 3529 bdev_io->num_retries = 0; 3530 bdev_io->internal.get_buf_cb = NULL; 3531 bdev_io->internal.get_aux_buf_cb = NULL; 3532 bdev_io->internal.memory_domain = NULL; 3533 bdev_io->internal.memory_domain_ctx = NULL; 3534 bdev_io->internal.data_transfer_cpl = NULL; 3535 bdev_io->internal.split = bdev_io_should_split(bdev_io); 3536 bdev_io->internal.accel_sequence = NULL; 3537 } 3538 3539 static bool 3540 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3541 { 3542 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3543 } 3544 3545 bool 3546 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3547 { 3548 bool supported; 3549 3550 supported = bdev_io_type_supported(bdev, io_type); 3551 3552 if (!supported) { 3553 switch (io_type) { 3554 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3555 /* The bdev layer will emulate write zeroes as long as write is supported. 
*/ 3556 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3557 break; 3558 default: 3559 break; 3560 } 3561 } 3562 3563 return supported; 3564 } 3565 3566 uint64_t 3567 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3568 { 3569 return bdev_io->internal.submit_tsc; 3570 } 3571 3572 int 3573 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3574 { 3575 if (bdev->fn_table->dump_info_json) { 3576 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3577 } 3578 3579 return 0; 3580 } 3581 3582 static void 3583 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3584 { 3585 uint32_t max_per_timeslice = 0; 3586 int i; 3587 3588 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3589 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3590 qos->rate_limits[i].max_per_timeslice = 0; 3591 continue; 3592 } 3593 3594 max_per_timeslice = qos->rate_limits[i].limit * 3595 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3596 3597 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3598 qos->rate_limits[i].min_per_timeslice); 3599 3600 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 3601 } 3602 3603 bdev_qos_set_ops(qos); 3604 } 3605 3606 static int 3607 bdev_channel_poll_qos(void *arg) 3608 { 3609 struct spdk_bdev_qos *qos = arg; 3610 uint64_t now = spdk_get_ticks(); 3611 int i; 3612 3613 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3614 /* We received our callback earlier than expected - return 3615 * immediately and wait to do accounting until at least one 3616 * timeslice has actually expired. This should never happen 3617 * with a well-behaved timer implementation. 3618 */ 3619 return SPDK_POLLER_IDLE; 3620 } 3621 3622 /* Reset for next round of rate limiting */ 3623 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3624 /* We may have allowed the IOs or bytes to slightly overrun in the last 3625 * timeslice. remaining_this_timeslice is signed, so if it's negative 3626 * here, we'll account for the overrun so that the next timeslice will 3627 * be appropriately reduced. 
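		 * As an illustrative example: a 12 KiB write admitted with only 4 KiB of
		 * byte quota left drives remaining_this_timeslice to -8 KiB; if the refill
		 * below adds a 64 KiB max_per_timeslice, the new timeslice starts at 56 KiB,
		 * i.e. the 8 KiB overrun is charged against it.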
3628 */ 3629 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 3630 qos->rate_limits[i].remaining_this_timeslice = 0; 3631 } 3632 } 3633 3634 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3635 qos->last_timeslice += qos->timeslice_size; 3636 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3637 qos->rate_limits[i].remaining_this_timeslice += 3638 qos->rate_limits[i].max_per_timeslice; 3639 } 3640 } 3641 3642 return bdev_qos_io_submit(qos->ch, qos); 3643 } 3644 3645 static void 3646 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3647 { 3648 struct spdk_bdev_shared_resource *shared_resource; 3649 struct lba_range *range; 3650 3651 bdev_free_io_stat(ch->stat); 3652 #ifdef SPDK_CONFIG_VTUNE 3653 bdev_free_io_stat(ch->prev_stat); 3654 #endif 3655 3656 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3657 range = TAILQ_FIRST(&ch->locked_ranges); 3658 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3659 free(range); 3660 } 3661 3662 spdk_put_io_channel(ch->channel); 3663 spdk_put_io_channel(ch->accel_channel); 3664 3665 shared_resource = ch->shared_resource; 3666 3667 assert(TAILQ_EMPTY(&ch->io_locked)); 3668 assert(TAILQ_EMPTY(&ch->io_submitted)); 3669 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 3670 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 3671 assert(ch->io_outstanding == 0); 3672 assert(shared_resource->ref > 0); 3673 shared_resource->ref--; 3674 if (shared_resource->ref == 0) { 3675 assert(shared_resource->io_outstanding == 0); 3676 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3677 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3678 free(shared_resource); 3679 } 3680 } 3681 3682 static void 3683 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3684 { 3685 struct spdk_bdev_qos *qos = bdev->internal.qos; 3686 int i; 3687 3688 assert(spdk_spin_held(&bdev->internal.spinlock)); 3689 3690 /* Rate limiting on this bdev enabled */ 3691 if (qos) { 3692 if (qos->ch == NULL) { 3693 struct spdk_io_channel *io_ch; 3694 3695 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3696 bdev->name, spdk_get_thread()); 3697 3698 /* No qos channel has been selected, so set one up */ 3699 3700 /* Take another reference to ch */ 3701 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3702 assert(io_ch != NULL); 3703 qos->ch = ch; 3704 3705 qos->thread = spdk_io_channel_get_thread(io_ch); 3706 3707 TAILQ_INIT(&qos->queued); 3708 3709 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3710 if (bdev_qos_is_iops_rate_limit(i) == true) { 3711 qos->rate_limits[i].min_per_timeslice = 3712 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3713 } else { 3714 qos->rate_limits[i].min_per_timeslice = 3715 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3716 } 3717 3718 if (qos->rate_limits[i].limit == 0) { 3719 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3720 } 3721 } 3722 bdev_qos_update_max_quota_per_timeslice(qos); 3723 qos->timeslice_size = 3724 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3725 qos->last_timeslice = spdk_get_ticks(); 3726 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3727 qos, 3728 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3729 } 3730 3731 ch->flags |= BDEV_CH_QOS_ENABLED; 3732 } 3733 } 3734 3735 struct poll_timeout_ctx { 3736 struct spdk_bdev_desc *desc; 3737 uint64_t timeout_in_sec; 3738 spdk_bdev_io_timeout_cb cb_fn; 3739 void *cb_arg; 3740 }; 3741 3742 static void 3743 bdev_desc_free(struct spdk_bdev_desc 
*desc) 3744 { 3745 spdk_spin_destroy(&desc->spinlock); 3746 free(desc->media_events_buffer); 3747 free(desc); 3748 } 3749 3750 static void 3751 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 3752 { 3753 struct poll_timeout_ctx *ctx = _ctx; 3754 struct spdk_bdev_desc *desc = ctx->desc; 3755 3756 free(ctx); 3757 3758 spdk_spin_lock(&desc->spinlock); 3759 desc->refs--; 3760 if (desc->closed == true && desc->refs == 0) { 3761 spdk_spin_unlock(&desc->spinlock); 3762 bdev_desc_free(desc); 3763 return; 3764 } 3765 spdk_spin_unlock(&desc->spinlock); 3766 } 3767 3768 static void 3769 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3770 struct spdk_io_channel *io_ch, void *_ctx) 3771 { 3772 struct poll_timeout_ctx *ctx = _ctx; 3773 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3774 struct spdk_bdev_desc *desc = ctx->desc; 3775 struct spdk_bdev_io *bdev_io; 3776 uint64_t now; 3777 3778 spdk_spin_lock(&desc->spinlock); 3779 if (desc->closed == true) { 3780 spdk_spin_unlock(&desc->spinlock); 3781 spdk_bdev_for_each_channel_continue(i, -1); 3782 return; 3783 } 3784 spdk_spin_unlock(&desc->spinlock); 3785 3786 now = spdk_get_ticks(); 3787 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3788 /* Exclude any I/O that are generated via splitting. */ 3789 if (bdev_io->internal.cb == bdev_io_split_done) { 3790 continue; 3791 } 3792 3793 /* Once we find an I/O that has not timed out, we can immediately 3794 * exit the loop. 3795 */ 3796 if (now < (bdev_io->internal.submit_tsc + 3797 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3798 goto end; 3799 } 3800 3801 if (bdev_io->internal.desc == desc) { 3802 ctx->cb_fn(ctx->cb_arg, bdev_io); 3803 } 3804 } 3805 3806 end: 3807 spdk_bdev_for_each_channel_continue(i, 0); 3808 } 3809 3810 static int 3811 bdev_poll_timeout_io(void *arg) 3812 { 3813 struct spdk_bdev_desc *desc = arg; 3814 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3815 struct poll_timeout_ctx *ctx; 3816 3817 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3818 if (!ctx) { 3819 SPDK_ERRLOG("failed to allocate memory\n"); 3820 return SPDK_POLLER_BUSY; 3821 } 3822 ctx->desc = desc; 3823 ctx->cb_arg = desc->cb_arg; 3824 ctx->cb_fn = desc->cb_fn; 3825 ctx->timeout_in_sec = desc->timeout_in_sec; 3826 3827 /* Take a ref on the descriptor in case it gets closed while we are checking 3828 * all of the channels. 
3829 */ 3830 spdk_spin_lock(&desc->spinlock); 3831 desc->refs++; 3832 spdk_spin_unlock(&desc->spinlock); 3833 3834 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 3835 bdev_channel_poll_timeout_io_done); 3836 3837 return SPDK_POLLER_BUSY; 3838 } 3839 3840 int 3841 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3842 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3843 { 3844 assert(desc->thread == spdk_get_thread()); 3845 3846 spdk_poller_unregister(&desc->io_timeout_poller); 3847 3848 if (timeout_in_sec) { 3849 assert(cb_fn != NULL); 3850 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3851 desc, 3852 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3853 1000); 3854 if (desc->io_timeout_poller == NULL) { 3855 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3856 return -1; 3857 } 3858 } 3859 3860 desc->cb_fn = cb_fn; 3861 desc->cb_arg = cb_arg; 3862 desc->timeout_in_sec = timeout_in_sec; 3863 3864 return 0; 3865 } 3866 3867 static int 3868 bdev_channel_create(void *io_device, void *ctx_buf) 3869 { 3870 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3871 struct spdk_bdev_channel *ch = ctx_buf; 3872 struct spdk_io_channel *mgmt_io_ch; 3873 struct spdk_bdev_mgmt_channel *mgmt_ch; 3874 struct spdk_bdev_shared_resource *shared_resource; 3875 struct lba_range *range; 3876 3877 ch->bdev = bdev; 3878 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3879 if (!ch->channel) { 3880 return -1; 3881 } 3882 3883 ch->accel_channel = spdk_accel_get_io_channel(); 3884 if (!ch->accel_channel) { 3885 spdk_put_io_channel(ch->channel); 3886 return -1; 3887 } 3888 3889 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3890 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3891 3892 assert(ch->histogram == NULL); 3893 if (bdev->internal.histogram_enabled) { 3894 ch->histogram = spdk_histogram_data_alloc(); 3895 if (ch->histogram == NULL) { 3896 SPDK_ERRLOG("Could not allocate histogram\n"); 3897 } 3898 } 3899 3900 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3901 if (!mgmt_io_ch) { 3902 spdk_put_io_channel(ch->channel); 3903 spdk_put_io_channel(ch->accel_channel); 3904 return -1; 3905 } 3906 3907 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 3908 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3909 if (shared_resource->shared_ch == ch->channel) { 3910 spdk_put_io_channel(mgmt_io_ch); 3911 shared_resource->ref++; 3912 break; 3913 } 3914 } 3915 3916 if (shared_resource == NULL) { 3917 shared_resource = calloc(1, sizeof(*shared_resource)); 3918 if (shared_resource == NULL) { 3919 spdk_put_io_channel(ch->channel); 3920 spdk_put_io_channel(ch->accel_channel); 3921 spdk_put_io_channel(mgmt_io_ch); 3922 return -1; 3923 } 3924 3925 shared_resource->mgmt_ch = mgmt_ch; 3926 shared_resource->io_outstanding = 0; 3927 TAILQ_INIT(&shared_resource->nomem_io); 3928 shared_resource->nomem_threshold = 0; 3929 shared_resource->shared_ch = ch->channel; 3930 shared_resource->ref = 1; 3931 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3932 } 3933 3934 ch->io_outstanding = 0; 3935 TAILQ_INIT(&ch->queued_resets); 3936 TAILQ_INIT(&ch->locked_ranges); 3937 ch->flags = 0; 3938 ch->shared_resource = shared_resource; 3939 3940 TAILQ_INIT(&ch->io_submitted); 3941 TAILQ_INIT(&ch->io_locked); 3942 TAILQ_INIT(&ch->io_accel_exec); 3943 TAILQ_INIT(&ch->io_memory_domain); 3944 3945 ch->stat = bdev_alloc_io_stat(false); 3946 if (ch->stat == NULL) { 3947 
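/* Editor's note (descriptive comment, not in upstream bdev.c): stat allocation failed, so release the I/O channels and shared resource acquired above before failing channel creation. */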
bdev_channel_destroy_resource(ch); 3948 return -1; 3949 } 3950 3951 ch->stat->ticks_rate = spdk_get_ticks_hz(); 3952 3953 #ifdef SPDK_CONFIG_VTUNE 3954 { 3955 char *name; 3956 __itt_init_ittlib(NULL, 0); 3957 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3958 if (!name) { 3959 bdev_channel_destroy_resource(ch); 3960 return -1; 3961 } 3962 ch->handle = __itt_string_handle_create(name); 3963 free(name); 3964 ch->start_tsc = spdk_get_ticks(); 3965 ch->interval_tsc = spdk_get_ticks_hz() / 100; 3966 ch->prev_stat = bdev_alloc_io_stat(false); 3967 if (ch->prev_stat == NULL) { 3968 bdev_channel_destroy_resource(ch); 3969 return -1; 3970 } 3971 } 3972 #endif 3973 3974 spdk_spin_lock(&bdev->internal.spinlock); 3975 bdev_enable_qos(bdev, ch); 3976 3977 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 3978 struct lba_range *new_range; 3979 3980 new_range = calloc(1, sizeof(*new_range)); 3981 if (new_range == NULL) { 3982 spdk_spin_unlock(&bdev->internal.spinlock); 3983 bdev_channel_destroy_resource(ch); 3984 return -1; 3985 } 3986 new_range->length = range->length; 3987 new_range->offset = range->offset; 3988 new_range->locked_ctx = range->locked_ctx; 3989 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 3990 } 3991 3992 spdk_spin_unlock(&bdev->internal.spinlock); 3993 3994 return 0; 3995 } 3996 3997 static int 3998 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 3999 void *cb_ctx) 4000 { 4001 struct spdk_bdev_channel *bdev_ch = cb_ctx; 4002 struct spdk_bdev_io *bdev_io; 4003 uint64_t buf_len; 4004 4005 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4006 if (bdev_io->internal.ch == bdev_ch) { 4007 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4008 spdk_iobuf_entry_abort(ch, entry, buf_len); 4009 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4010 } 4011 4012 return 0; 4013 } 4014 4015 /* 4016 * Abort I/O that are waiting on a data buffer. 4017 */ 4018 static void 4019 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4020 { 4021 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4022 bdev_abort_all_buf_io_cb, ch); 4023 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4024 bdev_abort_all_buf_io_cb, ch); 4025 } 4026 4027 /* 4028 * Abort I/O that are queued waiting for submission. These types of I/O are 4029 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4030 */ 4031 static void 4032 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4033 { 4034 struct spdk_bdev_io *bdev_io, *tmp; 4035 4036 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4037 if (bdev_io->internal.ch == ch) { 4038 TAILQ_REMOVE(queue, bdev_io, internal.link); 4039 /* 4040 * spdk_bdev_io_complete() assumes that the completed I/O had 4041 * been submitted to the bdev module. Since in this case it 4042 * hadn't, bump io_outstanding to account for the decrement 4043 * that spdk_bdev_io_complete() will do. 
4044 */ 4045 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4046 ch->io_outstanding++; 4047 ch->shared_resource->io_outstanding++; 4048 } 4049 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4050 } 4051 } 4052 } 4053 4054 static bool 4055 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4056 { 4057 struct spdk_bdev_io *bdev_io; 4058 4059 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4060 if (bdev_io == bio_to_abort) { 4061 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4062 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4063 return true; 4064 } 4065 } 4066 4067 return false; 4068 } 4069 4070 static int 4071 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4072 { 4073 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4074 uint64_t buf_len; 4075 4076 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4077 if (bdev_io == bio_to_abort) { 4078 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4079 spdk_iobuf_entry_abort(ch, entry, buf_len); 4080 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4081 return 1; 4082 } 4083 4084 return 0; 4085 } 4086 4087 static bool 4088 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4089 { 4090 int rc; 4091 4092 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4093 bdev_abort_buf_io_cb, bio_to_abort); 4094 if (rc == 1) { 4095 return true; 4096 } 4097 4098 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4099 bdev_abort_buf_io_cb, bio_to_abort); 4100 return rc == 1; 4101 } 4102 4103 static void 4104 bdev_qos_channel_destroy(void *cb_arg) 4105 { 4106 struct spdk_bdev_qos *qos = cb_arg; 4107 4108 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4109 spdk_poller_unregister(&qos->poller); 4110 4111 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4112 4113 free(qos); 4114 } 4115 4116 static int 4117 bdev_qos_destroy(struct spdk_bdev *bdev) 4118 { 4119 int i; 4120 4121 /* 4122 * Cleanly shutting down the QoS poller is tricky, because 4123 * during the asynchronous operation the user could open 4124 * a new descriptor and create a new channel, spawning 4125 * a new QoS poller. 4126 * 4127 * The strategy is to create a new QoS structure here and swap it 4128 * in. The shutdown path then continues to refer to the old one 4129 * until it completes and then releases it. 4130 */ 4131 struct spdk_bdev_qos *new_qos, *old_qos; 4132 4133 old_qos = bdev->internal.qos; 4134 4135 new_qos = calloc(1, sizeof(*new_qos)); 4136 if (!new_qos) { 4137 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4138 return -ENOMEM; 4139 } 4140 4141 /* Copy the old QoS data into the newly allocated structure */ 4142 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4143 4144 /* Zero out the key parts of the QoS structure */ 4145 new_qos->ch = NULL; 4146 new_qos->thread = NULL; 4147 new_qos->poller = NULL; 4148 TAILQ_INIT(&new_qos->queued); 4149 /* 4150 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4151 * It will be used later for the new QoS structure. 
4152 */ 4153 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4154 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4155 new_qos->rate_limits[i].min_per_timeslice = 0; 4156 new_qos->rate_limits[i].max_per_timeslice = 0; 4157 } 4158 4159 bdev->internal.qos = new_qos; 4160 4161 if (old_qos->thread == NULL) { 4162 free(old_qos); 4163 } else { 4164 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4165 } 4166 4167 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4168 * been destroyed yet. The destruction path will end up waiting for the final 4169 * channel to be put before it releases resources. */ 4170 4171 return 0; 4172 } 4173 4174 void 4175 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4176 { 4177 total->bytes_read += add->bytes_read; 4178 total->num_read_ops += add->num_read_ops; 4179 total->bytes_written += add->bytes_written; 4180 total->num_write_ops += add->num_write_ops; 4181 total->bytes_unmapped += add->bytes_unmapped; 4182 total->num_unmap_ops += add->num_unmap_ops; 4183 total->bytes_copied += add->bytes_copied; 4184 total->num_copy_ops += add->num_copy_ops; 4185 total->read_latency_ticks += add->read_latency_ticks; 4186 total->write_latency_ticks += add->write_latency_ticks; 4187 total->unmap_latency_ticks += add->unmap_latency_ticks; 4188 total->copy_latency_ticks += add->copy_latency_ticks; 4189 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4190 total->max_read_latency_ticks = add->max_read_latency_ticks; 4191 } 4192 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4193 total->min_read_latency_ticks = add->min_read_latency_ticks; 4194 } 4195 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4196 total->max_write_latency_ticks = add->max_write_latency_ticks; 4197 } 4198 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4199 total->min_write_latency_ticks = add->min_write_latency_ticks; 4200 } 4201 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4202 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4203 } 4204 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4205 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4206 } 4207 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4208 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4209 } 4210 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4211 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4212 } 4213 } 4214 4215 static void 4216 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4217 { 4218 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4219 4220 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4221 memcpy(to_stat->io_error, from_stat->io_error, 4222 sizeof(struct spdk_bdev_io_error_stat)); 4223 } 4224 } 4225 4226 void 4227 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4228 { 4229 stat->max_read_latency_ticks = 0; 4230 stat->min_read_latency_ticks = UINT64_MAX; 4231 stat->max_write_latency_ticks = 0; 4232 stat->min_write_latency_ticks = UINT64_MAX; 4233 stat->max_unmap_latency_ticks = 0; 4234 stat->min_unmap_latency_ticks = UINT64_MAX; 4235 stat->max_copy_latency_ticks = 0; 4236 stat->min_copy_latency_ticks = UINT64_MAX; 4237 4238 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4239 return; 4240 } 
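/* Editor's note (descriptive comment, not in upstream bdev.c): for SPDK_BDEV_RESET_STAT_ALL, additionally clear the cumulative byte/operation counters, the latency totals, and, if present, the per-status error counts below. */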
4241 4242 stat->bytes_read = 0; 4243 stat->num_read_ops = 0; 4244 stat->bytes_written = 0; 4245 stat->num_write_ops = 0; 4246 stat->bytes_unmapped = 0; 4247 stat->num_unmap_ops = 0; 4248 stat->bytes_copied = 0; 4249 stat->num_copy_ops = 0; 4250 stat->read_latency_ticks = 0; 4251 stat->write_latency_ticks = 0; 4252 stat->unmap_latency_ticks = 0; 4253 stat->copy_latency_ticks = 0; 4254 4255 if (stat->io_error != NULL) { 4256 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 4257 } 4258 } 4259 4260 struct spdk_bdev_io_stat * 4261 bdev_alloc_io_stat(bool io_error_stat) 4262 { 4263 struct spdk_bdev_io_stat *stat; 4264 4265 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4266 if (stat == NULL) { 4267 return NULL; 4268 } 4269 4270 if (io_error_stat) { 4271 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4272 if (stat->io_error == NULL) { 4273 free(stat); 4274 return NULL; 4275 } 4276 } else { 4277 stat->io_error = NULL; 4278 } 4279 4280 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4281 4282 return stat; 4283 } 4284 4285 void 4286 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4287 { 4288 if (stat != NULL) { 4289 free(stat->io_error); 4290 free(stat); 4291 } 4292 } 4293 4294 void 4295 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4296 { 4297 int i; 4298 4299 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4300 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4301 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4302 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4303 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4304 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4305 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4306 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4307 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4308 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4309 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4310 stat->min_read_latency_ticks != UINT64_MAX ? 4311 stat->min_read_latency_ticks : 0); 4312 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4313 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4314 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4315 stat->min_write_latency_ticks != UINT64_MAX ? 4316 stat->min_write_latency_ticks : 0); 4317 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4318 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4319 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4320 stat->min_unmap_latency_ticks != UINT64_MAX ? 4321 stat->min_unmap_latency_ticks : 0); 4322 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4323 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4324 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4325 stat->min_copy_latency_ticks != UINT64_MAX ? 
4326 stat->min_copy_latency_ticks : 0); 4327 4328 if (stat->io_error != NULL) { 4329 spdk_json_write_named_object_begin(w, "io_error"); 4330 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4331 if (stat->io_error->error_status[i] != 0) { 4332 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4333 stat->io_error->error_status[i]); 4334 } 4335 } 4336 spdk_json_write_object_end(w); 4337 } 4338 } 4339 4340 static void 4341 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4342 { 4343 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4344 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4345 4346 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4347 bdev_abort_all_buf_io(mgmt_ch, ch); 4349 } 4350 4351 static void 4352 bdev_channel_destroy(void *io_device, void *ctx_buf) 4353 { 4354 struct spdk_bdev_channel *ch = ctx_buf; 4355 4356 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4357 spdk_get_thread()); 4358 4359 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 4360 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4361 4362 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4363 spdk_spin_lock(&ch->bdev->internal.spinlock); 4364 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4365 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4366 4367 bdev_abort_all_queued_io(&ch->queued_resets, ch); 4368 4369 bdev_channel_abort_queued_ios(ch); 4370 4371 if (ch->histogram) { 4372 spdk_histogram_data_free(ch->histogram); 4373 } 4374 4375 bdev_channel_destroy_resource(ch); 4376 } 4377 4378 /* 4379 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4380 * to it. Hence we do not have to call bdev_get_by_name() when using this function.
4381 */ 4382 static int 4383 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4384 { 4385 struct spdk_bdev_name *tmp; 4386 4387 bdev_name->name = strdup(name); 4388 if (bdev_name->name == NULL) { 4389 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4390 return -ENOMEM; 4391 } 4392 4393 bdev_name->bdev = bdev; 4394 4395 spdk_spin_lock(&g_bdev_mgr.spinlock); 4396 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4397 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4398 4399 if (tmp != NULL) { 4400 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4401 free(bdev_name->name); 4402 return -EEXIST; 4403 } 4404 4405 return 0; 4406 } 4407 4408 static void 4409 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4410 { 4411 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4412 free(bdev_name->name); 4413 } 4414 4415 static void 4416 bdev_name_del(struct spdk_bdev_name *bdev_name) 4417 { 4418 spdk_spin_lock(&g_bdev_mgr.spinlock); 4419 bdev_name_del_unsafe(bdev_name); 4420 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4421 } 4422 4423 int 4424 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4425 { 4426 struct spdk_bdev_alias *tmp; 4427 int ret; 4428 4429 if (alias == NULL) { 4430 SPDK_ERRLOG("Empty alias passed\n"); 4431 return -EINVAL; 4432 } 4433 4434 tmp = calloc(1, sizeof(*tmp)); 4435 if (tmp == NULL) { 4436 SPDK_ERRLOG("Unable to allocate alias\n"); 4437 return -ENOMEM; 4438 } 4439 4440 ret = bdev_name_add(&tmp->alias, bdev, alias); 4441 if (ret != 0) { 4442 free(tmp); 4443 return ret; 4444 } 4445 4446 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4447 4448 return 0; 4449 } 4450 4451 static int 4452 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4453 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4454 { 4455 struct spdk_bdev_alias *tmp; 4456 4457 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4458 if (strcmp(alias, tmp->alias.name) == 0) { 4459 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4460 alias_del_fn(&tmp->alias); 4461 free(tmp); 4462 return 0; 4463 } 4464 } 4465 4466 return -ENOENT; 4467 } 4468 4469 int 4470 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4471 { 4472 int rc; 4473 4474 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4475 if (rc == -ENOENT) { 4476 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4477 } 4478 4479 return rc; 4480 } 4481 4482 void 4483 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4484 { 4485 struct spdk_bdev_alias *p, *tmp; 4486 4487 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4488 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4489 bdev_name_del(&p->alias); 4490 free(p); 4491 } 4492 } 4493 4494 struct spdk_io_channel * 4495 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4496 { 4497 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4498 } 4499 4500 void * 4501 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4502 { 4503 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4504 void *ctx = NULL; 4505 4506 if (bdev->fn_table->get_module_ctx) { 4507 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4508 } 4509 4510 return ctx; 4511 } 4512 4513 const char * 4514 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4515 { 4516 return bdev->module->name; 4517 } 4518 4519 const char * 4520 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4521 { 4522 return bdev->name; 4523 } 4524 4525 const char * 4526 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4527 { 4528 return bdev->product_name; 4529 } 4530 4531 
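/*
 * Illustrative sketch (editor's addition, not part of upstream bdev.c): how a
 * bdev consumer might combine the open/close APIs with the property getters
 * defined in this area to report a bdev's basic geometry. The bdev name
 * "Malloc0" and the example_* identifiers are hypothetical, error handling is
 * minimal, and the calls are assumed to run on an SPDK thread. Guarded by
 * #if 0 so it is never compiled into the library.
 */
#if 0
static void
example_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx)
{
	/* A real consumer would handle SPDK_BDEV_EVENT_REMOVE, _RESIZE, etc. here. */
}

static int
example_dump_geometry(void)
{
	struct spdk_bdev_desc *desc = NULL;
	struct spdk_io_channel *io_ch;
	struct spdk_bdev *bdev;
	int rc;

	/* Open read-only; "Malloc0" is a placeholder bdev name. */
	rc = spdk_bdev_open_ext("Malloc0", false, example_event_cb, NULL, &desc);
	if (rc != 0) {
		return rc;
	}

	bdev = spdk_bdev_desc_get_bdev(desc);

	io_ch = spdk_bdev_get_io_channel(desc);
	if (io_ch == NULL) {
		spdk_bdev_close(desc);
		return -ENOMEM;
	}

	SPDK_NOTICELOG("%s: %u-byte blocks, %" PRIu64 " blocks, buffer alignment %zu\n",
		       spdk_bdev_get_name(bdev), spdk_bdev_get_block_size(bdev),
		       spdk_bdev_get_num_blocks(bdev), spdk_bdev_get_buf_align(bdev));

	spdk_put_io_channel(io_ch);
	spdk_bdev_close(desc);

	return 0;
}
#endif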
const struct spdk_bdev_aliases_list * 4532 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4533 { 4534 return &bdev->aliases; 4535 } 4536 4537 uint32_t 4538 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4539 { 4540 return bdev->blocklen; 4541 } 4542 4543 uint32_t 4544 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4545 { 4546 return bdev->write_unit_size; 4547 } 4548 4549 uint64_t 4550 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4551 { 4552 return bdev->blockcnt; 4553 } 4554 4555 const char * 4556 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4557 { 4558 return qos_rpc_type[type]; 4559 } 4560 4561 void 4562 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4563 { 4564 int i; 4565 4566 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4567 4568 spdk_spin_lock(&bdev->internal.spinlock); 4569 if (bdev->internal.qos) { 4570 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4571 if (bdev->internal.qos->rate_limits[i].limit != 4572 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4573 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4574 if (bdev_qos_is_iops_rate_limit(i) == false) { 4575 /* Change from Byte to Megabyte which is user visible. */ 4576 limits[i] = limits[i] / 1024 / 1024; 4577 } 4578 } 4579 } 4580 } 4581 spdk_spin_unlock(&bdev->internal.spinlock); 4582 } 4583 4584 size_t 4585 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4586 { 4587 return 1 << bdev->required_alignment; 4588 } 4589 4590 uint32_t 4591 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4592 { 4593 return bdev->optimal_io_boundary; 4594 } 4595 4596 bool 4597 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4598 { 4599 return bdev->write_cache; 4600 } 4601 4602 const struct spdk_uuid * 4603 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4604 { 4605 return &bdev->uuid; 4606 } 4607 4608 uint16_t 4609 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4610 { 4611 return bdev->acwu; 4612 } 4613 4614 uint32_t 4615 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4616 { 4617 return bdev->md_len; 4618 } 4619 4620 bool 4621 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4622 { 4623 return (bdev->md_len != 0) && bdev->md_interleave; 4624 } 4625 4626 bool 4627 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4628 { 4629 return (bdev->md_len != 0) && !bdev->md_interleave; 4630 } 4631 4632 bool 4633 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4634 { 4635 return bdev->zoned; 4636 } 4637 4638 uint32_t 4639 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4640 { 4641 if (spdk_bdev_is_md_interleaved(bdev)) { 4642 return bdev->blocklen - bdev->md_len; 4643 } else { 4644 return bdev->blocklen; 4645 } 4646 } 4647 4648 uint32_t 4649 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4650 { 4651 return bdev->phys_blocklen; 4652 } 4653 4654 static uint32_t 4655 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4656 { 4657 if (!spdk_bdev_is_md_interleaved(bdev)) { 4658 return bdev->blocklen + bdev->md_len; 4659 } else { 4660 return bdev->blocklen; 4661 } 4662 } 4663 4664 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4665 typedef enum spdk_dif_type spdk_dif_type_t; 4666 4667 spdk_dif_type_t 4668 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4669 { 4670 if (bdev->md_len != 0) { 4671 return bdev->dif_type; 4672 } else { 4673 return SPDK_DIF_DISABLE; 4674 } 4675 } 4676 4677 bool 4678 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4679 { 4680 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4681 return bdev->dif_is_head_of_md; 4682 } else { 4683 return false; 4684 } 4685 } 4686 4687 bool 4688 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4689 enum spdk_dif_check_type check_type) 4690 { 4691 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4692 return false; 4693 } 4694 4695 switch (check_type) { 4696 case SPDK_DIF_CHECK_TYPE_REFTAG: 4697 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4698 case SPDK_DIF_CHECK_TYPE_APPTAG: 4699 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4700 case SPDK_DIF_CHECK_TYPE_GUARD: 4701 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4702 default: 4703 return false; 4704 } 4705 } 4706 4707 static uint32_t 4708 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 4709 { 4710 uint64_t aligned_length, max_write_blocks; 4711 4712 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 4713 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 4714 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 4715 4716 return max_write_blocks; 4717 } 4718 4719 uint32_t 4720 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4721 { 4722 return bdev->max_copy; 4723 } 4724 4725 uint64_t 4726 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4727 { 4728 return bdev->internal.measured_queue_depth; 4729 } 4730 4731 uint64_t 4732 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 4733 { 4734 return bdev->internal.period; 4735 } 4736 4737 uint64_t 4738 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 4739 { 4740 return bdev->internal.weighted_io_time; 4741 } 4742 4743 uint64_t 4744 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 4745 { 4746 return bdev->internal.io_time; 4747 } 4748 4749 static void bdev_update_qd_sampling_period(void *ctx); 4750 4751 static void 4752 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 4753 { 4754 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 4755 4756 if (bdev->internal.measured_queue_depth) { 4757 bdev->internal.io_time += bdev->internal.period; 4758 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 4759 } 4760 4761 bdev->internal.qd_poll_in_progress = false; 4762 4763 bdev_update_qd_sampling_period(bdev); 4764 } 4765 4766 static void 4767 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4768 struct spdk_io_channel *io_ch, void *_ctx) 4769 { 4770 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 4771 4772 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4773 spdk_bdev_for_each_channel_continue(i, 0); 4774 } 4775 4776 static int 4777 bdev_calculate_measured_queue_depth(void *ctx) 4778 { 4779 struct spdk_bdev *bdev = ctx; 4780 4781 bdev->internal.qd_poll_in_progress = true; 4782 bdev->internal.temporary_queue_depth = 0; 4783 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 4784 return SPDK_POLLER_BUSY; 4785 } 4786 4787 static void 4788 bdev_update_qd_sampling_period(void *ctx) 4789 { 4790 
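/* Editor's note (descriptive comment, not in upstream bdev.c): apply a pending change to the QD sampling period once no queue-depth poll is in progress; a new period of 0 unregisters the poller and closes the internal descriptor used for sampling. */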
struct spdk_bdev *bdev = ctx; 4791 4792 if (bdev->internal.period == bdev->internal.new_period) { 4793 return; 4794 } 4795 4796 if (bdev->internal.qd_poll_in_progress) { 4797 return; 4798 } 4799 4800 bdev->internal.period = bdev->internal.new_period; 4801 4802 spdk_poller_unregister(&bdev->internal.qd_poller); 4803 if (bdev->internal.period != 0) { 4804 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4805 bdev, bdev->internal.period); 4806 } else { 4807 spdk_bdev_close(bdev->internal.qd_desc); 4808 bdev->internal.qd_desc = NULL; 4809 } 4810 } 4811 4812 static void 4813 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4814 { 4815 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4816 } 4817 4818 void 4819 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4820 { 4821 int rc; 4822 4823 if (bdev->internal.new_period == period) { 4824 return; 4825 } 4826 4827 bdev->internal.new_period = period; 4828 4829 if (bdev->internal.qd_desc != NULL) { 4830 assert(bdev->internal.period != 0); 4831 4832 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4833 bdev_update_qd_sampling_period, bdev); 4834 return; 4835 } 4836 4837 assert(bdev->internal.period == 0); 4838 4839 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4840 NULL, &bdev->internal.qd_desc); 4841 if (rc != 0) { 4842 return; 4843 } 4844 4845 bdev->internal.period = period; 4846 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4847 bdev, period); 4848 } 4849 4850 struct bdev_get_current_qd_ctx { 4851 uint64_t current_qd; 4852 spdk_bdev_get_current_qd_cb cb_fn; 4853 void *cb_arg; 4854 }; 4855 4856 static void 4857 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 4858 { 4859 struct bdev_get_current_qd_ctx *ctx = _ctx; 4860 4861 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 4862 4863 free(ctx); 4864 } 4865 4866 static void 4867 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4868 struct spdk_io_channel *io_ch, void *_ctx) 4869 { 4870 struct bdev_get_current_qd_ctx *ctx = _ctx; 4871 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4872 4873 ctx->current_qd += bdev_ch->io_outstanding; 4874 4875 spdk_bdev_for_each_channel_continue(i, 0); 4876 } 4877 4878 void 4879 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 4880 void *cb_arg) 4881 { 4882 struct bdev_get_current_qd_ctx *ctx; 4883 4884 assert(cb_fn != NULL); 4885 4886 ctx = calloc(1, sizeof(*ctx)); 4887 if (ctx == NULL) { 4888 cb_fn(bdev, 0, cb_arg, -ENOMEM); 4889 return; 4890 } 4891 4892 ctx->cb_fn = cb_fn; 4893 ctx->cb_arg = cb_arg; 4894 4895 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 4896 } 4897 4898 static void 4899 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 4900 { 4901 assert(desc->thread == spdk_get_thread()); 4902 4903 spdk_spin_lock(&desc->spinlock); 4904 desc->refs--; 4905 if (!desc->closed) { 4906 spdk_spin_unlock(&desc->spinlock); 4907 desc->callback.event_fn(type, 4908 desc->bdev, 4909 desc->callback.ctx); 4910 return; 4911 } else if (desc->refs == 0) { 4912 /* This descriptor was closed after this event_notify message was sent. 4913 * spdk_bdev_close() could not free the descriptor since this message was 4914 * in flight, so we free it now using bdev_desc_free(). 
4915 */ 4916 spdk_spin_unlock(&desc->spinlock); 4917 bdev_desc_free(desc); 4918 return; 4919 } 4920 spdk_spin_unlock(&desc->spinlock); 4921 } 4922 4923 static void 4924 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 4925 { 4926 spdk_spin_lock(&desc->spinlock); 4927 desc->refs++; 4928 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 4929 spdk_spin_unlock(&desc->spinlock); 4930 } 4931 4932 static void 4933 _resize_notify(void *ctx) 4934 { 4935 struct spdk_bdev_desc *desc = ctx; 4936 4937 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 4938 } 4939 4940 int 4941 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 4942 { 4943 struct spdk_bdev_desc *desc; 4944 int ret; 4945 4946 if (size == bdev->blockcnt) { 4947 return 0; 4948 } 4949 4950 spdk_spin_lock(&bdev->internal.spinlock); 4951 4952 /* bdev has open descriptors */ 4953 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 4954 bdev->blockcnt > size) { 4955 ret = -EBUSY; 4956 } else { 4957 bdev->blockcnt = size; 4958 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 4959 event_notify(desc, _resize_notify); 4960 } 4961 ret = 0; 4962 } 4963 4964 spdk_spin_unlock(&bdev->internal.spinlock); 4965 4966 return ret; 4967 } 4968 4969 /* 4970 * Convert I/O offset and length from bytes to blocks. 4971 * 4972 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 4973 */ 4974 static uint64_t 4975 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 4976 uint64_t num_bytes, uint64_t *num_blocks) 4977 { 4978 uint32_t block_size = bdev->blocklen; 4979 uint8_t shift_cnt; 4980 4981 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 4982 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 4983 shift_cnt = spdk_u32log2(block_size); 4984 *offset_blocks = offset_bytes >> shift_cnt; 4985 *num_blocks = num_bytes >> shift_cnt; 4986 return (offset_bytes - (*offset_blocks << shift_cnt)) | 4987 (num_bytes - (*num_blocks << shift_cnt)); 4988 } else { 4989 *offset_blocks = offset_bytes / block_size; 4990 *num_blocks = num_bytes / block_size; 4991 return (offset_bytes % block_size) | (num_bytes % block_size); 4992 } 4993 } 4994 4995 static bool 4996 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 4997 { 4998 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 4999 * has been an overflow and hence the offset has been wrapped around */ 5000 if (offset_blocks + num_blocks < offset_blocks) { 5001 return false; 5002 } 5003 5004 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5005 if (offset_blocks + num_blocks > bdev->blockcnt) { 5006 return false; 5007 } 5008 5009 return true; 5010 } 5011 5012 static void 5013 bdev_seek_complete_cb(void *ctx) 5014 { 5015 struct spdk_bdev_io *bdev_io = ctx; 5016 5017 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5018 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5019 } 5020 5021 static int 5022 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5023 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5024 spdk_bdev_io_completion_cb cb, void *cb_arg) 5025 { 5026 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5027 struct spdk_bdev_io *bdev_io; 5028 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5029 5030 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == 
SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5031 5032 /* Check if offset_blocks is valid looking at the validity of one block */ 5033 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5034 return -EINVAL; 5035 } 5036 5037 bdev_io = bdev_channel_get_io(channel); 5038 if (!bdev_io) { 5039 return -ENOMEM; 5040 } 5041 5042 bdev_io->internal.ch = channel; 5043 bdev_io->internal.desc = desc; 5044 bdev_io->type = io_type; 5045 bdev_io->u.bdev.offset_blocks = offset_blocks; 5046 bdev_io->u.bdev.memory_domain = NULL; 5047 bdev_io->u.bdev.memory_domain_ctx = NULL; 5048 bdev_io->u.bdev.accel_sequence = NULL; 5049 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5050 5051 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5052 /* In case bdev doesn't support seek to next data/hole offset, 5053 * it is assumed that only data and no holes are present */ 5054 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5055 bdev_io->u.bdev.seek.offset = offset_blocks; 5056 } else { 5057 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5058 } 5059 5060 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5061 return 0; 5062 } 5063 5064 bdev_io_submit(bdev_io); 5065 return 0; 5066 } 5067 5068 int 5069 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5070 uint64_t offset_blocks, 5071 spdk_bdev_io_completion_cb cb, void *cb_arg) 5072 { 5073 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5074 } 5075 5076 int 5077 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5078 uint64_t offset_blocks, 5079 spdk_bdev_io_completion_cb cb, void *cb_arg) 5080 { 5081 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5082 } 5083 5084 uint64_t 5085 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io) 5086 { 5087 return bdev_io->u.bdev.seek.offset; 5088 } 5089 5090 static int 5091 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5092 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5093 spdk_bdev_io_completion_cb cb, void *cb_arg) 5094 { 5095 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5096 struct spdk_bdev_io *bdev_io; 5097 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5098 5099 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5100 return -EINVAL; 5101 } 5102 5103 bdev_io = bdev_channel_get_io(channel); 5104 if (!bdev_io) { 5105 return -ENOMEM; 5106 } 5107 5108 bdev_io->internal.ch = channel; 5109 bdev_io->internal.desc = desc; 5110 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5111 bdev_io->u.bdev.iovs = &bdev_io->iov; 5112 bdev_io->u.bdev.iovs[0].iov_base = buf; 5113 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5114 bdev_io->u.bdev.iovcnt = 1; 5115 bdev_io->u.bdev.md_buf = md_buf; 5116 bdev_io->u.bdev.num_blocks = num_blocks; 5117 bdev_io->u.bdev.offset_blocks = offset_blocks; 5118 bdev_io->u.bdev.memory_domain = NULL; 5119 bdev_io->u.bdev.memory_domain_ctx = NULL; 5120 bdev_io->u.bdev.accel_sequence = NULL; 5121 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5122 5123 bdev_io_submit(bdev_io); 5124 return 0; 5125 } 5126 5127 int 5128 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5129 void *buf, uint64_t offset, uint64_t nbytes, 5130 spdk_bdev_io_completion_cb cb, void *cb_arg) 5131 { 5132 uint64_t offset_blocks, num_blocks; 5133 5134 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5135 nbytes, &num_blocks) != 0) { 5136 return -EINVAL; 5137 } 5138 5139 
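/* Editor's note (descriptive comment, not in upstream bdev.c): byte-based convenience wrapper; offset/nbytes were validated and converted to blocks above, so simply delegate to the block-based API. */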
return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5140 } 5141 5142 int 5143 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5144 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5145 spdk_bdev_io_completion_cb cb, void *cb_arg) 5146 { 5147 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5148 } 5149 5150 int 5151 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5152 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5153 spdk_bdev_io_completion_cb cb, void *cb_arg) 5154 { 5155 struct iovec iov = { 5156 .iov_base = buf, 5157 }; 5158 5159 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5160 return -EINVAL; 5161 } 5162 5163 if (md_buf && !_is_buf_allocated(&iov)) { 5164 return -EINVAL; 5165 } 5166 5167 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5168 cb, cb_arg); 5169 } 5170 5171 int 5172 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5173 struct iovec *iov, int iovcnt, 5174 uint64_t offset, uint64_t nbytes, 5175 spdk_bdev_io_completion_cb cb, void *cb_arg) 5176 { 5177 uint64_t offset_blocks, num_blocks; 5178 5179 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5180 nbytes, &num_blocks) != 0) { 5181 return -EINVAL; 5182 } 5183 5184 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5185 } 5186 5187 static int 5188 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5189 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 5190 uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx, 5191 struct spdk_accel_sequence *seq, 5192 spdk_bdev_io_completion_cb cb, void *cb_arg) 5193 { 5194 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5195 struct spdk_bdev_io *bdev_io; 5196 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5197 5198 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5199 return -EINVAL; 5200 } 5201 5202 bdev_io = bdev_channel_get_io(channel); 5203 if (!bdev_io) { 5204 return -ENOMEM; 5205 } 5206 5207 bdev_io->internal.ch = channel; 5208 bdev_io->internal.desc = desc; 5209 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5210 bdev_io->u.bdev.iovs = iov; 5211 bdev_io->u.bdev.iovcnt = iovcnt; 5212 bdev_io->u.bdev.md_buf = md_buf; 5213 bdev_io->u.bdev.num_blocks = num_blocks; 5214 bdev_io->u.bdev.offset_blocks = offset_blocks; 5215 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5216 bdev_io->internal.memory_domain = domain; 5217 bdev_io->internal.memory_domain_ctx = domain_ctx; 5218 bdev_io->internal.accel_sequence = seq; 5219 bdev_io->u.bdev.memory_domain = domain; 5220 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5221 bdev_io->u.bdev.accel_sequence = seq; 5222 5223 _bdev_io_submit_ext(desc, bdev_io); 5224 5225 return 0; 5226 } 5227 5228 int 5229 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5230 struct iovec *iov, int iovcnt, 5231 uint64_t offset_blocks, uint64_t num_blocks, 5232 spdk_bdev_io_completion_cb cb, void *cb_arg) 5233 { 5234 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5235 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5236 } 5237 5238 int 5239 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5240 struct iovec *iov, int iovcnt, void *md_buf, 5241 uint64_t 
offset_blocks, uint64_t num_blocks, 5242 spdk_bdev_io_completion_cb cb, void *cb_arg) 5243 { 5244 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5245 return -EINVAL; 5246 } 5247 5248 if (md_buf && !_is_buf_allocated(iov)) { 5249 return -EINVAL; 5250 } 5251 5252 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5253 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5254 } 5255 5256 static inline bool 5257 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 5258 { 5259 /* 5260 * We check if opts size is at least of size when we first introduced 5261 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 5262 * are not checked internal. 5263 */ 5264 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5265 sizeof(opts->metadata) && 5266 opts->size <= sizeof(*opts) && 5267 /* When memory domain is used, the user must provide data buffers */ 5268 (!opts->memory_domain || (iov && iov[0].iov_base)); 5269 } 5270 5271 int 5272 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5273 struct iovec *iov, int iovcnt, 5274 uint64_t offset_blocks, uint64_t num_blocks, 5275 spdk_bdev_io_completion_cb cb, void *cb_arg, 5276 struct spdk_bdev_ext_io_opts *opts) 5277 { 5278 void *md = NULL; 5279 5280 if (opts) { 5281 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5282 return -EINVAL; 5283 } 5284 md = opts->metadata; 5285 } 5286 5287 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5288 return -EINVAL; 5289 } 5290 5291 if (md && !_is_buf_allocated(iov)) { 5292 return -EINVAL; 5293 } 5294 5295 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5296 num_blocks, 5297 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5298 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5299 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5300 cb, cb_arg); 5301 } 5302 5303 static int 5304 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5305 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5306 spdk_bdev_io_completion_cb cb, void *cb_arg) 5307 { 5308 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5309 struct spdk_bdev_io *bdev_io; 5310 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5311 5312 if (!desc->write) { 5313 return -EBADF; 5314 } 5315 5316 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5317 return -EINVAL; 5318 } 5319 5320 bdev_io = bdev_channel_get_io(channel); 5321 if (!bdev_io) { 5322 return -ENOMEM; 5323 } 5324 5325 bdev_io->internal.ch = channel; 5326 bdev_io->internal.desc = desc; 5327 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5328 bdev_io->u.bdev.iovs = &bdev_io->iov; 5329 bdev_io->u.bdev.iovs[0].iov_base = buf; 5330 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5331 bdev_io->u.bdev.iovcnt = 1; 5332 bdev_io->u.bdev.md_buf = md_buf; 5333 bdev_io->u.bdev.num_blocks = num_blocks; 5334 bdev_io->u.bdev.offset_blocks = offset_blocks; 5335 bdev_io->u.bdev.memory_domain = NULL; 5336 bdev_io->u.bdev.memory_domain_ctx = NULL; 5337 bdev_io->u.bdev.accel_sequence = NULL; 5338 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5339 5340 bdev_io_submit(bdev_io); 5341 return 0; 5342 } 5343 5344 int 5345 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5346 void *buf, uint64_t offset, uint64_t nbytes, 5347 spdk_bdev_io_completion_cb cb, void *cb_arg) 5348 { 5349 uint64_t offset_blocks, num_blocks; 5350 5351 if 
(bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5352 nbytes, &num_blocks) != 0) { 5353 return -EINVAL; 5354 } 5355 5356 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5357 } 5358 5359 int 5360 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5361 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5362 spdk_bdev_io_completion_cb cb, void *cb_arg) 5363 { 5364 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5365 cb, cb_arg); 5366 } 5367 5368 int 5369 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5370 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5371 spdk_bdev_io_completion_cb cb, void *cb_arg) 5372 { 5373 struct iovec iov = { 5374 .iov_base = buf, 5375 }; 5376 5377 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5378 return -EINVAL; 5379 } 5380 5381 if (md_buf && !_is_buf_allocated(&iov)) { 5382 return -EINVAL; 5383 } 5384 5385 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5386 cb, cb_arg); 5387 } 5388 5389 static int 5390 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5391 struct iovec *iov, int iovcnt, void *md_buf, 5392 uint64_t offset_blocks, uint64_t num_blocks, 5393 struct spdk_memory_domain *domain, void *domain_ctx, 5394 struct spdk_accel_sequence *seq, 5395 spdk_bdev_io_completion_cb cb, void *cb_arg) 5396 { 5397 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5398 struct spdk_bdev_io *bdev_io; 5399 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5400 5401 if (!desc->write) { 5402 return -EBADF; 5403 } 5404 5405 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5406 return -EINVAL; 5407 } 5408 5409 bdev_io = bdev_channel_get_io(channel); 5410 if (!bdev_io) { 5411 return -ENOMEM; 5412 } 5413 5414 bdev_io->internal.ch = channel; 5415 bdev_io->internal.desc = desc; 5416 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5417 bdev_io->u.bdev.iovs = iov; 5418 bdev_io->u.bdev.iovcnt = iovcnt; 5419 bdev_io->u.bdev.md_buf = md_buf; 5420 bdev_io->u.bdev.num_blocks = num_blocks; 5421 bdev_io->u.bdev.offset_blocks = offset_blocks; 5422 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5423 bdev_io->internal.memory_domain = domain; 5424 bdev_io->internal.memory_domain_ctx = domain_ctx; 5425 bdev_io->internal.accel_sequence = seq; 5426 bdev_io->u.bdev.memory_domain = domain; 5427 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5428 bdev_io->u.bdev.accel_sequence = seq; 5429 5430 _bdev_io_submit_ext(desc, bdev_io); 5431 5432 return 0; 5433 } 5434 5435 int 5436 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5437 struct iovec *iov, int iovcnt, 5438 uint64_t offset, uint64_t len, 5439 spdk_bdev_io_completion_cb cb, void *cb_arg) 5440 { 5441 uint64_t offset_blocks, num_blocks; 5442 5443 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5444 len, &num_blocks) != 0) { 5445 return -EINVAL; 5446 } 5447 5448 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5449 } 5450 5451 int 5452 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5453 struct iovec *iov, int iovcnt, 5454 uint64_t offset_blocks, uint64_t num_blocks, 5455 spdk_bdev_io_completion_cb cb, void *cb_arg) 5456 { 5457 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5458 
num_blocks, NULL, NULL, NULL, cb, cb_arg); 5459 } 5460 5461 int 5462 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5463 struct iovec *iov, int iovcnt, void *md_buf, 5464 uint64_t offset_blocks, uint64_t num_blocks, 5465 spdk_bdev_io_completion_cb cb, void *cb_arg) 5466 { 5467 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5468 return -EINVAL; 5469 } 5470 5471 if (md_buf && !_is_buf_allocated(iov)) { 5472 return -EINVAL; 5473 } 5474 5475 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5476 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5477 } 5478 5479 int 5480 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5481 struct iovec *iov, int iovcnt, 5482 uint64_t offset_blocks, uint64_t num_blocks, 5483 spdk_bdev_io_completion_cb cb, void *cb_arg, 5484 struct spdk_bdev_ext_io_opts *opts) 5485 { 5486 void *md = NULL; 5487 5488 if (opts) { 5489 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5490 return -EINVAL; 5491 } 5492 md = opts->metadata; 5493 } 5494 5495 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5496 return -EINVAL; 5497 } 5498 5499 if (md && !_is_buf_allocated(iov)) { 5500 return -EINVAL; 5501 } 5502 5503 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5504 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5505 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5506 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5507 cb, cb_arg); 5508 } 5509 5510 static void 5511 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5512 { 5513 struct spdk_bdev_io *parent_io = cb_arg; 5514 struct spdk_bdev *bdev = parent_io->bdev; 5515 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5516 int i, rc = 0; 5517 5518 if (!success) { 5519 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5520 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5521 spdk_bdev_free_io(bdev_io); 5522 return; 5523 } 5524 5525 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5526 rc = memcmp(read_buf, 5527 parent_io->u.bdev.iovs[i].iov_base, 5528 parent_io->u.bdev.iovs[i].iov_len); 5529 if (rc) { 5530 break; 5531 } 5532 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5533 } 5534 5535 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 5536 rc = memcmp(bdev_io->u.bdev.md_buf, 5537 parent_io->u.bdev.md_buf, 5538 spdk_bdev_get_md_size(bdev)); 5539 } 5540 5541 spdk_bdev_free_io(bdev_io); 5542 5543 if (rc == 0) { 5544 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5545 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5546 } else { 5547 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5548 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5549 } 5550 } 5551 5552 static void 5553 bdev_compare_do_read(void *_bdev_io) 5554 { 5555 struct spdk_bdev_io *bdev_io = _bdev_io; 5556 int rc; 5557 5558 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5559 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5560 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5561 bdev_compare_do_read_done, bdev_io); 5562 5563 if (rc == -ENOMEM) { 5564 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5565 } else if (rc != 0) { 5566 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5567 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5568 } 5569 } 5570 5571 
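/*
 * Illustrative sketch (editor's addition, not part of upstream bdev.c): the
 * -ENOMEM retry pattern used internally by bdev_compare_do_read() above,
 * expressed with the public spdk_bdev_queue_io_wait() API that external
 * callers would use. The example_read_ctx structure and the example_*
 * identifiers are hypothetical. Guarded by #if 0 so it is never compiled.
 */
#if 0
struct example_read_ctx {
	struct spdk_bdev_desc		*desc;
	struct spdk_io_channel		*io_ch;
	void				*buf;
	uint64_t			offset_blocks;
	uint64_t			num_blocks;
	struct spdk_bdev_io_wait_entry	wait_entry;
};

static void
example_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct example_read_ctx *ctx = cb_arg;

	spdk_bdev_free_io(bdev_io);
	SPDK_NOTICELOG("read of %" PRIu64 " blocks %s\n", ctx->num_blocks,
		       success ? "succeeded" : "failed");
}

static void
example_submit_read(void *arg)
{
	struct example_read_ctx *ctx = arg;
	int rc;

	rc = spdk_bdev_read_blocks(ctx->desc, ctx->io_ch, ctx->buf, ctx->offset_blocks,
				   ctx->num_blocks, example_read_done, ctx);
	if (rc == -ENOMEM) {
		/* No spdk_bdev_io is available right now; ask the bdev layer to call
		 * example_submit_read() again once one is freed on this channel.
		 */
		ctx->wait_entry.bdev = spdk_bdev_desc_get_bdev(ctx->desc);
		ctx->wait_entry.cb_fn = example_submit_read;
		ctx->wait_entry.cb_arg = ctx;
		spdk_bdev_queue_io_wait(ctx->wait_entry.bdev, ctx->io_ch, &ctx->wait_entry);
	} else if (rc != 0) {
		SPDK_ERRLOG("read submission failed: %s\n", spdk_strerror(-rc));
	}
}
#endif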
static int 5572 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5573 struct iovec *iov, int iovcnt, void *md_buf, 5574 uint64_t offset_blocks, uint64_t num_blocks, 5575 spdk_bdev_io_completion_cb cb, void *cb_arg) 5576 { 5577 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5578 struct spdk_bdev_io *bdev_io; 5579 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5580 5581 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5582 return -EINVAL; 5583 } 5584 5585 bdev_io = bdev_channel_get_io(channel); 5586 if (!bdev_io) { 5587 return -ENOMEM; 5588 } 5589 5590 bdev_io->internal.ch = channel; 5591 bdev_io->internal.desc = desc; 5592 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5593 bdev_io->u.bdev.iovs = iov; 5594 bdev_io->u.bdev.iovcnt = iovcnt; 5595 bdev_io->u.bdev.md_buf = md_buf; 5596 bdev_io->u.bdev.num_blocks = num_blocks; 5597 bdev_io->u.bdev.offset_blocks = offset_blocks; 5598 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5599 bdev_io->u.bdev.memory_domain = NULL; 5600 bdev_io->u.bdev.memory_domain_ctx = NULL; 5601 bdev_io->u.bdev.accel_sequence = NULL; 5602 5603 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5604 bdev_io_submit(bdev_io); 5605 return 0; 5606 } 5607 5608 bdev_compare_do_read(bdev_io); 5609 5610 return 0; 5611 } 5612 5613 int 5614 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5615 struct iovec *iov, int iovcnt, 5616 uint64_t offset_blocks, uint64_t num_blocks, 5617 spdk_bdev_io_completion_cb cb, void *cb_arg) 5618 { 5619 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5620 num_blocks, cb, cb_arg); 5621 } 5622 5623 int 5624 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5625 struct iovec *iov, int iovcnt, void *md_buf, 5626 uint64_t offset_blocks, uint64_t num_blocks, 5627 spdk_bdev_io_completion_cb cb, void *cb_arg) 5628 { 5629 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5630 return -EINVAL; 5631 } 5632 5633 if (md_buf && !_is_buf_allocated(iov)) { 5634 return -EINVAL; 5635 } 5636 5637 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5638 num_blocks, cb, cb_arg); 5639 } 5640 5641 static int 5642 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5643 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5644 spdk_bdev_io_completion_cb cb, void *cb_arg) 5645 { 5646 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5647 struct spdk_bdev_io *bdev_io; 5648 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5649 5650 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5651 return -EINVAL; 5652 } 5653 5654 bdev_io = bdev_channel_get_io(channel); 5655 if (!bdev_io) { 5656 return -ENOMEM; 5657 } 5658 5659 bdev_io->internal.ch = channel; 5660 bdev_io->internal.desc = desc; 5661 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5662 bdev_io->u.bdev.iovs = &bdev_io->iov; 5663 bdev_io->u.bdev.iovs[0].iov_base = buf; 5664 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5665 bdev_io->u.bdev.iovcnt = 1; 5666 bdev_io->u.bdev.md_buf = md_buf; 5667 bdev_io->u.bdev.num_blocks = num_blocks; 5668 bdev_io->u.bdev.offset_blocks = offset_blocks; 5669 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5670 bdev_io->u.bdev.memory_domain = NULL; 5671 bdev_io->u.bdev.memory_domain_ctx = NULL; 5672 bdev_io->u.bdev.accel_sequence = NULL; 5673 5674 if (bdev_io_type_supported(bdev, 
SPDK_BDEV_IO_TYPE_COMPARE)) { 5675 bdev_io_submit(bdev_io); 5676 return 0; 5677 } 5678 5679 bdev_compare_do_read(bdev_io); 5680 5681 return 0; 5682 } 5683 5684 int 5685 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5686 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5687 spdk_bdev_io_completion_cb cb, void *cb_arg) 5688 { 5689 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5690 cb, cb_arg); 5691 } 5692 5693 int 5694 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5695 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5696 spdk_bdev_io_completion_cb cb, void *cb_arg) 5697 { 5698 struct iovec iov = { 5699 .iov_base = buf, 5700 }; 5701 5702 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5703 return -EINVAL; 5704 } 5705 5706 if (md_buf && !_is_buf_allocated(&iov)) { 5707 return -EINVAL; 5708 } 5709 5710 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5711 cb, cb_arg); 5712 } 5713 5714 static void 5715 bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) 5716 { 5717 struct spdk_bdev_io *bdev_io = ctx; 5718 5719 if (unlock_status) { 5720 SPDK_ERRLOG("LBA range unlock failed\n"); 5721 } 5722 5723 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 5724 false, bdev_io->internal.caller_ctx); 5725 } 5726 5727 static void 5728 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 5729 { 5730 bdev_io->internal.status = status; 5731 5732 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 5733 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5734 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 5735 } 5736 5737 static void 5738 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5739 { 5740 struct spdk_bdev_io *parent_io = cb_arg; 5741 5742 if (!success) { 5743 SPDK_ERRLOG("Compare and write operation failed\n"); 5744 } 5745 5746 spdk_bdev_free_io(bdev_io); 5747 5748 bdev_comparev_and_writev_blocks_unlock(parent_io, 5749 success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 5750 } 5751 5752 static void 5753 bdev_compare_and_write_do_write(void *_bdev_io) 5754 { 5755 struct spdk_bdev_io *bdev_io = _bdev_io; 5756 int rc; 5757 5758 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 5759 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5760 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 5761 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5762 bdev_compare_and_write_do_write_done, bdev_io); 5763 5764 5765 if (rc == -ENOMEM) { 5766 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 5767 } else if (rc != 0) { 5768 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5769 } 5770 } 5771 5772 static void 5773 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5774 { 5775 struct spdk_bdev_io *parent_io = cb_arg; 5776 5777 spdk_bdev_free_io(bdev_io); 5778 5779 if (!success) { 5780 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 5781 return; 5782 } 5783 5784 bdev_compare_and_write_do_write(parent_io); 5785 } 5786 5787 static void 5788 bdev_compare_and_write_do_compare(void *_bdev_io) 5789 { 5790 struct spdk_bdev_io *bdev_io = _bdev_io; 5791 int rc; 5792 5793 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 5794 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 5795 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5796 bdev_compare_and_write_do_compare_done, bdev_io); 5797 5798 if (rc == -ENOMEM) { 5799 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 5800 } else if (rc != 0) { 5801 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 5802 } 5803 } 5804 5805 static void 5806 bdev_comparev_and_writev_blocks_locked(void *ctx, int status) 5807 { 5808 struct spdk_bdev_io *bdev_io = ctx; 5809 5810 if (status) { 5811 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 5812 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5813 return; 5814 } 5815 5816 bdev_compare_and_write_do_compare(bdev_io); 5817 } 5818 5819 int 5820 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5821 struct iovec *compare_iov, int compare_iovcnt, 5822 struct iovec *write_iov, int write_iovcnt, 5823 uint64_t offset_blocks, uint64_t num_blocks, 5824 spdk_bdev_io_completion_cb cb, void *cb_arg) 5825 { 5826 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5827 struct spdk_bdev_io *bdev_io; 5828 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5829 5830 if (!desc->write) { 5831 return -EBADF; 5832 } 5833 5834 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5835 return -EINVAL; 5836 } 5837 5838 if (num_blocks > bdev->acwu) { 5839 return -EINVAL; 5840 } 5841 5842 bdev_io = bdev_channel_get_io(channel); 5843 if (!bdev_io) { 5844 return -ENOMEM; 5845 } 5846 5847 bdev_io->internal.ch = channel; 5848 bdev_io->internal.desc = desc; 5849 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 5850 bdev_io->u.bdev.iovs = compare_iov; 5851 bdev_io->u.bdev.iovcnt = compare_iovcnt; 5852 bdev_io->u.bdev.fused_iovs = write_iov; 5853 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 5854 bdev_io->u.bdev.md_buf = NULL; 5855 bdev_io->u.bdev.num_blocks = num_blocks; 5856 bdev_io->u.bdev.offset_blocks = offset_blocks; 5857 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5858 bdev_io->u.bdev.memory_domain = NULL; 5859 
bdev_io->u.bdev.memory_domain_ctx = NULL; 5860 bdev_io->u.bdev.accel_sequence = NULL; 5861 5862 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 5863 bdev_io_submit(bdev_io); 5864 return 0; 5865 } 5866 5867 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 5868 bdev_comparev_and_writev_blocks_locked, bdev_io); 5869 } 5870 5871 int 5872 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5873 struct iovec *iov, int iovcnt, 5874 uint64_t offset_blocks, uint64_t num_blocks, 5875 bool populate, 5876 spdk_bdev_io_completion_cb cb, void *cb_arg) 5877 { 5878 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5879 struct spdk_bdev_io *bdev_io; 5880 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5881 5882 if (!desc->write) { 5883 return -EBADF; 5884 } 5885 5886 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5887 return -EINVAL; 5888 } 5889 5890 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 5891 return -ENOTSUP; 5892 } 5893 5894 bdev_io = bdev_channel_get_io(channel); 5895 if (!bdev_io) { 5896 return -ENOMEM; 5897 } 5898 5899 bdev_io->internal.ch = channel; 5900 bdev_io->internal.desc = desc; 5901 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 5902 bdev_io->u.bdev.num_blocks = num_blocks; 5903 bdev_io->u.bdev.offset_blocks = offset_blocks; 5904 bdev_io->u.bdev.iovs = iov; 5905 bdev_io->u.bdev.iovcnt = iovcnt; 5906 bdev_io->u.bdev.md_buf = NULL; 5907 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 5908 bdev_io->u.bdev.zcopy.commit = 0; 5909 bdev_io->u.bdev.zcopy.start = 1; 5910 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5911 bdev_io->u.bdev.memory_domain = NULL; 5912 bdev_io->u.bdev.memory_domain_ctx = NULL; 5913 bdev_io->u.bdev.accel_sequence = NULL; 5914 5915 bdev_io_submit(bdev_io); 5916 5917 return 0; 5918 } 5919 5920 int 5921 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 5922 spdk_bdev_io_completion_cb cb, void *cb_arg) 5923 { 5924 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 5925 return -EINVAL; 5926 } 5927 5928 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 5929 bdev_io->u.bdev.zcopy.start = 0; 5930 bdev_io->internal.caller_ctx = cb_arg; 5931 bdev_io->internal.cb = cb; 5932 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 5933 5934 bdev_io_submit(bdev_io); 5935 5936 return 0; 5937 } 5938 5939 int 5940 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5941 uint64_t offset, uint64_t len, 5942 spdk_bdev_io_completion_cb cb, void *cb_arg) 5943 { 5944 uint64_t offset_blocks, num_blocks; 5945 5946 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5947 len, &num_blocks) != 0) { 5948 return -EINVAL; 5949 } 5950 5951 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5952 } 5953 5954 int 5955 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5956 uint64_t offset_blocks, uint64_t num_blocks, 5957 spdk_bdev_io_completion_cb cb, void *cb_arg) 5958 { 5959 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5960 struct spdk_bdev_io *bdev_io; 5961 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5962 5963 if (!desc->write) { 5964 return -EBADF; 5965 } 5966 5967 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5968 return -EINVAL; 5969 } 5970 5971 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 5972 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 5973 return -ENOTSUP; 5974 } 5975 5976 bdev_io = bdev_channel_get_io(channel); 5977 5978 if (!bdev_io) { 5979 return -ENOMEM; 5980 } 5981 5982 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 5983 bdev_io->internal.ch = channel; 5984 bdev_io->internal.desc = desc; 5985 bdev_io->u.bdev.offset_blocks = offset_blocks; 5986 bdev_io->u.bdev.num_blocks = num_blocks; 5987 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5988 bdev_io->u.bdev.memory_domain = NULL; 5989 bdev_io->u.bdev.memory_domain_ctx = NULL; 5990 bdev_io->u.bdev.accel_sequence = NULL; 5991 5992 /* If the write_zeroes size is large and should be split, use the generic split 5993 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported or not. 5994 * 5995 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported 5996 * or emulate it using regular write requests otherwise.
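 * The emulated path, bdev_write_zero_buffer(), services the (already split) request with regular write requests sourced from the bdev layer's preallocated zero buffer; the assert below checks that a single block (including metadata) fits within ZERO_BUFFER_SIZE.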
5997 */ 5998 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 5999 bdev_io->internal.split) { 6000 bdev_io_submit(bdev_io); 6001 return 0; 6002 } 6003 6004 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6005 6006 return bdev_write_zero_buffer(bdev_io); 6007 } 6008 6009 int 6010 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6011 uint64_t offset, uint64_t nbytes, 6012 spdk_bdev_io_completion_cb cb, void *cb_arg) 6013 { 6014 uint64_t offset_blocks, num_blocks; 6015 6016 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6017 nbytes, &num_blocks) != 0) { 6018 return -EINVAL; 6019 } 6020 6021 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6022 } 6023 6024 int 6025 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6026 uint64_t offset_blocks, uint64_t num_blocks, 6027 spdk_bdev_io_completion_cb cb, void *cb_arg) 6028 { 6029 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6030 struct spdk_bdev_io *bdev_io; 6031 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6032 6033 if (!desc->write) { 6034 return -EBADF; 6035 } 6036 6037 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6038 return -EINVAL; 6039 } 6040 6041 if (num_blocks == 0) { 6042 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 6043 return -EINVAL; 6044 } 6045 6046 bdev_io = bdev_channel_get_io(channel); 6047 if (!bdev_io) { 6048 return -ENOMEM; 6049 } 6050 6051 bdev_io->internal.ch = channel; 6052 bdev_io->internal.desc = desc; 6053 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6054 6055 bdev_io->u.bdev.iovs = &bdev_io->iov; 6056 bdev_io->u.bdev.iovs[0].iov_base = NULL; 6057 bdev_io->u.bdev.iovs[0].iov_len = 0; 6058 bdev_io->u.bdev.iovcnt = 1; 6059 6060 bdev_io->u.bdev.offset_blocks = offset_blocks; 6061 bdev_io->u.bdev.num_blocks = num_blocks; 6062 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6063 bdev_io->u.bdev.memory_domain = NULL; 6064 bdev_io->u.bdev.memory_domain_ctx = NULL; 6065 bdev_io->u.bdev.accel_sequence = NULL; 6066 6067 bdev_io_submit(bdev_io); 6068 return 0; 6069 } 6070 6071 int 6072 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6073 uint64_t offset, uint64_t length, 6074 spdk_bdev_io_completion_cb cb, void *cb_arg) 6075 { 6076 uint64_t offset_blocks, num_blocks; 6077 6078 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6079 length, &num_blocks) != 0) { 6080 return -EINVAL; 6081 } 6082 6083 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6084 } 6085 6086 int 6087 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6088 uint64_t offset_blocks, uint64_t num_blocks, 6089 spdk_bdev_io_completion_cb cb, void *cb_arg) 6090 { 6091 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6092 struct spdk_bdev_io *bdev_io; 6093 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6094 6095 if (!desc->write) { 6096 return -EBADF; 6097 } 6098 6099 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6100 return -EINVAL; 6101 } 6102 6103 bdev_io = bdev_channel_get_io(channel); 6104 if (!bdev_io) { 6105 return -ENOMEM; 6106 } 6107 6108 bdev_io->internal.ch = channel; 6109 bdev_io->internal.desc = desc; 6110 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 6111 bdev_io->u.bdev.iovs = NULL; 6112 bdev_io->u.bdev.iovcnt = 0; 6113 bdev_io->u.bdev.offset_blocks = offset_blocks; 6114 bdev_io->u.bdev.num_blocks = num_blocks; 6115 
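/* A flush carries no data, so there is no memory domain or accel sequence associated with the request. */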
bdev_io->u.bdev.memory_domain = NULL; 6116 bdev_io->u.bdev.memory_domain_ctx = NULL; 6117 bdev_io->u.bdev.accel_sequence = NULL; 6118 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6119 6120 bdev_io_submit(bdev_io); 6121 return 0; 6122 } 6123 6124 static int bdev_reset_poll_for_outstanding_io(void *ctx); 6125 6126 static void 6127 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 6128 { 6129 struct spdk_bdev_channel *ch = _ctx; 6130 struct spdk_bdev_io *bdev_io; 6131 6132 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6133 6134 if (status == -EBUSY) { 6135 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 6136 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 6137 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 6138 } else { 6139 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6140 6141 if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) { 6142 /* If outstanding IOs are still present and reset_io_drain_timeout 6143 * seconds have passed, start the reset. */ 6144 bdev_io_submit_reset(bdev_io); 6145 } else { 6146 /* We still have an in-progress memory domain pull/push or we're 6147 * executing an accel sequence. Since we cannot abort either of those 6148 * operations, fail the reset request. */ 6149 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6150 } 6151 } 6152 } else { 6153 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6154 SPDK_DEBUGLOG(bdev, 6155 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 6156 ch->bdev->name); 6157 /* Mark the completion status as SUCCESS and complete the reset. */ 6158 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 6159 } 6160 } 6161 6162 static void 6163 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6164 struct spdk_io_channel *io_ch, void *_ctx) 6165 { 6166 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 6167 int status = 0; 6168 6169 if (cur_ch->io_outstanding > 0 || 6170 !TAILQ_EMPTY(&cur_ch->io_memory_domain) || 6171 !TAILQ_EMPTY(&cur_ch->io_accel_exec)) { 6172 /* If a channel has outstanding IO, set the status to -EBUSY. This will stop 6173 * further iteration over the rest of the channels and pass non-zero status 6174 * to the callback function.
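 * The callback, bdev_reset_check_outstanding_io_done(), then either polls again later or, once reset_io_drain_timeout has expired, submits or fails the queued reset.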
*/ 6175 status = -EBUSY; 6176 } 6177 spdk_bdev_for_each_channel_continue(i, status); 6178 } 6179 6180 static int 6181 bdev_reset_poll_for_outstanding_io(void *ctx) 6182 { 6183 struct spdk_bdev_channel *ch = ctx; 6184 struct spdk_bdev_io *bdev_io; 6185 6186 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6187 6188 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6189 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6190 bdev_reset_check_outstanding_io_done); 6191 6192 return SPDK_POLLER_BUSY; 6193 } 6194 6195 static void 6196 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6197 { 6198 struct spdk_bdev_channel *ch = _ctx; 6199 struct spdk_bdev_io *bdev_io; 6200 6201 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6202 6203 if (bdev->reset_io_drain_timeout == 0) { 6204 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6205 6206 bdev_io_submit_reset(bdev_io); 6207 return; 6208 } 6209 6210 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6211 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6212 6213 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6214 * submit the reset to the underlying module only if outstanding I/O 6215 * remain after reset_io_drain_timeout seconds have passed. */ 6216 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6217 bdev_reset_check_outstanding_io_done); 6218 } 6219 6220 static void 6221 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6222 struct spdk_io_channel *ch, void *_ctx) 6223 { 6224 struct spdk_bdev_channel *channel; 6225 struct spdk_bdev_mgmt_channel *mgmt_channel; 6226 struct spdk_bdev_shared_resource *shared_resource; 6227 bdev_io_tailq_t tmp_queued; 6228 6229 TAILQ_INIT(&tmp_queued); 6230 6231 channel = __io_ch_to_bdev_ch(ch); 6232 shared_resource = channel->shared_resource; 6233 mgmt_channel = shared_resource->mgmt_ch; 6234 6235 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6236 6237 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6238 /* The QoS object is always valid and readable while 6239 * the channel flag is set, so the lock here should not 6240 * be necessary. We're not in the fast path though, so 6241 * just take it anyway. */ 6242 spdk_spin_lock(&channel->bdev->internal.spinlock); 6243 if (channel->bdev->internal.qos->ch == channel) { 6244 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 6245 } 6246 spdk_spin_unlock(&channel->bdev->internal.spinlock); 6247 } 6248 6249 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6250 bdev_abort_all_buf_io(mgmt_channel, channel); 6251 6252 bdev_abort_all_queued_io(&tmp_queued, channel); 6253 6254 spdk_bdev_for_each_channel_continue(i, 0); 6255 } 6256 6257 static void 6258 bdev_start_reset(void *ctx) 6259 { 6260 struct spdk_bdev_channel *ch = ctx; 6261 6262 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, 6263 bdev_reset_freeze_channel_done); 6264 } 6265 6266 static void 6267 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 6268 { 6269 struct spdk_bdev *bdev = ch->bdev; 6270 6271 assert(!TAILQ_EMPTY(&ch->queued_resets)); 6272 6273 spdk_spin_lock(&bdev->internal.spinlock); 6274 if (bdev->internal.reset_in_progress == NULL) { 6275 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 6276 /* 6277 * Take a channel reference for the target bdev for the life of this 6278 * reset.
This guards against the channel getting destroyed while 6279 * spdk_bdev_for_each_channel() calls related to this reset IO are in 6280 * progress. We will release the reference when this reset is 6281 * completed. 6282 */ 6283 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6284 bdev_start_reset(ch); 6285 } 6286 spdk_spin_unlock(&bdev->internal.spinlock); 6287 } 6288 6289 int 6290 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6291 spdk_bdev_io_completion_cb cb, void *cb_arg) 6292 { 6293 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6294 struct spdk_bdev_io *bdev_io; 6295 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6296 6297 bdev_io = bdev_channel_get_io(channel); 6298 if (!bdev_io) { 6299 return -ENOMEM; 6300 } 6301 6302 bdev_io->internal.ch = channel; 6303 bdev_io->internal.desc = desc; 6304 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6305 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6306 bdev_io->u.reset.ch_ref = NULL; 6307 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6308 6309 spdk_spin_lock(&bdev->internal.spinlock); 6310 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 6311 spdk_spin_unlock(&bdev->internal.spinlock); 6312 6313 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 6314 internal.ch_link); 6315 6316 bdev_channel_start_reset(channel); 6317 6318 return 0; 6319 } 6320 6321 void 6322 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6323 struct spdk_bdev_io_stat *stat) 6324 { 6325 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6326 6327 bdev_get_io_stat(stat, channel->stat); 6328 } 6329 6330 static void 6331 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6332 { 6333 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6334 6335 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6336 bdev_iostat_ctx->cb_arg, 0); 6337 free(bdev_iostat_ctx); 6338 } 6339 6340 static void 6341 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6342 struct spdk_io_channel *ch, void *_ctx) 6343 { 6344 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6345 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6346 6347 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6348 spdk_bdev_for_each_channel_continue(i, 0); 6349 } 6350 6351 void 6352 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6353 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6354 { 6355 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6356 6357 assert(bdev != NULL); 6358 assert(stat != NULL); 6359 assert(cb != NULL); 6360 6361 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6362 if (bdev_iostat_ctx == NULL) { 6363 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6364 cb(bdev, stat, cb_arg, -ENOMEM); 6365 return; 6366 } 6367 6368 bdev_iostat_ctx->stat = stat; 6369 bdev_iostat_ctx->cb = cb; 6370 bdev_iostat_ctx->cb_arg = cb_arg; 6371 6372 /* Start with the statistics from previously deleted channels. */ 6373 spdk_spin_lock(&bdev->internal.spinlock); 6374 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6375 spdk_spin_unlock(&bdev->internal.spinlock); 6376 6377 /* Then iterate and add the statistics from each existing channel. 
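 * spdk_bdev_for_each_channel() runs bdev_get_each_channel_stat() on each channel's thread and then calls bdev_get_device_stat_done(), which hands the aggregated result to the caller's callback.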
*/ 6378 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6379 bdev_get_device_stat_done); 6380 } 6381 6382 struct bdev_iostat_reset_ctx { 6383 enum spdk_bdev_reset_stat_mode mode; 6384 bdev_reset_device_stat_cb cb; 6385 void *cb_arg; 6386 }; 6387 6388 static void 6389 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6390 { 6391 struct bdev_iostat_reset_ctx *ctx = _ctx; 6392 6393 ctx->cb(bdev, ctx->cb_arg, 0); 6394 6395 free(ctx); 6396 } 6397 6398 static void 6399 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6400 struct spdk_io_channel *ch, void *_ctx) 6401 { 6402 struct bdev_iostat_reset_ctx *ctx = _ctx; 6403 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6404 6405 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6406 6407 spdk_bdev_for_each_channel_continue(i, 0); 6408 } 6409 6410 void 6411 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6412 bdev_reset_device_stat_cb cb, void *cb_arg) 6413 { 6414 struct bdev_iostat_reset_ctx *ctx; 6415 6416 assert(bdev != NULL); 6417 assert(cb != NULL); 6418 6419 ctx = calloc(1, sizeof(*ctx)); 6420 if (ctx == NULL) { 6421 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6422 cb(bdev, cb_arg, -ENOMEM); 6423 return; 6424 } 6425 6426 ctx->mode = mode; 6427 ctx->cb = cb; 6428 ctx->cb_arg = cb_arg; 6429 6430 spdk_spin_lock(&bdev->internal.spinlock); 6431 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6432 spdk_spin_unlock(&bdev->internal.spinlock); 6433 6434 spdk_bdev_for_each_channel(bdev, 6435 bdev_reset_each_channel_stat, 6436 ctx, 6437 bdev_reset_device_stat_done); 6438 } 6439 6440 int 6441 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6442 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6443 spdk_bdev_io_completion_cb cb, void *cb_arg) 6444 { 6445 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6446 struct spdk_bdev_io *bdev_io; 6447 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6448 6449 if (!desc->write) { 6450 return -EBADF; 6451 } 6452 6453 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6454 return -ENOTSUP; 6455 } 6456 6457 bdev_io = bdev_channel_get_io(channel); 6458 if (!bdev_io) { 6459 return -ENOMEM; 6460 } 6461 6462 bdev_io->internal.ch = channel; 6463 bdev_io->internal.desc = desc; 6464 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6465 bdev_io->u.nvme_passthru.cmd = *cmd; 6466 bdev_io->u.nvme_passthru.buf = buf; 6467 bdev_io->u.nvme_passthru.nbytes = nbytes; 6468 bdev_io->u.nvme_passthru.md_buf = NULL; 6469 bdev_io->u.nvme_passthru.md_len = 0; 6470 6471 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6472 6473 bdev_io_submit(bdev_io); 6474 return 0; 6475 } 6476 6477 int 6478 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6479 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6480 spdk_bdev_io_completion_cb cb, void *cb_arg) 6481 { 6482 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6483 struct spdk_bdev_io *bdev_io; 6484 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6485 6486 if (!desc->write) { 6487 /* 6488 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6489 * to easily determine if the command is a read or write, but for now just 6490 * do not allow io_passthru with a read-only descriptor. 
6491 */ 6492 return -EBADF; 6493 } 6494 6495 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6496 return -ENOTSUP; 6497 } 6498 6499 bdev_io = bdev_channel_get_io(channel); 6500 if (!bdev_io) { 6501 return -ENOMEM; 6502 } 6503 6504 bdev_io->internal.ch = channel; 6505 bdev_io->internal.desc = desc; 6506 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6507 bdev_io->u.nvme_passthru.cmd = *cmd; 6508 bdev_io->u.nvme_passthru.buf = buf; 6509 bdev_io->u.nvme_passthru.nbytes = nbytes; 6510 bdev_io->u.nvme_passthru.md_buf = NULL; 6511 bdev_io->u.nvme_passthru.md_len = 0; 6512 6513 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6514 6515 bdev_io_submit(bdev_io); 6516 return 0; 6517 } 6518 6519 int 6520 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6521 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6522 spdk_bdev_io_completion_cb cb, void *cb_arg) 6523 { 6524 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6525 struct spdk_bdev_io *bdev_io; 6526 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6527 6528 if (!desc->write) { 6529 /* 6530 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6531 * to easily determine if the command is a read or write, but for now just 6532 * do not allow io_passthru with a read-only descriptor. 6533 */ 6534 return -EBADF; 6535 } 6536 6537 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6538 return -ENOTSUP; 6539 } 6540 6541 bdev_io = bdev_channel_get_io(channel); 6542 if (!bdev_io) { 6543 return -ENOMEM; 6544 } 6545 6546 bdev_io->internal.ch = channel; 6547 bdev_io->internal.desc = desc; 6548 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6549 bdev_io->u.nvme_passthru.cmd = *cmd; 6550 bdev_io->u.nvme_passthru.buf = buf; 6551 bdev_io->u.nvme_passthru.nbytes = nbytes; 6552 bdev_io->u.nvme_passthru.md_buf = md_buf; 6553 bdev_io->u.nvme_passthru.md_len = md_len; 6554 6555 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6556 6557 bdev_io_submit(bdev_io); 6558 return 0; 6559 } 6560 6561 static void bdev_abort_retry(void *ctx); 6562 static void bdev_abort(struct spdk_bdev_io *parent_io); 6563 6564 static void 6565 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6566 { 6567 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6568 struct spdk_bdev_io *parent_io = cb_arg; 6569 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6570 6571 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6572 6573 spdk_bdev_free_io(bdev_io); 6574 6575 if (!success) { 6576 /* Check if the target I/O completed in the meantime. */ 6577 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6578 if (tmp_io == bio_to_abort) { 6579 break; 6580 } 6581 } 6582 6583 /* If the target I/O still exists, set the parent to failed. 
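 * The parent abort is completed only after all of its child aborts have finished, which is tracked via split_outstanding below.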
*/ 6584 if (tmp_io != NULL) { 6585 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6586 } 6587 } 6588 6589 parent_io->u.bdev.split_outstanding--; 6590 if (parent_io->u.bdev.split_outstanding == 0) { 6591 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6592 bdev_abort_retry(parent_io); 6593 } else { 6594 bdev_io_complete(parent_io); 6595 } 6596 } 6597 } 6598 6599 static int 6600 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6601 struct spdk_bdev_io *bio_to_abort, 6602 spdk_bdev_io_completion_cb cb, void *cb_arg) 6603 { 6604 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6605 struct spdk_bdev_io *bdev_io; 6606 6607 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 6608 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 6609 /* TODO: Abort reset or abort request. */ 6610 return -ENOTSUP; 6611 } 6612 6613 bdev_io = bdev_channel_get_io(channel); 6614 if (bdev_io == NULL) { 6615 return -ENOMEM; 6616 } 6617 6618 bdev_io->internal.ch = channel; 6619 bdev_io->internal.desc = desc; 6620 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6621 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6622 6623 if (bdev->split_on_optimal_io_boundary && bio_to_abort->internal.split) { 6624 assert(bdev_io_should_split(bio_to_abort)); 6625 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 6626 6627 /* Parent abort request is not submitted directly, but to manage its 6628 * execution add it to the submitted list here. 6629 */ 6630 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6631 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6632 6633 bdev_abort(bdev_io); 6634 6635 return 0; 6636 } 6637 6638 bdev_io->u.abort.bio_to_abort = bio_to_abort; 6639 6640 /* Submit the abort request to the underlying bdev module. */ 6641 bdev_io_submit(bdev_io); 6642 6643 return 0; 6644 } 6645 6646 static bool 6647 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq) 6648 { 6649 struct spdk_bdev_io *iter; 6650 6651 TAILQ_FOREACH(iter, tailq, internal.link) { 6652 if (iter == bdev_io) { 6653 return true; 6654 } 6655 } 6656 6657 return false; 6658 } 6659 6660 static uint32_t 6661 _bdev_abort(struct spdk_bdev_io *parent_io) 6662 { 6663 struct spdk_bdev_desc *desc = parent_io->internal.desc; 6664 struct spdk_bdev_channel *channel = parent_io->internal.ch; 6665 void *bio_cb_arg; 6666 struct spdk_bdev_io *bio_to_abort; 6667 uint32_t matched_ios; 6668 int rc; 6669 6670 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 6671 6672 /* matched_ios is returned and will be kept by the caller. 6673 * 6674 * This function will be used for two cases, 1) the same cb_arg is used for 6675 * multiple I/Os, 2) a single large I/O is split into smaller ones. 6676 * Incrementing split_outstanding directly here may confuse readers especially 6677 * for the 1st case. 6678 * 6679 * Completion of I/O abort is processed after stack unwinding. Hence this trick 6680 * works as expected. 6681 */ 6682 matched_ios = 0; 6683 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6684 6685 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 6686 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 6687 continue; 6688 } 6689 6690 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 6691 /* Any I/O which was submitted after this abort command should be excluded. 
*/ 6692 continue; 6693 } 6694 6695 /* We can't abort a request that's being pushed/pulled or executed by accel */ 6696 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) || 6697 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) { 6698 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6699 break; 6700 } 6701 6702 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 6703 if (rc != 0) { 6704 if (rc == -ENOMEM) { 6705 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 6706 } else { 6707 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6708 } 6709 break; 6710 } 6711 matched_ios++; 6712 } 6713 6714 return matched_ios; 6715 } 6716 6717 static void 6718 bdev_abort_retry(void *ctx) 6719 { 6720 struct spdk_bdev_io *parent_io = ctx; 6721 uint32_t matched_ios; 6722 6723 matched_ios = _bdev_abort(parent_io); 6724 6725 if (matched_ios == 0) { 6726 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6727 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6728 } else { 6729 /* For retry, the case that no target I/O was found is a success 6730 * because it means the target I/Os completed in the meantime. 6731 */ 6732 bdev_io_complete(parent_io); 6733 } 6734 return; 6735 } 6736 6737 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6738 parent_io->u.bdev.split_outstanding = matched_ios; 6739 } 6740 6741 static void 6742 bdev_abort(struct spdk_bdev_io *parent_io) 6743 { 6744 uint32_t matched_ios; 6745 6746 matched_ios = _bdev_abort(parent_io); 6747 6748 if (matched_ios == 0) { 6749 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6750 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6751 } else { 6752 /* Unlike the retry case, finding no target I/O on the initial submission is a failure. */ 6753 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6754 bdev_io_complete(parent_io); 6755 } 6756 return; 6757 } 6758 6759 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6760 parent_io->u.bdev.split_outstanding = matched_ios; 6761 } 6762 6763 int 6764 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6765 void *bio_cb_arg, 6766 spdk_bdev_io_completion_cb cb, void *cb_arg) 6767 { 6768 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6769 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6770 struct spdk_bdev_io *bdev_io; 6771 6772 if (bio_cb_arg == NULL) { 6773 return -EINVAL; 6774 } 6775 6776 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 6777 return -ENOTSUP; 6778 } 6779 6780 bdev_io = bdev_channel_get_io(channel); 6781 if (bdev_io == NULL) { 6782 return -ENOMEM; 6783 } 6784 6785 bdev_io->internal.ch = channel; 6786 bdev_io->internal.desc = desc; 6787 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6788 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6789 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6790 6791 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 6792 6793 /* Parent abort request is not submitted directly, but to manage its execution, 6794 * add it to the submitted list here.
6795 */ 6796 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6797 6798 bdev_abort(bdev_io); 6799 6800 return 0; 6801 } 6802 6803 int 6804 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6805 struct spdk_bdev_io_wait_entry *entry) 6806 { 6807 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6808 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 6809 6810 if (bdev != entry->bdev) { 6811 SPDK_ERRLOG("bdevs do not match\n"); 6812 return -EINVAL; 6813 } 6814 6815 if (mgmt_ch->per_thread_cache_count > 0) { 6816 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 6817 return -EINVAL; 6818 } 6819 6820 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 6821 return 0; 6822 } 6823 6824 static inline void 6825 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 6826 { 6827 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 6828 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 6829 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 6830 uint32_t blocklen = bdev_io->bdev->blocklen; 6831 6832 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 6833 switch (bdev_io->type) { 6834 case SPDK_BDEV_IO_TYPE_READ: 6835 io_stat->bytes_read += num_blocks * blocklen; 6836 io_stat->num_read_ops++; 6837 io_stat->read_latency_ticks += tsc_diff; 6838 if (io_stat->max_read_latency_ticks < tsc_diff) { 6839 io_stat->max_read_latency_ticks = tsc_diff; 6840 } 6841 if (io_stat->min_read_latency_ticks > tsc_diff) { 6842 io_stat->min_read_latency_ticks = tsc_diff; 6843 } 6844 break; 6845 case SPDK_BDEV_IO_TYPE_WRITE: 6846 io_stat->bytes_written += num_blocks * blocklen; 6847 io_stat->num_write_ops++; 6848 io_stat->write_latency_ticks += tsc_diff; 6849 if (io_stat->max_write_latency_ticks < tsc_diff) { 6850 io_stat->max_write_latency_ticks = tsc_diff; 6851 } 6852 if (io_stat->min_write_latency_ticks > tsc_diff) { 6853 io_stat->min_write_latency_ticks = tsc_diff; 6854 } 6855 break; 6856 case SPDK_BDEV_IO_TYPE_UNMAP: 6857 io_stat->bytes_unmapped += num_blocks * blocklen; 6858 io_stat->num_unmap_ops++; 6859 io_stat->unmap_latency_ticks += tsc_diff; 6860 if (io_stat->max_unmap_latency_ticks < tsc_diff) { 6861 io_stat->max_unmap_latency_ticks = tsc_diff; 6862 } 6863 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 6864 io_stat->min_unmap_latency_ticks = tsc_diff; 6865 } 6866 break; 6867 case SPDK_BDEV_IO_TYPE_ZCOPY: 6868 /* Track the data in the start phase only */ 6869 if (bdev_io->u.bdev.zcopy.start) { 6870 if (bdev_io->u.bdev.zcopy.populate) { 6871 io_stat->bytes_read += num_blocks * blocklen; 6872 io_stat->num_read_ops++; 6873 io_stat->read_latency_ticks += tsc_diff; 6874 if (io_stat->max_read_latency_ticks < tsc_diff) { 6875 io_stat->max_read_latency_ticks = tsc_diff; 6876 } 6877 if (io_stat->min_read_latency_ticks > tsc_diff) { 6878 io_stat->min_read_latency_ticks = tsc_diff; 6879 } 6880 } else { 6881 io_stat->bytes_written += num_blocks * blocklen; 6882 io_stat->num_write_ops++; 6883 io_stat->write_latency_ticks += tsc_diff; 6884 if (io_stat->max_write_latency_ticks < tsc_diff) { 6885 io_stat->max_write_latency_ticks = tsc_diff; 6886 } 6887 if (io_stat->min_write_latency_ticks > tsc_diff) { 6888 io_stat->min_write_latency_ticks = tsc_diff; 6889 } 6890 } 6891 } 6892 break; 6893 case SPDK_BDEV_IO_TYPE_COPY: 6894 io_stat->bytes_copied += num_blocks * blocklen; 6895 io_stat->num_copy_ops++; 6896 bdev_io->internal.ch->stat->copy_latency_ticks += 
tsc_diff; 6897 if (io_stat->max_copy_latency_ticks < tsc_diff) { 6898 io_stat->max_copy_latency_ticks = tsc_diff; 6899 } 6900 if (io_stat->min_copy_latency_ticks > tsc_diff) { 6901 io_stat->min_copy_latency_ticks = tsc_diff; 6902 } 6903 break; 6904 default: 6905 break; 6906 } 6907 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 6908 io_stat = bdev_io->bdev->internal.stat; 6909 assert(io_stat->io_error != NULL); 6910 6911 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 6912 io_stat->io_error->error_status[-io_status - 1]++; 6913 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 6914 } 6915 6916 #ifdef SPDK_CONFIG_VTUNE 6917 uint64_t now_tsc = spdk_get_ticks(); 6918 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 6919 uint64_t data[5]; 6920 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 6921 6922 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 6923 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 6924 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 6925 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 6926 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 6927 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 6928 6929 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 6930 __itt_metadata_u64, 5, data); 6931 6932 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 6933 bdev_io->internal.ch->start_tsc = now_tsc; 6934 } 6935 #endif 6936 } 6937 6938 static inline void 6939 _bdev_io_complete(void *ctx) 6940 { 6941 struct spdk_bdev_io *bdev_io = ctx; 6942 6943 if (spdk_unlikely(bdev_io->internal.accel_sequence != NULL)) { 6944 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 6945 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 6946 } 6947 6948 assert(bdev_io->internal.cb != NULL); 6949 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 6950 6951 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 6952 bdev_io->internal.caller_ctx); 6953 } 6954 6955 static inline void 6956 bdev_io_complete(void *ctx) 6957 { 6958 struct spdk_bdev_io *bdev_io = ctx; 6959 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6960 uint64_t tsc, tsc_diff; 6961 6962 if (spdk_unlikely(bdev_io->internal.in_submit_request)) { 6963 /* 6964 * Defer completion to avoid potential infinite recursion if the 6965 * user's completion callback issues a new I/O. 6966 */ 6967 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6968 bdev_io_complete, bdev_io); 6969 return; 6970 } 6971 6972 tsc = spdk_get_ticks(); 6973 tsc_diff = tsc - bdev_io->internal.submit_tsc; 6974 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 6975 bdev_io->internal.caller_ctx); 6976 6977 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 6978 6979 if (bdev_io->internal.ch->histogram) { 6980 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 6981 } 6982 6983 bdev_io_update_io_stat(bdev_io, tsc_diff); 6984 _bdev_io_complete(bdev_io); 6985 } 6986 6987 /* The difference between this function and bdev_io_complete() is that this should be called to 6988 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 6989 * io_submitted list and don't have submit_tsc updated. 
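 * Consequently they also bypass the per-channel statistics and histogram updates that bdev_io_complete() performs.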
6990 */ 6991 static inline void 6992 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 6993 { 6994 /* Since the IO hasn't been submitted it's bound to be failed */ 6995 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 6996 6997 /* At this point we don't know if the IO is completed from submission context or not, but, 6998 * since this is an error path, we can always do an spdk_thread_send_msg(). */ 6999 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7000 _bdev_io_complete, bdev_io); 7001 } 7002 7003 static void bdev_destroy_cb(void *io_device); 7004 7005 static void 7006 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7007 { 7008 struct spdk_bdev_io *bdev_io = _ctx; 7009 7010 if (bdev_io->u.reset.ch_ref != NULL) { 7011 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7012 bdev_io->u.reset.ch_ref = NULL; 7013 } 7014 7015 bdev_io_complete(bdev_io); 7016 7017 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7018 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7019 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7020 } 7021 } 7022 7023 static void 7024 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7025 struct spdk_io_channel *_ch, void *_ctx) 7026 { 7027 struct spdk_bdev_io *bdev_io = _ctx; 7028 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7029 struct spdk_bdev_io *queued_reset; 7030 7031 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7032 while (!TAILQ_EMPTY(&ch->queued_resets)) { 7033 queued_reset = TAILQ_FIRST(&ch->queued_resets); 7034 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 7035 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 7036 } 7037 7038 spdk_bdev_for_each_channel_continue(i, 0); 7039 } 7040 7041 static void 7042 bdev_io_complete_sequence_cb(void *ctx, int status) 7043 { 7044 struct spdk_bdev_io *bdev_io = ctx; 7045 7046 /* u.bdev.accel_sequence should have already been cleared at this point */ 7047 assert(bdev_io->u.bdev.accel_sequence == NULL); 7048 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7049 bdev_io->internal.accel_sequence = NULL; 7050 7051 if (spdk_unlikely(status != 0)) { 7052 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7053 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7054 } 7055 7056 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_INVALID))) { 7057 return; 7058 } 7059 7060 bdev_io_complete(bdev_io); 7061 } 7062 7063 void 7064 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7065 { 7066 struct spdk_bdev *bdev = bdev_io->bdev; 7067 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7068 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7069 7070 if (bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING) { 7071 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7072 spdk_bdev_get_module_name(bdev), 7073 bdev_io_status_get_string(bdev_io->internal.status)); 7074 assert(false); 7075 } 7076 bdev_io->internal.status = status; 7077 7078 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7079 bool unlock_channels = false; 7080 7081 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 7082 SPDK_ERRLOG("NOMEM returned for reset\n"); 7083 } 7084 spdk_spin_lock(&bdev->internal.spinlock); 7085 if (bdev_io == bdev->internal.reset_in_progress) { 7086 bdev->internal.reset_in_progress = NULL; 7087 unlock_channels = true; 7088 } 7089 
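/* Channels are unfrozen outside the spinlock: bdev_unfreeze_channel() clears BDEV_CH_RESET_IN_PROGRESS on every channel and completes any still-queued resets with this reset's status. */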
spdk_spin_unlock(&bdev->internal.spinlock); 7090 7091 if (unlock_channels) { 7092 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 7093 bdev_reset_complete); 7094 return; 7095 } 7096 } else { 7097 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7098 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7099 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7100 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7101 return; 7102 } else if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0)) { 7103 _bdev_io_push_bounce_data_buffer(bdev_io, 7104 _bdev_io_complete_push_bounce_done); 7105 /* bdev IO will be completed in the callback */ 7106 return; 7107 } 7108 } 7109 7110 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7111 return; 7112 } 7113 } 7114 7115 bdev_io_complete(bdev_io); 7116 } 7117 7118 void 7119 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 7120 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7121 { 7122 enum spdk_bdev_io_status status; 7123 7124 if (sc == SPDK_SCSI_STATUS_GOOD) { 7125 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7126 } else { 7127 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7128 bdev_io->internal.error.scsi.sc = sc; 7129 bdev_io->internal.error.scsi.sk = sk; 7130 bdev_io->internal.error.scsi.asc = asc; 7131 bdev_io->internal.error.scsi.ascq = ascq; 7132 } 7133 7134 spdk_bdev_io_complete(bdev_io, status); 7135 } 7136 7137 void 7138 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7139 int *sc, int *sk, int *asc, int *ascq) 7140 { 7141 assert(sc != NULL); 7142 assert(sk != NULL); 7143 assert(asc != NULL); 7144 assert(ascq != NULL); 7145 7146 switch (bdev_io->internal.status) { 7147 case SPDK_BDEV_IO_STATUS_SUCCESS: 7148 *sc = SPDK_SCSI_STATUS_GOOD; 7149 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7150 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7151 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7152 break; 7153 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7154 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7155 break; 7156 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7157 *sc = bdev_io->internal.error.scsi.sc; 7158 *sk = bdev_io->internal.error.scsi.sk; 7159 *asc = bdev_io->internal.error.scsi.asc; 7160 *ascq = bdev_io->internal.error.scsi.ascq; 7161 break; 7162 default: 7163 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7164 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7165 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7166 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7167 break; 7168 } 7169 } 7170 7171 void 7172 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7173 { 7174 enum spdk_bdev_io_status status; 7175 7176 if (aio_result == 0) { 7177 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7178 } else { 7179 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7180 } 7181 7182 bdev_io->internal.error.aio_result = aio_result; 7183 7184 spdk_bdev_io_complete(bdev_io, status); 7185 } 7186 7187 void 7188 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7189 { 7190 assert(aio_result != NULL); 7191 7192 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7193 *aio_result = bdev_io->internal.error.aio_result; 7194 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7195 *aio_result = 0; 7196 } else { 7197 *aio_result = -EIO; 7198 } 7199 } 7200 7201 void 7202 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7203 { 7204 enum spdk_bdev_io_status status; 
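/* Map the NVMe completion (sct/sc) onto a generic bdev status; the raw cdw0/sct/sc values are stored below so callers can retrieve them with spdk_bdev_io_get_nvme_status(). */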
7205 7206 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 7207 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7208 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7209 status = SPDK_BDEV_IO_STATUS_ABORTED; 7210 } else { 7211 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7212 } 7213 7214 bdev_io->internal.error.nvme.cdw0 = cdw0; 7215 bdev_io->internal.error.nvme.sct = sct; 7216 bdev_io->internal.error.nvme.sc = sc; 7217 7218 spdk_bdev_io_complete(bdev_io, status); 7219 } 7220 7221 void 7222 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7223 { 7224 assert(sct != NULL); 7225 assert(sc != NULL); 7226 assert(cdw0 != NULL); 7227 7228 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7229 *sct = SPDK_NVME_SCT_GENERIC; 7230 *sc = SPDK_NVME_SC_SUCCESS; 7231 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7232 *cdw0 = 0; 7233 } else { 7234 *cdw0 = 1U; 7235 } 7236 return; 7237 } 7238 7239 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7240 *sct = bdev_io->internal.error.nvme.sct; 7241 *sc = bdev_io->internal.error.nvme.sc; 7242 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7243 *sct = SPDK_NVME_SCT_GENERIC; 7244 *sc = SPDK_NVME_SC_SUCCESS; 7245 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7246 *sct = SPDK_NVME_SCT_GENERIC; 7247 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7248 } else { 7249 *sct = SPDK_NVME_SCT_GENERIC; 7250 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7251 } 7252 7253 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7254 } 7255 7256 void 7257 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7258 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7259 { 7260 assert(first_sct != NULL); 7261 assert(first_sc != NULL); 7262 assert(second_sct != NULL); 7263 assert(second_sc != NULL); 7264 assert(cdw0 != NULL); 7265 7266 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7267 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7268 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7269 *first_sct = bdev_io->internal.error.nvme.sct; 7270 *first_sc = bdev_io->internal.error.nvme.sc; 7271 *second_sct = SPDK_NVME_SCT_GENERIC; 7272 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7273 } else { 7274 *first_sct = SPDK_NVME_SCT_GENERIC; 7275 *first_sc = SPDK_NVME_SC_SUCCESS; 7276 *second_sct = bdev_io->internal.error.nvme.sct; 7277 *second_sc = bdev_io->internal.error.nvme.sc; 7278 } 7279 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7280 *first_sct = SPDK_NVME_SCT_GENERIC; 7281 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7282 *second_sct = SPDK_NVME_SCT_GENERIC; 7283 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7284 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7285 *first_sct = SPDK_NVME_SCT_GENERIC; 7286 *first_sc = SPDK_NVME_SC_SUCCESS; 7287 *second_sct = SPDK_NVME_SCT_GENERIC; 7288 *second_sc = SPDK_NVME_SC_SUCCESS; 7289 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7290 *first_sct = SPDK_NVME_SCT_GENERIC; 7291 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7292 *second_sct = SPDK_NVME_SCT_GENERIC; 7293 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7294 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7295 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7296 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 7297 *second_sct = 
SPDK_NVME_SCT_GENERIC; 7298 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7299 } else { 7300 *first_sct = SPDK_NVME_SCT_GENERIC; 7301 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7302 *second_sct = SPDK_NVME_SCT_GENERIC; 7303 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7304 } 7305 7306 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7307 } 7308 7309 struct spdk_thread * 7310 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7311 { 7312 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7313 } 7314 7315 struct spdk_io_channel * 7316 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7317 { 7318 return bdev_io->internal.ch->channel; 7319 } 7320 7321 static int 7322 bdev_register(struct spdk_bdev *bdev) 7323 { 7324 char *bdev_name; 7325 char uuid[SPDK_UUID_STRING_LEN]; 7326 struct spdk_iobuf_opts iobuf_opts; 7327 int ret, i; 7328 7329 assert(bdev->module != NULL); 7330 7331 if (!bdev->name) { 7332 SPDK_ERRLOG("Bdev name is NULL\n"); 7333 return -EINVAL; 7334 } 7335 7336 if (!strlen(bdev->name)) { 7337 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7338 return -EINVAL; 7339 } 7340 7341 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7342 if (bdev->fn_table->accel_sequence_supported == NULL) { 7343 continue; 7344 } 7345 if (!bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7346 (enum spdk_bdev_io_type)i)) { 7347 continue; 7348 } 7349 7350 if (spdk_bdev_get_memory_domains(bdev, NULL, 0) <= 0) { 7351 SPDK_ERRLOG("bdev supporting accel sequence is required to support " 7352 "memory domains\n"); 7353 return -EINVAL; 7354 } 7355 7356 if (spdk_bdev_is_md_separate(bdev)) { 7357 SPDK_ERRLOG("Separate metadata is currently unsupported for bdevs with " 7358 "accel sequence support\n"); 7359 return -EINVAL; 7360 } 7361 } 7362 7363 /* Users often register their own I/O devices using the bdev name. In 7364 * order to avoid conflicts, prepend bdev_. */ 7365 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7366 if (!bdev_name) { 7367 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7368 return -ENOMEM; 7369 } 7370 7371 bdev->internal.stat = bdev_alloc_io_stat(true); 7372 if (!bdev->internal.stat) { 7373 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7374 free(bdev_name); 7375 return -ENOMEM; 7376 } 7377 7378 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7379 bdev->internal.measured_queue_depth = UINT64_MAX; 7380 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7381 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7382 bdev->internal.qd_poller = NULL; 7383 bdev->internal.qos = NULL; 7384 7385 TAILQ_INIT(&bdev->internal.open_descs); 7386 TAILQ_INIT(&bdev->internal.locked_ranges); 7387 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7388 TAILQ_INIT(&bdev->aliases); 7389 7390 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 7391 if (ret != 0) { 7392 bdev_free_io_stat(bdev->internal.stat); 7393 free(bdev_name); 7394 return ret; 7395 } 7396 7397 /* UUID may be specified by the user or defined by bdev itself. 7398 * Otherwise it will be generated here, so this field will never be empty. 
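 *
 * Illustrative sketch (hypothetical module code): a module that wants a stable,
 * user-visible UUID can fill it in before calling spdk_bdev_register(), e.g.
 *
 *	spdk_uuid_parse(&bdev->uuid, "deadbeef-0000-4000-8000-000000000001");
 *
 * An all-zero UUID is replaced by a randomly generated one just below.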
*/ 7399 if (spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { 7400 spdk_uuid_generate(&bdev->uuid); 7401 } 7402 7403 /* Add the UUID alias only if it's different than the name */ 7404 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7405 if (strcmp(bdev->name, uuid) != 0) { 7406 ret = spdk_bdev_alias_add(bdev, uuid); 7407 if (ret != 0) { 7408 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7409 bdev_name_del(&bdev->internal.bdev_name); 7410 bdev_free_io_stat(bdev->internal.stat); 7411 free(bdev_name); 7412 return ret; 7413 } 7414 } 7415 7416 if (spdk_bdev_get_buf_align(bdev) > 1) { 7417 if (bdev->split_on_optimal_io_boundary) { 7418 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 7419 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 7420 } else { 7421 bdev->split_on_optimal_io_boundary = true; 7422 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 7423 } 7424 } 7425 7426 /* If the user didn't specify a write unit size, set it to one. */ 7427 if (bdev->write_unit_size == 0) { 7428 bdev->write_unit_size = 1; 7429 } 7430 7431 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7432 if (bdev->acwu == 0) { 7433 bdev->acwu = bdev->write_unit_size; 7434 } 7435 7436 if (bdev->phys_blocklen == 0) { 7437 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7438 } 7439 7440 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 7441 spdk_iobuf_get_opts(&iobuf_opts); 7442 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 7443 } 7444 7445 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 7446 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 7447 } 7448 7449 bdev->internal.reset_in_progress = NULL; 7450 bdev->internal.qd_poll_in_progress = false; 7451 bdev->internal.period = 0; 7452 bdev->internal.new_period = 0; 7453 7454 spdk_io_device_register(__bdev_to_io_dev(bdev), 7455 bdev_channel_create, bdev_channel_destroy, 7456 sizeof(struct spdk_bdev_channel), 7457 bdev_name); 7458 7459 free(bdev_name); 7460 7461 spdk_spin_init(&bdev->internal.spinlock); 7462 7463 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 7464 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 7465 7466 return 0; 7467 } 7468 7469 static void 7470 bdev_destroy_cb(void *io_device) 7471 { 7472 int rc; 7473 struct spdk_bdev *bdev; 7474 spdk_bdev_unregister_cb cb_fn; 7475 void *cb_arg; 7476 7477 bdev = __bdev_from_io_dev(io_device); 7478 7479 if (bdev->internal.unregister_td != spdk_get_thread()) { 7480 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 7481 return; 7482 } 7483 7484 cb_fn = bdev->internal.unregister_cb; 7485 cb_arg = bdev->internal.unregister_ctx; 7486 7487 spdk_spin_destroy(&bdev->internal.spinlock); 7488 free(bdev->internal.qos); 7489 bdev_free_io_stat(bdev->internal.stat); 7490 7491 rc = bdev->fn_table->destruct(bdev->ctxt); 7492 if (rc < 0) { 7493 SPDK_ERRLOG("destruct failed\n"); 7494 } 7495 if (rc <= 0 && cb_fn != NULL) { 7496 cb_fn(cb_arg, rc); 7497 } 7498 } 7499 7500 void 7501 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 7502 { 7503 if (bdev->internal.unregister_cb != NULL) { 7504 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 7505 } 7506 } 7507 7508 static void 7509 _remove_notify(void *arg) 7510 { 7511 struct spdk_bdev_desc *desc = arg; 7512 7513 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 7514 } 7515 7516 /* returns: 0 
- bdev removed and ready to be destructed. 7517 * -EBUSY - bdev can't be destructed yet. */ 7518 static int 7519 bdev_unregister_unsafe(struct spdk_bdev *bdev) 7520 { 7521 struct spdk_bdev_desc *desc, *tmp; 7522 int rc = 0; 7523 char uuid[SPDK_UUID_STRING_LEN]; 7524 7525 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 7526 assert(spdk_spin_held(&bdev->internal.spinlock)); 7527 7528 /* Notify each descriptor about hotremoval */ 7529 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 7530 rc = -EBUSY; 7531 /* 7532 * Defer invocation of the event_cb to a separate message that will 7533 * run later on its thread. This ensures this context unwinds and 7534 * we don't recursively unregister this bdev again if the event_cb 7535 * immediately closes its descriptor. 7536 */ 7537 event_notify(desc, _remove_notify); 7538 } 7539 7540 /* If there are no descriptors, proceed removing the bdev */ 7541 if (rc == 0) { 7542 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 7543 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 7544 7545 /* Delete the name and the UUID alias */ 7546 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7547 bdev_name_del_unsafe(&bdev->internal.bdev_name); 7548 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 7549 7550 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 7551 7552 if (bdev->internal.reset_in_progress != NULL) { 7553 /* If reset is in progress, let the completion callback for reset 7554 * unregister the bdev. 7555 */ 7556 rc = -EBUSY; 7557 } 7558 } 7559 7560 return rc; 7561 } 7562 7563 static void 7564 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7565 struct spdk_io_channel *io_ch, void *_ctx) 7566 { 7567 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 7568 7569 bdev_channel_abort_queued_ios(bdev_ch); 7570 spdk_bdev_for_each_channel_continue(i, 0); 7571 } 7572 7573 static void 7574 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 7575 { 7576 int rc; 7577 7578 spdk_spin_lock(&g_bdev_mgr.spinlock); 7579 spdk_spin_lock(&bdev->internal.spinlock); 7580 /* 7581 * Set the status to REMOVING after completing to abort channels. Otherwise, 7582 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 7583 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 7584 * may fail. 7585 */ 7586 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 7587 rc = bdev_unregister_unsafe(bdev); 7588 spdk_spin_unlock(&bdev->internal.spinlock); 7589 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7590 7591 if (rc == 0) { 7592 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7593 } 7594 } 7595 7596 void 7597 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7598 { 7599 struct spdk_thread *thread; 7600 7601 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 7602 7603 thread = spdk_get_thread(); 7604 if (!thread) { 7605 /* The user called this from a non-SPDK thread. 
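 *
 * Illustrative sketch (the example_* name is hypothetical): callers are expected
 * to issue the unregister from an SPDK thread and, if they care about the
 * outcome, pass a completion callback:
 *
 *	static void
 *	example_unregister_done(void *cb_arg, int rc)
 *	{
 *		SPDK_NOTICELOG("bdev unregister completed: %d\n", rc);
 *	}
 *
 *	spdk_bdev_unregister(bdev, example_unregister_done, NULL);
 *
 * Without an SPDK thread, all we can do is fail the request.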
*/ 7606 if (cb_fn != NULL) { 7607 cb_fn(cb_arg, -ENOTSUP); 7608 } 7609 return; 7610 } 7611 7612 spdk_spin_lock(&g_bdev_mgr.spinlock); 7613 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7614 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7615 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7616 if (cb_fn) { 7617 cb_fn(cb_arg, -EBUSY); 7618 } 7619 return; 7620 } 7621 7622 spdk_spin_lock(&bdev->internal.spinlock); 7623 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 7624 bdev->internal.unregister_cb = cb_fn; 7625 bdev->internal.unregister_ctx = cb_arg; 7626 bdev->internal.unregister_td = thread; 7627 spdk_spin_unlock(&bdev->internal.spinlock); 7628 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7629 7630 spdk_bdev_set_qd_sampling_period(bdev, 0); 7631 7632 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 7633 bdev_unregister); 7634 } 7635 7636 int 7637 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 7638 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7639 { 7640 struct spdk_bdev_desc *desc; 7641 struct spdk_bdev *bdev; 7642 int rc; 7643 7644 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 7645 if (rc != 0) { 7646 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 7647 return rc; 7648 } 7649 7650 bdev = spdk_bdev_desc_get_bdev(desc); 7651 7652 if (bdev->module != module) { 7653 spdk_bdev_close(desc); 7654 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 7655 bdev_name); 7656 return -ENODEV; 7657 } 7658 7659 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 7660 7661 spdk_bdev_close(desc); 7662 7663 return 0; 7664 } 7665 7666 static int 7667 bdev_start_qos(struct spdk_bdev *bdev) 7668 { 7669 struct set_qos_limit_ctx *ctx; 7670 7671 /* Enable QoS */ 7672 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 7673 ctx = calloc(1, sizeof(*ctx)); 7674 if (ctx == NULL) { 7675 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 7676 return -ENOMEM; 7677 } 7678 ctx->bdev = bdev; 7679 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 7680 } 7681 7682 return 0; 7683 } 7684 7685 static void 7686 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 7687 struct spdk_bdev *bdev) 7688 { 7689 enum spdk_bdev_claim_type type; 7690 const char *typename, *modname; 7691 extern struct spdk_log_flag SPDK_LOG_bdev; 7692 7693 assert(spdk_spin_held(&bdev->internal.spinlock)); 7694 7695 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 7696 return; 7697 } 7698 7699 type = bdev->internal.claim_type; 7700 typename = spdk_bdev_claim_get_name(type); 7701 7702 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 7703 modname = bdev->internal.claim.v1.module->name; 7704 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7705 bdev->name, detail, typename, modname); 7706 return; 7707 } 7708 7709 if (claim_type_is_v2(type)) { 7710 struct spdk_bdev_module_claim *claim; 7711 7712 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 7713 modname = claim->module->name; 7714 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7715 bdev->name, detail, typename, modname); 7716 } 7717 return; 7718 } 7719 7720 assert(false); 7721 } 7722 7723 static int 7724 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 7725 { 7726 struct spdk_thread *thread; 7727 int rc = 0; 7728 7729 thread = spdk_get_thread(); 7730 if (!thread) { 7731 
SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 7732 return -ENOTSUP; 7733 } 7734 7735 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7736 spdk_get_thread()); 7737 7738 desc->bdev = bdev; 7739 desc->thread = thread; 7740 desc->write = write; 7741 7742 spdk_spin_lock(&bdev->internal.spinlock); 7743 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7744 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7745 spdk_spin_unlock(&bdev->internal.spinlock); 7746 return -ENODEV; 7747 } 7748 7749 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7750 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7751 spdk_spin_unlock(&bdev->internal.spinlock); 7752 return -EPERM; 7753 } 7754 7755 rc = bdev_start_qos(bdev); 7756 if (rc != 0) { 7757 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 7758 spdk_spin_unlock(&bdev->internal.spinlock); 7759 return rc; 7760 } 7761 7762 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 7763 7764 spdk_spin_unlock(&bdev->internal.spinlock); 7765 7766 return 0; 7767 } 7768 7769 static int 7770 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 7771 struct spdk_bdev_desc **_desc) 7772 { 7773 struct spdk_bdev_desc *desc; 7774 unsigned int i; 7775 7776 desc = calloc(1, sizeof(*desc)); 7777 if (desc == NULL) { 7778 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 7779 return -ENOMEM; 7780 } 7781 7782 TAILQ_INIT(&desc->pending_media_events); 7783 TAILQ_INIT(&desc->free_media_events); 7784 7785 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 7786 desc->callback.event_fn = event_cb; 7787 desc->callback.ctx = event_ctx; 7788 spdk_spin_init(&desc->spinlock); 7789 7790 if (bdev->media_events) { 7791 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 7792 sizeof(*desc->media_events_buffer)); 7793 if (desc->media_events_buffer == NULL) { 7794 SPDK_ERRLOG("Failed to initialize media event pool\n"); 7795 bdev_desc_free(desc); 7796 return -ENOMEM; 7797 } 7798 7799 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 7800 TAILQ_INSERT_TAIL(&desc->free_media_events, 7801 &desc->media_events_buffer[i], tailq); 7802 } 7803 } 7804 7805 if (bdev->fn_table->accel_sequence_supported != NULL) { 7806 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7807 desc->accel_sequence_supported[i] = 7808 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7809 (enum spdk_bdev_io_type)i); 7810 } 7811 } 7812 7813 *_desc = desc; 7814 7815 return 0; 7816 } 7817 7818 int 7819 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7820 void *event_ctx, struct spdk_bdev_desc **_desc) 7821 { 7822 struct spdk_bdev_desc *desc; 7823 struct spdk_bdev *bdev; 7824 int rc; 7825 7826 if (event_cb == NULL) { 7827 SPDK_ERRLOG("Missing event callback function\n"); 7828 return -EINVAL; 7829 } 7830 7831 spdk_spin_lock(&g_bdev_mgr.spinlock); 7832 7833 bdev = bdev_get_by_name(bdev_name); 7834 7835 if (bdev == NULL) { 7836 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 7837 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7838 return -ENODEV; 7839 } 7840 7841 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 7842 if (rc != 0) { 7843 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7844 return rc; 7845 } 7846 7847 rc = bdev_open(bdev, write, desc); 7848 if (rc != 0) { 7849 bdev_desc_free(desc); 7850 desc = NULL; 7851 } 7852 7853 *_desc = desc; 7854 7855 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7856 
7857 return rc; 7858 } 7859 7860 static void 7861 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 7862 { 7863 int rc; 7864 7865 spdk_spin_lock(&bdev->internal.spinlock); 7866 spdk_spin_lock(&desc->spinlock); 7867 7868 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 7869 7870 desc->closed = true; 7871 7872 if (desc->claim != NULL) { 7873 bdev_desc_release_claims(desc); 7874 } 7875 7876 if (0 == desc->refs) { 7877 spdk_spin_unlock(&desc->spinlock); 7878 bdev_desc_free(desc); 7879 } else { 7880 spdk_spin_unlock(&desc->spinlock); 7881 } 7882 7883 /* If no more descriptors, kill QoS channel */ 7884 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7885 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 7886 bdev->name, spdk_get_thread()); 7887 7888 if (bdev_qos_destroy(bdev)) { 7889 /* There isn't anything we can do to recover here. Just let the 7890 * old QoS poller keep running. The QoS handling won't change 7891 * cores when the user allocates a new channel, but it won't break. */ 7892 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 7893 } 7894 } 7895 7896 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7897 rc = bdev_unregister_unsafe(bdev); 7898 spdk_spin_unlock(&bdev->internal.spinlock); 7899 7900 if (rc == 0) { 7901 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7902 } 7903 } else { 7904 spdk_spin_unlock(&bdev->internal.spinlock); 7905 } 7906 } 7907 7908 void 7909 spdk_bdev_close(struct spdk_bdev_desc *desc) 7910 { 7911 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7912 7913 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7914 spdk_get_thread()); 7915 7916 assert(desc->thread == spdk_get_thread()); 7917 7918 spdk_poller_unregister(&desc->io_timeout_poller); 7919 7920 spdk_spin_lock(&g_bdev_mgr.spinlock); 7921 7922 bdev_close(bdev, desc); 7923 7924 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7925 } 7926 7927 static void 7928 bdev_register_finished(void *arg) 7929 { 7930 struct spdk_bdev_desc *desc = arg; 7931 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7932 7933 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 7934 7935 spdk_spin_lock(&g_bdev_mgr.spinlock); 7936 7937 bdev_close(bdev, desc); 7938 7939 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7940 } 7941 7942 int 7943 spdk_bdev_register(struct spdk_bdev *bdev) 7944 { 7945 struct spdk_bdev_desc *desc; 7946 struct spdk_thread *thread = spdk_get_thread(); 7947 int rc; 7948 7949 if (spdk_unlikely(spdk_thread_get_app_thread() != spdk_get_thread())) { 7950 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", bdev->name, thread, 7951 thread ? 
spdk_thread_get_name(thread) : "null"); 7952 return -EINVAL; 7953 } 7954 7955 rc = bdev_register(bdev); 7956 if (rc != 0) { 7957 return rc; 7958 } 7959 7960 /* A descriptor is opened to prevent bdev deletion during examination */ 7961 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7962 if (rc != 0) { 7963 spdk_bdev_unregister(bdev, NULL, NULL); 7964 return rc; 7965 } 7966 7967 rc = bdev_open(bdev, false, desc); 7968 if (rc != 0) { 7969 bdev_desc_free(desc); 7970 spdk_bdev_unregister(bdev, NULL, NULL); 7971 return rc; 7972 } 7973 7974 /* Examine configuration before initializing I/O */ 7975 bdev_examine(bdev); 7976 7977 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 7978 if (rc != 0) { 7979 bdev_close(bdev, desc); 7980 spdk_bdev_unregister(bdev, NULL, NULL); 7981 } 7982 7983 return rc; 7984 } 7985 7986 int 7987 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 7988 struct spdk_bdev_module *module) 7989 { 7990 spdk_spin_lock(&bdev->internal.spinlock); 7991 7992 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7993 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7994 spdk_spin_unlock(&bdev->internal.spinlock); 7995 return -EPERM; 7996 } 7997 7998 if (desc && !desc->write) { 7999 desc->write = true; 8000 } 8001 8002 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 8003 bdev->internal.claim.v1.module = module; 8004 8005 spdk_spin_unlock(&bdev->internal.spinlock); 8006 return 0; 8007 } 8008 8009 void 8010 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 8011 { 8012 spdk_spin_lock(&bdev->internal.spinlock); 8013 8014 assert(bdev->internal.claim.v1.module != NULL); 8015 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 8016 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8017 bdev->internal.claim.v1.module = NULL; 8018 8019 spdk_spin_unlock(&bdev->internal.spinlock); 8020 } 8021 8022 /* 8023 * Start claims v2 8024 */ 8025 8026 const char * 8027 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 8028 { 8029 switch (type) { 8030 case SPDK_BDEV_CLAIM_NONE: 8031 return "not_claimed"; 8032 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8033 return "exclusive_write"; 8034 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8035 return "read_many_write_one"; 8036 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8037 return "read_many_write_none"; 8038 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8039 return "read_many_write_many"; 8040 default: 8041 break; 8042 } 8043 return "invalid_claim"; 8044 } 8045 8046 static bool 8047 claim_type_is_v2(enum spdk_bdev_claim_type type) 8048 { 8049 switch (type) { 8050 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8051 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8052 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8053 return true; 8054 default: 8055 break; 8056 } 8057 return false; 8058 } 8059 8060 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
*/ 8061 static bool 8062 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 8063 { 8064 switch (type) { 8065 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8066 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8067 return true; 8068 default: 8069 break; 8070 } 8071 return false; 8072 } 8073 8074 void 8075 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 8076 { 8077 if (opts == NULL) { 8078 SPDK_ERRLOG("opts should not be NULL\n"); 8079 assert(opts != NULL); 8080 return; 8081 } 8082 if (size == 0) { 8083 SPDK_ERRLOG("size should not be zero\n"); 8084 assert(size != 0); 8085 return; 8086 } 8087 8088 memset(opts, 0, size); 8089 opts->opts_size = size; 8090 8091 #define FIELD_OK(field) \ 8092 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 8093 8094 #define SET_FIELD(field, value) \ 8095 if (FIELD_OK(field)) { \ 8096 opts->field = value; \ 8097 } \ 8098 8099 SET_FIELD(shared_claim_key, 0); 8100 8101 #undef FIELD_OK 8102 #undef SET_FIELD 8103 } 8104 8105 static int 8106 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 8107 { 8108 if (src->opts_size == 0) { 8109 SPDK_ERRLOG("size should not be zero\n"); 8110 return -1; 8111 } 8112 8113 memset(dst, 0, sizeof(*dst)); 8114 dst->opts_size = src->opts_size; 8115 8116 #define FIELD_OK(field) \ 8117 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 8118 8119 #define SET_FIELD(field) \ 8120 if (FIELD_OK(field)) { \ 8121 dst->field = src->field; \ 8122 } \ 8123 8124 if (FIELD_OK(name)) { 8125 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 8126 } 8127 8128 SET_FIELD(shared_claim_key); 8129 8130 /* You should not remove this statement, but need to update the assert statement 8131 * if you add a new field, and also add a corresponding SET_FIELD statement */ 8132 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 8133 8134 #undef FIELD_OK 8135 #undef SET_FIELD 8136 return 0; 8137 } 8138 8139 /* Returns 0 if a read-write-once claim can be taken. */ 8140 static int 8141 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8142 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8143 { 8144 struct spdk_bdev *bdev = desc->bdev; 8145 struct spdk_bdev_desc *open_desc; 8146 8147 assert(spdk_spin_held(&bdev->internal.spinlock)); 8148 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 8149 8150 if (opts->shared_claim_key != 0) { 8151 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 8152 bdev->name); 8153 return -EINVAL; 8154 } 8155 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8156 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8157 return -EPERM; 8158 } 8159 if (desc->claim != NULL) { 8160 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 8161 bdev->name, desc->claim->module->name); 8162 return -EPERM; 8163 } 8164 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8165 if (desc != open_desc && open_desc->write) { 8166 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 8167 "another descriptor is open for writing\n", 8168 bdev->name); 8169 return -EPERM; 8170 } 8171 } 8172 8173 return 0; 8174 } 8175 8176 /* Returns 0 if a read-only-many claim can be taken. 
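 *
 * Illustrative sketch of how a module reaches this check through the public
 * API (hypothetical example_if module; desc must have been opened with
 * write == false):
 *
 *	rc = spdk_bdev_module_claim_bdev_desc(desc,
 *					      SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE,
 *					      NULL, &example_if);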
 */
static int
claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_desc *open_desc;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE);
	assert(desc->claim == NULL);

	if (desc->write) {
		SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n",
			    bdev->name);
		return -EINVAL;
	}
	if (opts->shared_claim_key != 0) {
		SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name);
		return -EINVAL;
	}
	if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
			if (open_desc->write) {
				SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while "
					       "another descriptor is open for writing\n",
					       bdev->name);
				return -EPERM;
			}
		}
	}

	return 0;
}

/* Returns 0 if a read-write-many claim can be taken. */
static int
claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_desc *open_desc;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED);
	assert(desc->claim == NULL);

	if (opts->shared_claim_key == 0) {
		SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n",
			    bdev->name);
		return -EINVAL;
	}
	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
			if (open_desc == desc) {
				continue;
			}
			if (open_desc->write) {
				SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while "
					       "another descriptor is open for writing without a "
					       "claim\n", bdev->name);
				return -EPERM;
			}
		}
		break;
	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
		if (opts->shared_claim_key != bdev->internal.claim.v2.key) {
			LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev);
			return -EPERM;
		}
		break;
	default:
		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
		return -EBUSY;
	}

	return 0;
}

/* Updates desc and its bdev with a v2 claim.
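 *
 * Illustrative sketch of a shared (read-many-write-shared) claim taken through
 * the public API (hypothetical example_if module and key value); every
 * participant must pass the same shared_claim_key:
 *
 *	struct spdk_bdev_claim_opts opts;
 *
 *	spdk_bdev_claim_opts_init(&opts, sizeof(opts));
 *	opts.shared_claim_key = 0x600dcafe;
 *	snprintf(opts.name, sizeof(opts.name), "example");
 *	rc = spdk_bdev_module_claim_bdev_desc(desc,
 *					      SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED,
 *					      &opts, &example_if);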
*/ 8257 static int 8258 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8259 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8260 { 8261 struct spdk_bdev *bdev = desc->bdev; 8262 struct spdk_bdev_module_claim *claim; 8263 8264 assert(spdk_spin_held(&bdev->internal.spinlock)); 8265 assert(claim_type_is_v2(type)); 8266 assert(desc->claim == NULL); 8267 8268 claim = calloc(1, sizeof(*desc->claim)); 8269 if (claim == NULL) { 8270 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 8271 return -ENOMEM; 8272 } 8273 claim->module = module; 8274 claim->desc = desc; 8275 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 8276 memcpy(claim->name, opts->name, sizeof(claim->name)); 8277 desc->claim = claim; 8278 8279 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8280 bdev->internal.claim_type = type; 8281 TAILQ_INIT(&bdev->internal.claim.v2.claims); 8282 bdev->internal.claim.v2.key = opts->shared_claim_key; 8283 } 8284 assert(type == bdev->internal.claim_type); 8285 8286 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 8287 8288 if (!desc->write && claim_type_promotes_to_write(type)) { 8289 desc->write = true; 8290 } 8291 8292 return 0; 8293 } 8294 8295 int 8296 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8297 struct spdk_bdev_claim_opts *_opts, 8298 struct spdk_bdev_module *module) 8299 { 8300 struct spdk_bdev *bdev; 8301 struct spdk_bdev_claim_opts opts; 8302 int rc = 0; 8303 8304 if (desc == NULL) { 8305 SPDK_ERRLOG("descriptor must not be NULL\n"); 8306 return -EINVAL; 8307 } 8308 8309 bdev = desc->bdev; 8310 8311 if (_opts == NULL) { 8312 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 8313 } else if (claim_opts_copy(_opts, &opts) != 0) { 8314 return -EINVAL; 8315 } 8316 8317 spdk_spin_lock(&bdev->internal.spinlock); 8318 8319 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 8320 bdev->internal.claim_type != type) { 8321 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8322 spdk_spin_unlock(&bdev->internal.spinlock); 8323 return -EPERM; 8324 } 8325 8326 if (claim_type_is_v2(type) && desc->claim != NULL) { 8327 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 8328 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 8329 spdk_spin_unlock(&bdev->internal.spinlock); 8330 return -EPERM; 8331 } 8332 8333 switch (type) { 8334 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8335 spdk_spin_unlock(&bdev->internal.spinlock); 8336 return spdk_bdev_module_claim_bdev(bdev, desc, module); 8337 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8338 rc = claim_verify_rwo(desc, type, &opts, module); 8339 break; 8340 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8341 rc = claim_verify_rom(desc, type, &opts, module); 8342 break; 8343 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8344 rc = claim_verify_rwm(desc, type, &opts, module); 8345 break; 8346 default: 8347 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 8348 rc = -ENOTSUP; 8349 } 8350 8351 if (rc == 0) { 8352 rc = claim_bdev(desc, type, &opts, module); 8353 } 8354 8355 spdk_spin_unlock(&bdev->internal.spinlock); 8356 return rc; 8357 } 8358 8359 static void 8360 claim_reset(struct spdk_bdev *bdev) 8361 { 8362 assert(spdk_spin_held(&bdev->internal.spinlock)); 8363 assert(claim_type_is_v2(bdev->internal.claim_type)); 8364 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 8365 8366 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 8367 
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8368 } 8369 8370 static void 8371 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 8372 { 8373 struct spdk_bdev *bdev = desc->bdev; 8374 8375 assert(spdk_spin_held(&bdev->internal.spinlock)); 8376 assert(claim_type_is_v2(bdev->internal.claim_type)); 8377 8378 if (bdev->internal.examine_in_progress == 0) { 8379 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 8380 free(desc->claim); 8381 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 8382 claim_reset(bdev); 8383 } 8384 } else { 8385 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 8386 desc->claim->module = NULL; 8387 desc->claim->desc = NULL; 8388 } 8389 desc->claim = NULL; 8390 } 8391 8392 /* 8393 * End claims v2 8394 */ 8395 8396 struct spdk_bdev * 8397 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 8398 { 8399 assert(desc != NULL); 8400 return desc->bdev; 8401 } 8402 8403 int 8404 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 8405 { 8406 struct spdk_bdev *bdev, *tmp; 8407 struct spdk_bdev_desc *desc; 8408 int rc = 0; 8409 8410 assert(fn != NULL); 8411 8412 spdk_spin_lock(&g_bdev_mgr.spinlock); 8413 bdev = spdk_bdev_first(); 8414 while (bdev != NULL) { 8415 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8416 if (rc != 0) { 8417 break; 8418 } 8419 rc = bdev_open(bdev, false, desc); 8420 if (rc != 0) { 8421 bdev_desc_free(desc); 8422 if (rc == -ENODEV) { 8423 /* Ignore the error and move to the next bdev. */ 8424 rc = 0; 8425 bdev = spdk_bdev_next(bdev); 8426 continue; 8427 } 8428 break; 8429 } 8430 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8431 8432 rc = fn(ctx, bdev); 8433 8434 spdk_spin_lock(&g_bdev_mgr.spinlock); 8435 tmp = spdk_bdev_next(bdev); 8436 bdev_close(bdev, desc); 8437 if (rc != 0) { 8438 break; 8439 } 8440 bdev = tmp; 8441 } 8442 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8443 8444 return rc; 8445 } 8446 8447 int 8448 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 8449 { 8450 struct spdk_bdev *bdev, *tmp; 8451 struct spdk_bdev_desc *desc; 8452 int rc = 0; 8453 8454 assert(fn != NULL); 8455 8456 spdk_spin_lock(&g_bdev_mgr.spinlock); 8457 bdev = spdk_bdev_first_leaf(); 8458 while (bdev != NULL) { 8459 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8460 if (rc != 0) { 8461 break; 8462 } 8463 rc = bdev_open(bdev, false, desc); 8464 if (rc != 0) { 8465 bdev_desc_free(desc); 8466 if (rc == -ENODEV) { 8467 /* Ignore the error and move to the next bdev. 
*/ 8468 rc = 0; 8469 bdev = spdk_bdev_next_leaf(bdev); 8470 continue; 8471 } 8472 break; 8473 } 8474 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8475 8476 rc = fn(ctx, bdev); 8477 8478 spdk_spin_lock(&g_bdev_mgr.spinlock); 8479 tmp = spdk_bdev_next_leaf(bdev); 8480 bdev_close(bdev, desc); 8481 if (rc != 0) { 8482 break; 8483 } 8484 bdev = tmp; 8485 } 8486 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8487 8488 return rc; 8489 } 8490 8491 void 8492 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 8493 { 8494 struct iovec *iovs; 8495 int iovcnt; 8496 8497 if (bdev_io == NULL) { 8498 return; 8499 } 8500 8501 switch (bdev_io->type) { 8502 case SPDK_BDEV_IO_TYPE_READ: 8503 case SPDK_BDEV_IO_TYPE_WRITE: 8504 case SPDK_BDEV_IO_TYPE_ZCOPY: 8505 iovs = bdev_io->u.bdev.iovs; 8506 iovcnt = bdev_io->u.bdev.iovcnt; 8507 break; 8508 default: 8509 iovs = NULL; 8510 iovcnt = 0; 8511 break; 8512 } 8513 8514 if (iovp) { 8515 *iovp = iovs; 8516 } 8517 if (iovcntp) { 8518 *iovcntp = iovcnt; 8519 } 8520 } 8521 8522 void * 8523 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 8524 { 8525 if (bdev_io == NULL) { 8526 return NULL; 8527 } 8528 8529 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 8530 return NULL; 8531 } 8532 8533 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 8534 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 8535 return bdev_io->u.bdev.md_buf; 8536 } 8537 8538 return NULL; 8539 } 8540 8541 void * 8542 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 8543 { 8544 if (bdev_io == NULL) { 8545 assert(false); 8546 return NULL; 8547 } 8548 8549 return bdev_io->internal.caller_ctx; 8550 } 8551 8552 void 8553 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 8554 { 8555 8556 if (spdk_bdev_module_list_find(bdev_module->name)) { 8557 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 8558 assert(false); 8559 } 8560 8561 spdk_spin_init(&bdev_module->internal.spinlock); 8562 8563 /* 8564 * Modules with examine callbacks must be initialized first, so they are 8565 * ready to handle examine callbacks from later modules that will 8566 * register physical bdevs. 
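 *
 * Illustrative sketch of such a module (hypothetical names; a real module
 * would normally also provide module_init/module_fini and get_ctx_size):
 *
 *	static struct spdk_bdev_module example_if = {
 *		.name = "example",
 *		.examine_config = example_examine_config,
 *		.examine_disk = example_examine_disk,
 *	};
 *
 *	SPDK_BDEV_MODULE_REGISTER(example, &example_if)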
8567 */ 8568 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 8569 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8570 } else { 8571 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8572 } 8573 } 8574 8575 struct spdk_bdev_module * 8576 spdk_bdev_module_list_find(const char *name) 8577 { 8578 struct spdk_bdev_module *bdev_module; 8579 8580 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 8581 if (strcmp(name, bdev_module->name) == 0) { 8582 break; 8583 } 8584 } 8585 8586 return bdev_module; 8587 } 8588 8589 static int 8590 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 8591 { 8592 uint64_t num_blocks; 8593 void *md_buf = NULL; 8594 8595 num_blocks = bdev_io->u.bdev.num_blocks; 8596 8597 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 8598 md_buf = (char *)g_bdev_mgr.zero_buffer + 8599 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 8600 } 8601 8602 return bdev_write_blocks_with_md(bdev_io->internal.desc, 8603 spdk_io_channel_from_ctx(bdev_io->internal.ch), 8604 g_bdev_mgr.zero_buffer, md_buf, 8605 bdev_io->u.bdev.offset_blocks, num_blocks, 8606 bdev_write_zero_buffer_done, bdev_io); 8607 } 8608 8609 static void 8610 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 8611 { 8612 struct spdk_bdev_io *parent_io = cb_arg; 8613 8614 spdk_bdev_free_io(bdev_io); 8615 8616 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 8617 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 8618 } 8619 8620 static void 8621 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 8622 { 8623 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8624 ctx->bdev->internal.qos_mod_in_progress = false; 8625 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8626 8627 if (ctx->cb_fn) { 8628 ctx->cb_fn(ctx->cb_arg, status); 8629 } 8630 free(ctx); 8631 } 8632 8633 static void 8634 bdev_disable_qos_done(void *cb_arg) 8635 { 8636 struct set_qos_limit_ctx *ctx = cb_arg; 8637 struct spdk_bdev *bdev = ctx->bdev; 8638 struct spdk_bdev_io *bdev_io; 8639 struct spdk_bdev_qos *qos; 8640 8641 spdk_spin_lock(&bdev->internal.spinlock); 8642 qos = bdev->internal.qos; 8643 bdev->internal.qos = NULL; 8644 spdk_spin_unlock(&bdev->internal.spinlock); 8645 8646 while (!TAILQ_EMPTY(&qos->queued)) { 8647 /* Send queued I/O back to their original thread for resubmission. */ 8648 bdev_io = TAILQ_FIRST(&qos->queued); 8649 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 8650 8651 if (bdev_io->internal.io_submit_ch) { 8652 /* 8653 * Channel was changed when sending it to the QoS thread - change it back 8654 * before sending it back to the original thread. 
8655 */ 8656 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 8657 bdev_io->internal.io_submit_ch = NULL; 8658 } 8659 8660 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 8661 _bdev_io_submit, bdev_io); 8662 } 8663 8664 if (qos->thread != NULL) { 8665 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 8666 spdk_poller_unregister(&qos->poller); 8667 } 8668 8669 free(qos); 8670 8671 bdev_set_qos_limit_done(ctx, 0); 8672 } 8673 8674 static void 8675 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 8676 { 8677 struct set_qos_limit_ctx *ctx = _ctx; 8678 struct spdk_thread *thread; 8679 8680 spdk_spin_lock(&bdev->internal.spinlock); 8681 thread = bdev->internal.qos->thread; 8682 spdk_spin_unlock(&bdev->internal.spinlock); 8683 8684 if (thread != NULL) { 8685 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 8686 } else { 8687 bdev_disable_qos_done(ctx); 8688 } 8689 } 8690 8691 static void 8692 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8693 struct spdk_io_channel *ch, void *_ctx) 8694 { 8695 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8696 8697 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 8698 8699 spdk_bdev_for_each_channel_continue(i, 0); 8700 } 8701 8702 static void 8703 bdev_update_qos_rate_limit_msg(void *cb_arg) 8704 { 8705 struct set_qos_limit_ctx *ctx = cb_arg; 8706 struct spdk_bdev *bdev = ctx->bdev; 8707 8708 spdk_spin_lock(&bdev->internal.spinlock); 8709 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 8710 spdk_spin_unlock(&bdev->internal.spinlock); 8711 8712 bdev_set_qos_limit_done(ctx, 0); 8713 } 8714 8715 static void 8716 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8717 struct spdk_io_channel *ch, void *_ctx) 8718 { 8719 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8720 8721 spdk_spin_lock(&bdev->internal.spinlock); 8722 bdev_enable_qos(bdev, bdev_ch); 8723 spdk_spin_unlock(&bdev->internal.spinlock); 8724 spdk_bdev_for_each_channel_continue(i, 0); 8725 } 8726 8727 static void 8728 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 8729 { 8730 struct set_qos_limit_ctx *ctx = _ctx; 8731 8732 bdev_set_qos_limit_done(ctx, status); 8733 } 8734 8735 static void 8736 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 8737 { 8738 int i; 8739 8740 assert(bdev->internal.qos != NULL); 8741 8742 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8743 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 8744 bdev->internal.qos->rate_limits[i].limit = limits[i]; 8745 8746 if (limits[i] == 0) { 8747 bdev->internal.qos->rate_limits[i].limit = 8748 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 8749 } 8750 } 8751 } 8752 } 8753 8754 void 8755 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 8756 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 8757 { 8758 struct set_qos_limit_ctx *ctx; 8759 uint32_t limit_set_complement; 8760 uint64_t min_limit_per_sec; 8761 int i; 8762 bool disable_rate_limit = true; 8763 8764 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8765 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 8766 continue; 8767 } 8768 8769 if (limits[i] > 0) { 8770 disable_rate_limit = false; 8771 } 8772 8773 if (bdev_qos_is_iops_rate_limit(i) == true) { 8774 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 8775 } else { 8776 /* Change from megabyte to byte rate limit */ 8777 limits[i] = limits[i] * 1024 * 1024; 8778 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 
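			/*
			 * For example, a caller that passed rw_mbytes_per_sec = 100
			 * (limits[SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT] = 100) ends up with
			 * 104857600 bytes/s here. The rounding below therefore only
			 * matters in practice for IOPS limits, e.g. a request for 1500
			 * IO/s is rounded up to 2000 since SPDK_BDEV_QOS_MIN_IOS_PER_SEC
			 * is 1000.
			 */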
8779 } 8780 8781 limit_set_complement = limits[i] % min_limit_per_sec; 8782 if (limit_set_complement) { 8783 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 8784 limits[i], min_limit_per_sec); 8785 limits[i] += min_limit_per_sec - limit_set_complement; 8786 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 8787 } 8788 } 8789 8790 ctx = calloc(1, sizeof(*ctx)); 8791 if (ctx == NULL) { 8792 cb_fn(cb_arg, -ENOMEM); 8793 return; 8794 } 8795 8796 ctx->cb_fn = cb_fn; 8797 ctx->cb_arg = cb_arg; 8798 ctx->bdev = bdev; 8799 8800 spdk_spin_lock(&bdev->internal.spinlock); 8801 if (bdev->internal.qos_mod_in_progress) { 8802 spdk_spin_unlock(&bdev->internal.spinlock); 8803 free(ctx); 8804 cb_fn(cb_arg, -EAGAIN); 8805 return; 8806 } 8807 bdev->internal.qos_mod_in_progress = true; 8808 8809 if (disable_rate_limit == true && bdev->internal.qos) { 8810 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8811 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 8812 (bdev->internal.qos->rate_limits[i].limit > 0 && 8813 bdev->internal.qos->rate_limits[i].limit != 8814 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 8815 disable_rate_limit = false; 8816 break; 8817 } 8818 } 8819 } 8820 8821 if (disable_rate_limit == false) { 8822 if (bdev->internal.qos == NULL) { 8823 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 8824 if (!bdev->internal.qos) { 8825 spdk_spin_unlock(&bdev->internal.spinlock); 8826 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 8827 bdev_set_qos_limit_done(ctx, -ENOMEM); 8828 return; 8829 } 8830 } 8831 8832 if (bdev->internal.qos->thread == NULL) { 8833 /* Enabling */ 8834 bdev_set_qos_rate_limits(bdev, limits); 8835 8836 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 8837 bdev_enable_qos_done); 8838 } else { 8839 /* Updating */ 8840 bdev_set_qos_rate_limits(bdev, limits); 8841 8842 spdk_thread_send_msg(bdev->internal.qos->thread, 8843 bdev_update_qos_rate_limit_msg, ctx); 8844 } 8845 } else { 8846 if (bdev->internal.qos != NULL) { 8847 bdev_set_qos_rate_limits(bdev, limits); 8848 8849 /* Disabling */ 8850 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 8851 bdev_disable_qos_msg_done); 8852 } else { 8853 spdk_spin_unlock(&bdev->internal.spinlock); 8854 bdev_set_qos_limit_done(ctx, 0); 8855 return; 8856 } 8857 } 8858 8859 spdk_spin_unlock(&bdev->internal.spinlock); 8860 } 8861 8862 struct spdk_bdev_histogram_ctx { 8863 spdk_bdev_histogram_status_cb cb_fn; 8864 void *cb_arg; 8865 struct spdk_bdev *bdev; 8866 int status; 8867 }; 8868 8869 static void 8870 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8871 { 8872 struct spdk_bdev_histogram_ctx *ctx = _ctx; 8873 8874 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8875 ctx->bdev->internal.histogram_in_progress = false; 8876 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8877 ctx->cb_fn(ctx->cb_arg, ctx->status); 8878 free(ctx); 8879 } 8880 8881 static void 8882 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8883 struct spdk_io_channel *_ch, void *_ctx) 8884 { 8885 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8886 8887 if (ch->histogram != NULL) { 8888 spdk_histogram_data_free(ch->histogram); 8889 ch->histogram = NULL; 8890 } 8891 spdk_bdev_for_each_channel_continue(i, 0); 8892 } 8893 8894 static void 8895 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8896 { 8897 struct spdk_bdev_histogram_ctx *ctx = _ctx; 8898 8899 if (status 
!= 0) { 8900 ctx->status = status; 8901 ctx->bdev->internal.histogram_enabled = false; 8902 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 8903 bdev_histogram_disable_channel_cb); 8904 } else { 8905 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8906 ctx->bdev->internal.histogram_in_progress = false; 8907 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8908 ctx->cb_fn(ctx->cb_arg, ctx->status); 8909 free(ctx); 8910 } 8911 } 8912 8913 static void 8914 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8915 struct spdk_io_channel *_ch, void *_ctx) 8916 { 8917 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8918 int status = 0; 8919 8920 if (ch->histogram == NULL) { 8921 ch->histogram = spdk_histogram_data_alloc(); 8922 if (ch->histogram == NULL) { 8923 status = -ENOMEM; 8924 } 8925 } 8926 8927 spdk_bdev_for_each_channel_continue(i, status); 8928 } 8929 8930 void 8931 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 8932 void *cb_arg, bool enable) 8933 { 8934 struct spdk_bdev_histogram_ctx *ctx; 8935 8936 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 8937 if (ctx == NULL) { 8938 cb_fn(cb_arg, -ENOMEM); 8939 return; 8940 } 8941 8942 ctx->bdev = bdev; 8943 ctx->status = 0; 8944 ctx->cb_fn = cb_fn; 8945 ctx->cb_arg = cb_arg; 8946 8947 spdk_spin_lock(&bdev->internal.spinlock); 8948 if (bdev->internal.histogram_in_progress) { 8949 spdk_spin_unlock(&bdev->internal.spinlock); 8950 free(ctx); 8951 cb_fn(cb_arg, -EAGAIN); 8952 return; 8953 } 8954 8955 bdev->internal.histogram_in_progress = true; 8956 spdk_spin_unlock(&bdev->internal.spinlock); 8957 8958 bdev->internal.histogram_enabled = enable; 8959 8960 if (enable) { 8961 /* Allocate histogram for each channel */ 8962 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 8963 bdev_histogram_enable_channel_cb); 8964 } else { 8965 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 8966 bdev_histogram_disable_channel_cb); 8967 } 8968 } 8969 8970 struct spdk_bdev_histogram_data_ctx { 8971 spdk_bdev_histogram_data_cb cb_fn; 8972 void *cb_arg; 8973 struct spdk_bdev *bdev; 8974 /** merged histogram data from all channels */ 8975 struct spdk_histogram_data *histogram; 8976 }; 8977 8978 static void 8979 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8980 { 8981 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 8982 8983 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 8984 free(ctx); 8985 } 8986 8987 static void 8988 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8989 struct spdk_io_channel *_ch, void *_ctx) 8990 { 8991 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8992 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 8993 int status = 0; 8994 8995 if (ch->histogram == NULL) { 8996 status = -EFAULT; 8997 } else { 8998 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 8999 } 9000 9001 spdk_bdev_for_each_channel_continue(i, status); 9002 } 9003 9004 void 9005 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 9006 spdk_bdev_histogram_data_cb cb_fn, 9007 void *cb_arg) 9008 { 9009 struct spdk_bdev_histogram_data_ctx *ctx; 9010 9011 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 9012 if (ctx == NULL) { 9013 cb_fn(cb_arg, -ENOMEM, NULL); 9014 return; 9015 } 9016 9017 ctx->bdev = bdev; 9018 ctx->cb_fn = cb_fn; 9019 ctx->cb_arg = cb_arg; 9020 9021 ctx->histogram = histogram; 
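	/*
	 * Illustrative usage sketch (hypothetical example_* callback names):
	 * enable per-channel histograms first, then periodically merge them:
	 *
	 *	spdk_bdev_histogram_enable(bdev, example_enable_cb, NULL, true);
	 *	...
	 *	histogram = spdk_histogram_data_alloc();
	 *	spdk_bdev_histogram_get(bdev, histogram, example_data_cb, NULL);
	 *
	 * example_data_cb() receives the merged data; the caller typically frees
	 * the histogram with spdk_histogram_data_free() once it is done with it.
	 */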
9022 9023 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 9024 bdev_histogram_get_channel_cb); 9025 } 9026 9027 void 9028 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 9029 void *cb_arg) 9030 { 9031 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9032 int status = 0; 9033 9034 assert(cb_fn != NULL); 9035 9036 if (bdev_ch->histogram == NULL) { 9037 status = -EFAULT; 9038 } 9039 cb_fn(cb_arg, status, bdev_ch->histogram); 9040 } 9041 9042 size_t 9043 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 9044 size_t max_events) 9045 { 9046 struct media_event_entry *entry; 9047 size_t num_events = 0; 9048 9049 for (; num_events < max_events; ++num_events) { 9050 entry = TAILQ_FIRST(&desc->pending_media_events); 9051 if (entry == NULL) { 9052 break; 9053 } 9054 9055 events[num_events] = entry->event; 9056 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 9057 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 9058 } 9059 9060 return num_events; 9061 } 9062 9063 int 9064 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 9065 size_t num_events) 9066 { 9067 struct spdk_bdev_desc *desc; 9068 struct media_event_entry *entry; 9069 size_t event_id; 9070 int rc = 0; 9071 9072 assert(bdev->media_events); 9073 9074 spdk_spin_lock(&bdev->internal.spinlock); 9075 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9076 if (desc->write) { 9077 break; 9078 } 9079 } 9080 9081 if (desc == NULL || desc->media_events_buffer == NULL) { 9082 rc = -ENODEV; 9083 goto out; 9084 } 9085 9086 for (event_id = 0; event_id < num_events; ++event_id) { 9087 entry = TAILQ_FIRST(&desc->free_media_events); 9088 if (entry == NULL) { 9089 break; 9090 } 9091 9092 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 9093 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 9094 entry->event = events[event_id]; 9095 } 9096 9097 rc = event_id; 9098 out: 9099 spdk_spin_unlock(&bdev->internal.spinlock); 9100 return rc; 9101 } 9102 9103 static void 9104 _media_management_notify(void *arg) 9105 { 9106 struct spdk_bdev_desc *desc = arg; 9107 9108 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 9109 } 9110 9111 void 9112 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 9113 { 9114 struct spdk_bdev_desc *desc; 9115 9116 spdk_spin_lock(&bdev->internal.spinlock); 9117 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9118 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 9119 event_notify(desc, _media_management_notify); 9120 } 9121 } 9122 spdk_spin_unlock(&bdev->internal.spinlock); 9123 } 9124 9125 struct locked_lba_range_ctx { 9126 struct lba_range range; 9127 struct spdk_bdev *bdev; 9128 struct lba_range *current_range; 9129 struct lba_range *owner_range; 9130 struct spdk_poller *poller; 9131 lock_range_cb cb_fn; 9132 void *cb_arg; 9133 }; 9134 9135 static void 9136 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9137 { 9138 struct locked_lba_range_ctx *ctx = _ctx; 9139 9140 ctx->cb_fn(ctx->cb_arg, -ENOMEM); 9141 free(ctx); 9142 } 9143 9144 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 9145 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 9146 9147 static void 9148 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9149 { 9150 struct locked_lba_range_ctx *ctx = _ctx; 9151 9152 if (status == -ENOMEM) { 9153 /* One of the channels could 
not allocate a range object. 9154 * So we have to go back and clean up any ranges that were 9155 * allocated successfully before we return error status to 9156 * the caller. We can reuse the unlock function to do that 9157 * clean up. 9158 */ 9159 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9160 bdev_lock_error_cleanup_cb); 9161 return; 9162 } 9163 9164 /* All channels have locked this range and no I/O overlapping the range 9165 * are outstanding! Set the owner_ch for the range object for the 9166 * locking channel, so that this channel will know that it is allowed 9167 * to write to this range. 9168 */ 9169 ctx->owner_range->owner_ch = ctx->range.owner_ch; 9170 ctx->cb_fn(ctx->cb_arg, status); 9171 9172 /* Don't free the ctx here. Its range is in the bdev's global list of 9173 * locked ranges still, and will be removed and freed when this range 9174 * is later unlocked. 9175 */ 9176 } 9177 9178 static int 9179 bdev_lock_lba_range_check_io(void *_i) 9180 { 9181 struct spdk_bdev_channel_iter *i = _i; 9182 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 9183 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9184 struct locked_lba_range_ctx *ctx = i->ctx; 9185 struct lba_range *range = ctx->current_range; 9186 struct spdk_bdev_io *bdev_io; 9187 9188 spdk_poller_unregister(&ctx->poller); 9189 9190 /* The range is now in the locked_ranges, so no new IO can be submitted to this 9191 * range. But we need to wait until any outstanding IO overlapping with this range 9192 * are completed. 9193 */ 9194 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 9195 if (bdev_io_range_is_locked(bdev_io, range)) { 9196 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 9197 return SPDK_POLLER_BUSY; 9198 } 9199 } 9200 9201 spdk_bdev_for_each_channel_continue(i, 0); 9202 return SPDK_POLLER_BUSY; 9203 } 9204 9205 static void 9206 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9207 struct spdk_io_channel *_ch, void *_ctx) 9208 { 9209 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9210 struct locked_lba_range_ctx *ctx = _ctx; 9211 struct lba_range *range; 9212 9213 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9214 if (range->length == ctx->range.length && 9215 range->offset == ctx->range.offset && 9216 range->locked_ctx == ctx->range.locked_ctx) { 9217 /* This range already exists on this channel, so don't add 9218 * it again. This can happen when a new channel is created 9219 * while the for_each_channel operation is in progress. 9220 * Do not check for outstanding I/O in that case, since the 9221 * range was locked before any I/O could be submitted to the 9222 * new channel. 9223 */ 9224 spdk_bdev_for_each_channel_continue(i, 0); 9225 return; 9226 } 9227 } 9228 9229 range = calloc(1, sizeof(*range)); 9230 if (range == NULL) { 9231 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 9232 return; 9233 } 9234 9235 range->length = ctx->range.length; 9236 range->offset = ctx->range.offset; 9237 range->locked_ctx = ctx->range.locked_ctx; 9238 ctx->current_range = range; 9239 if (ctx->range.owner_ch == ch) { 9240 /* This is the range object for the channel that will hold 9241 * the lock. Store it in the ctx object so that we can easily 9242 * set its owner_ch after the lock is finally acquired. 
static void
bdev_lock_lba_range_ctx_msg(void *_ctx)
{
	struct locked_lba_range_ctx *ctx = _ctx;

	bdev_lock_lba_range_ctx(ctx->bdev, ctx);
}

static void
bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct locked_lba_range_ctx *ctx = _ctx;
	struct locked_lba_range_ctx *pending_ctx;
	struct lba_range *range, *tmp;

	spdk_spin_lock(&bdev->internal.spinlock);
	/* Check if there are any pending locked ranges that overlap with this range
	 * that was just unlocked.  If there are, check that each of them doesn't
	 * overlap with any other locked range before calling bdev_lock_lba_range_ctx,
	 * which will start the lock process.
	 */
	TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) {
		if (bdev_lba_range_overlapped(range, &ctx->range) &&
		    !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) {
			TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq);
			pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
			TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq);
			spdk_thread_send_msg(spdk_io_channel_get_thread(pending_ctx->range.owner_ch->channel),
					     bdev_lock_lba_range_ctx_msg, pending_ctx);
		}
	}
	spdk_spin_unlock(&bdev->internal.spinlock);

	ctx->cb_fn(ctx->cb_arg, status);
	free(ctx);
}

static void
bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				  struct spdk_io_channel *_ch, void *_ctx)
{
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
	struct locked_lba_range_ctx *ctx = _ctx;
	TAILQ_HEAD(, spdk_bdev_io) io_locked;
	struct spdk_bdev_io *bdev_io;
	struct lba_range *range;

	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (ctx->range.offset == range->offset &&
		    ctx->range.length == range->length &&
		    ctx->range.locked_ctx == range->locked_ctx) {
			TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
			free(range);
			break;
		}
	}

	/* Note: we should almost always be able to assert that the range specified
	 * was found.  But there are some very rare corner cases where a new channel
	 * gets created simultaneously with a range unlock, where this function
	 * would execute on that new channel and wouldn't have the range.
	 * We also use this to clean up range allocations when a later allocation
	 * fails in the locking path.
	 * So we can't actually assert() here.
	 */

	/* Swap the locked IO into a temporary list, and then try to submit them again.
	 * We could hyper-optimize this to only resubmit locked I/O that overlap
	 * with the range that was just unlocked, but this isn't a performance path so
	 * we go for simplicity here.
	 */
	TAILQ_INIT(&io_locked);
	TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link);
	while (!TAILQ_EMPTY(&io_locked)) {
		bdev_io = TAILQ_FIRST(&io_locked);
		TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link);
		bdev_io_submit(bdev_io);
	}

	spdk_bdev_for_each_channel_continue(i, 0);
}

static int
bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		      uint64_t offset, uint64_t length,
		      lock_range_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
	struct locked_lba_range_ctx *ctx;
	struct lba_range *range;
	bool range_found = false;

	/* Let's make sure the specified channel actually has a lock on
	 * the specified range.  Note that the range must match exactly.
	 */
	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    range->owner_ch == ch && range->locked_ctx == cb_arg) {
			range_found = true;
			break;
		}
	}

	if (!range_found) {
		return -EINVAL;
	}

	spdk_spin_lock(&bdev->internal.spinlock);
	/* We confirmed that this channel has locked the specified range.  To
	 * start the unlock process, we find the range in the bdev's locked_ranges
	 * and remove it.  This ensures new channels don't inherit the locked range.
	 * Then we will send a message to each channel (including the one specified
	 * here) to remove the range from its per-channel list.
	 */
	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    range->locked_ctx == cb_arg) {
			break;
		}
	}
	if (range == NULL) {
		assert(false);
		spdk_spin_unlock(&bdev->internal.spinlock);
		return -EINVAL;
	}
	TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq);
	ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
	spdk_spin_unlock(&bdev->internal.spinlock);

	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
				   bdev_unlock_lba_range_cb);
	return 0;
}
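
/* Illustrative sketch (compiled out) of how an internal caller in this file
 * could serialize access to an LBA range with bdev_lock_lba_range() and
 * bdev_unlock_lba_range().  The context and callback names are hypothetical.
 * Note that cb_arg must be non-NULL for the lock, and the unlock must be issued
 * on the same channel with the same offset/length/cb_arg.
 */
#if 0
/* Hypothetical context for a caller that needs exclusive access to a range. */
struct example_range_op {
	struct spdk_bdev_desc *desc;
	struct spdk_io_channel *ch;
	uint64_t offset_blocks;
	uint64_t num_blocks;
};

static void
example_range_unlocked_cb(void *ctx, int status)
{
	/* The range is unlocked; queued I/O overlapping it has been resubmitted. */
}

static void
example_range_locked_cb(void *ctx, int status)
{
	struct example_range_op *op = ctx;

	if (status != 0) {
		/* The lock could not be taken (e.g. -ENOMEM on one of the channels). */
		return;
	}

	/* ... perform the operation that required exclusive access ... */

	/* Release the lock with the exact same offset/length/cb_arg. */
	bdev_unlock_lba_range(op->desc, op->ch, op->offset_blocks, op->num_blocks,
			      example_range_unlocked_cb, op);
}

static int
example_lock_range(struct example_range_op *op)
{
	/* cb_arg (here: op) must be non-NULL; it also identifies the lock owner. */
	return bdev_lock_lba_range(op->desc, op->ch, op->offset_blocks, op->num_blocks,
				   example_range_locked_cb, op);
}
#endif
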
int
spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains,
			     int array_size)
{
	if (!bdev) {
		return -EINVAL;
	}

	if (bdev->fn_table->get_memory_domains) {
		return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size);
	}

	return 0;
}
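
/* Illustrative sketch (compiled out) of querying a bdev's memory domains.  It
 * assumes the usual SPDK convention that the bdev module reports the total
 * number of domains it uses (which may exceed array_size), so the caller can
 * size the array in two passes.  The helper name is hypothetical.
 */
#if 0
static int
example_print_memory_domains(struct spdk_bdev *bdev)
{
	struct spdk_memory_domain **domains;
	int rc, num_domains;

	/* First pass: ask for the count only. */
	num_domains = spdk_bdev_get_memory_domains(bdev, NULL, 0);
	if (num_domains <= 0) {
		/* Negative: error; zero: the bdev only accesses local memory. */
		return num_domains;
	}

	domains = calloc(num_domains, sizeof(*domains));
	if (domains == NULL) {
		return -ENOMEM;
	}

	/* Second pass: fetch the domain pointers themselves. */
	rc = spdk_bdev_get_memory_domains(bdev, domains, num_domains);
	if (rc > 0) {
		SPDK_NOTICELOG("bdev %s uses %d memory domain(s)\n",
			       spdk_bdev_get_name(bdev), spdk_min(rc, num_domains));
	}

	free(domains);
	return rc;
}
#endif
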
struct spdk_bdev_for_each_io_ctx {
	void *ctx;
	spdk_bdev_io_fn fn;
	spdk_bdev_for_each_io_cb cb;
};

static void
bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
			 struct spdk_io_channel *io_ch, void *_ctx)
{
	struct spdk_bdev_for_each_io_ctx *ctx = _ctx;
	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
	struct spdk_bdev_io *bdev_io;
	int rc = 0;

	TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
		rc = ctx->fn(ctx->ctx, bdev_io);
		if (rc != 0) {
			break;
		}
	}

	spdk_bdev_for_each_channel_continue(i, rc);
}

static void
bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct spdk_bdev_for_each_io_ctx *ctx = _ctx;

	ctx->cb(ctx->ctx, status);

	free(ctx);
}

void
spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn,
			   spdk_bdev_for_each_io_cb cb)
{
	struct spdk_bdev_for_each_io_ctx *ctx;

	assert(fn != NULL && cb != NULL);

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		SPDK_ERRLOG("Failed to allocate context.\n");
		cb(_ctx, -ENOMEM);
		return;
	}

	ctx->ctx = _ctx;
	ctx->fn = fn;
	ctx->cb = cb;

	spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx,
				   bdev_for_each_io_done);
}
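
/* Illustrative sketch (compiled out): counting the I/O currently submitted on
 * every channel of a bdev with spdk_bdev_for_each_bdev_io().  The counter
 * context and function names are hypothetical; a non-zero return from the
 * per-I/O callback stops the walk and is reported to the done callback.
 */
#if 0
struct example_io_count_ctx {
	uint32_t in_flight;
};

static int
example_count_io_fn(void *ctx, struct spdk_bdev_io *bdev_io)
{
	struct example_io_count_ctx *count_ctx = ctx;

	count_ctx->in_flight++;

	/* Returning non-zero would stop the walk and be reported to the done cb. */
	return 0;
}

static void
example_count_io_done(void *ctx, int status)
{
	struct example_io_count_ctx *count_ctx = ctx;

	if (status == 0) {
		SPDK_NOTICELOG("%u bdev_io(s) currently submitted\n", count_ctx->in_flight);
	}

	free(count_ctx);
}

static void
example_count_io(struct spdk_bdev *bdev)
{
	struct example_io_count_ctx *count_ctx = calloc(1, sizeof(*count_ctx));

	if (count_ctx == NULL) {
		return;
	}

	spdk_bdev_for_each_bdev_io(bdev, count_ctx, example_count_io_fn, example_count_io_done);
}
#endif
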
void
spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status)
{
	spdk_for_each_channel_continue(iter->i, status);
}

static struct spdk_bdev *
io_channel_iter_get_bdev(struct spdk_io_channel_iter *i)
{
	void *io_device = spdk_io_channel_iter_get_io_device(i);

	return __bdev_from_io_dev(io_device);
}

static void
bdev_each_channel_msg(struct spdk_io_channel_iter *i)
{
	struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);

	iter->i = i;
	iter->fn(iter, bdev, ch, iter->ctx);
}

static void
bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);

	iter->i = i;
	iter->cpl(bdev, iter->ctx, status);

	free(iter);
}

void
spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn,
			   void *ctx, spdk_bdev_for_each_channel_done cpl)
{
	struct spdk_bdev_channel_iter *iter;

	assert(bdev != NULL && fn != NULL && ctx != NULL);

	iter = calloc(1, sizeof(struct spdk_bdev_channel_iter));
	if (iter == NULL) {
		SPDK_ERRLOG("Unable to allocate iterator\n");
		assert(false);
		return;
	}

	iter->fn = fn;
	iter->cpl = cpl;
	iter->ctx = ctx;

	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg,
			      iter, bdev_each_channel_cpl);
}
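
/* Illustrative sketch (compiled out) of the spdk_bdev_for_each_channel()
 * pattern used throughout this file: the per-channel function runs on the
 * thread that owns each channel and must call
 * spdk_bdev_for_each_channel_continue(), and the completion callback runs once
 * every channel has been visited.  Function names here are hypothetical, and
 * ctx must be non-NULL (see the assert above).
 */
#if 0
static void
example_visit_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
		      struct spdk_io_channel *ch, void *ctx)
{
	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch);

	/* Runs on the thread owning this channel; per-channel state is safe to touch. */
	(void)bdev_ch;

	/* Always hand control back to the iterator, passing 0 or a negative errno. */
	spdk_bdev_for_each_channel_continue(i, 0);
}

static void
example_visit_done(struct spdk_bdev *bdev, void *ctx, int status)
{
	/* Called once, after every channel was visited (or the walk was aborted). */
	SPDK_NOTICELOG("per-channel walk of %s finished: %d\n",
		       spdk_bdev_get_name(bdev), status);
}

static void
example_walk_channels(struct spdk_bdev *bdev, void *ctx)
{
	spdk_bdev_for_each_channel(bdev, example_visit_channel, ctx, example_visit_done);
}
#endif
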
static void
bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_bdev_io *parent_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	/* Check return status of write */
	parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
	parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx);
}

static void
bdev_copy_do_write(void *_bdev_io)
{
	struct spdk_bdev_io *bdev_io = _bdev_io;
	int rc;

	/* Write blocks */
	rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc,
					    spdk_io_channel_from_ctx(bdev_io->internal.ch),
					    bdev_io->u.bdev.iovs[0].iov_base,
					    bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks,
					    bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io);

	if (rc == -ENOMEM) {
		bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write);
	} else if (rc != 0) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
	}
}

static void
bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_bdev_io *parent_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	/* Check return status of read */
	if (!success) {
		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
		return;
	}

	/* Do write */
	bdev_copy_do_write(parent_io);
}

static void
bdev_copy_do_read(void *_bdev_io)
{
	struct spdk_bdev_io *bdev_io = _bdev_io;
	int rc;

	/* Read blocks */
	rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc,
					   spdk_io_channel_from_ctx(bdev_io->internal.ch),
					   bdev_io->u.bdev.iovs[0].iov_base,
					   bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks,
					   bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io);

	if (rc == -ENOMEM) {
		bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read);
	} else if (rc != 0) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
	}
}

static void
bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
{
	if (!success) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
		return;
	}

	bdev_copy_do_read(bdev_io);
}

int
spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		      uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks,
		      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (num_blocks == 0) {
		SPDK_ERRLOG("Can't copy 0 blocks\n");
		return -EINVAL;
	}

	if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) ||
	    !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) {
		SPDK_DEBUGLOG(bdev,
			      "Invalid offset or number of blocks: dst %" PRIu64 ", src %" PRIu64 ", count %" PRIu64 "\n",
			      dst_offset_blocks, src_offset_blocks, num_blocks);
		return -EINVAL;
	}

	bdev_io = bdev_channel_get_io(channel);
	if (!bdev_io) {
		return -ENOMEM;
	}

	bdev_io->internal.ch = channel;
	bdev_io->internal.desc = desc;
	bdev_io->type = SPDK_BDEV_IO_TYPE_COPY;

	bdev_io->u.bdev.offset_blocks = dst_offset_blocks;
	bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.memory_domain = NULL;
	bdev_io->u.bdev.memory_domain_ctx = NULL;
	bdev_io->u.bdev.iovs = NULL;
	bdev_io->u.bdev.iovcnt = 0;
	bdev_io->u.bdev.md_buf = NULL;
	bdev_io->u.bdev.accel_sequence = NULL;
	bdev_io_init(bdev_io, bdev, cb_arg, cb);

	if (dst_offset_blocks == src_offset_blocks) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
		bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx);

		return 0;
	}

	/* If the copy size is large and should be split, use the generic split logic
	 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not.
	 *
	 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or
	 * emulate it using regular read and write requests otherwise.
	 */
	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) ||
	    bdev_io->internal.split) {
		bdev_io_submit(bdev_io);
		return 0;
	}

	spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev));

	return 0;
}
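
/* Illustrative sketch (compiled out) of a caller issuing a block copy through
 * spdk_bdev_copy_blocks().  Whether the request is offloaded to the module or
 * emulated with read/write above is transparent to the caller.  The function
 * names, offsets, and lengths below are hypothetical.
 */
#if 0
static void
example_copy_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	/* Always release the bdev_io, then act on the result. */
	spdk_bdev_free_io(bdev_io);

	if (!success) {
		SPDK_ERRLOG("copy failed\n");
	}
}

static int
example_copy(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch)
{
	/* Hypothetical offsets/length, in blocks; desc must be opened for writing. */
	uint64_t dst_offset_blocks = 1024;
	uint64_t src_offset_blocks = 0;
	uint64_t num_blocks = 256;
	int rc;

	rc = spdk_bdev_copy_blocks(desc, ch, dst_offset_blocks, src_offset_blocks, num_blocks,
				   example_copy_complete, NULL);
	if (rc == -ENOMEM) {
		/* No bdev_io was available; a real caller could retry later, e.g.
		 * via spdk_bdev_queue_io_wait().
		 */
	}

	return rc;
}
#endif
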
SPDK_LOG_REGISTER_COMPONENT(bdev)

SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
{
	struct spdk_trace_tpoint_opts opts[] = {
		{
			"BDEV_IO_START", TRACE_BDEV_IO_START,
			OWNER_BDEV, OBJECT_BDEV_IO, 1,
			{
				{ "type", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
				{ "offset", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "len", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40}
			}
		},
		{
			"BDEV_IO_DONE", TRACE_BDEV_IO_DONE,
			OWNER_BDEV, OBJECT_BDEV_IO, 0,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		},
		{
			"BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE,
			OWNER_BDEV, OBJECT_NONE, 1,
			{
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8}
			}
		},
		{
			"BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY,
			OWNER_BDEV, OBJECT_NONE, 0,
			{
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8}
			}
		},
	};

	spdk_trace_register_owner(OWNER_BDEV, 'b');
	spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0);
}