1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #include "spdk/bdev.h" 10 11 #include "spdk/accel.h" 12 #include "spdk/config.h" 13 #include "spdk/env.h" 14 #include "spdk/thread.h" 15 #include "spdk/likely.h" 16 #include "spdk/queue.h" 17 #include "spdk/nvme_spec.h" 18 #include "spdk/scsi_spec.h" 19 #include "spdk/notify.h" 20 #include "spdk/util.h" 21 #include "spdk/trace.h" 22 #include "spdk/dma.h" 23 24 #include "spdk/bdev_module.h" 25 #include "spdk/log.h" 26 #include "spdk/string.h" 27 28 #include "bdev_internal.h" 29 #include "spdk_internal/trace_defs.h" 30 #include "spdk_internal/assert.h" 31 32 #ifdef SPDK_CONFIG_VTUNE 33 #include "ittnotify.h" 34 #include "ittnotify_types.h" 35 int __itt_init_ittlib(const char *, __itt_group_id); 36 #endif 37 38 #define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1) 39 #define SPDK_BDEV_IO_CACHE_SIZE 256 40 #define SPDK_BDEV_AUTO_EXAMINE true 41 #define BUF_SMALL_POOL_SIZE 8191 42 #define BUF_LARGE_POOL_SIZE 1023 43 #define BUF_SMALL_CACHE_SIZE 128 44 #define BUF_LARGE_CACHE_SIZE 16 45 #define NOMEM_THRESHOLD_COUNT 8 46 47 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 48 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 49 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512 50 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000 51 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024) 52 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX 53 #define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000 54 55 /* The maximum number of children requests for a UNMAP or WRITE ZEROES command 56 * when splitting into children requests at a time. 57 */ 58 #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8) 59 #define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000 60 61 /* The maximum number of children requests for a COPY command 62 * when splitting into children requests at a time. 
63 */ 64 #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8) 65 66 #define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \ 67 log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev) 68 #ifdef DEBUG 69 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \ 70 log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev) 71 #else 72 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0) 73 #endif 74 75 static void log_already_claimed(enum spdk_log_level level, const int line, const char *func, 76 const char *detail, struct spdk_bdev *bdev); 77 78 SPDK_LOG_DEPRECATION_REGISTER(vtune_support, "Intel(R) VTune integration", "SPDK 23.05", 0); 79 80 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 81 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 82 }; 83 84 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 85 86 RB_HEAD(bdev_name_tree, spdk_bdev_name); 87 88 static int 89 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2) 90 { 91 return strcmp(name1->name, name2->name); 92 } 93 94 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp); 95 96 struct spdk_bdev_mgr { 97 struct spdk_mempool *bdev_io_pool; 98 99 void *zero_buffer; 100 101 TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 102 103 struct spdk_bdev_list bdevs; 104 struct bdev_name_tree bdev_names; 105 106 bool init_complete; 107 bool module_init_complete; 108 109 struct spdk_spinlock spinlock; 110 111 #ifdef SPDK_CONFIG_VTUNE 112 __itt_domain *domain; 113 #endif 114 }; 115 116 static struct spdk_bdev_mgr g_bdev_mgr = { 117 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 118 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 119 .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names), 120 .init_complete = false, 121 .module_init_complete = false, 122 }; 123 124 static void 125 __attribute__((constructor)) 126 _bdev_init(void) 127 { 128 spdk_spin_init(&g_bdev_mgr.spinlock); 129 } 130 131 typedef void (*lock_range_cb)(void *ctx, int status); 132 133 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc); 134 135 struct lba_range { 136 uint64_t offset; 137 uint64_t length; 138 void *locked_ctx; 139 struct spdk_bdev_channel *owner_ch; 140 TAILQ_ENTRY(lba_range) tailq; 141 }; 142 143 static struct spdk_bdev_opts g_bdev_opts = { 144 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 145 .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, 146 .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, 147 .small_buf_pool_size = BUF_SMALL_POOL_SIZE, 148 .large_buf_pool_size = BUF_LARGE_POOL_SIZE, 149 }; 150 151 static spdk_bdev_init_cb g_init_cb_fn = NULL; 152 static void *g_init_cb_arg = NULL; 153 154 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 155 static void *g_fini_cb_arg = NULL; 156 static struct spdk_thread *g_fini_thread = NULL; 157 158 struct spdk_bdev_qos_limit { 159 /** IOs or bytes allowed per second (i.e., 1s). */ 160 uint64_t limit; 161 162 /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). 163 * For remaining bytes, allowed to run negative if an I/O is submitted when 164 * some bytes are remaining, but the I/O is bigger than that amount. The 165 * excess will be deducted from the next timeslice. 166 */ 167 int64_t remaining_this_timeslice; 168 169 /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 170 uint32_t min_per_timeslice; 171 172 /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 173 uint32_t max_per_timeslice; 174 175 /** Function to check whether to queue the IO. 
*/ 176 bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 177 178 /** Function to update for the submitted IO. */ 179 void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 180 }; 181 182 struct spdk_bdev_qos { 183 /** Types of structure of rate limits. */ 184 struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 185 186 /** The channel that all I/O are funneled through. */ 187 struct spdk_bdev_channel *ch; 188 189 /** The thread on which the poller is running. */ 190 struct spdk_thread *thread; 191 192 /** Queue of I/O waiting to be issued. */ 193 bdev_io_tailq_t queued; 194 195 /** Size of a timeslice in tsc ticks. */ 196 uint64_t timeslice_size; 197 198 /** Timestamp of start of last timeslice. */ 199 uint64_t last_timeslice; 200 201 /** Poller that processes queued I/O commands each time slice. */ 202 struct spdk_poller *poller; 203 }; 204 205 struct spdk_bdev_mgmt_channel { 206 /* 207 * Each thread keeps a cache of bdev_io - this allows 208 * bdev threads which are *not* DPDK threads to still 209 * benefit from a per-thread bdev_io cache. Without 210 * this, non-DPDK threads fetching from the mempool 211 * incur a cmpxchg on get and put. 212 */ 213 bdev_io_stailq_t per_thread_cache; 214 uint32_t per_thread_cache_count; 215 uint32_t bdev_io_cache_size; 216 217 struct spdk_iobuf_channel iobuf; 218 219 TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; 220 TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; 221 }; 222 223 /* 224 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device 225 * will queue here their IO that awaits retry. It makes it possible to retry sending 226 * IO to one bdev after IO from other bdev completes. 227 */ 228 struct spdk_bdev_shared_resource { 229 /* The bdev management channel */ 230 struct spdk_bdev_mgmt_channel *mgmt_ch; 231 232 /* 233 * Count of I/O submitted to bdev module and waiting for completion. 234 * Incremented before submit_request() is called on an spdk_bdev_io. 235 */ 236 uint64_t io_outstanding; 237 238 /* 239 * Queue of IO awaiting retry because of a previous NOMEM status returned 240 * on this channel. 241 */ 242 bdev_io_tailq_t nomem_io; 243 244 /* 245 * Threshold which io_outstanding must drop to before retrying nomem_io. 246 */ 247 uint64_t nomem_threshold; 248 249 /* I/O channel allocated by a bdev module */ 250 struct spdk_io_channel *shared_ch; 251 252 /* Refcount of bdev channels using this resource */ 253 uint32_t ref; 254 255 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 256 }; 257 258 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 259 #define BDEV_CH_QOS_ENABLED (1 << 1) 260 261 struct spdk_bdev_channel { 262 struct spdk_bdev *bdev; 263 264 /* The channel for the underlying device */ 265 struct spdk_io_channel *channel; 266 267 /* Accel channel */ 268 struct spdk_io_channel *accel_channel; 269 270 /* Per io_device per thread data */ 271 struct spdk_bdev_shared_resource *shared_resource; 272 273 struct spdk_bdev_io_stat *stat; 274 275 /* 276 * Count of I/O submitted to the underlying dev module through this channel 277 * and waiting for completion. 278 */ 279 uint64_t io_outstanding; 280 281 /* 282 * List of all submitted I/Os including I/O that are generated via splitting. 283 */ 284 bdev_io_tailq_t io_submitted; 285 286 /* 287 * List of spdk_bdev_io that are currently queued because they write to a locked 288 * LBA range. 
289 */ 290 bdev_io_tailq_t io_locked; 291 292 /* List of I/Os with accel sequence being currently executed */ 293 bdev_io_tailq_t io_accel_exec; 294 295 /* List of I/Os doing memory domain pull/push */ 296 bdev_io_tailq_t io_memory_domain; 297 298 uint32_t flags; 299 300 struct spdk_histogram_data *histogram; 301 302 #ifdef SPDK_CONFIG_VTUNE 303 uint64_t start_tsc; 304 uint64_t interval_tsc; 305 __itt_string_handle *handle; 306 struct spdk_bdev_io_stat *prev_stat; 307 #endif 308 309 bdev_io_tailq_t queued_resets; 310 311 lba_range_tailq_t locked_ranges; 312 }; 313 314 struct media_event_entry { 315 struct spdk_bdev_media_event event; 316 TAILQ_ENTRY(media_event_entry) tailq; 317 }; 318 319 #define MEDIA_EVENT_POOL_SIZE 64 320 321 struct spdk_bdev_desc { 322 struct spdk_bdev *bdev; 323 struct spdk_thread *thread; 324 struct { 325 spdk_bdev_event_cb_t event_fn; 326 void *ctx; 327 } callback; 328 bool closed; 329 bool write; 330 bool memory_domains_supported; 331 bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES]; 332 struct spdk_spinlock spinlock; 333 uint32_t refs; 334 TAILQ_HEAD(, media_event_entry) pending_media_events; 335 TAILQ_HEAD(, media_event_entry) free_media_events; 336 struct media_event_entry *media_events_buffer; 337 TAILQ_ENTRY(spdk_bdev_desc) link; 338 339 uint64_t timeout_in_sec; 340 spdk_bdev_io_timeout_cb cb_fn; 341 void *cb_arg; 342 struct spdk_poller *io_timeout_poller; 343 struct spdk_bdev_module_claim *claim; 344 }; 345 346 struct spdk_bdev_iostat_ctx { 347 struct spdk_bdev_io_stat *stat; 348 spdk_bdev_get_device_stat_cb cb; 349 void *cb_arg; 350 }; 351 352 struct set_qos_limit_ctx { 353 void (*cb_fn)(void *cb_arg, int status); 354 void *cb_arg; 355 struct spdk_bdev *bdev; 356 }; 357 358 struct spdk_bdev_channel_iter { 359 spdk_bdev_for_each_channel_msg fn; 360 spdk_bdev_for_each_channel_done cpl; 361 struct spdk_io_channel_iter *i; 362 void *ctx; 363 }; 364 365 struct spdk_bdev_io_error_stat { 366 uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS]; 367 }; 368 369 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 370 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 371 #define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch)) 372 #define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch)) 373 374 static inline void bdev_io_complete(void *ctx); 375 static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io); 376 377 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 378 static void bdev_write_zero_buffer_next(void *_bdev_io); 379 380 static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 381 struct spdk_io_channel *ch, void *_ctx); 382 static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status); 383 384 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 385 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 386 uint64_t num_blocks, 387 struct spdk_memory_domain *domain, void *domain_ctx, 388 struct spdk_accel_sequence *seq, 389 spdk_bdev_io_completion_cb cb, void *cb_arg); 390 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 391 struct iovec *iov, int iovcnt, void *md_buf, 392 uint64_t offset_blocks, uint64_t num_blocks, 393 struct spdk_memory_domain *domain, void *domain_ctx, 394 struct spdk_accel_sequence *seq, 395 
spdk_bdev_io_completion_cb cb, void *cb_arg); 396 397 static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 398 uint64_t offset, uint64_t length, 399 lock_range_cb cb_fn, void *cb_arg); 400 401 static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 402 uint64_t offset, uint64_t length, 403 lock_range_cb cb_fn, void *cb_arg); 404 405 static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort); 406 static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort); 407 408 static bool claim_type_is_v2(enum spdk_bdev_claim_type type); 409 static void bdev_desc_release_claims(struct spdk_bdev_desc *desc); 410 static void claim_reset(struct spdk_bdev *bdev); 411 412 #define bdev_get_ext_io_opt(opts, field, defval) \ 413 (((opts) != NULL && offsetof(struct spdk_bdev_ext_io_opts, field) + \ 414 sizeof((opts)->field) <= sizeof(*(opts))) ? (opts)->field : (defval)) 415 416 void 417 spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size) 418 { 419 if (!opts) { 420 SPDK_ERRLOG("opts should not be NULL\n"); 421 return; 422 } 423 424 if (!opts_size) { 425 SPDK_ERRLOG("opts_size should not be zero value\n"); 426 return; 427 } 428 429 opts->opts_size = opts_size; 430 431 #define SET_FIELD(field) \ 432 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \ 433 opts->field = g_bdev_opts.field; \ 434 } \ 435 436 SET_FIELD(bdev_io_pool_size); 437 SET_FIELD(bdev_io_cache_size); 438 SET_FIELD(bdev_auto_examine); 439 SET_FIELD(small_buf_pool_size); 440 SET_FIELD(large_buf_pool_size); 441 442 /* Do not remove this statement, you should always update this statement when you adding a new field, 443 * and do not forget to add the SET_FIELD statement for your added field. */ 444 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size"); 445 446 #undef SET_FIELD 447 } 448 449 SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_small_buf_pool_size, "spdk_bdev_opts.small_buf_pool_size", 450 "v23.05", 0); 451 SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_large_buf_pool_size, "spdk_bdev_opts.large_buf_pool_size", 452 "v23.05", 0); 453 int 454 spdk_bdev_set_opts(struct spdk_bdev_opts *opts) 455 { 456 struct spdk_iobuf_opts iobuf_opts; 457 uint32_t min_pool_size; 458 int rc; 459 460 if (!opts) { 461 SPDK_ERRLOG("opts cannot be NULL\n"); 462 return -1; 463 } 464 465 if (!opts->opts_size) { 466 SPDK_ERRLOG("opts_size inside opts cannot be zero value\n"); 467 return -1; 468 } 469 470 /* 471 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem 472 * initialization. A second mgmt_ch will be created on the same thread when the application starts 473 * but before the deferred put_io_channel event is executed for the first mgmt_ch. 
474 */ 475 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 476 if (opts->bdev_io_pool_size < min_pool_size) { 477 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 478 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 479 spdk_thread_get_count()); 480 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 481 return -1; 482 } 483 484 if (opts->small_buf_pool_size != BUF_SMALL_POOL_SIZE) { 485 SPDK_LOG_DEPRECATED(bdev_opts_small_buf_pool_size); 486 } 487 if (opts->large_buf_pool_size != BUF_LARGE_POOL_SIZE) { 488 SPDK_LOG_DEPRECATED(bdev_opts_large_buf_pool_size); 489 } 490 491 #define SET_FIELD(field) \ 492 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 493 g_bdev_opts.field = opts->field; \ 494 } \ 495 496 SET_FIELD(bdev_io_pool_size); 497 SET_FIELD(bdev_io_cache_size); 498 SET_FIELD(bdev_auto_examine); 499 SET_FIELD(small_buf_pool_size); 500 SET_FIELD(large_buf_pool_size); 501 502 spdk_iobuf_get_opts(&iobuf_opts); 503 iobuf_opts.small_pool_count = opts->small_buf_pool_size; 504 iobuf_opts.large_pool_count = opts->large_buf_pool_size; 505 506 rc = spdk_iobuf_set_opts(&iobuf_opts); 507 if (rc != 0) { 508 SPDK_ERRLOG("Failed to set iobuf opts\n"); 509 return -1; 510 } 511 512 g_bdev_opts.opts_size = opts->opts_size; 513 514 #undef SET_FIELD 515 516 return 0; 517 } 518 519 static struct spdk_bdev * 520 bdev_get_by_name(const char *bdev_name) 521 { 522 struct spdk_bdev_name find; 523 struct spdk_bdev_name *res; 524 525 find.name = (char *)bdev_name; 526 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 527 if (res != NULL) { 528 return res->bdev; 529 } 530 531 return NULL; 532 } 533 534 struct spdk_bdev * 535 spdk_bdev_get_by_name(const char *bdev_name) 536 { 537 struct spdk_bdev *bdev; 538 539 spdk_spin_lock(&g_bdev_mgr.spinlock); 540 bdev = bdev_get_by_name(bdev_name); 541 spdk_spin_unlock(&g_bdev_mgr.spinlock); 542 543 return bdev; 544 } 545 546 struct bdev_io_status_string { 547 enum spdk_bdev_io_status status; 548 const char *str; 549 }; 550 551 static const struct bdev_io_status_string bdev_io_status_strings[] = { 552 { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" }, 553 { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" }, 554 { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" }, 555 { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" }, 556 { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" }, 557 { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" }, 558 { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" }, 559 { SPDK_BDEV_IO_STATUS_FAILED, "failed" }, 560 { SPDK_BDEV_IO_STATUS_PENDING, "pending" }, 561 { SPDK_BDEV_IO_STATUS_SUCCESS, "success" }, 562 }; 563 564 static const char * 565 bdev_io_status_get_string(enum spdk_bdev_io_status status) 566 { 567 uint32_t i; 568 569 for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) { 570 if (bdev_io_status_strings[i].status == status) { 571 return bdev_io_status_strings[i].str; 572 } 573 } 574 575 return "reserved"; 576 } 577 578 struct spdk_bdev_wait_for_examine_ctx { 579 struct spdk_poller *poller; 580 spdk_bdev_wait_for_examine_cb cb_fn; 581 void *cb_arg; 582 }; 583 584 static bool bdev_module_all_actions_completed(void); 585 586 static int 587 bdev_wait_for_examine_cb(void *arg) 588 { 589 struct spdk_bdev_wait_for_examine_ctx *ctx = arg; 590 591 if (!bdev_module_all_actions_completed()) { 592 return SPDK_POLLER_IDLE; 593 } 594 595 spdk_poller_unregister(&ctx->poller); 596 
ctx->cb_fn(ctx->cb_arg); 597 free(ctx); 598 599 return SPDK_POLLER_BUSY; 600 } 601 602 int 603 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 604 { 605 struct spdk_bdev_wait_for_examine_ctx *ctx; 606 607 ctx = calloc(1, sizeof(*ctx)); 608 if (ctx == NULL) { 609 return -ENOMEM; 610 } 611 ctx->cb_fn = cb_fn; 612 ctx->cb_arg = cb_arg; 613 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 614 615 return 0; 616 } 617 618 struct spdk_bdev_examine_item { 619 char *name; 620 TAILQ_ENTRY(spdk_bdev_examine_item) link; 621 }; 622 623 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); 624 625 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 626 g_bdev_examine_allowlist); 627 628 static inline bool 629 bdev_examine_allowlist_check(const char *name) 630 { 631 struct spdk_bdev_examine_item *item; 632 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 633 if (strcmp(name, item->name) == 0) { 634 return true; 635 } 636 } 637 return false; 638 } 639 640 static inline void 641 bdev_examine_allowlist_free(void) 642 { 643 struct spdk_bdev_examine_item *item; 644 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 645 item = TAILQ_FIRST(&g_bdev_examine_allowlist); 646 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 647 free(item->name); 648 free(item); 649 } 650 } 651 652 static inline bool 653 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 654 { 655 struct spdk_bdev_alias *tmp; 656 if (bdev_examine_allowlist_check(bdev->name)) { 657 return true; 658 } 659 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 660 if (bdev_examine_allowlist_check(tmp->alias.name)) { 661 return true; 662 } 663 } 664 return false; 665 } 666 667 static inline bool 668 bdev_ok_to_examine(struct spdk_bdev *bdev) 669 { 670 if (g_bdev_opts.bdev_auto_examine) { 671 return true; 672 } else { 673 return bdev_in_examine_allowlist(bdev); 674 } 675 } 676 677 static void 678 bdev_examine(struct spdk_bdev *bdev) 679 { 680 struct spdk_bdev_module *module; 681 struct spdk_bdev_module_claim *claim, *tmpclaim; 682 uint32_t action; 683 684 if (!bdev_ok_to_examine(bdev)) { 685 return; 686 } 687 688 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 689 if (module->examine_config) { 690 spdk_spin_lock(&module->internal.spinlock); 691 action = module->internal.action_in_progress; 692 module->internal.action_in_progress++; 693 spdk_spin_unlock(&module->internal.spinlock); 694 module->examine_config(bdev); 695 if (action != module->internal.action_in_progress) { 696 SPDK_ERRLOG("examine_config for module %s did not call " 697 "spdk_bdev_module_examine_done()\n", module->name); 698 } 699 } 700 } 701 702 spdk_spin_lock(&bdev->internal.spinlock); 703 704 switch (bdev->internal.claim_type) { 705 case SPDK_BDEV_CLAIM_NONE: 706 /* Examine by all bdev modules */ 707 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 708 if (module->examine_disk) { 709 spdk_spin_lock(&module->internal.spinlock); 710 module->internal.action_in_progress++; 711 spdk_spin_unlock(&module->internal.spinlock); 712 spdk_spin_unlock(&bdev->internal.spinlock); 713 module->examine_disk(bdev); 714 spdk_spin_lock(&bdev->internal.spinlock); 715 } 716 } 717 break; 718 case SPDK_BDEV_CLAIM_EXCL_WRITE: 719 /* Examine by the one bdev module with a v1 claim */ 720 module = bdev->internal.claim.v1.module; 721 if (module->examine_disk) { 722 spdk_spin_lock(&module->internal.spinlock); 723 module->internal.action_in_progress++; 724 
spdk_spin_unlock(&module->internal.spinlock); 725 spdk_spin_unlock(&bdev->internal.spinlock); 726 module->examine_disk(bdev); 727 return; 728 } 729 break; 730 default: 731 /* Examine by all bdev modules with a v2 claim */ 732 assert(claim_type_is_v2(bdev->internal.claim_type)); 733 /* 734 * Removal of tailq nodes while iterating can cause the iteration to jump out of the 735 * list, perhaps accessing freed memory. Without protection, this could happen 736 * while the lock is dropped during the examine callback. 737 */ 738 bdev->internal.examine_in_progress++; 739 740 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 741 module = claim->module; 742 743 if (module == NULL) { 744 /* This is a vestigial claim, held by examine_count */ 745 continue; 746 } 747 748 if (module->examine_disk == NULL) { 749 continue; 750 } 751 752 spdk_spin_lock(&module->internal.spinlock); 753 module->internal.action_in_progress++; 754 spdk_spin_unlock(&module->internal.spinlock); 755 756 /* Call examine_disk without holding internal.spinlock. */ 757 spdk_spin_unlock(&bdev->internal.spinlock); 758 module->examine_disk(bdev); 759 spdk_spin_lock(&bdev->internal.spinlock); 760 } 761 762 assert(bdev->internal.examine_in_progress > 0); 763 bdev->internal.examine_in_progress--; 764 if (bdev->internal.examine_in_progress == 0) { 765 /* Remove any claims that were released during examine_disk */ 766 TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) { 767 if (claim->desc != NULL) { 768 continue; 769 } 770 771 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link); 772 free(claim); 773 } 774 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 775 claim_reset(bdev); 776 } 777 } 778 } 779 780 spdk_spin_unlock(&bdev->internal.spinlock); 781 } 782 783 int 784 spdk_bdev_examine(const char *name) 785 { 786 struct spdk_bdev *bdev; 787 struct spdk_bdev_examine_item *item; 788 struct spdk_thread *thread = spdk_get_thread(); 789 790 if (spdk_unlikely(spdk_thread_get_app_thread() != thread)) { 791 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread, 792 thread ? 
spdk_thread_get_name(thread) : "null"); 793 return -EINVAL; 794 } 795 796 if (g_bdev_opts.bdev_auto_examine) { 797 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); 798 return -EINVAL; 799 } 800 801 if (bdev_examine_allowlist_check(name)) { 802 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 803 return -EEXIST; 804 } 805 806 item = calloc(1, sizeof(*item)); 807 if (!item) { 808 return -ENOMEM; 809 } 810 item->name = strdup(name); 811 if (!item->name) { 812 free(item); 813 return -ENOMEM; 814 } 815 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 816 817 bdev = spdk_bdev_get_by_name(name); 818 if (bdev) { 819 bdev_examine(bdev); 820 } 821 return 0; 822 } 823 824 static inline void 825 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 826 { 827 struct spdk_bdev_examine_item *item; 828 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 829 spdk_json_write_object_begin(w); 830 spdk_json_write_named_string(w, "method", "bdev_examine"); 831 spdk_json_write_named_object_begin(w, "params"); 832 spdk_json_write_named_string(w, "name", item->name); 833 spdk_json_write_object_end(w); 834 spdk_json_write_object_end(w); 835 } 836 } 837 838 struct spdk_bdev * 839 spdk_bdev_first(void) 840 { 841 struct spdk_bdev *bdev; 842 843 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 844 if (bdev) { 845 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 846 } 847 848 return bdev; 849 } 850 851 struct spdk_bdev * 852 spdk_bdev_next(struct spdk_bdev *prev) 853 { 854 struct spdk_bdev *bdev; 855 856 bdev = TAILQ_NEXT(prev, internal.link); 857 if (bdev) { 858 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 859 } 860 861 return bdev; 862 } 863 864 static struct spdk_bdev * 865 _bdev_next_leaf(struct spdk_bdev *bdev) 866 { 867 while (bdev != NULL) { 868 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 869 return bdev; 870 } else { 871 bdev = TAILQ_NEXT(bdev, internal.link); 872 } 873 } 874 875 return bdev; 876 } 877 878 struct spdk_bdev * 879 spdk_bdev_first_leaf(void) 880 { 881 struct spdk_bdev *bdev; 882 883 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 884 885 if (bdev) { 886 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 887 } 888 889 return bdev; 890 } 891 892 struct spdk_bdev * 893 spdk_bdev_next_leaf(struct spdk_bdev *prev) 894 { 895 struct spdk_bdev *bdev; 896 897 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 898 899 if (bdev) { 900 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 901 } 902 903 return bdev; 904 } 905 906 static inline bool 907 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 908 { 909 return bdev_io->internal.memory_domain; 910 } 911 912 static inline bool 913 bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io) 914 { 915 return bdev_io->internal.accel_sequence; 916 } 917 918 void 919 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 920 { 921 struct iovec *iovs; 922 923 if (bdev_io->u.bdev.iovs == NULL) { 924 bdev_io->u.bdev.iovs = &bdev_io->iov; 925 bdev_io->u.bdev.iovcnt = 1; 926 } 927 928 iovs = bdev_io->u.bdev.iovs; 929 930 assert(iovs != NULL); 931 assert(bdev_io->u.bdev.iovcnt >= 1); 932 933 iovs[0].iov_base = buf; 934 iovs[0].iov_len = len; 935 } 936 937 void 938 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 939 { 940 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 941 bdev_io->u.bdev.md_buf = md_buf; 942 } 943 944 
static bool 945 _is_buf_allocated(const struct iovec *iovs) 946 { 947 if (iovs == NULL) { 948 return false; 949 } 950 951 return iovs[0].iov_base != NULL; 952 } 953 954 static bool 955 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 956 { 957 int i; 958 uintptr_t iov_base; 959 960 if (spdk_likely(alignment == 1)) { 961 return true; 962 } 963 964 for (i = 0; i < iovcnt; i++) { 965 iov_base = (uintptr_t)iovs[i].iov_base; 966 if ((iov_base & (alignment - 1)) != 0) { 967 return false; 968 } 969 } 970 971 return true; 972 } 973 974 static inline bool 975 bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 976 { 977 if (!bdev_io_use_accel_sequence(bdev_io)) { 978 return false; 979 } 980 981 /* For now, we don't allow splitting IOs with an accel sequence and will treat them as if 982 * bdev module didn't support accel sequences */ 983 return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.split; 984 } 985 986 static void 987 bdev_io_submit_sequence_cb(void *ctx, int status) 988 { 989 struct spdk_bdev_io *bdev_io = ctx; 990 991 bdev_io->u.bdev.accel_sequence = NULL; 992 bdev_io->internal.accel_sequence = NULL; 993 TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 994 995 if (spdk_unlikely(status != 0)) { 996 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 997 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 998 bdev_io_complete_unsubmitted(bdev_io); 999 return; 1000 } 1001 1002 bdev_io_submit(bdev_io); 1003 } 1004 1005 static void 1006 bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, spdk_accel_completion_cb cb_fn) 1007 { 1008 int rc; 1009 1010 assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1011 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1012 1013 /* Since the operations are appended during submission, they're in the opposite order than 1014 * how we want to execute them for reads (i.e. we need to execute the most recently added 1015 * operation first), so reverse the sequence before executing it. 
1016 */ 1017 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1018 spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence); 1019 } 1020 1021 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1022 1023 rc = spdk_accel_sequence_finish(bdev_io->internal.accel_sequence, cb_fn, bdev_io); 1024 if (spdk_unlikely(rc != 0)) { 1025 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", rc); 1026 TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1027 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1028 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1029 /* Writes haven't been submitted at this point yet */ 1030 bdev_io_complete_unsubmitted(bdev_io); 1031 } else { 1032 bdev_io_complete(bdev_io); 1033 } 1034 } 1035 } 1036 1037 static void 1038 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 1039 { 1040 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 1041 void *buf; 1042 1043 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1044 buf = bdev_io->internal.buf; 1045 bdev_io->internal.buf = NULL; 1046 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 1047 bdev_io->internal.get_aux_buf_cb = NULL; 1048 } else { 1049 assert(bdev_io->internal.get_buf_cb != NULL); 1050 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 1051 bdev_io->internal.get_buf_cb = NULL; 1052 } 1053 } 1054 1055 static void 1056 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 1057 { 1058 struct spdk_bdev_io *bdev_io = ctx; 1059 1060 if (rc) { 1061 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 1062 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1063 } 1064 bdev_io_get_buf_complete(bdev_io, !rc); 1065 } 1066 1067 static void 1068 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 1069 { 1070 int rc = 0; 1071 1072 /* save original md_buf */ 1073 bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 1074 bdev_io->internal.orig_md_iov.iov_len = len; 1075 bdev_io->internal.bounce_md_iov.iov_base = md_buf; 1076 bdev_io->internal.bounce_md_iov.iov_len = len; 1077 /* set bounce md_buf */ 1078 bdev_io->u.bdev.md_buf = md_buf; 1079 1080 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1081 if (bdev_io_use_memory_domain(bdev_io)) { 1082 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1083 bdev_io->internal.memory_domain_ctx, 1084 &bdev_io->internal.orig_md_iov, 1, 1085 &bdev_io->internal.bounce_md_iov, 1, 1086 bdev_io->internal.data_transfer_cpl, 1087 bdev_io); 1088 if (rc == 0) { 1089 /* Continue to submit IO in completion callback */ 1090 return; 1091 } 1092 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 1093 spdk_memory_domain_get_dma_device_id(bdev_io->internal.memory_domain), rc); 1094 } else { 1095 memcpy(md_buf, bdev_io->internal.orig_md_iov.iov_base, bdev_io->internal.orig_md_iov.iov_len); 1096 } 1097 } 1098 1099 assert(bdev_io->internal.data_transfer_cpl); 1100 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1101 } 1102 1103 static void 1104 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 1105 { 1106 struct spdk_bdev *bdev = bdev_io->bdev; 1107 uint64_t md_len; 1108 void *buf; 1109 1110 if (spdk_bdev_is_md_separate(bdev)) { 1111 assert(!bdev_io_use_accel_sequence(bdev_io)); 1112 1113 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 1114 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 1115 1116 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 1117 1118 if 
(bdev_io->u.bdev.md_buf != NULL) { 1119 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 1120 return; 1121 } else { 1122 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 1123 } 1124 } 1125 1126 bdev_io_get_buf_complete(bdev_io, true); 1127 } 1128 1129 static void 1130 _bdev_io_pull_bounce_data_buf_done(void *ctx, int rc) 1131 { 1132 struct spdk_bdev_io *bdev_io = ctx; 1133 1134 if (rc) { 1135 SPDK_ERRLOG("Failed to get data buffer\n"); 1136 assert(bdev_io->internal.data_transfer_cpl); 1137 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1138 return; 1139 } 1140 1141 _bdev_io_set_md_buf(bdev_io); 1142 } 1143 1144 static void 1145 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 1146 bdev_copy_bounce_buffer_cpl cpl_cb) 1147 { 1148 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1149 int rc = 0; 1150 1151 bdev_io->internal.data_transfer_cpl = cpl_cb; 1152 /* save original iovec */ 1153 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 1154 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 1155 /* set bounce iov */ 1156 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 1157 bdev_io->u.bdev.iovcnt = 1; 1158 /* set bounce buffer for this operation */ 1159 bdev_io->u.bdev.iovs[0].iov_base = buf; 1160 bdev_io->u.bdev.iovs[0].iov_len = len; 1161 1162 /* If we need to exec an accel sequence, append a copy operation making accel change the 1163 * src/dst buffers of the previous operation */ 1164 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 1165 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1166 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1167 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1168 NULL, NULL, 1169 bdev_io->internal.orig_iovs, 1170 bdev_io->internal.orig_iovcnt, 1171 bdev_io->internal.memory_domain, 1172 bdev_io->internal.memory_domain_ctx, 1173 0, NULL, NULL); 1174 } else { 1175 /* We need to reverse the src/dst for reads */ 1176 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1177 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1178 bdev_io->internal.orig_iovs, 1179 bdev_io->internal.orig_iovcnt, 1180 bdev_io->internal.memory_domain, 1181 bdev_io->internal.memory_domain_ctx, 1182 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1183 NULL, NULL, 0, NULL, NULL); 1184 } 1185 1186 if (spdk_unlikely(rc != 0)) { 1187 SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n", 1188 bdev_io->internal.accel_sequence); 1189 } 1190 } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1191 /* if this is write path, copy data from original buffer to bounce buffer */ 1192 if (bdev_io_use_memory_domain(bdev_io)) { 1193 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1194 bdev_io->internal.memory_domain_ctx, 1195 bdev_io->internal.orig_iovs, 1196 (uint32_t) bdev_io->internal.orig_iovcnt, 1197 bdev_io->u.bdev.iovs, 1, 1198 _bdev_io_pull_bounce_data_buf_done, 1199 bdev_io); 1200 if (rc == 0) { 1201 /* Continue to submit IO in completion callback */ 1202 return; 1203 } 1204 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 1205 spdk_memory_domain_get_dma_device_id(bdev_io->internal.memory_domain)); 1206 } else { 1207 spdk_copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt); 1208 } 1209 } 1210 1211 _bdev_io_pull_bounce_data_buf_done(bdev_io, rc); 1212 } 1213 1214 static void 1215 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) 1216 { 1217 struct spdk_bdev *bdev = 
bdev_io->bdev; 1218 bool buf_allocated; 1219 uint64_t alignment; 1220 void *aligned_buf; 1221 1222 bdev_io->internal.buf = buf; 1223 1224 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1225 bdev_io_get_buf_complete(bdev_io, true); 1226 return; 1227 } 1228 1229 alignment = spdk_bdev_get_buf_align(bdev); 1230 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 1231 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 1232 1233 if (buf_allocated) { 1234 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 1235 /* Continue in completion callback */ 1236 return; 1237 } else { 1238 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 1239 } 1240 1241 _bdev_io_set_md_buf(bdev_io); 1242 } 1243 1244 static inline uint64_t 1245 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len) 1246 { 1247 struct spdk_bdev *bdev = bdev_io->bdev; 1248 uint64_t md_len, alignment; 1249 1250 md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1251 alignment = spdk_bdev_get_buf_align(bdev); 1252 1253 return len + alignment + md_len; 1254 } 1255 1256 static void 1257 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 1258 { 1259 struct spdk_bdev_mgmt_channel *ch; 1260 1261 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1262 spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len)); 1263 } 1264 1265 static void 1266 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 1267 { 1268 assert(bdev_io->internal.buf != NULL); 1269 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len); 1270 bdev_io->internal.buf = NULL; 1271 } 1272 1273 void 1274 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1275 { 1276 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1277 1278 assert(buf != NULL); 1279 _bdev_io_put_buf(bdev_io, buf, len); 1280 } 1281 1282 static inline void 1283 bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch, 1284 struct spdk_bdev_io *bdev_io) 1285 { 1286 /* After a request is submitted to a bdev module, the ownership of an accel sequence 1287 * associated with that bdev_io is transferred to the bdev module. So, clear the internal 1288 * sequence pointer to make sure we won't touch it anymore. */ 1289 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || 1290 bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) { 1291 assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1292 bdev_io->internal.accel_sequence = NULL; 1293 } 1294 1295 bdev->fn_table->submit_request(ioch, bdev_io); 1296 } 1297 1298 static void 1299 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 1300 { 1301 struct spdk_bdev *bdev = bdev_ch->bdev; 1302 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1303 struct spdk_bdev_io *bdev_io; 1304 1305 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 1306 /* 1307 * Allow some more I/O to complete before retrying the nomem_io queue. 1308 * Some drivers (such as nvme) cannot immediately take a new I/O in 1309 * the context of a completion, because the resources for the I/O are 1310 * not released until control returns to the bdev poller. Also, we 1311 * may require several small I/O to complete before a larger I/O 1312 * (that requires splitting) can be submitted. 
1313 */ 1314 return; 1315 } 1316 1317 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1318 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 1319 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 1320 bdev_io->internal.ch->io_outstanding++; 1321 shared_resource->io_outstanding++; 1322 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1323 bdev_io->internal.error.nvme.cdw0 = 0; 1324 bdev_io->num_retries++; 1325 bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 1326 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 1327 break; 1328 } 1329 } 1330 } 1331 1332 static inline void 1333 _bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1334 struct spdk_bdev_shared_resource *shared_resource) 1335 { 1336 assert(bdev_ch->io_outstanding > 0); 1337 assert(shared_resource->io_outstanding > 0); 1338 bdev_ch->io_outstanding--; 1339 shared_resource->io_outstanding--; 1340 } 1341 1342 static inline bool 1343 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io) 1344 { 1345 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1346 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1347 1348 if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) { 1349 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 1350 /* 1351 * Wait for some of the outstanding I/O to complete before we 1352 * retry any of the nomem_io. Normally we will wait for 1353 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 1354 * depth channels we will instead wait for half to complete. 1355 */ 1356 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 1357 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 1358 /* If bdev module completed an I/O that has an accel sequence with NOMEM status, the 1359 * ownership of that sequence is transferred back to the bdev layer, so we need to 1360 * restore internal.accel_sequence to make sure that the sequence is handled 1361 * correctly in case the I/O is later aborted. */ 1362 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 1363 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) { 1364 assert(bdev_io->internal.accel_sequence == NULL); 1365 bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence; 1366 } 1367 1368 return true; 1369 } 1370 1371 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1372 bdev_ch_retry_io(bdev_ch); 1373 } 1374 1375 return false; 1376 } 1377 1378 static void 1379 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1380 { 1381 struct spdk_bdev_io *bdev_io = ctx; 1382 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1383 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1384 1385 if (rc) { 1386 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1387 } 1388 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1389 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). 
1390 */ 1391 bdev_io_put_buf(bdev_io); 1392 1393 /* Continue with IO completion flow */ 1394 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 1395 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 1396 return; 1397 } 1398 1399 bdev_io_complete(bdev_io); 1400 } 1401 1402 static void 1403 _bdev_io_push_bounce_md_buffer_done(void *ctx, int rc) 1404 { 1405 struct spdk_bdev_io *bdev_io = ctx; 1406 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1407 1408 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1409 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1410 } 1411 1412 static inline void 1413 _bdev_io_push_bounce_md_buffer(struct spdk_bdev_io *bdev_io) 1414 { 1415 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1416 int rc = 0; 1417 1418 /* do the same for metadata buffer */ 1419 if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) { 1420 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1421 1422 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1423 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 1424 if (bdev_io_use_memory_domain(bdev_io)) { 1425 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1426 /* If memory domain is used then we need to call async push function */ 1427 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1428 bdev_io->internal.memory_domain_ctx, 1429 &bdev_io->internal.orig_md_iov, 1430 (uint32_t)bdev_io->internal.orig_iovcnt, 1431 &bdev_io->internal.bounce_md_iov, 1, 1432 _bdev_io_push_bounce_md_buffer_done, 1433 bdev_io); 1434 if (rc == 0) { 1435 /* Continue IO completion in async callback */ 1436 return; 1437 } 1438 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1439 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1440 spdk_memory_domain_get_dma_device_id(bdev_io->internal.memory_domain)); 1441 } else { 1442 memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1443 bdev_io->internal.orig_md_iov.iov_len); 1444 } 1445 } 1446 } 1447 1448 assert(bdev_io->internal.data_transfer_cpl); 1449 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1450 } 1451 1452 static void 1453 _bdev_io_push_bounce_data_buffer_done(void *ctx, int rc) 1454 { 1455 struct spdk_bdev_io *bdev_io = ctx; 1456 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1457 1458 assert(bdev_io->internal.data_transfer_cpl); 1459 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1460 1461 if (rc) { 1462 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1463 return; 1464 } 1465 1466 /* set original buffer for this io */ 1467 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 1468 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 1469 /* disable bouncing buffer for this io */ 1470 bdev_io->internal.orig_iovcnt = 0; 1471 bdev_io->internal.orig_iovs = NULL; 1472 1473 _bdev_io_push_bounce_md_buffer(bdev_io); 1474 } 1475 1476 static inline void 1477 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1478 { 1479 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1480 int rc = 0; 1481 1482 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1483 bdev_io->internal.data_transfer_cpl = cpl_cb; 1484 1485 /* if this is read path, copy data from bounce buffer to original buffer */ 1486 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1487 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 1488 if (bdev_io_use_memory_domain(bdev_io)) { 1489 /* If memory domain is used then we need to call async push function */ 
1490 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1491 bdev_io->internal.memory_domain_ctx, 1492 bdev_io->internal.orig_iovs, 1493 (uint32_t)bdev_io->internal.orig_iovcnt, 1494 &bdev_io->internal.bounce_iov, 1, 1495 _bdev_io_push_bounce_data_buffer_done, 1496 bdev_io); 1497 if (rc == 0) { 1498 /* Continue IO completion in async callback */ 1499 return; 1500 } 1501 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1502 spdk_memory_domain_get_dma_device_id(bdev_io->internal.memory_domain)); 1503 } else { 1504 spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs, 1505 bdev_io->internal.orig_iovcnt, 1506 bdev_io->internal.bounce_iov.iov_base, 1507 bdev_io->internal.bounce_iov.iov_len); 1508 } 1509 } 1510 1511 _bdev_io_push_bounce_data_buffer_done(bdev_io, rc); 1512 } 1513 1514 static void 1515 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1516 { 1517 struct spdk_bdev_io *bdev_io; 1518 1519 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1520 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len); 1521 } 1522 1523 static void 1524 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1525 { 1526 struct spdk_bdev_mgmt_channel *mgmt_ch; 1527 uint64_t max_len; 1528 void *buf; 1529 1530 assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1531 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1532 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1533 1534 if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) { 1535 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1536 bdev_io_get_buf_complete(bdev_io, false); 1537 return; 1538 } 1539 1540 bdev_io->internal.buf_len = len; 1541 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1542 bdev_io_get_iobuf_cb); 1543 if (buf != NULL) { 1544 _bdev_io_set_buf(bdev_io, buf, len); 1545 } 1546 } 1547 1548 void 1549 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1550 { 1551 struct spdk_bdev *bdev = bdev_io->bdev; 1552 uint64_t alignment; 1553 1554 assert(cb != NULL); 1555 bdev_io->internal.get_buf_cb = cb; 1556 1557 alignment = spdk_bdev_get_buf_align(bdev); 1558 1559 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1560 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1561 /* Buffer already present and aligned */ 1562 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1563 return; 1564 } 1565 1566 bdev_io_get_buf(bdev_io, len); 1567 } 1568 1569 static void 1570 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1571 bool success) 1572 { 1573 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1574 1575 TAILQ_REMOVE(&bdev_ch->io_memory_domain, bdev_io, internal.link); 1576 1577 if (!success) { 1578 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1579 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1580 bdev_io_complete_unsubmitted(bdev_io); 1581 return; 1582 } 1583 1584 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 1585 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1586 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 1587 return; 1588 } 1589 /* For reads we'll execute the sequence after the data is read, so, for now, only 1590 * clear out accel_sequence pointer and submit the IO */ 1591 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1592 bdev_io->u.bdev.accel_sequence = NULL; 1593 } 1594 1595 bdev_io_submit(bdev_io); 1596 } 1597 1598 static void 
1599 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1600 uint64_t len) 1601 { 1602 assert(cb != NULL); 1603 bdev_io->internal.get_buf_cb = cb; 1604 1605 bdev_io_get_buf(bdev_io, len); 1606 } 1607 1608 void 1609 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1610 { 1611 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1612 1613 assert(cb != NULL); 1614 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1615 bdev_io->internal.get_aux_buf_cb = cb; 1616 bdev_io_get_buf(bdev_io, len); 1617 } 1618 1619 static int 1620 bdev_module_get_max_ctx_size(void) 1621 { 1622 struct spdk_bdev_module *bdev_module; 1623 int max_bdev_module_size = 0; 1624 1625 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1626 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1627 max_bdev_module_size = bdev_module->get_ctx_size(); 1628 } 1629 } 1630 1631 return max_bdev_module_size; 1632 } 1633 1634 static void 1635 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1636 { 1637 int i; 1638 struct spdk_bdev_qos *qos = bdev->internal.qos; 1639 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1640 1641 if (!qos) { 1642 return; 1643 } 1644 1645 spdk_bdev_get_qos_rate_limits(bdev, limits); 1646 1647 spdk_json_write_object_begin(w); 1648 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1649 1650 spdk_json_write_named_object_begin(w, "params"); 1651 spdk_json_write_named_string(w, "name", bdev->name); 1652 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1653 if (limits[i] > 0) { 1654 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1655 } 1656 } 1657 spdk_json_write_object_end(w); 1658 1659 spdk_json_write_object_end(w); 1660 } 1661 1662 void 1663 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1664 { 1665 struct spdk_bdev_module *bdev_module; 1666 struct spdk_bdev *bdev; 1667 1668 assert(w != NULL); 1669 1670 spdk_json_write_array_begin(w); 1671 1672 spdk_json_write_object_begin(w); 1673 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1674 spdk_json_write_named_object_begin(w, "params"); 1675 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1676 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1677 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1678 spdk_json_write_object_end(w); 1679 spdk_json_write_object_end(w); 1680 1681 bdev_examine_allowlist_config_json(w); 1682 1683 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1684 if (bdev_module->config_json) { 1685 bdev_module->config_json(w); 1686 } 1687 } 1688 1689 spdk_spin_lock(&g_bdev_mgr.spinlock); 1690 1691 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1692 if (bdev->fn_table->write_config_json) { 1693 bdev->fn_table->write_config_json(bdev, w); 1694 } 1695 1696 bdev_qos_config_json(bdev, w); 1697 } 1698 1699 spdk_spin_unlock(&g_bdev_mgr.spinlock); 1700 1701 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1702 spdk_json_write_object_begin(w); 1703 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1704 spdk_json_write_object_end(w); 1705 1706 spdk_json_write_array_end(w); 1707 } 1708 1709 static void 1710 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1711 { 1712 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1713 struct 
spdk_bdev_io *bdev_io; 1714 1715 spdk_iobuf_channel_fini(&ch->iobuf); 1716 1717 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1718 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1719 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1720 ch->per_thread_cache_count--; 1721 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1722 } 1723 1724 assert(ch->per_thread_cache_count == 0); 1725 } 1726 1727 static int 1728 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1729 { 1730 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1731 struct spdk_bdev_io *bdev_io; 1732 uint32_t i; 1733 int rc; 1734 1735 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", BUF_SMALL_CACHE_SIZE, BUF_LARGE_CACHE_SIZE); 1736 if (rc != 0) { 1737 SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 1738 return -1; 1739 } 1740 1741 STAILQ_INIT(&ch->per_thread_cache); 1742 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 1743 1744 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 1745 ch->per_thread_cache_count = 0; 1746 for (i = 0; i < ch->bdev_io_cache_size; i++) { 1747 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1748 if (bdev_io == NULL) { 1749 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 1750 assert(false); 1751 bdev_mgmt_channel_destroy(io_device, ctx_buf); 1752 return -1; 1753 } 1754 ch->per_thread_cache_count++; 1755 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1756 } 1757 1758 TAILQ_INIT(&ch->shared_resources); 1759 TAILQ_INIT(&ch->io_wait_queue); 1760 1761 return 0; 1762 } 1763 1764 static void 1765 bdev_init_complete(int rc) 1766 { 1767 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 1768 void *cb_arg = g_init_cb_arg; 1769 struct spdk_bdev_module *m; 1770 1771 g_bdev_mgr.init_complete = true; 1772 g_init_cb_fn = NULL; 1773 g_init_cb_arg = NULL; 1774 1775 /* 1776 * For modules that need to know when subsystem init is complete, 1777 * inform them now. 1778 */ 1779 if (rc == 0) { 1780 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1781 if (m->init_complete) { 1782 m->init_complete(); 1783 } 1784 } 1785 } 1786 1787 cb_fn(cb_arg, rc); 1788 } 1789 1790 static bool 1791 bdev_module_all_actions_completed(void) 1792 { 1793 struct spdk_bdev_module *m; 1794 1795 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1796 if (m->internal.action_in_progress > 0) { 1797 return false; 1798 } 1799 } 1800 return true; 1801 } 1802 1803 static void 1804 bdev_module_action_complete(void) 1805 { 1806 /* 1807 * Don't finish bdev subsystem initialization if 1808 * module pre-initialization is still in progress, or 1809 * the subsystem been already initialized. 1810 */ 1811 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 1812 return; 1813 } 1814 1815 /* 1816 * Check all bdev modules for inits/examinations in progress. If any 1817 * exist, return immediately since we cannot finish bdev subsystem 1818 * initialization until all are completed. 1819 */ 1820 if (!bdev_module_all_actions_completed()) { 1821 return; 1822 } 1823 1824 /* 1825 * Modules already finished initialization - now that all 1826 * the bdev modules have finished their asynchronous I/O 1827 * processing, the entire bdev layer can be marked as complete. 
1828 */ 1829 bdev_init_complete(0); 1830 } 1831 1832 static void 1833 bdev_module_action_done(struct spdk_bdev_module *module) 1834 { 1835 spdk_spin_lock(&module->internal.spinlock); 1836 assert(module->internal.action_in_progress > 0); 1837 module->internal.action_in_progress--; 1838 spdk_spin_unlock(&module->internal.spinlock); 1839 bdev_module_action_complete(); 1840 } 1841 1842 void 1843 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 1844 { 1845 assert(module->async_init); 1846 bdev_module_action_done(module); 1847 } 1848 1849 void 1850 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 1851 { 1852 bdev_module_action_done(module); 1853 } 1854 1855 /** The last initialized bdev module */ 1856 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 1857 1858 static void 1859 bdev_init_failed(void *cb_arg) 1860 { 1861 struct spdk_bdev_module *module = cb_arg; 1862 1863 spdk_spin_lock(&module->internal.spinlock); 1864 assert(module->internal.action_in_progress > 0); 1865 module->internal.action_in_progress--; 1866 spdk_spin_unlock(&module->internal.spinlock); 1867 bdev_init_complete(-1); 1868 } 1869 1870 static int 1871 bdev_modules_init(void) 1872 { 1873 struct spdk_bdev_module *module; 1874 int rc = 0; 1875 1876 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1877 g_resume_bdev_module = module; 1878 if (module->async_init) { 1879 spdk_spin_lock(&module->internal.spinlock); 1880 module->internal.action_in_progress = 1; 1881 spdk_spin_unlock(&module->internal.spinlock); 1882 } 1883 rc = module->module_init(); 1884 if (rc != 0) { 1885 /* Bump action_in_progress to prevent other modules from completion of modules_init 1886 * Send message to defer application shutdown until resources are cleaned up */ 1887 spdk_spin_lock(&module->internal.spinlock); 1888 module->internal.action_in_progress = 1; 1889 spdk_spin_unlock(&module->internal.spinlock); 1890 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 1891 return rc; 1892 } 1893 } 1894 1895 g_resume_bdev_module = NULL; 1896 return 0; 1897 } 1898 1899 void 1900 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 1901 { 1902 int rc = 0; 1903 char mempool_name[32]; 1904 1905 assert(cb_fn != NULL); 1906 1907 g_init_cb_fn = cb_fn; 1908 g_init_cb_arg = cb_arg; 1909 1910 spdk_notify_type_register("bdev_register"); 1911 spdk_notify_type_register("bdev_unregister"); 1912 1913 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 1914 1915 rc = spdk_iobuf_register_module("bdev"); 1916 if (rc != 0) { 1917 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 1918 bdev_init_complete(-1); 1919 return; 1920 } 1921 1922 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 1923 g_bdev_opts.bdev_io_pool_size, 1924 sizeof(struct spdk_bdev_io) + 1925 bdev_module_get_max_ctx_size(), 1926 0, 1927 SPDK_ENV_SOCKET_ID_ANY); 1928 1929 if (g_bdev_mgr.bdev_io_pool == NULL) { 1930 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 1931 bdev_init_complete(-1); 1932 return; 1933 } 1934 1935 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 1936 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 1937 if (!g_bdev_mgr.zero_buffer) { 1938 SPDK_ERRLOG("create bdev zero buffer failed\n"); 1939 bdev_init_complete(-1); 1940 return; 1941 } 1942 1943 #ifdef SPDK_CONFIG_VTUNE 1944 SPDK_LOG_DEPRECATED(vtune_support); 1945 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 1946 #endif 1947 1948 spdk_io_device_register(&g_bdev_mgr, 
bdev_mgmt_channel_create, 1949 bdev_mgmt_channel_destroy, 1950 sizeof(struct spdk_bdev_mgmt_channel), 1951 "bdev_mgr"); 1952 1953 rc = bdev_modules_init(); 1954 g_bdev_mgr.module_init_complete = true; 1955 if (rc != 0) { 1956 SPDK_ERRLOG("bdev modules init failed\n"); 1957 return; 1958 } 1959 1960 bdev_module_action_complete(); 1961 } 1962 1963 static void 1964 bdev_mgr_unregister_cb(void *io_device) 1965 { 1966 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 1967 1968 if (g_bdev_mgr.bdev_io_pool) { 1969 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 1970 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 1971 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 1972 g_bdev_opts.bdev_io_pool_size); 1973 } 1974 1975 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 1976 } 1977 1978 spdk_free(g_bdev_mgr.zero_buffer); 1979 1980 bdev_examine_allowlist_free(); 1981 1982 cb_fn(g_fini_cb_arg); 1983 g_fini_cb_fn = NULL; 1984 g_fini_cb_arg = NULL; 1985 g_bdev_mgr.init_complete = false; 1986 g_bdev_mgr.module_init_complete = false; 1987 } 1988 1989 static void 1990 bdev_module_fini_iter(void *arg) 1991 { 1992 struct spdk_bdev_module *bdev_module; 1993 1994 /* FIXME: Handling initialization failures is broken now, 1995 * so we won't even try cleaning up after successfully 1996 * initialized modules. if module_init_complete is false, 1997 * just call spdk_bdev_mgr_unregister_cb 1998 */ 1999 if (!g_bdev_mgr.module_init_complete) { 2000 bdev_mgr_unregister_cb(NULL); 2001 return; 2002 } 2003 2004 /* Start iterating from the last touched module */ 2005 if (!g_resume_bdev_module) { 2006 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2007 } else { 2008 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2009 internal.tailq); 2010 } 2011 2012 while (bdev_module) { 2013 if (bdev_module->async_fini) { 2014 /* Save our place so we can resume later. We must 2015 * save the variable here, before calling module_fini() 2016 * below, because in some cases the module may immediately 2017 * call spdk_bdev_module_fini_done() and re-enter 2018 * this function to continue iterating. */ 2019 g_resume_bdev_module = bdev_module; 2020 } 2021 2022 if (bdev_module->module_fini) { 2023 bdev_module->module_fini(); 2024 } 2025 2026 if (bdev_module->async_fini) { 2027 return; 2028 } 2029 2030 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2031 internal.tailq); 2032 } 2033 2034 g_resume_bdev_module = NULL; 2035 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2036 } 2037 2038 void 2039 spdk_bdev_module_fini_done(void) 2040 { 2041 if (spdk_get_thread() != g_fini_thread) { 2042 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2043 } else { 2044 bdev_module_fini_iter(NULL); 2045 } 2046 } 2047 2048 static void 2049 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2050 { 2051 struct spdk_bdev *bdev = cb_arg; 2052 2053 if (bdeverrno && bdev) { 2054 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2055 bdev->name); 2056 2057 /* 2058 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2059 * bdev; try to continue by manually removing this bdev from the list and continue 2060 * with the next bdev in the list. 
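 * Note that the bdev structure itself is leaked in this case; removing it from the list only lets shutdown make progress.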
2061 */ 2062 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2063 } 2064 2065 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2066 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2067 /* 2068 * Bdev module finish need to be deferred as we might be in the middle of some context 2069 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2070 * after returning. 2071 */ 2072 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2073 return; 2074 } 2075 2076 /* 2077 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 2078 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2079 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2080 * base bdevs. 2081 * 2082 * Also, walk the list in the reverse order. 2083 */ 2084 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2085 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2086 spdk_spin_lock(&bdev->internal.spinlock); 2087 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2088 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2089 spdk_spin_unlock(&bdev->internal.spinlock); 2090 continue; 2091 } 2092 spdk_spin_unlock(&bdev->internal.spinlock); 2093 2094 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2095 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2096 return; 2097 } 2098 2099 /* 2100 * If any bdev fails to unclaim underlying bdev properly, we may face the 2101 * case of bdev list consisting of claimed bdevs only (if claims are managed 2102 * correctly, this would mean there's a loop in the claims graph which is 2103 * clearly impossible). Warn and unregister last bdev on the list then. 2104 */ 2105 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2106 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2107 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2108 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2109 return; 2110 } 2111 } 2112 2113 static void 2114 bdev_module_fini_start_iter(void *arg) 2115 { 2116 struct spdk_bdev_module *bdev_module; 2117 2118 if (!g_resume_bdev_module) { 2119 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2120 } else { 2121 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2122 } 2123 2124 while (bdev_module) { 2125 if (bdev_module->async_fini_start) { 2126 /* Save our place so we can resume later. We must 2127 * save the variable here, before calling fini_start() 2128 * below, because in some cases the module may immediately 2129 * call spdk_bdev_module_fini_start_done() and re-enter 2130 * this function to continue iterating. 
*/ 2131 g_resume_bdev_module = bdev_module; 2132 } 2133 2134 if (bdev_module->fini_start) { 2135 bdev_module->fini_start(); 2136 } 2137 2138 if (bdev_module->async_fini_start) { 2139 return; 2140 } 2141 2142 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2143 } 2144 2145 g_resume_bdev_module = NULL; 2146 2147 bdev_finish_unregister_bdevs_iter(NULL, 0); 2148 } 2149 2150 void 2151 spdk_bdev_module_fini_start_done(void) 2152 { 2153 if (spdk_get_thread() != g_fini_thread) { 2154 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2155 } else { 2156 bdev_module_fini_start_iter(NULL); 2157 } 2158 } 2159 2160 static void 2161 bdev_finish_wait_for_examine_done(void *cb_arg) 2162 { 2163 bdev_module_fini_start_iter(NULL); 2164 } 2165 2166 void 2167 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2168 { 2169 int rc; 2170 2171 assert(cb_fn != NULL); 2172 2173 g_fini_thread = spdk_get_thread(); 2174 2175 g_fini_cb_fn = cb_fn; 2176 g_fini_cb_arg = cb_arg; 2177 2178 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2179 if (rc != 0) { 2180 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2181 bdev_finish_wait_for_examine_done(NULL); 2182 } 2183 } 2184 2185 struct spdk_bdev_io * 2186 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2187 { 2188 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2189 struct spdk_bdev_io *bdev_io; 2190 2191 if (ch->per_thread_cache_count > 0) { 2192 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2193 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2194 ch->per_thread_cache_count--; 2195 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2196 /* 2197 * Don't try to look for bdev_ios in the global pool if there are 2198 * waiters on bdev_ios - we don't want this caller to jump the line. 2199 */ 2200 bdev_io = NULL; 2201 } else { 2202 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2203 } 2204 2205 return bdev_io; 2206 } 2207 2208 void 2209 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2210 { 2211 struct spdk_bdev_mgmt_channel *ch; 2212 2213 assert(bdev_io != NULL); 2214 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2215 2216 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2217 2218 if (bdev_io->internal.buf != NULL) { 2219 bdev_io_put_buf(bdev_io); 2220 } 2221 2222 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2223 ch->per_thread_cache_count++; 2224 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2225 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2226 struct spdk_bdev_io_wait_entry *entry; 2227 2228 entry = TAILQ_FIRST(&ch->io_wait_queue); 2229 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2230 entry->cb_fn(entry->cb_arg); 2231 } 2232 } else { 2233 /* We should never have a full cache with entries on the io wait queue. 
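 * Waiters are drained above whenever an entry goes back into a non-full cache, so a full cache implies the wait queue is already empty.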
*/ 2234 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2235 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2236 } 2237 } 2238 2239 static bool 2240 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2241 { 2242 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2243 2244 switch (limit) { 2245 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2246 return true; 2247 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2248 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2249 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2250 return false; 2251 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2252 default: 2253 return false; 2254 } 2255 } 2256 2257 static bool 2258 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2259 { 2260 switch (bdev_io->type) { 2261 case SPDK_BDEV_IO_TYPE_NVME_IO: 2262 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2263 case SPDK_BDEV_IO_TYPE_READ: 2264 case SPDK_BDEV_IO_TYPE_WRITE: 2265 return true; 2266 case SPDK_BDEV_IO_TYPE_ZCOPY: 2267 if (bdev_io->u.bdev.zcopy.start) { 2268 return true; 2269 } else { 2270 return false; 2271 } 2272 default: 2273 return false; 2274 } 2275 } 2276 2277 static bool 2278 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2279 { 2280 switch (bdev_io->type) { 2281 case SPDK_BDEV_IO_TYPE_NVME_IO: 2282 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2283 /* Bit 1 (0x2) set for read operation */ 2284 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2285 return true; 2286 } else { 2287 return false; 2288 } 2289 case SPDK_BDEV_IO_TYPE_READ: 2290 return true; 2291 case SPDK_BDEV_IO_TYPE_ZCOPY: 2292 /* Populate to read from disk */ 2293 if (bdev_io->u.bdev.zcopy.populate) { 2294 return true; 2295 } else { 2296 return false; 2297 } 2298 default: 2299 return false; 2300 } 2301 } 2302 2303 static uint64_t 2304 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2305 { 2306 struct spdk_bdev *bdev = bdev_io->bdev; 2307 2308 switch (bdev_io->type) { 2309 case SPDK_BDEV_IO_TYPE_NVME_IO: 2310 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2311 return bdev_io->u.nvme_passthru.nbytes; 2312 case SPDK_BDEV_IO_TYPE_READ: 2313 case SPDK_BDEV_IO_TYPE_WRITE: 2314 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2315 case SPDK_BDEV_IO_TYPE_ZCOPY: 2316 /* Track the data in the start phase only */ 2317 if (bdev_io->u.bdev.zcopy.start) { 2318 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2319 } else { 2320 return 0; 2321 } 2322 default: 2323 return 0; 2324 } 2325 } 2326 2327 static bool 2328 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2329 { 2330 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 2331 return true; 2332 } else { 2333 return false; 2334 } 2335 } 2336 2337 static bool 2338 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2339 { 2340 if (bdev_is_read_io(io) == false) { 2341 return false; 2342 } 2343 2344 return bdev_qos_rw_queue_io(limit, io); 2345 } 2346 2347 static bool 2348 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2349 { 2350 if (bdev_is_read_io(io) == true) { 2351 return false; 2352 } 2353 2354 return bdev_qos_rw_queue_io(limit, io); 2355 } 2356 2357 static void 2358 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2359 { 2360 limit->remaining_this_timeslice--; 2361 } 2362 2363 static void 2364 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2365 { 2366 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 2367 } 2368 2369 static void 2370 
bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2371 { 2372 if (bdev_is_read_io(io) == false) { 2373 return; 2374 } 2375 2376 return bdev_qos_rw_bps_update_quota(limit, io); 2377 } 2378 2379 static void 2380 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2381 { 2382 if (bdev_is_read_io(io) == true) { 2383 return; 2384 } 2385 2386 return bdev_qos_rw_bps_update_quota(limit, io); 2387 } 2388 2389 static void 2390 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2391 { 2392 int i; 2393 2394 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2395 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2396 qos->rate_limits[i].queue_io = NULL; 2397 qos->rate_limits[i].update_quota = NULL; 2398 continue; 2399 } 2400 2401 switch (i) { 2402 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2403 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2404 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2405 break; 2406 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2407 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2408 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2409 break; 2410 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2411 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2412 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2413 break; 2414 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2415 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2416 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2417 break; 2418 default: 2419 break; 2420 } 2421 } 2422 } 2423 2424 static void 2425 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2426 struct spdk_bdev_io *bdev_io, 2427 enum spdk_bdev_io_status status) 2428 { 2429 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2430 2431 bdev_io->internal.in_submit_request = true; 2432 bdev_ch->io_outstanding++; 2433 shared_resource->io_outstanding++; 2434 spdk_bdev_io_complete(bdev_io, status); 2435 bdev_io->internal.in_submit_request = false; 2436 } 2437 2438 static inline void 2439 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2440 { 2441 struct spdk_bdev *bdev = bdev_io->bdev; 2442 struct spdk_io_channel *ch = bdev_ch->channel; 2443 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2444 2445 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2446 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2447 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2448 2449 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2450 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2451 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2452 SPDK_BDEV_IO_STATUS_SUCCESS); 2453 return; 2454 } 2455 } 2456 2457 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2458 bdev_io->bdev->split_on_write_unit && 2459 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2460 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2461 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2462 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2463 return; 2464 } 2465 2466 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2467 bdev_ch->io_outstanding++; 2468 shared_resource->io_outstanding++; 2469 bdev_io->internal.in_submit_request = true; 2470 bdev_submit_request(bdev, ch, bdev_io); 2471 
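/* Any completion the module performed synchronously inside the call above ran with in_submit_request still set; clearing it now marks the end of the submit context. */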
bdev_io->internal.in_submit_request = false; 2472 } else { 2473 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 2474 } 2475 } 2476 2477 static bool 2478 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2479 { 2480 int i; 2481 2482 if (bdev_qos_io_to_limit(bdev_io) == true) { 2483 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2484 if (!qos->rate_limits[i].queue_io) { 2485 continue; 2486 } 2487 2488 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2489 bdev_io) == true) { 2490 return true; 2491 } 2492 } 2493 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2494 if (!qos->rate_limits[i].update_quota) { 2495 continue; 2496 } 2497 2498 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2499 } 2500 } 2501 2502 return false; 2503 } 2504 2505 static inline void 2506 _bdev_io_do_submit(void *ctx) 2507 { 2508 struct spdk_bdev_io *bdev_io = ctx; 2509 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2510 2511 bdev_io_do_submit(ch, bdev_io); 2512 } 2513 2514 static int 2515 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2516 { 2517 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2518 int submitted_ios = 0; 2519 2520 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2521 if (!bdev_qos_queue_io(qos, bdev_io)) { 2522 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2523 2524 if (bdev_io->internal.io_submit_ch) { 2525 /* Send back the IO to the original thread for the actual processing. */ 2526 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 2527 bdev_io->internal.io_submit_ch = NULL; 2528 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 2529 _bdev_io_do_submit, bdev_io); 2530 } else { 2531 bdev_io_do_submit(ch, bdev_io); 2532 } 2533 2534 submitted_ios++; 2535 } 2536 } 2537 2538 return submitted_ios; 2539 } 2540 2541 static void 2542 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2543 { 2544 int rc; 2545 2546 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2547 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2548 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2549 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2550 &bdev_io->internal.waitq_entry); 2551 if (rc != 0) { 2552 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2553 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2554 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2555 } 2556 } 2557 2558 static bool 2559 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2560 { 2561 uint32_t io_boundary; 2562 struct spdk_bdev *bdev = bdev_io->bdev; 2563 uint32_t max_size = bdev->max_segment_size; 2564 int max_segs = bdev->max_num_segments; 2565 2566 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2567 io_boundary = bdev->write_unit_size; 2568 } else if (bdev->split_on_optimal_io_boundary) { 2569 io_boundary = bdev->optimal_io_boundary; 2570 } else { 2571 io_boundary = 0; 2572 } 2573 2574 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2575 return false; 2576 } 2577 2578 if (io_boundary) { 2579 uint64_t start_stripe, end_stripe; 2580 2581 start_stripe = bdev_io->u.bdev.offset_blocks; 2582 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2583 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
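 * Illustrative example: with io_boundary = 128 blocks, an I/O covering blocks 120..135 gives start_stripe = 0 and end_stripe = 1, so it crosses a boundary and must be split.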
*/ 2584 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2585 start_stripe >>= spdk_u32log2(io_boundary); 2586 end_stripe >>= spdk_u32log2(io_boundary); 2587 } else { 2588 start_stripe /= io_boundary; 2589 end_stripe /= io_boundary; 2590 } 2591 2592 if (start_stripe != end_stripe) { 2593 return true; 2594 } 2595 } 2596 2597 if (max_segs) { 2598 if (bdev_io->u.bdev.iovcnt > max_segs) { 2599 return true; 2600 } 2601 } 2602 2603 if (max_size) { 2604 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2605 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2606 return true; 2607 } 2608 } 2609 } 2610 2611 return false; 2612 } 2613 2614 static bool 2615 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2616 { 2617 uint32_t num_unmap_segments; 2618 2619 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2620 return false; 2621 } 2622 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2623 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2624 return true; 2625 } 2626 2627 return false; 2628 } 2629 2630 static bool 2631 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2632 { 2633 if (!bdev_io->bdev->max_write_zeroes) { 2634 return false; 2635 } 2636 2637 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2638 return true; 2639 } 2640 2641 return false; 2642 } 2643 2644 static bool 2645 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2646 { 2647 if (bdev_io->bdev->max_copy != 0 && 2648 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2649 return true; 2650 } 2651 2652 return false; 2653 } 2654 2655 static bool 2656 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2657 { 2658 switch (bdev_io->type) { 2659 case SPDK_BDEV_IO_TYPE_READ: 2660 case SPDK_BDEV_IO_TYPE_WRITE: 2661 return bdev_rw_should_split(bdev_io); 2662 case SPDK_BDEV_IO_TYPE_UNMAP: 2663 return bdev_unmap_should_split(bdev_io); 2664 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2665 return bdev_write_zeroes_should_split(bdev_io); 2666 case SPDK_BDEV_IO_TYPE_COPY: 2667 return bdev_copy_should_split(bdev_io); 2668 default: 2669 return false; 2670 } 2671 } 2672 2673 static uint32_t 2674 _to_next_boundary(uint64_t offset, uint32_t boundary) 2675 { 2676 return (boundary - (offset % boundary)); 2677 } 2678 2679 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2680 2681 static void _bdev_rw_split(void *_bdev_io); 2682 2683 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2684 2685 static void 2686 _bdev_unmap_split(void *_bdev_io) 2687 { 2688 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2689 } 2690 2691 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2692 2693 static void 2694 _bdev_write_zeroes_split(void *_bdev_io) 2695 { 2696 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2697 } 2698 2699 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 2700 2701 static void 2702 _bdev_copy_split(void *_bdev_io) 2703 { 2704 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 2705 } 2706 2707 static int 2708 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2709 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2710 { 2711 int rc; 2712 uint64_t current_offset, current_remaining, current_src_offset; 2713 spdk_bdev_io_wait_cb io_wait_fn; 2714 2715 current_offset = *offset; 2716 current_remaining = *remaining; 2717 2718 bdev_io->u.bdev.split_outstanding++; 2719 2720 io_wait_fn = 
_bdev_rw_split; 2721 switch (bdev_io->type) { 2722 case SPDK_BDEV_IO_TYPE_READ: 2723 assert(bdev_io->u.bdev.accel_sequence == NULL); 2724 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2725 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2726 iov, iovcnt, md_buf, current_offset, 2727 num_blocks, bdev_io->internal.memory_domain, 2728 bdev_io->internal.memory_domain_ctx, NULL, 2729 bdev_io_split_done, bdev_io); 2730 break; 2731 case SPDK_BDEV_IO_TYPE_WRITE: 2732 assert(bdev_io->u.bdev.accel_sequence == NULL); 2733 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2734 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2735 iov, iovcnt, md_buf, current_offset, 2736 num_blocks, bdev_io->internal.memory_domain, 2737 bdev_io->internal.memory_domain_ctx, NULL, 2738 bdev_io_split_done, bdev_io); 2739 break; 2740 case SPDK_BDEV_IO_TYPE_UNMAP: 2741 io_wait_fn = _bdev_unmap_split; 2742 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2743 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2744 current_offset, num_blocks, 2745 bdev_io_split_done, bdev_io); 2746 break; 2747 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2748 io_wait_fn = _bdev_write_zeroes_split; 2749 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2750 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2751 current_offset, num_blocks, 2752 bdev_io_split_done, bdev_io); 2753 break; 2754 case SPDK_BDEV_IO_TYPE_COPY: 2755 io_wait_fn = _bdev_copy_split; 2756 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 2757 (current_offset - bdev_io->u.bdev.offset_blocks); 2758 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 2759 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2760 current_offset, current_src_offset, num_blocks, 2761 bdev_io_split_done, bdev_io); 2762 break; 2763 default: 2764 assert(false); 2765 rc = -EINVAL; 2766 break; 2767 } 2768 2769 if (rc == 0) { 2770 current_offset += num_blocks; 2771 current_remaining -= num_blocks; 2772 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2773 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2774 *offset = current_offset; 2775 *remaining = current_remaining; 2776 } else { 2777 bdev_io->u.bdev.split_outstanding--; 2778 if (rc == -ENOMEM) { 2779 if (bdev_io->u.bdev.split_outstanding == 0) { 2780 /* No I/O is outstanding. Hence we should wait here. */ 2781 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2782 } 2783 } else { 2784 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2785 if (bdev_io->u.bdev.split_outstanding == 0) { 2786 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2787 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2788 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2789 } 2790 } 2791 } 2792 2793 return rc; 2794 } 2795 2796 static void 2797 _bdev_rw_split(void *_bdev_io) 2798 { 2799 struct iovec *parent_iov, *iov; 2800 struct spdk_bdev_io *bdev_io = _bdev_io; 2801 struct spdk_bdev *bdev = bdev_io->bdev; 2802 uint64_t parent_offset, current_offset, remaining; 2803 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2804 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2805 uint32_t iovcnt, iov_len, child_iovsize; 2806 uint32_t blocklen = bdev->blocklen; 2807 uint32_t io_boundary; 2808 uint32_t max_segment_size = bdev->max_segment_size; 2809 uint32_t max_child_iovcnt = bdev->max_num_segments; 2810 void *md_buf = NULL; 2811 int rc; 2812 2813 max_segment_size = max_segment_size ? 
max_segment_size : UINT32_MAX; 2814 max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 2815 SPDK_BDEV_IO_NUM_CHILD_IOV; 2816 2817 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2818 io_boundary = bdev->write_unit_size; 2819 } else if (bdev->split_on_optimal_io_boundary) { 2820 io_boundary = bdev->optimal_io_boundary; 2821 } else { 2822 io_boundary = UINT32_MAX; 2823 } 2824 2825 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2826 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 2827 parent_offset = bdev_io->u.bdev.offset_blocks; 2828 parent_iov_offset = (current_offset - parent_offset) * blocklen; 2829 parent_iovcnt = bdev_io->u.bdev.iovcnt; 2830 2831 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 2832 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2833 if (parent_iov_offset < parent_iov->iov_len) { 2834 break; 2835 } 2836 parent_iov_offset -= parent_iov->iov_len; 2837 } 2838 2839 child_iovcnt = 0; 2840 while (remaining > 0 && parent_iovpos < parent_iovcnt && 2841 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 2842 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 2843 to_next_boundary = spdk_min(remaining, to_next_boundary); 2844 to_next_boundary_bytes = to_next_boundary * blocklen; 2845 2846 iov = &bdev_io->child_iov[child_iovcnt]; 2847 iovcnt = 0; 2848 2849 if (bdev_io->u.bdev.md_buf) { 2850 md_buf = (char *)bdev_io->u.bdev.md_buf + 2851 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 2852 } 2853 2854 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 2855 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 2856 iovcnt < child_iovsize) { 2857 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2858 iov_len = parent_iov->iov_len - parent_iov_offset; 2859 2860 iov_len = spdk_min(iov_len, max_segment_size); 2861 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 2862 to_next_boundary_bytes -= iov_len; 2863 2864 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 2865 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 2866 2867 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 2868 parent_iov_offset += iov_len; 2869 } else { 2870 parent_iovpos++; 2871 parent_iov_offset = 0; 2872 } 2873 child_iovcnt++; 2874 iovcnt++; 2875 } 2876 2877 if (to_next_boundary_bytes > 0) { 2878 /* We had to stop this child I/O early because we ran out of 2879 * child_iov space or were limited by max_num_segments. 2880 * Ensure the iovs to be aligned with block size and 2881 * then adjust to_next_boundary before starting the 2882 * child I/O. 
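 * Illustrative example: with a 512-byte blocklen, if the accumulated child iovs end 324 bytes into a block, the loop below trims those 324 bytes from the tail so the child I/O ends block-aligned, and to_next_boundary is reduced by the blocks handed back.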
2883 */ 2884 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || 2885 iovcnt == child_iovsize); 2886 to_last_block_bytes = to_next_boundary_bytes % blocklen; 2887 if (to_last_block_bytes != 0) { 2888 uint32_t child_iovpos = child_iovcnt - 1; 2889 /* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV, 2890 * so the loop will naturally end 2891 */ 2892 2893 to_last_block_bytes = blocklen - to_last_block_bytes; 2894 to_next_boundary_bytes += to_last_block_bytes; 2895 while (to_last_block_bytes > 0 && iovcnt > 0) { 2896 iov_len = spdk_min(to_last_block_bytes, 2897 bdev_io->child_iov[child_iovpos].iov_len); 2898 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 2899 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 2900 child_iovpos--; 2901 if (--iovcnt == 0) { 2902 /* If the child IO is less than a block size, just return: outstanding 2903 * children will continue the split. If this is the first child IO of the 2904 * split round (none outstanding), fail the parent IO with an error. 2905 */ 2906 if (bdev_io->u.bdev.split_outstanding == 0) { 2907 SPDK_ERRLOG("The first child io was less than a block size\n"); 2908 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2909 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2910 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2911 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2912 } 2913 2914 return; 2915 } 2916 } 2917 2918 to_last_block_bytes -= iov_len; 2919 2920 if (parent_iov_offset == 0) { 2921 parent_iovpos--; 2922 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 2923 } 2924 parent_iov_offset -= iov_len; 2925 } 2926 2927 assert(to_last_block_bytes == 0); 2928 } 2929 to_next_boundary -= to_next_boundary_bytes / blocklen; 2930 } 2931 2932 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 2933 &current_offset, &remaining); 2934 if (spdk_unlikely(rc)) { 2935 return; 2936 } 2937 } 2938 } 2939 2940 static void 2941 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 2942 { 2943 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 2944 uint32_t num_children_reqs = 0; 2945 int rc; 2946 2947 offset = bdev_io->u.bdev.split_current_offset_blocks; 2948 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2949 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 2950 2951 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2952 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 2953 2954 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 2955 &offset, &remaining); 2956 if (spdk_likely(rc == 0)) { 2957 num_children_reqs++; 2958 } else { 2959 return; 2960 } 2961 } 2962 } 2963 2964 static void 2965 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 2966 { 2967 uint64_t offset, write_zeroes_blocks, remaining; 2968 uint32_t num_children_reqs = 0; 2969 int rc; 2970 2971 offset = bdev_io->u.bdev.split_current_offset_blocks; 2972 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2973 2974 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2975 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 2976 2977 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 2978 &offset, &remaining); 2979 if (spdk_likely(rc == 0)) { 2980 num_children_reqs++; 2981 } else { 2982 return; 2983 } 2984 } 2985 } 2986 2987 static void 2988 bdev_copy_split(struct spdk_bdev_io *bdev_io) 2989 { 2990 uint64_t offset,
copy_blocks, remaining; 2991 uint32_t num_children_reqs = 0; 2992 int rc; 2993 2994 offset = bdev_io->u.bdev.split_current_offset_blocks; 2995 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2996 2997 assert(bdev_io->bdev->max_copy != 0); 2998 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 2999 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3000 3001 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3002 &offset, &remaining); 3003 if (spdk_likely(rc == 0)) { 3004 num_children_reqs++; 3005 } else { 3006 return; 3007 } 3008 } 3009 } 3010 3011 static void 3012 parent_bdev_io_complete(void *ctx, int rc) 3013 { 3014 struct spdk_bdev_io *parent_io = ctx; 3015 3016 if (rc) { 3017 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3018 } 3019 3020 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3021 parent_io->internal.caller_ctx); 3022 } 3023 3024 static void 3025 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3026 { 3027 struct spdk_bdev_io *bdev_io = ctx; 3028 3029 /* u.bdev.accel_sequence should have already been cleared at this point */ 3030 assert(bdev_io->u.bdev.accel_sequence == NULL); 3031 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3032 3033 TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 3034 bdev_io->internal.accel_sequence = NULL; 3035 3036 if (spdk_unlikely(status != 0)) { 3037 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3038 } 3039 3040 parent_bdev_io_complete(bdev_io, status); 3041 } 3042 3043 static void 3044 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3045 { 3046 struct spdk_bdev_io *parent_io = cb_arg; 3047 3048 spdk_bdev_free_io(bdev_io); 3049 3050 if (!success) { 3051 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3052 /* If any child I/O failed, stop further splitting process. */ 3053 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 3054 parent_io->u.bdev.split_remaining_num_blocks = 0; 3055 } 3056 parent_io->u.bdev.split_outstanding--; 3057 if (parent_io->u.bdev.split_outstanding != 0) { 3058 return; 3059 } 3060 3061 /* 3062 * Parent I/O finishes when all blocks are consumed. 3063 */ 3064 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 3065 assert(parent_io->internal.cb != bdev_io_split_done); 3066 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 3067 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 3068 3069 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io) && 3070 spdk_likely(success)) { 3071 bdev_io_exec_sequence(bdev_io, bdev_io_complete_parent_sequence_cb); 3072 } else if (parent_io->internal.orig_iovcnt != 0) { 3073 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3074 /* bdev IO will be completed in the callback */ 3075 } else { 3076 parent_bdev_io_complete(parent_io, 0); 3077 } 3078 return; 3079 } 3080 3081 /* 3082 * Continue with the splitting process. This function will complete the parent I/O if the 3083 * splitting is done. 
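 * Each pass below issues at most a bounded batch of children (limited by SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS, SPDK_BDEV_MAX_CHILDREN_COPY_REQS, or the available child iovs for reads and writes), so a large request is split incrementally as earlier children complete.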
3084 */ 3085 switch (parent_io->type) { 3086 case SPDK_BDEV_IO_TYPE_READ: 3087 case SPDK_BDEV_IO_TYPE_WRITE: 3088 _bdev_rw_split(parent_io); 3089 break; 3090 case SPDK_BDEV_IO_TYPE_UNMAP: 3091 bdev_unmap_split(parent_io); 3092 break; 3093 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3094 bdev_write_zeroes_split(parent_io); 3095 break; 3096 case SPDK_BDEV_IO_TYPE_COPY: 3097 bdev_copy_split(parent_io); 3098 break; 3099 default: 3100 assert(false); 3101 break; 3102 } 3103 } 3104 3105 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3106 bool success); 3107 3108 static void 3109 bdev_io_split(struct spdk_bdev_io *bdev_io) 3110 { 3111 assert(bdev_io_should_split(bdev_io)); 3112 3113 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3114 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3115 bdev_io->u.bdev.split_outstanding = 0; 3116 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3117 3118 switch (bdev_io->type) { 3119 case SPDK_BDEV_IO_TYPE_READ: 3120 case SPDK_BDEV_IO_TYPE_WRITE: 3121 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3122 _bdev_rw_split(bdev_io); 3123 } else { 3124 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3125 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3126 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3127 } 3128 break; 3129 case SPDK_BDEV_IO_TYPE_UNMAP: 3130 bdev_unmap_split(bdev_io); 3131 break; 3132 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3133 bdev_write_zeroes_split(bdev_io); 3134 break; 3135 case SPDK_BDEV_IO_TYPE_COPY: 3136 bdev_copy_split(bdev_io); 3137 break; 3138 default: 3139 assert(false); 3140 break; 3141 } 3142 } 3143 3144 static void 3145 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3146 { 3147 if (!success) { 3148 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3149 return; 3150 } 3151 3152 _bdev_rw_split(bdev_io); 3153 } 3154 3155 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 3156 * be inlined, at least on some compilers. 
3157 */ 3158 static inline void 3159 _bdev_io_submit(void *ctx) 3160 { 3161 struct spdk_bdev_io *bdev_io = ctx; 3162 struct spdk_bdev *bdev = bdev_io->bdev; 3163 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3164 3165 if (spdk_likely(bdev_ch->flags == 0)) { 3166 bdev_io_do_submit(bdev_ch, bdev_io); 3167 return; 3168 } 3169 3170 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3171 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3172 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3173 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3174 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 3175 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3176 } else { 3177 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 3178 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3179 } 3180 } else { 3181 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3182 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3183 } 3184 } 3185 3186 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3187 3188 bool 3189 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3190 { 3191 if (range1->length == 0 || range2->length == 0) { 3192 return false; 3193 } 3194 3195 if (range1->offset + range1->length <= range2->offset) { 3196 return false; 3197 } 3198 3199 if (range2->offset + range2->length <= range1->offset) { 3200 return false; 3201 } 3202 3203 return true; 3204 } 3205 3206 static bool 3207 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3208 { 3209 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3210 struct lba_range r; 3211 3212 switch (bdev_io->type) { 3213 case SPDK_BDEV_IO_TYPE_NVME_IO: 3214 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3215 /* Don't try to decode the NVMe command - just assume worst-case and that 3216 * it overlaps a locked range. 3217 */ 3218 return true; 3219 case SPDK_BDEV_IO_TYPE_WRITE: 3220 case SPDK_BDEV_IO_TYPE_UNMAP: 3221 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3222 case SPDK_BDEV_IO_TYPE_ZCOPY: 3223 case SPDK_BDEV_IO_TYPE_COPY: 3224 r.offset = bdev_io->u.bdev.offset_blocks; 3225 r.length = bdev_io->u.bdev.num_blocks; 3226 if (!bdev_lba_range_overlapped(range, &r)) { 3227 /* This I/O doesn't overlap the specified LBA range. */ 3228 return false; 3229 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3230 /* This I/O overlaps, but the I/O is on the same channel that locked this 3231 * range, and the caller_ctx is the same as the locked_ctx. This means 3232 * that this I/O is associated with the lock, and is allowed to execute. 
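 * (Plain reads are never held back by a locked range at all: they fall through to the default case below.)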
3233 */ 3234 return false; 3235 } else { 3236 return true; 3237 } 3238 default: 3239 return false; 3240 } 3241 } 3242 3243 void 3244 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3245 { 3246 struct spdk_bdev *bdev = bdev_io->bdev; 3247 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 3248 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3249 3250 assert(thread != NULL); 3251 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3252 3253 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3254 struct lba_range *range; 3255 3256 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3257 if (bdev_io_range_is_locked(bdev_io, range)) { 3258 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3259 return; 3260 } 3261 } 3262 } 3263 3264 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 3265 3266 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3267 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 3268 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3269 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3270 spdk_bdev_get_name(bdev)); 3271 3272 if (bdev_io->internal.split) { 3273 bdev_io_split(bdev_io); 3274 return; 3275 } 3276 3277 if (ch->flags & BDEV_CH_QOS_ENABLED) { 3278 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 3279 _bdev_io_submit(bdev_io); 3280 } else { 3281 bdev_io->internal.io_submit_ch = ch; 3282 bdev_io->internal.ch = bdev->internal.qos->ch; 3283 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 3284 } 3285 } else { 3286 _bdev_io_submit(bdev_io); 3287 } 3288 } 3289 3290 static inline void 3291 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3292 { 3293 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3294 3295 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 3296 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3297 * For write operation we need to pull buffers from memory domain before submitting IO. 3298 * Once read operation completes, we need to use memory_domain push functionality to 3299 * update data in original memory domain IO buffer 3300 * This IO request will go through a regular IO flow, so clear memory domains pointers */ 3301 bdev_io->u.bdev.memory_domain = NULL; 3302 bdev_io->u.bdev.memory_domain_ctx = NULL; 3303 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 3304 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3305 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3306 } 3307 3308 static inline void 3309 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3310 { 3311 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3312 bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io); 3313 3314 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3315 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3316 bdev_io_complete_unsubmitted(bdev_io); 3317 return; 3318 } 3319 3320 /* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does 3321 * support them, but we need to execute an accel sequence and the data buffer is from accel 3322 * memory domain (to avoid doing a push/pull from that domain). 
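 * In either case _bdev_io_ext_use_bounce_buffer() clears the request's memory domain pointers and sends it down the regular buffer-allocation path.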
3323 */ 3324 if ((bdev_io->internal.memory_domain && !desc->memory_domains_supported) || 3325 (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3326 _bdev_io_ext_use_bounce_buffer(bdev_io); 3327 return; 3328 } 3329 3330 if (needs_exec) { 3331 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3332 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3333 return; 3334 } 3335 /* For reads we'll execute the sequence after the data is read, so, for now, only 3336 * clear out accel_sequence pointer and submit the IO */ 3337 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3338 bdev_io->u.bdev.accel_sequence = NULL; 3339 } 3340 3341 bdev_io_submit(bdev_io); 3342 } 3343 3344 static void 3345 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3346 { 3347 struct spdk_bdev *bdev = bdev_io->bdev; 3348 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3349 struct spdk_io_channel *ch = bdev_ch->channel; 3350 3351 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3352 3353 bdev_io->internal.in_submit_request = true; 3354 bdev_submit_request(bdev, ch, bdev_io); 3355 bdev_io->internal.in_submit_request = false; 3356 } 3357 3358 void 3359 bdev_io_init(struct spdk_bdev_io *bdev_io, 3360 struct spdk_bdev *bdev, void *cb_arg, 3361 spdk_bdev_io_completion_cb cb) 3362 { 3363 bdev_io->bdev = bdev; 3364 bdev_io->internal.caller_ctx = cb_arg; 3365 bdev_io->internal.cb = cb; 3366 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3367 bdev_io->internal.in_submit_request = false; 3368 bdev_io->internal.buf = NULL; 3369 bdev_io->internal.io_submit_ch = NULL; 3370 bdev_io->internal.orig_iovs = NULL; 3371 bdev_io->internal.orig_iovcnt = 0; 3372 bdev_io->internal.orig_md_iov.iov_base = NULL; 3373 bdev_io->internal.error.nvme.cdw0 = 0; 3374 bdev_io->num_retries = 0; 3375 bdev_io->internal.get_buf_cb = NULL; 3376 bdev_io->internal.get_aux_buf_cb = NULL; 3377 bdev_io->internal.memory_domain = NULL; 3378 bdev_io->internal.memory_domain_ctx = NULL; 3379 bdev_io->internal.data_transfer_cpl = NULL; 3380 bdev_io->internal.split = bdev_io_should_split(bdev_io); 3381 bdev_io->internal.accel_sequence = NULL; 3382 } 3383 3384 static bool 3385 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3386 { 3387 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3388 } 3389 3390 bool 3391 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3392 { 3393 bool supported; 3394 3395 supported = bdev_io_type_supported(bdev, io_type); 3396 3397 if (!supported) { 3398 switch (io_type) { 3399 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3400 /* The bdev layer will emulate write zeroes as long as write is supported. 
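 * In that case the backing module never sees a WRITE_ZEROES request; the bdev layer satisfies it using regular writes instead.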
*/ 3401 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3402 break; 3403 default: 3404 break; 3405 } 3406 } 3407 3408 return supported; 3409 } 3410 3411 uint64_t 3412 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3413 { 3414 return bdev_io->internal.submit_tsc; 3415 } 3416 3417 int 3418 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3419 { 3420 if (bdev->fn_table->dump_info_json) { 3421 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3422 } 3423 3424 return 0; 3425 } 3426 3427 static void 3428 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3429 { 3430 uint32_t max_per_timeslice = 0; 3431 int i; 3432 3433 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3434 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3435 qos->rate_limits[i].max_per_timeslice = 0; 3436 continue; 3437 } 3438 3439 max_per_timeslice = qos->rate_limits[i].limit * 3440 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3441 3442 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3443 qos->rate_limits[i].min_per_timeslice); 3444 3445 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 3446 } 3447 3448 bdev_qos_set_ops(qos); 3449 } 3450 3451 static int 3452 bdev_channel_poll_qos(void *arg) 3453 { 3454 struct spdk_bdev_qos *qos = arg; 3455 uint64_t now = spdk_get_ticks(); 3456 int i; 3457 3458 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3459 /* We received our callback earlier than expected - return 3460 * immediately and wait to do accounting until at least one 3461 * timeslice has actually expired. This should never happen 3462 * with a well-behaved timer implementation. 3463 */ 3464 return SPDK_POLLER_IDLE; 3465 } 3466 3467 /* Reset for next round of rate limiting */ 3468 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3469 /* We may have allowed the IOs or bytes to slightly overrun in the last 3470 * timeslice. remaining_this_timeslice is signed, so if it's negative 3471 * here, we'll account for the overrun so that the next timeslice will 3472 * be appropriately reduced. 
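 * Illustrative example: with a byte limit whose max_per_timeslice is 1 MiB, an oversized I/O may leave remaining_this_timeslice at -256 KiB; after one elapsed timeslice the refill loop below starts the new timeslice at 768 KiB rather than the full 1 MiB.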
3473 */ 3474 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 3475 qos->rate_limits[i].remaining_this_timeslice = 0; 3476 } 3477 } 3478 3479 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3480 qos->last_timeslice += qos->timeslice_size; 3481 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3482 qos->rate_limits[i].remaining_this_timeslice += 3483 qos->rate_limits[i].max_per_timeslice; 3484 } 3485 } 3486 3487 return bdev_qos_io_submit(qos->ch, qos); 3488 } 3489 3490 static void 3491 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3492 { 3493 struct spdk_bdev_shared_resource *shared_resource; 3494 struct lba_range *range; 3495 3496 bdev_free_io_stat(ch->stat); 3497 #ifdef SPDK_CONFIG_VTUNE 3498 bdev_free_io_stat(ch->prev_stat); 3499 #endif 3500 3501 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3502 range = TAILQ_FIRST(&ch->locked_ranges); 3503 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3504 free(range); 3505 } 3506 3507 spdk_put_io_channel(ch->channel); 3508 spdk_put_io_channel(ch->accel_channel); 3509 3510 shared_resource = ch->shared_resource; 3511 3512 assert(TAILQ_EMPTY(&ch->io_locked)); 3513 assert(TAILQ_EMPTY(&ch->io_submitted)); 3514 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 3515 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 3516 assert(ch->io_outstanding == 0); 3517 assert(shared_resource->ref > 0); 3518 shared_resource->ref--; 3519 if (shared_resource->ref == 0) { 3520 assert(shared_resource->io_outstanding == 0); 3521 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3522 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3523 free(shared_resource); 3524 } 3525 } 3526 3527 static void 3528 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3529 { 3530 struct spdk_bdev_qos *qos = bdev->internal.qos; 3531 int i; 3532 3533 assert(spdk_spin_held(&bdev->internal.spinlock)); 3534 3535 /* Rate limiting on this bdev enabled */ 3536 if (qos) { 3537 if (qos->ch == NULL) { 3538 struct spdk_io_channel *io_ch; 3539 3540 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3541 bdev->name, spdk_get_thread()); 3542 3543 /* No qos channel has been selected, so set one up */ 3544 3545 /* Take another reference to ch */ 3546 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3547 assert(io_ch != NULL); 3548 qos->ch = ch; 3549 3550 qos->thread = spdk_io_channel_get_thread(io_ch); 3551 3552 TAILQ_INIT(&qos->queued); 3553 3554 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3555 if (bdev_qos_is_iops_rate_limit(i) == true) { 3556 qos->rate_limits[i].min_per_timeslice = 3557 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3558 } else { 3559 qos->rate_limits[i].min_per_timeslice = 3560 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3561 } 3562 3563 if (qos->rate_limits[i].limit == 0) { 3564 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3565 } 3566 } 3567 bdev_qos_update_max_quota_per_timeslice(qos); 3568 qos->timeslice_size = 3569 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3570 qos->last_timeslice = spdk_get_ticks(); 3571 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3572 qos, 3573 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3574 } 3575 3576 ch->flags |= BDEV_CH_QOS_ENABLED; 3577 } 3578 } 3579 3580 struct poll_timeout_ctx { 3581 struct spdk_bdev_desc *desc; 3582 uint64_t timeout_in_sec; 3583 spdk_bdev_io_timeout_cb cb_fn; 3584 void *cb_arg; 3585 }; 3586 3587 static void 3588 bdev_desc_free(struct spdk_bdev_desc 
*desc) 3589 { 3590 spdk_spin_destroy(&desc->spinlock); 3591 free(desc->media_events_buffer); 3592 free(desc); 3593 } 3594 3595 static void 3596 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 3597 { 3598 struct poll_timeout_ctx *ctx = _ctx; 3599 struct spdk_bdev_desc *desc = ctx->desc; 3600 3601 free(ctx); 3602 3603 spdk_spin_lock(&desc->spinlock); 3604 desc->refs--; 3605 if (desc->closed == true && desc->refs == 0) { 3606 spdk_spin_unlock(&desc->spinlock); 3607 bdev_desc_free(desc); 3608 return; 3609 } 3610 spdk_spin_unlock(&desc->spinlock); 3611 } 3612 3613 static void 3614 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3615 struct spdk_io_channel *io_ch, void *_ctx) 3616 { 3617 struct poll_timeout_ctx *ctx = _ctx; 3618 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3619 struct spdk_bdev_desc *desc = ctx->desc; 3620 struct spdk_bdev_io *bdev_io; 3621 uint64_t now; 3622 3623 spdk_spin_lock(&desc->spinlock); 3624 if (desc->closed == true) { 3625 spdk_spin_unlock(&desc->spinlock); 3626 spdk_bdev_for_each_channel_continue(i, -1); 3627 return; 3628 } 3629 spdk_spin_unlock(&desc->spinlock); 3630 3631 now = spdk_get_ticks(); 3632 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3633 /* Exclude any I/O that are generated via splitting. */ 3634 if (bdev_io->internal.cb == bdev_io_split_done) { 3635 continue; 3636 } 3637 3638 /* Once we find an I/O that has not timed out, we can immediately 3639 * exit the loop. 3640 */ 3641 if (now < (bdev_io->internal.submit_tsc + 3642 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3643 goto end; 3644 } 3645 3646 if (bdev_io->internal.desc == desc) { 3647 ctx->cb_fn(ctx->cb_arg, bdev_io); 3648 } 3649 } 3650 3651 end: 3652 spdk_bdev_for_each_channel_continue(i, 0); 3653 } 3654 3655 static int 3656 bdev_poll_timeout_io(void *arg) 3657 { 3658 struct spdk_bdev_desc *desc = arg; 3659 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3660 struct poll_timeout_ctx *ctx; 3661 3662 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3663 if (!ctx) { 3664 SPDK_ERRLOG("failed to allocate memory\n"); 3665 return SPDK_POLLER_BUSY; 3666 } 3667 ctx->desc = desc; 3668 ctx->cb_arg = desc->cb_arg; 3669 ctx->cb_fn = desc->cb_fn; 3670 ctx->timeout_in_sec = desc->timeout_in_sec; 3671 3672 /* Take a ref on the descriptor in case it gets closed while we are checking 3673 * all of the channels. 
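 * The ref is dropped in bdev_channel_poll_timeout_io_done(); if the descriptor was closed in the meantime, releasing the last ref there frees it via bdev_desc_free().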
3674 */ 3675 spdk_spin_lock(&desc->spinlock); 3676 desc->refs++; 3677 spdk_spin_unlock(&desc->spinlock); 3678 3679 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 3680 bdev_channel_poll_timeout_io_done); 3681 3682 return SPDK_POLLER_BUSY; 3683 } 3684 3685 int 3686 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3687 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3688 { 3689 assert(desc->thread == spdk_get_thread()); 3690 3691 spdk_poller_unregister(&desc->io_timeout_poller); 3692 3693 if (timeout_in_sec) { 3694 assert(cb_fn != NULL); 3695 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3696 desc, 3697 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3698 1000); 3699 if (desc->io_timeout_poller == NULL) { 3700 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3701 return -1; 3702 } 3703 } 3704 3705 desc->cb_fn = cb_fn; 3706 desc->cb_arg = cb_arg; 3707 desc->timeout_in_sec = timeout_in_sec; 3708 3709 return 0; 3710 } 3711 3712 static int 3713 bdev_channel_create(void *io_device, void *ctx_buf) 3714 { 3715 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3716 struct spdk_bdev_channel *ch = ctx_buf; 3717 struct spdk_io_channel *mgmt_io_ch; 3718 struct spdk_bdev_mgmt_channel *mgmt_ch; 3719 struct spdk_bdev_shared_resource *shared_resource; 3720 struct lba_range *range; 3721 3722 ch->bdev = bdev; 3723 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3724 if (!ch->channel) { 3725 return -1; 3726 } 3727 3728 ch->accel_channel = spdk_accel_get_io_channel(); 3729 if (!ch->accel_channel) { 3730 spdk_put_io_channel(ch->channel); 3731 return -1; 3732 } 3733 3734 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3735 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3736 3737 assert(ch->histogram == NULL); 3738 if (bdev->internal.histogram_enabled) { 3739 ch->histogram = spdk_histogram_data_alloc(); 3740 if (ch->histogram == NULL) { 3741 SPDK_ERRLOG("Could not allocate histogram\n"); 3742 } 3743 } 3744 3745 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3746 if (!mgmt_io_ch) { 3747 spdk_put_io_channel(ch->channel); 3748 spdk_put_io_channel(ch->accel_channel); 3749 return -1; 3750 } 3751 3752 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 3753 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3754 if (shared_resource->shared_ch == ch->channel) { 3755 spdk_put_io_channel(mgmt_io_ch); 3756 shared_resource->ref++; 3757 break; 3758 } 3759 } 3760 3761 if (shared_resource == NULL) { 3762 shared_resource = calloc(1, sizeof(*shared_resource)); 3763 if (shared_resource == NULL) { 3764 spdk_put_io_channel(ch->channel); 3765 spdk_put_io_channel(ch->accel_channel); 3766 spdk_put_io_channel(mgmt_io_ch); 3767 return -1; 3768 } 3769 3770 shared_resource->mgmt_ch = mgmt_ch; 3771 shared_resource->io_outstanding = 0; 3772 TAILQ_INIT(&shared_resource->nomem_io); 3773 shared_resource->nomem_threshold = 0; 3774 shared_resource->shared_ch = ch->channel; 3775 shared_resource->ref = 1; 3776 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3777 } 3778 3779 ch->io_outstanding = 0; 3780 TAILQ_INIT(&ch->queued_resets); 3781 TAILQ_INIT(&ch->locked_ranges); 3782 ch->flags = 0; 3783 ch->shared_resource = shared_resource; 3784 3785 TAILQ_INIT(&ch->io_submitted); 3786 TAILQ_INIT(&ch->io_locked); 3787 TAILQ_INIT(&ch->io_accel_exec); 3788 TAILQ_INIT(&ch->io_memory_domain); 3789 3790 ch->stat = bdev_alloc_io_stat(false); 3791 if (ch->stat == NULL) { 3792 
bdev_channel_destroy_resource(ch); 3793 return -1; 3794 } 3795 3796 ch->stat->ticks_rate = spdk_get_ticks_hz(); 3797 3798 #ifdef SPDK_CONFIG_VTUNE 3799 { 3800 char *name; 3801 __itt_init_ittlib(NULL, 0); 3802 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3803 if (!name) { 3804 bdev_channel_destroy_resource(ch); 3805 return -1; 3806 } 3807 ch->handle = __itt_string_handle_create(name); 3808 free(name); 3809 ch->start_tsc = spdk_get_ticks(); 3810 ch->interval_tsc = spdk_get_ticks_hz() / 100; 3811 ch->prev_stat = bdev_alloc_io_stat(false); 3812 if (ch->prev_stat == NULL) { 3813 bdev_channel_destroy_resource(ch); 3814 return -1; 3815 } 3816 } 3817 #endif 3818 3819 spdk_spin_lock(&bdev->internal.spinlock); 3820 bdev_enable_qos(bdev, ch); 3821 3822 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 3823 struct lba_range *new_range; 3824 3825 new_range = calloc(1, sizeof(*new_range)); 3826 if (new_range == NULL) { 3827 spdk_spin_unlock(&bdev->internal.spinlock); 3828 bdev_channel_destroy_resource(ch); 3829 return -1; 3830 } 3831 new_range->length = range->length; 3832 new_range->offset = range->offset; 3833 new_range->locked_ctx = range->locked_ctx; 3834 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 3835 } 3836 3837 spdk_spin_unlock(&bdev->internal.spinlock); 3838 3839 return 0; 3840 } 3841 3842 static int 3843 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 3844 void *cb_ctx) 3845 { 3846 struct spdk_bdev_channel *bdev_ch = cb_ctx; 3847 struct spdk_bdev_io *bdev_io; 3848 uint64_t buf_len; 3849 3850 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 3851 if (bdev_io->internal.ch == bdev_ch) { 3852 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 3853 spdk_iobuf_entry_abort(ch, entry, buf_len); 3854 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3855 } 3856 3857 return 0; 3858 } 3859 3860 /* 3861 * Abort I/O that are waiting on a data buffer. 3862 */ 3863 static void 3864 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 3865 { 3866 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 3867 bdev_abort_all_buf_io_cb, ch); 3868 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 3869 bdev_abort_all_buf_io_cb, ch); 3870 } 3871 3872 /* 3873 * Abort I/O that are queued waiting for submission. These types of I/O are 3874 * linked using the spdk_bdev_io link TAILQ_ENTRY. 3875 */ 3876 static void 3877 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 3878 { 3879 struct spdk_bdev_io *bdev_io, *tmp; 3880 3881 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 3882 if (bdev_io->internal.ch == ch) { 3883 TAILQ_REMOVE(queue, bdev_io, internal.link); 3884 /* 3885 * spdk_bdev_io_complete() assumes that the completed I/O had 3886 * been submitted to the bdev module. Since in this case it 3887 * hadn't, bump io_outstanding to account for the decrement 3888 * that spdk_bdev_io_complete() will do. 
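 * Resets are skipped below because spdk_bdev_io_complete() does not decrement io_outstanding for reset I/O.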
3889 */ 3890 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 3891 ch->io_outstanding++; 3892 ch->shared_resource->io_outstanding++; 3893 } 3894 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3895 } 3896 } 3897 } 3898 3899 static bool 3900 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3901 { 3902 struct spdk_bdev_io *bdev_io; 3903 3904 TAILQ_FOREACH(bdev_io, queue, internal.link) { 3905 if (bdev_io == bio_to_abort) { 3906 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 3907 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3908 return true; 3909 } 3910 } 3911 3912 return false; 3913 } 3914 3915 static int 3916 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 3917 { 3918 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 3919 uint64_t buf_len; 3920 3921 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 3922 if (bdev_io == bio_to_abort) { 3923 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 3924 spdk_iobuf_entry_abort(ch, entry, buf_len); 3925 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3926 return 1; 3927 } 3928 3929 return 0; 3930 } 3931 3932 static bool 3933 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 3934 { 3935 int rc; 3936 3937 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 3938 bdev_abort_buf_io_cb, bio_to_abort); 3939 if (rc == 1) { 3940 return true; 3941 } 3942 3943 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 3944 bdev_abort_buf_io_cb, bio_to_abort); 3945 return rc == 1; 3946 } 3947 3948 static void 3949 bdev_qos_channel_destroy(void *cb_arg) 3950 { 3951 struct spdk_bdev_qos *qos = cb_arg; 3952 3953 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 3954 spdk_poller_unregister(&qos->poller); 3955 3956 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 3957 3958 free(qos); 3959 } 3960 3961 static int 3962 bdev_qos_destroy(struct spdk_bdev *bdev) 3963 { 3964 int i; 3965 3966 /* 3967 * Cleanly shutting down the QoS poller is tricky, because 3968 * during the asynchronous operation the user could open 3969 * a new descriptor and create a new channel, spawning 3970 * a new QoS poller. 3971 * 3972 * The strategy is to create a new QoS structure here and swap it 3973 * in. The shutdown path then continues to refer to the old one 3974 * until it completes and then releases it. 3975 */ 3976 struct spdk_bdev_qos *new_qos, *old_qos; 3977 3978 old_qos = bdev->internal.qos; 3979 3980 new_qos = calloc(1, sizeof(*new_qos)); 3981 if (!new_qos) { 3982 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 3983 return -ENOMEM; 3984 } 3985 3986 /* Copy the old QoS data into the newly allocated structure */ 3987 memcpy(new_qos, old_qos, sizeof(*new_qos)); 3988 3989 /* Zero out the key parts of the QoS structure */ 3990 new_qos->ch = NULL; 3991 new_qos->thread = NULL; 3992 new_qos->poller = NULL; 3993 TAILQ_INIT(&new_qos->queued); 3994 /* 3995 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 3996 * It will be used later for the new QoS structure. 
3997 */ 3998 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3999 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4000 new_qos->rate_limits[i].min_per_timeslice = 0; 4001 new_qos->rate_limits[i].max_per_timeslice = 0; 4002 } 4003 4004 bdev->internal.qos = new_qos; 4005 4006 if (old_qos->thread == NULL) { 4007 free(old_qos); 4008 } else { 4009 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4010 } 4011 4012 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4013 * been destroyed yet. The destruction path will end up waiting for the final 4014 * channel to be put before it releases resources. */ 4015 4016 return 0; 4017 } 4018 4019 void 4020 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4021 { 4022 total->bytes_read += add->bytes_read; 4023 total->num_read_ops += add->num_read_ops; 4024 total->bytes_written += add->bytes_written; 4025 total->num_write_ops += add->num_write_ops; 4026 total->bytes_unmapped += add->bytes_unmapped; 4027 total->num_unmap_ops += add->num_unmap_ops; 4028 total->bytes_copied += add->bytes_copied; 4029 total->num_copy_ops += add->num_copy_ops; 4030 total->read_latency_ticks += add->read_latency_ticks; 4031 total->write_latency_ticks += add->write_latency_ticks; 4032 total->unmap_latency_ticks += add->unmap_latency_ticks; 4033 total->copy_latency_ticks += add->copy_latency_ticks; 4034 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4035 total->max_read_latency_ticks = add->max_read_latency_ticks; 4036 } 4037 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4038 total->min_read_latency_ticks = add->min_read_latency_ticks; 4039 } 4040 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4041 total->max_write_latency_ticks = add->max_write_latency_ticks; 4042 } 4043 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4044 total->min_write_latency_ticks = add->min_write_latency_ticks; 4045 } 4046 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4047 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4048 } 4049 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4050 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4051 } 4052 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4053 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4054 } 4055 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4056 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4057 } 4058 } 4059 4060 static void 4061 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4062 { 4063 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4064 4065 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4066 memcpy(to_stat->io_error, from_stat->io_error, 4067 sizeof(struct spdk_bdev_io_error_stat)); 4068 } 4069 } 4070 4071 void 4072 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4073 { 4074 stat->max_read_latency_ticks = 0; 4075 stat->min_read_latency_ticks = UINT64_MAX; 4076 stat->max_write_latency_ticks = 0; 4077 stat->min_write_latency_ticks = UINT64_MAX; 4078 stat->max_unmap_latency_ticks = 0; 4079 stat->min_unmap_latency_ticks = UINT64_MAX; 4080 stat->max_copy_latency_ticks = 0; 4081 stat->min_copy_latency_ticks = UINT64_MAX; 4082 4083 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4084 return; 4085 } 
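/* SPDK_BDEV_RESET_STAT_ALL additionally clears the cumulative byte/operation counters, the latency totals, and the per-status error counts below. */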
4086 4087 stat->bytes_read = 0; 4088 stat->num_read_ops = 0; 4089 stat->bytes_written = 0; 4090 stat->num_write_ops = 0; 4091 stat->bytes_unmapped = 0; 4092 stat->num_unmap_ops = 0; 4093 stat->bytes_copied = 0; 4094 stat->num_copy_ops = 0; 4095 stat->read_latency_ticks = 0; 4096 stat->write_latency_ticks = 0; 4097 stat->unmap_latency_ticks = 0; 4098 stat->copy_latency_ticks = 0; 4099 4100 if (stat->io_error != NULL) { 4101 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 4102 } 4103 } 4104 4105 struct spdk_bdev_io_stat * 4106 bdev_alloc_io_stat(bool io_error_stat) 4107 { 4108 struct spdk_bdev_io_stat *stat; 4109 4110 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4111 if (stat == NULL) { 4112 return NULL; 4113 } 4114 4115 if (io_error_stat) { 4116 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4117 if (stat->io_error == NULL) { 4118 free(stat); 4119 return NULL; 4120 } 4121 } else { 4122 stat->io_error = NULL; 4123 } 4124 4125 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4126 4127 return stat; 4128 } 4129 4130 void 4131 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4132 { 4133 if (stat != NULL) { 4134 free(stat->io_error); 4135 free(stat); 4136 } 4137 } 4138 4139 void 4140 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4141 { 4142 int i; 4143 4144 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4145 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4146 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4147 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4148 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4149 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4150 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4151 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4152 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4153 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4154 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4155 stat->min_read_latency_ticks != UINT64_MAX ? 4156 stat->min_read_latency_ticks : 0); 4157 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4158 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4159 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4160 stat->min_write_latency_ticks != UINT64_MAX ? 4161 stat->min_write_latency_ticks : 0); 4162 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4163 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4164 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4165 stat->min_unmap_latency_ticks != UINT64_MAX ? 4166 stat->min_unmap_latency_ticks : 0); 4167 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4168 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4169 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4170 stat->min_copy_latency_ticks != UINT64_MAX ? 
4171 stat->min_copy_latency_ticks : 0); 4172 4173 if (stat->io_error != NULL) { 4174 spdk_json_write_named_object_begin(w, "io_error"); 4175 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4176 if (stat->io_error->error_status[i] != 0) { 4177 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4178 stat->io_error->error_status[i]); 4179 } 4180 } 4181 spdk_json_write_object_end(w); 4182 } 4183 } 4184 4185 static void 4186 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4187 { 4188 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4189 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4190 4191 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4192 bdev_abort_all_buf_io(mgmt_ch, ch); 4193 4194 } 4195 4196 static void 4197 bdev_channel_destroy(void *io_device, void *ctx_buf) 4198 { 4199 struct spdk_bdev_channel *ch = ctx_buf; 4200 4201 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4202 spdk_get_thread()); 4203 4204 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 4205 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4206 4207 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4208 spdk_spin_lock(&ch->bdev->internal.spinlock); 4209 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4210 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4211 4212 bdev_abort_all_queued_io(&ch->queued_resets, ch); 4213 4214 bdev_channel_abort_queued_ios(ch); 4215 4216 if (ch->histogram) { 4217 spdk_histogram_data_free(ch->histogram); 4218 } 4219 4220 bdev_channel_destroy_resource(ch); 4221 } 4222 4223 /* 4224 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4225 * to it. Hence we do not have to call bdev_get_by_name() when using this function.
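 * On a name collision, bdev_name_add() frees the duplicated string and returns -EEXIST.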
4226 */ 4227 static int 4228 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4229 { 4230 struct spdk_bdev_name *tmp; 4231 4232 bdev_name->name = strdup(name); 4233 if (bdev_name->name == NULL) { 4234 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4235 return -ENOMEM; 4236 } 4237 4238 bdev_name->bdev = bdev; 4239 4240 spdk_spin_lock(&g_bdev_mgr.spinlock); 4241 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4242 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4243 4244 if (tmp != NULL) { 4245 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4246 free(bdev_name->name); 4247 return -EEXIST; 4248 } 4249 4250 return 0; 4251 } 4252 4253 static void 4254 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4255 { 4256 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4257 free(bdev_name->name); 4258 } 4259 4260 static void 4261 bdev_name_del(struct spdk_bdev_name *bdev_name) 4262 { 4263 spdk_spin_lock(&g_bdev_mgr.spinlock); 4264 bdev_name_del_unsafe(bdev_name); 4265 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4266 } 4267 4268 int 4269 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4270 { 4271 struct spdk_bdev_alias *tmp; 4272 int ret; 4273 4274 if (alias == NULL) { 4275 SPDK_ERRLOG("Empty alias passed\n"); 4276 return -EINVAL; 4277 } 4278 4279 tmp = calloc(1, sizeof(*tmp)); 4280 if (tmp == NULL) { 4281 SPDK_ERRLOG("Unable to allocate alias\n"); 4282 return -ENOMEM; 4283 } 4284 4285 ret = bdev_name_add(&tmp->alias, bdev, alias); 4286 if (ret != 0) { 4287 free(tmp); 4288 return ret; 4289 } 4290 4291 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4292 4293 return 0; 4294 } 4295 4296 static int 4297 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4298 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4299 { 4300 struct spdk_bdev_alias *tmp; 4301 4302 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4303 if (strcmp(alias, tmp->alias.name) == 0) { 4304 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4305 alias_del_fn(&tmp->alias); 4306 free(tmp); 4307 return 0; 4308 } 4309 } 4310 4311 return -ENOENT; 4312 } 4313 4314 int 4315 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4316 { 4317 int rc; 4318 4319 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4320 if (rc == -ENOENT) { 4321 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4322 } 4323 4324 return rc; 4325 } 4326 4327 void 4328 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4329 { 4330 struct spdk_bdev_alias *p, *tmp; 4331 4332 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4333 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4334 bdev_name_del(&p->alias); 4335 free(p); 4336 } 4337 } 4338 4339 struct spdk_io_channel * 4340 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4341 { 4342 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4343 } 4344 4345 void * 4346 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4347 { 4348 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4349 void *ctx = NULL; 4350 4351 if (bdev->fn_table->get_module_ctx) { 4352 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4353 } 4354 4355 return ctx; 4356 } 4357 4358 const char * 4359 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4360 { 4361 return bdev->module->name; 4362 } 4363 4364 const char * 4365 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4366 { 4367 return bdev->name; 4368 } 4369 4370 const char * 4371 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4372 { 4373 return bdev->product_name; 4374 } 4375 4376 
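/*
 * Illustrative sketch (editor's addition, not part of the bdev API): a minimal example of how a
 * caller might combine the alias and getter functions implemented in this file. The helper name
 * example_register_alias() and its log messages are hypothetical; spdk_bdev_alias_add(),
 * spdk_bdev_get_name(), spdk_bdev_get_block_size() and spdk_bdev_get_num_blocks() are the
 * functions defined in this file. Kept under #if 0 so it is an example only and is not compiled.
 */
#if 0
static int
example_register_alias(struct spdk_bdev *bdev, const char *alias)
{
	int rc;

	/* Duplicate names are detected by bdev_name_add()/RB_INSERT(), so checking the
	 * return code is sufficient here. */
	rc = spdk_bdev_alias_add(bdev, alias);
	if (rc != 0) {
		SPDK_ERRLOG("failed to add alias %s to bdev %s: %d\n",
			    alias, spdk_bdev_get_name(bdev), rc);
		return rc;
	}

	/* Query basic geometry through the public getters. */
	SPDK_NOTICELOG("bdev %s (alias %s): %" PRIu32 "-byte blocks, %" PRIu64 " blocks\n",
		       spdk_bdev_get_name(bdev), alias,
		       spdk_bdev_get_block_size(bdev),
		       spdk_bdev_get_num_blocks(bdev));

	return 0;
}
#endif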
const struct spdk_bdev_aliases_list * 4377 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4378 { 4379 return &bdev->aliases; 4380 } 4381 4382 uint32_t 4383 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4384 { 4385 return bdev->blocklen; 4386 } 4387 4388 uint32_t 4389 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4390 { 4391 return bdev->write_unit_size; 4392 } 4393 4394 uint64_t 4395 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4396 { 4397 return bdev->blockcnt; 4398 } 4399 4400 const char * 4401 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4402 { 4403 return qos_rpc_type[type]; 4404 } 4405 4406 void 4407 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4408 { 4409 int i; 4410 4411 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4412 4413 spdk_spin_lock(&bdev->internal.spinlock); 4414 if (bdev->internal.qos) { 4415 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4416 if (bdev->internal.qos->rate_limits[i].limit != 4417 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4418 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4419 if (bdev_qos_is_iops_rate_limit(i) == false) { 4420 /* Change from Byte to Megabyte which is user visible. */ 4421 limits[i] = limits[i] / 1024 / 1024; 4422 } 4423 } 4424 } 4425 } 4426 spdk_spin_unlock(&bdev->internal.spinlock); 4427 } 4428 4429 size_t 4430 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4431 { 4432 return 1 << bdev->required_alignment; 4433 } 4434 4435 uint32_t 4436 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4437 { 4438 return bdev->optimal_io_boundary; 4439 } 4440 4441 bool 4442 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4443 { 4444 return bdev->write_cache; 4445 } 4446 4447 const struct spdk_uuid * 4448 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4449 { 4450 return &bdev->uuid; 4451 } 4452 4453 uint16_t 4454 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4455 { 4456 return bdev->acwu; 4457 } 4458 4459 uint32_t 4460 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4461 { 4462 return bdev->md_len; 4463 } 4464 4465 bool 4466 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4467 { 4468 return (bdev->md_len != 0) && bdev->md_interleave; 4469 } 4470 4471 bool 4472 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4473 { 4474 return (bdev->md_len != 0) && !bdev->md_interleave; 4475 } 4476 4477 bool 4478 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4479 { 4480 return bdev->zoned; 4481 } 4482 4483 uint32_t 4484 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4485 { 4486 if (spdk_bdev_is_md_interleaved(bdev)) { 4487 return bdev->blocklen - bdev->md_len; 4488 } else { 4489 return bdev->blocklen; 4490 } 4491 } 4492 4493 uint32_t 4494 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4495 { 4496 return bdev->phys_blocklen; 4497 } 4498 4499 static uint32_t 4500 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4501 { 4502 if (!spdk_bdev_is_md_interleaved(bdev)) { 4503 return bdev->blocklen + bdev->md_len; 4504 } else { 4505 return bdev->blocklen; 4506 } 4507 } 4508 4509 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4510 typedef enum spdk_dif_type spdk_dif_type_t; 4511 4512 spdk_dif_type_t 4513 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4514 { 4515 if (bdev->md_len != 0) { 4516 return bdev->dif_type; 4517 } else { 4518 return SPDK_DIF_DISABLE; 4519 } 4520 } 4521 4522 bool 4523 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4524 { 4525 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4526 return bdev->dif_is_head_of_md; 4527 } else { 4528 return false; 4529 } 4530 } 4531 4532 bool 4533 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4534 enum spdk_dif_check_type check_type) 4535 { 4536 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4537 return false; 4538 } 4539 4540 switch (check_type) { 4541 case SPDK_DIF_CHECK_TYPE_REFTAG: 4542 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4543 case SPDK_DIF_CHECK_TYPE_APPTAG: 4544 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4545 case SPDK_DIF_CHECK_TYPE_GUARD: 4546 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4547 default: 4548 return false; 4549 } 4550 } 4551 4552 uint32_t 4553 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4554 { 4555 uint64_t alighed_length; 4556 uint64_t max_copy_blocks; 4557 uint64_t temp_max_copy_blocks; 4558 struct spdk_iobuf_opts opts; 4559 4560 if (spdk_bdev_io_type_supported((struct spdk_bdev *)bdev, SPDK_BDEV_IO_TYPE_COPY)) { 4561 return bdev->max_copy; 4562 } else { 4563 spdk_iobuf_get_opts(&opts); 4564 alighed_length = opts.large_bufsize - spdk_bdev_get_buf_align(bdev); 4565 temp_max_copy_blocks = spdk_bdev_is_md_separate(bdev) ? 4566 alighed_length / (bdev->blocklen + bdev->md_len) : 4567 alighed_length / bdev->blocklen; 4568 max_copy_blocks = 1 << spdk_u64log2(temp_max_copy_blocks); 4569 return max_copy_blocks; 4570 } 4571 } 4572 4573 uint64_t 4574 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4575 { 4576 return bdev->internal.measured_queue_depth; 4577 } 4578 4579 uint64_t 4580 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 4581 { 4582 return bdev->internal.period; 4583 } 4584 4585 uint64_t 4586 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 4587 { 4588 return bdev->internal.weighted_io_time; 4589 } 4590 4591 uint64_t 4592 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 4593 { 4594 return bdev->internal.io_time; 4595 } 4596 4597 static void bdev_update_qd_sampling_period(void *ctx); 4598 4599 static void 4600 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 4601 { 4602 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 4603 4604 if (bdev->internal.measured_queue_depth) { 4605 bdev->internal.io_time += bdev->internal.period; 4606 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 4607 } 4608 4609 bdev->internal.qd_poll_in_progress = false; 4610 4611 bdev_update_qd_sampling_period(bdev); 4612 } 4613 4614 static void 4615 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4616 struct spdk_io_channel *io_ch, void *_ctx) 4617 { 4618 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 4619 4620 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4621 spdk_bdev_for_each_channel_continue(i, 0); 4622 } 4623 4624 static int 4625 bdev_calculate_measured_queue_depth(void *ctx) 4626 { 4627 struct spdk_bdev *bdev = ctx; 4628 4629 bdev->internal.qd_poll_in_progress = true; 4630 bdev->internal.temporary_queue_depth = 0; 4631 
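/* Each channel visit adds its io_outstanding to temporary_queue_depth; _calculate_measured_qd_cpl() then publishes the total as measured_queue_depth. */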
spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 4632 return SPDK_POLLER_BUSY; 4633 } 4634 4635 static void 4636 bdev_update_qd_sampling_period(void *ctx) 4637 { 4638 struct spdk_bdev *bdev = ctx; 4639 4640 if (bdev->internal.period == bdev->internal.new_period) { 4641 return; 4642 } 4643 4644 if (bdev->internal.qd_poll_in_progress) { 4645 return; 4646 } 4647 4648 bdev->internal.period = bdev->internal.new_period; 4649 4650 spdk_poller_unregister(&bdev->internal.qd_poller); 4651 if (bdev->internal.period != 0) { 4652 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4653 bdev, bdev->internal.period); 4654 } else { 4655 spdk_bdev_close(bdev->internal.qd_desc); 4656 bdev->internal.qd_desc = NULL; 4657 } 4658 } 4659 4660 static void 4661 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4662 { 4663 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4664 } 4665 4666 void 4667 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4668 { 4669 int rc; 4670 4671 if (bdev->internal.new_period == period) { 4672 return; 4673 } 4674 4675 bdev->internal.new_period = period; 4676 4677 if (bdev->internal.qd_desc != NULL) { 4678 assert(bdev->internal.period != 0); 4679 4680 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4681 bdev_update_qd_sampling_period, bdev); 4682 return; 4683 } 4684 4685 assert(bdev->internal.period == 0); 4686 4687 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4688 NULL, &bdev->internal.qd_desc); 4689 if (rc != 0) { 4690 return; 4691 } 4692 4693 bdev->internal.period = period; 4694 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4695 bdev, period); 4696 } 4697 4698 struct bdev_get_current_qd_ctx { 4699 uint64_t current_qd; 4700 spdk_bdev_get_current_qd_cb cb_fn; 4701 void *cb_arg; 4702 }; 4703 4704 static void 4705 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 4706 { 4707 struct bdev_get_current_qd_ctx *ctx = _ctx; 4708 4709 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 4710 4711 free(ctx); 4712 } 4713 4714 static void 4715 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4716 struct spdk_io_channel *io_ch, void *_ctx) 4717 { 4718 struct bdev_get_current_qd_ctx *ctx = _ctx; 4719 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4720 4721 ctx->current_qd += bdev_ch->io_outstanding; 4722 4723 spdk_bdev_for_each_channel_continue(i, 0); 4724 } 4725 4726 void 4727 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 4728 void *cb_arg) 4729 { 4730 struct bdev_get_current_qd_ctx *ctx; 4731 4732 assert(cb_fn != NULL); 4733 4734 ctx = calloc(1, sizeof(*ctx)); 4735 if (ctx == NULL) { 4736 cb_fn(bdev, 0, cb_arg, -ENOMEM); 4737 return; 4738 } 4739 4740 ctx->cb_fn = cb_fn; 4741 ctx->cb_arg = cb_arg; 4742 4743 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 4744 } 4745 4746 static void 4747 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 4748 { 4749 assert(desc->thread == spdk_get_thread()); 4750 4751 spdk_spin_lock(&desc->spinlock); 4752 desc->refs--; 4753 if (!desc->closed) { 4754 spdk_spin_unlock(&desc->spinlock); 4755 desc->callback.event_fn(type, 4756 desc->bdev, 4757 desc->callback.ctx); 4758 return; 4759 } else if (desc->refs == 0) { 4760 /* This descriptor was closed after this event_notify message was sent. 
4761 * spdk_bdev_close() could not free the descriptor since this message was 4762 * in flight, so we free it now using bdev_desc_free(). 4763 */ 4764 spdk_spin_unlock(&desc->spinlock); 4765 bdev_desc_free(desc); 4766 return; 4767 } 4768 spdk_spin_unlock(&desc->spinlock); 4769 } 4770 4771 static void 4772 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 4773 { 4774 spdk_spin_lock(&desc->spinlock); 4775 desc->refs++; 4776 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 4777 spdk_spin_unlock(&desc->spinlock); 4778 } 4779 4780 static void 4781 _resize_notify(void *ctx) 4782 { 4783 struct spdk_bdev_desc *desc = ctx; 4784 4785 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 4786 } 4787 4788 int 4789 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 4790 { 4791 struct spdk_bdev_desc *desc; 4792 int ret; 4793 4794 if (size == bdev->blockcnt) { 4795 return 0; 4796 } 4797 4798 spdk_spin_lock(&bdev->internal.spinlock); 4799 4800 /* bdev has open descriptors */ 4801 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 4802 bdev->blockcnt > size) { 4803 ret = -EBUSY; 4804 } else { 4805 bdev->blockcnt = size; 4806 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 4807 event_notify(desc, _resize_notify); 4808 } 4809 ret = 0; 4810 } 4811 4812 spdk_spin_unlock(&bdev->internal.spinlock); 4813 4814 return ret; 4815 } 4816 4817 /* 4818 * Convert I/O offset and length from bytes to blocks. 4819 * 4820 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 4821 */ 4822 static uint64_t 4823 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 4824 uint64_t num_bytes, uint64_t *num_blocks) 4825 { 4826 uint32_t block_size = bdev->blocklen; 4827 uint8_t shift_cnt; 4828 4829 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
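 * When the block size is a power of two, the division becomes a shift and the remainder check reduces to the bits shifted out.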
*/ 4830 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 4831 shift_cnt = spdk_u32log2(block_size); 4832 *offset_blocks = offset_bytes >> shift_cnt; 4833 *num_blocks = num_bytes >> shift_cnt; 4834 return (offset_bytes - (*offset_blocks << shift_cnt)) | 4835 (num_bytes - (*num_blocks << shift_cnt)); 4836 } else { 4837 *offset_blocks = offset_bytes / block_size; 4838 *num_blocks = num_bytes / block_size; 4839 return (offset_bytes % block_size) | (num_bytes % block_size); 4840 } 4841 } 4842 4843 static bool 4844 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 4845 { 4846 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 4847 * has been an overflow and hence the offset has been wrapped around */ 4848 if (offset_blocks + num_blocks < offset_blocks) { 4849 return false; 4850 } 4851 4852 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 4853 if (offset_blocks + num_blocks > bdev->blockcnt) { 4854 return false; 4855 } 4856 4857 return true; 4858 } 4859 4860 static void 4861 bdev_seek_complete_cb(void *ctx) 4862 { 4863 struct spdk_bdev_io *bdev_io = ctx; 4864 4865 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4866 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 4867 } 4868 4869 static int 4870 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4871 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 4872 spdk_bdev_io_completion_cb cb, void *cb_arg) 4873 { 4874 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4875 struct spdk_bdev_io *bdev_io; 4876 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4877 4878 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 4879 4880 /* Check if offset_blocks is valid looking at the validity of one block */ 4881 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 4882 return -EINVAL; 4883 } 4884 4885 bdev_io = bdev_channel_get_io(channel); 4886 if (!bdev_io) { 4887 return -ENOMEM; 4888 } 4889 4890 bdev_io->internal.ch = channel; 4891 bdev_io->internal.desc = desc; 4892 bdev_io->type = io_type; 4893 bdev_io->u.bdev.offset_blocks = offset_blocks; 4894 bdev_io->u.bdev.memory_domain = NULL; 4895 bdev_io->u.bdev.memory_domain_ctx = NULL; 4896 bdev_io->u.bdev.accel_sequence = NULL; 4897 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4898 4899 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 4900 /* In case bdev doesn't support seek to next data/hole offset, 4901 * it is assumed that only data and no holes are present */ 4902 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 4903 bdev_io->u.bdev.seek.offset = offset_blocks; 4904 } else { 4905 bdev_io->u.bdev.seek.offset = UINT64_MAX; 4906 } 4907 4908 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 4909 return 0; 4910 } 4911 4912 bdev_io_submit(bdev_io); 4913 return 0; 4914 } 4915 4916 int 4917 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4918 uint64_t offset_blocks, 4919 spdk_bdev_io_completion_cb cb, void *cb_arg) 4920 { 4921 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 4922 } 4923 4924 int 4925 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4926 uint64_t offset_blocks, 4927 spdk_bdev_io_completion_cb cb, void *cb_arg) 4928 { 4929 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 4930 } 4931 4932 uint64_t 4933 spdk_bdev_io_get_seek_offset(const struct 
spdk_bdev_io *bdev_io) 4934 { 4935 return bdev_io->u.bdev.seek.offset; 4936 } 4937 4938 static int 4939 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 4940 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4941 spdk_bdev_io_completion_cb cb, void *cb_arg) 4942 { 4943 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4944 struct spdk_bdev_io *bdev_io; 4945 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4946 4947 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4948 return -EINVAL; 4949 } 4950 4951 bdev_io = bdev_channel_get_io(channel); 4952 if (!bdev_io) { 4953 return -ENOMEM; 4954 } 4955 4956 bdev_io->internal.ch = channel; 4957 bdev_io->internal.desc = desc; 4958 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4959 bdev_io->u.bdev.iovs = &bdev_io->iov; 4960 bdev_io->u.bdev.iovs[0].iov_base = buf; 4961 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4962 bdev_io->u.bdev.iovcnt = 1; 4963 bdev_io->u.bdev.md_buf = md_buf; 4964 bdev_io->u.bdev.num_blocks = num_blocks; 4965 bdev_io->u.bdev.offset_blocks = offset_blocks; 4966 bdev_io->u.bdev.memory_domain = NULL; 4967 bdev_io->u.bdev.memory_domain_ctx = NULL; 4968 bdev_io->u.bdev.accel_sequence = NULL; 4969 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4970 4971 bdev_io_submit(bdev_io); 4972 return 0; 4973 } 4974 4975 int 4976 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4977 void *buf, uint64_t offset, uint64_t nbytes, 4978 spdk_bdev_io_completion_cb cb, void *cb_arg) 4979 { 4980 uint64_t offset_blocks, num_blocks; 4981 4982 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4983 nbytes, &num_blocks) != 0) { 4984 return -EINVAL; 4985 } 4986 4987 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4988 } 4989 4990 int 4991 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4992 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4993 spdk_bdev_io_completion_cb cb, void *cb_arg) 4994 { 4995 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 4996 } 4997 4998 int 4999 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5000 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5001 spdk_bdev_io_completion_cb cb, void *cb_arg) 5002 { 5003 struct iovec iov = { 5004 .iov_base = buf, 5005 }; 5006 5007 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5008 return -EINVAL; 5009 } 5010 5011 if (md_buf && !_is_buf_allocated(&iov)) { 5012 return -EINVAL; 5013 } 5014 5015 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5016 cb, cb_arg); 5017 } 5018 5019 int 5020 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5021 struct iovec *iov, int iovcnt, 5022 uint64_t offset, uint64_t nbytes, 5023 spdk_bdev_io_completion_cb cb, void *cb_arg) 5024 { 5025 uint64_t offset_blocks, num_blocks; 5026 5027 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5028 nbytes, &num_blocks) != 0) { 5029 return -EINVAL; 5030 } 5031 5032 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5033 } 5034 5035 static int 5036 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5037 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 5038 uint64_t num_blocks, struct spdk_memory_domain *domain, void 
*domain_ctx, 5039 struct spdk_accel_sequence *seq, 5040 spdk_bdev_io_completion_cb cb, void *cb_arg) 5041 { 5042 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5043 struct spdk_bdev_io *bdev_io; 5044 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5045 5046 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5047 return -EINVAL; 5048 } 5049 5050 bdev_io = bdev_channel_get_io(channel); 5051 if (!bdev_io) { 5052 return -ENOMEM; 5053 } 5054 5055 bdev_io->internal.ch = channel; 5056 bdev_io->internal.desc = desc; 5057 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5058 bdev_io->u.bdev.iovs = iov; 5059 bdev_io->u.bdev.iovcnt = iovcnt; 5060 bdev_io->u.bdev.md_buf = md_buf; 5061 bdev_io->u.bdev.num_blocks = num_blocks; 5062 bdev_io->u.bdev.offset_blocks = offset_blocks; 5063 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5064 bdev_io->internal.memory_domain = domain; 5065 bdev_io->internal.memory_domain_ctx = domain_ctx; 5066 bdev_io->internal.accel_sequence = seq; 5067 bdev_io->u.bdev.memory_domain = domain; 5068 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5069 bdev_io->u.bdev.accel_sequence = seq; 5070 5071 _bdev_io_submit_ext(desc, bdev_io); 5072 5073 return 0; 5074 } 5075 5076 int 5077 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5078 struct iovec *iov, int iovcnt, 5079 uint64_t offset_blocks, uint64_t num_blocks, 5080 spdk_bdev_io_completion_cb cb, void *cb_arg) 5081 { 5082 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5083 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5084 } 5085 5086 int 5087 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5088 struct iovec *iov, int iovcnt, void *md_buf, 5089 uint64_t offset_blocks, uint64_t num_blocks, 5090 spdk_bdev_io_completion_cb cb, void *cb_arg) 5091 { 5092 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5093 return -EINVAL; 5094 } 5095 5096 if (md_buf && !_is_buf_allocated(iov)) { 5097 return -EINVAL; 5098 } 5099 5100 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5101 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5102 } 5103 5104 static inline bool 5105 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 5106 { 5107 /* 5108 * We check if opts size is at least of size when we first introduced 5109 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 5110 * are not checked internal. 
5111 */ 5112 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5113 sizeof(opts->metadata) && 5114 opts->size <= sizeof(*opts) && 5115 /* When memory domain is used, the user must provide data buffers */ 5116 (!opts->memory_domain || (iov && iov[0].iov_base)); 5117 } 5118 5119 int 5120 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5121 struct iovec *iov, int iovcnt, 5122 uint64_t offset_blocks, uint64_t num_blocks, 5123 spdk_bdev_io_completion_cb cb, void *cb_arg, 5124 struct spdk_bdev_ext_io_opts *opts) 5125 { 5126 void *md = NULL; 5127 5128 if (opts) { 5129 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5130 return -EINVAL; 5131 } 5132 md = opts->metadata; 5133 } 5134 5135 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5136 return -EINVAL; 5137 } 5138 5139 if (md && !_is_buf_allocated(iov)) { 5140 return -EINVAL; 5141 } 5142 5143 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5144 num_blocks, 5145 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5146 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5147 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5148 cb, cb_arg); 5149 } 5150 5151 static int 5152 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5153 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5154 spdk_bdev_io_completion_cb cb, void *cb_arg) 5155 { 5156 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5157 struct spdk_bdev_io *bdev_io; 5158 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5159 5160 if (!desc->write) { 5161 return -EBADF; 5162 } 5163 5164 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5165 return -EINVAL; 5166 } 5167 5168 bdev_io = bdev_channel_get_io(channel); 5169 if (!bdev_io) { 5170 return -ENOMEM; 5171 } 5172 5173 bdev_io->internal.ch = channel; 5174 bdev_io->internal.desc = desc; 5175 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5176 bdev_io->u.bdev.iovs = &bdev_io->iov; 5177 bdev_io->u.bdev.iovs[0].iov_base = buf; 5178 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5179 bdev_io->u.bdev.iovcnt = 1; 5180 bdev_io->u.bdev.md_buf = md_buf; 5181 bdev_io->u.bdev.num_blocks = num_blocks; 5182 bdev_io->u.bdev.offset_blocks = offset_blocks; 5183 bdev_io->u.bdev.memory_domain = NULL; 5184 bdev_io->u.bdev.memory_domain_ctx = NULL; 5185 bdev_io->u.bdev.accel_sequence = NULL; 5186 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5187 5188 bdev_io_submit(bdev_io); 5189 return 0; 5190 } 5191 5192 int 5193 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5194 void *buf, uint64_t offset, uint64_t nbytes, 5195 spdk_bdev_io_completion_cb cb, void *cb_arg) 5196 { 5197 uint64_t offset_blocks, num_blocks; 5198 5199 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5200 nbytes, &num_blocks) != 0) { 5201 return -EINVAL; 5202 } 5203 5204 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5205 } 5206 5207 int 5208 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5209 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5210 spdk_bdev_io_completion_cb cb, void *cb_arg) 5211 { 5212 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5213 cb, cb_arg); 5214 } 5215 5216 int 5217 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5218 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t 
num_blocks, 5219 spdk_bdev_io_completion_cb cb, void *cb_arg) 5220 { 5221 struct iovec iov = { 5222 .iov_base = buf, 5223 }; 5224 5225 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5226 return -EINVAL; 5227 } 5228 5229 if (md_buf && !_is_buf_allocated(&iov)) { 5230 return -EINVAL; 5231 } 5232 5233 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5234 cb, cb_arg); 5235 } 5236 5237 static int 5238 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5239 struct iovec *iov, int iovcnt, void *md_buf, 5240 uint64_t offset_blocks, uint64_t num_blocks, 5241 struct spdk_memory_domain *domain, void *domain_ctx, 5242 struct spdk_accel_sequence *seq, 5243 spdk_bdev_io_completion_cb cb, void *cb_arg) 5244 { 5245 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5246 struct spdk_bdev_io *bdev_io; 5247 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5248 5249 if (!desc->write) { 5250 return -EBADF; 5251 } 5252 5253 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5254 return -EINVAL; 5255 } 5256 5257 bdev_io = bdev_channel_get_io(channel); 5258 if (!bdev_io) { 5259 return -ENOMEM; 5260 } 5261 5262 bdev_io->internal.ch = channel; 5263 bdev_io->internal.desc = desc; 5264 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5265 bdev_io->u.bdev.iovs = iov; 5266 bdev_io->u.bdev.iovcnt = iovcnt; 5267 bdev_io->u.bdev.md_buf = md_buf; 5268 bdev_io->u.bdev.num_blocks = num_blocks; 5269 bdev_io->u.bdev.offset_blocks = offset_blocks; 5270 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5271 bdev_io->internal.memory_domain = domain; 5272 bdev_io->internal.memory_domain_ctx = domain_ctx; 5273 bdev_io->internal.accel_sequence = seq; 5274 bdev_io->u.bdev.memory_domain = domain; 5275 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5276 bdev_io->u.bdev.accel_sequence = seq; 5277 5278 _bdev_io_submit_ext(desc, bdev_io); 5279 5280 return 0; 5281 } 5282 5283 int 5284 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5285 struct iovec *iov, int iovcnt, 5286 uint64_t offset, uint64_t len, 5287 spdk_bdev_io_completion_cb cb, void *cb_arg) 5288 { 5289 uint64_t offset_blocks, num_blocks; 5290 5291 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5292 len, &num_blocks) != 0) { 5293 return -EINVAL; 5294 } 5295 5296 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5297 } 5298 5299 int 5300 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5301 struct iovec *iov, int iovcnt, 5302 uint64_t offset_blocks, uint64_t num_blocks, 5303 spdk_bdev_io_completion_cb cb, void *cb_arg) 5304 { 5305 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5306 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5307 } 5308 5309 int 5310 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5311 struct iovec *iov, int iovcnt, void *md_buf, 5312 uint64_t offset_blocks, uint64_t num_blocks, 5313 spdk_bdev_io_completion_cb cb, void *cb_arg) 5314 { 5315 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5316 return -EINVAL; 5317 } 5318 5319 if (md_buf && !_is_buf_allocated(iov)) { 5320 return -EINVAL; 5321 } 5322 5323 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5324 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5325 } 5326 5327 int 5328 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel 
*ch, 5329 struct iovec *iov, int iovcnt, 5330 uint64_t offset_blocks, uint64_t num_blocks, 5331 spdk_bdev_io_completion_cb cb, void *cb_arg, 5332 struct spdk_bdev_ext_io_opts *opts) 5333 { 5334 void *md = NULL; 5335 5336 if (opts) { 5337 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5338 return -EINVAL; 5339 } 5340 md = opts->metadata; 5341 } 5342 5343 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5344 return -EINVAL; 5345 } 5346 5347 if (md && !_is_buf_allocated(iov)) { 5348 return -EINVAL; 5349 } 5350 5351 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5352 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5353 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5354 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5355 cb, cb_arg); 5356 } 5357 5358 static void 5359 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5360 { 5361 struct spdk_bdev_io *parent_io = cb_arg; 5362 struct spdk_bdev *bdev = parent_io->bdev; 5363 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5364 int i, rc = 0; 5365 5366 if (!success) { 5367 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5368 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5369 spdk_bdev_free_io(bdev_io); 5370 return; 5371 } 5372 5373 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5374 rc = memcmp(read_buf, 5375 parent_io->u.bdev.iovs[i].iov_base, 5376 parent_io->u.bdev.iovs[i].iov_len); 5377 if (rc) { 5378 break; 5379 } 5380 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5381 } 5382 5383 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 5384 rc = memcmp(bdev_io->u.bdev.md_buf, 5385 parent_io->u.bdev.md_buf, 5386 spdk_bdev_get_md_size(bdev)); 5387 } 5388 5389 spdk_bdev_free_io(bdev_io); 5390 5391 if (rc == 0) { 5392 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5393 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5394 } else { 5395 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5396 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5397 } 5398 } 5399 5400 static void 5401 bdev_compare_do_read(void *_bdev_io) 5402 { 5403 struct spdk_bdev_io *bdev_io = _bdev_io; 5404 int rc; 5405 5406 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5407 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5408 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5409 bdev_compare_do_read_done, bdev_io); 5410 5411 if (rc == -ENOMEM) { 5412 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5413 } else if (rc != 0) { 5414 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5415 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5416 } 5417 } 5418 5419 static int 5420 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5421 struct iovec *iov, int iovcnt, void *md_buf, 5422 uint64_t offset_blocks, uint64_t num_blocks, 5423 spdk_bdev_io_completion_cb cb, void *cb_arg) 5424 { 5425 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5426 struct spdk_bdev_io *bdev_io; 5427 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5428 5429 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5430 return -EINVAL; 5431 } 5432 5433 bdev_io = bdev_channel_get_io(channel); 5434 if (!bdev_io) { 5435 return -ENOMEM; 5436 } 5437 5438 bdev_io->internal.ch = channel; 5439 bdev_io->internal.desc = desc; 5440 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 
5441 bdev_io->u.bdev.iovs = iov; 5442 bdev_io->u.bdev.iovcnt = iovcnt; 5443 bdev_io->u.bdev.md_buf = md_buf; 5444 bdev_io->u.bdev.num_blocks = num_blocks; 5445 bdev_io->u.bdev.offset_blocks = offset_blocks; 5446 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5447 bdev_io->u.bdev.memory_domain = NULL; 5448 bdev_io->u.bdev.memory_domain_ctx = NULL; 5449 bdev_io->u.bdev.accel_sequence = NULL; 5450 5451 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5452 bdev_io_submit(bdev_io); 5453 return 0; 5454 } 5455 5456 bdev_compare_do_read(bdev_io); 5457 5458 return 0; 5459 } 5460 5461 int 5462 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5463 struct iovec *iov, int iovcnt, 5464 uint64_t offset_blocks, uint64_t num_blocks, 5465 spdk_bdev_io_completion_cb cb, void *cb_arg) 5466 { 5467 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5468 num_blocks, cb, cb_arg); 5469 } 5470 5471 int 5472 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5473 struct iovec *iov, int iovcnt, void *md_buf, 5474 uint64_t offset_blocks, uint64_t num_blocks, 5475 spdk_bdev_io_completion_cb cb, void *cb_arg) 5476 { 5477 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5478 return -EINVAL; 5479 } 5480 5481 if (md_buf && !_is_buf_allocated(iov)) { 5482 return -EINVAL; 5483 } 5484 5485 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5486 num_blocks, cb, cb_arg); 5487 } 5488 5489 static int 5490 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5491 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5492 spdk_bdev_io_completion_cb cb, void *cb_arg) 5493 { 5494 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5495 struct spdk_bdev_io *bdev_io; 5496 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5497 5498 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5499 return -EINVAL; 5500 } 5501 5502 bdev_io = bdev_channel_get_io(channel); 5503 if (!bdev_io) { 5504 return -ENOMEM; 5505 } 5506 5507 bdev_io->internal.ch = channel; 5508 bdev_io->internal.desc = desc; 5509 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5510 bdev_io->u.bdev.iovs = &bdev_io->iov; 5511 bdev_io->u.bdev.iovs[0].iov_base = buf; 5512 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5513 bdev_io->u.bdev.iovcnt = 1; 5514 bdev_io->u.bdev.md_buf = md_buf; 5515 bdev_io->u.bdev.num_blocks = num_blocks; 5516 bdev_io->u.bdev.offset_blocks = offset_blocks; 5517 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5518 bdev_io->u.bdev.memory_domain = NULL; 5519 bdev_io->u.bdev.memory_domain_ctx = NULL; 5520 bdev_io->u.bdev.accel_sequence = NULL; 5521 5522 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5523 bdev_io_submit(bdev_io); 5524 return 0; 5525 } 5526 5527 bdev_compare_do_read(bdev_io); 5528 5529 return 0; 5530 } 5531 5532 int 5533 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5534 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5535 spdk_bdev_io_completion_cb cb, void *cb_arg) 5536 { 5537 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5538 cb, cb_arg); 5539 } 5540 5541 int 5542 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5543 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5544 spdk_bdev_io_completion_cb cb, void *cb_arg) 5545 { 5546 struct iovec iov = { 
5547 .iov_base = buf, 5548 }; 5549 5550 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5551 return -EINVAL; 5552 } 5553 5554 if (md_buf && !_is_buf_allocated(&iov)) { 5555 return -EINVAL; 5556 } 5557 5558 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5559 cb, cb_arg); 5560 } 5561 5562 static void 5563 bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) 5564 { 5565 struct spdk_bdev_io *bdev_io = ctx; 5566 5567 if (unlock_status) { 5568 SPDK_ERRLOG("LBA range unlock failed\n"); 5569 } 5570 5571 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 5572 false, bdev_io->internal.caller_ctx); 5573 } 5574 5575 static void 5576 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 5577 { 5578 bdev_io->internal.status = status; 5579 5580 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 5581 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5582 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 5583 } 5584 5585 static void 5586 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5587 { 5588 struct spdk_bdev_io *parent_io = cb_arg; 5589 5590 if (!success) { 5591 SPDK_ERRLOG("Compare and write operation failed\n"); 5592 } 5593 5594 spdk_bdev_free_io(bdev_io); 5595 5596 bdev_comparev_and_writev_blocks_unlock(parent_io, 5597 success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 5598 } 5599 5600 static void 5601 bdev_compare_and_write_do_write(void *_bdev_io) 5602 { 5603 struct spdk_bdev_io *bdev_io = _bdev_io; 5604 int rc; 5605 5606 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 5607 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5608 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 5609 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5610 bdev_compare_and_write_do_write_done, bdev_io); 5611 5612 5613 if (rc == -ENOMEM) { 5614 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 5615 } else if (rc != 0) { 5616 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5617 } 5618 } 5619 5620 static void 5621 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5622 { 5623 struct spdk_bdev_io *parent_io = cb_arg; 5624 5625 spdk_bdev_free_io(bdev_io); 5626 5627 if (!success) { 5628 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 5629 return; 5630 } 5631 5632 bdev_compare_and_write_do_write(parent_io); 5633 } 5634 5635 static void 5636 bdev_compare_and_write_do_compare(void *_bdev_io) 5637 { 5638 struct spdk_bdev_io *bdev_io = _bdev_io; 5639 int rc; 5640 5641 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 5642 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 5643 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5644 bdev_compare_and_write_do_compare_done, bdev_io); 5645 5646 if (rc == -ENOMEM) { 5647 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 5648 } else if (rc != 0) { 5649 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 5650 } 5651 } 5652 5653 static void 5654 bdev_comparev_and_writev_blocks_locked(void *ctx, int status) 5655 { 5656 struct spdk_bdev_io *bdev_io = ctx; 5657 5658 if (status) { 5659 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 5660 
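/* The LBA range lock was never acquired, so there is nothing to unlock; fail the first (compare) command and complete the fused request directly. */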
bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5661 return; 5662 } 5663 5664 bdev_compare_and_write_do_compare(bdev_io); 5665 } 5666 5667 int 5668 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5669 struct iovec *compare_iov, int compare_iovcnt, 5670 struct iovec *write_iov, int write_iovcnt, 5671 uint64_t offset_blocks, uint64_t num_blocks, 5672 spdk_bdev_io_completion_cb cb, void *cb_arg) 5673 { 5674 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5675 struct spdk_bdev_io *bdev_io; 5676 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5677 5678 if (!desc->write) { 5679 return -EBADF; 5680 } 5681 5682 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5683 return -EINVAL; 5684 } 5685 5686 if (num_blocks > bdev->acwu) { 5687 return -EINVAL; 5688 } 5689 5690 bdev_io = bdev_channel_get_io(channel); 5691 if (!bdev_io) { 5692 return -ENOMEM; 5693 } 5694 5695 bdev_io->internal.ch = channel; 5696 bdev_io->internal.desc = desc; 5697 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 5698 bdev_io->u.bdev.iovs = compare_iov; 5699 bdev_io->u.bdev.iovcnt = compare_iovcnt; 5700 bdev_io->u.bdev.fused_iovs = write_iov; 5701 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 5702 bdev_io->u.bdev.md_buf = NULL; 5703 bdev_io->u.bdev.num_blocks = num_blocks; 5704 bdev_io->u.bdev.offset_blocks = offset_blocks; 5705 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5706 bdev_io->u.bdev.memory_domain = NULL; 5707 bdev_io->u.bdev.memory_domain_ctx = NULL; 5708 bdev_io->u.bdev.accel_sequence = NULL; 5709 5710 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 5711 bdev_io_submit(bdev_io); 5712 return 0; 5713 } 5714 5715 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 5716 bdev_comparev_and_writev_blocks_locked, bdev_io); 5717 } 5718 5719 int 5720 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5721 struct iovec *iov, int iovcnt, 5722 uint64_t offset_blocks, uint64_t num_blocks, 5723 bool populate, 5724 spdk_bdev_io_completion_cb cb, void *cb_arg) 5725 { 5726 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5727 struct spdk_bdev_io *bdev_io; 5728 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5729 5730 if (!desc->write) { 5731 return -EBADF; 5732 } 5733 5734 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5735 return -EINVAL; 5736 } 5737 5738 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 5739 return -ENOTSUP; 5740 } 5741 5742 bdev_io = bdev_channel_get_io(channel); 5743 if (!bdev_io) { 5744 return -ENOMEM; 5745 } 5746 5747 bdev_io->internal.ch = channel; 5748 bdev_io->internal.desc = desc; 5749 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 5750 bdev_io->u.bdev.num_blocks = num_blocks; 5751 bdev_io->u.bdev.offset_blocks = offset_blocks; 5752 bdev_io->u.bdev.iovs = iov; 5753 bdev_io->u.bdev.iovcnt = iovcnt; 5754 bdev_io->u.bdev.md_buf = NULL; 5755 bdev_io->u.bdev.zcopy.populate = populate ? 
1 : 0; 5756 bdev_io->u.bdev.zcopy.commit = 0; 5757 bdev_io->u.bdev.zcopy.start = 1; 5758 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5759 bdev_io->u.bdev.memory_domain = NULL; 5760 bdev_io->u.bdev.memory_domain_ctx = NULL; 5761 bdev_io->u.bdev.accel_sequence = NULL; 5762 5763 bdev_io_submit(bdev_io); 5764 5765 return 0; 5766 } 5767 5768 int 5769 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 5770 spdk_bdev_io_completion_cb cb, void *cb_arg) 5771 { 5772 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 5773 return -EINVAL; 5774 } 5775 5776 bdev_io->u.bdev.zcopy.commit = commit ? 1 : 0; 5777 bdev_io->u.bdev.zcopy.start = 0; 5778 bdev_io->internal.caller_ctx = cb_arg; 5779 bdev_io->internal.cb = cb; 5780 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 5781 5782 bdev_io_submit(bdev_io); 5783 5784 return 0; 5785 } 5786 5787 int 5788 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5789 uint64_t offset, uint64_t len, 5790 spdk_bdev_io_completion_cb cb, void *cb_arg) 5791 { 5792 uint64_t offset_blocks, num_blocks; 5793 5794 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5795 len, &num_blocks) != 0) { 5796 return -EINVAL; 5797 } 5798 5799 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5800 } 5801 5802 int 5803 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5804 uint64_t offset_blocks, uint64_t num_blocks, 5805 spdk_bdev_io_completion_cb cb, void *cb_arg) 5806 { 5807 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5808 struct spdk_bdev_io *bdev_io; 5809 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5810 5811 if (!desc->write) { 5812 return -EBADF; 5813 } 5814 5815 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5816 return -EINVAL; 5817 } 5818 5819 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 5820 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 5821 return -ENOTSUP; 5822 } 5823 5824 bdev_io = bdev_channel_get_io(channel); 5825 5826 if (!bdev_io) { 5827 return -ENOMEM; 5828 } 5829 5830 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 5831 bdev_io->internal.ch = channel; 5832 bdev_io->internal.desc = desc; 5833 bdev_io->u.bdev.offset_blocks = offset_blocks; 5834 bdev_io->u.bdev.num_blocks = num_blocks; 5835 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5836 bdev_io->u.bdev.memory_domain = NULL; 5837 bdev_io->u.bdev.memory_domain_ctx = NULL; 5838 bdev_io->u.bdev.accel_sequence = NULL; 5839 5840 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 5841 bdev_io_submit(bdev_io); 5842 return 0; 5843 } 5844 5845 assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); 5846 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 5847 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 5848 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 5849 bdev_write_zero_buffer_next(bdev_io); 5850 5851 return 0; 5852 } 5853 5854 int 5855 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5856 uint64_t offset, uint64_t nbytes, 5857 spdk_bdev_io_completion_cb cb, void *cb_arg) 5858 { 5859 uint64_t offset_blocks, num_blocks; 5860 5861 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5862 nbytes, &num_blocks) != 0) { 5863 return -EINVAL; 5864 } 5865 5866 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5867 } 5868 5869 int 5870 spdk_bdev_unmap_blocks(struct spdk_bdev_desc 
*desc, struct spdk_io_channel *ch, 5871 uint64_t offset_blocks, uint64_t num_blocks, 5872 spdk_bdev_io_completion_cb cb, void *cb_arg) 5873 { 5874 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5875 struct spdk_bdev_io *bdev_io; 5876 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5877 5878 if (!desc->write) { 5879 return -EBADF; 5880 } 5881 5882 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5883 return -EINVAL; 5884 } 5885 5886 if (num_blocks == 0) { 5887 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 5888 return -EINVAL; 5889 } 5890 5891 bdev_io = bdev_channel_get_io(channel); 5892 if (!bdev_io) { 5893 return -ENOMEM; 5894 } 5895 5896 bdev_io->internal.ch = channel; 5897 bdev_io->internal.desc = desc; 5898 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 5899 5900 bdev_io->u.bdev.iovs = &bdev_io->iov; 5901 bdev_io->u.bdev.iovs[0].iov_base = NULL; 5902 bdev_io->u.bdev.iovs[0].iov_len = 0; 5903 bdev_io->u.bdev.iovcnt = 1; 5904 5905 bdev_io->u.bdev.offset_blocks = offset_blocks; 5906 bdev_io->u.bdev.num_blocks = num_blocks; 5907 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5908 bdev_io->u.bdev.memory_domain = NULL; 5909 bdev_io->u.bdev.memory_domain_ctx = NULL; 5910 bdev_io->u.bdev.accel_sequence = NULL; 5911 5912 bdev_io_submit(bdev_io); 5913 return 0; 5914 } 5915 5916 int 5917 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5918 uint64_t offset, uint64_t length, 5919 spdk_bdev_io_completion_cb cb, void *cb_arg) 5920 { 5921 uint64_t offset_blocks, num_blocks; 5922 5923 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5924 length, &num_blocks) != 0) { 5925 return -EINVAL; 5926 } 5927 5928 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5929 } 5930 5931 int 5932 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5933 uint64_t offset_blocks, uint64_t num_blocks, 5934 spdk_bdev_io_completion_cb cb, void *cb_arg) 5935 { 5936 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5937 struct spdk_bdev_io *bdev_io; 5938 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5939 5940 if (!desc->write) { 5941 return -EBADF; 5942 } 5943 5944 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5945 return -EINVAL; 5946 } 5947 5948 bdev_io = bdev_channel_get_io(channel); 5949 if (!bdev_io) { 5950 return -ENOMEM; 5951 } 5952 5953 bdev_io->internal.ch = channel; 5954 bdev_io->internal.desc = desc; 5955 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 5956 bdev_io->u.bdev.iovs = NULL; 5957 bdev_io->u.bdev.iovcnt = 0; 5958 bdev_io->u.bdev.offset_blocks = offset_blocks; 5959 bdev_io->u.bdev.num_blocks = num_blocks; 5960 bdev_io->u.bdev.memory_domain = NULL; 5961 bdev_io->u.bdev.memory_domain_ctx = NULL; 5962 bdev_io->u.bdev.accel_sequence = NULL; 5963 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5964 5965 bdev_io_submit(bdev_io); 5966 return 0; 5967 } 5968 5969 static int bdev_reset_poll_for_outstanding_io(void *ctx); 5970 5971 static void 5972 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 5973 { 5974 struct spdk_bdev_channel *ch = _ctx; 5975 struct spdk_bdev_io *bdev_io; 5976 5977 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5978 5979 if (status == -EBUSY) { 5980 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 5981 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 5982 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 5983 } else { 5984 TAILQ_REMOVE(&ch->queued_resets, 
bdev_io, internal.link); 5985 5986 if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) { 5987 /* If outstanding IOs are still present and reset_io_drain_timeout 5988 * seconds passed, start the reset. */ 5989 bdev_io_submit_reset(bdev_io); 5990 } else { 5991 /* We still have in progress memory domain pull/push or we're 5992 * executing accel sequence. Since we cannot abort either of those 5993 * operations, fail the reset request. */ 5994 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5995 } 5996 } 5997 } else { 5998 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5999 SPDK_DEBUGLOG(bdev, 6000 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 6001 ch->bdev->name); 6002 /* Mark the completion status as a SUCCESS and complete the reset. */ 6003 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 6004 } 6005 } 6006 6007 static void 6008 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6009 struct spdk_io_channel *io_ch, void *_ctx) 6010 { 6011 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 6012 int status = 0; 6013 6014 if (cur_ch->io_outstanding > 0 || 6015 !TAILQ_EMPTY(&cur_ch->io_memory_domain) || 6016 !TAILQ_EMPTY(&cur_ch->io_accel_exec)) { 6017 /* If a channel has outstanding IO, set status to -EBUSY code. This will stop 6018 * further iteration over the rest of the channels and pass non-zero status 6019 * to the callback function. */ 6020 status = -EBUSY; 6021 } 6022 spdk_bdev_for_each_channel_continue(i, status); 6023 } 6024 6025 static int 6026 bdev_reset_poll_for_outstanding_io(void *ctx) 6027 { 6028 struct spdk_bdev_channel *ch = ctx; 6029 struct spdk_bdev_io *bdev_io; 6030 6031 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6032 6033 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6034 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6035 bdev_reset_check_outstanding_io_done); 6036 6037 return SPDK_POLLER_BUSY; 6038 } 6039 6040 static void 6041 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6042 { 6043 struct spdk_bdev_channel *ch = _ctx; 6044 struct spdk_bdev_io *bdev_io; 6045 6046 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6047 6048 if (bdev->reset_io_drain_timeout == 0) { 6049 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6050 6051 bdev_io_submit_reset(bdev_io); 6052 return; 6053 } 6054 6055 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6056 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6057 6058 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6059 * submit the reset to the underlying module only if outstanding I/O 6060 * remain after reset_io_drain_timeout seconds have passed.
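*
* Illustrative arithmetic (example numbers only): with a reset_io_drain_timeout
* of 5 seconds and spdk_get_ticks_hz() reporting 2,000,000,000 ticks per second,
* the reset wait poller keeps re-checking for outstanding I/O until
* stop_time_tsc, i.e. the tick count sampled above plus 5 * 2,000,000,000, has
* been reached.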
*/ 6061 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6062 bdev_reset_check_outstanding_io_done); 6063 } 6064 6065 static void 6066 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6067 struct spdk_io_channel *ch, void *_ctx) 6068 { 6069 struct spdk_bdev_channel *channel; 6070 struct spdk_bdev_mgmt_channel *mgmt_channel; 6071 struct spdk_bdev_shared_resource *shared_resource; 6072 bdev_io_tailq_t tmp_queued; 6073 6074 TAILQ_INIT(&tmp_queued); 6075 6076 channel = __io_ch_to_bdev_ch(ch); 6077 shared_resource = channel->shared_resource; 6078 mgmt_channel = shared_resource->mgmt_ch; 6079 6080 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6081 6082 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6083 /* The QoS object is always valid and readable while 6084 * the channel flag is set, so the lock here should not 6085 * be necessary. We're not in the fast path though, so 6086 * just take it anyway. */ 6087 spdk_spin_lock(&channel->bdev->internal.spinlock); 6088 if (channel->bdev->internal.qos->ch == channel) { 6089 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 6090 } 6091 spdk_spin_unlock(&channel->bdev->internal.spinlock); 6092 } 6093 6094 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6095 bdev_abort_all_buf_io(mgmt_channel, channel); 6097 bdev_abort_all_queued_io(&tmp_queued, channel); 6098 6099 spdk_bdev_for_each_channel_continue(i, 0); 6100 } 6101 6102 static void 6103 bdev_start_reset(void *ctx) 6104 { 6105 struct spdk_bdev_channel *ch = ctx; 6106 6107 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, 6108 bdev_reset_freeze_channel_done); 6109 } 6110 6111 static void 6112 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 6113 { 6114 struct spdk_bdev *bdev = ch->bdev; 6115 6116 assert(!TAILQ_EMPTY(&ch->queued_resets)); 6117 6118 spdk_spin_lock(&bdev->internal.spinlock); 6119 if (bdev->internal.reset_in_progress == NULL) { 6120 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 6121 /* 6122 * Take a channel reference for the target bdev for the life of this 6123 * reset. This guards against the channel getting destroyed while 6124 * spdk_bdev_for_each_channel() calls related to this reset IO are in 6125 * progress. We will release the reference when this reset is 6126 * completed.
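*
* The reference taken just below with spdk_get_io_channel() is released again
* in bdev_reset_complete() once the reset I/O finishes.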
6127 */ 6128 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6129 bdev_start_reset(ch); 6130 } 6131 spdk_spin_unlock(&bdev->internal.spinlock); 6132 } 6133 6134 int 6135 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6136 spdk_bdev_io_completion_cb cb, void *cb_arg) 6137 { 6138 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6139 struct spdk_bdev_io *bdev_io; 6140 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6141 6142 bdev_io = bdev_channel_get_io(channel); 6143 if (!bdev_io) { 6144 return -ENOMEM; 6145 } 6146 6147 bdev_io->internal.ch = channel; 6148 bdev_io->internal.desc = desc; 6149 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6150 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6151 bdev_io->u.reset.ch_ref = NULL; 6152 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6153 6154 spdk_spin_lock(&bdev->internal.spinlock); 6155 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 6156 spdk_spin_unlock(&bdev->internal.spinlock); 6157 6158 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 6159 internal.ch_link); 6160 6161 bdev_channel_start_reset(channel); 6162 6163 return 0; 6164 } 6165 6166 void 6167 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6168 struct spdk_bdev_io_stat *stat) 6169 { 6170 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6171 6172 bdev_get_io_stat(stat, channel->stat); 6173 } 6174 6175 static void 6176 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6177 { 6178 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6179 6180 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6181 bdev_iostat_ctx->cb_arg, 0); 6182 free(bdev_iostat_ctx); 6183 } 6184 6185 static void 6186 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6187 struct spdk_io_channel *ch, void *_ctx) 6188 { 6189 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6190 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6191 6192 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6193 spdk_bdev_for_each_channel_continue(i, 0); 6194 } 6195 6196 void 6197 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6198 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6199 { 6200 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6201 6202 assert(bdev != NULL); 6203 assert(stat != NULL); 6204 assert(cb != NULL); 6205 6206 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6207 if (bdev_iostat_ctx == NULL) { 6208 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6209 cb(bdev, stat, cb_arg, -ENOMEM); 6210 return; 6211 } 6212 6213 bdev_iostat_ctx->stat = stat; 6214 bdev_iostat_ctx->cb = cb; 6215 bdev_iostat_ctx->cb_arg = cb_arg; 6216 6217 /* Start with the statistics from previously deleted channels. */ 6218 spdk_spin_lock(&bdev->internal.spinlock); 6219 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6220 spdk_spin_unlock(&bdev->internal.spinlock); 6221 6222 /* Then iterate and add the statistics from each existing channel. 
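*
* The caller-provided 'stat' therefore ends up holding the sum of the retired
* counters from previously deleted channels plus the counters of every live
* channel. Illustrative caller sketch (the function and argument names are
* hypothetical, not part of this file):
*
*   static void stat_done(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
*                         void *cb_arg, int rc)
*   {
*           ... consume the aggregated stat, check rc ...
*   }
*
*   spdk_bdev_get_device_stat(bdev, stat, stat_done, NULL);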
*/ 6223 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6224 bdev_get_device_stat_done); 6225 } 6226 6227 struct bdev_iostat_reset_ctx { 6228 enum spdk_bdev_reset_stat_mode mode; 6229 bdev_reset_device_stat_cb cb; 6230 void *cb_arg; 6231 }; 6232 6233 static void 6234 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6235 { 6236 struct bdev_iostat_reset_ctx *ctx = _ctx; 6237 6238 ctx->cb(bdev, ctx->cb_arg, 0); 6239 6240 free(ctx); 6241 } 6242 6243 static void 6244 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6245 struct spdk_io_channel *ch, void *_ctx) 6246 { 6247 struct bdev_iostat_reset_ctx *ctx = _ctx; 6248 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6249 6250 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6251 6252 spdk_bdev_for_each_channel_continue(i, 0); 6253 } 6254 6255 void 6256 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6257 bdev_reset_device_stat_cb cb, void *cb_arg) 6258 { 6259 struct bdev_iostat_reset_ctx *ctx; 6260 6261 assert(bdev != NULL); 6262 assert(cb != NULL); 6263 6264 ctx = calloc(1, sizeof(*ctx)); 6265 if (ctx == NULL) { 6266 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6267 cb(bdev, cb_arg, -ENOMEM); 6268 return; 6269 } 6270 6271 ctx->mode = mode; 6272 ctx->cb = cb; 6273 ctx->cb_arg = cb_arg; 6274 6275 spdk_spin_lock(&bdev->internal.spinlock); 6276 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6277 spdk_spin_unlock(&bdev->internal.spinlock); 6278 6279 spdk_bdev_for_each_channel(bdev, 6280 bdev_reset_each_channel_stat, 6281 ctx, 6282 bdev_reset_device_stat_done); 6283 } 6284 6285 int 6286 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6287 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6288 spdk_bdev_io_completion_cb cb, void *cb_arg) 6289 { 6290 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6291 struct spdk_bdev_io *bdev_io; 6292 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6293 6294 if (!desc->write) { 6295 return -EBADF; 6296 } 6297 6298 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6299 return -ENOTSUP; 6300 } 6301 6302 bdev_io = bdev_channel_get_io(channel); 6303 if (!bdev_io) { 6304 return -ENOMEM; 6305 } 6306 6307 bdev_io->internal.ch = channel; 6308 bdev_io->internal.desc = desc; 6309 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6310 bdev_io->u.nvme_passthru.cmd = *cmd; 6311 bdev_io->u.nvme_passthru.buf = buf; 6312 bdev_io->u.nvme_passthru.nbytes = nbytes; 6313 bdev_io->u.nvme_passthru.md_buf = NULL; 6314 bdev_io->u.nvme_passthru.md_len = 0; 6315 6316 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6317 6318 bdev_io_submit(bdev_io); 6319 return 0; 6320 } 6321 6322 int 6323 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6324 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6325 spdk_bdev_io_completion_cb cb, void *cb_arg) 6326 { 6327 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6328 struct spdk_bdev_io *bdev_io; 6329 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6330 6331 if (!desc->write) { 6332 /* 6333 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6334 * to easily determine if the command is a read or write, but for now just 6335 * do not allow io_passthru with a read-only descriptor. 
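*
* Note that this also means a passthru command that only reads data still
* requires the descriptor to have been opened with write access.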
6336 */ 6337 return -EBADF; 6338 } 6339 6340 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6341 return -ENOTSUP; 6342 } 6343 6344 bdev_io = bdev_channel_get_io(channel); 6345 if (!bdev_io) { 6346 return -ENOMEM; 6347 } 6348 6349 bdev_io->internal.ch = channel; 6350 bdev_io->internal.desc = desc; 6351 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6352 bdev_io->u.nvme_passthru.cmd = *cmd; 6353 bdev_io->u.nvme_passthru.buf = buf; 6354 bdev_io->u.nvme_passthru.nbytes = nbytes; 6355 bdev_io->u.nvme_passthru.md_buf = NULL; 6356 bdev_io->u.nvme_passthru.md_len = 0; 6357 6358 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6359 6360 bdev_io_submit(bdev_io); 6361 return 0; 6362 } 6363 6364 int 6365 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6366 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6367 spdk_bdev_io_completion_cb cb, void *cb_arg) 6368 { 6369 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6370 struct spdk_bdev_io *bdev_io; 6371 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6372 6373 if (!desc->write) { 6374 /* 6375 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6376 * to easily determine if the command is a read or write, but for now just 6377 * do not allow io_passthru with a read-only descriptor. 6378 */ 6379 return -EBADF; 6380 } 6381 6382 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6383 return -ENOTSUP; 6384 } 6385 6386 bdev_io = bdev_channel_get_io(channel); 6387 if (!bdev_io) { 6388 return -ENOMEM; 6389 } 6390 6391 bdev_io->internal.ch = channel; 6392 bdev_io->internal.desc = desc; 6393 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6394 bdev_io->u.nvme_passthru.cmd = *cmd; 6395 bdev_io->u.nvme_passthru.buf = buf; 6396 bdev_io->u.nvme_passthru.nbytes = nbytes; 6397 bdev_io->u.nvme_passthru.md_buf = md_buf; 6398 bdev_io->u.nvme_passthru.md_len = md_len; 6399 6400 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6401 6402 bdev_io_submit(bdev_io); 6403 return 0; 6404 } 6405 6406 static void bdev_abort_retry(void *ctx); 6407 static void bdev_abort(struct spdk_bdev_io *parent_io); 6408 6409 static void 6410 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6411 { 6412 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6413 struct spdk_bdev_io *parent_io = cb_arg; 6414 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6415 6416 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6417 6418 spdk_bdev_free_io(bdev_io); 6419 6420 if (!success) { 6421 /* Check if the target I/O completed in the meantime. */ 6422 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6423 if (tmp_io == bio_to_abort) { 6424 break; 6425 } 6426 } 6427 6428 /* If the target I/O still exists, set the parent to failed. 
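* If it is no longer on the list, it completed on its own in the meantime, so
* the failed abort is harmless and the parent is not marked failed.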
*/ 6429 if (tmp_io != NULL) { 6430 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6431 } 6432 } 6433 6434 parent_io->u.bdev.split_outstanding--; 6435 if (parent_io->u.bdev.split_outstanding == 0) { 6436 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6437 bdev_abort_retry(parent_io); 6438 } else { 6439 bdev_io_complete(parent_io); 6440 } 6441 } 6442 } 6443 6444 static int 6445 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6446 struct spdk_bdev_io *bio_to_abort, 6447 spdk_bdev_io_completion_cb cb, void *cb_arg) 6448 { 6449 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6450 struct spdk_bdev_io *bdev_io; 6451 6452 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 6453 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 6454 /* TODO: Abort reset or abort request. */ 6455 return -ENOTSUP; 6456 } 6457 6458 bdev_io = bdev_channel_get_io(channel); 6459 if (bdev_io == NULL) { 6460 return -ENOMEM; 6461 } 6462 6463 bdev_io->internal.ch = channel; 6464 bdev_io->internal.desc = desc; 6465 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6466 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6467 6468 if (bdev->split_on_optimal_io_boundary && bio_to_abort->internal.split) { 6469 assert(bdev_io_should_split(bio_to_abort)); 6470 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 6471 6472 /* Parent abort request is not submitted directly, but to manage its 6473 * execution add it to the submitted list here. 6474 */ 6475 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6476 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6477 6478 bdev_abort(bdev_io); 6479 6480 return 0; 6481 } 6482 6483 bdev_io->u.abort.bio_to_abort = bio_to_abort; 6484 6485 /* Submit the abort request to the underlying bdev module. */ 6486 bdev_io_submit(bdev_io); 6487 6488 return 0; 6489 } 6490 6491 static bool 6492 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq) 6493 { 6494 struct spdk_bdev_io *iter; 6495 6496 TAILQ_FOREACH(iter, tailq, internal.link) { 6497 if (iter == bdev_io) { 6498 return true; 6499 } 6500 } 6501 6502 return false; 6503 } 6504 6505 static uint32_t 6506 _bdev_abort(struct spdk_bdev_io *parent_io) 6507 { 6508 struct spdk_bdev_desc *desc = parent_io->internal.desc; 6509 struct spdk_bdev_channel *channel = parent_io->internal.ch; 6510 void *bio_cb_arg; 6511 struct spdk_bdev_io *bio_to_abort; 6512 uint32_t matched_ios; 6513 int rc; 6514 6515 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 6516 6517 /* matched_ios is returned and will be kept by the caller. 6518 * 6519 * This function will be used for two cases, 1) the same cb_arg is used for 6520 * multiple I/Os, 2) a single large I/O is split into smaller ones. 6521 * Incrementing split_outstanding directly here may confuse readers especially 6522 * for the 1st case. 6523 * 6524 * Completion of I/O abort is processed after stack unwinding. Hence this trick 6525 * works as expected. 6526 */ 6527 matched_ios = 0; 6528 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6529 6530 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 6531 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 6532 continue; 6533 } 6534 6535 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 6536 /* Any I/O which was submitted after this abort command should be excluded. 
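* The submit_tsc comparison above is what defines "after": only I/O whose
* submission timestamp is not newer than the abort's own submit_tsc is
* considered a match.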
*/ 6537 continue; 6538 } 6539 6540 /* We can't abort a request that's being pushed/pulled or executed by accel */ 6541 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) || 6542 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) { 6543 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6544 break; 6545 } 6546 6547 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 6548 if (rc != 0) { 6549 if (rc == -ENOMEM) { 6550 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 6551 } else { 6552 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6553 } 6554 break; 6555 } 6556 matched_ios++; 6557 } 6558 6559 return matched_ios; 6560 } 6561 6562 static void 6563 bdev_abort_retry(void *ctx) 6564 { 6565 struct spdk_bdev_io *parent_io = ctx; 6566 uint32_t matched_ios; 6567 6568 matched_ios = _bdev_abort(parent_io); 6569 6570 if (matched_ios == 0) { 6571 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6572 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6573 } else { 6574 /* For retry, the case that no target I/O was found is success 6575 * because it means target I/Os completed in the meantime. 6576 */ 6577 bdev_io_complete(parent_io); 6578 } 6579 return; 6580 } 6581 6582 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6583 parent_io->u.bdev.split_outstanding = matched_ios; 6584 } 6585 6586 static void 6587 bdev_abort(struct spdk_bdev_io *parent_io) 6588 { 6589 uint32_t matched_ios; 6590 6591 matched_ios = _bdev_abort(parent_io); 6592 6593 if (matched_ios == 0) { 6594 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6595 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6596 } else { 6597 /* The case where no target I/O was found is a failure. */ 6598 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6599 bdev_io_complete(parent_io); 6600 } 6601 return; 6602 } 6603 6604 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6605 parent_io->u.bdev.split_outstanding = matched_ios; 6606 } 6607 6608 int 6609 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6610 void *bio_cb_arg, 6611 spdk_bdev_io_completion_cb cb, void *cb_arg) 6612 { 6613 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6614 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6615 struct spdk_bdev_io *bdev_io; 6616 6617 if (bio_cb_arg == NULL) { 6618 return -EINVAL; 6619 } 6620 6621 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 6622 return -ENOTSUP; 6623 } 6624 6625 bdev_io = bdev_channel_get_io(channel); 6626 if (bdev_io == NULL) { 6627 return -ENOMEM; 6628 } 6629 6630 bdev_io->internal.ch = channel; 6631 bdev_io->internal.desc = desc; 6632 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6633 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6634 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6635 6636 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 6637 6638 /* Parent abort request is not submitted directly, but to manage its execution, 6639 * add it to the submitted list here.
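* The parent is completed from bdev_abort_io_done() once every matched child
* abort (tracked via split_outstanding) has finished.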
6640 */ 6641 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6642 6643 bdev_abort(bdev_io); 6644 6645 return 0; 6646 } 6647 6648 int 6649 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6650 struct spdk_bdev_io_wait_entry *entry) 6651 { 6652 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6653 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 6654 6655 if (bdev != entry->bdev) { 6656 SPDK_ERRLOG("bdevs do not match\n"); 6657 return -EINVAL; 6658 } 6659 6660 if (mgmt_ch->per_thread_cache_count > 0) { 6661 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 6662 return -EINVAL; 6663 } 6664 6665 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 6666 return 0; 6667 } 6668 6669 static inline void 6670 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 6671 { 6672 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 6673 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 6674 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 6675 uint32_t blocklen = bdev_io->bdev->blocklen; 6676 6677 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 6678 switch (bdev_io->type) { 6679 case SPDK_BDEV_IO_TYPE_READ: 6680 io_stat->bytes_read += num_blocks * blocklen; 6681 io_stat->num_read_ops++; 6682 io_stat->read_latency_ticks += tsc_diff; 6683 if (io_stat->max_read_latency_ticks < tsc_diff) { 6684 io_stat->max_read_latency_ticks = tsc_diff; 6685 } 6686 if (io_stat->min_read_latency_ticks > tsc_diff) { 6687 io_stat->min_read_latency_ticks = tsc_diff; 6688 } 6689 break; 6690 case SPDK_BDEV_IO_TYPE_WRITE: 6691 io_stat->bytes_written += num_blocks * blocklen; 6692 io_stat->num_write_ops++; 6693 io_stat->write_latency_ticks += tsc_diff; 6694 if (io_stat->max_write_latency_ticks < tsc_diff) { 6695 io_stat->max_write_latency_ticks = tsc_diff; 6696 } 6697 if (io_stat->min_write_latency_ticks > tsc_diff) { 6698 io_stat->min_write_latency_ticks = tsc_diff; 6699 } 6700 break; 6701 case SPDK_BDEV_IO_TYPE_UNMAP: 6702 io_stat->bytes_unmapped += num_blocks * blocklen; 6703 io_stat->num_unmap_ops++; 6704 io_stat->unmap_latency_ticks += tsc_diff; 6705 if (io_stat->max_unmap_latency_ticks < tsc_diff) { 6706 io_stat->max_unmap_latency_ticks = tsc_diff; 6707 } 6708 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 6709 io_stat->min_unmap_latency_ticks = tsc_diff; 6710 } 6711 break; 6712 case SPDK_BDEV_IO_TYPE_ZCOPY: 6713 /* Track the data in the start phase only */ 6714 if (bdev_io->u.bdev.zcopy.start) { 6715 if (bdev_io->u.bdev.zcopy.populate) { 6716 io_stat->bytes_read += num_blocks * blocklen; 6717 io_stat->num_read_ops++; 6718 io_stat->read_latency_ticks += tsc_diff; 6719 if (io_stat->max_read_latency_ticks < tsc_diff) { 6720 io_stat->max_read_latency_ticks = tsc_diff; 6721 } 6722 if (io_stat->min_read_latency_ticks > tsc_diff) { 6723 io_stat->min_read_latency_ticks = tsc_diff; 6724 } 6725 } else { 6726 io_stat->bytes_written += num_blocks * blocklen; 6727 io_stat->num_write_ops++; 6728 io_stat->write_latency_ticks += tsc_diff; 6729 if (io_stat->max_write_latency_ticks < tsc_diff) { 6730 io_stat->max_write_latency_ticks = tsc_diff; 6731 } 6732 if (io_stat->min_write_latency_ticks > tsc_diff) { 6733 io_stat->min_write_latency_ticks = tsc_diff; 6734 } 6735 } 6736 } 6737 break; 6738 case SPDK_BDEV_IO_TYPE_COPY: 6739 io_stat->bytes_copied += num_blocks * blocklen; 6740 io_stat->num_copy_ops++; 6741 bdev_io->internal.ch->stat->copy_latency_ticks += 
tsc_diff; 6742 if (io_stat->max_copy_latency_ticks < tsc_diff) { 6743 io_stat->max_copy_latency_ticks = tsc_diff; 6744 } 6745 if (io_stat->min_copy_latency_ticks > tsc_diff) { 6746 io_stat->min_copy_latency_ticks = tsc_diff; 6747 } 6748 break; 6749 default: 6750 break; 6751 } 6752 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 6753 io_stat = bdev_io->bdev->internal.stat; 6754 assert(io_stat->io_error != NULL); 6755 6756 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 6757 io_stat->io_error->error_status[-io_status - 1]++; 6758 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 6759 } 6760 6761 #ifdef SPDK_CONFIG_VTUNE 6762 uint64_t now_tsc = spdk_get_ticks(); 6763 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 6764 uint64_t data[5]; 6765 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 6766 6767 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 6768 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 6769 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 6770 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 6771 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 6772 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 6773 6774 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 6775 __itt_metadata_u64, 5, data); 6776 6777 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 6778 bdev_io->internal.ch->start_tsc = now_tsc; 6779 } 6780 #endif 6781 } 6782 6783 static inline void 6784 _bdev_io_complete(void *ctx) 6785 { 6786 struct spdk_bdev_io *bdev_io = ctx; 6787 6788 if (spdk_unlikely(bdev_io->internal.accel_sequence != NULL)) { 6789 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 6790 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 6791 } 6792 6793 assert(bdev_io->internal.cb != NULL); 6794 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 6795 6796 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 6797 bdev_io->internal.caller_ctx); 6798 } 6799 6800 static inline void 6801 bdev_io_complete(void *ctx) 6802 { 6803 struct spdk_bdev_io *bdev_io = ctx; 6804 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6805 uint64_t tsc, tsc_diff; 6806 6807 if (spdk_unlikely(bdev_io->internal.in_submit_request)) { 6808 /* 6809 * Defer completion to avoid potential infinite recursion if the 6810 * user's completion callback issues a new I/O. 6811 */ 6812 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6813 bdev_io_complete, bdev_io); 6814 return; 6815 } 6816 6817 tsc = spdk_get_ticks(); 6818 tsc_diff = tsc - bdev_io->internal.submit_tsc; 6819 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 6820 bdev_io->internal.caller_ctx); 6821 6822 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 6823 6824 if (bdev_io->internal.ch->histogram) { 6825 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 6826 } 6827 6828 bdev_io_update_io_stat(bdev_io, tsc_diff); 6829 _bdev_io_complete(bdev_io); 6830 } 6831 6832 /* The difference between this function and bdev_io_complete() is that this should be called to 6833 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 6834 * io_submitted list and don't have submit_tsc updated. 
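* Consequently it skips the io_submitted removal, trace record and per-channel
* statistics update performed by bdev_io_complete() and only invokes the user
* callback via _bdev_io_complete() on the I/O's thread.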
6835 */ 6836 static inline void 6837 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 6838 { 6839 /* Since the IO hasn't been submitted it's bound to be failed */ 6840 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 6841 6842 /* At this point we don't know if the IO is completed from submission context or not, but, 6843 * since this is an error path, we can always do an spdk_thread_send_msg(). */ 6844 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6845 _bdev_io_complete, bdev_io); 6846 } 6847 6848 static void bdev_destroy_cb(void *io_device); 6849 6850 static void 6851 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 6852 { 6853 struct spdk_bdev_io *bdev_io = _ctx; 6854 6855 if (bdev_io->u.reset.ch_ref != NULL) { 6856 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 6857 bdev_io->u.reset.ch_ref = NULL; 6858 } 6859 6860 bdev_io_complete(bdev_io); 6861 6862 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 6863 TAILQ_EMPTY(&bdev->internal.open_descs)) { 6864 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6865 } 6866 } 6867 6868 static void 6869 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6870 struct spdk_io_channel *_ch, void *_ctx) 6871 { 6872 struct spdk_bdev_io *bdev_io = _ctx; 6873 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 6874 struct spdk_bdev_io *queued_reset; 6875 6876 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 6877 while (!TAILQ_EMPTY(&ch->queued_resets)) { 6878 queued_reset = TAILQ_FIRST(&ch->queued_resets); 6879 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 6880 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 6881 } 6882 6883 spdk_bdev_for_each_channel_continue(i, 0); 6884 } 6885 6886 static void 6887 bdev_io_complete_sequence_cb(void *ctx, int status) 6888 { 6889 struct spdk_bdev_io *bdev_io = ctx; 6890 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6891 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 6892 6893 /* u.bdev.accel_sequence should have already been cleared at this point */ 6894 assert(bdev_io->u.bdev.accel_sequence == NULL); 6895 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 6896 6897 TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 6898 bdev_io->internal.accel_sequence = NULL; 6899 6900 if (spdk_unlikely(status != 0)) { 6901 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 6902 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6903 } 6904 6905 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 6906 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 6907 return; 6908 } 6909 6910 bdev_io_complete(bdev_io); 6911 } 6912 6913 void 6914 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 6915 { 6916 struct spdk_bdev *bdev = bdev_io->bdev; 6917 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6918 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 6919 6920 bdev_io->internal.status = status; 6921 6922 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 6923 bool unlock_channels = false; 6924 6925 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 6926 SPDK_ERRLOG("NOMEM returned for reset\n"); 6927 } 6928 spdk_spin_lock(&bdev->internal.spinlock); 6929 if (bdev_io == bdev->internal.reset_in_progress) { 6930 bdev->internal.reset_in_progress = NULL; 6931 unlock_channels = true; 6932 } 6933 
spdk_spin_unlock(&bdev->internal.spinlock); 6934 6935 if (unlock_channels) { 6936 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 6937 bdev_reset_complete); 6938 return; 6939 } 6940 } else { 6941 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io) && 6942 spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 6943 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 6944 return; 6945 } else if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0)) { 6946 _bdev_io_push_bounce_data_buffer(bdev_io, _bdev_io_complete_push_bounce_done); 6947 /* bdev IO will be completed in the callback */ 6948 return; 6949 } 6950 6951 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 6952 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 6953 return; 6954 } 6955 } 6956 6957 bdev_io_complete(bdev_io); 6958 } 6959 6960 void 6961 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 6962 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 6963 { 6964 if (sc == SPDK_SCSI_STATUS_GOOD) { 6965 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6966 } else { 6967 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 6968 bdev_io->internal.error.scsi.sc = sc; 6969 bdev_io->internal.error.scsi.sk = sk; 6970 bdev_io->internal.error.scsi.asc = asc; 6971 bdev_io->internal.error.scsi.ascq = ascq; 6972 } 6973 6974 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6975 } 6976 6977 void 6978 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 6979 int *sc, int *sk, int *asc, int *ascq) 6980 { 6981 assert(sc != NULL); 6982 assert(sk != NULL); 6983 assert(asc != NULL); 6984 assert(ascq != NULL); 6985 6986 switch (bdev_io->internal.status) { 6987 case SPDK_BDEV_IO_STATUS_SUCCESS: 6988 *sc = SPDK_SCSI_STATUS_GOOD; 6989 *sk = SPDK_SCSI_SENSE_NO_SENSE; 6990 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 6991 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 6992 break; 6993 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 6994 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 6995 break; 6996 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 6997 *sc = bdev_io->internal.error.scsi.sc; 6998 *sk = bdev_io->internal.error.scsi.sk; 6999 *asc = bdev_io->internal.error.scsi.asc; 7000 *ascq = bdev_io->internal.error.scsi.ascq; 7001 break; 7002 default: 7003 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7004 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7005 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7006 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7007 break; 7008 } 7009 } 7010 7011 void 7012 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7013 { 7014 if (aio_result == 0) { 7015 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 7016 } else { 7017 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7018 } 7019 7020 bdev_io->internal.error.aio_result = aio_result; 7021 7022 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 7023 } 7024 7025 void 7026 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7027 { 7028 assert(aio_result != NULL); 7029 7030 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7031 *aio_result = bdev_io->internal.error.aio_result; 7032 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7033 *aio_result = 0; 7034 } else { 7035 *aio_result = -EIO; 7036 } 7037 } 7038 7039 void 7040 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7041 { 7042 if (sct == SPDK_NVME_SCT_GENERIC && sc == 
SPDK_NVME_SC_SUCCESS) { 7043 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 7044 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7045 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 7046 } else { 7047 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7048 } 7049 7050 bdev_io->internal.error.nvme.cdw0 = cdw0; 7051 bdev_io->internal.error.nvme.sct = sct; 7052 bdev_io->internal.error.nvme.sc = sc; 7053 7054 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 7055 } 7056 7057 void 7058 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7059 { 7060 assert(sct != NULL); 7061 assert(sc != NULL); 7062 assert(cdw0 != NULL); 7063 7064 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7065 *sct = SPDK_NVME_SCT_GENERIC; 7066 *sc = SPDK_NVME_SC_SUCCESS; 7067 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7068 *cdw0 = 0; 7069 } else { 7070 *cdw0 = 1U; 7071 } 7072 return; 7073 } 7074 7075 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7076 *sct = bdev_io->internal.error.nvme.sct; 7077 *sc = bdev_io->internal.error.nvme.sc; 7078 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7079 *sct = SPDK_NVME_SCT_GENERIC; 7080 *sc = SPDK_NVME_SC_SUCCESS; 7081 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7082 *sct = SPDK_NVME_SCT_GENERIC; 7083 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7084 } else { 7085 *sct = SPDK_NVME_SCT_GENERIC; 7086 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7087 } 7088 7089 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7090 } 7091 7092 void 7093 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7094 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7095 { 7096 assert(first_sct != NULL); 7097 assert(first_sc != NULL); 7098 assert(second_sct != NULL); 7099 assert(second_sc != NULL); 7100 assert(cdw0 != NULL); 7101 7102 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7103 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7104 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7105 *first_sct = bdev_io->internal.error.nvme.sct; 7106 *first_sc = bdev_io->internal.error.nvme.sc; 7107 *second_sct = SPDK_NVME_SCT_GENERIC; 7108 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7109 } else { 7110 *first_sct = SPDK_NVME_SCT_GENERIC; 7111 *first_sc = SPDK_NVME_SC_SUCCESS; 7112 *second_sct = bdev_io->internal.error.nvme.sct; 7113 *second_sc = bdev_io->internal.error.nvme.sc; 7114 } 7115 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7116 *first_sct = SPDK_NVME_SCT_GENERIC; 7117 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7118 *second_sct = SPDK_NVME_SCT_GENERIC; 7119 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7120 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7121 *first_sct = SPDK_NVME_SCT_GENERIC; 7122 *first_sc = SPDK_NVME_SC_SUCCESS; 7123 *second_sct = SPDK_NVME_SCT_GENERIC; 7124 *second_sc = SPDK_NVME_SC_SUCCESS; 7125 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7126 *first_sct = SPDK_NVME_SCT_GENERIC; 7127 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7128 *second_sct = SPDK_NVME_SCT_GENERIC; 7129 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7130 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7131 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7132 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 
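/* The second (write) command of the fused pair is reported as aborted because the compare failed. */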
7133 *second_sct = SPDK_NVME_SCT_GENERIC; 7134 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7135 } else { 7136 *first_sct = SPDK_NVME_SCT_GENERIC; 7137 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7138 *second_sct = SPDK_NVME_SCT_GENERIC; 7139 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7140 } 7141 7142 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7143 } 7144 7145 struct spdk_thread * 7146 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7147 { 7148 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7149 } 7150 7151 struct spdk_io_channel * 7152 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7153 { 7154 return bdev_io->internal.ch->channel; 7155 } 7156 7157 static int 7158 bdev_register(struct spdk_bdev *bdev) 7159 { 7160 char *bdev_name; 7161 char uuid[SPDK_UUID_STRING_LEN]; 7162 int ret, i; 7163 7164 assert(bdev->module != NULL); 7165 7166 if (!bdev->name) { 7167 SPDK_ERRLOG("Bdev name is NULL\n"); 7168 return -EINVAL; 7169 } 7170 7171 if (!strlen(bdev->name)) { 7172 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7173 return -EINVAL; 7174 } 7175 7176 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7177 if (bdev->fn_table->accel_sequence_supported == NULL) { 7178 continue; 7179 } 7180 if (!bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7181 (enum spdk_bdev_io_type)i)) { 7182 continue; 7183 } 7184 7185 if (spdk_bdev_get_memory_domains(bdev, NULL, 0) <= 0) { 7186 SPDK_ERRLOG("bdev supporting accel sequence is required to support " 7187 "memory domains\n"); 7188 return -EINVAL; 7189 } 7190 7191 if (spdk_bdev_is_md_separate(bdev)) { 7192 SPDK_ERRLOG("Separate metadata is currently unsupported for bdevs with " 7193 "accel sequence support\n"); 7194 return -EINVAL; 7195 } 7196 } 7197 7198 /* Users often register their own I/O devices using the bdev name. In 7199 * order to avoid conflicts, prepend bdev_. */ 7200 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7201 if (!bdev_name) { 7202 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7203 return -ENOMEM; 7204 } 7205 7206 bdev->internal.stat = bdev_alloc_io_stat(true); 7207 if (!bdev->internal.stat) { 7208 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7209 free(bdev_name); 7210 return -ENOMEM; 7211 } 7212 7213 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7214 bdev->internal.measured_queue_depth = UINT64_MAX; 7215 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7216 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7217 bdev->internal.qd_poller = NULL; 7218 bdev->internal.qos = NULL; 7219 7220 TAILQ_INIT(&bdev->internal.open_descs); 7221 TAILQ_INIT(&bdev->internal.locked_ranges); 7222 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7223 TAILQ_INIT(&bdev->aliases); 7224 7225 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 7226 if (ret != 0) { 7227 bdev_free_io_stat(bdev->internal.stat); 7228 free(bdev_name); 7229 return ret; 7230 } 7231 7232 /* UUID has to be specified by the user or defined by bdev itself. 7233 * Otherwise this field must remain empty, to indicate that this 7234 * value cannot be depended upon. 
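*
* When a UUID is present it is also registered as an alias below (unless it
* equals the name), so the bdev can be looked up by either its name or its
* UUID string.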
*/ 7235 if (!spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { 7236 /* Add the UUID alias only if it's different than the name */ 7237 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7238 if (strcmp(bdev->name, uuid) != 0) { 7239 ret = spdk_bdev_alias_add(bdev, uuid); 7240 if (ret != 0) { 7241 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7242 bdev_name_del(&bdev->internal.bdev_name); 7243 bdev_free_io_stat(bdev->internal.stat); 7244 free(bdev_name); 7245 return ret; 7246 } 7247 } 7248 } 7249 7250 if (spdk_bdev_get_buf_align(bdev) > 1) { 7251 if (bdev->split_on_optimal_io_boundary) { 7252 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 7253 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 7254 } else { 7255 bdev->split_on_optimal_io_boundary = true; 7256 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 7257 } 7258 } 7259 7260 /* If the user didn't specify a write unit size, set it to one. */ 7261 if (bdev->write_unit_size == 0) { 7262 bdev->write_unit_size = 1; 7263 } 7264 7265 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7266 if (bdev->acwu == 0) { 7267 bdev->acwu = bdev->write_unit_size; 7268 } 7269 7270 if (bdev->phys_blocklen == 0) { 7271 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7272 } 7273 7274 bdev->internal.reset_in_progress = NULL; 7275 bdev->internal.qd_poll_in_progress = false; 7276 bdev->internal.period = 0; 7277 bdev->internal.new_period = 0; 7278 7279 spdk_io_device_register(__bdev_to_io_dev(bdev), 7280 bdev_channel_create, bdev_channel_destroy, 7281 sizeof(struct spdk_bdev_channel), 7282 bdev_name); 7283 7284 free(bdev_name); 7285 7286 spdk_spin_init(&bdev->internal.spinlock); 7287 7288 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 7289 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 7290 7291 return 0; 7292 } 7293 7294 static void 7295 bdev_destroy_cb(void *io_device) 7296 { 7297 int rc; 7298 struct spdk_bdev *bdev; 7299 spdk_bdev_unregister_cb cb_fn; 7300 void *cb_arg; 7301 7302 bdev = __bdev_from_io_dev(io_device); 7303 7304 if (bdev->internal.unregister_td != spdk_get_thread()) { 7305 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 7306 return; 7307 } 7308 7309 cb_fn = bdev->internal.unregister_cb; 7310 cb_arg = bdev->internal.unregister_ctx; 7311 7312 spdk_spin_destroy(&bdev->internal.spinlock); 7313 free(bdev->internal.qos); 7314 bdev_free_io_stat(bdev->internal.stat); 7315 7316 rc = bdev->fn_table->destruct(bdev->ctxt); 7317 if (rc < 0) { 7318 SPDK_ERRLOG("destruct failed\n"); 7319 } 7320 if (rc <= 0 && cb_fn != NULL) { 7321 cb_fn(cb_arg, rc); 7322 } 7323 } 7324 7325 void 7326 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 7327 { 7328 if (bdev->internal.unregister_cb != NULL) { 7329 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 7330 } 7331 } 7332 7333 static void 7334 _remove_notify(void *arg) 7335 { 7336 struct spdk_bdev_desc *desc = arg; 7337 7338 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 7339 } 7340 7341 /* returns: 0 - bdev removed and ready to be destructed. 7342 * -EBUSY - bdev can't be destructed yet. 
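* -EBUSY is returned while open descriptors still have to be notified of the
* removal, or while a reset is still in progress; the last descriptor close or
* the reset completion then finishes the unregister.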
*/ 7343 static int 7344 bdev_unregister_unsafe(struct spdk_bdev *bdev) 7345 { 7346 struct spdk_bdev_desc *desc, *tmp; 7347 int rc = 0; 7348 char uuid[SPDK_UUID_STRING_LEN]; 7349 7350 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 7351 assert(spdk_spin_held(&bdev->internal.spinlock)); 7352 7353 /* Notify each descriptor about hotremoval */ 7354 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 7355 rc = -EBUSY; 7356 /* 7357 * Defer invocation of the event_cb to a separate message that will 7358 * run later on its thread. This ensures this context unwinds and 7359 * we don't recursively unregister this bdev again if the event_cb 7360 * immediately closes its descriptor. 7361 */ 7362 event_notify(desc, _remove_notify); 7363 } 7364 7365 /* If there are no descriptors, proceed removing the bdev */ 7366 if (rc == 0) { 7367 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 7368 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 7369 7370 /* Delete the name and the UUID alias */ 7371 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7372 bdev_name_del_unsafe(&bdev->internal.bdev_name); 7373 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 7374 7375 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 7376 7377 if (bdev->internal.reset_in_progress != NULL) { 7378 /* If reset is in progress, let the completion callback for reset 7379 * unregister the bdev. 7380 */ 7381 rc = -EBUSY; 7382 } 7383 } 7384 7385 return rc; 7386 } 7387 7388 static void 7389 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7390 struct spdk_io_channel *io_ch, void *_ctx) 7391 { 7392 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 7393 7394 bdev_channel_abort_queued_ios(bdev_ch); 7395 spdk_bdev_for_each_channel_continue(i, 0); 7396 } 7397 7398 static void 7399 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 7400 { 7401 int rc; 7402 7403 spdk_spin_lock(&g_bdev_mgr.spinlock); 7404 spdk_spin_lock(&bdev->internal.spinlock); 7405 /* 7406 * Set the status to REMOVING after completing to abort channels. Otherwise, 7407 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 7408 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 7409 * may fail. 7410 */ 7411 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 7412 rc = bdev_unregister_unsafe(bdev); 7413 spdk_spin_unlock(&bdev->internal.spinlock); 7414 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7415 7416 if (rc == 0) { 7417 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7418 } 7419 } 7420 7421 void 7422 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7423 { 7424 struct spdk_thread *thread; 7425 7426 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 7427 7428 thread = spdk_get_thread(); 7429 if (!thread) { 7430 /* The user called this from a non-SPDK thread. 
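 * Unregistration must run on an SPDK thread; a non-SPDK caller would first
 * hand the request to one, e.g. (unregister_msg and its ctx are hypothetical):
 *
 *     spdk_thread_send_msg(spdk_thread_get_app_thread(), unregister_msg, ctx);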
*/ 7431 if (cb_fn != NULL) { 7432 cb_fn(cb_arg, -ENOTSUP); 7433 } 7434 return; 7435 } 7436 7437 spdk_spin_lock(&g_bdev_mgr.spinlock); 7438 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7439 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7440 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7441 if (cb_fn) { 7442 cb_fn(cb_arg, -EBUSY); 7443 } 7444 return; 7445 } 7446 7447 spdk_spin_lock(&bdev->internal.spinlock); 7448 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 7449 bdev->internal.unregister_cb = cb_fn; 7450 bdev->internal.unregister_ctx = cb_arg; 7451 bdev->internal.unregister_td = thread; 7452 spdk_spin_unlock(&bdev->internal.spinlock); 7453 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7454 7455 spdk_bdev_set_qd_sampling_period(bdev, 0); 7456 7457 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 7458 bdev_unregister); 7459 } 7460 7461 int 7462 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 7463 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7464 { 7465 struct spdk_bdev_desc *desc; 7466 struct spdk_bdev *bdev; 7467 int rc; 7468 7469 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 7470 if (rc != 0) { 7471 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 7472 return rc; 7473 } 7474 7475 bdev = spdk_bdev_desc_get_bdev(desc); 7476 7477 if (bdev->module != module) { 7478 spdk_bdev_close(desc); 7479 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 7480 bdev_name); 7481 return -ENODEV; 7482 } 7483 7484 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 7485 7486 spdk_bdev_close(desc); 7487 7488 return 0; 7489 } 7490 7491 static int 7492 bdev_start_qos(struct spdk_bdev *bdev) 7493 { 7494 struct set_qos_limit_ctx *ctx; 7495 7496 /* Enable QoS */ 7497 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 7498 ctx = calloc(1, sizeof(*ctx)); 7499 if (ctx == NULL) { 7500 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 7501 return -ENOMEM; 7502 } 7503 ctx->bdev = bdev; 7504 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 7505 } 7506 7507 return 0; 7508 } 7509 7510 static void 7511 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 7512 struct spdk_bdev *bdev) 7513 { 7514 enum spdk_bdev_claim_type type; 7515 const char *typename, *modname; 7516 extern struct spdk_log_flag SPDK_LOG_bdev; 7517 7518 assert(spdk_spin_held(&bdev->internal.spinlock)); 7519 7520 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 7521 return; 7522 } 7523 7524 type = bdev->internal.claim_type; 7525 typename = spdk_bdev_claim_get_name(type); 7526 7527 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 7528 modname = bdev->internal.claim.v1.module->name; 7529 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7530 bdev->name, detail, typename, modname); 7531 return; 7532 } 7533 7534 if (claim_type_is_v2(type)) { 7535 struct spdk_bdev_module_claim *claim; 7536 7537 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 7538 modname = claim->module->name; 7539 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7540 bdev->name, detail, typename, modname); 7541 } 7542 return; 7543 } 7544 7545 assert(false); 7546 } 7547 7548 static int 7549 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 7550 { 7551 struct spdk_thread *thread; 7552 int rc = 0; 7553 7554 thread = spdk_get_thread(); 7555 if (!thread) { 7556 
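/* A descriptor is bound to the SPDK thread that opens it: desc->thread is
 * recorded below and spdk_bdev_close() asserts that it runs on that same
 * thread, so opening from a non-SPDK context cannot be supported.
 */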
SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 7557 return -ENOTSUP; 7558 } 7559 7560 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7561 spdk_get_thread()); 7562 7563 desc->bdev = bdev; 7564 desc->thread = thread; 7565 desc->write = write; 7566 7567 spdk_spin_lock(&bdev->internal.spinlock); 7568 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7569 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7570 spdk_spin_unlock(&bdev->internal.spinlock); 7571 return -ENODEV; 7572 } 7573 7574 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7575 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7576 spdk_spin_unlock(&bdev->internal.spinlock); 7577 return -EPERM; 7578 } 7579 7580 rc = bdev_start_qos(bdev); 7581 if (rc != 0) { 7582 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 7583 spdk_spin_unlock(&bdev->internal.spinlock); 7584 return rc; 7585 } 7586 7587 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 7588 7589 spdk_spin_unlock(&bdev->internal.spinlock); 7590 7591 return 0; 7592 } 7593 7594 static int 7595 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 7596 struct spdk_bdev_desc **_desc) 7597 { 7598 struct spdk_bdev_desc *desc; 7599 unsigned int i; 7600 7601 desc = calloc(1, sizeof(*desc)); 7602 if (desc == NULL) { 7603 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 7604 return -ENOMEM; 7605 } 7606 7607 TAILQ_INIT(&desc->pending_media_events); 7608 TAILQ_INIT(&desc->free_media_events); 7609 7610 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 7611 desc->callback.event_fn = event_cb; 7612 desc->callback.ctx = event_ctx; 7613 spdk_spin_init(&desc->spinlock); 7614 7615 if (bdev->media_events) { 7616 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 7617 sizeof(*desc->media_events_buffer)); 7618 if (desc->media_events_buffer == NULL) { 7619 SPDK_ERRLOG("Failed to initialize media event pool\n"); 7620 bdev_desc_free(desc); 7621 return -ENOMEM; 7622 } 7623 7624 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 7625 TAILQ_INSERT_TAIL(&desc->free_media_events, 7626 &desc->media_events_buffer[i], tailq); 7627 } 7628 } 7629 7630 if (bdev->fn_table->accel_sequence_supported != NULL) { 7631 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7632 desc->accel_sequence_supported[i] = 7633 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7634 (enum spdk_bdev_io_type)i); 7635 } 7636 } 7637 7638 *_desc = desc; 7639 7640 return 0; 7641 } 7642 7643 int 7644 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7645 void *event_ctx, struct spdk_bdev_desc **_desc) 7646 { 7647 struct spdk_bdev_desc *desc; 7648 struct spdk_bdev *bdev; 7649 int rc; 7650 7651 if (event_cb == NULL) { 7652 SPDK_ERRLOG("Missing event callback function\n"); 7653 return -EINVAL; 7654 } 7655 7656 spdk_spin_lock(&g_bdev_mgr.spinlock); 7657 7658 bdev = bdev_get_by_name(bdev_name); 7659 7660 if (bdev == NULL) { 7661 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 7662 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7663 return -ENODEV; 7664 } 7665 7666 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 7667 if (rc != 0) { 7668 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7669 return rc; 7670 } 7671 7672 rc = bdev_open(bdev, write, desc); 7673 if (rc != 0) { 7674 bdev_desc_free(desc); 7675 desc = NULL; 7676 } 7677 7678 *_desc = desc; 7679 7680 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7681 
7682 return rc; 7683 } 7684 7685 static void 7686 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 7687 { 7688 int rc; 7689 7690 spdk_spin_lock(&bdev->internal.spinlock); 7691 spdk_spin_lock(&desc->spinlock); 7692 7693 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 7694 7695 desc->closed = true; 7696 7697 if (desc->claim != NULL) { 7698 bdev_desc_release_claims(desc); 7699 } 7700 7701 if (0 == desc->refs) { 7702 spdk_spin_unlock(&desc->spinlock); 7703 bdev_desc_free(desc); 7704 } else { 7705 spdk_spin_unlock(&desc->spinlock); 7706 } 7707 7708 /* If no more descriptors, kill QoS channel */ 7709 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7710 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 7711 bdev->name, spdk_get_thread()); 7712 7713 if (bdev_qos_destroy(bdev)) { 7714 /* There isn't anything we can do to recover here. Just let the 7715 * old QoS poller keep running. The QoS handling won't change 7716 * cores when the user allocates a new channel, but it won't break. */ 7717 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 7718 } 7719 } 7720 7721 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7722 rc = bdev_unregister_unsafe(bdev); 7723 spdk_spin_unlock(&bdev->internal.spinlock); 7724 7725 if (rc == 0) { 7726 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7727 } 7728 } else { 7729 spdk_spin_unlock(&bdev->internal.spinlock); 7730 } 7731 } 7732 7733 void 7734 spdk_bdev_close(struct spdk_bdev_desc *desc) 7735 { 7736 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7737 7738 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7739 spdk_get_thread()); 7740 7741 assert(desc->thread == spdk_get_thread()); 7742 7743 spdk_poller_unregister(&desc->io_timeout_poller); 7744 7745 spdk_spin_lock(&g_bdev_mgr.spinlock); 7746 7747 bdev_close(bdev, desc); 7748 7749 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7750 } 7751 7752 static void 7753 bdev_register_finished(void *arg) 7754 { 7755 struct spdk_bdev_desc *desc = arg; 7756 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7757 7758 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 7759 7760 spdk_spin_lock(&g_bdev_mgr.spinlock); 7761 7762 bdev_close(bdev, desc); 7763 7764 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7765 } 7766 7767 int 7768 spdk_bdev_register(struct spdk_bdev *bdev) 7769 { 7770 struct spdk_bdev_desc *desc; 7771 struct spdk_thread *thread = spdk_get_thread(); 7772 int rc; 7773 7774 if (spdk_unlikely(spdk_thread_get_app_thread() != spdk_get_thread())) { 7775 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", bdev->name, thread, 7776 thread ? 
spdk_thread_get_name(thread) : "null"); 7777 return -EINVAL; 7778 } 7779 7780 rc = bdev_register(bdev); 7781 if (rc != 0) { 7782 return rc; 7783 } 7784 7785 /* A descriptor is opened to prevent bdev deletion during examination */ 7786 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7787 if (rc != 0) { 7788 spdk_bdev_unregister(bdev, NULL, NULL); 7789 return rc; 7790 } 7791 7792 rc = bdev_open(bdev, false, desc); 7793 if (rc != 0) { 7794 bdev_desc_free(desc); 7795 spdk_bdev_unregister(bdev, NULL, NULL); 7796 return rc; 7797 } 7798 7799 /* Examine configuration before initializing I/O */ 7800 bdev_examine(bdev); 7801 7802 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 7803 if (rc != 0) { 7804 bdev_close(bdev, desc); 7805 spdk_bdev_unregister(bdev, NULL, NULL); 7806 } 7807 7808 return rc; 7809 } 7810 7811 int 7812 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 7813 struct spdk_bdev_module *module) 7814 { 7815 spdk_spin_lock(&bdev->internal.spinlock); 7816 7817 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7818 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7819 spdk_spin_unlock(&bdev->internal.spinlock); 7820 return -EPERM; 7821 } 7822 7823 if (desc && !desc->write) { 7824 desc->write = true; 7825 } 7826 7827 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 7828 bdev->internal.claim.v1.module = module; 7829 7830 spdk_spin_unlock(&bdev->internal.spinlock); 7831 return 0; 7832 } 7833 7834 void 7835 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 7836 { 7837 spdk_spin_lock(&bdev->internal.spinlock); 7838 7839 assert(bdev->internal.claim.v1.module != NULL); 7840 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 7841 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7842 bdev->internal.claim.v1.module = NULL; 7843 7844 spdk_spin_unlock(&bdev->internal.spinlock); 7845 } 7846 7847 /* 7848 * Start claims v2 7849 */ 7850 7851 const char * 7852 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 7853 { 7854 switch (type) { 7855 case SPDK_BDEV_CLAIM_NONE: 7856 return "not_claimed"; 7857 case SPDK_BDEV_CLAIM_EXCL_WRITE: 7858 return "exclusive_write"; 7859 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 7860 return "read_many_write_one"; 7861 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 7862 return "read_many_write_none"; 7863 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 7864 return "read_many_write_many"; 7865 default: 7866 break; 7867 } 7868 return "invalid_claim"; 7869 } 7870 7871 static bool 7872 claim_type_is_v2(enum spdk_bdev_claim_type type) 7873 { 7874 switch (type) { 7875 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 7876 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 7877 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 7878 return true; 7879 default: 7880 break; 7881 } 7882 return false; 7883 } 7884 7885 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
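 * That is, READ_MANY_WRITE_ONE and READ_MANY_WRITE_SHARED grant the claiming
 * descriptor write access, while READ_MANY_WRITE_NONE leaves it read-only.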
*/ 7886 static bool 7887 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 7888 { 7889 switch (type) { 7890 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 7891 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 7892 return true; 7893 default: 7894 break; 7895 } 7896 return false; 7897 } 7898 7899 void 7900 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 7901 { 7902 if (opts == NULL) { 7903 SPDK_ERRLOG("opts should not be NULL\n"); 7904 assert(opts != NULL); 7905 return; 7906 } 7907 if (size == 0) { 7908 SPDK_ERRLOG("size should not be zero\n"); 7909 assert(size != 0); 7910 return; 7911 } 7912 7913 memset(opts, 0, size); 7914 opts->opts_size = size; 7915 7916 #define FIELD_OK(field) \ 7917 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 7918 7919 #define SET_FIELD(field, value) \ 7920 if (FIELD_OK(field)) { \ 7921 opts->field = value; \ 7922 } \ 7923 7924 SET_FIELD(shared_claim_key, 0); 7925 7926 #undef FIELD_OK 7927 #undef SET_FIELD 7928 } 7929 7930 static int 7931 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 7932 { 7933 if (src->opts_size == 0) { 7934 SPDK_ERRLOG("size should not be zero\n"); 7935 return -1; 7936 } 7937 7938 memset(dst, 0, sizeof(*dst)); 7939 dst->opts_size = src->opts_size; 7940 7941 #define FIELD_OK(field) \ 7942 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 7943 7944 #define SET_FIELD(field) \ 7945 if (FIELD_OK(field)) { \ 7946 dst->field = src->field; \ 7947 } \ 7948 7949 if (FIELD_OK(name)) { 7950 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 7951 } 7952 7953 SET_FIELD(shared_claim_key); 7954 7955 /* You should not remove this statement, but need to update the assert statement 7956 * if you add a new field, and also add a corresponding SET_FIELD statement */ 7957 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 7958 7959 #undef FIELD_OK 7960 #undef SET_FIELD 7961 return 0; 7962 } 7963 7964 /* Returns 0 if a read-write-once claim can be taken. */ 7965 static int 7966 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 7967 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 7968 { 7969 struct spdk_bdev *bdev = desc->bdev; 7970 struct spdk_bdev_desc *open_desc; 7971 7972 assert(spdk_spin_held(&bdev->internal.spinlock)); 7973 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 7974 7975 if (opts->shared_claim_key != 0) { 7976 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 7977 bdev->name); 7978 return -EINVAL; 7979 } 7980 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7981 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7982 return -EPERM; 7983 } 7984 if (desc->claim != NULL) { 7985 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 7986 bdev->name, desc->claim->module->name); 7987 return -EPERM; 7988 } 7989 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 7990 if (desc != open_desc && open_desc->write) { 7991 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 7992 "another descriptor is open for writing\n", 7993 bdev->name); 7994 return -EPERM; 7995 } 7996 } 7997 7998 return 0; 7999 } 8000 8001 /* Returns 0 if a read-only-many claim can be taken. 
*/ 8002 static int 8003 claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8004 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8005 { 8006 struct spdk_bdev *bdev = desc->bdev; 8007 struct spdk_bdev_desc *open_desc; 8008
8009 assert(spdk_spin_held(&bdev->internal.spinlock)); 8010 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE); 8011 assert(desc->claim == NULL); 8012
8013 if (desc->write) { 8014 SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n", 8015 bdev->name); 8016 return -EINVAL; 8017 } 8018 if (opts->shared_claim_key != 0) { 8019 SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name); 8020 return -EINVAL; 8021 } 8022 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8023 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8024 if (open_desc->write) { 8025 SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while " 8026 "another descriptor is open for writing\n", 8027 bdev->name); 8028 return -EPERM; 8029 } 8030 } 8031 } 8032
8033 return 0; 8034 } 8035
8036 /* Returns 0 if a read-write-many claim can be taken. */ 8037 static int 8038 claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8039 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8040 { 8041 struct spdk_bdev *bdev = desc->bdev; 8042 struct spdk_bdev_desc *open_desc; 8043
8044 assert(spdk_spin_held(&bdev->internal.spinlock)); 8045 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED); 8046 assert(desc->claim == NULL); 8047
8048 if (opts->shared_claim_key == 0) { 8049 SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n", 8050 bdev->name); 8051 return -EINVAL; 8052 } 8053 switch (bdev->internal.claim_type) { 8054 case SPDK_BDEV_CLAIM_NONE: 8055 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8056 if (open_desc == desc) { 8057 continue; 8058 } 8059 if (open_desc->write) { 8060 SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while " 8061 "another descriptor is open for writing without a " 8062 "claim\n", bdev->name); 8063 return -EPERM; 8064 } 8065 } 8066 break; 8067 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8068 if (opts->shared_claim_key != bdev->internal.claim.v2.key) { 8069 LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev); 8070 return -EPERM; 8071 } 8072 break; 8073 default: 8074 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8075 return -EBUSY; 8076 } 8077
8078 return 0; 8079 } 8080
8081 /* Updates desc and its bdev with a v2 claim.
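 *
 * Modules reach this path through spdk_bdev_module_claim_bdev_desc(); a hedged
 * sketch (my_module and the claim name are hypothetical):
 *
 *     struct spdk_bdev_claim_opts opts;
 *
 *     spdk_bdev_claim_opts_init(&opts, sizeof(opts));
 *     snprintf(opts.name, sizeof(opts.name), "my_claim");
 *     rc = spdk_bdev_module_claim_bdev_desc(desc, SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE,
 *                                           &opts, &my_module);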
*/ 8082 static int 8083 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8084 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8085 { 8086 struct spdk_bdev *bdev = desc->bdev; 8087 struct spdk_bdev_module_claim *claim; 8088
8089 assert(spdk_spin_held(&bdev->internal.spinlock)); 8090 assert(claim_type_is_v2(type)); 8091 assert(desc->claim == NULL); 8092
8093 claim = calloc(1, sizeof(*desc->claim)); 8094 if (claim == NULL) { 8095 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 8096 return -ENOMEM; 8097 } 8098 claim->module = module; 8099 claim->desc = desc; 8100 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 8101 memcpy(claim->name, opts->name, sizeof(claim->name)); 8102 desc->claim = claim; 8103
8104 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8105 bdev->internal.claim_type = type; 8106 TAILQ_INIT(&bdev->internal.claim.v2.claims); 8107 bdev->internal.claim.v2.key = opts->shared_claim_key; 8108 } 8109 assert(type == bdev->internal.claim_type); 8110
8111 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 8112
8113 if (!desc->write && claim_type_promotes_to_write(type)) { 8114 desc->write = true; 8115 } 8116
8117 return 0; 8118 } 8119
8120 int 8121 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8122 struct spdk_bdev_claim_opts *_opts, 8123 struct spdk_bdev_module *module) 8124 { 8125 struct spdk_bdev *bdev; 8126 struct spdk_bdev_claim_opts opts; 8127 int rc = 0; 8128
8129 if (desc == NULL) { 8130 SPDK_ERRLOG("descriptor must not be NULL\n"); 8131 return -EINVAL; 8132 } 8133 bdev = desc->bdev;
8134 if (_opts == NULL) { 8135 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 8136 } else if (claim_opts_copy(_opts, &opts) != 0) { 8137 return -EINVAL; 8138 } 8139
8140 spdk_spin_lock(&bdev->internal.spinlock); 8141
8142 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 8143 bdev->internal.claim_type != type) { 8144 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8145 spdk_spin_unlock(&bdev->internal.spinlock); 8146 return -EPERM; 8147 } 8148
8149 if (claim_type_is_v2(type) && desc->claim != NULL) { 8150 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 8151 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 8152 spdk_spin_unlock(&bdev->internal.spinlock); 8153 return -EPERM; 8154 } 8155
8156 switch (type) { 8157 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8158 spdk_spin_unlock(&bdev->internal.spinlock); 8159 return spdk_bdev_module_claim_bdev(bdev, desc, module); 8160 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8161 rc = claim_verify_rwo(desc, type, &opts, module); 8162 break; 8163 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8164 rc = claim_verify_rom(desc, type, &opts, module); 8165 break; 8166 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8167 rc = claim_verify_rwm(desc, type, &opts, module); 8168 break; 8169 default: 8170 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 8171 rc = -ENOTSUP; 8172 } 8173
8174 if (rc == 0) { 8175 rc = claim_bdev(desc, type, &opts, module); 8176 } 8177
8178 spdk_spin_unlock(&bdev->internal.spinlock); 8179 return rc; 8180 } 8181
8182 static void 8183 claim_reset(struct spdk_bdev *bdev) 8184 { 8185 assert(spdk_spin_held(&bdev->internal.spinlock)); 8186 assert(claim_type_is_v2(bdev->internal.claim_type)); 8187 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 8188
8189 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 8190
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8191 } 8192 8193 static void 8194 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 8195 { 8196 struct spdk_bdev *bdev = desc->bdev; 8197 8198 assert(spdk_spin_held(&bdev->internal.spinlock)); 8199 assert(claim_type_is_v2(bdev->internal.claim_type)); 8200 8201 if (bdev->internal.examine_in_progress == 0) { 8202 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 8203 free(desc->claim); 8204 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 8205 claim_reset(bdev); 8206 } 8207 } else { 8208 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 8209 desc->claim->module = NULL; 8210 desc->claim->desc = NULL; 8211 } 8212 desc->claim = NULL; 8213 } 8214 8215 /* 8216 * End claims v2 8217 */ 8218 8219 struct spdk_bdev * 8220 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 8221 { 8222 assert(desc != NULL); 8223 return desc->bdev; 8224 } 8225 8226 int 8227 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 8228 { 8229 struct spdk_bdev *bdev, *tmp; 8230 struct spdk_bdev_desc *desc; 8231 int rc = 0; 8232 8233 assert(fn != NULL); 8234 8235 spdk_spin_lock(&g_bdev_mgr.spinlock); 8236 bdev = spdk_bdev_first(); 8237 while (bdev != NULL) { 8238 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8239 if (rc != 0) { 8240 break; 8241 } 8242 rc = bdev_open(bdev, false, desc); 8243 if (rc != 0) { 8244 bdev_desc_free(desc); 8245 if (rc == -ENODEV) { 8246 /* Ignore the error and move to the next bdev. */ 8247 rc = 0; 8248 bdev = spdk_bdev_next(bdev); 8249 continue; 8250 } 8251 break; 8252 } 8253 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8254 8255 rc = fn(ctx, bdev); 8256 8257 spdk_spin_lock(&g_bdev_mgr.spinlock); 8258 tmp = spdk_bdev_next(bdev); 8259 bdev_close(bdev, desc); 8260 if (rc != 0) { 8261 break; 8262 } 8263 bdev = tmp; 8264 } 8265 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8266 8267 return rc; 8268 } 8269 8270 int 8271 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 8272 { 8273 struct spdk_bdev *bdev, *tmp; 8274 struct spdk_bdev_desc *desc; 8275 int rc = 0; 8276 8277 assert(fn != NULL); 8278 8279 spdk_spin_lock(&g_bdev_mgr.spinlock); 8280 bdev = spdk_bdev_first_leaf(); 8281 while (bdev != NULL) { 8282 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8283 if (rc != 0) { 8284 break; 8285 } 8286 rc = bdev_open(bdev, false, desc); 8287 if (rc != 0) { 8288 bdev_desc_free(desc); 8289 if (rc == -ENODEV) { 8290 /* Ignore the error and move to the next bdev. 
*/ 8291 rc = 0; 8292 bdev = spdk_bdev_next_leaf(bdev); 8293 continue; 8294 } 8295 break; 8296 } 8297 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8298 8299 rc = fn(ctx, bdev); 8300 8301 spdk_spin_lock(&g_bdev_mgr.spinlock); 8302 tmp = spdk_bdev_next_leaf(bdev); 8303 bdev_close(bdev, desc); 8304 if (rc != 0) { 8305 break; 8306 } 8307 bdev = tmp; 8308 } 8309 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8310 8311 return rc; 8312 } 8313 8314 void 8315 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 8316 { 8317 struct iovec *iovs; 8318 int iovcnt; 8319 8320 if (bdev_io == NULL) { 8321 return; 8322 } 8323 8324 switch (bdev_io->type) { 8325 case SPDK_BDEV_IO_TYPE_READ: 8326 case SPDK_BDEV_IO_TYPE_WRITE: 8327 case SPDK_BDEV_IO_TYPE_ZCOPY: 8328 iovs = bdev_io->u.bdev.iovs; 8329 iovcnt = bdev_io->u.bdev.iovcnt; 8330 break; 8331 default: 8332 iovs = NULL; 8333 iovcnt = 0; 8334 break; 8335 } 8336 8337 if (iovp) { 8338 *iovp = iovs; 8339 } 8340 if (iovcntp) { 8341 *iovcntp = iovcnt; 8342 } 8343 } 8344 8345 void * 8346 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 8347 { 8348 if (bdev_io == NULL) { 8349 return NULL; 8350 } 8351 8352 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 8353 return NULL; 8354 } 8355 8356 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 8357 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 8358 return bdev_io->u.bdev.md_buf; 8359 } 8360 8361 return NULL; 8362 } 8363 8364 void * 8365 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 8366 { 8367 if (bdev_io == NULL) { 8368 assert(false); 8369 return NULL; 8370 } 8371 8372 return bdev_io->internal.caller_ctx; 8373 } 8374 8375 void 8376 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 8377 { 8378 8379 if (spdk_bdev_module_list_find(bdev_module->name)) { 8380 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 8381 assert(false); 8382 } 8383 8384 spdk_spin_init(&bdev_module->internal.spinlock); 8385 8386 /* 8387 * Modules with examine callbacks must be initialized first, so they are 8388 * ready to handle examine callbacks from later modules that will 8389 * register physical bdevs. 
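 *
 * A module declares those callbacks in its spdk_bdev_module and registers it
 * at load time; a sketch (my_bdev_if and its callbacks are hypothetical):
 *
 *     static struct spdk_bdev_module my_bdev_if = {
 *             .name = "my_bdev",
 *             .module_init = my_bdev_init,
 *             .examine_config = my_bdev_examine,
 *     };
 *     SPDK_BDEV_MODULE_REGISTER(my_bdev, &my_bdev_if)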
8390 */ 8391 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 8392 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8393 } else { 8394 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8395 } 8396 } 8397 8398 struct spdk_bdev_module * 8399 spdk_bdev_module_list_find(const char *name) 8400 { 8401 struct spdk_bdev_module *bdev_module; 8402 8403 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 8404 if (strcmp(name, bdev_module->name) == 0) { 8405 break; 8406 } 8407 } 8408 8409 return bdev_module; 8410 } 8411 8412 static void 8413 bdev_write_zero_buffer_next(void *_bdev_io) 8414 { 8415 struct spdk_bdev_io *bdev_io = _bdev_io; 8416 uint64_t num_bytes, num_blocks; 8417 void *md_buf = NULL; 8418 int rc; 8419 8420 num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * 8421 bdev_io->u.bdev.split_remaining_num_blocks, 8422 ZERO_BUFFER_SIZE); 8423 num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); 8424 num_blocks -= num_blocks % bdev_io->bdev->write_unit_size; 8425 8426 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 8427 md_buf = (char *)g_bdev_mgr.zero_buffer + 8428 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 8429 } 8430 8431 rc = bdev_write_blocks_with_md(bdev_io->internal.desc, 8432 spdk_io_channel_from_ctx(bdev_io->internal.ch), 8433 g_bdev_mgr.zero_buffer, md_buf, 8434 bdev_io->u.bdev.split_current_offset_blocks, num_blocks, 8435 bdev_write_zero_buffer_done, bdev_io); 8436 if (rc == 0) { 8437 bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; 8438 bdev_io->u.bdev.split_current_offset_blocks += num_blocks; 8439 } else if (rc == -ENOMEM) { 8440 bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); 8441 } else { 8442 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 8443 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 8444 } 8445 } 8446 8447 static void 8448 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 8449 { 8450 struct spdk_bdev_io *parent_io = cb_arg; 8451 8452 spdk_bdev_free_io(bdev_io); 8453 8454 if (!success) { 8455 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 8456 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 8457 return; 8458 } 8459 8460 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 8461 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 8462 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 8463 return; 8464 } 8465 8466 bdev_write_zero_buffer_next(parent_io); 8467 } 8468 8469 static void 8470 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 8471 { 8472 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8473 ctx->bdev->internal.qos_mod_in_progress = false; 8474 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8475 8476 if (ctx->cb_fn) { 8477 ctx->cb_fn(ctx->cb_arg, status); 8478 } 8479 free(ctx); 8480 } 8481 8482 static void 8483 bdev_disable_qos_done(void *cb_arg) 8484 { 8485 struct set_qos_limit_ctx *ctx = cb_arg; 8486 struct spdk_bdev *bdev = ctx->bdev; 8487 struct spdk_bdev_io *bdev_io; 8488 struct spdk_bdev_qos *qos; 8489 8490 spdk_spin_lock(&bdev->internal.spinlock); 8491 qos = bdev->internal.qos; 8492 bdev->internal.qos = NULL; 8493 spdk_spin_unlock(&bdev->internal.spinlock); 8494 8495 while (!TAILQ_EMPTY(&qos->queued)) { 8496 /* Send queued I/O back to their original thread for resubmission. 
*/ 8497 bdev_io = TAILQ_FIRST(&qos->queued); 8498 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 8499 8500 if (bdev_io->internal.io_submit_ch) { 8501 /* 8502 * Channel was changed when sending it to the QoS thread - change it back 8503 * before sending it back to the original thread. 8504 */ 8505 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 8506 bdev_io->internal.io_submit_ch = NULL; 8507 } 8508 8509 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 8510 _bdev_io_submit, bdev_io); 8511 } 8512 8513 if (qos->thread != NULL) { 8514 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 8515 spdk_poller_unregister(&qos->poller); 8516 } 8517 8518 free(qos); 8519 8520 bdev_set_qos_limit_done(ctx, 0); 8521 } 8522 8523 static void 8524 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 8525 { 8526 struct set_qos_limit_ctx *ctx = _ctx; 8527 struct spdk_thread *thread; 8528 8529 spdk_spin_lock(&bdev->internal.spinlock); 8530 thread = bdev->internal.qos->thread; 8531 spdk_spin_unlock(&bdev->internal.spinlock); 8532 8533 if (thread != NULL) { 8534 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 8535 } else { 8536 bdev_disable_qos_done(ctx); 8537 } 8538 } 8539 8540 static void 8541 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8542 struct spdk_io_channel *ch, void *_ctx) 8543 { 8544 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8545 8546 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 8547 8548 spdk_bdev_for_each_channel_continue(i, 0); 8549 } 8550 8551 static void 8552 bdev_update_qos_rate_limit_msg(void *cb_arg) 8553 { 8554 struct set_qos_limit_ctx *ctx = cb_arg; 8555 struct spdk_bdev *bdev = ctx->bdev; 8556 8557 spdk_spin_lock(&bdev->internal.spinlock); 8558 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 8559 spdk_spin_unlock(&bdev->internal.spinlock); 8560 8561 bdev_set_qos_limit_done(ctx, 0); 8562 } 8563 8564 static void 8565 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8566 struct spdk_io_channel *ch, void *_ctx) 8567 { 8568 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8569 8570 spdk_spin_lock(&bdev->internal.spinlock); 8571 bdev_enable_qos(bdev, bdev_ch); 8572 spdk_spin_unlock(&bdev->internal.spinlock); 8573 spdk_bdev_for_each_channel_continue(i, 0); 8574 } 8575 8576 static void 8577 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 8578 { 8579 struct set_qos_limit_ctx *ctx = _ctx; 8580 8581 bdev_set_qos_limit_done(ctx, status); 8582 } 8583 8584 static void 8585 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 8586 { 8587 int i; 8588 8589 assert(bdev->internal.qos != NULL); 8590 8591 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8592 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 8593 bdev->internal.qos->rate_limits[i].limit = limits[i]; 8594 8595 if (limits[i] == 0) { 8596 bdev->internal.qos->rate_limits[i].limit = 8597 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 8598 } 8599 } 8600 } 8601 } 8602 8603 void 8604 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 8605 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 8606 { 8607 struct set_qos_limit_ctx *ctx; 8608 uint32_t limit_set_complement; 8609 uint64_t min_limit_per_sec; 8610 int i; 8611 bool disable_rate_limit = true; 8612 8613 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8614 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 8615 continue; 8616 } 8617 8618 if (limits[i] > 0) { 8619 disable_rate_limit = 
false; 8620 } 8621 8622 if (bdev_qos_is_iops_rate_limit(i) == true) { 8623 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 8624 } else { 8625 /* Change from megabyte to byte rate limit */ 8626 limits[i] = limits[i] * 1024 * 1024; 8627 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 8628 } 8629 8630 limit_set_complement = limits[i] % min_limit_per_sec; 8631 if (limit_set_complement) { 8632 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 8633 limits[i], min_limit_per_sec); 8634 limits[i] += min_limit_per_sec - limit_set_complement; 8635 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 8636 } 8637 } 8638 8639 ctx = calloc(1, sizeof(*ctx)); 8640 if (ctx == NULL) { 8641 cb_fn(cb_arg, -ENOMEM); 8642 return; 8643 } 8644 8645 ctx->cb_fn = cb_fn; 8646 ctx->cb_arg = cb_arg; 8647 ctx->bdev = bdev; 8648 8649 spdk_spin_lock(&bdev->internal.spinlock); 8650 if (bdev->internal.qos_mod_in_progress) { 8651 spdk_spin_unlock(&bdev->internal.spinlock); 8652 free(ctx); 8653 cb_fn(cb_arg, -EAGAIN); 8654 return; 8655 } 8656 bdev->internal.qos_mod_in_progress = true; 8657 8658 if (disable_rate_limit == true && bdev->internal.qos) { 8659 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8660 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 8661 (bdev->internal.qos->rate_limits[i].limit > 0 && 8662 bdev->internal.qos->rate_limits[i].limit != 8663 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 8664 disable_rate_limit = false; 8665 break; 8666 } 8667 } 8668 } 8669 8670 if (disable_rate_limit == false) { 8671 if (bdev->internal.qos == NULL) { 8672 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 8673 if (!bdev->internal.qos) { 8674 spdk_spin_unlock(&bdev->internal.spinlock); 8675 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 8676 bdev_set_qos_limit_done(ctx, -ENOMEM); 8677 return; 8678 } 8679 } 8680 8681 if (bdev->internal.qos->thread == NULL) { 8682 /* Enabling */ 8683 bdev_set_qos_rate_limits(bdev, limits); 8684 8685 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 8686 bdev_enable_qos_done); 8687 } else { 8688 /* Updating */ 8689 bdev_set_qos_rate_limits(bdev, limits); 8690 8691 spdk_thread_send_msg(bdev->internal.qos->thread, 8692 bdev_update_qos_rate_limit_msg, ctx); 8693 } 8694 } else { 8695 if (bdev->internal.qos != NULL) { 8696 bdev_set_qos_rate_limits(bdev, limits); 8697 8698 /* Disabling */ 8699 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 8700 bdev_disable_qos_msg_done); 8701 } else { 8702 spdk_spin_unlock(&bdev->internal.spinlock); 8703 bdev_set_qos_limit_done(ctx, 0); 8704 return; 8705 } 8706 } 8707 8708 spdk_spin_unlock(&bdev->internal.spinlock); 8709 } 8710 8711 struct spdk_bdev_histogram_ctx { 8712 spdk_bdev_histogram_status_cb cb_fn; 8713 void *cb_arg; 8714 struct spdk_bdev *bdev; 8715 int status; 8716 }; 8717 8718 static void 8719 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8720 { 8721 struct spdk_bdev_histogram_ctx *ctx = _ctx; 8722 8723 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8724 ctx->bdev->internal.histogram_in_progress = false; 8725 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8726 ctx->cb_fn(ctx->cb_arg, ctx->status); 8727 free(ctx); 8728 } 8729 8730 static void 8731 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8732 struct spdk_io_channel *_ch, void *_ctx) 8733 { 8734 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8735 8736 if (ch->histogram != NULL) { 8737 
spdk_histogram_data_free(ch->histogram); 8738 ch->histogram = NULL; 8739 } 8740 spdk_bdev_for_each_channel_continue(i, 0); 8741 } 8742 8743 static void 8744 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8745 { 8746 struct spdk_bdev_histogram_ctx *ctx = _ctx; 8747 8748 if (status != 0) { 8749 ctx->status = status; 8750 ctx->bdev->internal.histogram_enabled = false; 8751 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 8752 bdev_histogram_disable_channel_cb); 8753 } else { 8754 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8755 ctx->bdev->internal.histogram_in_progress = false; 8756 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8757 ctx->cb_fn(ctx->cb_arg, ctx->status); 8758 free(ctx); 8759 } 8760 } 8761 8762 static void 8763 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8764 struct spdk_io_channel *_ch, void *_ctx) 8765 { 8766 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8767 int status = 0; 8768 8769 if (ch->histogram == NULL) { 8770 ch->histogram = spdk_histogram_data_alloc(); 8771 if (ch->histogram == NULL) { 8772 status = -ENOMEM; 8773 } 8774 } 8775 8776 spdk_bdev_for_each_channel_continue(i, status); 8777 } 8778 8779 void 8780 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 8781 void *cb_arg, bool enable) 8782 { 8783 struct spdk_bdev_histogram_ctx *ctx; 8784 8785 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 8786 if (ctx == NULL) { 8787 cb_fn(cb_arg, -ENOMEM); 8788 return; 8789 } 8790 8791 ctx->bdev = bdev; 8792 ctx->status = 0; 8793 ctx->cb_fn = cb_fn; 8794 ctx->cb_arg = cb_arg; 8795 8796 spdk_spin_lock(&bdev->internal.spinlock); 8797 if (bdev->internal.histogram_in_progress) { 8798 spdk_spin_unlock(&bdev->internal.spinlock); 8799 free(ctx); 8800 cb_fn(cb_arg, -EAGAIN); 8801 return; 8802 } 8803 8804 bdev->internal.histogram_in_progress = true; 8805 spdk_spin_unlock(&bdev->internal.spinlock); 8806 8807 bdev->internal.histogram_enabled = enable; 8808 8809 if (enable) { 8810 /* Allocate histogram for each channel */ 8811 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 8812 bdev_histogram_enable_channel_cb); 8813 } else { 8814 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 8815 bdev_histogram_disable_channel_cb); 8816 } 8817 } 8818 8819 struct spdk_bdev_histogram_data_ctx { 8820 spdk_bdev_histogram_data_cb cb_fn; 8821 void *cb_arg; 8822 struct spdk_bdev *bdev; 8823 /** merged histogram data from all channels */ 8824 struct spdk_histogram_data *histogram; 8825 }; 8826 8827 static void 8828 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8829 { 8830 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 8831 8832 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 8833 free(ctx); 8834 } 8835 8836 static void 8837 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8838 struct spdk_io_channel *_ch, void *_ctx) 8839 { 8840 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8841 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 8842 int status = 0; 8843 8844 if (ch->histogram == NULL) { 8845 status = -EFAULT; 8846 } else { 8847 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 8848 } 8849 8850 spdk_bdev_for_each_channel_continue(i, status); 8851 } 8852 8853 void 8854 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 8855 spdk_bdev_histogram_data_cb cb_fn, 8856 void *cb_arg) 8857 { 
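/* Histograms are enabled once per bdev and then read back as a merge across
 * all channels. A hedged usage sketch (my_enable_done_cb and my_hist_done_cb
 * are hypothetical; the caller owns the spdk_histogram_data it passes in):
 *
 *     static void
 *     my_hist_done_cb(void *cb_arg, int status, struct spdk_histogram_data *histogram)
 *     {
 *             if (status == 0) {
 *                     // inspect buckets, e.g. with spdk_histogram_data_iterate()
 *             }
 *             spdk_histogram_data_free(histogram);
 *     }
 *
 *     spdk_bdev_histogram_enable(bdev, my_enable_done_cb, NULL, true);
 *     ...
 *     histogram = spdk_histogram_data_alloc();
 *     spdk_bdev_histogram_get(bdev, histogram, my_hist_done_cb, NULL);
 */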
8858 struct spdk_bdev_histogram_data_ctx *ctx; 8859 8860 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 8861 if (ctx == NULL) { 8862 cb_fn(cb_arg, -ENOMEM, NULL); 8863 return; 8864 } 8865 8866 ctx->bdev = bdev; 8867 ctx->cb_fn = cb_fn; 8868 ctx->cb_arg = cb_arg; 8869 8870 ctx->histogram = histogram; 8871 8872 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 8873 bdev_histogram_get_channel_cb); 8874 } 8875 8876 void 8877 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 8878 void *cb_arg) 8879 { 8880 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8881 int status = 0; 8882 8883 assert(cb_fn != NULL); 8884 8885 if (bdev_ch->histogram == NULL) { 8886 status = -EFAULT; 8887 } 8888 cb_fn(cb_arg, status, bdev_ch->histogram); 8889 } 8890 8891 size_t 8892 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 8893 size_t max_events) 8894 { 8895 struct media_event_entry *entry; 8896 size_t num_events = 0; 8897 8898 for (; num_events < max_events; ++num_events) { 8899 entry = TAILQ_FIRST(&desc->pending_media_events); 8900 if (entry == NULL) { 8901 break; 8902 } 8903 8904 events[num_events] = entry->event; 8905 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 8906 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 8907 } 8908 8909 return num_events; 8910 } 8911 8912 int 8913 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 8914 size_t num_events) 8915 { 8916 struct spdk_bdev_desc *desc; 8917 struct media_event_entry *entry; 8918 size_t event_id; 8919 int rc = 0; 8920 8921 assert(bdev->media_events); 8922 8923 spdk_spin_lock(&bdev->internal.spinlock); 8924 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 8925 if (desc->write) { 8926 break; 8927 } 8928 } 8929 8930 if (desc == NULL || desc->media_events_buffer == NULL) { 8931 rc = -ENODEV; 8932 goto out; 8933 } 8934 8935 for (event_id = 0; event_id < num_events; ++event_id) { 8936 entry = TAILQ_FIRST(&desc->free_media_events); 8937 if (entry == NULL) { 8938 break; 8939 } 8940 8941 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 8942 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 8943 entry->event = events[event_id]; 8944 } 8945 8946 rc = event_id; 8947 out: 8948 spdk_spin_unlock(&bdev->internal.spinlock); 8949 return rc; 8950 } 8951 8952 static void 8953 _media_management_notify(void *arg) 8954 { 8955 struct spdk_bdev_desc *desc = arg; 8956 8957 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 8958 } 8959 8960 void 8961 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 8962 { 8963 struct spdk_bdev_desc *desc; 8964 8965 spdk_spin_lock(&bdev->internal.spinlock); 8966 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 8967 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 8968 event_notify(desc, _media_management_notify); 8969 } 8970 } 8971 spdk_spin_unlock(&bdev->internal.spinlock); 8972 } 8973 8974 struct locked_lba_range_ctx { 8975 struct lba_range range; 8976 struct spdk_bdev *bdev; 8977 struct lba_range *current_range; 8978 struct lba_range *owner_range; 8979 struct spdk_poller *poller; 8980 lock_range_cb cb_fn; 8981 void *cb_arg; 8982 }; 8983 8984 static void 8985 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8986 { 8987 struct locked_lba_range_ctx *ctx = _ctx; 8988 8989 ctx->cb_fn(ctx->cb_arg, -ENOMEM); 8990 free(ctx); 8991 } 8992 8993 static void bdev_unlock_lba_range_get_channel(struct 
spdk_bdev_channel_iter *i, 8994 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 8995 8996 static void 8997 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8998 { 8999 struct locked_lba_range_ctx *ctx = _ctx; 9000 9001 if (status == -ENOMEM) { 9002 /* One of the channels could not allocate a range object. 9003 * So we have to go back and clean up any ranges that were 9004 * allocated successfully before we return error status to 9005 * the caller. We can reuse the unlock function to do that 9006 * clean up. 9007 */ 9008 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9009 bdev_lock_error_cleanup_cb); 9010 return; 9011 } 9012 9013 /* All channels have locked this range and no I/O overlapping the range 9014 * are outstanding! Set the owner_ch for the range object for the 9015 * locking channel, so that this channel will know that it is allowed 9016 * to write to this range. 9017 */ 9018 ctx->owner_range->owner_ch = ctx->range.owner_ch; 9019 ctx->cb_fn(ctx->cb_arg, status); 9020 9021 /* Don't free the ctx here. Its range is in the bdev's global list of 9022 * locked ranges still, and will be removed and freed when this range 9023 * is later unlocked. 9024 */ 9025 } 9026 9027 static int 9028 bdev_lock_lba_range_check_io(void *_i) 9029 { 9030 struct spdk_bdev_channel_iter *i = _i; 9031 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 9032 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9033 struct locked_lba_range_ctx *ctx = i->ctx; 9034 struct lba_range *range = ctx->current_range; 9035 struct spdk_bdev_io *bdev_io; 9036 9037 spdk_poller_unregister(&ctx->poller); 9038 9039 /* The range is now in the locked_ranges, so no new IO can be submitted to this 9040 * range. But we need to wait until any outstanding IO overlapping with this range 9041 * are completed. 9042 */ 9043 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 9044 if (bdev_io_range_is_locked(bdev_io, range)) { 9045 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 9046 return SPDK_POLLER_BUSY; 9047 } 9048 } 9049 9050 spdk_bdev_for_each_channel_continue(i, 0); 9051 return SPDK_POLLER_BUSY; 9052 } 9053 9054 static void 9055 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9056 struct spdk_io_channel *_ch, void *_ctx) 9057 { 9058 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9059 struct locked_lba_range_ctx *ctx = _ctx; 9060 struct lba_range *range; 9061 9062 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9063 if (range->length == ctx->range.length && 9064 range->offset == ctx->range.offset && 9065 range->locked_ctx == ctx->range.locked_ctx) { 9066 /* This range already exists on this channel, so don't add 9067 * it again. This can happen when a new channel is created 9068 * while the for_each_channel operation is in progress. 9069 * Do not check for outstanding I/O in that case, since the 9070 * range was locked before any I/O could be submitted to the 9071 * new channel. 
9072 */ 9073 spdk_bdev_for_each_channel_continue(i, 0); 9074 return; 9075 } 9076 } 9077 9078 range = calloc(1, sizeof(*range)); 9079 if (range == NULL) { 9080 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 9081 return; 9082 } 9083 9084 range->length = ctx->range.length; 9085 range->offset = ctx->range.offset; 9086 range->locked_ctx = ctx->range.locked_ctx; 9087 ctx->current_range = range; 9088 if (ctx->range.owner_ch == ch) { 9089 /* This is the range object for the channel that will hold 9090 * the lock. Store it in the ctx object so that we can easily 9091 * set its owner_ch after the lock is finally acquired. 9092 */ 9093 ctx->owner_range = range; 9094 } 9095 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 9096 bdev_lock_lba_range_check_io(i); 9097 } 9098 9099 static void 9100 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 9101 { 9102 assert(spdk_get_thread() == spdk_io_channel_get_thread(ctx->range.owner_ch->channel)); 9103 9104 /* We will add a copy of this range to each channel now. */ 9105 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 9106 bdev_lock_lba_range_cb); 9107 } 9108 9109 static bool 9110 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 9111 { 9112 struct lba_range *r; 9113 9114 TAILQ_FOREACH(r, tailq, tailq) { 9115 if (bdev_lba_range_overlapped(range, r)) { 9116 return true; 9117 } 9118 } 9119 return false; 9120 } 9121 9122 static int 9123 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 9124 uint64_t offset, uint64_t length, 9125 lock_range_cb cb_fn, void *cb_arg) 9126 { 9127 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9128 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9129 struct locked_lba_range_ctx *ctx; 9130 9131 if (cb_arg == NULL) { 9132 SPDK_ERRLOG("cb_arg must not be NULL\n"); 9133 return -EINVAL; 9134 } 9135 9136 ctx = calloc(1, sizeof(*ctx)); 9137 if (ctx == NULL) { 9138 return -ENOMEM; 9139 } 9140 9141 ctx->range.offset = offset; 9142 ctx->range.length = length; 9143 ctx->range.owner_ch = ch; 9144 ctx->range.locked_ctx = cb_arg; 9145 ctx->bdev = bdev; 9146 ctx->cb_fn = cb_fn; 9147 ctx->cb_arg = cb_arg; 9148 9149 spdk_spin_lock(&bdev->internal.spinlock); 9150 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 9151 /* There is an active lock overlapping with this range. 9152 * Put it on the pending list until this range no 9153 * longer overlaps with another. 9154 */ 9155 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 9156 } else { 9157 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 9158 bdev_lock_lba_range_ctx(bdev, ctx); 9159 } 9160 spdk_spin_unlock(&bdev->internal.spinlock); 9161 return 0; 9162 } 9163 9164 static void 9165 bdev_lock_lba_range_ctx_msg(void *_ctx) 9166 { 9167 struct locked_lba_range_ctx *ctx = _ctx; 9168 9169 bdev_lock_lba_range_ctx(ctx->bdev, ctx); 9170 } 9171 9172 static void 9173 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9174 { 9175 struct locked_lba_range_ctx *ctx = _ctx; 9176 struct locked_lba_range_ctx *pending_ctx; 9177 struct lba_range *range, *tmp; 9178 9179 spdk_spin_lock(&bdev->internal.spinlock); 9180 /* Check if there are any pending locked ranges that overlap with this range 9181 * that was just unlocked. If there are, check that it doesn't overlap with any 9182 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 9183 * the lock process. 
9184 */ 9185 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 9186 if (bdev_lba_range_overlapped(range, &ctx->range) && 9187 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 9188 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 9189 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 9190 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 9191 spdk_thread_send_msg(spdk_io_channel_get_thread(pending_ctx->range.owner_ch->channel), 9192 bdev_lock_lba_range_ctx_msg, pending_ctx); 9193 } 9194 } 9195 spdk_spin_unlock(&bdev->internal.spinlock); 9196 9197 ctx->cb_fn(ctx->cb_arg, status); 9198 free(ctx); 9199 } 9200 9201 static void 9202 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9203 struct spdk_io_channel *_ch, void *_ctx) 9204 { 9205 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9206 struct locked_lba_range_ctx *ctx = _ctx; 9207 TAILQ_HEAD(, spdk_bdev_io) io_locked; 9208 struct spdk_bdev_io *bdev_io; 9209 struct lba_range *range; 9210 9211 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9212 if (ctx->range.offset == range->offset && 9213 ctx->range.length == range->length && 9214 ctx->range.locked_ctx == range->locked_ctx) { 9215 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 9216 free(range); 9217 break; 9218 } 9219 } 9220 9221 /* Note: we should almost always be able to assert that the range specified 9222 * was found. But there are some very rare corner cases where a new channel 9223 * gets created simultaneously with a range unlock, where this function 9224 * would execute on that new channel and wouldn't have the range. 9225 * We also use this to clean up range allocations when a later allocation 9226 * fails in the locking path. 9227 * So we can't actually assert() here. 9228 */ 9229 9230 /* Swap the locked IO into a temporary list, and then try to submit them again. 9231 * We could hyper-optimize this to only resubmit locked I/O that overlap 9232 * with the range that was just unlocked, but this isn't a performance path so 9233 * we go for simplicity here. 9234 */ 9235 TAILQ_INIT(&io_locked); 9236 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 9237 while (!TAILQ_EMPTY(&io_locked)) { 9238 bdev_io = TAILQ_FIRST(&io_locked); 9239 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 9240 bdev_io_submit(bdev_io); 9241 } 9242 9243 spdk_bdev_for_each_channel_continue(i, 0); 9244 } 9245 9246 static int 9247 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 9248 uint64_t offset, uint64_t length, 9249 lock_range_cb cb_fn, void *cb_arg) 9250 { 9251 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9252 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9253 struct locked_lba_range_ctx *ctx; 9254 struct lba_range *range; 9255 bool range_found = false; 9256 9257 /* Let's make sure the specified channel actually has a lock on 9258 * the specified range. Note that the range must match exactly. 9259 */ 9260 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9261 if (range->offset == offset && range->length == length && 9262 range->owner_ch == ch && range->locked_ctx == cb_arg) { 9263 range_found = true; 9264 break; 9265 } 9266 } 9267 9268 if (!range_found) { 9269 return -EINVAL; 9270 } 9271 9272 spdk_spin_lock(&bdev->internal.spinlock); 9273 /* We confirmed that this channel has locked the specified range. 
static int
bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		      uint64_t offset, uint64_t length,
		      lock_range_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
	struct locked_lba_range_ctx *ctx;
	struct lba_range *range;
	bool range_found = false;

	/* Let's make sure the specified channel actually has a lock on
	 * the specified range.  Note that the range must match exactly.
	 */
	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    range->owner_ch == ch && range->locked_ctx == cb_arg) {
			range_found = true;
			break;
		}
	}

	if (!range_found) {
		return -EINVAL;
	}

	spdk_spin_lock(&bdev->internal.spinlock);
	/* We confirmed that this channel has locked the specified range.  To
	 * start the unlock process, we find the range in the bdev's locked_ranges
	 * and remove it.  This ensures new channels don't inherit the locked range.
	 * Then we will send a message to each channel (including the one specified
	 * here) to remove the range from its per-channel list.
	 */
	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    range->locked_ctx == cb_arg) {
			break;
		}
	}
	if (range == NULL) {
		assert(false);
		spdk_spin_unlock(&bdev->internal.spinlock);
		return -EINVAL;
	}
	TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq);
	ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
	spdk_spin_unlock(&bdev->internal.spinlock);

	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
				   bdev_unlock_lba_range_cb);
	return 0;
}

int
spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains,
			     int array_size)
{
	if (!bdev) {
		return -EINVAL;
	}

	if (bdev->fn_table->get_memory_domains) {
		return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size);
	}

	return 0;
}

struct spdk_bdev_for_each_io_ctx {
	void *ctx;
	spdk_bdev_io_fn fn;
	spdk_bdev_for_each_io_cb cb;
};

static void
bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
			 struct spdk_io_channel *io_ch, void *_ctx)
{
	struct spdk_bdev_for_each_io_ctx *ctx = _ctx;
	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
	struct spdk_bdev_io *bdev_io;
	int rc = 0;

	TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
		rc = ctx->fn(ctx->ctx, bdev_io);
		if (rc != 0) {
			break;
		}
	}

	spdk_bdev_for_each_channel_continue(i, rc);
}

static void
bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct spdk_bdev_for_each_io_ctx *ctx = _ctx;

	ctx->cb(ctx->ctx, status);

	free(ctx);
}

void
spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn,
			   spdk_bdev_for_each_io_cb cb)
{
	struct spdk_bdev_for_each_io_ctx *ctx;

	assert(fn != NULL && cb != NULL);

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		SPDK_ERRLOG("Failed to allocate context.\n");
		cb(_ctx, -ENOMEM);
		return;
	}

	ctx->ctx = _ctx;
	ctx->fn = fn;
	ctx->cb = cb;

	spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx,
				   bdev_for_each_io_done);
}

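/*
 * Usage sketch (illustrative only): walking every I/O currently submitted to
 * a bdev with spdk_bdev_for_each_bdev_io().  The context struct and function
 * names below are hypothetical.  Channels are visited one at a time, so the
 * counter needs no locking, but the per-I/O callback runs on each channel's
 * thread.  Returning non-zero from the per-I/O callback stops the walk and
 * that value is reported to the completion callback.
 *
 *	struct count_io_ctx {
 *		uint64_t count;
 *	};
 *
 *	static int
 *	count_one_io(void *ctx, struct spdk_bdev_io *bdev_io)
 *	{
 *		struct count_io_ctx *cc = ctx;
 *
 *		cc->count++;
 *		return 0;
 *	}
 *
 *	static void
 *	count_io_done(void *ctx, int status)
 *	{
 *		struct count_io_ctx *cc = ctx;
 *
 *		SPDK_NOTICELOG("outstanding I/O: %" PRIu64 " (status %d)\n",
 *			       cc->count, status);
 *		free(cc);
 *	}
 *
 *	spdk_bdev_for_each_bdev_io(bdev, cc, count_one_io, count_io_done);
 */
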
void
spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status)
{
	spdk_for_each_channel_continue(iter->i, status);
}

static struct spdk_bdev *
io_channel_iter_get_bdev(struct spdk_io_channel_iter *i)
{
	void *io_device = spdk_io_channel_iter_get_io_device(i);

	return __bdev_from_io_dev(io_device);
}

static void
bdev_each_channel_msg(struct spdk_io_channel_iter *i)
{
	struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);

	iter->i = i;
	iter->fn(iter, bdev, ch, iter->ctx);
}

static void
bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);

	iter->i = i;
	iter->cpl(bdev, iter->ctx, status);

	free(iter);
}

void
spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn,
			   void *ctx, spdk_bdev_for_each_channel_done cpl)
{
	struct spdk_bdev_channel_iter *iter;

	assert(bdev != NULL && fn != NULL && ctx != NULL);

	iter = calloc(1, sizeof(struct spdk_bdev_channel_iter));
	if (iter == NULL) {
		SPDK_ERRLOG("Unable to allocate iterator\n");
		assert(false);
		return;
	}

	iter->fn = fn;
	iter->cpl = cpl;
	iter->ctx = ctx;

	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg,
			      iter, bdev_each_channel_cpl);
}

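/*
 * Usage sketch (illustrative only): spdk_bdev_for_each_channel() delivers a
 * message to the thread owning each I/O channel of the bdev.  The per-channel
 * callback must eventually call spdk_bdev_for_each_channel_continue(); a
 * non-zero status aborts the iteration and is passed to the completion
 * callback.  Note that ctx must not be NULL.  The names below
 * (count_channel, count_channel_done, g_num_channels) are hypothetical.
 *
 *	static void
 *	count_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
 *		      struct spdk_io_channel *ch, void *ctx)
 *	{
 *		uint32_t *num_channels = ctx;
 *
 *		(*num_channels)++;
 *		spdk_bdev_for_each_channel_continue(i, 0);
 *	}
 *
 *	static void
 *	count_channel_done(struct spdk_bdev *bdev, void *ctx, int status)
 *	{
 *		uint32_t *num_channels = ctx;
 *
 *		SPDK_NOTICELOG("%s: %u channels (status %d)\n",
 *			       spdk_bdev_get_name(bdev), *num_channels, status);
 *	}
 *
 *	spdk_bdev_for_each_channel(bdev, count_channel, &g_num_channels,
 *				   count_channel_done);
 */
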
static void
bdev_copy_do_write_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_bdev_io *parent_io = cb_arg;

	/* Check return status of write */
	parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
	parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx);
	spdk_bdev_free_io(bdev_io);
}

static void
bdev_copy_do_write(void *_bdev_io)
{
	struct spdk_bdev_io *bdev_io = _bdev_io;
	int rc;

	/* Write blocks */
	rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc,
					    spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs[0].iov_base,
					    bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks,
					    bdev_io->u.bdev.num_blocks, bdev_copy_do_write_complete, bdev_io);

	if (rc == -ENOMEM) {
		bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write);
	} else if (rc != 0) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
	}
}

static void
bdev_copy_do_read_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_bdev_io *parent_io = cb_arg;

	/* Check return status of read */
	if (!success) {
		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
		spdk_bdev_free_io(bdev_io);
		return;
	}

	spdk_bdev_free_io(bdev_io);

	/* Do write */
	bdev_copy_do_write(parent_io);
}

static void
bdev_copy_do_read(void *_bdev_io)
{
	struct spdk_bdev_io *bdev_io = _bdev_io;
	int rc;

	/* Read blocks */
	rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc,
					   spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs[0].iov_base,
					   bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks,
					   bdev_io->u.bdev.num_blocks, bdev_copy_do_read_complete, bdev_io);

	if (rc == -ENOMEM) {
		bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read);
	} else if (rc != 0) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
	}
}

static void
bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
{
	if (!success) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
		return;
	}

	bdev_copy_do_read(bdev_io);
}

int
spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		      uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks,
		      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (num_blocks == 0) {
		SPDK_ERRLOG("Can't copy 0 blocks\n");
		return -EINVAL;
	}

	if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) ||
	    !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) {
		SPDK_DEBUGLOG(bdev,
			      "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n",
			      dst_offset_blocks, src_offset_blocks, num_blocks);
		return -EINVAL;
	}

	bdev_io = bdev_channel_get_io(channel);
	if (!bdev_io) {
		return -ENOMEM;
	}

	bdev_io->internal.ch = channel;
	bdev_io->internal.desc = desc;
	bdev_io->type = SPDK_BDEV_IO_TYPE_COPY;

	bdev_io->u.bdev.offset_blocks = dst_offset_blocks;
	bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.memory_domain = NULL;
	bdev_io->u.bdev.memory_domain_ctx = NULL;
	bdev_io->u.bdev.iovs = NULL;
	bdev_io->u.bdev.iovcnt = 0;
	bdev_io->u.bdev.md_buf = NULL;
	bdev_io->u.bdev.accel_sequence = NULL;
	bdev_io_init(bdev_io, bdev, cb_arg, cb);

	if (dst_offset_blocks == src_offset_blocks) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
		bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx);

		return 0;
	}

	/* If the backing device supports copy directly, pass the request to it.
	 * Otherwise, emulate the copy in the bdev layer with a read followed by
	 * a write.
	 */
	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) {
		bdev_io_submit(bdev_io);
		return 0;
	}

	spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev));

	return 0;
}

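/*
 * Usage sketch (illustrative only): issuing a copy with
 * spdk_bdev_copy_blocks().  The descriptor must have been opened for writing,
 * otherwise -EBADF is returned.  If the backing bdev does not support
 * SPDK_BDEV_IO_TYPE_COPY, the request is transparently emulated by the
 * read/write helpers above.  The callback name below is hypothetical.
 *
 *	static void
 *	copy_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		SPDK_NOTICELOG("copy %s\n", success ? "succeeded" : "failed");
 *		spdk_bdev_free_io(bdev_io);
 *	}
 *
 *	// Copy 16 blocks starting at block 0 onto block 1024.
 *	rc = spdk_bdev_copy_blocks(desc, io_ch, 1024, 0, 16, copy_done, NULL);
 *	if (rc == -ENOMEM) {
 *		// No spdk_bdev_io was available; retry later, e.g. via
 *		// spdk_bdev_queue_io_wait().
 *	} else if (rc != 0) {
 *		// -EBADF (read-only descriptor) or -EINVAL (invalid range).
 *	}
 */
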
SPDK_LOG_REGISTER_COMPONENT(bdev)

SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
{
	struct spdk_trace_tpoint_opts opts[] = {
		{
			"BDEV_IO_START", TRACE_BDEV_IO_START,
			OWNER_BDEV, OBJECT_BDEV_IO, 1,
			{
				{ "type", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
				{ "offset", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "len", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 }
			}
		},
		{
			"BDEV_IO_DONE", TRACE_BDEV_IO_DONE,
			OWNER_BDEV, OBJECT_BDEV_IO, 0,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		},
		{
			"BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE,
			OWNER_BDEV, OBJECT_NONE, 1,
			{
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8 }
			}
		},
		{
			"BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY,
			OWNER_BDEV, OBJECT_NONE, 0,
			{
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8 }
			}
		},
	};

	spdk_trace_register_owner(OWNER_BDEV, 'b');
	spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0);
}