1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #include "spdk/bdev.h" 10 11 #include "spdk/accel.h" 12 #include "spdk/config.h" 13 #include "spdk/env.h" 14 #include "spdk/thread.h" 15 #include "spdk/likely.h" 16 #include "spdk/queue.h" 17 #include "spdk/nvme_spec.h" 18 #include "spdk/scsi_spec.h" 19 #include "spdk/notify.h" 20 #include "spdk/util.h" 21 #include "spdk/trace.h" 22 #include "spdk/dma.h" 23 24 #include "spdk/bdev_module.h" 25 #include "spdk/log.h" 26 #include "spdk/string.h" 27 28 #include "bdev_internal.h" 29 #include "spdk_internal/trace_defs.h" 30 #include "spdk_internal/assert.h" 31 32 #ifdef SPDK_CONFIG_VTUNE 33 #include "ittnotify.h" 34 #include "ittnotify_types.h" 35 int __itt_init_ittlib(const char *, __itt_group_id); 36 #endif 37 38 #define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1) 39 #define SPDK_BDEV_IO_CACHE_SIZE 256 40 #define SPDK_BDEV_AUTO_EXAMINE true 41 #define BUF_SMALL_POOL_SIZE 8191 42 #define BUF_LARGE_POOL_SIZE 1023 43 #define BUF_SMALL_CACHE_SIZE 128 44 #define BUF_LARGE_CACHE_SIZE 16 45 #define NOMEM_THRESHOLD_COUNT 8 46 47 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 48 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 49 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512 50 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000 51 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024) 52 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX 53 #define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000 54 55 /* The maximum number of children requests for a UNMAP or WRITE ZEROES command 56 * when splitting into children requests at a time. 57 */ 58 #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8) 59 #define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000 60 61 /* The maximum number of children requests for a COPY command 62 * when splitting into children requests at a time. 
63 */ 64 #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8) 65 66 #define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \ 67 log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev) 68 #ifdef DEBUG 69 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \ 70 log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev) 71 #else 72 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0) 73 #endif 74 75 static void log_already_claimed(enum spdk_log_level level, const int line, const char *func, 76 const char *detail, struct spdk_bdev *bdev); 77 78 SPDK_LOG_DEPRECATION_REGISTER(vtune_support, "Intel(R) VTune integration", "SPDK 23.05", 0); 79 80 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 81 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 82 }; 83 84 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 85 86 RB_HEAD(bdev_name_tree, spdk_bdev_name); 87 88 static int 89 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2) 90 { 91 return strcmp(name1->name, name2->name); 92 } 93 94 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp); 95 96 struct spdk_bdev_mgr { 97 struct spdk_mempool *bdev_io_pool; 98 99 void *zero_buffer; 100 101 TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 102 103 struct spdk_bdev_list bdevs; 104 struct bdev_name_tree bdev_names; 105 106 bool init_complete; 107 bool module_init_complete; 108 109 struct spdk_spinlock spinlock; 110 111 #ifdef SPDK_CONFIG_VTUNE 112 __itt_domain *domain; 113 #endif 114 }; 115 116 static struct spdk_bdev_mgr g_bdev_mgr = { 117 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 118 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 119 .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names), 120 .init_complete = false, 121 .module_init_complete = false, 122 }; 123 124 static void 125 __attribute__((constructor)) 126 _bdev_init(void) 127 { 128 spdk_spin_init(&g_bdev_mgr.spinlock); 129 } 130 131 typedef void (*lock_range_cb)(void *ctx, int status); 132 133 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc); 134 135 struct lba_range { 136 uint64_t offset; 137 uint64_t length; 138 void *locked_ctx; 139 struct spdk_bdev_channel *owner_ch; 140 TAILQ_ENTRY(lba_range) tailq; 141 }; 142 143 static struct spdk_bdev_opts g_bdev_opts = { 144 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 145 .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, 146 .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, 147 .small_buf_pool_size = BUF_SMALL_POOL_SIZE, 148 .large_buf_pool_size = BUF_LARGE_POOL_SIZE, 149 }; 150 151 static spdk_bdev_init_cb g_init_cb_fn = NULL; 152 static void *g_init_cb_arg = NULL; 153 154 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 155 static void *g_fini_cb_arg = NULL; 156 static struct spdk_thread *g_fini_thread = NULL; 157 158 struct spdk_bdev_qos_limit { 159 /** IOs or bytes allowed per second (i.e., 1s). */ 160 uint64_t limit; 161 162 /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). 163 * For remaining bytes, allowed to run negative if an I/O is submitted when 164 * some bytes are remaining, but the I/O is bigger than that amount. The 165 * excess will be deducted from the next timeslice. 166 */ 167 int64_t remaining_this_timeslice; 168 169 /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 170 uint32_t min_per_timeslice; 171 172 /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 173 uint32_t max_per_timeslice; 174 175 /** Function to check whether to queue the IO. 
*/ 176 bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 177 178 /** Function to update for the submitted IO. */ 179 void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 180 }; 181 182 struct spdk_bdev_qos { 183 /** Types of structure of rate limits. */ 184 struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 185 186 /** The channel that all I/O are funneled through. */ 187 struct spdk_bdev_channel *ch; 188 189 /** The thread on which the poller is running. */ 190 struct spdk_thread *thread; 191 192 /** Queue of I/O waiting to be issued. */ 193 bdev_io_tailq_t queued; 194 195 /** Size of a timeslice in tsc ticks. */ 196 uint64_t timeslice_size; 197 198 /** Timestamp of start of last timeslice. */ 199 uint64_t last_timeslice; 200 201 /** Poller that processes queued I/O commands each time slice. */ 202 struct spdk_poller *poller; 203 }; 204 205 struct spdk_bdev_mgmt_channel { 206 /* 207 * Each thread keeps a cache of bdev_io - this allows 208 * bdev threads which are *not* DPDK threads to still 209 * benefit from a per-thread bdev_io cache. Without 210 * this, non-DPDK threads fetching from the mempool 211 * incur a cmpxchg on get and put. 212 */ 213 bdev_io_stailq_t per_thread_cache; 214 uint32_t per_thread_cache_count; 215 uint32_t bdev_io_cache_size; 216 217 struct spdk_iobuf_channel iobuf; 218 219 TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; 220 TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; 221 }; 222 223 /* 224 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device 225 * will queue here their IO that awaits retry. It makes it possible to retry sending 226 * IO to one bdev after IO from other bdev completes. 227 */ 228 struct spdk_bdev_shared_resource { 229 /* The bdev management channel */ 230 struct spdk_bdev_mgmt_channel *mgmt_ch; 231 232 /* 233 * Count of I/O submitted to bdev module and waiting for completion. 234 * Incremented before submit_request() is called on an spdk_bdev_io. 235 */ 236 uint64_t io_outstanding; 237 238 /* 239 * Queue of IO awaiting retry because of a previous NOMEM status returned 240 * on this channel. 241 */ 242 bdev_io_tailq_t nomem_io; 243 244 /* 245 * Threshold which io_outstanding must drop to before retrying nomem_io. 246 */ 247 uint64_t nomem_threshold; 248 249 /* I/O channel allocated by a bdev module */ 250 struct spdk_io_channel *shared_ch; 251 252 /* Refcount of bdev channels using this resource */ 253 uint32_t ref; 254 255 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 256 }; 257 258 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 259 #define BDEV_CH_QOS_ENABLED (1 << 1) 260 261 struct spdk_bdev_channel { 262 struct spdk_bdev *bdev; 263 264 /* The channel for the underlying device */ 265 struct spdk_io_channel *channel; 266 267 /* Accel channel */ 268 struct spdk_io_channel *accel_channel; 269 270 /* Per io_device per thread data */ 271 struct spdk_bdev_shared_resource *shared_resource; 272 273 struct spdk_bdev_io_stat *stat; 274 275 /* 276 * Count of I/O submitted to the underlying dev module through this channel 277 * and waiting for completion. 278 */ 279 uint64_t io_outstanding; 280 281 /* 282 * List of all submitted I/Os including I/O that are generated via splitting. 283 */ 284 bdev_io_tailq_t io_submitted; 285 286 /* 287 * List of spdk_bdev_io that are currently queued because they write to a locked 288 * LBA range. 
289 */ 290 bdev_io_tailq_t io_locked; 291 292 /* List of I/Os with accel sequence being currently executed */ 293 bdev_io_tailq_t io_accel_exec; 294 295 /* List of I/Os doing memory domain pull/push */ 296 bdev_io_tailq_t io_memory_domain; 297 298 uint32_t flags; 299 300 struct spdk_histogram_data *histogram; 301 302 #ifdef SPDK_CONFIG_VTUNE 303 uint64_t start_tsc; 304 uint64_t interval_tsc; 305 __itt_string_handle *handle; 306 struct spdk_bdev_io_stat *prev_stat; 307 #endif 308 309 bdev_io_tailq_t queued_resets; 310 311 lba_range_tailq_t locked_ranges; 312 }; 313 314 struct media_event_entry { 315 struct spdk_bdev_media_event event; 316 TAILQ_ENTRY(media_event_entry) tailq; 317 }; 318 319 #define MEDIA_EVENT_POOL_SIZE 64 320 321 struct spdk_bdev_desc { 322 struct spdk_bdev *bdev; 323 struct spdk_thread *thread; 324 struct { 325 spdk_bdev_event_cb_t event_fn; 326 void *ctx; 327 } callback; 328 bool closed; 329 bool write; 330 bool memory_domains_supported; 331 bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES]; 332 struct spdk_spinlock spinlock; 333 uint32_t refs; 334 TAILQ_HEAD(, media_event_entry) pending_media_events; 335 TAILQ_HEAD(, media_event_entry) free_media_events; 336 struct media_event_entry *media_events_buffer; 337 TAILQ_ENTRY(spdk_bdev_desc) link; 338 339 uint64_t timeout_in_sec; 340 spdk_bdev_io_timeout_cb cb_fn; 341 void *cb_arg; 342 struct spdk_poller *io_timeout_poller; 343 struct spdk_bdev_module_claim *claim; 344 }; 345 346 struct spdk_bdev_iostat_ctx { 347 struct spdk_bdev_io_stat *stat; 348 spdk_bdev_get_device_stat_cb cb; 349 void *cb_arg; 350 }; 351 352 struct set_qos_limit_ctx { 353 void (*cb_fn)(void *cb_arg, int status); 354 void *cb_arg; 355 struct spdk_bdev *bdev; 356 }; 357 358 struct spdk_bdev_channel_iter { 359 spdk_bdev_for_each_channel_msg fn; 360 spdk_bdev_for_each_channel_done cpl; 361 struct spdk_io_channel_iter *i; 362 void *ctx; 363 }; 364 365 struct spdk_bdev_io_error_stat { 366 uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS]; 367 }; 368 369 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 370 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 371 #define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch)) 372 #define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch)) 373 374 static inline void bdev_io_complete(void *ctx); 375 static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io); 376 377 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 378 static void bdev_write_zero_buffer_next(void *_bdev_io); 379 380 static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 381 struct spdk_io_channel *ch, void *_ctx); 382 static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status); 383 384 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 385 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 386 uint64_t num_blocks, 387 struct spdk_memory_domain *domain, void *domain_ctx, 388 struct spdk_accel_sequence *seq, 389 spdk_bdev_io_completion_cb cb, void *cb_arg); 390 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 391 struct iovec *iov, int iovcnt, void *md_buf, 392 uint64_t offset_blocks, uint64_t num_blocks, 393 struct spdk_memory_domain *domain, void *domain_ctx, 394 struct spdk_accel_sequence *seq, 395 
spdk_bdev_io_completion_cb cb, void *cb_arg); 396 397 static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 398 uint64_t offset, uint64_t length, 399 lock_range_cb cb_fn, void *cb_arg); 400 401 static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 402 uint64_t offset, uint64_t length, 403 lock_range_cb cb_fn, void *cb_arg); 404 405 static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort); 406 static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort); 407 408 static bool claim_type_is_v2(enum spdk_bdev_claim_type type); 409 static void bdev_desc_release_claims(struct spdk_bdev_desc *desc); 410 static void claim_reset(struct spdk_bdev *bdev); 411 412 #define bdev_get_ext_io_opt(opts, field, defval) \ 413 (((opts) != NULL && offsetof(struct spdk_bdev_ext_io_opts, field) + \ 414 sizeof((opts)->field) <= sizeof(*(opts))) ? (opts)->field : (defval)) 415 416 void 417 spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size) 418 { 419 if (!opts) { 420 SPDK_ERRLOG("opts should not be NULL\n"); 421 return; 422 } 423 424 if (!opts_size) { 425 SPDK_ERRLOG("opts_size should not be zero value\n"); 426 return; 427 } 428 429 opts->opts_size = opts_size; 430 431 #define SET_FIELD(field) \ 432 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \ 433 opts->field = g_bdev_opts.field; \ 434 } \ 435 436 SET_FIELD(bdev_io_pool_size); 437 SET_FIELD(bdev_io_cache_size); 438 SET_FIELD(bdev_auto_examine); 439 SET_FIELD(small_buf_pool_size); 440 SET_FIELD(large_buf_pool_size); 441 442 /* Do not remove this statement, you should always update this statement when you adding a new field, 443 * and do not forget to add the SET_FIELD statement for your added field. */ 444 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size"); 445 446 #undef SET_FIELD 447 } 448 449 SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_small_buf_pool_size, "spdk_bdev_opts.small_buf_pool_size", 450 "v23.05", 0); 451 SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_large_buf_pool_size, "spdk_bdev_opts.large_buf_pool_size", 452 "v23.05", 0); 453 int 454 spdk_bdev_set_opts(struct spdk_bdev_opts *opts) 455 { 456 struct spdk_iobuf_opts iobuf_opts; 457 uint32_t min_pool_size; 458 int rc; 459 460 if (!opts) { 461 SPDK_ERRLOG("opts cannot be NULL\n"); 462 return -1; 463 } 464 465 if (!opts->opts_size) { 466 SPDK_ERRLOG("opts_size inside opts cannot be zero value\n"); 467 return -1; 468 } 469 470 /* 471 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem 472 * initialization. A second mgmt_ch will be created on the same thread when the application starts 473 * but before the deferred put_io_channel event is executed for the first mgmt_ch. 
474 */ 475 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 476 if (opts->bdev_io_pool_size < min_pool_size) { 477 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 478 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 479 spdk_thread_get_count()); 480 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 481 return -1; 482 } 483 484 if (opts->small_buf_pool_size != BUF_SMALL_POOL_SIZE) { 485 SPDK_LOG_DEPRECATED(bdev_opts_small_buf_pool_size); 486 } 487 if (opts->large_buf_pool_size != BUF_LARGE_POOL_SIZE) { 488 SPDK_LOG_DEPRECATED(bdev_opts_large_buf_pool_size); 489 } 490 491 #define SET_FIELD(field) \ 492 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 493 g_bdev_opts.field = opts->field; \ 494 } \ 495 496 SET_FIELD(bdev_io_pool_size); 497 SET_FIELD(bdev_io_cache_size); 498 SET_FIELD(bdev_auto_examine); 499 SET_FIELD(small_buf_pool_size); 500 SET_FIELD(large_buf_pool_size); 501 502 spdk_iobuf_get_opts(&iobuf_opts); 503 iobuf_opts.small_pool_count = opts->small_buf_pool_size; 504 iobuf_opts.large_pool_count = opts->large_buf_pool_size; 505 506 rc = spdk_iobuf_set_opts(&iobuf_opts); 507 if (rc != 0) { 508 SPDK_ERRLOG("Failed to set iobuf opts\n"); 509 return -1; 510 } 511 512 g_bdev_opts.opts_size = opts->opts_size; 513 514 #undef SET_FIELD 515 516 return 0; 517 } 518 519 static struct spdk_bdev * 520 bdev_get_by_name(const char *bdev_name) 521 { 522 struct spdk_bdev_name find; 523 struct spdk_bdev_name *res; 524 525 find.name = (char *)bdev_name; 526 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 527 if (res != NULL) { 528 return res->bdev; 529 } 530 531 return NULL; 532 } 533 534 struct spdk_bdev * 535 spdk_bdev_get_by_name(const char *bdev_name) 536 { 537 struct spdk_bdev *bdev; 538 539 spdk_spin_lock(&g_bdev_mgr.spinlock); 540 bdev = bdev_get_by_name(bdev_name); 541 spdk_spin_unlock(&g_bdev_mgr.spinlock); 542 543 return bdev; 544 } 545 546 struct bdev_io_status_string { 547 enum spdk_bdev_io_status status; 548 const char *str; 549 }; 550 551 static const struct bdev_io_status_string bdev_io_status_strings[] = { 552 { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" }, 553 { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" }, 554 { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" }, 555 { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" }, 556 { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" }, 557 { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" }, 558 { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" }, 559 { SPDK_BDEV_IO_STATUS_FAILED, "failed" }, 560 { SPDK_BDEV_IO_STATUS_PENDING, "pending" }, 561 { SPDK_BDEV_IO_STATUS_SUCCESS, "success" }, 562 }; 563 564 static const char * 565 bdev_io_status_get_string(enum spdk_bdev_io_status status) 566 { 567 uint32_t i; 568 569 for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) { 570 if (bdev_io_status_strings[i].status == status) { 571 return bdev_io_status_strings[i].str; 572 } 573 } 574 575 return "reserved"; 576 } 577 578 struct spdk_bdev_wait_for_examine_ctx { 579 struct spdk_poller *poller; 580 spdk_bdev_wait_for_examine_cb cb_fn; 581 void *cb_arg; 582 }; 583 584 static bool bdev_module_all_actions_completed(void); 585 586 static int 587 bdev_wait_for_examine_cb(void *arg) 588 { 589 struct spdk_bdev_wait_for_examine_ctx *ctx = arg; 590 591 if (!bdev_module_all_actions_completed()) { 592 return SPDK_POLLER_IDLE; 593 } 594 595 spdk_poller_unregister(&ctx->poller); 596 
ctx->cb_fn(ctx->cb_arg); 597 free(ctx); 598 599 return SPDK_POLLER_BUSY; 600 } 601 602 int 603 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 604 { 605 struct spdk_bdev_wait_for_examine_ctx *ctx; 606 607 ctx = calloc(1, sizeof(*ctx)); 608 if (ctx == NULL) { 609 return -ENOMEM; 610 } 611 ctx->cb_fn = cb_fn; 612 ctx->cb_arg = cb_arg; 613 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 614 615 return 0; 616 } 617 618 struct spdk_bdev_examine_item { 619 char *name; 620 TAILQ_ENTRY(spdk_bdev_examine_item) link; 621 }; 622 623 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); 624 625 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 626 g_bdev_examine_allowlist); 627 628 static inline bool 629 bdev_examine_allowlist_check(const char *name) 630 { 631 struct spdk_bdev_examine_item *item; 632 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 633 if (strcmp(name, item->name) == 0) { 634 return true; 635 } 636 } 637 return false; 638 } 639 640 static inline void 641 bdev_examine_allowlist_free(void) 642 { 643 struct spdk_bdev_examine_item *item; 644 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 645 item = TAILQ_FIRST(&g_bdev_examine_allowlist); 646 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 647 free(item->name); 648 free(item); 649 } 650 } 651 652 static inline bool 653 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 654 { 655 struct spdk_bdev_alias *tmp; 656 if (bdev_examine_allowlist_check(bdev->name)) { 657 return true; 658 } 659 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 660 if (bdev_examine_allowlist_check(tmp->alias.name)) { 661 return true; 662 } 663 } 664 return false; 665 } 666 667 static inline bool 668 bdev_ok_to_examine(struct spdk_bdev *bdev) 669 { 670 if (g_bdev_opts.bdev_auto_examine) { 671 return true; 672 } else { 673 return bdev_in_examine_allowlist(bdev); 674 } 675 } 676 677 static void 678 bdev_examine(struct spdk_bdev *bdev) 679 { 680 struct spdk_bdev_module *module; 681 struct spdk_bdev_module_claim *claim, *tmpclaim; 682 uint32_t action; 683 684 if (!bdev_ok_to_examine(bdev)) { 685 return; 686 } 687 688 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 689 if (module->examine_config) { 690 spdk_spin_lock(&module->internal.spinlock); 691 action = module->internal.action_in_progress; 692 module->internal.action_in_progress++; 693 spdk_spin_unlock(&module->internal.spinlock); 694 module->examine_config(bdev); 695 if (action != module->internal.action_in_progress) { 696 SPDK_ERRLOG("examine_config for module %s did not call " 697 "spdk_bdev_module_examine_done()\n", module->name); 698 } 699 } 700 } 701 702 spdk_spin_lock(&bdev->internal.spinlock); 703 704 switch (bdev->internal.claim_type) { 705 case SPDK_BDEV_CLAIM_NONE: 706 /* Examine by all bdev modules */ 707 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 708 if (module->examine_disk) { 709 spdk_spin_lock(&module->internal.spinlock); 710 module->internal.action_in_progress++; 711 spdk_spin_unlock(&module->internal.spinlock); 712 spdk_spin_unlock(&bdev->internal.spinlock); 713 module->examine_disk(bdev); 714 spdk_spin_lock(&bdev->internal.spinlock); 715 } 716 } 717 break; 718 case SPDK_BDEV_CLAIM_EXCL_WRITE: 719 /* Examine by the one bdev module with a v1 claim */ 720 module = bdev->internal.claim.v1.module; 721 if (module->examine_disk) { 722 spdk_spin_lock(&module->internal.spinlock); 723 module->internal.action_in_progress++; 724 
spdk_spin_unlock(&module->internal.spinlock); 725 spdk_spin_unlock(&bdev->internal.spinlock); 726 module->examine_disk(bdev); 727 return; 728 } 729 break; 730 default: 731 /* Examine by all bdev modules with a v2 claim */ 732 assert(claim_type_is_v2(bdev->internal.claim_type)); 733 /* 734 * Removal of tailq nodes while iterating can cause the iteration to jump out of the 735 * list, perhaps accessing freed memory. Without protection, this could happen 736 * while the lock is dropped during the examine callback. 737 */ 738 bdev->internal.examine_in_progress++; 739 740 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 741 module = claim->module; 742 743 if (module == NULL) { 744 /* This is a vestigial claim, held by examine_count */ 745 continue; 746 } 747 748 if (module->examine_disk == NULL) { 749 continue; 750 } 751 752 spdk_spin_lock(&module->internal.spinlock); 753 module->internal.action_in_progress++; 754 spdk_spin_unlock(&module->internal.spinlock); 755 756 /* Call examine_disk without holding internal.spinlock. */ 757 spdk_spin_unlock(&bdev->internal.spinlock); 758 module->examine_disk(bdev); 759 spdk_spin_lock(&bdev->internal.spinlock); 760 } 761 762 assert(bdev->internal.examine_in_progress > 0); 763 bdev->internal.examine_in_progress--; 764 if (bdev->internal.examine_in_progress == 0) { 765 /* Remove any claims that were released during examine_disk */ 766 TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) { 767 if (claim->desc != NULL) { 768 continue; 769 } 770 771 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link); 772 free(claim); 773 } 774 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 775 claim_reset(bdev); 776 } 777 } 778 } 779 780 spdk_spin_unlock(&bdev->internal.spinlock); 781 } 782 783 int 784 spdk_bdev_examine(const char *name) 785 { 786 struct spdk_bdev *bdev; 787 struct spdk_bdev_examine_item *item; 788 struct spdk_thread *thread = spdk_get_thread(); 789 790 if (spdk_unlikely(spdk_thread_get_app_thread() != thread)) { 791 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread, 792 thread ? 
spdk_thread_get_name(thread) : "null"); 793 return -EINVAL; 794 } 795 796 if (g_bdev_opts.bdev_auto_examine) { 797 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); 798 return -EINVAL; 799 } 800 801 if (bdev_examine_allowlist_check(name)) { 802 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 803 return -EEXIST; 804 } 805 806 item = calloc(1, sizeof(*item)); 807 if (!item) { 808 return -ENOMEM; 809 } 810 item->name = strdup(name); 811 if (!item->name) { 812 free(item); 813 return -ENOMEM; 814 } 815 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 816 817 bdev = spdk_bdev_get_by_name(name); 818 if (bdev) { 819 bdev_examine(bdev); 820 } 821 return 0; 822 } 823 824 static inline void 825 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 826 { 827 struct spdk_bdev_examine_item *item; 828 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 829 spdk_json_write_object_begin(w); 830 spdk_json_write_named_string(w, "method", "bdev_examine"); 831 spdk_json_write_named_object_begin(w, "params"); 832 spdk_json_write_named_string(w, "name", item->name); 833 spdk_json_write_object_end(w); 834 spdk_json_write_object_end(w); 835 } 836 } 837 838 struct spdk_bdev * 839 spdk_bdev_first(void) 840 { 841 struct spdk_bdev *bdev; 842 843 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 844 if (bdev) { 845 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 846 } 847 848 return bdev; 849 } 850 851 struct spdk_bdev * 852 spdk_bdev_next(struct spdk_bdev *prev) 853 { 854 struct spdk_bdev *bdev; 855 856 bdev = TAILQ_NEXT(prev, internal.link); 857 if (bdev) { 858 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 859 } 860 861 return bdev; 862 } 863 864 static struct spdk_bdev * 865 _bdev_next_leaf(struct spdk_bdev *bdev) 866 { 867 while (bdev != NULL) { 868 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 869 return bdev; 870 } else { 871 bdev = TAILQ_NEXT(bdev, internal.link); 872 } 873 } 874 875 return bdev; 876 } 877 878 struct spdk_bdev * 879 spdk_bdev_first_leaf(void) 880 { 881 struct spdk_bdev *bdev; 882 883 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 884 885 if (bdev) { 886 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 887 } 888 889 return bdev; 890 } 891 892 struct spdk_bdev * 893 spdk_bdev_next_leaf(struct spdk_bdev *prev) 894 { 895 struct spdk_bdev *bdev; 896 897 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 898 899 if (bdev) { 900 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 901 } 902 903 return bdev; 904 } 905 906 static inline bool 907 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 908 { 909 return bdev_io->internal.memory_domain; 910 } 911 912 static inline bool 913 bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io) 914 { 915 return bdev_io->internal.accel_sequence; 916 } 917 918 void 919 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 920 { 921 struct iovec *iovs; 922 923 if (bdev_io->u.bdev.iovs == NULL) { 924 bdev_io->u.bdev.iovs = &bdev_io->iov; 925 bdev_io->u.bdev.iovcnt = 1; 926 } 927 928 iovs = bdev_io->u.bdev.iovs; 929 930 assert(iovs != NULL); 931 assert(bdev_io->u.bdev.iovcnt >= 1); 932 933 iovs[0].iov_base = buf; 934 iovs[0].iov_len = len; 935 } 936 937 void 938 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 939 { 940 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 941 bdev_io->u.bdev.md_buf = md_buf; 942 } 943 944 
static bool 945 _is_buf_allocated(const struct iovec *iovs) 946 { 947 if (iovs == NULL) { 948 return false; 949 } 950 951 return iovs[0].iov_base != NULL; 952 } 953 954 static bool 955 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 956 { 957 int i; 958 uintptr_t iov_base; 959 960 if (spdk_likely(alignment == 1)) { 961 return true; 962 } 963 964 for (i = 0; i < iovcnt; i++) { 965 iov_base = (uintptr_t)iovs[i].iov_base; 966 if ((iov_base & (alignment - 1)) != 0) { 967 return false; 968 } 969 } 970 971 return true; 972 } 973 974 static inline bool 975 bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 976 { 977 if (!bdev_io_use_accel_sequence(bdev_io)) { 978 return false; 979 } 980 981 /* For now, we don't allow splitting IOs with an accel sequence and will treat them as if 982 * bdev module didn't support accel sequences */ 983 return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.split; 984 } 985 986 static void 987 bdev_io_submit_sequence_cb(void *ctx, int status) 988 { 989 struct spdk_bdev_io *bdev_io = ctx; 990 991 bdev_io->u.bdev.accel_sequence = NULL; 992 bdev_io->internal.accel_sequence = NULL; 993 TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 994 995 if (spdk_unlikely(status != 0)) { 996 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 997 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 998 bdev_io_complete_unsubmitted(bdev_io); 999 return; 1000 } 1001 1002 bdev_io_submit(bdev_io); 1003 } 1004 1005 static void 1006 bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, spdk_accel_completion_cb cb_fn) 1007 { 1008 int rc; 1009 1010 assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1011 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1012 1013 /* Since the operations are appended during submission, they're in the opposite order than 1014 * how we want to execute them for reads (i.e. we need to execute the most recently added 1015 * operation first), so reverse the sequence before executing it. 
1016 */ 1017 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1018 spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence); 1019 } 1020 1021 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1022 1023 rc = spdk_accel_sequence_finish(bdev_io->internal.accel_sequence, cb_fn, bdev_io); 1024 if (spdk_unlikely(rc != 0)) { 1025 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", rc); 1026 TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1027 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1028 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1029 /* Writes haven't been submitted at this point yet */ 1030 bdev_io_complete_unsubmitted(bdev_io); 1031 } else { 1032 bdev_io_complete(bdev_io); 1033 } 1034 } 1035 } 1036 1037 static void 1038 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 1039 { 1040 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 1041 void *buf; 1042 1043 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1044 buf = bdev_io->internal.buf; 1045 bdev_io->internal.buf = NULL; 1046 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 1047 bdev_io->internal.get_aux_buf_cb = NULL; 1048 } else { 1049 assert(bdev_io->internal.get_buf_cb != NULL); 1050 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 1051 bdev_io->internal.get_buf_cb = NULL; 1052 } 1053 } 1054 1055 static void 1056 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 1057 { 1058 struct spdk_bdev_io *bdev_io = ctx; 1059 1060 if (rc) { 1061 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 1062 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1063 } 1064 bdev_io_get_buf_complete(bdev_io, !rc); 1065 } 1066 1067 static void 1068 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 1069 { 1070 int rc = 0; 1071 1072 /* save original md_buf */ 1073 bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 1074 bdev_io->internal.orig_md_iov.iov_len = len; 1075 bdev_io->internal.bounce_md_iov.iov_base = md_buf; 1076 bdev_io->internal.bounce_md_iov.iov_len = len; 1077 /* set bounce md_buf */ 1078 bdev_io->u.bdev.md_buf = md_buf; 1079 1080 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1081 if (bdev_io_use_memory_domain(bdev_io)) { 1082 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1083 bdev_io->internal.memory_domain_ctx, 1084 &bdev_io->internal.orig_md_iov, 1, 1085 &bdev_io->internal.bounce_md_iov, 1, 1086 bdev_io->internal.data_transfer_cpl, 1087 bdev_io); 1088 if (rc == 0) { 1089 /* Continue to submit IO in completion callback */ 1090 return; 1091 } 1092 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 1093 spdk_memory_domain_get_dma_device_id(bdev_io->internal.memory_domain), rc); 1094 } else { 1095 memcpy(md_buf, bdev_io->internal.orig_md_iov.iov_base, bdev_io->internal.orig_md_iov.iov_len); 1096 } 1097 } 1098 1099 assert(bdev_io->internal.data_transfer_cpl); 1100 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1101 } 1102 1103 static void 1104 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 1105 { 1106 struct spdk_bdev *bdev = bdev_io->bdev; 1107 uint64_t md_len; 1108 void *buf; 1109 1110 if (spdk_bdev_is_md_separate(bdev)) { 1111 assert(!bdev_io_use_accel_sequence(bdev_io)); 1112 1113 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 1114 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 1115 1116 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 1117 1118 if 
(bdev_io->u.bdev.md_buf != NULL) { 1119 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 1120 return; 1121 } else { 1122 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 1123 } 1124 } 1125 1126 bdev_io_get_buf_complete(bdev_io, true); 1127 } 1128 1129 static void 1130 _bdev_io_pull_bounce_data_buf_done(void *ctx, int rc) 1131 { 1132 struct spdk_bdev_io *bdev_io = ctx; 1133 1134 if (rc) { 1135 SPDK_ERRLOG("Failed to get data buffer\n"); 1136 assert(bdev_io->internal.data_transfer_cpl); 1137 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1138 return; 1139 } 1140 1141 _bdev_io_set_md_buf(bdev_io); 1142 } 1143 1144 static void 1145 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 1146 bdev_copy_bounce_buffer_cpl cpl_cb) 1147 { 1148 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1149 int rc = 0; 1150 1151 bdev_io->internal.data_transfer_cpl = cpl_cb; 1152 /* save original iovec */ 1153 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 1154 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 1155 /* set bounce iov */ 1156 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 1157 bdev_io->u.bdev.iovcnt = 1; 1158 /* set bounce buffer for this operation */ 1159 bdev_io->u.bdev.iovs[0].iov_base = buf; 1160 bdev_io->u.bdev.iovs[0].iov_len = len; 1161 1162 /* If we need to exec an accel sequence, append a copy operation making accel change the 1163 * src/dst buffers of the previous operation */ 1164 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 1165 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1166 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1167 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1168 NULL, NULL, 1169 bdev_io->internal.orig_iovs, 1170 bdev_io->internal.orig_iovcnt, 1171 bdev_io->internal.memory_domain, 1172 bdev_io->internal.memory_domain_ctx, 1173 0, NULL, NULL); 1174 } else { 1175 /* We need to reverse the src/dst for reads */ 1176 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1177 rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel, 1178 bdev_io->internal.orig_iovs, 1179 bdev_io->internal.orig_iovcnt, 1180 bdev_io->internal.memory_domain, 1181 bdev_io->internal.memory_domain_ctx, 1182 bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, 1183 NULL, NULL, 0, NULL, NULL); 1184 } 1185 1186 if (spdk_unlikely(rc != 0)) { 1187 SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n", 1188 bdev_io->internal.accel_sequence); 1189 } 1190 } else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1191 /* if this is write path, copy data from original buffer to bounce buffer */ 1192 if (bdev_io_use_memory_domain(bdev_io)) { 1193 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1194 bdev_io->internal.memory_domain_ctx, 1195 bdev_io->internal.orig_iovs, 1196 (uint32_t) bdev_io->internal.orig_iovcnt, 1197 bdev_io->u.bdev.iovs, 1, 1198 _bdev_io_pull_bounce_data_buf_done, 1199 bdev_io); 1200 if (rc == 0) { 1201 /* Continue to submit IO in completion callback */ 1202 return; 1203 } 1204 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 1205 spdk_memory_domain_get_dma_device_id(bdev_io->internal.memory_domain)); 1206 } else { 1207 spdk_copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt); 1208 } 1209 } 1210 1211 _bdev_io_pull_bounce_data_buf_done(bdev_io, rc); 1212 } 1213 1214 static void 1215 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) 1216 { 1217 struct spdk_bdev *bdev = 
bdev_io->bdev; 1218 bool buf_allocated; 1219 uint64_t alignment; 1220 void *aligned_buf; 1221 1222 bdev_io->internal.buf = buf; 1223 1224 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1225 bdev_io_get_buf_complete(bdev_io, true); 1226 return; 1227 } 1228 1229 alignment = spdk_bdev_get_buf_align(bdev); 1230 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 1231 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 1232 1233 if (buf_allocated) { 1234 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 1235 /* Continue in completion callback */ 1236 return; 1237 } else { 1238 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 1239 } 1240 1241 _bdev_io_set_md_buf(bdev_io); 1242 } 1243 1244 static inline uint64_t 1245 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len) 1246 { 1247 struct spdk_bdev *bdev = bdev_io->bdev; 1248 uint64_t md_len, alignment; 1249 1250 md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1251 1252 /* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */ 1253 alignment = spdk_bdev_get_buf_align(bdev) - 1; 1254 1255 return len + alignment + md_len; 1256 } 1257 1258 static void 1259 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 1260 { 1261 struct spdk_bdev_mgmt_channel *ch; 1262 1263 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1264 spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len)); 1265 } 1266 1267 static void 1268 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 1269 { 1270 assert(bdev_io->internal.buf != NULL); 1271 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len); 1272 bdev_io->internal.buf = NULL; 1273 } 1274 1275 void 1276 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1277 { 1278 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1279 1280 assert(buf != NULL); 1281 _bdev_io_put_buf(bdev_io, buf, len); 1282 } 1283 1284 static inline void 1285 bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch, 1286 struct spdk_bdev_io *bdev_io) 1287 { 1288 /* After a request is submitted to a bdev module, the ownership of an accel sequence 1289 * associated with that bdev_io is transferred to the bdev module. So, clear the internal 1290 * sequence pointer to make sure we won't touch it anymore. */ 1291 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || 1292 bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) { 1293 assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1294 bdev_io->internal.accel_sequence = NULL; 1295 } 1296 1297 bdev->fn_table->submit_request(ioch, bdev_io); 1298 } 1299 1300 static void 1301 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 1302 { 1303 struct spdk_bdev *bdev = bdev_ch->bdev; 1304 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1305 struct spdk_bdev_io *bdev_io; 1306 1307 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 1308 /* 1309 * Allow some more I/O to complete before retrying the nomem_io queue. 1310 * Some drivers (such as nvme) cannot immediately take a new I/O in 1311 * the context of a completion, because the resources for the I/O are 1312 * not released until control returns to the bdev poller. 
Also, we 1313 * may require several small I/O to complete before a larger I/O 1314 * (that requires splitting) can be submitted. 1315 */ 1316 return; 1317 } 1318 1319 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1320 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 1321 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 1322 bdev_io->internal.ch->io_outstanding++; 1323 shared_resource->io_outstanding++; 1324 bdev_io->internal.error.nvme.cdw0 = 0; 1325 bdev_io->num_retries++; 1326 bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 1327 if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) { 1328 /* This IO completed again with NOMEM status, so break the loop and 1329 * don't try anymore. Note that a bdev_io that fails with NOMEM 1330 * always gets requeued at the front of the list, to maintain 1331 * ordering. 1332 */ 1333 break; 1334 } 1335 } 1336 } 1337 1338 static inline void 1339 _bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1340 struct spdk_bdev_shared_resource *shared_resource) 1341 { 1342 assert(bdev_ch->io_outstanding > 0); 1343 assert(shared_resource->io_outstanding > 0); 1344 bdev_ch->io_outstanding--; 1345 shared_resource->io_outstanding--; 1346 } 1347 1348 static inline bool 1349 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io) 1350 { 1351 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1352 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1353 1354 if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) { 1355 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1356 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 1357 /* 1358 * Wait for some of the outstanding I/O to complete before we 1359 * retry any of the nomem_io. Normally we will wait for 1360 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 1361 * depth channels we will instead wait for half to complete. 1362 */ 1363 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 1364 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 1365 /* If bdev module completed an I/O that has an accel sequence with NOMEM status, the 1366 * ownership of that sequence is transferred back to the bdev layer, so we need to 1367 * restore internal.accel_sequence to make sure that the sequence is handled 1368 * correctly in case the I/O is later aborted. */ 1369 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 1370 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) { 1371 assert(bdev_io->internal.accel_sequence == NULL); 1372 bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence; 1373 } 1374 1375 return true; 1376 } 1377 1378 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1379 bdev_ch_retry_io(bdev_ch); 1380 } 1381 1382 return false; 1383 } 1384 1385 static void 1386 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1387 { 1388 struct spdk_bdev_io *bdev_io = ctx; 1389 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1390 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1391 1392 if (rc) { 1393 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1394 } 1395 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1396 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). 
1397 */ 1398 bdev_io_put_buf(bdev_io); 1399 1400 /* Continue with IO completion flow */ 1401 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 1402 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 1403 return; 1404 } 1405 1406 bdev_io_complete(bdev_io); 1407 } 1408 1409 static void 1410 _bdev_io_push_bounce_md_buffer_done(void *ctx, int rc) 1411 { 1412 struct spdk_bdev_io *bdev_io = ctx; 1413 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1414 1415 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1416 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1417 } 1418 1419 static inline void 1420 _bdev_io_push_bounce_md_buffer(struct spdk_bdev_io *bdev_io) 1421 { 1422 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1423 int rc = 0; 1424 1425 /* do the same for metadata buffer */ 1426 if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) { 1427 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1428 1429 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1430 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 1431 if (bdev_io_use_memory_domain(bdev_io)) { 1432 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1433 /* If memory domain is used then we need to call async push function */ 1434 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1435 bdev_io->internal.memory_domain_ctx, 1436 &bdev_io->internal.orig_md_iov, 1437 (uint32_t)bdev_io->internal.orig_iovcnt, 1438 &bdev_io->internal.bounce_md_iov, 1, 1439 _bdev_io_push_bounce_md_buffer_done, 1440 bdev_io); 1441 if (rc == 0) { 1442 /* Continue IO completion in async callback */ 1443 return; 1444 } 1445 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1446 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1447 spdk_memory_domain_get_dma_device_id(bdev_io->internal.memory_domain)); 1448 } else { 1449 memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1450 bdev_io->internal.orig_md_iov.iov_len); 1451 } 1452 } 1453 } 1454 1455 assert(bdev_io->internal.data_transfer_cpl); 1456 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1457 } 1458 1459 static void 1460 _bdev_io_push_bounce_data_buffer_done(void *ctx, int rc) 1461 { 1462 struct spdk_bdev_io *bdev_io = ctx; 1463 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1464 1465 assert(bdev_io->internal.data_transfer_cpl); 1466 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1467 1468 if (rc) { 1469 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1470 return; 1471 } 1472 1473 /* set original buffer for this io */ 1474 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 1475 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 1476 /* disable bouncing buffer for this io */ 1477 bdev_io->internal.orig_iovcnt = 0; 1478 bdev_io->internal.orig_iovs = NULL; 1479 1480 _bdev_io_push_bounce_md_buffer(bdev_io); 1481 } 1482 1483 static inline void 1484 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1485 { 1486 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1487 int rc = 0; 1488 1489 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1490 bdev_io->internal.data_transfer_cpl = cpl_cb; 1491 1492 /* if this is read path, copy data from bounce buffer to original buffer */ 1493 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1494 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 1495 if (bdev_io_use_memory_domain(bdev_io)) { 1496 /* If memory domain is used then we need to call async push function */ 
1497 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1498 bdev_io->internal.memory_domain_ctx, 1499 bdev_io->internal.orig_iovs, 1500 (uint32_t)bdev_io->internal.orig_iovcnt, 1501 &bdev_io->internal.bounce_iov, 1, 1502 _bdev_io_push_bounce_data_buffer_done, 1503 bdev_io); 1504 if (rc == 0) { 1505 /* Continue IO completion in async callback */ 1506 return; 1507 } 1508 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1509 spdk_memory_domain_get_dma_device_id(bdev_io->internal.memory_domain)); 1510 } else { 1511 spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs, 1512 bdev_io->internal.orig_iovcnt, 1513 bdev_io->internal.bounce_iov.iov_base, 1514 bdev_io->internal.bounce_iov.iov_len); 1515 } 1516 } 1517 1518 _bdev_io_push_bounce_data_buffer_done(bdev_io, rc); 1519 } 1520 1521 static void 1522 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1523 { 1524 struct spdk_bdev_io *bdev_io; 1525 1526 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1527 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len); 1528 } 1529 1530 static void 1531 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1532 { 1533 struct spdk_bdev_mgmt_channel *mgmt_ch; 1534 uint64_t max_len; 1535 void *buf; 1536 1537 assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1538 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1539 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1540 1541 if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) { 1542 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1543 bdev_io_get_buf_complete(bdev_io, false); 1544 return; 1545 } 1546 1547 bdev_io->internal.buf_len = len; 1548 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1549 bdev_io_get_iobuf_cb); 1550 if (buf != NULL) { 1551 _bdev_io_set_buf(bdev_io, buf, len); 1552 } 1553 } 1554 1555 void 1556 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1557 { 1558 struct spdk_bdev *bdev = bdev_io->bdev; 1559 uint64_t alignment; 1560 1561 assert(cb != NULL); 1562 bdev_io->internal.get_buf_cb = cb; 1563 1564 alignment = spdk_bdev_get_buf_align(bdev); 1565 1566 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1567 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1568 /* Buffer already present and aligned */ 1569 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1570 return; 1571 } 1572 1573 bdev_io_get_buf(bdev_io, len); 1574 } 1575 1576 static void 1577 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1578 bool success) 1579 { 1580 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1581 1582 TAILQ_REMOVE(&bdev_ch->io_memory_domain, bdev_io, internal.link); 1583 1584 if (!success) { 1585 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1586 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1587 bdev_io_complete_unsubmitted(bdev_io); 1588 return; 1589 } 1590 1591 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 1592 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1593 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 1594 return; 1595 } 1596 /* For reads we'll execute the sequence after the data is read, so, for now, only 1597 * clear out accel_sequence pointer and submit the IO */ 1598 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1599 bdev_io->u.bdev.accel_sequence = NULL; 1600 } 1601 1602 bdev_io_submit(bdev_io); 1603 } 1604 1605 static void 
1606 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1607 uint64_t len) 1608 { 1609 assert(cb != NULL); 1610 bdev_io->internal.get_buf_cb = cb; 1611 1612 bdev_io_get_buf(bdev_io, len); 1613 } 1614 1615 void 1616 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1617 { 1618 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1619 1620 assert(cb != NULL); 1621 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1622 bdev_io->internal.get_aux_buf_cb = cb; 1623 bdev_io_get_buf(bdev_io, len); 1624 } 1625 1626 static int 1627 bdev_module_get_max_ctx_size(void) 1628 { 1629 struct spdk_bdev_module *bdev_module; 1630 int max_bdev_module_size = 0; 1631 1632 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1633 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1634 max_bdev_module_size = bdev_module->get_ctx_size(); 1635 } 1636 } 1637 1638 return max_bdev_module_size; 1639 } 1640 1641 static void 1642 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1643 { 1644 int i; 1645 struct spdk_bdev_qos *qos = bdev->internal.qos; 1646 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1647 1648 if (!qos) { 1649 return; 1650 } 1651 1652 spdk_bdev_get_qos_rate_limits(bdev, limits); 1653 1654 spdk_json_write_object_begin(w); 1655 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1656 1657 spdk_json_write_named_object_begin(w, "params"); 1658 spdk_json_write_named_string(w, "name", bdev->name); 1659 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1660 if (limits[i] > 0) { 1661 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1662 } 1663 } 1664 spdk_json_write_object_end(w); 1665 1666 spdk_json_write_object_end(w); 1667 } 1668 1669 void 1670 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1671 { 1672 struct spdk_bdev_module *bdev_module; 1673 struct spdk_bdev *bdev; 1674 1675 assert(w != NULL); 1676 1677 spdk_json_write_array_begin(w); 1678 1679 spdk_json_write_object_begin(w); 1680 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1681 spdk_json_write_named_object_begin(w, "params"); 1682 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1683 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1684 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1685 spdk_json_write_object_end(w); 1686 spdk_json_write_object_end(w); 1687 1688 bdev_examine_allowlist_config_json(w); 1689 1690 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1691 if (bdev_module->config_json) { 1692 bdev_module->config_json(w); 1693 } 1694 } 1695 1696 spdk_spin_lock(&g_bdev_mgr.spinlock); 1697 1698 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1699 if (bdev->fn_table->write_config_json) { 1700 bdev->fn_table->write_config_json(bdev, w); 1701 } 1702 1703 bdev_qos_config_json(bdev, w); 1704 } 1705 1706 spdk_spin_unlock(&g_bdev_mgr.spinlock); 1707 1708 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1709 spdk_json_write_object_begin(w); 1710 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1711 spdk_json_write_object_end(w); 1712 1713 spdk_json_write_array_end(w); 1714 } 1715 1716 static void 1717 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1718 { 1719 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1720 struct 
spdk_bdev_io *bdev_io; 1721 1722 spdk_iobuf_channel_fini(&ch->iobuf); 1723 1724 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1725 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1726 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1727 ch->per_thread_cache_count--; 1728 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1729 } 1730 1731 assert(ch->per_thread_cache_count == 0); 1732 } 1733 1734 static int 1735 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1736 { 1737 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1738 struct spdk_bdev_io *bdev_io; 1739 uint32_t i; 1740 int rc; 1741 1742 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", BUF_SMALL_CACHE_SIZE, BUF_LARGE_CACHE_SIZE); 1743 if (rc != 0) { 1744 SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 1745 return -1; 1746 } 1747 1748 STAILQ_INIT(&ch->per_thread_cache); 1749 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 1750 1751 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 1752 ch->per_thread_cache_count = 0; 1753 for (i = 0; i < ch->bdev_io_cache_size; i++) { 1754 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1755 if (bdev_io == NULL) { 1756 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 1757 assert(false); 1758 bdev_mgmt_channel_destroy(io_device, ctx_buf); 1759 return -1; 1760 } 1761 ch->per_thread_cache_count++; 1762 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1763 } 1764 1765 TAILQ_INIT(&ch->shared_resources); 1766 TAILQ_INIT(&ch->io_wait_queue); 1767 1768 return 0; 1769 } 1770 1771 static void 1772 bdev_init_complete(int rc) 1773 { 1774 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 1775 void *cb_arg = g_init_cb_arg; 1776 struct spdk_bdev_module *m; 1777 1778 g_bdev_mgr.init_complete = true; 1779 g_init_cb_fn = NULL; 1780 g_init_cb_arg = NULL; 1781 1782 /* 1783 * For modules that need to know when subsystem init is complete, 1784 * inform them now. 1785 */ 1786 if (rc == 0) { 1787 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1788 if (m->init_complete) { 1789 m->init_complete(); 1790 } 1791 } 1792 } 1793 1794 cb_fn(cb_arg, rc); 1795 } 1796 1797 static bool 1798 bdev_module_all_actions_completed(void) 1799 { 1800 struct spdk_bdev_module *m; 1801 1802 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1803 if (m->internal.action_in_progress > 0) { 1804 return false; 1805 } 1806 } 1807 return true; 1808 } 1809 1810 static void 1811 bdev_module_action_complete(void) 1812 { 1813 /* 1814 * Don't finish bdev subsystem initialization if 1815 * module pre-initialization is still in progress, or 1816 * the subsystem been already initialized. 1817 */ 1818 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 1819 return; 1820 } 1821 1822 /* 1823 * Check all bdev modules for inits/examinations in progress. If any 1824 * exist, return immediately since we cannot finish bdev subsystem 1825 * initialization until all are completed. 1826 */ 1827 if (!bdev_module_all_actions_completed()) { 1828 return; 1829 } 1830 1831 /* 1832 * Modules already finished initialization - now that all 1833 * the bdev modules have finished their asynchronous I/O 1834 * processing, the entire bdev layer can be marked as complete. 
1835 */ 1836 bdev_init_complete(0); 1837 } 1838 1839 static void 1840 bdev_module_action_done(struct spdk_bdev_module *module) 1841 { 1842 spdk_spin_lock(&module->internal.spinlock); 1843 assert(module->internal.action_in_progress > 0); 1844 module->internal.action_in_progress--; 1845 spdk_spin_unlock(&module->internal.spinlock); 1846 bdev_module_action_complete(); 1847 } 1848 1849 void 1850 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 1851 { 1852 assert(module->async_init); 1853 bdev_module_action_done(module); 1854 } 1855 1856 void 1857 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 1858 { 1859 bdev_module_action_done(module); 1860 } 1861 1862 /** The last initialized bdev module */ 1863 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 1864 1865 static void 1866 bdev_init_failed(void *cb_arg) 1867 { 1868 struct spdk_bdev_module *module = cb_arg; 1869 1870 spdk_spin_lock(&module->internal.spinlock); 1871 assert(module->internal.action_in_progress > 0); 1872 module->internal.action_in_progress--; 1873 spdk_spin_unlock(&module->internal.spinlock); 1874 bdev_init_complete(-1); 1875 } 1876 1877 static int 1878 bdev_modules_init(void) 1879 { 1880 struct spdk_bdev_module *module; 1881 int rc = 0; 1882 1883 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1884 g_resume_bdev_module = module; 1885 if (module->async_init) { 1886 spdk_spin_lock(&module->internal.spinlock); 1887 module->internal.action_in_progress = 1; 1888 spdk_spin_unlock(&module->internal.spinlock); 1889 } 1890 rc = module->module_init(); 1891 if (rc != 0) { 1892 /* Bump action_in_progress to prevent other modules from completion of modules_init 1893 * Send message to defer application shutdown until resources are cleaned up */ 1894 spdk_spin_lock(&module->internal.spinlock); 1895 module->internal.action_in_progress = 1; 1896 spdk_spin_unlock(&module->internal.spinlock); 1897 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 1898 return rc; 1899 } 1900 } 1901 1902 g_resume_bdev_module = NULL; 1903 return 0; 1904 } 1905 1906 void 1907 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 1908 { 1909 int rc = 0; 1910 char mempool_name[32]; 1911 1912 assert(cb_fn != NULL); 1913 1914 g_init_cb_fn = cb_fn; 1915 g_init_cb_arg = cb_arg; 1916 1917 spdk_notify_type_register("bdev_register"); 1918 spdk_notify_type_register("bdev_unregister"); 1919 1920 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 1921 1922 rc = spdk_iobuf_register_module("bdev"); 1923 if (rc != 0) { 1924 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 1925 bdev_init_complete(-1); 1926 return; 1927 } 1928 1929 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 1930 g_bdev_opts.bdev_io_pool_size, 1931 sizeof(struct spdk_bdev_io) + 1932 bdev_module_get_max_ctx_size(), 1933 0, 1934 SPDK_ENV_SOCKET_ID_ANY); 1935 1936 if (g_bdev_mgr.bdev_io_pool == NULL) { 1937 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 1938 bdev_init_complete(-1); 1939 return; 1940 } 1941 1942 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 1943 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 1944 if (!g_bdev_mgr.zero_buffer) { 1945 SPDK_ERRLOG("create bdev zero buffer failed\n"); 1946 bdev_init_complete(-1); 1947 return; 1948 } 1949 1950 #ifdef SPDK_CONFIG_VTUNE 1951 SPDK_LOG_DEPRECATED(vtune_support); 1952 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 1953 #endif 1954 1955 spdk_io_device_register(&g_bdev_mgr, 
bdev_mgmt_channel_create, 1956 bdev_mgmt_channel_destroy, 1957 sizeof(struct spdk_bdev_mgmt_channel), 1958 "bdev_mgr"); 1959 1960 rc = bdev_modules_init(); 1961 g_bdev_mgr.module_init_complete = true; 1962 if (rc != 0) { 1963 SPDK_ERRLOG("bdev modules init failed\n"); 1964 return; 1965 } 1966 1967 bdev_module_action_complete(); 1968 } 1969 1970 static void 1971 bdev_mgr_unregister_cb(void *io_device) 1972 { 1973 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 1974 1975 if (g_bdev_mgr.bdev_io_pool) { 1976 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 1977 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 1978 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 1979 g_bdev_opts.bdev_io_pool_size); 1980 } 1981 1982 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 1983 } 1984 1985 spdk_free(g_bdev_mgr.zero_buffer); 1986 1987 bdev_examine_allowlist_free(); 1988 1989 cb_fn(g_fini_cb_arg); 1990 g_fini_cb_fn = NULL; 1991 g_fini_cb_arg = NULL; 1992 g_bdev_mgr.init_complete = false; 1993 g_bdev_mgr.module_init_complete = false; 1994 } 1995 1996 static void 1997 bdev_module_fini_iter(void *arg) 1998 { 1999 struct spdk_bdev_module *bdev_module; 2000 2001 /* FIXME: Handling initialization failures is broken now, 2002 * so we won't even try cleaning up after successfully 2003 * initialized modules. if module_init_complete is false, 2004 * just call spdk_bdev_mgr_unregister_cb 2005 */ 2006 if (!g_bdev_mgr.module_init_complete) { 2007 bdev_mgr_unregister_cb(NULL); 2008 return; 2009 } 2010 2011 /* Start iterating from the last touched module */ 2012 if (!g_resume_bdev_module) { 2013 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2014 } else { 2015 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2016 internal.tailq); 2017 } 2018 2019 while (bdev_module) { 2020 if (bdev_module->async_fini) { 2021 /* Save our place so we can resume later. We must 2022 * save the variable here, before calling module_fini() 2023 * below, because in some cases the module may immediately 2024 * call spdk_bdev_module_fini_done() and re-enter 2025 * this function to continue iterating. */ 2026 g_resume_bdev_module = bdev_module; 2027 } 2028 2029 if (bdev_module->module_fini) { 2030 bdev_module->module_fini(); 2031 } 2032 2033 if (bdev_module->async_fini) { 2034 return; 2035 } 2036 2037 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2038 internal.tailq); 2039 } 2040 2041 g_resume_bdev_module = NULL; 2042 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2043 } 2044 2045 void 2046 spdk_bdev_module_fini_done(void) 2047 { 2048 if (spdk_get_thread() != g_fini_thread) { 2049 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2050 } else { 2051 bdev_module_fini_iter(NULL); 2052 } 2053 } 2054 2055 static void 2056 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2057 { 2058 struct spdk_bdev *bdev = cb_arg; 2059 2060 if (bdeverrno && bdev) { 2061 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2062 bdev->name); 2063 2064 /* 2065 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2066 * bdev; try to continue by manually removing this bdev from the list and continue 2067 * with the next bdev in the list. 
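* The bdev structure itself is simply left allocated in that case; continuing the
* shutdown is preferable to blocking spdk_bdev_finish() forever on one misbehaving bdev.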
2068 */ 2069 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2070 } 2071 2072 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2073 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2074 /* 2075 * Bdev module finish need to be deferred as we might be in the middle of some context 2076 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2077 * after returning. 2078 */ 2079 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2080 return; 2081 } 2082 2083 /* 2084 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 2085 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2086 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2087 * base bdevs. 2088 * 2089 * Also, walk the list in the reverse order. 2090 */ 2091 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2092 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2093 spdk_spin_lock(&bdev->internal.spinlock); 2094 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2095 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2096 spdk_spin_unlock(&bdev->internal.spinlock); 2097 continue; 2098 } 2099 spdk_spin_unlock(&bdev->internal.spinlock); 2100 2101 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2102 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2103 return; 2104 } 2105 2106 /* 2107 * If any bdev fails to unclaim underlying bdev properly, we may face the 2108 * case of bdev list consisting of claimed bdevs only (if claims are managed 2109 * correctly, this would mean there's a loop in the claims graph which is 2110 * clearly impossible). Warn and unregister last bdev on the list then. 2111 */ 2112 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2113 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2114 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2115 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2116 return; 2117 } 2118 } 2119 2120 static void 2121 bdev_module_fini_start_iter(void *arg) 2122 { 2123 struct spdk_bdev_module *bdev_module; 2124 2125 if (!g_resume_bdev_module) { 2126 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2127 } else { 2128 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2129 } 2130 2131 while (bdev_module) { 2132 if (bdev_module->async_fini_start) { 2133 /* Save our place so we can resume later. We must 2134 * save the variable here, before calling fini_start() 2135 * below, because in some cases the module may immediately 2136 * call spdk_bdev_module_fini_start_done() and re-enter 2137 * this function to continue iterating. 
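* When that happens, the next pass resumes from TAILQ_PREV(g_resume_bdev_module),
* so no module has its fini_start() called twice.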
*/ 2138 g_resume_bdev_module = bdev_module; 2139 } 2140 2141 if (bdev_module->fini_start) { 2142 bdev_module->fini_start(); 2143 } 2144 2145 if (bdev_module->async_fini_start) { 2146 return; 2147 } 2148 2149 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2150 } 2151 2152 g_resume_bdev_module = NULL; 2153 2154 bdev_finish_unregister_bdevs_iter(NULL, 0); 2155 } 2156 2157 void 2158 spdk_bdev_module_fini_start_done(void) 2159 { 2160 if (spdk_get_thread() != g_fini_thread) { 2161 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2162 } else { 2163 bdev_module_fini_start_iter(NULL); 2164 } 2165 } 2166 2167 static void 2168 bdev_finish_wait_for_examine_done(void *cb_arg) 2169 { 2170 bdev_module_fini_start_iter(NULL); 2171 } 2172 2173 void 2174 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2175 { 2176 int rc; 2177 2178 assert(cb_fn != NULL); 2179 2180 g_fini_thread = spdk_get_thread(); 2181 2182 g_fini_cb_fn = cb_fn; 2183 g_fini_cb_arg = cb_arg; 2184 2185 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2186 if (rc != 0) { 2187 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2188 bdev_finish_wait_for_examine_done(NULL); 2189 } 2190 } 2191 2192 struct spdk_bdev_io * 2193 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2194 { 2195 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2196 struct spdk_bdev_io *bdev_io; 2197 2198 if (ch->per_thread_cache_count > 0) { 2199 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2200 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2201 ch->per_thread_cache_count--; 2202 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2203 /* 2204 * Don't try to look for bdev_ios in the global pool if there are 2205 * waiters on bdev_ios - we don't want this caller to jump the line. 2206 */ 2207 bdev_io = NULL; 2208 } else { 2209 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2210 } 2211 2212 return bdev_io; 2213 } 2214 2215 void 2216 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2217 { 2218 struct spdk_bdev_mgmt_channel *ch; 2219 2220 assert(bdev_io != NULL); 2221 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2222 2223 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2224 2225 if (bdev_io->internal.buf != NULL) { 2226 bdev_io_put_buf(bdev_io); 2227 } 2228 2229 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2230 ch->per_thread_cache_count++; 2231 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2232 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2233 struct spdk_bdev_io_wait_entry *entry; 2234 2235 entry = TAILQ_FIRST(&ch->io_wait_queue); 2236 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2237 entry->cb_fn(entry->cb_arg); 2238 } 2239 } else { 2240 /* We should never have a full cache with entries on the io wait queue. 
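* Callers only queue a wait entry after bdev_channel_get_io() fails, which requires an
* empty per-thread cache, and the branch above gives waiters the first chance at every
* bdev_io returned to a non-full cache, so the cache cannot fill up while waiters remain.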
*/ 2241 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2242 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2243 } 2244 } 2245 2246 static bool 2247 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2248 { 2249 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2250 2251 switch (limit) { 2252 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2253 return true; 2254 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2255 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2256 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2257 return false; 2258 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2259 default: 2260 return false; 2261 } 2262 } 2263 2264 static bool 2265 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2266 { 2267 switch (bdev_io->type) { 2268 case SPDK_BDEV_IO_TYPE_NVME_IO: 2269 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2270 case SPDK_BDEV_IO_TYPE_READ: 2271 case SPDK_BDEV_IO_TYPE_WRITE: 2272 return true; 2273 case SPDK_BDEV_IO_TYPE_ZCOPY: 2274 if (bdev_io->u.bdev.zcopy.start) { 2275 return true; 2276 } else { 2277 return false; 2278 } 2279 default: 2280 return false; 2281 } 2282 } 2283 2284 static bool 2285 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2286 { 2287 switch (bdev_io->type) { 2288 case SPDK_BDEV_IO_TYPE_NVME_IO: 2289 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2290 /* Bit 1 (0x2) set for read operation */ 2291 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2292 return true; 2293 } else { 2294 return false; 2295 } 2296 case SPDK_BDEV_IO_TYPE_READ: 2297 return true; 2298 case SPDK_BDEV_IO_TYPE_ZCOPY: 2299 /* Populate to read from disk */ 2300 if (bdev_io->u.bdev.zcopy.populate) { 2301 return true; 2302 } else { 2303 return false; 2304 } 2305 default: 2306 return false; 2307 } 2308 } 2309 2310 static uint64_t 2311 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2312 { 2313 struct spdk_bdev *bdev = bdev_io->bdev; 2314 2315 switch (bdev_io->type) { 2316 case SPDK_BDEV_IO_TYPE_NVME_IO: 2317 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2318 return bdev_io->u.nvme_passthru.nbytes; 2319 case SPDK_BDEV_IO_TYPE_READ: 2320 case SPDK_BDEV_IO_TYPE_WRITE: 2321 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2322 case SPDK_BDEV_IO_TYPE_ZCOPY: 2323 /* Track the data in the start phase only */ 2324 if (bdev_io->u.bdev.zcopy.start) { 2325 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2326 } else { 2327 return 0; 2328 } 2329 default: 2330 return 0; 2331 } 2332 } 2333 2334 static bool 2335 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2336 { 2337 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 2338 return true; 2339 } else { 2340 return false; 2341 } 2342 } 2343 2344 static bool 2345 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2346 { 2347 if (bdev_is_read_io(io) == false) { 2348 return false; 2349 } 2350 2351 return bdev_qos_rw_queue_io(limit, io); 2352 } 2353 2354 static bool 2355 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2356 { 2357 if (bdev_is_read_io(io) == true) { 2358 return false; 2359 } 2360 2361 return bdev_qos_rw_queue_io(limit, io); 2362 } 2363 2364 static void 2365 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2366 { 2367 limit->remaining_this_timeslice--; 2368 } 2369 2370 static void 2371 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2372 { 2373 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 2374 } 2375 2376 static void 2377 
bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2378 { 2379 if (bdev_is_read_io(io) == false) { 2380 return; 2381 } 2382 2383 return bdev_qos_rw_bps_update_quota(limit, io); 2384 } 2385 2386 static void 2387 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2388 { 2389 if (bdev_is_read_io(io) == true) { 2390 return; 2391 } 2392 2393 return bdev_qos_rw_bps_update_quota(limit, io); 2394 } 2395 2396 static void 2397 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2398 { 2399 int i; 2400 2401 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2402 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2403 qos->rate_limits[i].queue_io = NULL; 2404 qos->rate_limits[i].update_quota = NULL; 2405 continue; 2406 } 2407 2408 switch (i) { 2409 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2410 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2411 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2412 break; 2413 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2414 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2415 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2416 break; 2417 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2418 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2419 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2420 break; 2421 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2422 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2423 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2424 break; 2425 default: 2426 break; 2427 } 2428 } 2429 } 2430 2431 static void 2432 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2433 struct spdk_bdev_io *bdev_io, 2434 enum spdk_bdev_io_status status) 2435 { 2436 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2437 2438 bdev_io->internal.in_submit_request = true; 2439 bdev_ch->io_outstanding++; 2440 shared_resource->io_outstanding++; 2441 spdk_bdev_io_complete(bdev_io, status); 2442 bdev_io->internal.in_submit_request = false; 2443 } 2444 2445 static inline void 2446 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2447 { 2448 struct spdk_bdev *bdev = bdev_io->bdev; 2449 struct spdk_io_channel *ch = bdev_ch->channel; 2450 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2451 2452 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2453 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2454 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2455 2456 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2457 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2458 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2459 SPDK_BDEV_IO_STATUS_SUCCESS); 2460 return; 2461 } 2462 } 2463 2464 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2465 bdev_io->bdev->split_on_write_unit && 2466 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2467 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2468 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2469 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2470 return; 2471 } 2472 2473 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2474 bdev_ch->io_outstanding++; 2475 shared_resource->io_outstanding++; 2476 bdev_io->internal.in_submit_request = true; 2477 bdev_submit_request(bdev, ch, bdev_io); 2478 
bdev_io->internal.in_submit_request = false; 2479 } else { 2480 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 2481 } 2482 } 2483 2484 static bool 2485 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2486 { 2487 int i; 2488 2489 if (bdev_qos_io_to_limit(bdev_io) == true) { 2490 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2491 if (!qos->rate_limits[i].queue_io) { 2492 continue; 2493 } 2494 2495 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2496 bdev_io) == true) { 2497 return true; 2498 } 2499 } 2500 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2501 if (!qos->rate_limits[i].update_quota) { 2502 continue; 2503 } 2504 2505 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2506 } 2507 } 2508 2509 return false; 2510 } 2511 2512 static inline void 2513 _bdev_io_do_submit(void *ctx) 2514 { 2515 struct spdk_bdev_io *bdev_io = ctx; 2516 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2517 2518 bdev_io_do_submit(ch, bdev_io); 2519 } 2520 2521 static int 2522 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2523 { 2524 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2525 int submitted_ios = 0; 2526 2527 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2528 if (!bdev_qos_queue_io(qos, bdev_io)) { 2529 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2530 2531 if (bdev_io->internal.io_submit_ch) { 2532 /* Send back the IO to the original thread for the actual processing. */ 2533 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 2534 bdev_io->internal.io_submit_ch = NULL; 2535 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 2536 _bdev_io_do_submit, bdev_io); 2537 } else { 2538 bdev_io_do_submit(ch, bdev_io); 2539 } 2540 2541 submitted_ios++; 2542 } 2543 } 2544 2545 return submitted_ios; 2546 } 2547 2548 static void 2549 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2550 { 2551 int rc; 2552 2553 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2554 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2555 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2556 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2557 &bdev_io->internal.waitq_entry); 2558 if (rc != 0) { 2559 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2560 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2561 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2562 } 2563 } 2564 2565 static bool 2566 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2567 { 2568 uint32_t io_boundary; 2569 struct spdk_bdev *bdev = bdev_io->bdev; 2570 uint32_t max_size = bdev->max_segment_size; 2571 int max_segs = bdev->max_num_segments; 2572 2573 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2574 io_boundary = bdev->write_unit_size; 2575 } else if (bdev->split_on_optimal_io_boundary) { 2576 io_boundary = bdev->optimal_io_boundary; 2577 } else { 2578 io_boundary = 0; 2579 } 2580 2581 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2582 return false; 2583 } 2584 2585 if (io_boundary) { 2586 uint64_t start_stripe, end_stripe; 2587 2588 start_stripe = bdev_io->u.bdev.offset_blocks; 2589 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2590 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
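* For example, with io_boundary = 8 blocks, offset_blocks = 6, and num_blocks = 4,
* start_stripe = 6 >> 3 = 0 and end_stripe = 9 >> 3 = 1, so the stripes differ and
* the I/O must be split.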
*/ 2591 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2592 start_stripe >>= spdk_u32log2(io_boundary); 2593 end_stripe >>= spdk_u32log2(io_boundary); 2594 } else { 2595 start_stripe /= io_boundary; 2596 end_stripe /= io_boundary; 2597 } 2598 2599 if (start_stripe != end_stripe) { 2600 return true; 2601 } 2602 } 2603 2604 if (max_segs) { 2605 if (bdev_io->u.bdev.iovcnt > max_segs) { 2606 return true; 2607 } 2608 } 2609 2610 if (max_size) { 2611 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2612 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2613 return true; 2614 } 2615 } 2616 } 2617 2618 return false; 2619 } 2620 2621 static bool 2622 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2623 { 2624 uint32_t num_unmap_segments; 2625 2626 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2627 return false; 2628 } 2629 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2630 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2631 return true; 2632 } 2633 2634 return false; 2635 } 2636 2637 static bool 2638 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2639 { 2640 if (!bdev_io->bdev->max_write_zeroes) { 2641 return false; 2642 } 2643 2644 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2645 return true; 2646 } 2647 2648 return false; 2649 } 2650 2651 static bool 2652 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2653 { 2654 if (bdev_io->bdev->max_copy != 0 && 2655 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2656 return true; 2657 } 2658 2659 return false; 2660 } 2661 2662 static bool 2663 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2664 { 2665 switch (bdev_io->type) { 2666 case SPDK_BDEV_IO_TYPE_READ: 2667 case SPDK_BDEV_IO_TYPE_WRITE: 2668 return bdev_rw_should_split(bdev_io); 2669 case SPDK_BDEV_IO_TYPE_UNMAP: 2670 return bdev_unmap_should_split(bdev_io); 2671 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2672 return bdev_write_zeroes_should_split(bdev_io); 2673 case SPDK_BDEV_IO_TYPE_COPY: 2674 return bdev_copy_should_split(bdev_io); 2675 default: 2676 return false; 2677 } 2678 } 2679 2680 static uint32_t 2681 _to_next_boundary(uint64_t offset, uint32_t boundary) 2682 { 2683 return (boundary - (offset % boundary)); 2684 } 2685 2686 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2687 2688 static void _bdev_rw_split(void *_bdev_io); 2689 2690 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2691 2692 static void 2693 _bdev_unmap_split(void *_bdev_io) 2694 { 2695 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2696 } 2697 2698 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2699 2700 static void 2701 _bdev_write_zeroes_split(void *_bdev_io) 2702 { 2703 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2704 } 2705 2706 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 2707 2708 static void 2709 _bdev_copy_split(void *_bdev_io) 2710 { 2711 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 2712 } 2713 2714 static int 2715 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2716 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2717 { 2718 int rc; 2719 uint64_t current_offset, current_remaining, current_src_offset; 2720 spdk_bdev_io_wait_cb io_wait_fn; 2721 2722 current_offset = *offset; 2723 current_remaining = *remaining; 2724 2725 bdev_io->u.bdev.split_outstanding++; 2726 2727 io_wait_fn = 
_bdev_rw_split; 2728 switch (bdev_io->type) { 2729 case SPDK_BDEV_IO_TYPE_READ: 2730 assert(bdev_io->u.bdev.accel_sequence == NULL); 2731 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2732 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2733 iov, iovcnt, md_buf, current_offset, 2734 num_blocks, bdev_io->internal.memory_domain, 2735 bdev_io->internal.memory_domain_ctx, NULL, 2736 bdev_io_split_done, bdev_io); 2737 break; 2738 case SPDK_BDEV_IO_TYPE_WRITE: 2739 assert(bdev_io->u.bdev.accel_sequence == NULL); 2740 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2741 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2742 iov, iovcnt, md_buf, current_offset, 2743 num_blocks, bdev_io->internal.memory_domain, 2744 bdev_io->internal.memory_domain_ctx, NULL, 2745 bdev_io_split_done, bdev_io); 2746 break; 2747 case SPDK_BDEV_IO_TYPE_UNMAP: 2748 io_wait_fn = _bdev_unmap_split; 2749 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2750 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2751 current_offset, num_blocks, 2752 bdev_io_split_done, bdev_io); 2753 break; 2754 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2755 io_wait_fn = _bdev_write_zeroes_split; 2756 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2757 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2758 current_offset, num_blocks, 2759 bdev_io_split_done, bdev_io); 2760 break; 2761 case SPDK_BDEV_IO_TYPE_COPY: 2762 io_wait_fn = _bdev_copy_split; 2763 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 2764 (current_offset - bdev_io->u.bdev.offset_blocks); 2765 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 2766 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2767 current_offset, current_src_offset, num_blocks, 2768 bdev_io_split_done, bdev_io); 2769 break; 2770 default: 2771 assert(false); 2772 rc = -EINVAL; 2773 break; 2774 } 2775 2776 if (rc == 0) { 2777 current_offset += num_blocks; 2778 current_remaining -= num_blocks; 2779 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2780 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2781 *offset = current_offset; 2782 *remaining = current_remaining; 2783 } else { 2784 bdev_io->u.bdev.split_outstanding--; 2785 if (rc == -ENOMEM) { 2786 if (bdev_io->u.bdev.split_outstanding == 0) { 2787 /* No I/O is outstanding. Hence we should wait here. */ 2788 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2789 } 2790 } else { 2791 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2792 if (bdev_io->u.bdev.split_outstanding == 0) { 2793 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2794 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2795 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2796 } 2797 } 2798 } 2799 2800 return rc; 2801 } 2802 2803 static void 2804 _bdev_rw_split(void *_bdev_io) 2805 { 2806 struct iovec *parent_iov, *iov; 2807 struct spdk_bdev_io *bdev_io = _bdev_io; 2808 struct spdk_bdev *bdev = bdev_io->bdev; 2809 uint64_t parent_offset, current_offset, remaining; 2810 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2811 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2812 uint32_t iovcnt, iov_len, child_iovsize; 2813 uint32_t blocklen = bdev->blocklen; 2814 uint32_t io_boundary; 2815 uint32_t max_segment_size = bdev->max_segment_size; 2816 uint32_t max_child_iovcnt = bdev->max_num_segments; 2817 void *md_buf = NULL; 2818 int rc; 2819 2820 max_segment_size = max_segment_size ? 
max_segment_size : UINT32_MAX; 2821 max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 2822 SPDK_BDEV_IO_NUM_CHILD_IOV; 2823 2824 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2825 io_boundary = bdev->write_unit_size; 2826 } else if (bdev->split_on_optimal_io_boundary) { 2827 io_boundary = bdev->optimal_io_boundary; 2828 } else { 2829 io_boundary = UINT32_MAX; 2830 } 2831 2832 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2833 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 2834 parent_offset = bdev_io->u.bdev.offset_blocks; 2835 parent_iov_offset = (current_offset - parent_offset) * blocklen; 2836 parent_iovcnt = bdev_io->u.bdev.iovcnt; 2837 2838 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 2839 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2840 if (parent_iov_offset < parent_iov->iov_len) { 2841 break; 2842 } 2843 parent_iov_offset -= parent_iov->iov_len; 2844 } 2845 2846 child_iovcnt = 0; 2847 while (remaining > 0 && parent_iovpos < parent_iovcnt && 2848 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 2849 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 2850 to_next_boundary = spdk_min(remaining, to_next_boundary); 2851 to_next_boundary_bytes = to_next_boundary * blocklen; 2852 2853 iov = &bdev_io->child_iov[child_iovcnt]; 2854 iovcnt = 0; 2855 2856 if (bdev_io->u.bdev.md_buf) { 2857 md_buf = (char *)bdev_io->u.bdev.md_buf + 2858 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 2859 } 2860 2861 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 2862 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 2863 iovcnt < child_iovsize) { 2864 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2865 iov_len = parent_iov->iov_len - parent_iov_offset; 2866 2867 iov_len = spdk_min(iov_len, max_segment_size); 2868 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 2869 to_next_boundary_bytes -= iov_len; 2870 2871 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 2872 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 2873 2874 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 2875 parent_iov_offset += iov_len; 2876 } else { 2877 parent_iovpos++; 2878 parent_iov_offset = 0; 2879 } 2880 child_iovcnt++; 2881 iovcnt++; 2882 } 2883 2884 if (to_next_boundary_bytes > 0) { 2885 /* We had to stop this child I/O early because we ran out of 2886 * child_iov space or were limited by max_num_segments. 2887 * Ensure the iovs to be aligned with block size and 2888 * then adjust to_next_boundary before starting the 2889 * child I/O. 
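* For example, with a 512-byte blocklen, if only 1636 bytes of an 8-block (4096-byte)
* stripe fit into the child iovs, the trailing 100 bytes are trimmed back off the last
* iov below so this child covers exactly 3 blocks, and the remaining 5 blocks are left
* for the next child I/O.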
2890 */ 2891 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || 2892 iovcnt == child_iovsize); 2893 to_last_block_bytes = to_next_boundary_bytes % blocklen; 2894 if (to_last_block_bytes != 0) { 2895 uint32_t child_iovpos = child_iovcnt - 1; 2896 /* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV 2897 * so the loop will naturally end 2898 */ 2899 2900 to_last_block_bytes = blocklen - to_last_block_bytes; 2901 to_next_boundary_bytes += to_last_block_bytes; 2902 while (to_last_block_bytes > 0 && iovcnt > 0) { 2903 iov_len = spdk_min(to_last_block_bytes, 2904 bdev_io->child_iov[child_iovpos].iov_len); 2905 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 2906 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 2907 child_iovpos--; 2908 if (--iovcnt == 0) { 2909 /* If the child IO is less than a block size just return. 2910 * If the first child IO of any split round is less than 2911 * a block size, exit with an error. 2912 */ 2913 if (bdev_io->u.bdev.split_outstanding == 0) { 2914 SPDK_ERRLOG("The first child io was less than a block size\n"); 2915 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2916 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2917 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2918 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2919 } 2920 2921 return; 2922 } 2923 } 2924 2925 to_last_block_bytes -= iov_len; 2926 2927 if (parent_iov_offset == 0) { 2928 parent_iovpos--; 2929 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 2930 } 2931 parent_iov_offset -= iov_len; 2932 } 2933 2934 assert(to_last_block_bytes == 0); 2935 } 2936 to_next_boundary -= to_next_boundary_bytes / blocklen; 2937 } 2938 2939 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 2940 &current_offset, &remaining); 2941 if (spdk_unlikely(rc)) { 2942 return; 2943 } 2944 } 2945 } 2946 2947 static void 2948 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 2949 { 2950 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 2951 uint32_t num_children_reqs = 0; 2952 int rc; 2953 2954 offset = bdev_io->u.bdev.split_current_offset_blocks; 2955 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2956 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 2957 2958 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2959 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 2960 2961 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 2962 &offset, &remaining); 2963 if (spdk_likely(rc == 0)) { 2964 num_children_reqs++; 2965 } else { 2966 return; 2967 } 2968 } 2969 } 2970 2971 static void 2972 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 2973 { 2974 uint64_t offset, write_zeroes_blocks, remaining; 2975 uint32_t num_children_reqs = 0; 2976 int rc; 2977 2978 offset = bdev_io->u.bdev.split_current_offset_blocks; 2979 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2980 2981 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2982 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 2983 2984 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 2985 &offset, &remaining); 2986 if (spdk_likely(rc == 0)) { 2987 num_children_reqs++; 2988 } else { 2989 return; 2990 } 2991 } 2992 } 2993 2994 static void 2995 bdev_copy_split(struct spdk_bdev_io *bdev_io) 2996 { 2997 uint64_t offset,
copy_blocks, remaining; 2998 uint32_t num_children_reqs = 0; 2999 int rc; 3000 3001 offset = bdev_io->u.bdev.split_current_offset_blocks; 3002 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3003 3004 assert(bdev_io->bdev->max_copy != 0); 3005 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3006 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3007 3008 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3009 &offset, &remaining); 3010 if (spdk_likely(rc == 0)) { 3011 num_children_reqs++; 3012 } else { 3013 return; 3014 } 3015 } 3016 } 3017 3018 static void 3019 parent_bdev_io_complete(void *ctx, int rc) 3020 { 3021 struct spdk_bdev_io *parent_io = ctx; 3022 3023 if (rc) { 3024 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3025 } 3026 3027 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3028 parent_io->internal.caller_ctx); 3029 } 3030 3031 static void 3032 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3033 { 3034 struct spdk_bdev_io *bdev_io = ctx; 3035 3036 /* u.bdev.accel_sequence should have already been cleared at this point */ 3037 assert(bdev_io->u.bdev.accel_sequence == NULL); 3038 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3039 3040 TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 3041 bdev_io->internal.accel_sequence = NULL; 3042 3043 if (spdk_unlikely(status != 0)) { 3044 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3045 } 3046 3047 parent_bdev_io_complete(bdev_io, status); 3048 } 3049 3050 static void 3051 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3052 { 3053 struct spdk_bdev_io *parent_io = cb_arg; 3054 3055 spdk_bdev_free_io(bdev_io); 3056 3057 if (!success) { 3058 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3059 /* If any child I/O failed, stop further splitting process. */ 3060 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 3061 parent_io->u.bdev.split_remaining_num_blocks = 0; 3062 } 3063 parent_io->u.bdev.split_outstanding--; 3064 if (parent_io->u.bdev.split_outstanding != 0) { 3065 return; 3066 } 3067 3068 /* 3069 * Parent I/O finishes when all blocks are consumed. 3070 */ 3071 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 3072 assert(parent_io->internal.cb != bdev_io_split_done); 3073 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 3074 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 3075 3076 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io) && 3077 spdk_likely(success)) { 3078 bdev_io_exec_sequence(bdev_io, bdev_io_complete_parent_sequence_cb); 3079 } else if (parent_io->internal.orig_iovcnt != 0) { 3080 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3081 /* bdev IO will be completed in the callback */ 3082 } else { 3083 parent_bdev_io_complete(parent_io, 0); 3084 } 3085 return; 3086 } 3087 3088 /* 3089 * Continue with the splitting process. This function will complete the parent I/O if the 3090 * splitting is done. 
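* Each helper below submits at most a small batch of children per call (bounded by the
* child iov count for reads/writes, and by SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS
* or SPDK_BDEV_MAX_CHILDREN_COPY_REQS otherwise), so large requests are split
* incrementally as earlier children complete.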
3091 */ 3092 switch (parent_io->type) { 3093 case SPDK_BDEV_IO_TYPE_READ: 3094 case SPDK_BDEV_IO_TYPE_WRITE: 3095 _bdev_rw_split(parent_io); 3096 break; 3097 case SPDK_BDEV_IO_TYPE_UNMAP: 3098 bdev_unmap_split(parent_io); 3099 break; 3100 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3101 bdev_write_zeroes_split(parent_io); 3102 break; 3103 case SPDK_BDEV_IO_TYPE_COPY: 3104 bdev_copy_split(parent_io); 3105 break; 3106 default: 3107 assert(false); 3108 break; 3109 } 3110 } 3111 3112 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3113 bool success); 3114 3115 static void 3116 bdev_io_split(struct spdk_bdev_io *bdev_io) 3117 { 3118 assert(bdev_io_should_split(bdev_io)); 3119 3120 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3121 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3122 bdev_io->u.bdev.split_outstanding = 0; 3123 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3124 3125 switch (bdev_io->type) { 3126 case SPDK_BDEV_IO_TYPE_READ: 3127 case SPDK_BDEV_IO_TYPE_WRITE: 3128 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3129 _bdev_rw_split(bdev_io); 3130 } else { 3131 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3132 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3133 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3134 } 3135 break; 3136 case SPDK_BDEV_IO_TYPE_UNMAP: 3137 bdev_unmap_split(bdev_io); 3138 break; 3139 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3140 bdev_write_zeroes_split(bdev_io); 3141 break; 3142 case SPDK_BDEV_IO_TYPE_COPY: 3143 bdev_copy_split(bdev_io); 3144 break; 3145 default: 3146 assert(false); 3147 break; 3148 } 3149 } 3150 3151 static void 3152 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3153 { 3154 if (!success) { 3155 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3156 return; 3157 } 3158 3159 _bdev_rw_split(bdev_io); 3160 } 3161 3162 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 3163 * be inlined, at least on some compilers. 
3164 */ 3165 static inline void 3166 _bdev_io_submit(void *ctx) 3167 { 3168 struct spdk_bdev_io *bdev_io = ctx; 3169 struct spdk_bdev *bdev = bdev_io->bdev; 3170 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3171 3172 if (spdk_likely(bdev_ch->flags == 0)) { 3173 bdev_io_do_submit(bdev_ch, bdev_io); 3174 return; 3175 } 3176 3177 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3178 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3179 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3180 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3181 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 3182 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3183 } else { 3184 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 3185 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3186 } 3187 } else { 3188 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3189 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3190 } 3191 } 3192 3193 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3194 3195 bool 3196 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3197 { 3198 if (range1->length == 0 || range2->length == 0) { 3199 return false; 3200 } 3201 3202 if (range1->offset + range1->length <= range2->offset) { 3203 return false; 3204 } 3205 3206 if (range2->offset + range2->length <= range1->offset) { 3207 return false; 3208 } 3209 3210 return true; 3211 } 3212 3213 static bool 3214 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3215 { 3216 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3217 struct lba_range r; 3218 3219 switch (bdev_io->type) { 3220 case SPDK_BDEV_IO_TYPE_NVME_IO: 3221 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3222 /* Don't try to decode the NVMe command - just assume worst-case and that 3223 * it overlaps a locked range. 3224 */ 3225 return true; 3226 case SPDK_BDEV_IO_TYPE_WRITE: 3227 case SPDK_BDEV_IO_TYPE_UNMAP: 3228 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3229 case SPDK_BDEV_IO_TYPE_ZCOPY: 3230 case SPDK_BDEV_IO_TYPE_COPY: 3231 r.offset = bdev_io->u.bdev.offset_blocks; 3232 r.length = bdev_io->u.bdev.num_blocks; 3233 if (!bdev_lba_range_overlapped(range, &r)) { 3234 /* This I/O doesn't overlap the specified LBA range. */ 3235 return false; 3236 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3237 /* This I/O overlaps, but the I/O is on the same channel that locked this 3238 * range, and the caller_ctx is the same as the locked_ctx. This means 3239 * that this I/O is associated with the lock, and is allowed to execute. 
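* This is what lets the lock holder (a snapshot/copy-on-write flow, for example) keep
* issuing I/O to the locked LBAs from its own channel, while overlapping I/O from other
* submitters waits on io_locked until the range is unlocked.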
3240 */ 3241 return false; 3242 } else { 3243 return true; 3244 } 3245 default: 3246 return false; 3247 } 3248 } 3249 3250 void 3251 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3252 { 3253 struct spdk_bdev *bdev = bdev_io->bdev; 3254 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 3255 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3256 3257 assert(thread != NULL); 3258 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3259 3260 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3261 struct lba_range *range; 3262 3263 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3264 if (bdev_io_range_is_locked(bdev_io, range)) { 3265 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3266 return; 3267 } 3268 } 3269 } 3270 3271 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 3272 3273 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3274 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 3275 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3276 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3277 spdk_bdev_get_name(bdev)); 3278 3279 if (bdev_io->internal.split) { 3280 bdev_io_split(bdev_io); 3281 return; 3282 } 3283 3284 if (ch->flags & BDEV_CH_QOS_ENABLED) { 3285 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 3286 _bdev_io_submit(bdev_io); 3287 } else { 3288 bdev_io->internal.io_submit_ch = ch; 3289 bdev_io->internal.ch = bdev->internal.qos->ch; 3290 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 3291 } 3292 } else { 3293 _bdev_io_submit(bdev_io); 3294 } 3295 } 3296 3297 static inline void 3298 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3299 { 3300 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3301 3302 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 3303 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3304 * For write operation we need to pull buffers from memory domain before submitting IO. 3305 * Once read operation completes, we need to use memory_domain push functionality to 3306 * update data in original memory domain IO buffer 3307 * This IO request will go through a regular IO flow, so clear memory domains pointers */ 3308 bdev_io->u.bdev.memory_domain = NULL; 3309 bdev_io->u.bdev.memory_domain_ctx = NULL; 3310 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 3311 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3312 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3313 } 3314 3315 static inline void 3316 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3317 { 3318 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3319 bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io); 3320 3321 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3322 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3323 bdev_io_complete_unsubmitted(bdev_io); 3324 return; 3325 } 3326 3327 /* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does 3328 * support them, but we need to execute an accel sequence and the data buffer is from accel 3329 * memory domain (to avoid doing a push/pull from that domain). 
3330 */ 3331 if ((bdev_io->internal.memory_domain && !desc->memory_domains_supported) || 3332 (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3333 _bdev_io_ext_use_bounce_buffer(bdev_io); 3334 return; 3335 } 3336 3337 if (needs_exec) { 3338 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3339 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3340 return; 3341 } 3342 /* For reads we'll execute the sequence after the data is read, so, for now, only 3343 * clear out accel_sequence pointer and submit the IO */ 3344 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3345 bdev_io->u.bdev.accel_sequence = NULL; 3346 } 3347 3348 bdev_io_submit(bdev_io); 3349 } 3350 3351 static void 3352 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3353 { 3354 struct spdk_bdev *bdev = bdev_io->bdev; 3355 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3356 struct spdk_io_channel *ch = bdev_ch->channel; 3357 3358 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3359 3360 bdev_io->internal.in_submit_request = true; 3361 bdev_submit_request(bdev, ch, bdev_io); 3362 bdev_io->internal.in_submit_request = false; 3363 } 3364 3365 void 3366 bdev_io_init(struct spdk_bdev_io *bdev_io, 3367 struct spdk_bdev *bdev, void *cb_arg, 3368 spdk_bdev_io_completion_cb cb) 3369 { 3370 bdev_io->bdev = bdev; 3371 bdev_io->internal.caller_ctx = cb_arg; 3372 bdev_io->internal.cb = cb; 3373 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3374 bdev_io->internal.in_submit_request = false; 3375 bdev_io->internal.buf = NULL; 3376 bdev_io->internal.io_submit_ch = NULL; 3377 bdev_io->internal.orig_iovs = NULL; 3378 bdev_io->internal.orig_iovcnt = 0; 3379 bdev_io->internal.orig_md_iov.iov_base = NULL; 3380 bdev_io->internal.error.nvme.cdw0 = 0; 3381 bdev_io->num_retries = 0; 3382 bdev_io->internal.get_buf_cb = NULL; 3383 bdev_io->internal.get_aux_buf_cb = NULL; 3384 bdev_io->internal.memory_domain = NULL; 3385 bdev_io->internal.memory_domain_ctx = NULL; 3386 bdev_io->internal.data_transfer_cpl = NULL; 3387 bdev_io->internal.split = bdev_io_should_split(bdev_io); 3388 bdev_io->internal.accel_sequence = NULL; 3389 } 3390 3391 static bool 3392 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3393 { 3394 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3395 } 3396 3397 bool 3398 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3399 { 3400 bool supported; 3401 3402 supported = bdev_io_type_supported(bdev, io_type); 3403 3404 if (!supported) { 3405 switch (io_type) { 3406 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3407 /* The bdev layer will emulate write zeroes as long as write is supported. 
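* For example, a module that implements only READ and WRITE still reports WRITE_ZEROES
* as supported here; the generic layer services such requests by issuing writes from its
* internal zero buffer.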
*/ 3408 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3409 break; 3410 default: 3411 break; 3412 } 3413 } 3414 3415 return supported; 3416 } 3417 3418 uint64_t 3419 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3420 { 3421 return bdev_io->internal.submit_tsc; 3422 } 3423 3424 int 3425 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3426 { 3427 if (bdev->fn_table->dump_info_json) { 3428 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3429 } 3430 3431 return 0; 3432 } 3433 3434 static void 3435 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3436 { 3437 uint32_t max_per_timeslice = 0; 3438 int i; 3439 3440 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3441 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3442 qos->rate_limits[i].max_per_timeslice = 0; 3443 continue; 3444 } 3445 3446 max_per_timeslice = qos->rate_limits[i].limit * 3447 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3448 3449 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3450 qos->rate_limits[i].min_per_timeslice); 3451 3452 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 3453 } 3454 3455 bdev_qos_set_ops(qos); 3456 } 3457 3458 static int 3459 bdev_channel_poll_qos(void *arg) 3460 { 3461 struct spdk_bdev_qos *qos = arg; 3462 uint64_t now = spdk_get_ticks(); 3463 int i; 3464 3465 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3466 /* We received our callback earlier than expected - return 3467 * immediately and wait to do accounting until at least one 3468 * timeslice has actually expired. This should never happen 3469 * with a well-behaved timer implementation. 3470 */ 3471 return SPDK_POLLER_IDLE; 3472 } 3473 3474 /* Reset for next round of rate limiting */ 3475 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3476 /* We may have allowed the IOs or bytes to slightly overrun in the last 3477 * timeslice. remaining_this_timeslice is signed, so if it's negative 3478 * here, we'll account for the overrun so that the next timeslice will 3479 * be appropriately reduced. 
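* For example, if a byte limit works out to roughly 1000 bytes per 1ms timeslice and a
* 4096-byte I/O was admitted with only 500 bytes left, remaining_this_timeslice sits
* around -3596 here; each refill below adds one timeslice worth of quota, so the debt is
* paid off over the following timeslices before new I/O is admitted.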
3480 */ 3481 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 3482 qos->rate_limits[i].remaining_this_timeslice = 0; 3483 } 3484 } 3485 3486 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3487 qos->last_timeslice += qos->timeslice_size; 3488 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3489 qos->rate_limits[i].remaining_this_timeslice += 3490 qos->rate_limits[i].max_per_timeslice; 3491 } 3492 } 3493 3494 return bdev_qos_io_submit(qos->ch, qos); 3495 } 3496 3497 static void 3498 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3499 { 3500 struct spdk_bdev_shared_resource *shared_resource; 3501 struct lba_range *range; 3502 3503 bdev_free_io_stat(ch->stat); 3504 #ifdef SPDK_CONFIG_VTUNE 3505 bdev_free_io_stat(ch->prev_stat); 3506 #endif 3507 3508 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3509 range = TAILQ_FIRST(&ch->locked_ranges); 3510 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3511 free(range); 3512 } 3513 3514 spdk_put_io_channel(ch->channel); 3515 spdk_put_io_channel(ch->accel_channel); 3516 3517 shared_resource = ch->shared_resource; 3518 3519 assert(TAILQ_EMPTY(&ch->io_locked)); 3520 assert(TAILQ_EMPTY(&ch->io_submitted)); 3521 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 3522 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 3523 assert(ch->io_outstanding == 0); 3524 assert(shared_resource->ref > 0); 3525 shared_resource->ref--; 3526 if (shared_resource->ref == 0) { 3527 assert(shared_resource->io_outstanding == 0); 3528 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3529 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3530 free(shared_resource); 3531 } 3532 } 3533 3534 static void 3535 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3536 { 3537 struct spdk_bdev_qos *qos = bdev->internal.qos; 3538 int i; 3539 3540 assert(spdk_spin_held(&bdev->internal.spinlock)); 3541 3542 /* Rate limiting on this bdev enabled */ 3543 if (qos) { 3544 if (qos->ch == NULL) { 3545 struct spdk_io_channel *io_ch; 3546 3547 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3548 bdev->name, spdk_get_thread()); 3549 3550 /* No qos channel has been selected, so set one up */ 3551 3552 /* Take another reference to ch */ 3553 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3554 assert(io_ch != NULL); 3555 qos->ch = ch; 3556 3557 qos->thread = spdk_io_channel_get_thread(io_ch); 3558 3559 TAILQ_INIT(&qos->queued); 3560 3561 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3562 if (bdev_qos_is_iops_rate_limit(i) == true) { 3563 qos->rate_limits[i].min_per_timeslice = 3564 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3565 } else { 3566 qos->rate_limits[i].min_per_timeslice = 3567 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3568 } 3569 3570 if (qos->rate_limits[i].limit == 0) { 3571 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3572 } 3573 } 3574 bdev_qos_update_max_quota_per_timeslice(qos); 3575 qos->timeslice_size = 3576 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3577 qos->last_timeslice = spdk_get_ticks(); 3578 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3579 qos, 3580 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3581 } 3582 3583 ch->flags |= BDEV_CH_QOS_ENABLED; 3584 } 3585 } 3586 3587 struct poll_timeout_ctx { 3588 struct spdk_bdev_desc *desc; 3589 uint64_t timeout_in_sec; 3590 spdk_bdev_io_timeout_cb cb_fn; 3591 void *cb_arg; 3592 }; 3593 3594 static void 3595 bdev_desc_free(struct spdk_bdev_desc 
*desc) 3596 { 3597 spdk_spin_destroy(&desc->spinlock); 3598 free(desc->media_events_buffer); 3599 free(desc); 3600 } 3601 3602 static void 3603 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 3604 { 3605 struct poll_timeout_ctx *ctx = _ctx; 3606 struct spdk_bdev_desc *desc = ctx->desc; 3607 3608 free(ctx); 3609 3610 spdk_spin_lock(&desc->spinlock); 3611 desc->refs--; 3612 if (desc->closed == true && desc->refs == 0) { 3613 spdk_spin_unlock(&desc->spinlock); 3614 bdev_desc_free(desc); 3615 return; 3616 } 3617 spdk_spin_unlock(&desc->spinlock); 3618 } 3619 3620 static void 3621 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3622 struct spdk_io_channel *io_ch, void *_ctx) 3623 { 3624 struct poll_timeout_ctx *ctx = _ctx; 3625 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3626 struct spdk_bdev_desc *desc = ctx->desc; 3627 struct spdk_bdev_io *bdev_io; 3628 uint64_t now; 3629 3630 spdk_spin_lock(&desc->spinlock); 3631 if (desc->closed == true) { 3632 spdk_spin_unlock(&desc->spinlock); 3633 spdk_bdev_for_each_channel_continue(i, -1); 3634 return; 3635 } 3636 spdk_spin_unlock(&desc->spinlock); 3637 3638 now = spdk_get_ticks(); 3639 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3640 /* Exclude any I/O that are generated via splitting. */ 3641 if (bdev_io->internal.cb == bdev_io_split_done) { 3642 continue; 3643 } 3644 3645 /* Once we find an I/O that has not timed out, we can immediately 3646 * exit the loop. 3647 */ 3648 if (now < (bdev_io->internal.submit_tsc + 3649 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3650 goto end; 3651 } 3652 3653 if (bdev_io->internal.desc == desc) { 3654 ctx->cb_fn(ctx->cb_arg, bdev_io); 3655 } 3656 } 3657 3658 end: 3659 spdk_bdev_for_each_channel_continue(i, 0); 3660 } 3661 3662 static int 3663 bdev_poll_timeout_io(void *arg) 3664 { 3665 struct spdk_bdev_desc *desc = arg; 3666 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3667 struct poll_timeout_ctx *ctx; 3668 3669 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3670 if (!ctx) { 3671 SPDK_ERRLOG("failed to allocate memory\n"); 3672 return SPDK_POLLER_BUSY; 3673 } 3674 ctx->desc = desc; 3675 ctx->cb_arg = desc->cb_arg; 3676 ctx->cb_fn = desc->cb_fn; 3677 ctx->timeout_in_sec = desc->timeout_in_sec; 3678 3679 /* Take a ref on the descriptor in case it gets closed while we are checking 3680 * all of the channels. 
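* The reference is dropped in bdev_channel_poll_timeout_io_done(); if the descriptor was
* closed while the per-channel walk was in flight, that is also where it finally gets
* freed via bdev_desc_free().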
3681 */ 3682 spdk_spin_lock(&desc->spinlock); 3683 desc->refs++; 3684 spdk_spin_unlock(&desc->spinlock); 3685 3686 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 3687 bdev_channel_poll_timeout_io_done); 3688 3689 return SPDK_POLLER_BUSY; 3690 } 3691 3692 int 3693 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3694 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3695 { 3696 assert(desc->thread == spdk_get_thread()); 3697 3698 spdk_poller_unregister(&desc->io_timeout_poller); 3699 3700 if (timeout_in_sec) { 3701 assert(cb_fn != NULL); 3702 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3703 desc, 3704 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3705 1000); 3706 if (desc->io_timeout_poller == NULL) { 3707 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3708 return -1; 3709 } 3710 } 3711 3712 desc->cb_fn = cb_fn; 3713 desc->cb_arg = cb_arg; 3714 desc->timeout_in_sec = timeout_in_sec; 3715 3716 return 0; 3717 } 3718 3719 static int 3720 bdev_channel_create(void *io_device, void *ctx_buf) 3721 { 3722 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3723 struct spdk_bdev_channel *ch = ctx_buf; 3724 struct spdk_io_channel *mgmt_io_ch; 3725 struct spdk_bdev_mgmt_channel *mgmt_ch; 3726 struct spdk_bdev_shared_resource *shared_resource; 3727 struct lba_range *range; 3728 3729 ch->bdev = bdev; 3730 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3731 if (!ch->channel) { 3732 return -1; 3733 } 3734 3735 ch->accel_channel = spdk_accel_get_io_channel(); 3736 if (!ch->accel_channel) { 3737 spdk_put_io_channel(ch->channel); 3738 return -1; 3739 } 3740 3741 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3742 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3743 3744 assert(ch->histogram == NULL); 3745 if (bdev->internal.histogram_enabled) { 3746 ch->histogram = spdk_histogram_data_alloc(); 3747 if (ch->histogram == NULL) { 3748 SPDK_ERRLOG("Could not allocate histogram\n"); 3749 } 3750 } 3751 3752 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3753 if (!mgmt_io_ch) { 3754 spdk_put_io_channel(ch->channel); 3755 spdk_put_io_channel(ch->accel_channel); 3756 return -1; 3757 } 3758 3759 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 3760 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3761 if (shared_resource->shared_ch == ch->channel) { 3762 spdk_put_io_channel(mgmt_io_ch); 3763 shared_resource->ref++; 3764 break; 3765 } 3766 } 3767 3768 if (shared_resource == NULL) { 3769 shared_resource = calloc(1, sizeof(*shared_resource)); 3770 if (shared_resource == NULL) { 3771 spdk_put_io_channel(ch->channel); 3772 spdk_put_io_channel(ch->accel_channel); 3773 spdk_put_io_channel(mgmt_io_ch); 3774 return -1; 3775 } 3776 3777 shared_resource->mgmt_ch = mgmt_ch; 3778 shared_resource->io_outstanding = 0; 3779 TAILQ_INIT(&shared_resource->nomem_io); 3780 shared_resource->nomem_threshold = 0; 3781 shared_resource->shared_ch = ch->channel; 3782 shared_resource->ref = 1; 3783 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3784 } 3785 3786 ch->io_outstanding = 0; 3787 TAILQ_INIT(&ch->queued_resets); 3788 TAILQ_INIT(&ch->locked_ranges); 3789 ch->flags = 0; 3790 ch->shared_resource = shared_resource; 3791 3792 TAILQ_INIT(&ch->io_submitted); 3793 TAILQ_INIT(&ch->io_locked); 3794 TAILQ_INIT(&ch->io_accel_exec); 3795 TAILQ_INIT(&ch->io_memory_domain); 3796 3797 ch->stat = bdev_alloc_io_stat(false); 3798 if (ch->stat == NULL) { 3799 
bdev_channel_destroy_resource(ch); 3800 return -1; 3801 } 3802 3803 ch->stat->ticks_rate = spdk_get_ticks_hz(); 3804 3805 #ifdef SPDK_CONFIG_VTUNE 3806 { 3807 char *name; 3808 __itt_init_ittlib(NULL, 0); 3809 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3810 if (!name) { 3811 bdev_channel_destroy_resource(ch); 3812 return -1; 3813 } 3814 ch->handle = __itt_string_handle_create(name); 3815 free(name); 3816 ch->start_tsc = spdk_get_ticks(); 3817 ch->interval_tsc = spdk_get_ticks_hz() / 100; 3818 ch->prev_stat = bdev_alloc_io_stat(false); 3819 if (ch->prev_stat == NULL) { 3820 bdev_channel_destroy_resource(ch); 3821 return -1; 3822 } 3823 } 3824 #endif 3825 3826 spdk_spin_lock(&bdev->internal.spinlock); 3827 bdev_enable_qos(bdev, ch); 3828 3829 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 3830 struct lba_range *new_range; 3831 3832 new_range = calloc(1, sizeof(*new_range)); 3833 if (new_range == NULL) { 3834 spdk_spin_unlock(&bdev->internal.spinlock); 3835 bdev_channel_destroy_resource(ch); 3836 return -1; 3837 } 3838 new_range->length = range->length; 3839 new_range->offset = range->offset; 3840 new_range->locked_ctx = range->locked_ctx; 3841 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 3842 } 3843 3844 spdk_spin_unlock(&bdev->internal.spinlock); 3845 3846 return 0; 3847 } 3848 3849 static int 3850 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 3851 void *cb_ctx) 3852 { 3853 struct spdk_bdev_channel *bdev_ch = cb_ctx; 3854 struct spdk_bdev_io *bdev_io; 3855 uint64_t buf_len; 3856 3857 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 3858 if (bdev_io->internal.ch == bdev_ch) { 3859 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 3860 spdk_iobuf_entry_abort(ch, entry, buf_len); 3861 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3862 } 3863 3864 return 0; 3865 } 3866 3867 /* 3868 * Abort I/O that are waiting on a data buffer. 3869 */ 3870 static void 3871 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 3872 { 3873 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 3874 bdev_abort_all_buf_io_cb, ch); 3875 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 3876 bdev_abort_all_buf_io_cb, ch); 3877 } 3878 3879 /* 3880 * Abort I/O that are queued waiting for submission. These types of I/O are 3881 * linked using the spdk_bdev_io link TAILQ_ENTRY. 3882 */ 3883 static void 3884 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 3885 { 3886 struct spdk_bdev_io *bdev_io, *tmp; 3887 3888 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 3889 if (bdev_io->internal.ch == ch) { 3890 TAILQ_REMOVE(queue, bdev_io, internal.link); 3891 /* 3892 * spdk_bdev_io_complete() assumes that the completed I/O had 3893 * been submitted to the bdev module. Since in this case it 3894 * hadn't, bump io_outstanding to account for the decrement 3895 * that spdk_bdev_io_complete() will do. 
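 * Resets are excluded below because they are never counted in io_outstanding.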
3896 */ 3897 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 3898 ch->io_outstanding++; 3899 ch->shared_resource->io_outstanding++; 3900 } 3901 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3902 } 3903 } 3904 } 3905 3906 static bool 3907 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3908 { 3909 struct spdk_bdev_io *bdev_io; 3910 3911 TAILQ_FOREACH(bdev_io, queue, internal.link) { 3912 if (bdev_io == bio_to_abort) { 3913 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 3914 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3915 return true; 3916 } 3917 } 3918 3919 return false; 3920 } 3921 3922 static int 3923 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 3924 { 3925 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 3926 uint64_t buf_len; 3927 3928 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 3929 if (bdev_io == bio_to_abort) { 3930 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 3931 spdk_iobuf_entry_abort(ch, entry, buf_len); 3932 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3933 return 1; 3934 } 3935 3936 return 0; 3937 } 3938 3939 static bool 3940 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 3941 { 3942 int rc; 3943 3944 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 3945 bdev_abort_buf_io_cb, bio_to_abort); 3946 if (rc == 1) { 3947 return true; 3948 } 3949 3950 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 3951 bdev_abort_buf_io_cb, bio_to_abort); 3952 return rc == 1; 3953 } 3954 3955 static void 3956 bdev_qos_channel_destroy(void *cb_arg) 3957 { 3958 struct spdk_bdev_qos *qos = cb_arg; 3959 3960 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 3961 spdk_poller_unregister(&qos->poller); 3962 3963 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 3964 3965 free(qos); 3966 } 3967 3968 static int 3969 bdev_qos_destroy(struct spdk_bdev *bdev) 3970 { 3971 int i; 3972 3973 /* 3974 * Cleanly shutting down the QoS poller is tricky, because 3975 * during the asynchronous operation the user could open 3976 * a new descriptor and create a new channel, spawning 3977 * a new QoS poller. 3978 * 3979 * The strategy is to create a new QoS structure here and swap it 3980 * in. The shutdown path then continues to refer to the old one 3981 * until it completes and then releases it. 3982 */ 3983 struct spdk_bdev_qos *new_qos, *old_qos; 3984 3985 old_qos = bdev->internal.qos; 3986 3987 new_qos = calloc(1, sizeof(*new_qos)); 3988 if (!new_qos) { 3989 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 3990 return -ENOMEM; 3991 } 3992 3993 /* Copy the old QoS data into the newly allocated structure */ 3994 memcpy(new_qos, old_qos, sizeof(*new_qos)); 3995 3996 /* Zero out the key parts of the QoS structure */ 3997 new_qos->ch = NULL; 3998 new_qos->thread = NULL; 3999 new_qos->poller = NULL; 4000 TAILQ_INIT(&new_qos->queued); 4001 /* 4002 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4003 * It will be used later for the new QoS structure. 
4004 */ 4005 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4006 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4007 new_qos->rate_limits[i].min_per_timeslice = 0; 4008 new_qos->rate_limits[i].max_per_timeslice = 0; 4009 } 4010 4011 bdev->internal.qos = new_qos; 4012 4013 if (old_qos->thread == NULL) { 4014 free(old_qos); 4015 } else { 4016 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4017 } 4018 4019 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4020 * been destroyed yet. The destruction path will end up waiting for the final 4021 * channel to be put before it releases resources. */ 4022 4023 return 0; 4024 } 4025 4026 void 4027 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4028 { 4029 total->bytes_read += add->bytes_read; 4030 total->num_read_ops += add->num_read_ops; 4031 total->bytes_written += add->bytes_written; 4032 total->num_write_ops += add->num_write_ops; 4033 total->bytes_unmapped += add->bytes_unmapped; 4034 total->num_unmap_ops += add->num_unmap_ops; 4035 total->bytes_copied += add->bytes_copied; 4036 total->num_copy_ops += add->num_copy_ops; 4037 total->read_latency_ticks += add->read_latency_ticks; 4038 total->write_latency_ticks += add->write_latency_ticks; 4039 total->unmap_latency_ticks += add->unmap_latency_ticks; 4040 total->copy_latency_ticks += add->copy_latency_ticks; 4041 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4042 total->max_read_latency_ticks = add->max_read_latency_ticks; 4043 } 4044 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4045 total->min_read_latency_ticks = add->min_read_latency_ticks; 4046 } 4047 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4048 total->max_write_latency_ticks = add->max_write_latency_ticks; 4049 } 4050 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4051 total->min_write_latency_ticks = add->min_write_latency_ticks; 4052 } 4053 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4054 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4055 } 4056 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4057 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4058 } 4059 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4060 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4061 } 4062 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4063 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4064 } 4065 } 4066 4067 static void 4068 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4069 { 4070 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4071 4072 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4073 memcpy(to_stat->io_error, from_stat->io_error, 4074 sizeof(struct spdk_bdev_io_error_stat)); 4075 } 4076 } 4077 4078 void 4079 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4080 { 4081 stat->max_read_latency_ticks = 0; 4082 stat->min_read_latency_ticks = UINT64_MAX; 4083 stat->max_write_latency_ticks = 0; 4084 stat->min_write_latency_ticks = UINT64_MAX; 4085 stat->max_unmap_latency_ticks = 0; 4086 stat->min_unmap_latency_ticks = UINT64_MAX; 4087 stat->max_copy_latency_ticks = 0; 4088 stat->min_copy_latency_ticks = UINT64_MAX; 4089 4090 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4091 return; 4092 } 
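	/* For SPDK_BDEV_RESET_STAT_ALL, additionally clear the cumulative counters, latency
	 * totals, and per-status error counts below.
	 */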
4093 4094 stat->bytes_read = 0; 4095 stat->num_read_ops = 0; 4096 stat->bytes_written = 0; 4097 stat->num_write_ops = 0; 4098 stat->bytes_unmapped = 0; 4099 stat->num_unmap_ops = 0; 4100 stat->bytes_copied = 0; 4101 stat->num_copy_ops = 0; 4102 stat->read_latency_ticks = 0; 4103 stat->write_latency_ticks = 0; 4104 stat->unmap_latency_ticks = 0; 4105 stat->copy_latency_ticks = 0; 4106 4107 if (stat->io_error != NULL) { 4108 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 4109 } 4110 } 4111 4112 struct spdk_bdev_io_stat * 4113 bdev_alloc_io_stat(bool io_error_stat) 4114 { 4115 struct spdk_bdev_io_stat *stat; 4116 4117 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4118 if (stat == NULL) { 4119 return NULL; 4120 } 4121 4122 if (io_error_stat) { 4123 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4124 if (stat->io_error == NULL) { 4125 free(stat); 4126 return NULL; 4127 } 4128 } else { 4129 stat->io_error = NULL; 4130 } 4131 4132 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4133 4134 return stat; 4135 } 4136 4137 void 4138 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4139 { 4140 if (stat != NULL) { 4141 free(stat->io_error); 4142 free(stat); 4143 } 4144 } 4145 4146 void 4147 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4148 { 4149 int i; 4150 4151 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4152 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4153 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4154 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4155 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4156 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4157 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4158 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4159 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4160 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4161 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4162 stat->min_read_latency_ticks != UINT64_MAX ? 4163 stat->min_read_latency_ticks : 0); 4164 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4165 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4166 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4167 stat->min_write_latency_ticks != UINT64_MAX ? 4168 stat->min_write_latency_ticks : 0); 4169 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4170 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4171 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4172 stat->min_unmap_latency_ticks != UINT64_MAX ? 4173 stat->min_unmap_latency_ticks : 0); 4174 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4175 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4176 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4177 stat->min_copy_latency_ticks != UINT64_MAX ? 
				     stat->min_copy_latency_ticks : 0);

	if (stat->io_error != NULL) {
		spdk_json_write_named_object_begin(w, "io_error");
		for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) {
			if (stat->io_error->error_status[i] != 0) {
				spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)),
							     stat->io_error->error_status[i]);
			}
		}
		spdk_json_write_object_end(w);
	}
}

static void
bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch)
{
	struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;
	struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch;

	bdev_abort_all_queued_io(&shared_resource->nomem_io, ch);
	bdev_abort_all_buf_io(mgmt_ch, ch);
}

static void
bdev_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_channel *ch = ctx_buf;

	SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name,
		      spdk_get_thread());

	spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name,
			  spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel)));

	/* This channel is going away, so add its statistics into the bdev so that they don't get lost. */
	spdk_spin_lock(&ch->bdev->internal.spinlock);
	spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat);
	spdk_spin_unlock(&ch->bdev->internal.spinlock);

	bdev_abort_all_queued_io(&ch->queued_resets, ch);

	bdev_channel_abort_queued_ios(ch);

	if (ch->histogram) {
		spdk_histogram_data_free(ch->histogram);
	}

	bdev_channel_destroy_resource(ch);
}

/*
 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer
 * to it. Hence we do not have to call bdev_get_by_name() when using this function.
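 * On failure, the duplicated name string is freed here, so the caller only has to release
 * its own containing structure.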
4233 */ 4234 static int 4235 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4236 { 4237 struct spdk_bdev_name *tmp; 4238 4239 bdev_name->name = strdup(name); 4240 if (bdev_name->name == NULL) { 4241 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4242 return -ENOMEM; 4243 } 4244 4245 bdev_name->bdev = bdev; 4246 4247 spdk_spin_lock(&g_bdev_mgr.spinlock); 4248 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4249 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4250 4251 if (tmp != NULL) { 4252 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4253 free(bdev_name->name); 4254 return -EEXIST; 4255 } 4256 4257 return 0; 4258 } 4259 4260 static void 4261 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4262 { 4263 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4264 free(bdev_name->name); 4265 } 4266 4267 static void 4268 bdev_name_del(struct spdk_bdev_name *bdev_name) 4269 { 4270 spdk_spin_lock(&g_bdev_mgr.spinlock); 4271 bdev_name_del_unsafe(bdev_name); 4272 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4273 } 4274 4275 int 4276 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4277 { 4278 struct spdk_bdev_alias *tmp; 4279 int ret; 4280 4281 if (alias == NULL) { 4282 SPDK_ERRLOG("Empty alias passed\n"); 4283 return -EINVAL; 4284 } 4285 4286 tmp = calloc(1, sizeof(*tmp)); 4287 if (tmp == NULL) { 4288 SPDK_ERRLOG("Unable to allocate alias\n"); 4289 return -ENOMEM; 4290 } 4291 4292 ret = bdev_name_add(&tmp->alias, bdev, alias); 4293 if (ret != 0) { 4294 free(tmp); 4295 return ret; 4296 } 4297 4298 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4299 4300 return 0; 4301 } 4302 4303 static int 4304 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4305 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4306 { 4307 struct spdk_bdev_alias *tmp; 4308 4309 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4310 if (strcmp(alias, tmp->alias.name) == 0) { 4311 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4312 alias_del_fn(&tmp->alias); 4313 free(tmp); 4314 return 0; 4315 } 4316 } 4317 4318 return -ENOENT; 4319 } 4320 4321 int 4322 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4323 { 4324 int rc; 4325 4326 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4327 if (rc == -ENOENT) { 4328 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4329 } 4330 4331 return rc; 4332 } 4333 4334 void 4335 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4336 { 4337 struct spdk_bdev_alias *p, *tmp; 4338 4339 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4340 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4341 bdev_name_del(&p->alias); 4342 free(p); 4343 } 4344 } 4345 4346 struct spdk_io_channel * 4347 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4348 { 4349 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4350 } 4351 4352 void * 4353 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4354 { 4355 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4356 void *ctx = NULL; 4357 4358 if (bdev->fn_table->get_module_ctx) { 4359 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4360 } 4361 4362 return ctx; 4363 } 4364 4365 const char * 4366 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4367 { 4368 return bdev->module->name; 4369 } 4370 4371 const char * 4372 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4373 { 4374 return bdev->name; 4375 } 4376 4377 const char * 4378 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4379 { 4380 return bdev->product_name; 4381 } 4382 4383 
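/*
 * Illustrative usage sketch (not part of bdev.c): how a consumer typically combines the
 * descriptor and getter APIs above. The example_* names are hypothetical, error handling
 * is trimmed to early returns, and the code is assumed to run on an SPDK thread.
 *
 *	static void
 *	example_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx)
 *	{
 *		SPDK_NOTICELOG("bdev %s event %d\n", spdk_bdev_get_name(bdev), type);
 *	}
 *
 *	static int
 *	example_probe_bdev(const char *name)
 *	{
 *		struct spdk_bdev_desc *desc = NULL;
 *		struct spdk_io_channel *io_ch;
 *		struct spdk_bdev *bdev;
 *		int rc;
 *
 *		rc = spdk_bdev_open_ext(name, false, example_event_cb, NULL, &desc);
 *		if (rc != 0) {
 *			return rc;
 *		}
 *
 *		bdev = spdk_bdev_desc_get_bdev(desc);
 *		SPDK_NOTICELOG("%s: %" PRIu64 " blocks of %" PRIu32 " bytes, buf align %zu\n",
 *			       spdk_bdev_get_name(bdev), spdk_bdev_get_num_blocks(bdev),
 *			       spdk_bdev_get_block_size(bdev), spdk_bdev_get_buf_align(bdev));
 *
 *		io_ch = spdk_bdev_get_io_channel(desc);
 *		if (io_ch != NULL) {
 *			spdk_put_io_channel(io_ch);
 *		}
 *
 *		spdk_bdev_close(desc);
 *		return 0;
 *	}
 */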
const struct spdk_bdev_aliases_list * 4384 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4385 { 4386 return &bdev->aliases; 4387 } 4388 4389 uint32_t 4390 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4391 { 4392 return bdev->blocklen; 4393 } 4394 4395 uint32_t 4396 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4397 { 4398 return bdev->write_unit_size; 4399 } 4400 4401 uint64_t 4402 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4403 { 4404 return bdev->blockcnt; 4405 } 4406 4407 const char * 4408 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4409 { 4410 return qos_rpc_type[type]; 4411 } 4412 4413 void 4414 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4415 { 4416 int i; 4417 4418 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4419 4420 spdk_spin_lock(&bdev->internal.spinlock); 4421 if (bdev->internal.qos) { 4422 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4423 if (bdev->internal.qos->rate_limits[i].limit != 4424 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4425 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4426 if (bdev_qos_is_iops_rate_limit(i) == false) { 4427 /* Change from Byte to Megabyte which is user visible. */ 4428 limits[i] = limits[i] / 1024 / 1024; 4429 } 4430 } 4431 } 4432 } 4433 spdk_spin_unlock(&bdev->internal.spinlock); 4434 } 4435 4436 size_t 4437 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4438 { 4439 return 1 << bdev->required_alignment; 4440 } 4441 4442 uint32_t 4443 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4444 { 4445 return bdev->optimal_io_boundary; 4446 } 4447 4448 bool 4449 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4450 { 4451 return bdev->write_cache; 4452 } 4453 4454 const struct spdk_uuid * 4455 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4456 { 4457 return &bdev->uuid; 4458 } 4459 4460 uint16_t 4461 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4462 { 4463 return bdev->acwu; 4464 } 4465 4466 uint32_t 4467 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4468 { 4469 return bdev->md_len; 4470 } 4471 4472 bool 4473 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4474 { 4475 return (bdev->md_len != 0) && bdev->md_interleave; 4476 } 4477 4478 bool 4479 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4480 { 4481 return (bdev->md_len != 0) && !bdev->md_interleave; 4482 } 4483 4484 bool 4485 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4486 { 4487 return bdev->zoned; 4488 } 4489 4490 uint32_t 4491 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4492 { 4493 if (spdk_bdev_is_md_interleaved(bdev)) { 4494 return bdev->blocklen - bdev->md_len; 4495 } else { 4496 return bdev->blocklen; 4497 } 4498 } 4499 4500 uint32_t 4501 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4502 { 4503 return bdev->phys_blocklen; 4504 } 4505 4506 static uint32_t 4507 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4508 { 4509 if (!spdk_bdev_is_md_interleaved(bdev)) { 4510 return bdev->blocklen + bdev->md_len; 4511 } else { 4512 return bdev->blocklen; 4513 } 4514 } 4515 4516 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4517 typedef enum spdk_dif_type spdk_dif_type_t; 4518 4519 spdk_dif_type_t 4520 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4521 { 4522 if (bdev->md_len != 0) { 4523 return bdev->dif_type; 4524 } else { 4525 return SPDK_DIF_DISABLE; 4526 } 4527 } 4528 4529 bool 4530 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4531 { 4532 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4533 return bdev->dif_is_head_of_md; 4534 } else { 4535 return false; 4536 } 4537 } 4538 4539 bool 4540 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4541 enum spdk_dif_check_type check_type) 4542 { 4543 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4544 return false; 4545 } 4546 4547 switch (check_type) { 4548 case SPDK_DIF_CHECK_TYPE_REFTAG: 4549 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4550 case SPDK_DIF_CHECK_TYPE_APPTAG: 4551 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4552 case SPDK_DIF_CHECK_TYPE_GUARD: 4553 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4554 default: 4555 return false; 4556 } 4557 } 4558 4559 uint32_t 4560 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4561 { 4562 uint64_t alighed_length; 4563 uint64_t max_copy_blocks; 4564 uint64_t temp_max_copy_blocks; 4565 struct spdk_iobuf_opts opts; 4566 4567 if (spdk_bdev_io_type_supported((struct spdk_bdev *)bdev, SPDK_BDEV_IO_TYPE_COPY)) { 4568 return bdev->max_copy; 4569 } else { 4570 spdk_iobuf_get_opts(&opts); 4571 alighed_length = opts.large_bufsize - spdk_bdev_get_buf_align(bdev); 4572 temp_max_copy_blocks = spdk_bdev_is_md_separate(bdev) ? 4573 alighed_length / (bdev->blocklen + bdev->md_len) : 4574 alighed_length / bdev->blocklen; 4575 max_copy_blocks = 1 << spdk_u64log2(temp_max_copy_blocks); 4576 return max_copy_blocks; 4577 } 4578 } 4579 4580 uint64_t 4581 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4582 { 4583 return bdev->internal.measured_queue_depth; 4584 } 4585 4586 uint64_t 4587 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 4588 { 4589 return bdev->internal.period; 4590 } 4591 4592 uint64_t 4593 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 4594 { 4595 return bdev->internal.weighted_io_time; 4596 } 4597 4598 uint64_t 4599 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 4600 { 4601 return bdev->internal.io_time; 4602 } 4603 4604 static void bdev_update_qd_sampling_period(void *ctx); 4605 4606 static void 4607 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 4608 { 4609 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 4610 4611 if (bdev->internal.measured_queue_depth) { 4612 bdev->internal.io_time += bdev->internal.period; 4613 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 4614 } 4615 4616 bdev->internal.qd_poll_in_progress = false; 4617 4618 bdev_update_qd_sampling_period(bdev); 4619 } 4620 4621 static void 4622 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4623 struct spdk_io_channel *io_ch, void *_ctx) 4624 { 4625 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 4626 4627 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4628 spdk_bdev_for_each_channel_continue(i, 0); 4629 } 4630 4631 static int 4632 bdev_calculate_measured_queue_depth(void *ctx) 4633 { 4634 struct spdk_bdev *bdev = ctx; 4635 4636 bdev->internal.qd_poll_in_progress = true; 4637 bdev->internal.temporary_queue_depth = 0; 4638 
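	/* Sum io_outstanding across every channel on its owning thread; _calculate_measured_qd_cpl()
	 * then publishes the total and accumulates io_time/weighted_io_time.
	 */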
spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 4639 return SPDK_POLLER_BUSY; 4640 } 4641 4642 static void 4643 bdev_update_qd_sampling_period(void *ctx) 4644 { 4645 struct spdk_bdev *bdev = ctx; 4646 4647 if (bdev->internal.period == bdev->internal.new_period) { 4648 return; 4649 } 4650 4651 if (bdev->internal.qd_poll_in_progress) { 4652 return; 4653 } 4654 4655 bdev->internal.period = bdev->internal.new_period; 4656 4657 spdk_poller_unregister(&bdev->internal.qd_poller); 4658 if (bdev->internal.period != 0) { 4659 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4660 bdev, bdev->internal.period); 4661 } else { 4662 spdk_bdev_close(bdev->internal.qd_desc); 4663 bdev->internal.qd_desc = NULL; 4664 } 4665 } 4666 4667 static void 4668 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4669 { 4670 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4671 } 4672 4673 void 4674 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4675 { 4676 int rc; 4677 4678 if (bdev->internal.new_period == period) { 4679 return; 4680 } 4681 4682 bdev->internal.new_period = period; 4683 4684 if (bdev->internal.qd_desc != NULL) { 4685 assert(bdev->internal.period != 0); 4686 4687 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4688 bdev_update_qd_sampling_period, bdev); 4689 return; 4690 } 4691 4692 assert(bdev->internal.period == 0); 4693 4694 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4695 NULL, &bdev->internal.qd_desc); 4696 if (rc != 0) { 4697 return; 4698 } 4699 4700 bdev->internal.period = period; 4701 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4702 bdev, period); 4703 } 4704 4705 struct bdev_get_current_qd_ctx { 4706 uint64_t current_qd; 4707 spdk_bdev_get_current_qd_cb cb_fn; 4708 void *cb_arg; 4709 }; 4710 4711 static void 4712 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 4713 { 4714 struct bdev_get_current_qd_ctx *ctx = _ctx; 4715 4716 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 4717 4718 free(ctx); 4719 } 4720 4721 static void 4722 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4723 struct spdk_io_channel *io_ch, void *_ctx) 4724 { 4725 struct bdev_get_current_qd_ctx *ctx = _ctx; 4726 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4727 4728 ctx->current_qd += bdev_ch->io_outstanding; 4729 4730 spdk_bdev_for_each_channel_continue(i, 0); 4731 } 4732 4733 void 4734 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 4735 void *cb_arg) 4736 { 4737 struct bdev_get_current_qd_ctx *ctx; 4738 4739 assert(cb_fn != NULL); 4740 4741 ctx = calloc(1, sizeof(*ctx)); 4742 if (ctx == NULL) { 4743 cb_fn(bdev, 0, cb_arg, -ENOMEM); 4744 return; 4745 } 4746 4747 ctx->cb_fn = cb_fn; 4748 ctx->cb_arg = cb_arg; 4749 4750 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 4751 } 4752 4753 static void 4754 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 4755 { 4756 assert(desc->thread == spdk_get_thread()); 4757 4758 spdk_spin_lock(&desc->spinlock); 4759 desc->refs--; 4760 if (!desc->closed) { 4761 spdk_spin_unlock(&desc->spinlock); 4762 desc->callback.event_fn(type, 4763 desc->bdev, 4764 desc->callback.ctx); 4765 return; 4766 } else if (desc->refs == 0) { 4767 /* This descriptor was closed after this event_notify message was sent. 
4768 * spdk_bdev_close() could not free the descriptor since this message was 4769 * in flight, so we free it now using bdev_desc_free(). 4770 */ 4771 spdk_spin_unlock(&desc->spinlock); 4772 bdev_desc_free(desc); 4773 return; 4774 } 4775 spdk_spin_unlock(&desc->spinlock); 4776 } 4777 4778 static void 4779 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 4780 { 4781 spdk_spin_lock(&desc->spinlock); 4782 desc->refs++; 4783 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 4784 spdk_spin_unlock(&desc->spinlock); 4785 } 4786 4787 static void 4788 _resize_notify(void *ctx) 4789 { 4790 struct spdk_bdev_desc *desc = ctx; 4791 4792 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 4793 } 4794 4795 int 4796 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 4797 { 4798 struct spdk_bdev_desc *desc; 4799 int ret; 4800 4801 if (size == bdev->blockcnt) { 4802 return 0; 4803 } 4804 4805 spdk_spin_lock(&bdev->internal.spinlock); 4806 4807 /* bdev has open descriptors */ 4808 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 4809 bdev->blockcnt > size) { 4810 ret = -EBUSY; 4811 } else { 4812 bdev->blockcnt = size; 4813 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 4814 event_notify(desc, _resize_notify); 4815 } 4816 ret = 0; 4817 } 4818 4819 spdk_spin_unlock(&bdev->internal.spinlock); 4820 4821 return ret; 4822 } 4823 4824 /* 4825 * Convert I/O offset and length from bytes to blocks. 4826 * 4827 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 4828 */ 4829 static uint64_t 4830 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 4831 uint64_t num_bytes, uint64_t *num_blocks) 4832 { 4833 uint32_t block_size = bdev->blocklen; 4834 uint8_t shift_cnt; 4835 4836 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
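 * In both branches the two remainders are OR'd together, so the return value is non-zero
 * unless the offset and the length are both block-aligned.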
*/ 4837 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 4838 shift_cnt = spdk_u32log2(block_size); 4839 *offset_blocks = offset_bytes >> shift_cnt; 4840 *num_blocks = num_bytes >> shift_cnt; 4841 return (offset_bytes - (*offset_blocks << shift_cnt)) | 4842 (num_bytes - (*num_blocks << shift_cnt)); 4843 } else { 4844 *offset_blocks = offset_bytes / block_size; 4845 *num_blocks = num_bytes / block_size; 4846 return (offset_bytes % block_size) | (num_bytes % block_size); 4847 } 4848 } 4849 4850 static bool 4851 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 4852 { 4853 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 4854 * has been an overflow and hence the offset has been wrapped around */ 4855 if (offset_blocks + num_blocks < offset_blocks) { 4856 return false; 4857 } 4858 4859 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 4860 if (offset_blocks + num_blocks > bdev->blockcnt) { 4861 return false; 4862 } 4863 4864 return true; 4865 } 4866 4867 static void 4868 bdev_seek_complete_cb(void *ctx) 4869 { 4870 struct spdk_bdev_io *bdev_io = ctx; 4871 4872 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4873 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 4874 } 4875 4876 static int 4877 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4878 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 4879 spdk_bdev_io_completion_cb cb, void *cb_arg) 4880 { 4881 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4882 struct spdk_bdev_io *bdev_io; 4883 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4884 4885 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 4886 4887 /* Check if offset_blocks is valid looking at the validity of one block */ 4888 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 4889 return -EINVAL; 4890 } 4891 4892 bdev_io = bdev_channel_get_io(channel); 4893 if (!bdev_io) { 4894 return -ENOMEM; 4895 } 4896 4897 bdev_io->internal.ch = channel; 4898 bdev_io->internal.desc = desc; 4899 bdev_io->type = io_type; 4900 bdev_io->u.bdev.offset_blocks = offset_blocks; 4901 bdev_io->u.bdev.memory_domain = NULL; 4902 bdev_io->u.bdev.memory_domain_ctx = NULL; 4903 bdev_io->u.bdev.accel_sequence = NULL; 4904 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4905 4906 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 4907 /* In case bdev doesn't support seek to next data/hole offset, 4908 * it is assumed that only data and no holes are present */ 4909 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 4910 bdev_io->u.bdev.seek.offset = offset_blocks; 4911 } else { 4912 bdev_io->u.bdev.seek.offset = UINT64_MAX; 4913 } 4914 4915 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 4916 return 0; 4917 } 4918 4919 bdev_io_submit(bdev_io); 4920 return 0; 4921 } 4922 4923 int 4924 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4925 uint64_t offset_blocks, 4926 spdk_bdev_io_completion_cb cb, void *cb_arg) 4927 { 4928 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 4929 } 4930 4931 int 4932 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4933 uint64_t offset_blocks, 4934 spdk_bdev_io_completion_cb cb, void *cb_arg) 4935 { 4936 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 4937 } 4938 4939 uint64_t 4940 spdk_bdev_io_get_seek_offset(const struct 
spdk_bdev_io *bdev_io) 4941 { 4942 return bdev_io->u.bdev.seek.offset; 4943 } 4944 4945 static int 4946 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 4947 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4948 spdk_bdev_io_completion_cb cb, void *cb_arg) 4949 { 4950 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4951 struct spdk_bdev_io *bdev_io; 4952 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4953 4954 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4955 return -EINVAL; 4956 } 4957 4958 bdev_io = bdev_channel_get_io(channel); 4959 if (!bdev_io) { 4960 return -ENOMEM; 4961 } 4962 4963 bdev_io->internal.ch = channel; 4964 bdev_io->internal.desc = desc; 4965 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4966 bdev_io->u.bdev.iovs = &bdev_io->iov; 4967 bdev_io->u.bdev.iovs[0].iov_base = buf; 4968 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4969 bdev_io->u.bdev.iovcnt = 1; 4970 bdev_io->u.bdev.md_buf = md_buf; 4971 bdev_io->u.bdev.num_blocks = num_blocks; 4972 bdev_io->u.bdev.offset_blocks = offset_blocks; 4973 bdev_io->u.bdev.memory_domain = NULL; 4974 bdev_io->u.bdev.memory_domain_ctx = NULL; 4975 bdev_io->u.bdev.accel_sequence = NULL; 4976 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4977 4978 bdev_io_submit(bdev_io); 4979 return 0; 4980 } 4981 4982 int 4983 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4984 void *buf, uint64_t offset, uint64_t nbytes, 4985 spdk_bdev_io_completion_cb cb, void *cb_arg) 4986 { 4987 uint64_t offset_blocks, num_blocks; 4988 4989 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4990 nbytes, &num_blocks) != 0) { 4991 return -EINVAL; 4992 } 4993 4994 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4995 } 4996 4997 int 4998 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4999 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5000 spdk_bdev_io_completion_cb cb, void *cb_arg) 5001 { 5002 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5003 } 5004 5005 int 5006 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5007 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5008 spdk_bdev_io_completion_cb cb, void *cb_arg) 5009 { 5010 struct iovec iov = { 5011 .iov_base = buf, 5012 }; 5013 5014 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5015 return -EINVAL; 5016 } 5017 5018 if (md_buf && !_is_buf_allocated(&iov)) { 5019 return -EINVAL; 5020 } 5021 5022 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5023 cb, cb_arg); 5024 } 5025 5026 int 5027 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5028 struct iovec *iov, int iovcnt, 5029 uint64_t offset, uint64_t nbytes, 5030 spdk_bdev_io_completion_cb cb, void *cb_arg) 5031 { 5032 uint64_t offset_blocks, num_blocks; 5033 5034 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5035 nbytes, &num_blocks) != 0) { 5036 return -EINVAL; 5037 } 5038 5039 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5040 } 5041 5042 static int 5043 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5044 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 5045 uint64_t num_blocks, struct spdk_memory_domain *domain, void 
*domain_ctx, 5046 struct spdk_accel_sequence *seq, 5047 spdk_bdev_io_completion_cb cb, void *cb_arg) 5048 { 5049 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5050 struct spdk_bdev_io *bdev_io; 5051 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5052 5053 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5054 return -EINVAL; 5055 } 5056 5057 bdev_io = bdev_channel_get_io(channel); 5058 if (!bdev_io) { 5059 return -ENOMEM; 5060 } 5061 5062 bdev_io->internal.ch = channel; 5063 bdev_io->internal.desc = desc; 5064 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5065 bdev_io->u.bdev.iovs = iov; 5066 bdev_io->u.bdev.iovcnt = iovcnt; 5067 bdev_io->u.bdev.md_buf = md_buf; 5068 bdev_io->u.bdev.num_blocks = num_blocks; 5069 bdev_io->u.bdev.offset_blocks = offset_blocks; 5070 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5071 bdev_io->internal.memory_domain = domain; 5072 bdev_io->internal.memory_domain_ctx = domain_ctx; 5073 bdev_io->internal.accel_sequence = seq; 5074 bdev_io->u.bdev.memory_domain = domain; 5075 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5076 bdev_io->u.bdev.accel_sequence = seq; 5077 5078 _bdev_io_submit_ext(desc, bdev_io); 5079 5080 return 0; 5081 } 5082 5083 int 5084 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5085 struct iovec *iov, int iovcnt, 5086 uint64_t offset_blocks, uint64_t num_blocks, 5087 spdk_bdev_io_completion_cb cb, void *cb_arg) 5088 { 5089 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5090 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5091 } 5092 5093 int 5094 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5095 struct iovec *iov, int iovcnt, void *md_buf, 5096 uint64_t offset_blocks, uint64_t num_blocks, 5097 spdk_bdev_io_completion_cb cb, void *cb_arg) 5098 { 5099 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5100 return -EINVAL; 5101 } 5102 5103 if (md_buf && !_is_buf_allocated(iov)) { 5104 return -EINVAL; 5105 } 5106 5107 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5108 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5109 } 5110 5111 static inline bool 5112 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 5113 { 5114 /* 5115 * We check if opts size is at least of size when we first introduced 5116 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 5117 * are not checked internal. 
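 * Only members that fall within the caller-provided opts->size may be dereferenced, so the
 * size must cover at least the original layout (up to and including metadata) and must not
 * exceed the current structure.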
5118 */ 5119 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5120 sizeof(opts->metadata) && 5121 opts->size <= sizeof(*opts) && 5122 /* When memory domain is used, the user must provide data buffers */ 5123 (!opts->memory_domain || (iov && iov[0].iov_base)); 5124 } 5125 5126 int 5127 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5128 struct iovec *iov, int iovcnt, 5129 uint64_t offset_blocks, uint64_t num_blocks, 5130 spdk_bdev_io_completion_cb cb, void *cb_arg, 5131 struct spdk_bdev_ext_io_opts *opts) 5132 { 5133 void *md = NULL; 5134 5135 if (opts) { 5136 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5137 return -EINVAL; 5138 } 5139 md = opts->metadata; 5140 } 5141 5142 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5143 return -EINVAL; 5144 } 5145 5146 if (md && !_is_buf_allocated(iov)) { 5147 return -EINVAL; 5148 } 5149 5150 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5151 num_blocks, 5152 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5153 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5154 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5155 cb, cb_arg); 5156 } 5157 5158 static int 5159 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5160 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5161 spdk_bdev_io_completion_cb cb, void *cb_arg) 5162 { 5163 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5164 struct spdk_bdev_io *bdev_io; 5165 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5166 5167 if (!desc->write) { 5168 return -EBADF; 5169 } 5170 5171 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5172 return -EINVAL; 5173 } 5174 5175 bdev_io = bdev_channel_get_io(channel); 5176 if (!bdev_io) { 5177 return -ENOMEM; 5178 } 5179 5180 bdev_io->internal.ch = channel; 5181 bdev_io->internal.desc = desc; 5182 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5183 bdev_io->u.bdev.iovs = &bdev_io->iov; 5184 bdev_io->u.bdev.iovs[0].iov_base = buf; 5185 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5186 bdev_io->u.bdev.iovcnt = 1; 5187 bdev_io->u.bdev.md_buf = md_buf; 5188 bdev_io->u.bdev.num_blocks = num_blocks; 5189 bdev_io->u.bdev.offset_blocks = offset_blocks; 5190 bdev_io->u.bdev.memory_domain = NULL; 5191 bdev_io->u.bdev.memory_domain_ctx = NULL; 5192 bdev_io->u.bdev.accel_sequence = NULL; 5193 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5194 5195 bdev_io_submit(bdev_io); 5196 return 0; 5197 } 5198 5199 int 5200 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5201 void *buf, uint64_t offset, uint64_t nbytes, 5202 spdk_bdev_io_completion_cb cb, void *cb_arg) 5203 { 5204 uint64_t offset_blocks, num_blocks; 5205 5206 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5207 nbytes, &num_blocks) != 0) { 5208 return -EINVAL; 5209 } 5210 5211 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5212 } 5213 5214 int 5215 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5216 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5217 spdk_bdev_io_completion_cb cb, void *cb_arg) 5218 { 5219 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5220 cb, cb_arg); 5221 } 5222 5223 int 5224 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5225 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t 
num_blocks, 5226 spdk_bdev_io_completion_cb cb, void *cb_arg) 5227 { 5228 struct iovec iov = { 5229 .iov_base = buf, 5230 }; 5231 5232 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5233 return -EINVAL; 5234 } 5235 5236 if (md_buf && !_is_buf_allocated(&iov)) { 5237 return -EINVAL; 5238 } 5239 5240 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5241 cb, cb_arg); 5242 } 5243 5244 static int 5245 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5246 struct iovec *iov, int iovcnt, void *md_buf, 5247 uint64_t offset_blocks, uint64_t num_blocks, 5248 struct spdk_memory_domain *domain, void *domain_ctx, 5249 struct spdk_accel_sequence *seq, 5250 spdk_bdev_io_completion_cb cb, void *cb_arg) 5251 { 5252 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5253 struct spdk_bdev_io *bdev_io; 5254 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5255 5256 if (!desc->write) { 5257 return -EBADF; 5258 } 5259 5260 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5261 return -EINVAL; 5262 } 5263 5264 bdev_io = bdev_channel_get_io(channel); 5265 if (!bdev_io) { 5266 return -ENOMEM; 5267 } 5268 5269 bdev_io->internal.ch = channel; 5270 bdev_io->internal.desc = desc; 5271 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5272 bdev_io->u.bdev.iovs = iov; 5273 bdev_io->u.bdev.iovcnt = iovcnt; 5274 bdev_io->u.bdev.md_buf = md_buf; 5275 bdev_io->u.bdev.num_blocks = num_blocks; 5276 bdev_io->u.bdev.offset_blocks = offset_blocks; 5277 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5278 bdev_io->internal.memory_domain = domain; 5279 bdev_io->internal.memory_domain_ctx = domain_ctx; 5280 bdev_io->internal.accel_sequence = seq; 5281 bdev_io->u.bdev.memory_domain = domain; 5282 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5283 bdev_io->u.bdev.accel_sequence = seq; 5284 5285 _bdev_io_submit_ext(desc, bdev_io); 5286 5287 return 0; 5288 } 5289 5290 int 5291 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5292 struct iovec *iov, int iovcnt, 5293 uint64_t offset, uint64_t len, 5294 spdk_bdev_io_completion_cb cb, void *cb_arg) 5295 { 5296 uint64_t offset_blocks, num_blocks; 5297 5298 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5299 len, &num_blocks) != 0) { 5300 return -EINVAL; 5301 } 5302 5303 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5304 } 5305 5306 int 5307 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5308 struct iovec *iov, int iovcnt, 5309 uint64_t offset_blocks, uint64_t num_blocks, 5310 spdk_bdev_io_completion_cb cb, void *cb_arg) 5311 { 5312 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5313 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5314 } 5315 5316 int 5317 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5318 struct iovec *iov, int iovcnt, void *md_buf, 5319 uint64_t offset_blocks, uint64_t num_blocks, 5320 spdk_bdev_io_completion_cb cb, void *cb_arg) 5321 { 5322 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5323 return -EINVAL; 5324 } 5325 5326 if (md_buf && !_is_buf_allocated(iov)) { 5327 return -EINVAL; 5328 } 5329 5330 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5331 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5332 } 5333 5334 int 5335 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel 
*ch, 5336 struct iovec *iov, int iovcnt, 5337 uint64_t offset_blocks, uint64_t num_blocks, 5338 spdk_bdev_io_completion_cb cb, void *cb_arg, 5339 struct spdk_bdev_ext_io_opts *opts) 5340 { 5341 void *md = NULL; 5342 5343 if (opts) { 5344 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5345 return -EINVAL; 5346 } 5347 md = opts->metadata; 5348 } 5349 5350 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5351 return -EINVAL; 5352 } 5353 5354 if (md && !_is_buf_allocated(iov)) { 5355 return -EINVAL; 5356 } 5357 5358 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5359 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5360 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5361 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5362 cb, cb_arg); 5363 } 5364 5365 static void 5366 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5367 { 5368 struct spdk_bdev_io *parent_io = cb_arg; 5369 struct spdk_bdev *bdev = parent_io->bdev; 5370 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5371 int i, rc = 0; 5372 5373 if (!success) { 5374 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5375 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5376 spdk_bdev_free_io(bdev_io); 5377 return; 5378 } 5379 5380 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5381 rc = memcmp(read_buf, 5382 parent_io->u.bdev.iovs[i].iov_base, 5383 parent_io->u.bdev.iovs[i].iov_len); 5384 if (rc) { 5385 break; 5386 } 5387 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5388 } 5389 5390 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 5391 rc = memcmp(bdev_io->u.bdev.md_buf, 5392 parent_io->u.bdev.md_buf, 5393 spdk_bdev_get_md_size(bdev)); 5394 } 5395 5396 spdk_bdev_free_io(bdev_io); 5397 5398 if (rc == 0) { 5399 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5400 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5401 } else { 5402 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5403 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5404 } 5405 } 5406 5407 static void 5408 bdev_compare_do_read(void *_bdev_io) 5409 { 5410 struct spdk_bdev_io *bdev_io = _bdev_io; 5411 int rc; 5412 5413 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5414 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5415 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5416 bdev_compare_do_read_done, bdev_io); 5417 5418 if (rc == -ENOMEM) { 5419 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5420 } else if (rc != 0) { 5421 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5422 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5423 } 5424 } 5425 5426 static int 5427 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5428 struct iovec *iov, int iovcnt, void *md_buf, 5429 uint64_t offset_blocks, uint64_t num_blocks, 5430 spdk_bdev_io_completion_cb cb, void *cb_arg) 5431 { 5432 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5433 struct spdk_bdev_io *bdev_io; 5434 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5435 5436 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5437 return -EINVAL; 5438 } 5439 5440 bdev_io = bdev_channel_get_io(channel); 5441 if (!bdev_io) { 5442 return -ENOMEM; 5443 } 5444 5445 bdev_io->internal.ch = channel; 5446 bdev_io->internal.desc = desc; 5447 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 
5448 bdev_io->u.bdev.iovs = iov; 5449 bdev_io->u.bdev.iovcnt = iovcnt; 5450 bdev_io->u.bdev.md_buf = md_buf; 5451 bdev_io->u.bdev.num_blocks = num_blocks; 5452 bdev_io->u.bdev.offset_blocks = offset_blocks; 5453 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5454 bdev_io->u.bdev.memory_domain = NULL; 5455 bdev_io->u.bdev.memory_domain_ctx = NULL; 5456 bdev_io->u.bdev.accel_sequence = NULL; 5457 5458 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5459 bdev_io_submit(bdev_io); 5460 return 0; 5461 } 5462 5463 bdev_compare_do_read(bdev_io); 5464 5465 return 0; 5466 } 5467 5468 int 5469 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5470 struct iovec *iov, int iovcnt, 5471 uint64_t offset_blocks, uint64_t num_blocks, 5472 spdk_bdev_io_completion_cb cb, void *cb_arg) 5473 { 5474 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5475 num_blocks, cb, cb_arg); 5476 } 5477 5478 int 5479 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5480 struct iovec *iov, int iovcnt, void *md_buf, 5481 uint64_t offset_blocks, uint64_t num_blocks, 5482 spdk_bdev_io_completion_cb cb, void *cb_arg) 5483 { 5484 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5485 return -EINVAL; 5486 } 5487 5488 if (md_buf && !_is_buf_allocated(iov)) { 5489 return -EINVAL; 5490 } 5491 5492 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5493 num_blocks, cb, cb_arg); 5494 } 5495 5496 static int 5497 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5498 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5499 spdk_bdev_io_completion_cb cb, void *cb_arg) 5500 { 5501 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5502 struct spdk_bdev_io *bdev_io; 5503 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5504 5505 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5506 return -EINVAL; 5507 } 5508 5509 bdev_io = bdev_channel_get_io(channel); 5510 if (!bdev_io) { 5511 return -ENOMEM; 5512 } 5513 5514 bdev_io->internal.ch = channel; 5515 bdev_io->internal.desc = desc; 5516 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5517 bdev_io->u.bdev.iovs = &bdev_io->iov; 5518 bdev_io->u.bdev.iovs[0].iov_base = buf; 5519 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5520 bdev_io->u.bdev.iovcnt = 1; 5521 bdev_io->u.bdev.md_buf = md_buf; 5522 bdev_io->u.bdev.num_blocks = num_blocks; 5523 bdev_io->u.bdev.offset_blocks = offset_blocks; 5524 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5525 bdev_io->u.bdev.memory_domain = NULL; 5526 bdev_io->u.bdev.memory_domain_ctx = NULL; 5527 bdev_io->u.bdev.accel_sequence = NULL; 5528 5529 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5530 bdev_io_submit(bdev_io); 5531 return 0; 5532 } 5533 5534 bdev_compare_do_read(bdev_io); 5535 5536 return 0; 5537 } 5538 5539 int 5540 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5541 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5542 spdk_bdev_io_completion_cb cb, void *cb_arg) 5543 { 5544 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5545 cb, cb_arg); 5546 } 5547 5548 int 5549 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5550 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5551 spdk_bdev_io_completion_cb cb, void *cb_arg) 5552 { 5553 struct iovec iov = { 
5554 .iov_base = buf, 5555 }; 5556 5557 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5558 return -EINVAL; 5559 } 5560 5561 if (md_buf && !_is_buf_allocated(&iov)) { 5562 return -EINVAL; 5563 } 5564 5565 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5566 cb, cb_arg); 5567 } 5568 5569 static void 5570 bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) 5571 { 5572 struct spdk_bdev_io *bdev_io = ctx; 5573 5574 if (unlock_status) { 5575 SPDK_ERRLOG("LBA range unlock failed\n"); 5576 } 5577 5578 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 5579 false, bdev_io->internal.caller_ctx); 5580 } 5581 5582 static void 5583 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 5584 { 5585 bdev_io->internal.status = status; 5586 5587 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 5588 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5589 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 5590 } 5591 5592 static void 5593 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5594 { 5595 struct spdk_bdev_io *parent_io = cb_arg; 5596 5597 if (!success) { 5598 SPDK_ERRLOG("Compare and write operation failed\n"); 5599 } 5600 5601 spdk_bdev_free_io(bdev_io); 5602 5603 bdev_comparev_and_writev_blocks_unlock(parent_io, 5604 success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 5605 } 5606 5607 static void 5608 bdev_compare_and_write_do_write(void *_bdev_io) 5609 { 5610 struct spdk_bdev_io *bdev_io = _bdev_io; 5611 int rc; 5612 5613 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 5614 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5615 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 5616 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5617 bdev_compare_and_write_do_write_done, bdev_io); 5618 5619 5620 if (rc == -ENOMEM) { 5621 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 5622 } else if (rc != 0) { 5623 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5624 } 5625 } 5626 5627 static void 5628 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5629 { 5630 struct spdk_bdev_io *parent_io = cb_arg; 5631 5632 spdk_bdev_free_io(bdev_io); 5633 5634 if (!success) { 5635 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 5636 return; 5637 } 5638 5639 bdev_compare_and_write_do_write(parent_io); 5640 } 5641 5642 static void 5643 bdev_compare_and_write_do_compare(void *_bdev_io) 5644 { 5645 struct spdk_bdev_io *bdev_io = _bdev_io; 5646 int rc; 5647 5648 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 5649 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 5650 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5651 bdev_compare_and_write_do_compare_done, bdev_io); 5652 5653 if (rc == -ENOMEM) { 5654 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 5655 } else if (rc != 0) { 5656 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 5657 } 5658 } 5659 5660 static void 5661 bdev_comparev_and_writev_blocks_locked(void *ctx, int status) 5662 { 5663 struct spdk_bdev_io *bdev_io = ctx; 5664 5665 if (status) { 5666 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 5667 
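		/* The LBA range lock could not be acquired, so the fused compare-and-write
		 * is failed here without issuing either the compare or the write. */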
bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5668 return; 5669 } 5670 5671 bdev_compare_and_write_do_compare(bdev_io); 5672 } 5673 5674 int 5675 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5676 struct iovec *compare_iov, int compare_iovcnt, 5677 struct iovec *write_iov, int write_iovcnt, 5678 uint64_t offset_blocks, uint64_t num_blocks, 5679 spdk_bdev_io_completion_cb cb, void *cb_arg) 5680 { 5681 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5682 struct spdk_bdev_io *bdev_io; 5683 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5684 5685 if (!desc->write) { 5686 return -EBADF; 5687 } 5688 5689 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5690 return -EINVAL; 5691 } 5692 5693 if (num_blocks > bdev->acwu) { 5694 return -EINVAL; 5695 } 5696 5697 bdev_io = bdev_channel_get_io(channel); 5698 if (!bdev_io) { 5699 return -ENOMEM; 5700 } 5701 5702 bdev_io->internal.ch = channel; 5703 bdev_io->internal.desc = desc; 5704 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 5705 bdev_io->u.bdev.iovs = compare_iov; 5706 bdev_io->u.bdev.iovcnt = compare_iovcnt; 5707 bdev_io->u.bdev.fused_iovs = write_iov; 5708 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 5709 bdev_io->u.bdev.md_buf = NULL; 5710 bdev_io->u.bdev.num_blocks = num_blocks; 5711 bdev_io->u.bdev.offset_blocks = offset_blocks; 5712 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5713 bdev_io->u.bdev.memory_domain = NULL; 5714 bdev_io->u.bdev.memory_domain_ctx = NULL; 5715 bdev_io->u.bdev.accel_sequence = NULL; 5716 5717 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 5718 bdev_io_submit(bdev_io); 5719 return 0; 5720 } 5721 5722 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 5723 bdev_comparev_and_writev_blocks_locked, bdev_io); 5724 } 5725 5726 int 5727 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5728 struct iovec *iov, int iovcnt, 5729 uint64_t offset_blocks, uint64_t num_blocks, 5730 bool populate, 5731 spdk_bdev_io_completion_cb cb, void *cb_arg) 5732 { 5733 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5734 struct spdk_bdev_io *bdev_io; 5735 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5736 5737 if (!desc->write) { 5738 return -EBADF; 5739 } 5740 5741 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5742 return -EINVAL; 5743 } 5744 5745 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 5746 return -ENOTSUP; 5747 } 5748 5749 bdev_io = bdev_channel_get_io(channel); 5750 if (!bdev_io) { 5751 return -ENOMEM; 5752 } 5753 5754 bdev_io->internal.ch = channel; 5755 bdev_io->internal.desc = desc; 5756 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 5757 bdev_io->u.bdev.num_blocks = num_blocks; 5758 bdev_io->u.bdev.offset_blocks = offset_blocks; 5759 bdev_io->u.bdev.iovs = iov; 5760 bdev_io->u.bdev.iovcnt = iovcnt; 5761 bdev_io->u.bdev.md_buf = NULL; 5762 bdev_io->u.bdev.zcopy.populate = populate ? 
1 : 0; 5763 bdev_io->u.bdev.zcopy.commit = 0; 5764 bdev_io->u.bdev.zcopy.start = 1; 5765 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5766 bdev_io->u.bdev.memory_domain = NULL; 5767 bdev_io->u.bdev.memory_domain_ctx = NULL; 5768 bdev_io->u.bdev.accel_sequence = NULL; 5769 5770 bdev_io_submit(bdev_io); 5771 5772 return 0; 5773 } 5774 5775 int 5776 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 5777 spdk_bdev_io_completion_cb cb, void *cb_arg) 5778 { 5779 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 5780 return -EINVAL; 5781 } 5782 5783 bdev_io->u.bdev.zcopy.commit = commit ? 1 : 0; 5784 bdev_io->u.bdev.zcopy.start = 0; 5785 bdev_io->internal.caller_ctx = cb_arg; 5786 bdev_io->internal.cb = cb; 5787 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 5788 5789 bdev_io_submit(bdev_io); 5790 5791 return 0; 5792 } 5793 5794 int 5795 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5796 uint64_t offset, uint64_t len, 5797 spdk_bdev_io_completion_cb cb, void *cb_arg) 5798 { 5799 uint64_t offset_blocks, num_blocks; 5800 5801 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5802 len, &num_blocks) != 0) { 5803 return -EINVAL; 5804 } 5805 5806 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5807 } 5808 5809 int 5810 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5811 uint64_t offset_blocks, uint64_t num_blocks, 5812 spdk_bdev_io_completion_cb cb, void *cb_arg) 5813 { 5814 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5815 struct spdk_bdev_io *bdev_io; 5816 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5817 5818 if (!desc->write) { 5819 return -EBADF; 5820 } 5821 5822 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5823 return -EINVAL; 5824 } 5825 5826 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 5827 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 5828 return -ENOTSUP; 5829 } 5830 5831 bdev_io = bdev_channel_get_io(channel); 5832 5833 if (!bdev_io) { 5834 return -ENOMEM; 5835 } 5836 5837 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 5838 bdev_io->internal.ch = channel; 5839 bdev_io->internal.desc = desc; 5840 bdev_io->u.bdev.offset_blocks = offset_blocks; 5841 bdev_io->u.bdev.num_blocks = num_blocks; 5842 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5843 bdev_io->u.bdev.memory_domain = NULL; 5844 bdev_io->u.bdev.memory_domain_ctx = NULL; 5845 bdev_io->u.bdev.accel_sequence = NULL; 5846 5847 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 5848 bdev_io_submit(bdev_io); 5849 return 0; 5850 } 5851 5852 assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); 5853 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 5854 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 5855 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 5856 bdev_write_zero_buffer_next(bdev_io); 5857 5858 return 0; 5859 } 5860 5861 int 5862 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5863 uint64_t offset, uint64_t nbytes, 5864 spdk_bdev_io_completion_cb cb, void *cb_arg) 5865 { 5866 uint64_t offset_blocks, num_blocks; 5867 5868 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5869 nbytes, &num_blocks) != 0) { 5870 return -EINVAL; 5871 } 5872 5873 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5874 } 5875 5876 int 5877 spdk_bdev_unmap_blocks(struct spdk_bdev_desc 
*desc, struct spdk_io_channel *ch, 5878 uint64_t offset_blocks, uint64_t num_blocks, 5879 spdk_bdev_io_completion_cb cb, void *cb_arg) 5880 { 5881 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5882 struct spdk_bdev_io *bdev_io; 5883 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5884 5885 if (!desc->write) { 5886 return -EBADF; 5887 } 5888 5889 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5890 return -EINVAL; 5891 } 5892 5893 if (num_blocks == 0) { 5894 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 5895 return -EINVAL; 5896 } 5897 5898 bdev_io = bdev_channel_get_io(channel); 5899 if (!bdev_io) { 5900 return -ENOMEM; 5901 } 5902 5903 bdev_io->internal.ch = channel; 5904 bdev_io->internal.desc = desc; 5905 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 5906 5907 bdev_io->u.bdev.iovs = &bdev_io->iov; 5908 bdev_io->u.bdev.iovs[0].iov_base = NULL; 5909 bdev_io->u.bdev.iovs[0].iov_len = 0; 5910 bdev_io->u.bdev.iovcnt = 1; 5911 5912 bdev_io->u.bdev.offset_blocks = offset_blocks; 5913 bdev_io->u.bdev.num_blocks = num_blocks; 5914 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5915 bdev_io->u.bdev.memory_domain = NULL; 5916 bdev_io->u.bdev.memory_domain_ctx = NULL; 5917 bdev_io->u.bdev.accel_sequence = NULL; 5918 5919 bdev_io_submit(bdev_io); 5920 return 0; 5921 } 5922 5923 int 5924 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5925 uint64_t offset, uint64_t length, 5926 spdk_bdev_io_completion_cb cb, void *cb_arg) 5927 { 5928 uint64_t offset_blocks, num_blocks; 5929 5930 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5931 length, &num_blocks) != 0) { 5932 return -EINVAL; 5933 } 5934 5935 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5936 } 5937 5938 int 5939 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5940 uint64_t offset_blocks, uint64_t num_blocks, 5941 spdk_bdev_io_completion_cb cb, void *cb_arg) 5942 { 5943 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5944 struct spdk_bdev_io *bdev_io; 5945 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5946 5947 if (!desc->write) { 5948 return -EBADF; 5949 } 5950 5951 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5952 return -EINVAL; 5953 } 5954 5955 bdev_io = bdev_channel_get_io(channel); 5956 if (!bdev_io) { 5957 return -ENOMEM; 5958 } 5959 5960 bdev_io->internal.ch = channel; 5961 bdev_io->internal.desc = desc; 5962 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 5963 bdev_io->u.bdev.iovs = NULL; 5964 bdev_io->u.bdev.iovcnt = 0; 5965 bdev_io->u.bdev.offset_blocks = offset_blocks; 5966 bdev_io->u.bdev.num_blocks = num_blocks; 5967 bdev_io->u.bdev.memory_domain = NULL; 5968 bdev_io->u.bdev.memory_domain_ctx = NULL; 5969 bdev_io->u.bdev.accel_sequence = NULL; 5970 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5971 5972 bdev_io_submit(bdev_io); 5973 return 0; 5974 } 5975 5976 static int bdev_reset_poll_for_outstanding_io(void *ctx); 5977 5978 static void 5979 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 5980 { 5981 struct spdk_bdev_channel *ch = _ctx; 5982 struct spdk_bdev_io *bdev_io; 5983 5984 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5985 5986 if (status == -EBUSY) { 5987 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 5988 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 5989 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 5990 } else { 5991 TAILQ_REMOVE(&ch->queued_resets, 
bdev_io, internal.link);

			if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) {
				/* If outstanding IOs are still present and reset_io_drain_timeout
				 * seconds passed, start the reset. */
				bdev_io_submit_reset(bdev_io);
			} else {
				/* We still have in progress memory domain pull/push or we're
				 * executing accel sequence. Since we cannot abort either of those
				 * operations, fail the reset request. */
				spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
			}
		}
	} else {
		TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
		SPDK_DEBUGLOG(bdev,
			      "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n",
			      ch->bdev->name);
		/* Mark the completion status as a SUCCESS and complete the reset. */
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	}
}

static void
bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				struct spdk_io_channel *io_ch, void *_ctx)
{
	struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch);
	int status = 0;

	if (cur_ch->io_outstanding > 0 ||
	    !TAILQ_EMPTY(&cur_ch->io_memory_domain) ||
	    !TAILQ_EMPTY(&cur_ch->io_accel_exec)) {
		/* If a channel has outstanding IO, set status to -EBUSY code. This will stop
		 * further iteration over the rest of the channels and pass non-zero status
		 * to the callback function. */
		status = -EBUSY;
	}
	spdk_bdev_for_each_channel_continue(i, status);
}

static int
bdev_reset_poll_for_outstanding_io(void *ctx)
{
	struct spdk_bdev_channel *ch = ctx;
	struct spdk_bdev_io *bdev_io;

	bdev_io = TAILQ_FIRST(&ch->queued_resets);

	spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller);
	spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch,
				   bdev_reset_check_outstanding_io_done);

	return SPDK_POLLER_BUSY;
}

static void
bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct spdk_bdev_channel *ch = _ctx;
	struct spdk_bdev_io *bdev_io;

	bdev_io = TAILQ_FIRST(&ch->queued_resets);

	if (bdev->reset_io_drain_timeout == 0) {
		TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);

		bdev_io_submit_reset(bdev_io);
		return;
	}

	bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() +
			(ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz());

	/* In case bdev->reset_io_drain_timeout is not equal to zero,
	 * submit the reset to the underlying module only if outstanding I/O
	 * remain after reset_io_drain_timeout seconds have passed.
	 */
	spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch,
				   bdev_reset_check_outstanding_io_done);
}

static void
bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
			  struct spdk_io_channel *ch, void *_ctx)
{
	struct spdk_bdev_channel *channel;
	struct spdk_bdev_mgmt_channel *mgmt_channel;
	struct spdk_bdev_shared_resource *shared_resource;
	bdev_io_tailq_t tmp_queued;

	TAILQ_INIT(&tmp_queued);

	channel = __io_ch_to_bdev_ch(ch);
	shared_resource = channel->shared_resource;
	mgmt_channel = shared_resource->mgmt_ch;

	channel->flags |= BDEV_CH_RESET_IN_PROGRESS;

	if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) {
		/* The QoS object is always valid and readable while
		 * the channel flag is set, so the lock here should not
		 * be necessary. We're not in the fast path though, so
		 * just take it anyway. */
		spdk_spin_lock(&channel->bdev->internal.spinlock);
		if (channel->bdev->internal.qos->ch == channel) {
			TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link);
		}
		spdk_spin_unlock(&channel->bdev->internal.spinlock);
	}

	bdev_abort_all_queued_io(&shared_resource->nomem_io, channel);
	bdev_abort_all_buf_io(mgmt_channel, channel);
	bdev_abort_all_queued_io(&tmp_queued, channel);

	spdk_bdev_for_each_channel_continue(i, 0);
}

static void
bdev_start_reset(void *ctx)
{
	struct spdk_bdev_channel *ch = ctx;

	spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch,
				   bdev_reset_freeze_channel_done);
}

static void
bdev_channel_start_reset(struct spdk_bdev_channel *ch)
{
	struct spdk_bdev *bdev = ch->bdev;

	assert(!TAILQ_EMPTY(&ch->queued_resets));

	spdk_spin_lock(&bdev->internal.spinlock);
	if (bdev->internal.reset_in_progress == NULL) {
		bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets);
		/*
		 * Take a channel reference for the target bdev for the life of this
		 * reset. This guards against the channel getting destroyed while
		 * spdk_bdev_for_each_channel() calls related to this reset IO are in
		 * progress. We will release the reference when this reset is
		 * completed.
6134 */ 6135 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6136 bdev_start_reset(ch); 6137 } 6138 spdk_spin_unlock(&bdev->internal.spinlock); 6139 } 6140 6141 int 6142 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6143 spdk_bdev_io_completion_cb cb, void *cb_arg) 6144 { 6145 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6146 struct spdk_bdev_io *bdev_io; 6147 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6148 6149 bdev_io = bdev_channel_get_io(channel); 6150 if (!bdev_io) { 6151 return -ENOMEM; 6152 } 6153 6154 bdev_io->internal.ch = channel; 6155 bdev_io->internal.desc = desc; 6156 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6157 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6158 bdev_io->u.reset.ch_ref = NULL; 6159 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6160 6161 spdk_spin_lock(&bdev->internal.spinlock); 6162 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 6163 spdk_spin_unlock(&bdev->internal.spinlock); 6164 6165 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 6166 internal.ch_link); 6167 6168 bdev_channel_start_reset(channel); 6169 6170 return 0; 6171 } 6172 6173 void 6174 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6175 struct spdk_bdev_io_stat *stat) 6176 { 6177 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6178 6179 bdev_get_io_stat(stat, channel->stat); 6180 } 6181 6182 static void 6183 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6184 { 6185 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6186 6187 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6188 bdev_iostat_ctx->cb_arg, 0); 6189 free(bdev_iostat_ctx); 6190 } 6191 6192 static void 6193 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6194 struct spdk_io_channel *ch, void *_ctx) 6195 { 6196 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6197 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6198 6199 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6200 spdk_bdev_for_each_channel_continue(i, 0); 6201 } 6202 6203 void 6204 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6205 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6206 { 6207 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6208 6209 assert(bdev != NULL); 6210 assert(stat != NULL); 6211 assert(cb != NULL); 6212 6213 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6214 if (bdev_iostat_ctx == NULL) { 6215 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6216 cb(bdev, stat, cb_arg, -ENOMEM); 6217 return; 6218 } 6219 6220 bdev_iostat_ctx->stat = stat; 6221 bdev_iostat_ctx->cb = cb; 6222 bdev_iostat_ctx->cb_arg = cb_arg; 6223 6224 /* Start with the statistics from previously deleted channels. */ 6225 spdk_spin_lock(&bdev->internal.spinlock); 6226 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6227 spdk_spin_unlock(&bdev->internal.spinlock); 6228 6229 /* Then iterate and add the statistics from each existing channel. 
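	 * The totals reported to the caller therefore cover both deleted and
	 * currently open channels.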
*/ 6230 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6231 bdev_get_device_stat_done); 6232 } 6233 6234 struct bdev_iostat_reset_ctx { 6235 enum spdk_bdev_reset_stat_mode mode; 6236 bdev_reset_device_stat_cb cb; 6237 void *cb_arg; 6238 }; 6239 6240 static void 6241 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6242 { 6243 struct bdev_iostat_reset_ctx *ctx = _ctx; 6244 6245 ctx->cb(bdev, ctx->cb_arg, 0); 6246 6247 free(ctx); 6248 } 6249 6250 static void 6251 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6252 struct spdk_io_channel *ch, void *_ctx) 6253 { 6254 struct bdev_iostat_reset_ctx *ctx = _ctx; 6255 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6256 6257 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6258 6259 spdk_bdev_for_each_channel_continue(i, 0); 6260 } 6261 6262 void 6263 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6264 bdev_reset_device_stat_cb cb, void *cb_arg) 6265 { 6266 struct bdev_iostat_reset_ctx *ctx; 6267 6268 assert(bdev != NULL); 6269 assert(cb != NULL); 6270 6271 ctx = calloc(1, sizeof(*ctx)); 6272 if (ctx == NULL) { 6273 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6274 cb(bdev, cb_arg, -ENOMEM); 6275 return; 6276 } 6277 6278 ctx->mode = mode; 6279 ctx->cb = cb; 6280 ctx->cb_arg = cb_arg; 6281 6282 spdk_spin_lock(&bdev->internal.spinlock); 6283 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6284 spdk_spin_unlock(&bdev->internal.spinlock); 6285 6286 spdk_bdev_for_each_channel(bdev, 6287 bdev_reset_each_channel_stat, 6288 ctx, 6289 bdev_reset_device_stat_done); 6290 } 6291 6292 int 6293 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6294 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6295 spdk_bdev_io_completion_cb cb, void *cb_arg) 6296 { 6297 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6298 struct spdk_bdev_io *bdev_io; 6299 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6300 6301 if (!desc->write) { 6302 return -EBADF; 6303 } 6304 6305 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6306 return -ENOTSUP; 6307 } 6308 6309 bdev_io = bdev_channel_get_io(channel); 6310 if (!bdev_io) { 6311 return -ENOMEM; 6312 } 6313 6314 bdev_io->internal.ch = channel; 6315 bdev_io->internal.desc = desc; 6316 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6317 bdev_io->u.nvme_passthru.cmd = *cmd; 6318 bdev_io->u.nvme_passthru.buf = buf; 6319 bdev_io->u.nvme_passthru.nbytes = nbytes; 6320 bdev_io->u.nvme_passthru.md_buf = NULL; 6321 bdev_io->u.nvme_passthru.md_len = 0; 6322 6323 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6324 6325 bdev_io_submit(bdev_io); 6326 return 0; 6327 } 6328 6329 int 6330 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6331 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6332 spdk_bdev_io_completion_cb cb, void *cb_arg) 6333 { 6334 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6335 struct spdk_bdev_io *bdev_io; 6336 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6337 6338 if (!desc->write) { 6339 /* 6340 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6341 * to easily determine if the command is a read or write, but for now just 6342 * do not allow io_passthru with a read-only descriptor. 
6343 */ 6344 return -EBADF; 6345 } 6346 6347 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6348 return -ENOTSUP; 6349 } 6350 6351 bdev_io = bdev_channel_get_io(channel); 6352 if (!bdev_io) { 6353 return -ENOMEM; 6354 } 6355 6356 bdev_io->internal.ch = channel; 6357 bdev_io->internal.desc = desc; 6358 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6359 bdev_io->u.nvme_passthru.cmd = *cmd; 6360 bdev_io->u.nvme_passthru.buf = buf; 6361 bdev_io->u.nvme_passthru.nbytes = nbytes; 6362 bdev_io->u.nvme_passthru.md_buf = NULL; 6363 bdev_io->u.nvme_passthru.md_len = 0; 6364 6365 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6366 6367 bdev_io_submit(bdev_io); 6368 return 0; 6369 } 6370 6371 int 6372 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6373 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6374 spdk_bdev_io_completion_cb cb, void *cb_arg) 6375 { 6376 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6377 struct spdk_bdev_io *bdev_io; 6378 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6379 6380 if (!desc->write) { 6381 /* 6382 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6383 * to easily determine if the command is a read or write, but for now just 6384 * do not allow io_passthru with a read-only descriptor. 6385 */ 6386 return -EBADF; 6387 } 6388 6389 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6390 return -ENOTSUP; 6391 } 6392 6393 bdev_io = bdev_channel_get_io(channel); 6394 if (!bdev_io) { 6395 return -ENOMEM; 6396 } 6397 6398 bdev_io->internal.ch = channel; 6399 bdev_io->internal.desc = desc; 6400 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6401 bdev_io->u.nvme_passthru.cmd = *cmd; 6402 bdev_io->u.nvme_passthru.buf = buf; 6403 bdev_io->u.nvme_passthru.nbytes = nbytes; 6404 bdev_io->u.nvme_passthru.md_buf = md_buf; 6405 bdev_io->u.nvme_passthru.md_len = md_len; 6406 6407 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6408 6409 bdev_io_submit(bdev_io); 6410 return 0; 6411 } 6412 6413 static void bdev_abort_retry(void *ctx); 6414 static void bdev_abort(struct spdk_bdev_io *parent_io); 6415 6416 static void 6417 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6418 { 6419 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6420 struct spdk_bdev_io *parent_io = cb_arg; 6421 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6422 6423 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6424 6425 spdk_bdev_free_io(bdev_io); 6426 6427 if (!success) { 6428 /* Check if the target I/O completed in the meantime. */ 6429 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6430 if (tmp_io == bio_to_abort) { 6431 break; 6432 } 6433 } 6434 6435 /* If the target I/O still exists, set the parent to failed. 
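		 * If it is no longer on the submitted list, it completed while the abort
		 * was in flight and the failed abort can be ignored.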
*/ 6436 if (tmp_io != NULL) { 6437 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6438 } 6439 } 6440 6441 parent_io->u.bdev.split_outstanding--; 6442 if (parent_io->u.bdev.split_outstanding == 0) { 6443 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6444 bdev_abort_retry(parent_io); 6445 } else { 6446 bdev_io_complete(parent_io); 6447 } 6448 } 6449 } 6450 6451 static int 6452 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6453 struct spdk_bdev_io *bio_to_abort, 6454 spdk_bdev_io_completion_cb cb, void *cb_arg) 6455 { 6456 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6457 struct spdk_bdev_io *bdev_io; 6458 6459 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 6460 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 6461 /* TODO: Abort reset or abort request. */ 6462 return -ENOTSUP; 6463 } 6464 6465 bdev_io = bdev_channel_get_io(channel); 6466 if (bdev_io == NULL) { 6467 return -ENOMEM; 6468 } 6469 6470 bdev_io->internal.ch = channel; 6471 bdev_io->internal.desc = desc; 6472 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6473 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6474 6475 if (bdev->split_on_optimal_io_boundary && bio_to_abort->internal.split) { 6476 assert(bdev_io_should_split(bio_to_abort)); 6477 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 6478 6479 /* Parent abort request is not submitted directly, but to manage its 6480 * execution add it to the submitted list here. 6481 */ 6482 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6483 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6484 6485 bdev_abort(bdev_io); 6486 6487 return 0; 6488 } 6489 6490 bdev_io->u.abort.bio_to_abort = bio_to_abort; 6491 6492 /* Submit the abort request to the underlying bdev module. */ 6493 bdev_io_submit(bdev_io); 6494 6495 return 0; 6496 } 6497 6498 static bool 6499 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq) 6500 { 6501 struct spdk_bdev_io *iter; 6502 6503 TAILQ_FOREACH(iter, tailq, internal.link) { 6504 if (iter == bdev_io) { 6505 return true; 6506 } 6507 } 6508 6509 return false; 6510 } 6511 6512 static uint32_t 6513 _bdev_abort(struct spdk_bdev_io *parent_io) 6514 { 6515 struct spdk_bdev_desc *desc = parent_io->internal.desc; 6516 struct spdk_bdev_channel *channel = parent_io->internal.ch; 6517 void *bio_cb_arg; 6518 struct spdk_bdev_io *bio_to_abort; 6519 uint32_t matched_ios; 6520 int rc; 6521 6522 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 6523 6524 /* matched_ios is returned and will be kept by the caller. 6525 * 6526 * This function will be used for two cases, 1) the same cb_arg is used for 6527 * multiple I/Os, 2) a single large I/O is split into smaller ones. 6528 * Incrementing split_outstanding directly here may confuse readers especially 6529 * for the 1st case. 6530 * 6531 * Completion of I/O abort is processed after stack unwinding. Hence this trick 6532 * works as expected. 6533 */ 6534 matched_ios = 0; 6535 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6536 6537 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 6538 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 6539 continue; 6540 } 6541 6542 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 6543 /* Any I/O which was submitted after this abort command should be excluded. 
*/ 6544 continue; 6545 } 6546 6547 /* We can't abort a request that's being pushed/pulled or executed by accel */ 6548 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) || 6549 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) { 6550 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6551 break; 6552 } 6553 6554 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 6555 if (rc != 0) { 6556 if (rc == -ENOMEM) { 6557 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 6558 } else { 6559 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6560 } 6561 break; 6562 } 6563 matched_ios++; 6564 } 6565 6566 return matched_ios; 6567 } 6568 6569 static void 6570 bdev_abort_retry(void *ctx) 6571 { 6572 struct spdk_bdev_io *parent_io = ctx; 6573 uint32_t matched_ios; 6574 6575 matched_ios = _bdev_abort(parent_io); 6576 6577 if (matched_ios == 0) { 6578 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6579 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6580 } else { 6581 /* For retry, the case that no target I/O was found is success 6582 * because it means target I/Os completed in the meantime. 6583 */ 6584 bdev_io_complete(parent_io); 6585 } 6586 return; 6587 } 6588 6589 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6590 parent_io->u.bdev.split_outstanding = matched_ios; 6591 } 6592 6593 static void 6594 bdev_abort(struct spdk_bdev_io *parent_io) 6595 { 6596 uint32_t matched_ios; 6597 6598 matched_ios = _bdev_abort(parent_io); 6599 6600 if (matched_ios == 0) { 6601 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6602 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6603 } else { 6604 /* The case the no target I/O was found is failure. */ 6605 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6606 bdev_io_complete(parent_io); 6607 } 6608 return; 6609 } 6610 6611 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6612 parent_io->u.bdev.split_outstanding = matched_ios; 6613 } 6614 6615 int 6616 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6617 void *bio_cb_arg, 6618 spdk_bdev_io_completion_cb cb, void *cb_arg) 6619 { 6620 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6621 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6622 struct spdk_bdev_io *bdev_io; 6623 6624 if (bio_cb_arg == NULL) { 6625 return -EINVAL; 6626 } 6627 6628 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 6629 return -ENOTSUP; 6630 } 6631 6632 bdev_io = bdev_channel_get_io(channel); 6633 if (bdev_io == NULL) { 6634 return -ENOMEM; 6635 } 6636 6637 bdev_io->internal.ch = channel; 6638 bdev_io->internal.desc = desc; 6639 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6640 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6641 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6642 6643 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 6644 6645 /* Parent abort request is not submitted directly, but to manage its execution, 6646 * add it to the submitted list here. 
6647 */ 6648 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6649 6650 bdev_abort(bdev_io); 6651 6652 return 0; 6653 } 6654 6655 int 6656 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6657 struct spdk_bdev_io_wait_entry *entry) 6658 { 6659 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6660 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 6661 6662 if (bdev != entry->bdev) { 6663 SPDK_ERRLOG("bdevs do not match\n"); 6664 return -EINVAL; 6665 } 6666 6667 if (mgmt_ch->per_thread_cache_count > 0) { 6668 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 6669 return -EINVAL; 6670 } 6671 6672 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 6673 return 0; 6674 } 6675 6676 static inline void 6677 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 6678 { 6679 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 6680 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 6681 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 6682 uint32_t blocklen = bdev_io->bdev->blocklen; 6683 6684 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 6685 switch (bdev_io->type) { 6686 case SPDK_BDEV_IO_TYPE_READ: 6687 io_stat->bytes_read += num_blocks * blocklen; 6688 io_stat->num_read_ops++; 6689 io_stat->read_latency_ticks += tsc_diff; 6690 if (io_stat->max_read_latency_ticks < tsc_diff) { 6691 io_stat->max_read_latency_ticks = tsc_diff; 6692 } 6693 if (io_stat->min_read_latency_ticks > tsc_diff) { 6694 io_stat->min_read_latency_ticks = tsc_diff; 6695 } 6696 break; 6697 case SPDK_BDEV_IO_TYPE_WRITE: 6698 io_stat->bytes_written += num_blocks * blocklen; 6699 io_stat->num_write_ops++; 6700 io_stat->write_latency_ticks += tsc_diff; 6701 if (io_stat->max_write_latency_ticks < tsc_diff) { 6702 io_stat->max_write_latency_ticks = tsc_diff; 6703 } 6704 if (io_stat->min_write_latency_ticks > tsc_diff) { 6705 io_stat->min_write_latency_ticks = tsc_diff; 6706 } 6707 break; 6708 case SPDK_BDEV_IO_TYPE_UNMAP: 6709 io_stat->bytes_unmapped += num_blocks * blocklen; 6710 io_stat->num_unmap_ops++; 6711 io_stat->unmap_latency_ticks += tsc_diff; 6712 if (io_stat->max_unmap_latency_ticks < tsc_diff) { 6713 io_stat->max_unmap_latency_ticks = tsc_diff; 6714 } 6715 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 6716 io_stat->min_unmap_latency_ticks = tsc_diff; 6717 } 6718 break; 6719 case SPDK_BDEV_IO_TYPE_ZCOPY: 6720 /* Track the data in the start phase only */ 6721 if (bdev_io->u.bdev.zcopy.start) { 6722 if (bdev_io->u.bdev.zcopy.populate) { 6723 io_stat->bytes_read += num_blocks * blocklen; 6724 io_stat->num_read_ops++; 6725 io_stat->read_latency_ticks += tsc_diff; 6726 if (io_stat->max_read_latency_ticks < tsc_diff) { 6727 io_stat->max_read_latency_ticks = tsc_diff; 6728 } 6729 if (io_stat->min_read_latency_ticks > tsc_diff) { 6730 io_stat->min_read_latency_ticks = tsc_diff; 6731 } 6732 } else { 6733 io_stat->bytes_written += num_blocks * blocklen; 6734 io_stat->num_write_ops++; 6735 io_stat->write_latency_ticks += tsc_diff; 6736 if (io_stat->max_write_latency_ticks < tsc_diff) { 6737 io_stat->max_write_latency_ticks = tsc_diff; 6738 } 6739 if (io_stat->min_write_latency_ticks > tsc_diff) { 6740 io_stat->min_write_latency_ticks = tsc_diff; 6741 } 6742 } 6743 } 6744 break; 6745 case SPDK_BDEV_IO_TYPE_COPY: 6746 io_stat->bytes_copied += num_blocks * blocklen; 6747 io_stat->num_copy_ops++; 6748 bdev_io->internal.ch->stat->copy_latency_ticks += 
tsc_diff; 6749 if (io_stat->max_copy_latency_ticks < tsc_diff) { 6750 io_stat->max_copy_latency_ticks = tsc_diff; 6751 } 6752 if (io_stat->min_copy_latency_ticks > tsc_diff) { 6753 io_stat->min_copy_latency_ticks = tsc_diff; 6754 } 6755 break; 6756 default: 6757 break; 6758 } 6759 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 6760 io_stat = bdev_io->bdev->internal.stat; 6761 assert(io_stat->io_error != NULL); 6762 6763 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 6764 io_stat->io_error->error_status[-io_status - 1]++; 6765 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 6766 } 6767 6768 #ifdef SPDK_CONFIG_VTUNE 6769 uint64_t now_tsc = spdk_get_ticks(); 6770 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 6771 uint64_t data[5]; 6772 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 6773 6774 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 6775 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 6776 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 6777 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 6778 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 6779 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 6780 6781 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 6782 __itt_metadata_u64, 5, data); 6783 6784 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 6785 bdev_io->internal.ch->start_tsc = now_tsc; 6786 } 6787 #endif 6788 } 6789 6790 static inline void 6791 _bdev_io_complete(void *ctx) 6792 { 6793 struct spdk_bdev_io *bdev_io = ctx; 6794 6795 if (spdk_unlikely(bdev_io->internal.accel_sequence != NULL)) { 6796 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 6797 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 6798 } 6799 6800 assert(bdev_io->internal.cb != NULL); 6801 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 6802 6803 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 6804 bdev_io->internal.caller_ctx); 6805 } 6806 6807 static inline void 6808 bdev_io_complete(void *ctx) 6809 { 6810 struct spdk_bdev_io *bdev_io = ctx; 6811 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6812 uint64_t tsc, tsc_diff; 6813 6814 if (spdk_unlikely(bdev_io->internal.in_submit_request)) { 6815 /* 6816 * Defer completion to avoid potential infinite recursion if the 6817 * user's completion callback issues a new I/O. 6818 */ 6819 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6820 bdev_io_complete, bdev_io); 6821 return; 6822 } 6823 6824 tsc = spdk_get_ticks(); 6825 tsc_diff = tsc - bdev_io->internal.submit_tsc; 6826 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 6827 bdev_io->internal.caller_ctx); 6828 6829 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 6830 6831 if (bdev_io->internal.ch->histogram) { 6832 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 6833 } 6834 6835 bdev_io_update_io_stat(bdev_io, tsc_diff); 6836 _bdev_io_complete(bdev_io); 6837 } 6838 6839 /* The difference between this function and bdev_io_complete() is that this should be called to 6840 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 6841 * io_submitted list and don't have submit_tsc updated. 
6842 */ 6843 static inline void 6844 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 6845 { 6846 /* Since the IO hasn't been submitted it's bound to be failed */ 6847 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 6848 6849 /* At this point we don't know if the IO is completed from submission context or not, but, 6850 * since this is an error path, we can always do an spdk_thread_send_msg(). */ 6851 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6852 _bdev_io_complete, bdev_io); 6853 } 6854 6855 static void bdev_destroy_cb(void *io_device); 6856 6857 static void 6858 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 6859 { 6860 struct spdk_bdev_io *bdev_io = _ctx; 6861 6862 if (bdev_io->u.reset.ch_ref != NULL) { 6863 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 6864 bdev_io->u.reset.ch_ref = NULL; 6865 } 6866 6867 bdev_io_complete(bdev_io); 6868 6869 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 6870 TAILQ_EMPTY(&bdev->internal.open_descs)) { 6871 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6872 } 6873 } 6874 6875 static void 6876 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6877 struct spdk_io_channel *_ch, void *_ctx) 6878 { 6879 struct spdk_bdev_io *bdev_io = _ctx; 6880 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 6881 struct spdk_bdev_io *queued_reset; 6882 6883 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 6884 while (!TAILQ_EMPTY(&ch->queued_resets)) { 6885 queued_reset = TAILQ_FIRST(&ch->queued_resets); 6886 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 6887 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 6888 } 6889 6890 spdk_bdev_for_each_channel_continue(i, 0); 6891 } 6892 6893 static void 6894 bdev_io_complete_sequence_cb(void *ctx, int status) 6895 { 6896 struct spdk_bdev_io *bdev_io = ctx; 6897 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6898 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 6899 6900 /* u.bdev.accel_sequence should have already been cleared at this point */ 6901 assert(bdev_io->u.bdev.accel_sequence == NULL); 6902 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 6903 6904 TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 6905 bdev_io->internal.accel_sequence = NULL; 6906 6907 if (spdk_unlikely(status != 0)) { 6908 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 6909 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6910 } 6911 6912 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 6913 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 6914 return; 6915 } 6916 6917 bdev_io_complete(bdev_io); 6918 } 6919 6920 void 6921 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 6922 { 6923 struct spdk_bdev *bdev = bdev_io->bdev; 6924 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6925 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 6926 6927 if (bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING) { 6928 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 6929 spdk_bdev_get_module_name(bdev), 6930 bdev_io_status_get_string(bdev_io->internal.status)); 6931 assert(false); 6932 } 6933 bdev_io->internal.status = status; 6934 6935 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 6936 bool unlock_channels = false; 6937 6938 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 6939 
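			/* Resets are not requeued on NOMEM like other I/O, so log the
			 * unexpected status reported by the module. */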
SPDK_ERRLOG("NOMEM returned for reset\n"); 6940 } 6941 spdk_spin_lock(&bdev->internal.spinlock); 6942 if (bdev_io == bdev->internal.reset_in_progress) { 6943 bdev->internal.reset_in_progress = NULL; 6944 unlock_channels = true; 6945 } 6946 spdk_spin_unlock(&bdev->internal.spinlock); 6947 6948 if (unlock_channels) { 6949 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 6950 bdev_reset_complete); 6951 return; 6952 } 6953 } else { 6954 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io) && 6955 spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 6956 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 6957 return; 6958 } else if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0)) { 6959 _bdev_io_push_bounce_data_buffer(bdev_io, _bdev_io_complete_push_bounce_done); 6960 /* bdev IO will be completed in the callback */ 6961 return; 6962 } 6963 6964 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 6965 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 6966 return; 6967 } 6968 } 6969 6970 bdev_io_complete(bdev_io); 6971 } 6972 6973 void 6974 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 6975 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 6976 { 6977 enum spdk_bdev_io_status status; 6978 6979 if (sc == SPDK_SCSI_STATUS_GOOD) { 6980 status = SPDK_BDEV_IO_STATUS_SUCCESS; 6981 } else { 6982 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 6983 bdev_io->internal.error.scsi.sc = sc; 6984 bdev_io->internal.error.scsi.sk = sk; 6985 bdev_io->internal.error.scsi.asc = asc; 6986 bdev_io->internal.error.scsi.ascq = ascq; 6987 } 6988 6989 spdk_bdev_io_complete(bdev_io, status); 6990 } 6991 6992 void 6993 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 6994 int *sc, int *sk, int *asc, int *ascq) 6995 { 6996 assert(sc != NULL); 6997 assert(sk != NULL); 6998 assert(asc != NULL); 6999 assert(ascq != NULL); 7000 7001 switch (bdev_io->internal.status) { 7002 case SPDK_BDEV_IO_STATUS_SUCCESS: 7003 *sc = SPDK_SCSI_STATUS_GOOD; 7004 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7005 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7006 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7007 break; 7008 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7009 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7010 break; 7011 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7012 *sc = bdev_io->internal.error.scsi.sc; 7013 *sk = bdev_io->internal.error.scsi.sk; 7014 *asc = bdev_io->internal.error.scsi.asc; 7015 *ascq = bdev_io->internal.error.scsi.ascq; 7016 break; 7017 default: 7018 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7019 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7020 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7021 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7022 break; 7023 } 7024 } 7025 7026 void 7027 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7028 { 7029 enum spdk_bdev_io_status status; 7030 7031 if (aio_result == 0) { 7032 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7033 } else { 7034 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7035 } 7036 7037 bdev_io->internal.error.aio_result = aio_result; 7038 7039 spdk_bdev_io_complete(bdev_io, status); 7040 } 7041 7042 void 7043 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7044 { 7045 assert(aio_result != NULL); 7046 7047 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7048 *aio_result = bdev_io->internal.error.aio_result; 7049 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7050 *aio_result = 0; 7051 } else { 7052 
*aio_result = -EIO; 7053 } 7054 } 7055 7056 void 7057 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7058 { 7059 enum spdk_bdev_io_status status; 7060 7061 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 7062 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7063 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7064 status = SPDK_BDEV_IO_STATUS_ABORTED; 7065 } else { 7066 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7067 } 7068 7069 bdev_io->internal.error.nvme.cdw0 = cdw0; 7070 bdev_io->internal.error.nvme.sct = sct; 7071 bdev_io->internal.error.nvme.sc = sc; 7072 7073 spdk_bdev_io_complete(bdev_io, status); 7074 } 7075 7076 void 7077 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7078 { 7079 assert(sct != NULL); 7080 assert(sc != NULL); 7081 assert(cdw0 != NULL); 7082 7083 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7084 *sct = SPDK_NVME_SCT_GENERIC; 7085 *sc = SPDK_NVME_SC_SUCCESS; 7086 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7087 *cdw0 = 0; 7088 } else { 7089 *cdw0 = 1U; 7090 } 7091 return; 7092 } 7093 7094 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7095 *sct = bdev_io->internal.error.nvme.sct; 7096 *sc = bdev_io->internal.error.nvme.sc; 7097 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7098 *sct = SPDK_NVME_SCT_GENERIC; 7099 *sc = SPDK_NVME_SC_SUCCESS; 7100 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7101 *sct = SPDK_NVME_SCT_GENERIC; 7102 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7103 } else { 7104 *sct = SPDK_NVME_SCT_GENERIC; 7105 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7106 } 7107 7108 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7109 } 7110 7111 void 7112 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7113 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7114 { 7115 assert(first_sct != NULL); 7116 assert(first_sc != NULL); 7117 assert(second_sct != NULL); 7118 assert(second_sc != NULL); 7119 assert(cdw0 != NULL); 7120 7121 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7122 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7123 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7124 *first_sct = bdev_io->internal.error.nvme.sct; 7125 *first_sc = bdev_io->internal.error.nvme.sc; 7126 *second_sct = SPDK_NVME_SCT_GENERIC; 7127 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7128 } else { 7129 *first_sct = SPDK_NVME_SCT_GENERIC; 7130 *first_sc = SPDK_NVME_SC_SUCCESS; 7131 *second_sct = bdev_io->internal.error.nvme.sct; 7132 *second_sc = bdev_io->internal.error.nvme.sc; 7133 } 7134 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7135 *first_sct = SPDK_NVME_SCT_GENERIC; 7136 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7137 *second_sct = SPDK_NVME_SCT_GENERIC; 7138 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7139 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7140 *first_sct = SPDK_NVME_SCT_GENERIC; 7141 *first_sc = SPDK_NVME_SC_SUCCESS; 7142 *second_sct = SPDK_NVME_SCT_GENERIC; 7143 *second_sc = SPDK_NVME_SC_SUCCESS; 7144 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7145 *first_sct = SPDK_NVME_SCT_GENERIC; 7146 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7147 *second_sct = SPDK_NVME_SCT_GENERIC; 7148 *second_sc = 
SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7149 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7150 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7151 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 7152 *second_sct = SPDK_NVME_SCT_GENERIC; 7153 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7154 } else { 7155 *first_sct = SPDK_NVME_SCT_GENERIC; 7156 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7157 *second_sct = SPDK_NVME_SCT_GENERIC; 7158 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7159 } 7160 7161 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7162 } 7163 7164 struct spdk_thread * 7165 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7166 { 7167 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7168 } 7169 7170 struct spdk_io_channel * 7171 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7172 { 7173 return bdev_io->internal.ch->channel; 7174 } 7175 7176 static int 7177 bdev_register(struct spdk_bdev *bdev) 7178 { 7179 char *bdev_name; 7180 char uuid[SPDK_UUID_STRING_LEN]; 7181 int ret, i; 7182 7183 assert(bdev->module != NULL); 7184 7185 if (!bdev->name) { 7186 SPDK_ERRLOG("Bdev name is NULL\n"); 7187 return -EINVAL; 7188 } 7189 7190 if (!strlen(bdev->name)) { 7191 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7192 return -EINVAL; 7193 } 7194 7195 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7196 if (bdev->fn_table->accel_sequence_supported == NULL) { 7197 continue; 7198 } 7199 if (!bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7200 (enum spdk_bdev_io_type)i)) { 7201 continue; 7202 } 7203 7204 if (spdk_bdev_get_memory_domains(bdev, NULL, 0) <= 0) { 7205 SPDK_ERRLOG("bdev supporting accel sequence is required to support " 7206 "memory domains\n"); 7207 return -EINVAL; 7208 } 7209 7210 if (spdk_bdev_is_md_separate(bdev)) { 7211 SPDK_ERRLOG("Separate metadata is currently unsupported for bdevs with " 7212 "accel sequence support\n"); 7213 return -EINVAL; 7214 } 7215 } 7216 7217 /* Users often register their own I/O devices using the bdev name. In 7218 * order to avoid conflicts, prepend bdev_. */ 7219 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7220 if (!bdev_name) { 7221 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7222 return -ENOMEM; 7223 } 7224 7225 bdev->internal.stat = bdev_alloc_io_stat(true); 7226 if (!bdev->internal.stat) { 7227 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7228 free(bdev_name); 7229 return -ENOMEM; 7230 } 7231 7232 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7233 bdev->internal.measured_queue_depth = UINT64_MAX; 7234 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7235 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7236 bdev->internal.qd_poller = NULL; 7237 bdev->internal.qos = NULL; 7238 7239 TAILQ_INIT(&bdev->internal.open_descs); 7240 TAILQ_INIT(&bdev->internal.locked_ranges); 7241 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7242 TAILQ_INIT(&bdev->aliases); 7243 7244 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 7245 if (ret != 0) { 7246 bdev_free_io_stat(bdev->internal.stat); 7247 free(bdev_name); 7248 return ret; 7249 } 7250 7251 /* UUID may be specified by the user or defined by bdev itself. 7252 * Otherwise it will be generated here, so this field will never be empty. 
*/ 7253 if (spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { 7254 spdk_uuid_generate(&bdev->uuid); 7255 } 7256 7257 /* Add the UUID alias only if it's different than the name */ 7258 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7259 if (strcmp(bdev->name, uuid) != 0) { 7260 ret = spdk_bdev_alias_add(bdev, uuid); 7261 if (ret != 0) { 7262 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7263 bdev_name_del(&bdev->internal.bdev_name); 7264 bdev_free_io_stat(bdev->internal.stat); 7265 free(bdev_name); 7266 return ret; 7267 } 7268 } 7269 7270 if (spdk_bdev_get_buf_align(bdev) > 1) { 7271 if (bdev->split_on_optimal_io_boundary) { 7272 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 7273 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 7274 } else { 7275 bdev->split_on_optimal_io_boundary = true; 7276 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 7277 } 7278 } 7279 7280 /* If the user didn't specify a write unit size, set it to one. */ 7281 if (bdev->write_unit_size == 0) { 7282 bdev->write_unit_size = 1; 7283 } 7284 7285 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7286 if (bdev->acwu == 0) { 7287 bdev->acwu = bdev->write_unit_size; 7288 } 7289 7290 if (bdev->phys_blocklen == 0) { 7291 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7292 } 7293 7294 bdev->internal.reset_in_progress = NULL; 7295 bdev->internal.qd_poll_in_progress = false; 7296 bdev->internal.period = 0; 7297 bdev->internal.new_period = 0; 7298 7299 spdk_io_device_register(__bdev_to_io_dev(bdev), 7300 bdev_channel_create, bdev_channel_destroy, 7301 sizeof(struct spdk_bdev_channel), 7302 bdev_name); 7303 7304 free(bdev_name); 7305 7306 spdk_spin_init(&bdev->internal.spinlock); 7307 7308 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 7309 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 7310 7311 return 0; 7312 } 7313 7314 static void 7315 bdev_destroy_cb(void *io_device) 7316 { 7317 int rc; 7318 struct spdk_bdev *bdev; 7319 spdk_bdev_unregister_cb cb_fn; 7320 void *cb_arg; 7321 7322 bdev = __bdev_from_io_dev(io_device); 7323 7324 if (bdev->internal.unregister_td != spdk_get_thread()) { 7325 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 7326 return; 7327 } 7328 7329 cb_fn = bdev->internal.unregister_cb; 7330 cb_arg = bdev->internal.unregister_ctx; 7331 7332 spdk_spin_destroy(&bdev->internal.spinlock); 7333 free(bdev->internal.qos); 7334 bdev_free_io_stat(bdev->internal.stat); 7335 7336 rc = bdev->fn_table->destruct(bdev->ctxt); 7337 if (rc < 0) { 7338 SPDK_ERRLOG("destruct failed\n"); 7339 } 7340 if (rc <= 0 && cb_fn != NULL) { 7341 cb_fn(cb_arg, rc); 7342 } 7343 } 7344 7345 void 7346 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 7347 { 7348 if (bdev->internal.unregister_cb != NULL) { 7349 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 7350 } 7351 } 7352 7353 static void 7354 _remove_notify(void *arg) 7355 { 7356 struct spdk_bdev_desc *desc = arg; 7357 7358 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 7359 } 7360 7361 /* returns: 0 - bdev removed and ready to be destructed. 7362 * -EBUSY - bdev can't be destructed yet. 
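 *           (open descriptors remain or a reset is still in progress).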
*/ 7363 static int 7364 bdev_unregister_unsafe(struct spdk_bdev *bdev) 7365 { 7366 struct spdk_bdev_desc *desc, *tmp; 7367 int rc = 0; 7368 char uuid[SPDK_UUID_STRING_LEN]; 7369 7370 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 7371 assert(spdk_spin_held(&bdev->internal.spinlock)); 7372 7373 /* Notify each descriptor about hotremoval */ 7374 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 7375 rc = -EBUSY; 7376 /* 7377 * Defer invocation of the event_cb to a separate message that will 7378 * run later on its thread. This ensures this context unwinds and 7379 * we don't recursively unregister this bdev again if the event_cb 7380 * immediately closes its descriptor. 7381 */ 7382 event_notify(desc, _remove_notify); 7383 } 7384 7385 /* If there are no descriptors, proceed removing the bdev */ 7386 if (rc == 0) { 7387 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 7388 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 7389 7390 /* Delete the name and the UUID alias */ 7391 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7392 bdev_name_del_unsafe(&bdev->internal.bdev_name); 7393 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 7394 7395 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 7396 7397 if (bdev->internal.reset_in_progress != NULL) { 7398 /* If reset is in progress, let the completion callback for reset 7399 * unregister the bdev. 7400 */ 7401 rc = -EBUSY; 7402 } 7403 } 7404 7405 return rc; 7406 } 7407 7408 static void 7409 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7410 struct spdk_io_channel *io_ch, void *_ctx) 7411 { 7412 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 7413 7414 bdev_channel_abort_queued_ios(bdev_ch); 7415 spdk_bdev_for_each_channel_continue(i, 0); 7416 } 7417 7418 static void 7419 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 7420 { 7421 int rc; 7422 7423 spdk_spin_lock(&g_bdev_mgr.spinlock); 7424 spdk_spin_lock(&bdev->internal.spinlock); 7425 /* 7426 * Set the status to REMOVING after completing to abort channels. Otherwise, 7427 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 7428 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 7429 * may fail. 7430 */ 7431 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 7432 rc = bdev_unregister_unsafe(bdev); 7433 spdk_spin_unlock(&bdev->internal.spinlock); 7434 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7435 7436 if (rc == 0) { 7437 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7438 } 7439 } 7440 7441 void 7442 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7443 { 7444 struct spdk_thread *thread; 7445 7446 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 7447 7448 thread = spdk_get_thread(); 7449 if (!thread) { 7450 /* The user called this from a non-SPDK thread. 
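 * Unregistration has to run on an SPDK thread because it fans work out to the
 * bdev's channels. A caller that only has a non-SPDK (e.g. plain pthread)
 * context can bounce the request to the app thread first; a rough sketch, with
 * the helper and completion callback names being illustrative:
 *
 *	static void
 *	unregister_on_app_thread(void *arg)
 *	{
 *		struct spdk_bdev *bdev = arg;
 *
 *		spdk_bdev_unregister(bdev, unregister_done_cb, NULL);
 *	}
 *
 *	// from the non-SPDK thread:
 *	spdk_thread_send_msg(spdk_thread_get_app_thread(),
 *			     unregister_on_app_thread, bdev);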
*/ 7451 if (cb_fn != NULL) { 7452 cb_fn(cb_arg, -ENOTSUP); 7453 } 7454 return; 7455 } 7456 7457 spdk_spin_lock(&g_bdev_mgr.spinlock); 7458 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7459 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7460 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7461 if (cb_fn) { 7462 cb_fn(cb_arg, -EBUSY); 7463 } 7464 return; 7465 } 7466 7467 spdk_spin_lock(&bdev->internal.spinlock); 7468 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 7469 bdev->internal.unregister_cb = cb_fn; 7470 bdev->internal.unregister_ctx = cb_arg; 7471 bdev->internal.unregister_td = thread; 7472 spdk_spin_unlock(&bdev->internal.spinlock); 7473 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7474 7475 spdk_bdev_set_qd_sampling_period(bdev, 0); 7476 7477 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 7478 bdev_unregister); 7479 } 7480 7481 int 7482 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 7483 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7484 { 7485 struct spdk_bdev_desc *desc; 7486 struct spdk_bdev *bdev; 7487 int rc; 7488 7489 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 7490 if (rc != 0) { 7491 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 7492 return rc; 7493 } 7494 7495 bdev = spdk_bdev_desc_get_bdev(desc); 7496 7497 if (bdev->module != module) { 7498 spdk_bdev_close(desc); 7499 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 7500 bdev_name); 7501 return -ENODEV; 7502 } 7503 7504 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 7505 7506 spdk_bdev_close(desc); 7507 7508 return 0; 7509 } 7510 7511 static int 7512 bdev_start_qos(struct spdk_bdev *bdev) 7513 { 7514 struct set_qos_limit_ctx *ctx; 7515 7516 /* Enable QoS */ 7517 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 7518 ctx = calloc(1, sizeof(*ctx)); 7519 if (ctx == NULL) { 7520 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 7521 return -ENOMEM; 7522 } 7523 ctx->bdev = bdev; 7524 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 7525 } 7526 7527 return 0; 7528 } 7529 7530 static void 7531 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 7532 struct spdk_bdev *bdev) 7533 { 7534 enum spdk_bdev_claim_type type; 7535 const char *typename, *modname; 7536 extern struct spdk_log_flag SPDK_LOG_bdev; 7537 7538 assert(spdk_spin_held(&bdev->internal.spinlock)); 7539 7540 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 7541 return; 7542 } 7543 7544 type = bdev->internal.claim_type; 7545 typename = spdk_bdev_claim_get_name(type); 7546 7547 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 7548 modname = bdev->internal.claim.v1.module->name; 7549 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7550 bdev->name, detail, typename, modname); 7551 return; 7552 } 7553 7554 if (claim_type_is_v2(type)) { 7555 struct spdk_bdev_module_claim *claim; 7556 7557 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 7558 modname = claim->module->name; 7559 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7560 bdev->name, detail, typename, modname); 7561 } 7562 return; 7563 } 7564 7565 assert(false); 7566 } 7567 7568 static int 7569 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 7570 { 7571 struct spdk_thread *thread; 7572 int rc = 0; 7573 7574 thread = spdk_get_thread(); 7575 if (!thread) { 7576 
SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 7577 return -ENOTSUP; 7578 } 7579 7580 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7581 spdk_get_thread()); 7582 7583 desc->bdev = bdev; 7584 desc->thread = thread; 7585 desc->write = write; 7586 7587 spdk_spin_lock(&bdev->internal.spinlock); 7588 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7589 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7590 spdk_spin_unlock(&bdev->internal.spinlock); 7591 return -ENODEV; 7592 } 7593 7594 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7595 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7596 spdk_spin_unlock(&bdev->internal.spinlock); 7597 return -EPERM; 7598 } 7599 7600 rc = bdev_start_qos(bdev); 7601 if (rc != 0) { 7602 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 7603 spdk_spin_unlock(&bdev->internal.spinlock); 7604 return rc; 7605 } 7606 7607 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 7608 7609 spdk_spin_unlock(&bdev->internal.spinlock); 7610 7611 return 0; 7612 } 7613 7614 static int 7615 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 7616 struct spdk_bdev_desc **_desc) 7617 { 7618 struct spdk_bdev_desc *desc; 7619 unsigned int i; 7620 7621 desc = calloc(1, sizeof(*desc)); 7622 if (desc == NULL) { 7623 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 7624 return -ENOMEM; 7625 } 7626 7627 TAILQ_INIT(&desc->pending_media_events); 7628 TAILQ_INIT(&desc->free_media_events); 7629 7630 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 7631 desc->callback.event_fn = event_cb; 7632 desc->callback.ctx = event_ctx; 7633 spdk_spin_init(&desc->spinlock); 7634 7635 if (bdev->media_events) { 7636 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 7637 sizeof(*desc->media_events_buffer)); 7638 if (desc->media_events_buffer == NULL) { 7639 SPDK_ERRLOG("Failed to initialize media event pool\n"); 7640 bdev_desc_free(desc); 7641 return -ENOMEM; 7642 } 7643 7644 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 7645 TAILQ_INSERT_TAIL(&desc->free_media_events, 7646 &desc->media_events_buffer[i], tailq); 7647 } 7648 } 7649 7650 if (bdev->fn_table->accel_sequence_supported != NULL) { 7651 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7652 desc->accel_sequence_supported[i] = 7653 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7654 (enum spdk_bdev_io_type)i); 7655 } 7656 } 7657 7658 *_desc = desc; 7659 7660 return 0; 7661 } 7662 7663 int 7664 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7665 void *event_ctx, struct spdk_bdev_desc **_desc) 7666 { 7667 struct spdk_bdev_desc *desc; 7668 struct spdk_bdev *bdev; 7669 int rc; 7670 7671 if (event_cb == NULL) { 7672 SPDK_ERRLOG("Missing event callback function\n"); 7673 return -EINVAL; 7674 } 7675 7676 spdk_spin_lock(&g_bdev_mgr.spinlock); 7677 7678 bdev = bdev_get_by_name(bdev_name); 7679 7680 if (bdev == NULL) { 7681 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 7682 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7683 return -ENODEV; 7684 } 7685 7686 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 7687 if (rc != 0) { 7688 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7689 return rc; 7690 } 7691 7692 rc = bdev_open(bdev, write, desc); 7693 if (rc != 0) { 7694 bdev_desc_free(desc); 7695 desc = NULL; 7696 } 7697 7698 *_desc = desc; 7699 7700 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7701 
7702 return rc; 7703 } 7704 7705 static void 7706 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 7707 { 7708 int rc; 7709 7710 spdk_spin_lock(&bdev->internal.spinlock); 7711 spdk_spin_lock(&desc->spinlock); 7712 7713 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 7714 7715 desc->closed = true; 7716 7717 if (desc->claim != NULL) { 7718 bdev_desc_release_claims(desc); 7719 } 7720 7721 if (0 == desc->refs) { 7722 spdk_spin_unlock(&desc->spinlock); 7723 bdev_desc_free(desc); 7724 } else { 7725 spdk_spin_unlock(&desc->spinlock); 7726 } 7727 7728 /* If no more descriptors, kill QoS channel */ 7729 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7730 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 7731 bdev->name, spdk_get_thread()); 7732 7733 if (bdev_qos_destroy(bdev)) { 7734 /* There isn't anything we can do to recover here. Just let the 7735 * old QoS poller keep running. The QoS handling won't change 7736 * cores when the user allocates a new channel, but it won't break. */ 7737 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 7738 } 7739 } 7740 7741 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7742 rc = bdev_unregister_unsafe(bdev); 7743 spdk_spin_unlock(&bdev->internal.spinlock); 7744 7745 if (rc == 0) { 7746 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7747 } 7748 } else { 7749 spdk_spin_unlock(&bdev->internal.spinlock); 7750 } 7751 } 7752 7753 void 7754 spdk_bdev_close(struct spdk_bdev_desc *desc) 7755 { 7756 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7757 7758 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7759 spdk_get_thread()); 7760 7761 assert(desc->thread == spdk_get_thread()); 7762 7763 spdk_poller_unregister(&desc->io_timeout_poller); 7764 7765 spdk_spin_lock(&g_bdev_mgr.spinlock); 7766 7767 bdev_close(bdev, desc); 7768 7769 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7770 } 7771 7772 static void 7773 bdev_register_finished(void *arg) 7774 { 7775 struct spdk_bdev_desc *desc = arg; 7776 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7777 7778 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 7779 7780 spdk_spin_lock(&g_bdev_mgr.spinlock); 7781 7782 bdev_close(bdev, desc); 7783 7784 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7785 } 7786 7787 int 7788 spdk_bdev_register(struct spdk_bdev *bdev) 7789 { 7790 struct spdk_bdev_desc *desc; 7791 struct spdk_thread *thread = spdk_get_thread(); 7792 int rc; 7793 7794 if (spdk_unlikely(spdk_thread_get_app_thread() != spdk_get_thread())) { 7795 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", bdev->name, thread, 7796 thread ? 
spdk_thread_get_name(thread) : "null"); 7797 return -EINVAL; 7798 } 7799 7800 rc = bdev_register(bdev); 7801 if (rc != 0) { 7802 return rc; 7803 } 7804 7805 /* A descriptor is opened to prevent bdev deletion during examination */ 7806 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7807 if (rc != 0) { 7808 spdk_bdev_unregister(bdev, NULL, NULL); 7809 return rc; 7810 } 7811 7812 rc = bdev_open(bdev, false, desc); 7813 if (rc != 0) { 7814 bdev_desc_free(desc); 7815 spdk_bdev_unregister(bdev, NULL, NULL); 7816 return rc; 7817 } 7818 7819 /* Examine configuration before initializing I/O */ 7820 bdev_examine(bdev); 7821 7822 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 7823 if (rc != 0) { 7824 bdev_close(bdev, desc); 7825 spdk_bdev_unregister(bdev, NULL, NULL); 7826 } 7827 7828 return rc; 7829 } 7830 7831 int 7832 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 7833 struct spdk_bdev_module *module) 7834 { 7835 spdk_spin_lock(&bdev->internal.spinlock); 7836 7837 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7838 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7839 spdk_spin_unlock(&bdev->internal.spinlock); 7840 return -EPERM; 7841 } 7842 7843 if (desc && !desc->write) { 7844 desc->write = true; 7845 } 7846 7847 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 7848 bdev->internal.claim.v1.module = module; 7849 7850 spdk_spin_unlock(&bdev->internal.spinlock); 7851 return 0; 7852 } 7853 7854 void 7855 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 7856 { 7857 spdk_spin_lock(&bdev->internal.spinlock); 7858 7859 assert(bdev->internal.claim.v1.module != NULL); 7860 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 7861 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7862 bdev->internal.claim.v1.module = NULL; 7863 7864 spdk_spin_unlock(&bdev->internal.spinlock); 7865 } 7866 7867 /* 7868 * Start claims v2 7869 */ 7870 7871 const char * 7872 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 7873 { 7874 switch (type) { 7875 case SPDK_BDEV_CLAIM_NONE: 7876 return "not_claimed"; 7877 case SPDK_BDEV_CLAIM_EXCL_WRITE: 7878 return "exclusive_write"; 7879 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 7880 return "read_many_write_one"; 7881 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 7882 return "read_many_write_none"; 7883 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 7884 return "read_many_write_many"; 7885 default: 7886 break; 7887 } 7888 return "invalid_claim"; 7889 } 7890 7891 static bool 7892 claim_type_is_v2(enum spdk_bdev_claim_type type) 7893 { 7894 switch (type) { 7895 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 7896 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 7897 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 7898 return true; 7899 default: 7900 break; 7901 } 7902 return false; 7903 } 7904 7905 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
*/ 7906 static bool 7907 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 7908 { 7909 switch (type) { 7910 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 7911 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 7912 return true; 7913 default: 7914 break; 7915 } 7916 return false; 7917 } 7918 7919 void 7920 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 7921 { 7922 if (opts == NULL) { 7923 SPDK_ERRLOG("opts should not be NULL\n"); 7924 assert(opts != NULL); 7925 return; 7926 } 7927 if (size == 0) { 7928 SPDK_ERRLOG("size should not be zero\n"); 7929 assert(size != 0); 7930 return; 7931 } 7932 7933 memset(opts, 0, size); 7934 opts->opts_size = size; 7935 7936 #define FIELD_OK(field) \ 7937 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 7938 7939 #define SET_FIELD(field, value) \ 7940 if (FIELD_OK(field)) { \ 7941 opts->field = value; \ 7942 } \ 7943 7944 SET_FIELD(shared_claim_key, 0); 7945 7946 #undef FIELD_OK 7947 #undef SET_FIELD 7948 } 7949 7950 static int 7951 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 7952 { 7953 if (src->opts_size == 0) { 7954 SPDK_ERRLOG("size should not be zero\n"); 7955 return -1; 7956 } 7957 7958 memset(dst, 0, sizeof(*dst)); 7959 dst->opts_size = src->opts_size; 7960 7961 #define FIELD_OK(field) \ 7962 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 7963 7964 #define SET_FIELD(field) \ 7965 if (FIELD_OK(field)) { \ 7966 dst->field = src->field; \ 7967 } \ 7968 7969 if (FIELD_OK(name)) { 7970 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 7971 } 7972 7973 SET_FIELD(shared_claim_key); 7974 7975 /* You should not remove this statement, but need to update the assert statement 7976 * if you add a new field, and also add a corresponding SET_FIELD statement */ 7977 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 7978 7979 #undef FIELD_OK 7980 #undef SET_FIELD 7981 return 0; 7982 } 7983 7984 /* Returns 0 if a read-write-once claim can be taken. */ 7985 static int 7986 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 7987 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 7988 { 7989 struct spdk_bdev *bdev = desc->bdev; 7990 struct spdk_bdev_desc *open_desc; 7991 7992 assert(spdk_spin_held(&bdev->internal.spinlock)); 7993 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 7994 7995 if (opts->shared_claim_key != 0) { 7996 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 7997 bdev->name); 7998 return -EINVAL; 7999 } 8000 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8001 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8002 return -EPERM; 8003 } 8004 if (desc->claim != NULL) { 8005 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 8006 bdev->name, desc->claim->module->name); 8007 return -EPERM; 8008 } 8009 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8010 if (desc != open_desc && open_desc->write) { 8011 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 8012 "another descriptor is open for writing\n", 8013 bdev->name); 8014 return -EPERM; 8015 } 8016 } 8017 8018 return 0; 8019 } 8020 8021 /* Returns 0 if a read-only-many claim can be taken. 
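 * The descriptor must be read-only, opts->shared_claim_key must be zero, and,
 * if the bdev is not claimed yet, no other descriptor may currently be open
 * for writing. An existing READ_MANY_WRITE_NONE claim on the bdev does not
 * conflict with taking another one.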
*/ 8022 static int 8023 claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8024 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8025 { 8026 struct spdk_bdev *bdev = desc->bdev; 8027 struct spdk_bdev_desc *open_desc; 8028 8029 assert(spdk_spin_held(&bdev->internal.spinlock)); 8030 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE); 8031 assert(desc->claim == NULL); 8032 8033 if (desc->write) { 8034 SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n", 8035 bdev->name); 8036 return -EINVAL; 8037 } 8038 if (opts->shared_claim_key != 0) { 8039 SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name); 8040 return -EINVAL; 8041 } 8042 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8043 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8044 if (open_desc->write) { 8045 SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while " 8046 "another descriptor is open for writing\n", 8047 bdev->name); 8048 return -EPERM; 8049 } 8050 } 8051 } 8052 8053 return 0; 8054 } 8055 8056 /* Returns 0 if a read-write-many claim can be taken. */ 8057 static int 8058 claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8059 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8060 { 8061 struct spdk_bdev *bdev = desc->bdev; 8062 struct spdk_bdev_desc *open_desc; 8063 8064 assert(spdk_spin_held(&bdev->internal.spinlock)); 8065 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED); 8066 assert(desc->claim == NULL); 8067 8068 if (opts->shared_claim_key == 0) { 8069 SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n", 8070 bdev->name); 8071 return -EINVAL; 8072 } 8073 switch (bdev->internal.claim_type) { 8074 case SPDK_BDEV_CLAIM_NONE: 8075 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8076 if (open_desc == desc) { 8077 continue; 8078 } 8079 if (open_desc->write) { 8080 SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while " 8081 "another descriptor is open for writing without a " 8082 "claim\n", bdev->name); 8083 return -EPERM; 8084 } 8085 } 8086 break; 8087 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8088 if (opts->shared_claim_key != bdev->internal.claim.v2.key) { 8089 LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev); 8090 return -EPERM; 8091 } 8092 break; 8093 default: 8094 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8095 return -EBUSY; 8096 } 8097 8098 return 0; 8099 } 8100 8101 /* Updates desc and its bdev with a v2 claim.
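 *
 * The public entry point for v2 claims is spdk_bdev_module_claim_bdev_desc().
 * A hedged usage sketch for a shared (read-many-write-shared) claim; the key
 * value and the "my_bdev_module" structure are illustrative only:
 *
 *	struct spdk_bdev_claim_opts opts;
 *
 *	spdk_bdev_claim_opts_init(&opts, sizeof(opts));
 *	opts.shared_claim_key = 0x600dcafe;
 *	rc = spdk_bdev_module_claim_bdev_desc(desc,
 *					      SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED,
 *					      &opts, &my_bdev_module);
 *
 * The claim is released automatically when the descriptor is closed (see
 * bdev_desc_release_claims()).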
*/ 8102 static int 8103 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8104 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8105 { 8106 struct spdk_bdev *bdev = desc->bdev; 8107 struct spdk_bdev_module_claim *claim; 8108 8109 assert(spdk_spin_held(&bdev->internal.spinlock)); 8110 assert(claim_type_is_v2(type)); 8111 assert(desc->claim == NULL); 8112 8113 claim = calloc(1, sizeof(*desc->claim)); 8114 if (claim == NULL) { 8115 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 8116 return -ENOMEM; 8117 } 8118 claim->module = module; 8119 claim->desc = desc; 8120 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 8121 memcpy(claim->name, opts->name, sizeof(claim->name)); 8122 desc->claim = claim; 8123 8124 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8125 bdev->internal.claim_type = type; 8126 TAILQ_INIT(&bdev->internal.claim.v2.claims); 8127 bdev->internal.claim.v2.key = opts->shared_claim_key; 8128 } 8129 assert(type == bdev->internal.claim_type); 8130 8131 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 8132 8133 if (!desc->write && claim_type_promotes_to_write(type)) { 8134 desc->write = true; 8135 } 8136 8137 return 0; 8138 } 8139 8140 int 8141 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8142 struct spdk_bdev_claim_opts *_opts, 8143 struct spdk_bdev_module *module) 8144 { 8145 struct spdk_bdev *bdev; 8146 struct spdk_bdev_claim_opts opts; 8147 int rc = 0; 8148 8149 if (desc == NULL) { 8150 SPDK_ERRLOG("descriptor must not be NULL\n"); 8151 return -EINVAL; 8152 } 8153 8154 bdev = desc->bdev; 8155 8156 if (_opts == NULL) { 8157 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 8158 } else if (claim_opts_copy(_opts, &opts) != 0) { 8159 return -EINVAL; 8160 } 8161 8162 spdk_spin_lock(&bdev->internal.spinlock); 8163 8164 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 8165 bdev->internal.claim_type != type) { 8166 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8167 spdk_spin_unlock(&bdev->internal.spinlock); 8168 return -EPERM; 8169 } 8170 8171 if (claim_type_is_v2(type) && desc->claim != NULL) { 8172 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 8173 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 8174 spdk_spin_unlock(&bdev->internal.spinlock); 8175 return -EPERM; 8176 } 8177 8178 switch (type) { 8179 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8180 spdk_spin_unlock(&bdev->internal.spinlock); 8181 return spdk_bdev_module_claim_bdev(bdev, desc, module); 8182 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8183 rc = claim_verify_rwo(desc, type, &opts, module); 8184 break; 8185 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8186 rc = claim_verify_rom(desc, type, &opts, module); 8187 break; 8188 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8189 rc = claim_verify_rwm(desc, type, &opts, module); 8190 break; 8191 default: 8192 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 8193 rc = -ENOTSUP; 8194 } 8195 8196 if (rc == 0) { 8197 rc = claim_bdev(desc, type, &opts, module); 8198 } 8199 8200 spdk_spin_unlock(&bdev->internal.spinlock); 8201 return rc; 8202 } 8203 8204 static void 8205 claim_reset(struct spdk_bdev *bdev) 8206 { 8207 assert(spdk_spin_held(&bdev->internal.spinlock)); 8208 assert(claim_type_is_v2(bdev->internal.claim_type)); 8209 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 8210 8211 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 8212 
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8213 } 8214 8215 static void 8216 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 8217 { 8218 struct spdk_bdev *bdev = desc->bdev; 8219 8220 assert(spdk_spin_held(&bdev->internal.spinlock)); 8221 assert(claim_type_is_v2(bdev->internal.claim_type)); 8222 8223 if (bdev->internal.examine_in_progress == 0) { 8224 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 8225 free(desc->claim); 8226 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 8227 claim_reset(bdev); 8228 } 8229 } else { 8230 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 8231 desc->claim->module = NULL; 8232 desc->claim->desc = NULL; 8233 } 8234 desc->claim = NULL; 8235 } 8236 8237 /* 8238 * End claims v2 8239 */ 8240 8241 struct spdk_bdev * 8242 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 8243 { 8244 assert(desc != NULL); 8245 return desc->bdev; 8246 } 8247 8248 int 8249 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 8250 { 8251 struct spdk_bdev *bdev, *tmp; 8252 struct spdk_bdev_desc *desc; 8253 int rc = 0; 8254 8255 assert(fn != NULL); 8256 8257 spdk_spin_lock(&g_bdev_mgr.spinlock); 8258 bdev = spdk_bdev_first(); 8259 while (bdev != NULL) { 8260 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8261 if (rc != 0) { 8262 break; 8263 } 8264 rc = bdev_open(bdev, false, desc); 8265 if (rc != 0) { 8266 bdev_desc_free(desc); 8267 if (rc == -ENODEV) { 8268 /* Ignore the error and move to the next bdev. */ 8269 rc = 0; 8270 bdev = spdk_bdev_next(bdev); 8271 continue; 8272 } 8273 break; 8274 } 8275 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8276 8277 rc = fn(ctx, bdev); 8278 8279 spdk_spin_lock(&g_bdev_mgr.spinlock); 8280 tmp = spdk_bdev_next(bdev); 8281 bdev_close(bdev, desc); 8282 if (rc != 0) { 8283 break; 8284 } 8285 bdev = tmp; 8286 } 8287 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8288 8289 return rc; 8290 } 8291 8292 int 8293 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 8294 { 8295 struct spdk_bdev *bdev, *tmp; 8296 struct spdk_bdev_desc *desc; 8297 int rc = 0; 8298 8299 assert(fn != NULL); 8300 8301 spdk_spin_lock(&g_bdev_mgr.spinlock); 8302 bdev = spdk_bdev_first_leaf(); 8303 while (bdev != NULL) { 8304 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8305 if (rc != 0) { 8306 break; 8307 } 8308 rc = bdev_open(bdev, false, desc); 8309 if (rc != 0) { 8310 bdev_desc_free(desc); 8311 if (rc == -ENODEV) { 8312 /* Ignore the error and move to the next bdev. 
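 * (-ENODEV here means the bdev started unregistering between the lookup and
 * the open, so it is skipped instead of failing the whole iteration.)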
*/ 8313 rc = 0; 8314 bdev = spdk_bdev_next_leaf(bdev); 8315 continue; 8316 } 8317 break; 8318 } 8319 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8320 8321 rc = fn(ctx, bdev); 8322 8323 spdk_spin_lock(&g_bdev_mgr.spinlock); 8324 tmp = spdk_bdev_next_leaf(bdev); 8325 bdev_close(bdev, desc); 8326 if (rc != 0) { 8327 break; 8328 } 8329 bdev = tmp; 8330 } 8331 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8332 8333 return rc; 8334 } 8335 8336 void 8337 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 8338 { 8339 struct iovec *iovs; 8340 int iovcnt; 8341 8342 if (bdev_io == NULL) { 8343 return; 8344 } 8345 8346 switch (bdev_io->type) { 8347 case SPDK_BDEV_IO_TYPE_READ: 8348 case SPDK_BDEV_IO_TYPE_WRITE: 8349 case SPDK_BDEV_IO_TYPE_ZCOPY: 8350 iovs = bdev_io->u.bdev.iovs; 8351 iovcnt = bdev_io->u.bdev.iovcnt; 8352 break; 8353 default: 8354 iovs = NULL; 8355 iovcnt = 0; 8356 break; 8357 } 8358 8359 if (iovp) { 8360 *iovp = iovs; 8361 } 8362 if (iovcntp) { 8363 *iovcntp = iovcnt; 8364 } 8365 } 8366 8367 void * 8368 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 8369 { 8370 if (bdev_io == NULL) { 8371 return NULL; 8372 } 8373 8374 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 8375 return NULL; 8376 } 8377 8378 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 8379 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 8380 return bdev_io->u.bdev.md_buf; 8381 } 8382 8383 return NULL; 8384 } 8385 8386 void * 8387 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 8388 { 8389 if (bdev_io == NULL) { 8390 assert(false); 8391 return NULL; 8392 } 8393 8394 return bdev_io->internal.caller_ctx; 8395 } 8396 8397 void 8398 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 8399 { 8400 8401 if (spdk_bdev_module_list_find(bdev_module->name)) { 8402 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 8403 assert(false); 8404 } 8405 8406 spdk_spin_init(&bdev_module->internal.spinlock); 8407 8408 /* 8409 * Modules with examine callbacks must be initialized first, so they are 8410 * ready to handle examine callbacks from later modules that will 8411 * register physical bdevs. 
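 * That ordering is what the INSERT_HEAD/INSERT_TAIL choice below implements.
 *
 * Modules normally do not call this function directly; they declare a
 * struct spdk_bdev_module and register it with SPDK_BDEV_MODULE_REGISTER(),
 * whose constructor ends up here. A minimal sketch with illustrative names:
 *
 *	static struct spdk_bdev_module my_if = {
 *		.name = "my_module",
 *		.module_init = my_module_init,
 *		.module_fini = my_module_fini,
 *		.examine_config = my_examine,	// optional: examine new bdevs
 *	};
 *
 *	SPDK_BDEV_MODULE_REGISTER(my_module, &my_if)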
8412 */ 8413 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 8414 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8415 } else { 8416 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8417 } 8418 } 8419 8420 struct spdk_bdev_module * 8421 spdk_bdev_module_list_find(const char *name) 8422 { 8423 struct spdk_bdev_module *bdev_module; 8424 8425 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 8426 if (strcmp(name, bdev_module->name) == 0) { 8427 break; 8428 } 8429 } 8430 8431 return bdev_module; 8432 } 8433 8434 static void 8435 bdev_write_zero_buffer_next(void *_bdev_io) 8436 { 8437 struct spdk_bdev_io *bdev_io = _bdev_io; 8438 uint64_t num_bytes, num_blocks; 8439 void *md_buf = NULL; 8440 int rc; 8441 8442 num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * 8443 bdev_io->u.bdev.split_remaining_num_blocks, 8444 ZERO_BUFFER_SIZE); 8445 num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); 8446 num_blocks -= num_blocks % bdev_io->bdev->write_unit_size; 8447 8448 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 8449 md_buf = (char *)g_bdev_mgr.zero_buffer + 8450 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 8451 } 8452 8453 rc = bdev_write_blocks_with_md(bdev_io->internal.desc, 8454 spdk_io_channel_from_ctx(bdev_io->internal.ch), 8455 g_bdev_mgr.zero_buffer, md_buf, 8456 bdev_io->u.bdev.split_current_offset_blocks, num_blocks, 8457 bdev_write_zero_buffer_done, bdev_io); 8458 if (rc == 0) { 8459 bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; 8460 bdev_io->u.bdev.split_current_offset_blocks += num_blocks; 8461 } else if (rc == -ENOMEM) { 8462 bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); 8463 } else { 8464 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 8465 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 8466 } 8467 } 8468 8469 static void 8470 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 8471 { 8472 struct spdk_bdev_io *parent_io = cb_arg; 8473 8474 spdk_bdev_free_io(bdev_io); 8475 8476 if (!success) { 8477 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 8478 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 8479 return; 8480 } 8481 8482 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 8483 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 8484 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 8485 return; 8486 } 8487 8488 bdev_write_zero_buffer_next(parent_io); 8489 } 8490 8491 static void 8492 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 8493 { 8494 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8495 ctx->bdev->internal.qos_mod_in_progress = false; 8496 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8497 8498 if (ctx->cb_fn) { 8499 ctx->cb_fn(ctx->cb_arg, status); 8500 } 8501 free(ctx); 8502 } 8503 8504 static void 8505 bdev_disable_qos_done(void *cb_arg) 8506 { 8507 struct set_qos_limit_ctx *ctx = cb_arg; 8508 struct spdk_bdev *bdev = ctx->bdev; 8509 struct spdk_bdev_io *bdev_io; 8510 struct spdk_bdev_qos *qos; 8511 8512 spdk_spin_lock(&bdev->internal.spinlock); 8513 qos = bdev->internal.qos; 8514 bdev->internal.qos = NULL; 8515 spdk_spin_unlock(&bdev->internal.spinlock); 8516 8517 while (!TAILQ_EMPTY(&qos->queued)) { 8518 /* Send queued I/O back to their original thread for resubmission. 
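 * While QoS was enabled these I/O were funneled through the QoS thread's
 * channel; now that QoS is being torn down, each one must be resubmitted on
 * the thread and channel that originally issued it.
 *
 * This teardown path is reached when the caller clears every rate limit via
 * spdk_bdev_set_qos_rate_limits(). Illustrative sketch (a value of 0 disables
 * a limit, SPDK_BDEV_QOS_LIMIT_NOT_DEFINED leaves it unchanged):
 *
 *	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES] = {0, 0, 0, 0};
 *
 *	spdk_bdev_set_qos_rate_limits(bdev, limits, qos_done_cb, cb_arg);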
*/ 8519 bdev_io = TAILQ_FIRST(&qos->queued); 8520 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 8521 8522 if (bdev_io->internal.io_submit_ch) { 8523 /* 8524 * Channel was changed when sending it to the QoS thread - change it back 8525 * before sending it back to the original thread. 8526 */ 8527 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 8528 bdev_io->internal.io_submit_ch = NULL; 8529 } 8530 8531 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 8532 _bdev_io_submit, bdev_io); 8533 } 8534 8535 if (qos->thread != NULL) { 8536 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 8537 spdk_poller_unregister(&qos->poller); 8538 } 8539 8540 free(qos); 8541 8542 bdev_set_qos_limit_done(ctx, 0); 8543 } 8544 8545 static void 8546 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 8547 { 8548 struct set_qos_limit_ctx *ctx = _ctx; 8549 struct spdk_thread *thread; 8550 8551 spdk_spin_lock(&bdev->internal.spinlock); 8552 thread = bdev->internal.qos->thread; 8553 spdk_spin_unlock(&bdev->internal.spinlock); 8554 8555 if (thread != NULL) { 8556 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 8557 } else { 8558 bdev_disable_qos_done(ctx); 8559 } 8560 } 8561 8562 static void 8563 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8564 struct spdk_io_channel *ch, void *_ctx) 8565 { 8566 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8567 8568 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 8569 8570 spdk_bdev_for_each_channel_continue(i, 0); 8571 } 8572 8573 static void 8574 bdev_update_qos_rate_limit_msg(void *cb_arg) 8575 { 8576 struct set_qos_limit_ctx *ctx = cb_arg; 8577 struct spdk_bdev *bdev = ctx->bdev; 8578 8579 spdk_spin_lock(&bdev->internal.spinlock); 8580 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 8581 spdk_spin_unlock(&bdev->internal.spinlock); 8582 8583 bdev_set_qos_limit_done(ctx, 0); 8584 } 8585 8586 static void 8587 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8588 struct spdk_io_channel *ch, void *_ctx) 8589 { 8590 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8591 8592 spdk_spin_lock(&bdev->internal.spinlock); 8593 bdev_enable_qos(bdev, bdev_ch); 8594 spdk_spin_unlock(&bdev->internal.spinlock); 8595 spdk_bdev_for_each_channel_continue(i, 0); 8596 } 8597 8598 static void 8599 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 8600 { 8601 struct set_qos_limit_ctx *ctx = _ctx; 8602 8603 bdev_set_qos_limit_done(ctx, status); 8604 } 8605 8606 static void 8607 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 8608 { 8609 int i; 8610 8611 assert(bdev->internal.qos != NULL); 8612 8613 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8614 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 8615 bdev->internal.qos->rate_limits[i].limit = limits[i]; 8616 8617 if (limits[i] == 0) { 8618 bdev->internal.qos->rate_limits[i].limit = 8619 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 8620 } 8621 } 8622 } 8623 } 8624 8625 void 8626 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 8627 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 8628 { 8629 struct set_qos_limit_ctx *ctx; 8630 uint32_t limit_set_complement; 8631 uint64_t min_limit_per_sec; 8632 int i; 8633 bool disable_rate_limit = true; 8634 8635 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8636 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 8637 continue; 8638 } 8639 8640 if (limits[i] > 0) { 8641 disable_rate_limit = 
false; 8642 } 8643 8644 if (bdev_qos_is_iops_rate_limit(i) == true) { 8645 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 8646 } else { 8647 /* Change from megabyte to byte rate limit */ 8648 limits[i] = limits[i] * 1024 * 1024; 8649 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 8650 } 8651 8652 limit_set_complement = limits[i] % min_limit_per_sec; 8653 if (limit_set_complement) { 8654 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 8655 limits[i], min_limit_per_sec); 8656 limits[i] += min_limit_per_sec - limit_set_complement; 8657 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 8658 } 8659 } 8660 8661 ctx = calloc(1, sizeof(*ctx)); 8662 if (ctx == NULL) { 8663 cb_fn(cb_arg, -ENOMEM); 8664 return; 8665 } 8666 8667 ctx->cb_fn = cb_fn; 8668 ctx->cb_arg = cb_arg; 8669 ctx->bdev = bdev; 8670 8671 spdk_spin_lock(&bdev->internal.spinlock); 8672 if (bdev->internal.qos_mod_in_progress) { 8673 spdk_spin_unlock(&bdev->internal.spinlock); 8674 free(ctx); 8675 cb_fn(cb_arg, -EAGAIN); 8676 return; 8677 } 8678 bdev->internal.qos_mod_in_progress = true; 8679 8680 if (disable_rate_limit == true && bdev->internal.qos) { 8681 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8682 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 8683 (bdev->internal.qos->rate_limits[i].limit > 0 && 8684 bdev->internal.qos->rate_limits[i].limit != 8685 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 8686 disable_rate_limit = false; 8687 break; 8688 } 8689 } 8690 } 8691 8692 if (disable_rate_limit == false) { 8693 if (bdev->internal.qos == NULL) { 8694 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 8695 if (!bdev->internal.qos) { 8696 spdk_spin_unlock(&bdev->internal.spinlock); 8697 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 8698 bdev_set_qos_limit_done(ctx, -ENOMEM); 8699 return; 8700 } 8701 } 8702 8703 if (bdev->internal.qos->thread == NULL) { 8704 /* Enabling */ 8705 bdev_set_qos_rate_limits(bdev, limits); 8706 8707 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 8708 bdev_enable_qos_done); 8709 } else { 8710 /* Updating */ 8711 bdev_set_qos_rate_limits(bdev, limits); 8712 8713 spdk_thread_send_msg(bdev->internal.qos->thread, 8714 bdev_update_qos_rate_limit_msg, ctx); 8715 } 8716 } else { 8717 if (bdev->internal.qos != NULL) { 8718 bdev_set_qos_rate_limits(bdev, limits); 8719 8720 /* Disabling */ 8721 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 8722 bdev_disable_qos_msg_done); 8723 } else { 8724 spdk_spin_unlock(&bdev->internal.spinlock); 8725 bdev_set_qos_limit_done(ctx, 0); 8726 return; 8727 } 8728 } 8729 8730 spdk_spin_unlock(&bdev->internal.spinlock); 8731 } 8732 8733 struct spdk_bdev_histogram_ctx { 8734 spdk_bdev_histogram_status_cb cb_fn; 8735 void *cb_arg; 8736 struct spdk_bdev *bdev; 8737 int status; 8738 }; 8739 8740 static void 8741 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8742 { 8743 struct spdk_bdev_histogram_ctx *ctx = _ctx; 8744 8745 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8746 ctx->bdev->internal.histogram_in_progress = false; 8747 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8748 ctx->cb_fn(ctx->cb_arg, ctx->status); 8749 free(ctx); 8750 } 8751 8752 static void 8753 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8754 struct spdk_io_channel *_ch, void *_ctx) 8755 { 8756 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8757 8758 if (ch->histogram != NULL) { 8759 
spdk_histogram_data_free(ch->histogram); 8760 ch->histogram = NULL; 8761 } 8762 spdk_bdev_for_each_channel_continue(i, 0); 8763 } 8764 8765 static void 8766 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8767 { 8768 struct spdk_bdev_histogram_ctx *ctx = _ctx; 8769 8770 if (status != 0) { 8771 ctx->status = status; 8772 ctx->bdev->internal.histogram_enabled = false; 8773 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 8774 bdev_histogram_disable_channel_cb); 8775 } else { 8776 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8777 ctx->bdev->internal.histogram_in_progress = false; 8778 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8779 ctx->cb_fn(ctx->cb_arg, ctx->status); 8780 free(ctx); 8781 } 8782 } 8783 8784 static void 8785 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8786 struct spdk_io_channel *_ch, void *_ctx) 8787 { 8788 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8789 int status = 0; 8790 8791 if (ch->histogram == NULL) { 8792 ch->histogram = spdk_histogram_data_alloc(); 8793 if (ch->histogram == NULL) { 8794 status = -ENOMEM; 8795 } 8796 } 8797 8798 spdk_bdev_for_each_channel_continue(i, status); 8799 } 8800 8801 void 8802 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 8803 void *cb_arg, bool enable) 8804 { 8805 struct spdk_bdev_histogram_ctx *ctx; 8806 8807 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 8808 if (ctx == NULL) { 8809 cb_fn(cb_arg, -ENOMEM); 8810 return; 8811 } 8812 8813 ctx->bdev = bdev; 8814 ctx->status = 0; 8815 ctx->cb_fn = cb_fn; 8816 ctx->cb_arg = cb_arg; 8817 8818 spdk_spin_lock(&bdev->internal.spinlock); 8819 if (bdev->internal.histogram_in_progress) { 8820 spdk_spin_unlock(&bdev->internal.spinlock); 8821 free(ctx); 8822 cb_fn(cb_arg, -EAGAIN); 8823 return; 8824 } 8825 8826 bdev->internal.histogram_in_progress = true; 8827 spdk_spin_unlock(&bdev->internal.spinlock); 8828 8829 bdev->internal.histogram_enabled = enable; 8830 8831 if (enable) { 8832 /* Allocate histogram for each channel */ 8833 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 8834 bdev_histogram_enable_channel_cb); 8835 } else { 8836 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 8837 bdev_histogram_disable_channel_cb); 8838 } 8839 } 8840 8841 struct spdk_bdev_histogram_data_ctx { 8842 spdk_bdev_histogram_data_cb cb_fn; 8843 void *cb_arg; 8844 struct spdk_bdev *bdev; 8845 /** merged histogram data from all channels */ 8846 struct spdk_histogram_data *histogram; 8847 }; 8848 8849 static void 8850 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8851 { 8852 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 8853 8854 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 8855 free(ctx); 8856 } 8857 8858 static void 8859 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8860 struct spdk_io_channel *_ch, void *_ctx) 8861 { 8862 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8863 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 8864 int status = 0; 8865 8866 if (ch->histogram == NULL) { 8867 status = -EFAULT; 8868 } else { 8869 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 8870 } 8871 8872 spdk_bdev_for_each_channel_continue(i, status); 8873 } 8874 8875 void 8876 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 8877 spdk_bdev_histogram_data_cb cb_fn, 8878 void *cb_arg) 8879 { 
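	/*
	 * Caller-side flow, as a hedged sketch (the callback names are
	 * hypothetical): enable collection, let some I/O run, then fetch the
	 * data merged across all channels.
	 *
	 *	struct spdk_histogram_data *h = spdk_histogram_data_alloc();
	 *
	 *	spdk_bdev_histogram_enable(bdev, enable_done_cb, NULL, true);
	 *	// ... after I/O has been running for a while ...
	 *	spdk_bdev_histogram_get(bdev, h, histogram_done_cb, NULL);
	 *	// histogram_done_cb() may walk the result with
	 *	// spdk_histogram_data_iterate() and should eventually free it
	 *	// with spdk_histogram_data_free(h).
	 */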
8880 struct spdk_bdev_histogram_data_ctx *ctx; 8881 8882 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 8883 if (ctx == NULL) { 8884 cb_fn(cb_arg, -ENOMEM, NULL); 8885 return; 8886 } 8887 8888 ctx->bdev = bdev; 8889 ctx->cb_fn = cb_fn; 8890 ctx->cb_arg = cb_arg; 8891 8892 ctx->histogram = histogram; 8893 8894 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 8895 bdev_histogram_get_channel_cb); 8896 } 8897 8898 void 8899 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 8900 void *cb_arg) 8901 { 8902 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8903 int status = 0; 8904 8905 assert(cb_fn != NULL); 8906 8907 if (bdev_ch->histogram == NULL) { 8908 status = -EFAULT; 8909 } 8910 cb_fn(cb_arg, status, bdev_ch->histogram); 8911 } 8912 8913 size_t 8914 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 8915 size_t max_events) 8916 { 8917 struct media_event_entry *entry; 8918 size_t num_events = 0; 8919 8920 for (; num_events < max_events; ++num_events) { 8921 entry = TAILQ_FIRST(&desc->pending_media_events); 8922 if (entry == NULL) { 8923 break; 8924 } 8925 8926 events[num_events] = entry->event; 8927 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 8928 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 8929 } 8930 8931 return num_events; 8932 } 8933 8934 int 8935 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 8936 size_t num_events) 8937 { 8938 struct spdk_bdev_desc *desc; 8939 struct media_event_entry *entry; 8940 size_t event_id; 8941 int rc = 0; 8942 8943 assert(bdev->media_events); 8944 8945 spdk_spin_lock(&bdev->internal.spinlock); 8946 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 8947 if (desc->write) { 8948 break; 8949 } 8950 } 8951 8952 if (desc == NULL || desc->media_events_buffer == NULL) { 8953 rc = -ENODEV; 8954 goto out; 8955 } 8956 8957 for (event_id = 0; event_id < num_events; ++event_id) { 8958 entry = TAILQ_FIRST(&desc->free_media_events); 8959 if (entry == NULL) { 8960 break; 8961 } 8962 8963 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 8964 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 8965 entry->event = events[event_id]; 8966 } 8967 8968 rc = event_id; 8969 out: 8970 spdk_spin_unlock(&bdev->internal.spinlock); 8971 return rc; 8972 } 8973 8974 static void 8975 _media_management_notify(void *arg) 8976 { 8977 struct spdk_bdev_desc *desc = arg; 8978 8979 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 8980 } 8981 8982 void 8983 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 8984 { 8985 struct spdk_bdev_desc *desc; 8986 8987 spdk_spin_lock(&bdev->internal.spinlock); 8988 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 8989 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 8990 event_notify(desc, _media_management_notify); 8991 } 8992 } 8993 spdk_spin_unlock(&bdev->internal.spinlock); 8994 } 8995 8996 struct locked_lba_range_ctx { 8997 struct lba_range range; 8998 struct spdk_bdev *bdev; 8999 struct lba_range *current_range; 9000 struct lba_range *owner_range; 9001 struct spdk_poller *poller; 9002 lock_range_cb cb_fn; 9003 void *cb_arg; 9004 }; 9005 9006 static void 9007 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9008 { 9009 struct locked_lba_range_ctx *ctx = _ctx; 9010 9011 ctx->cb_fn(ctx->cb_arg, -ENOMEM); 9012 free(ctx); 9013 } 9014 9015 static void bdev_unlock_lba_range_get_channel(struct 
spdk_bdev_channel_iter *i, 9016 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 9017 9018 static void 9019 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9020 { 9021 struct locked_lba_range_ctx *ctx = _ctx; 9022 9023 if (status == -ENOMEM) { 9024 /* One of the channels could not allocate a range object. 9025 * So we have to go back and clean up any ranges that were 9026 * allocated successfully before we return error status to 9027 * the caller. We can reuse the unlock function to do that 9028 * clean up. 9029 */ 9030 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9031 bdev_lock_error_cleanup_cb); 9032 return; 9033 } 9034 9035 /* All channels have locked this range and no I/O overlapping the range 9036 * are outstanding! Set the owner_ch for the range object for the 9037 * locking channel, so that this channel will know that it is allowed 9038 * to write to this range. 9039 */ 9040 ctx->owner_range->owner_ch = ctx->range.owner_ch; 9041 ctx->cb_fn(ctx->cb_arg, status); 9042 9043 /* Don't free the ctx here. Its range is in the bdev's global list of 9044 * locked ranges still, and will be removed and freed when this range 9045 * is later unlocked. 9046 */ 9047 } 9048 9049 static int 9050 bdev_lock_lba_range_check_io(void *_i) 9051 { 9052 struct spdk_bdev_channel_iter *i = _i; 9053 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 9054 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9055 struct locked_lba_range_ctx *ctx = i->ctx; 9056 struct lba_range *range = ctx->current_range; 9057 struct spdk_bdev_io *bdev_io; 9058 9059 spdk_poller_unregister(&ctx->poller); 9060 9061 /* The range is now in the locked_ranges, so no new IO can be submitted to this 9062 * range. But we need to wait until any outstanding IO overlapping with this range 9063 * are completed. 9064 */ 9065 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 9066 if (bdev_io_range_is_locked(bdev_io, range)) { 9067 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 9068 return SPDK_POLLER_BUSY; 9069 } 9070 } 9071 9072 spdk_bdev_for_each_channel_continue(i, 0); 9073 return SPDK_POLLER_BUSY; 9074 } 9075 9076 static void 9077 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9078 struct spdk_io_channel *_ch, void *_ctx) 9079 { 9080 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9081 struct locked_lba_range_ctx *ctx = _ctx; 9082 struct lba_range *range; 9083 9084 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9085 if (range->length == ctx->range.length && 9086 range->offset == ctx->range.offset && 9087 range->locked_ctx == ctx->range.locked_ctx) { 9088 /* This range already exists on this channel, so don't add 9089 * it again. This can happen when a new channel is created 9090 * while the for_each_channel operation is in progress. 9091 * Do not check for outstanding I/O in that case, since the 9092 * range was locked before any I/O could be submitted to the 9093 * new channel. 
9094 */ 9095 spdk_bdev_for_each_channel_continue(i, 0); 9096 return; 9097 } 9098 } 9099 9100 range = calloc(1, sizeof(*range)); 9101 if (range == NULL) { 9102 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 9103 return; 9104 } 9105 9106 range->length = ctx->range.length; 9107 range->offset = ctx->range.offset; 9108 range->locked_ctx = ctx->range.locked_ctx; 9109 ctx->current_range = range; 9110 if (ctx->range.owner_ch == ch) { 9111 /* This is the range object for the channel that will hold 9112 * the lock. Store it in the ctx object so that we can easily 9113 * set its owner_ch after the lock is finally acquired. 9114 */ 9115 ctx->owner_range = range; 9116 } 9117 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 9118 bdev_lock_lba_range_check_io(i); 9119 } 9120 9121 static void 9122 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 9123 { 9124 assert(spdk_get_thread() == spdk_io_channel_get_thread(ctx->range.owner_ch->channel)); 9125 9126 /* We will add a copy of this range to each channel now. */ 9127 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 9128 bdev_lock_lba_range_cb); 9129 } 9130 9131 static bool 9132 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 9133 { 9134 struct lba_range *r; 9135 9136 TAILQ_FOREACH(r, tailq, tailq) { 9137 if (bdev_lba_range_overlapped(range, r)) { 9138 return true; 9139 } 9140 } 9141 return false; 9142 } 9143 9144 static int 9145 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 9146 uint64_t offset, uint64_t length, 9147 lock_range_cb cb_fn, void *cb_arg) 9148 { 9149 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9150 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9151 struct locked_lba_range_ctx *ctx; 9152 9153 if (cb_arg == NULL) { 9154 SPDK_ERRLOG("cb_arg must not be NULL\n"); 9155 return -EINVAL; 9156 } 9157 9158 ctx = calloc(1, sizeof(*ctx)); 9159 if (ctx == NULL) { 9160 return -ENOMEM; 9161 } 9162 9163 ctx->range.offset = offset; 9164 ctx->range.length = length; 9165 ctx->range.owner_ch = ch; 9166 ctx->range.locked_ctx = cb_arg; 9167 ctx->bdev = bdev; 9168 ctx->cb_fn = cb_fn; 9169 ctx->cb_arg = cb_arg; 9170 9171 spdk_spin_lock(&bdev->internal.spinlock); 9172 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 9173 /* There is an active lock overlapping with this range. 9174 * Put it on the pending list until this range no 9175 * longer overlaps with another. 9176 */ 9177 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 9178 } else { 9179 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 9180 bdev_lock_lba_range_ctx(bdev, ctx); 9181 } 9182 spdk_spin_unlock(&bdev->internal.spinlock); 9183 return 0; 9184 } 9185 9186 static void 9187 bdev_lock_lba_range_ctx_msg(void *_ctx) 9188 { 9189 struct locked_lba_range_ctx *ctx = _ctx; 9190 9191 bdev_lock_lba_range_ctx(ctx->bdev, ctx); 9192 } 9193 9194 static void 9195 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9196 { 9197 struct locked_lba_range_ctx *ctx = _ctx; 9198 struct locked_lba_range_ctx *pending_ctx; 9199 struct lba_range *range, *tmp; 9200 9201 spdk_spin_lock(&bdev->internal.spinlock); 9202 /* Check if there are any pending locked ranges that overlap with this range 9203 * that was just unlocked. If there are, check that it doesn't overlap with any 9204 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 9205 * the lock process. 
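 * Each pending range that has become eligible is moved onto locked_ranges and
 * its lock sequence is restarted with spdk_thread_send_msg() so that
 * bdev_lock_lba_range_ctx() runs on the thread owning the range's channel, as
 * the assert in that function requires.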
9206 */ 9207 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 9208 if (bdev_lba_range_overlapped(range, &ctx->range) && 9209 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 9210 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 9211 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 9212 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 9213 spdk_thread_send_msg(spdk_io_channel_get_thread(pending_ctx->range.owner_ch->channel), 9214 bdev_lock_lba_range_ctx_msg, pending_ctx); 9215 } 9216 } 9217 spdk_spin_unlock(&bdev->internal.spinlock); 9218 9219 ctx->cb_fn(ctx->cb_arg, status); 9220 free(ctx); 9221 } 9222 9223 static void 9224 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9225 struct spdk_io_channel *_ch, void *_ctx) 9226 { 9227 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9228 struct locked_lba_range_ctx *ctx = _ctx; 9229 TAILQ_HEAD(, spdk_bdev_io) io_locked; 9230 struct spdk_bdev_io *bdev_io; 9231 struct lba_range *range; 9232 9233 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9234 if (ctx->range.offset == range->offset && 9235 ctx->range.length == range->length && 9236 ctx->range.locked_ctx == range->locked_ctx) { 9237 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 9238 free(range); 9239 break; 9240 } 9241 } 9242 9243 /* Note: we should almost always be able to assert that the range specified 9244 * was found. But there are some very rare corner cases where a new channel 9245 * gets created simultaneously with a range unlock, where this function 9246 * would execute on that new channel and wouldn't have the range. 9247 * We also use this to clean up range allocations when a later allocation 9248 * fails in the locking path. 9249 * So we can't actually assert() here. 9250 */ 9251 9252 /* Swap the locked IO into a temporary list, and then try to submit them again. 9253 * We could hyper-optimize this to only resubmit locked I/O that overlap 9254 * with the range that was just unlocked, but this isn't a performance path so 9255 * we go for simplicity here. 9256 */ 9257 TAILQ_INIT(&io_locked); 9258 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 9259 while (!TAILQ_EMPTY(&io_locked)) { 9260 bdev_io = TAILQ_FIRST(&io_locked); 9261 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 9262 bdev_io_submit(bdev_io); 9263 } 9264 9265 spdk_bdev_for_each_channel_continue(i, 0); 9266 } 9267 9268 static int 9269 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 9270 uint64_t offset, uint64_t length, 9271 lock_range_cb cb_fn, void *cb_arg) 9272 { 9273 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9274 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9275 struct locked_lba_range_ctx *ctx; 9276 struct lba_range *range; 9277 bool range_found = false; 9278 9279 /* Let's make sure the specified channel actually has a lock on 9280 * the specified range. Note that the range must match exactly. 9281 */ 9282 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9283 if (range->offset == offset && range->length == length && 9284 range->owner_ch == ch && range->locked_ctx == cb_arg) { 9285 range_found = true; 9286 break; 9287 } 9288 } 9289 9290 if (!range_found) { 9291 return -EINVAL; 9292 } 9293 9294 spdk_spin_lock(&bdev->internal.spinlock); 9295 /* We confirmed that this channel has locked the specified range. 

int
spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains,
			     int array_size)
{
	if (!bdev) {
		return -EINVAL;
	}

	if (bdev->fn_table->get_memory_domains) {
		return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size);
	}

	return 0;
}
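
/*
 * Example (sketch): a bdev module that exposes a memory domain would implement
 * the optional get_memory_domains callback in its spdk_bdev_fn_table, which
 * spdk_bdev_get_memory_domains() above forwards to.  The module type and field
 * names below are hypothetical; the intended contract is to fill up to
 * array_size entries and return the total number of domains.
 *
 *   static int
 *   example_bdev_get_memory_domains(void *ctx, struct spdk_memory_domain **domains,
 *                                   int array_size)
 *   {
 *           struct example_bdev *ebdev = ctx;
 *
 *           if (domains != NULL && array_size >= 1) {
 *                   domains[0] = ebdev->memory_domain;
 *           }
 *
 *           // Return the total count even if the caller's array was too small.
 *           return 1;
 *   }
 */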

struct spdk_bdev_for_each_io_ctx {
	void *ctx;
	spdk_bdev_io_fn fn;
	spdk_bdev_for_each_io_cb cb;
};

static void
bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
			 struct spdk_io_channel *io_ch, void *_ctx)
{
	struct spdk_bdev_for_each_io_ctx *ctx = _ctx;
	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
	struct spdk_bdev_io *bdev_io;
	int rc = 0;

	TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
		rc = ctx->fn(ctx->ctx, bdev_io);
		if (rc != 0) {
			break;
		}
	}

	spdk_bdev_for_each_channel_continue(i, rc);
}

static void
bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct spdk_bdev_for_each_io_ctx *ctx = _ctx;

	ctx->cb(ctx->ctx, status);

	free(ctx);
}

void
spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn,
			   spdk_bdev_for_each_io_cb cb)
{
	struct spdk_bdev_for_each_io_ctx *ctx;

	assert(fn != NULL && cb != NULL);

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		SPDK_ERRLOG("Failed to allocate context.\n");
		cb(_ctx, -ENOMEM);
		return;
	}

	ctx->ctx = _ctx;
	ctx->fn = fn;
	ctx->cb = cb;

	spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx,
				   bdev_for_each_io_done);
}

void
spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status)
{
	spdk_for_each_channel_continue(iter->i, status);
}

static struct spdk_bdev *
io_channel_iter_get_bdev(struct spdk_io_channel_iter *i)
{
	void *io_device = spdk_io_channel_iter_get_io_device(i);

	return __bdev_from_io_dev(io_device);
}

static void
bdev_each_channel_msg(struct spdk_io_channel_iter *i)
{
	struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);

	iter->i = i;
	iter->fn(iter, bdev, ch, iter->ctx);
}

static void
bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);

	iter->i = i;
	iter->cpl(bdev, iter->ctx, status);

	free(iter);
}

void
spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn,
			   void *ctx, spdk_bdev_for_each_channel_done cpl)
{
	struct spdk_bdev_channel_iter *iter;

	assert(bdev != NULL && fn != NULL && ctx != NULL);

	iter = calloc(1, sizeof(struct spdk_bdev_channel_iter));
	if (iter == NULL) {
		SPDK_ERRLOG("Unable to allocate iterator\n");
		assert(false);
		return;
	}

	iter->fn = fn;
	iter->cpl = cpl;
	iter->ctx = ctx;

	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg,
			      iter, bdev_each_channel_cpl);
}
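
/*
 * Example (sketch): typical use of spdk_bdev_for_each_channel().  The per-channel
 * callback runs on the thread that owns each channel and must call
 * spdk_bdev_for_each_channel_continue() to advance the iteration; the completion
 * callback runs once every channel has been visited or the iteration was aborted
 * with a non-zero status.  Names below are hypothetical.
 *
 *   static void
 *   example_count_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
 *                         struct spdk_io_channel *ch, void *ctx)
 *   {
 *           uint32_t *num_channels = ctx;
 *
 *           (*num_channels)++;
 *           spdk_bdev_for_each_channel_continue(i, 0);
 *   }
 *
 *   static void
 *   example_count_done(struct spdk_bdev *bdev, void *ctx, int status)
 *   {
 *           // Iteration finished; *num_channels now holds the channel count.
 *   }
 *
 *   spdk_bdev_for_each_channel(bdev, example_count_channel, &num_channels,
 *                              example_count_done);
 */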

static void
bdev_copy_do_write_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_bdev_io *parent_io = cb_arg;

	/* Check return status of write */
	parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
	parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx);
	spdk_bdev_free_io(bdev_io);
}

static void
bdev_copy_do_write(void *_bdev_io)
{
	struct spdk_bdev_io *bdev_io = _bdev_io;
	int rc;

	/* Write blocks */
	rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc,
					    spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs[0].iov_base,
					    bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks,
					    bdev_io->u.bdev.num_blocks, bdev_copy_do_write_complete, bdev_io);

	if (rc == -ENOMEM) {
		bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write);
	} else if (rc != 0) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
	}
}

static void
bdev_copy_do_read_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_bdev_io *parent_io = cb_arg;

	/* Check return status of read */
	if (!success) {
		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
		spdk_bdev_free_io(bdev_io);
		return;
	}

	spdk_bdev_free_io(bdev_io);

	/* Do write */
	bdev_copy_do_write(parent_io);
}

static void
bdev_copy_do_read(void *_bdev_io)
{
	struct spdk_bdev_io *bdev_io = _bdev_io;
	int rc;

	/* Read blocks */
	rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc,
					   spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs[0].iov_base,
					   bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks,
					   bdev_io->u.bdev.num_blocks, bdev_copy_do_read_complete, bdev_io);

	if (rc == -ENOMEM) {
		bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read);
	} else if (rc != 0) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
	}
}

static void
bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
{
	if (!success) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
		return;
	}

	bdev_copy_do_read(bdev_io);
}

int
spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		      uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks,
		      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (num_blocks == 0) {
		SPDK_ERRLOG("Can't copy 0 blocks\n");
		return -EINVAL;
	}

	if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) ||
	    !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) {
		SPDK_DEBUGLOG(bdev,
			      "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n",
			      dst_offset_blocks, src_offset_blocks, num_blocks);
		return -EINVAL;
	}

	bdev_io = bdev_channel_get_io(channel);
	if (!bdev_io) {
		return -ENOMEM;
	}

	bdev_io->internal.ch = channel;
	bdev_io->internal.desc = desc;
	bdev_io->type = SPDK_BDEV_IO_TYPE_COPY;

	bdev_io->u.bdev.offset_blocks = dst_offset_blocks;
	bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.memory_domain = NULL;
	bdev_io->u.bdev.memory_domain_ctx = NULL;
	bdev_io->u.bdev.iovs = NULL;
	bdev_io->u.bdev.iovcnt = 0;
	bdev_io->u.bdev.md_buf = NULL;
	bdev_io->u.bdev.accel_sequence = NULL;
	bdev_io_init(bdev_io, bdev, cb_arg, cb);

	if (dst_offset_blocks == src_offset_blocks) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
		bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx);

		return 0;
	}

	/* If the backing bdev supports the copy operation directly, pass the request
	 * to it.  Otherwise, emulate the copy in the bdev layer with a read followed
	 * by a write.
	 */
	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) {
		bdev_io_submit(bdev_io);
		return 0;
	}

	spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev));

	return 0;
}
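
/*
 * Example (sketch): submitting a copy from code that holds an open descriptor
 * and an I/O channel.  Whether the copy is offloaded to the backing bdev or
 * emulated above with a read followed by a write is transparent to the caller.
 * Names below are hypothetical.
 *
 *   static void
 *   example_copy_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *   {
 *           spdk_bdev_free_io(bdev_io);
 *   }
 *
 *   rc = spdk_bdev_copy_blocks(desc, io_ch, dst_offset_blocks, src_offset_blocks,
 *                              num_blocks, example_copy_done, cb_arg);
 *   if (rc == -ENOMEM) {
 *           // No spdk_bdev_io was available; retry later, e.g. via
 *           // spdk_bdev_queue_io_wait().
 *   }
 */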

SPDK_LOG_REGISTER_COMPONENT(bdev)

SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
{
	struct spdk_trace_tpoint_opts opts[] = {
		{
			"BDEV_IO_START", TRACE_BDEV_IO_START,
			OWNER_BDEV, OBJECT_BDEV_IO, 1,
			{
				{ "type", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
				{ "offset", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "len", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40}
			}
		},
		{
			"BDEV_IO_DONE", TRACE_BDEV_IO_DONE,
			OWNER_BDEV, OBJECT_BDEV_IO, 0,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		},
		{
			"BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE,
			OWNER_BDEV, OBJECT_NONE, 1,
			{
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8}
			}
		},
		{
			"BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY,
			OWNER_BDEV, OBJECT_NONE, 0,
			{
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8}
			}
		},
	};

	spdk_trace_register_owner(OWNER_BDEV, 'b');
	spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0);
}
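
/*
 * Example (sketch): another trace provider can relate its own tracepoints to the
 * OBJECT_BDEV_IO object registered above so that trace tools can correlate its
 * records with BDEV_IO_START/BDEV_IO_DONE, in the same way this file relates the
 * TRACE_BDEV_NVME_* tracepoints.  TRACE_EXAMPLE_IO_START below is hypothetical.
 *
 *   spdk_trace_tpoint_register_relation(TRACE_EXAMPLE_IO_START, OBJECT_BDEV_IO, 0);
 */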