/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"
#include "spdk/dma.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"
#include "spdk_internal/assert.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define SPDK_BDEV_AUTO_EXAMINE			true
#define BUF_SMALL_POOL_SIZE			8191
#define BUF_LARGE_POOL_SIZE			1023
#define BUF_SMALL_CACHE_SIZE			128
#define BUF_LARGE_CACHE_SIZE			16
#define NOMEM_THRESHOLD_COUNT			8

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000

/* The maximum number of children requests for a UNMAP or WRITE ZEROES command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000

/* The maximum number of children requests for a COPY command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8)

#define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \
	log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev)
#ifdef DEBUG
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \
	log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev)
#else
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0)
#endif

static void log_already_claimed(enum spdk_log_level level, const int line, const char *func,
				const char *detail, struct spdk_bdev *bdev);

SPDK_LOG_DEPRECATION_REGISTER(vtune_support, "Intel(R) VTune integration", "SPDK 23.05", 0);

static const char *qos_rpc_type[] = {"rw_ios_per_sec",
				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
				    };

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

RB_HEAD(bdev_name_tree, spdk_bdev_name);

static int
bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
{
	return strcmp(name1->name, name2->name);
}

RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	void *zero_buffer;

	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

	struct spdk_bdev_list bdevs;
	struct bdev_name_tree bdev_names;

	bool init_complete;
	bool module_init_complete;

	struct spdk_spinlock spinlock;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain *domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
	.init_complete = false,
	.module_init_complete = false,
};

static void
__attribute__((constructor))
_bdev_init(void)
{
	spdk_spin_init(&g_bdev_mgr.spinlock);
}

typedef void (*lock_range_cb)(void *ctx, int status);

typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);

struct lba_range {
	uint64_t			offset;
	uint64_t			length;
	void				*locked_ctx;
	struct spdk_bdev_channel	*owner_ch;
	TAILQ_ENTRY(lba_range)		tailq;
};

static struct spdk_bdev_opts g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
	.small_buf_pool_size = BUF_SMALL_POOL_SIZE,
	.large_buf_pool_size = BUF_LARGE_POOL_SIZE,
};

static spdk_bdev_init_cb	g_init_cb_fn = NULL;
static void			*g_init_cb_arg = NULL;

static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
static void			*g_fini_cb_arg = NULL;
static struct spdk_thread	*g_fini_thread = NULL;

struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
	 * For remaining bytes, allowed to run negative if an I/O is submitted when
	 * some bytes are remaining, but the I/O is bigger than that amount. The
	 * excess will be deducted from the next timeslice.
	 */
	int64_t remaining_this_timeslice;

	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO.
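	 * Returns true if the IO should be queued until the next timeslice,
	 * e.g. because the quota for the current timeslice is exhausted.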
	 */
	bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to update for the submitted IO. */
	void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

struct spdk_bdev_qos {
	/** Types of structure of rate limits. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Queue of I/O waiting to be issued. */
	bdev_io_tailq_t queued;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache.  Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t	per_thread_cache_count;
	uint32_t	bdev_io_cache_size;

	struct spdk_iobuf_channel iobuf;

	TAILQ_HEAD(, spdk_bdev_shared_resource)	shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry)	io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * will queue here their IO that awaits retry. It makes it possible to retry sending
 * IO to one bdev after IO from other bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t	io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t	nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t	nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel	*shared_ch;

	/* Refcount of bdev channels using this resource */
	uint32_t	ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev	*bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel	*channel;

	/* Accel channel */
	struct spdk_io_channel	*accel_channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat *stat;

	/*
	 * Count of I/O submitted to the underlying dev module through this channel
	 * and waiting for completion.
	 */
	uint64_t	io_outstanding;

	/*
	 * List of all submitted I/Os including I/O that are generated via splitting.
	 */
	bdev_io_tailq_t	io_submitted;

	/*
	 * List of spdk_bdev_io that are currently queued because they write to a locked
	 * LBA range.
	 */
	bdev_io_tailq_t	io_locked;

	/* List of I/Os with accel sequence being currently executed */
	bdev_io_tailq_t	io_accel_exec;

	/* List of I/Os doing memory domain pull/push */
	bdev_io_tailq_t	io_memory_domain;

	uint32_t	flags;

	struct spdk_histogram_data *histogram;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t	start_tsc;
	uint64_t	interval_tsc;
	__itt_string_handle	*handle;
	struct spdk_bdev_io_stat *prev_stat;
#endif

	bdev_io_tailq_t	queued_resets;

	lba_range_tailq_t locked_ranges;
};

struct media_event_entry {
	struct spdk_bdev_media_event	event;
	TAILQ_ENTRY(media_event_entry)	tailq;
};

#define MEDIA_EVENT_POOL_SIZE 64

struct spdk_bdev_desc {
	struct spdk_bdev		*bdev;
	struct spdk_thread		*thread;
	struct {
		spdk_bdev_event_cb_t event_fn;
		void *ctx;
	}				callback;
	bool				closed;
	bool				write;
	bool				memory_domains_supported;
	bool				accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES];
	struct spdk_spinlock		spinlock;
	uint32_t			refs;
	TAILQ_HEAD(, media_event_entry)	pending_media_events;
	TAILQ_HEAD(, media_event_entry)	free_media_events;
	struct media_event_entry	*media_events_buffer;
	TAILQ_ENTRY(spdk_bdev_desc)	link;

	uint64_t			timeout_in_sec;
	spdk_bdev_io_timeout_cb		cb_fn;
	void				*cb_arg;
	struct spdk_poller		*io_timeout_poller;
	struct spdk_bdev_module_claim	*claim;
};

struct spdk_bdev_iostat_ctx {
	struct spdk_bdev_io_stat *stat;
	spdk_bdev_get_device_stat_cb cb;
	void *cb_arg;
};

struct set_qos_limit_ctx {
	void (*cb_fn)(void *cb_arg, int status);
	void *cb_arg;
	struct spdk_bdev *bdev;
};

struct spdk_bdev_channel_iter {
	spdk_bdev_for_each_channel_msg fn;
	spdk_bdev_for_each_channel_done cpl;
	struct spdk_io_channel_iter *i;
	void *ctx;
};

struct spdk_bdev_io_error_stat {
	uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS];
};

#define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))
#define __io_ch_to_bdev_ch(io_ch)	((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch))
#define __io_ch_to_bdev_mgmt_ch(io_ch)	((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch))

static inline void bdev_io_complete(void *ctx);
static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io);

static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
static void bdev_write_zero_buffer_next(void *_bdev_io);

static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				struct spdk_io_channel *ch, void *_ctx);
static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status);

static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				     struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
				     uint64_t num_blocks,
				     struct spdk_memory_domain *domain, void *domain_ctx,
				     struct spdk_accel_sequence *seq,
				     spdk_bdev_io_completion_cb cb, void *cb_arg);
static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				      struct iovec *iov, int iovcnt, void *md_buf,
				      uint64_t offset_blocks, uint64_t num_blocks,
				      struct spdk_memory_domain *domain, void *domain_ctx,
				      struct spdk_accel_sequence *seq,
				      spdk_bdev_io_completion_cb cb, void *cb_arg);

static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
			       uint64_t offset, uint64_t length,
			       lock_range_cb cb_fn, void *cb_arg);

static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
				 uint64_t offset, uint64_t length,
				 lock_range_cb cb_fn, void *cb_arg);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);

static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
static void claim_reset(struct spdk_bdev *bdev);

#define bdev_get_ext_io_opt(opts, field, defval) \
	(((opts) != NULL && offsetof(struct spdk_bdev_ext_io_opts, field) + \
	  sizeof((opts)->field) <= sizeof(*(opts))) ? (opts)->field : (defval))

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(small_buf_pool_size);
	SET_FIELD(large_buf_pool_size);

	/* Do not remove this statement. You should always update this statement when you add a new field,
	 * and do not forget to add the SET_FIELD statement for your added field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}

SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_small_buf_pool_size, "spdk_bdev_opts.small_buf_pool_size",
			      "v23.05", 0);
SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_large_buf_pool_size, "spdk_bdev_opts.large_buf_pool_size",
			      "v23.05", 0);
int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	struct spdk_iobuf_opts iobuf_opts;
	uint32_t min_pool_size;
	int rc;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
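	 * For example, with the default bdev_io_cache_size of 256 and 4 threads,
	 * bdev_io_pool_size must be at least 256 * (4 + 1) = 1280.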
474 */ 475 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 476 if (opts->bdev_io_pool_size < min_pool_size) { 477 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 478 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 479 spdk_thread_get_count()); 480 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 481 return -1; 482 } 483 484 if (opts->small_buf_pool_size != BUF_SMALL_POOL_SIZE) { 485 SPDK_LOG_DEPRECATED(bdev_opts_small_buf_pool_size); 486 } 487 if (opts->large_buf_pool_size != BUF_LARGE_POOL_SIZE) { 488 SPDK_LOG_DEPRECATED(bdev_opts_large_buf_pool_size); 489 } 490 491 #define SET_FIELD(field) \ 492 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 493 g_bdev_opts.field = opts->field; \ 494 } \ 495 496 SET_FIELD(bdev_io_pool_size); 497 SET_FIELD(bdev_io_cache_size); 498 SET_FIELD(bdev_auto_examine); 499 SET_FIELD(small_buf_pool_size); 500 SET_FIELD(large_buf_pool_size); 501 502 spdk_iobuf_get_opts(&iobuf_opts); 503 iobuf_opts.small_pool_count = opts->small_buf_pool_size; 504 iobuf_opts.large_pool_count = opts->large_buf_pool_size; 505 506 rc = spdk_iobuf_set_opts(&iobuf_opts); 507 if (rc != 0) { 508 SPDK_ERRLOG("Failed to set iobuf opts\n"); 509 return -1; 510 } 511 512 g_bdev_opts.opts_size = opts->opts_size; 513 514 #undef SET_FIELD 515 516 return 0; 517 } 518 519 static struct spdk_bdev * 520 bdev_get_by_name(const char *bdev_name) 521 { 522 struct spdk_bdev_name find; 523 struct spdk_bdev_name *res; 524 525 find.name = (char *)bdev_name; 526 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 527 if (res != NULL) { 528 return res->bdev; 529 } 530 531 return NULL; 532 } 533 534 struct spdk_bdev * 535 spdk_bdev_get_by_name(const char *bdev_name) 536 { 537 struct spdk_bdev *bdev; 538 539 spdk_spin_lock(&g_bdev_mgr.spinlock); 540 bdev = bdev_get_by_name(bdev_name); 541 spdk_spin_unlock(&g_bdev_mgr.spinlock); 542 543 return bdev; 544 } 545 546 struct bdev_io_status_string { 547 enum spdk_bdev_io_status status; 548 const char *str; 549 }; 550 551 static const struct bdev_io_status_string bdev_io_status_strings[] = { 552 { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" }, 553 { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" }, 554 { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" }, 555 { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" }, 556 { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" }, 557 { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" }, 558 { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" }, 559 { SPDK_BDEV_IO_STATUS_FAILED, "failed" }, 560 { SPDK_BDEV_IO_STATUS_PENDING, "pending" }, 561 { SPDK_BDEV_IO_STATUS_SUCCESS, "success" }, 562 }; 563 564 static const char * 565 bdev_io_status_get_string(enum spdk_bdev_io_status status) 566 { 567 uint32_t i; 568 569 for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) { 570 if (bdev_io_status_strings[i].status == status) { 571 return bdev_io_status_strings[i].str; 572 } 573 } 574 575 return "reserved"; 576 } 577 578 struct spdk_bdev_wait_for_examine_ctx { 579 struct spdk_poller *poller; 580 spdk_bdev_wait_for_examine_cb cb_fn; 581 void *cb_arg; 582 }; 583 584 static bool bdev_module_all_actions_completed(void); 585 586 static int 587 bdev_wait_for_examine_cb(void *arg) 588 { 589 struct spdk_bdev_wait_for_examine_ctx *ctx = arg; 590 591 if (!bdev_module_all_actions_completed()) { 592 return SPDK_POLLER_IDLE; 593 } 594 595 spdk_poller_unregister(&ctx->poller); 596 
	ctx->cb_fn(ctx->cb_arg);
	free(ctx);

	return SPDK_POLLER_BUSY;
}

int
spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0);

	return 0;
}

struct spdk_bdev_examine_item {
	char *name;
	TAILQ_ENTRY(spdk_bdev_examine_item) link;
};

TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);

struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
			g_bdev_examine_allowlist);

static inline bool
bdev_examine_allowlist_check(const char *name)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		if (strcmp(name, item->name) == 0) {
			return true;
		}
	}
	return false;
}

static inline void
bdev_examine_allowlist_free(void)
{
	struct spdk_bdev_examine_item *item;
	while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
		item = TAILQ_FIRST(&g_bdev_examine_allowlist);
		TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
		free(item->name);
		free(item);
	}
}

static inline bool
bdev_in_examine_allowlist(struct spdk_bdev *bdev)
{
	struct spdk_bdev_alias *tmp;
	if (bdev_examine_allowlist_check(bdev->name)) {
		return true;
	}
	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
		if (bdev_examine_allowlist_check(tmp->alias.name)) {
			return true;
		}
	}
	return false;
}

static inline bool
bdev_ok_to_examine(struct spdk_bdev *bdev)
{
	if (g_bdev_opts.bdev_auto_examine) {
		return true;
	} else {
		return bdev_in_examine_allowlist(bdev);
	}
}

static void
bdev_examine(struct spdk_bdev *bdev)
{
	struct spdk_bdev_module *module;
	struct spdk_bdev_module_claim *claim, *tmpclaim;
	uint32_t action;

	if (!bdev_ok_to_examine(bdev)) {
		return;
	}

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (module->examine_config) {
			spdk_spin_lock(&module->internal.spinlock);
			action = module->internal.action_in_progress;
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			module->examine_config(bdev);
			if (action != module->internal.action_in_progress) {
				SPDK_ERRLOG("examine_config for module %s did not call "
					    "spdk_bdev_module_examine_done()\n", module->name);
			}
		}
	}

	spdk_spin_lock(&bdev->internal.spinlock);

	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		/* Examine by all bdev modules */
		TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (module->examine_disk) {
				spdk_spin_lock(&module->internal.spinlock);
				module->internal.action_in_progress++;
				spdk_spin_unlock(&module->internal.spinlock);
				spdk_spin_unlock(&bdev->internal.spinlock);
				module->examine_disk(bdev);
				spdk_spin_lock(&bdev->internal.spinlock);
			}
		}
		break;
	case SPDK_BDEV_CLAIM_EXCL_WRITE:
		/* Examine by the one bdev module with a v1 claim */
		module = bdev->internal.claim.v1.module;
		if (module->examine_disk) {
			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			return;
		}
		break;
	default:
		/* Examine by all bdev modules with a v2 claim */
		assert(claim_type_is_v2(bdev->internal.claim_type));
		/*
		 * Removal of tailq nodes while iterating can cause the iteration to jump out of the
		 * list, perhaps accessing freed memory. Without protection, this could happen
		 * while the lock is dropped during the examine callback.
		 */
		bdev->internal.examine_in_progress++;

		TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
			module = claim->module;

			if (module == NULL) {
				/* This is a vestigial claim, held by examine_count */
				continue;
			}

			if (module->examine_disk == NULL) {
				continue;
			}

			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);

			/* Call examine_disk without holding internal.spinlock. */
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			spdk_spin_lock(&bdev->internal.spinlock);
		}

		assert(bdev->internal.examine_in_progress > 0);
		bdev->internal.examine_in_progress--;
		if (bdev->internal.examine_in_progress == 0) {
			/* Remove any claims that were released during examine_disk */
			TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) {
				if (claim->desc != NULL) {
					continue;
				}

				TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link);
				free(claim);
			}
			if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
				claim_reset(bdev);
			}
		}
	}

	spdk_spin_unlock(&bdev->internal.spinlock);
}

int
spdk_bdev_examine(const char *name)
{
	struct spdk_bdev *bdev;
	struct spdk_bdev_examine_item *item;
	struct spdk_thread *thread = spdk_get_thread();

	if (spdk_unlikely(spdk_thread_get_app_thread() != thread)) {
		SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread,
			    thread ? spdk_thread_get_name(thread) : "null");
		return -EINVAL;
	}

	if (g_bdev_opts.bdev_auto_examine) {
		SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n");
		return -EINVAL;
	}

	if (bdev_examine_allowlist_check(name)) {
		SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
		return -EEXIST;
	}

	item = calloc(1, sizeof(*item));
	if (!item) {
		return -ENOMEM;
	}
	item->name = strdup(name);
	if (!item->name) {
		free(item);
		return -ENOMEM;
	}
	TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);

	bdev = spdk_bdev_get_by_name(name);
	if (bdev) {
		bdev_examine(bdev);
	}
	return 0;
}

static inline void
bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "method", "bdev_examine");
		spdk_json_write_named_object_begin(w, "params");
		spdk_json_write_named_string(w, "name", item->name);
		spdk_json_write_object_end(w);
		spdk_json_write_object_end(w);
	}
}

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, internal.link);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, internal.link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static inline bool
bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.memory_domain;
}

static inline bool
bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.accel_sequence;
}

void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	struct iovec *iovs;

	if (bdev_io->u.bdev.iovs == NULL) {
		bdev_io->u.bdev.iovs = &bdev_io->iov;
		bdev_io->u.bdev.iovcnt = 1;
	}

	iovs = bdev_io->u.bdev.iovs;

	assert(iovs != NULL);
	assert(bdev_io->u.bdev.iovcnt >= 1);

	iovs[0].iov_base = buf;
	iovs[0].iov_len = len;
}

void
spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
	bdev_io->u.bdev.md_buf = md_buf;
}
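
/*
 * Typical usage of the buffer helpers above, as seen from a bdev module
 * (illustrative sketch only; "example_read_get_buf_cb" and the surrounding
 * module code are hypothetical and not part of this file):
 *
 *	static void
 *	example_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
 *				bool success)
 *	{
 *		if (!success) {
 *			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
 *			return;
 *		}
 *		// bdev_io->u.bdev.iovs now points at an aligned buffer that was
 *		// assigned via spdk_bdev_io_set_buf()/spdk_bdev_io_set_md_buf().
 *	}
 *
 *	// In the module's submit_request() path, for a READ without a caller buffer:
 *	spdk_bdev_io_get_buf(bdev_io, example_read_get_buf_cb,
 *			     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
 */
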
static bool
_is_buf_allocated(const struct iovec *iovs)
{
	if (iovs == NULL) {
		return false;
	}

	return iovs[0].iov_base != NULL;
}

static bool
_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
{
	int i;
	uintptr_t iov_base;

	if (spdk_likely(alignment == 1)) {
		return true;
	}

	for (i = 0; i < iovcnt; i++) {
		iov_base = (uintptr_t)iovs[i].iov_base;
		if ((iov_base & (alignment - 1)) != 0) {
			return false;
		}
	}

	return true;
}

static inline bool
bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
{
	if (!bdev_io_use_accel_sequence(bdev_io)) {
		return false;
	}

	/* For now, we don't allow splitting IOs with an accel sequence and will treat them as if
	 * bdev module didn't support accel sequences */
	return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.split;
}

static void
bdev_io_submit_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;

	bdev_io->u.bdev.accel_sequence = NULL;
	bdev_io->internal.accel_sequence = NULL;
	TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);

	if (spdk_unlikely(status != 0)) {
		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io_complete_unsubmitted(bdev_io);
		return;
	}

	bdev_io_submit(bdev_io);
}

static void
bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, spdk_accel_completion_cb cb_fn)
{
	assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
	assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ);

	/* Since the operations are appended during submission, they're in the opposite order than
	 * how we want to execute them for reads (i.e. we need to execute the most recently added
	 * operation first), so reverse the sequence before executing it.
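	 * (Write sequences already execute in the order in which the operations were
	 * appended, so no reversal is needed for writes.)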
	 */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence);
	}

	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	spdk_accel_sequence_finish(bdev_io->internal.accel_sequence, cb_fn, bdev_io);
}

static void
bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status)
{
	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
	void *buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		buf = bdev_io->internal.buf;
		bdev_io->internal.buf = NULL;
		bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
		bdev_io->internal.get_aux_buf_cb = NULL;
	} else {
		assert(bdev_io->internal.get_buf_cb != NULL);
		bdev_io->internal.get_buf_cb(ch, bdev_io, status);
		bdev_io->internal.get_buf_cb = NULL;
	}
}

static void
_bdev_io_pull_buffer_cpl(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;

	if (rc) {
		SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	bdev_io_get_buf_complete(bdev_io, !rc);
}

static void
_bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	int rc = 0;

	/* save original md_buf */
	bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf;
	bdev_io->internal.orig_md_iov.iov_len = len;
	bdev_io->internal.bounce_md_iov.iov_base = md_buf;
	bdev_io->internal.bounce_md_iov.iov_len = len;
	/* set bounce md_buf */
	bdev_io->u.bdev.md_buf = md_buf;

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		if (bdev_io_use_memory_domain(bdev_io)) {
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  &bdev_io->internal.orig_md_iov, 1,
							  &bdev_io->internal.bounce_md_iov, 1,
							  bdev_io->internal.data_transfer_cpl,
							  bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n",
				    spdk_memory_domain_get_dma_device_id(bdev_io->internal.memory_domain), rc);
		} else {
			memcpy(md_buf, bdev_io->internal.orig_md_iov.iov_base, bdev_io->internal.orig_md_iov.iov_len);
		}
	}

	assert(bdev_io->internal.data_transfer_cpl);
	bdev_io->internal.data_transfer_cpl(bdev_io, rc);
}

static void
_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len;
	void *buf;

	if (spdk_bdev_is_md_separate(bdev)) {
		assert(!bdev_io_use_accel_sequence(bdev_io));

		buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len;
		md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;

		assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0);

		if (bdev_io->u.bdev.md_buf != NULL) {
			_bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len);
			return;
		} else {
			spdk_bdev_io_set_md_buf(bdev_io, buf, md_len);
		}
	}

	bdev_io_get_buf_complete(bdev_io, true);
}

static void
_bdev_io_pull_bounce_data_buf_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;

	if (rc) {
		SPDK_ERRLOG("Failed to get data buffer\n");
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
		return;
	}

	_bdev_io_set_md_buf(bdev_io);
}

static void
_bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len,
			      bdev_copy_bounce_buffer_cpl cpl_cb)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	bdev_io->internal.data_transfer_cpl = cpl_cb;
	/* save original iovec */
	bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs;
	bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt;
	/* set bounce iov */
	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov;
	bdev_io->u.bdev.iovcnt = 1;
	/* set bounce buffer for this operation */
	bdev_io->u.bdev.iovs[0].iov_base = buf;
	bdev_io->u.bdev.iovs[0].iov_len = len;

	/* If we need to exec an accel sequence, append a copy operation making accel change the
	 * src/dst buffers of the previous operation */
	if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) {
		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL,
						    bdev_io->internal.orig_iovs,
						    bdev_io->internal.orig_iovcnt,
						    bdev_io->internal.memory_domain,
						    bdev_io->internal.memory_domain_ctx,
						    0, NULL, NULL);
		} else {
			/* We need to reverse the src/dst for reads */
			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->internal.orig_iovs,
						    bdev_io->internal.orig_iovcnt,
						    bdev_io->internal.memory_domain,
						    bdev_io->internal.memory_domain_ctx,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL, 0, NULL, NULL);
		}

		if (spdk_unlikely(rc != 0)) {
			SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n",
				    bdev_io->internal.accel_sequence);
		}
	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		/* if this is write path, copy data from original buffer to bounce buffer */
		if (bdev_io_use_memory_domain(bdev_io)) {
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  bdev_io->internal.orig_iovs,
							  (uint32_t) bdev_io->internal.orig_iovcnt,
							  bdev_io->u.bdev.iovs, 1,
							  _bdev_io_pull_bounce_data_buf_done,
							  bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			SPDK_ERRLOG("Failed to pull data from memory domain %s\n",
				    spdk_memory_domain_get_dma_device_id(bdev_io->internal.memory_domain));
		} else {
			spdk_copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt);
		}
	}

	_bdev_io_pull_bounce_data_buf_done(bdev_io, rc);
}

static void
_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	bool buf_allocated;
	uint64_t alignment;
	void *aligned_buf;

	bdev_io->internal.buf = buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		bdev_io_get_buf_complete(bdev_io, true);
		return;
	}

	alignment = spdk_bdev_get_buf_align(bdev);
	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
	aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));

	if (buf_allocated) {
		_bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl);
		/* Continue in completion callback */
		return;
	} else {
		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
	}

	_bdev_io_set_md_buf(bdev_io);
}

static inline uint64_t
bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len, alignment;

	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;

	/* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */
	alignment = spdk_bdev_get_buf_align(bdev) - 1;

	return len + alignment + md_len;
}

static void
_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
{
	struct spdk_bdev_mgmt_channel *ch;

	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
	spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len));
}

static void
bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	assert(bdev_io->internal.buf != NULL);
	_bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len);
	bdev_io->internal.buf = NULL;
}

void
spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(buf != NULL);
	_bdev_io_put_buf(bdev_io, buf, len);
}

static inline void
bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch,
		    struct spdk_bdev_io *bdev_io)
{
	/* After a request is submitted to a bdev module, the ownership of an accel sequence
	 * associated with that bdev_io is transferred to the bdev module. So, clear the internal
	 * sequence pointer to make sure we won't touch it anymore. */
	if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE ||
	     bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) {
		assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
		bdev_io->internal.accel_sequence = NULL;
	}

	bdev->fn_table->submit_request(ioch, bdev_io);
}

static void
bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	struct spdk_bdev *bdev = bdev_ch->bdev;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
	struct spdk_bdev_io *bdev_io;

	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 * Some drivers (such as nvme) cannot immediately take a new I/O in
		 * the context of a completion, because the resources for the I/O are
		 * not released until control returns to the bdev poller.  Also, we
		 * may require several small I/O to complete before a larger I/O
		 * (that requires splitting) can be submitted.
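		 * The threshold itself is recalculated in _bdev_io_handle_no_mem() each
		 * time an I/O completes with SPDK_BDEV_IO_STATUS_NOMEM.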
		 */
		return;
	}

	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);
		bdev_io->internal.ch->io_outstanding++;
		shared_resource->io_outstanding++;
		bdev_io->internal.error.nvme.cdw0 = 0;
		bdev_io->num_retries++;
		bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
		if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) {
			/* This IO completed again with NOMEM status, so break the loop and
			 * don't try anymore.  Note that a bdev_io that fails with NOMEM
			 * always gets requeued at the front of the list, to maintain
			 * ordering.
			 */
			break;
		}
	}
}

static inline void
_bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
			       struct spdk_bdev_shared_resource *shared_resource)
{
	assert(bdev_ch->io_outstanding > 0);
	assert(shared_resource->io_outstanding > 0);
	bdev_ch->io_outstanding--;
	shared_resource->io_outstanding--;
}

static inline bool
_bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
		TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
		/*
		 * Wait for some of the outstanding I/O to complete before we
		 * retry any of the nomem_io.  Normally we will wait for
		 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue
		 * depth channels we will instead wait for half to complete.
		 */
		shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
						   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);
		/* If bdev module completed an I/O that has an accel sequence with NOMEM status, the
		 * ownership of that sequence is transferred back to the bdev layer, so we need to
		 * restore internal.accel_sequence to make sure that the sequence is handled
		 * correctly in case the I/O is later aborted. */
		if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
		     bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) {
			assert(bdev_io->internal.accel_sequence == NULL);
			bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence;
		}

		return true;
	}

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_ch_retry_io(bdev_ch);
	}

	return false;
}

static void
_bdev_io_complete_push_bounce_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	if (rc) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	/* We want to free the bounce buffer here since we know we're done with it (as opposed
	 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()).
	 */
	bdev_io_put_buf(bdev_io);

	/* Continue with IO completion flow */
	_bdev_io_decrement_outstanding(bdev_ch, shared_resource);
	if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) {
		return;
	}

	bdev_io_complete(bdev_io);
}

static void
_bdev_io_push_bounce_md_buffer_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io->internal.data_transfer_cpl(bdev_io, rc);
}

static inline void
_bdev_io_push_bounce_md_buffer(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	/* do the same for metadata buffer */
	if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) {
		assert(spdk_bdev_is_md_separate(bdev_io->bdev));

		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
		    bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
			if (bdev_io_use_memory_domain(bdev_io)) {
				TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
				/* If memory domain is used then we need to call async push function */
				rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
								  bdev_io->internal.memory_domain_ctx,
								  &bdev_io->internal.orig_md_iov,
								  (uint32_t)bdev_io->internal.orig_iovcnt,
								  &bdev_io->internal.bounce_md_iov, 1,
								  _bdev_io_push_bounce_md_buffer_done,
								  bdev_io);
				if (rc == 0) {
					/* Continue IO completion in async callback */
					return;
				}
				TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
				SPDK_ERRLOG("Failed to push md to memory domain %s\n",
					    spdk_memory_domain_get_dma_device_id(bdev_io->internal.memory_domain));
			} else {
				memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf,
				       bdev_io->internal.orig_md_iov.iov_len);
			}
		}
	}

	assert(bdev_io->internal.data_transfer_cpl);
	bdev_io->internal.data_transfer_cpl(bdev_io, rc);
}

static void
_bdev_io_push_bounce_data_buffer_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	assert(bdev_io->internal.data_transfer_cpl);
	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);

	if (rc) {
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
		return;
	}

	/* set original buffer for this io */
	bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt;
	bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs;
	/* disable bouncing buffer for this io */
	bdev_io->internal.orig_iovcnt = 0;
	bdev_io->internal.orig_iovs = NULL;

	_bdev_io_push_bounce_md_buffer(bdev_io);
}

static inline void
_bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io->internal.data_transfer_cpl = cpl_cb;

	/* if this is read path, copy data from bounce buffer to original buffer */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ &&
	    bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) {
		if (bdev_io_use_memory_domain(bdev_io)) {
			/* If memory domain is used then we need to call async push function */
			rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  bdev_io->internal.orig_iovs,
							  (uint32_t)bdev_io->internal.orig_iovcnt,
							  &bdev_io->internal.bounce_iov, 1,
							  _bdev_io_push_bounce_data_buffer_done,
							  bdev_io);
			if (rc == 0) {
				/* Continue IO completion in async callback */
				return;
			}
			SPDK_ERRLOG("Failed to push data to memory domain %s\n",
				    spdk_memory_domain_get_dma_device_id(bdev_io->internal.memory_domain));
		} else {
			spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs,
					      bdev_io->internal.orig_iovcnt,
					      bdev_io->internal.bounce_iov.iov_base,
					      bdev_io->internal.bounce_iov.iov_len);
		}
	}

	_bdev_io_push_bounce_data_buffer_done(bdev_io, rc);
}

static void
bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf)
{
	struct spdk_bdev_io *bdev_io;

	bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf);
	_bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len);
}

static void
bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev_mgmt_channel *mgmt_ch;
	uint64_t max_len;
	void *buf;

	assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread());
	mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
	max_len = bdev_io_get_max_buf_len(bdev_io, len);

	if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) {
		SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len);
		bdev_io_get_buf_complete(bdev_io, false);
		return;
	}

	bdev_io->internal.buf_len = len;
	buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf,
			     bdev_io_get_iobuf_cb);
	if (buf != NULL) {
		_bdev_io_set_buf(bdev_io, buf, len);
	}
}

void
spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t alignment;

	assert(cb != NULL);
	bdev_io->internal.get_buf_cb = cb;

	alignment = spdk_bdev_get_buf_align(bdev);

	if (_is_buf_allocated(bdev_io->u.bdev.iovs) &&
	    _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) {
		/* Buffer already present and aligned */
		cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true);
		return;
	}

	bdev_io_get_buf(bdev_io, len);
}

static void
_bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
			      bool success)
{
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&bdev_ch->io_memory_domain, bdev_io, internal.link);

	if (!success) {
		SPDK_ERRLOG("Failed to get data buffer, completing IO\n");
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io_complete_unsubmitted(bdev_io);
		return;
	}

	if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) {
		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
			bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb);
			return;
		}
		/* For reads we'll execute the sequence after the data is read, so, for now, only
		 * clear out accel_sequence pointer and submit the IO */
		assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
		bdev_io->u.bdev.accel_sequence = NULL;
	}

	bdev_io_submit(bdev_io);
}

static void
_bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb,
			       uint64_t len)
{
	assert(cb != NULL);
	bdev_io->internal.get_buf_cb = cb;

	bdev_io_get_buf(bdev_io, len);
}

void
spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(cb != NULL);
	assert(bdev_io->internal.get_aux_buf_cb == NULL);
	bdev_io->internal.get_aux_buf_cb = cb;
	bdev_io_get_buf(bdev_io, len);
}

static int
bdev_module_get_max_ctx_size(void)
{
	struct spdk_bdev_module *bdev_module;
	int max_bdev_module_size = 0;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
			max_bdev_module_size = bdev_module->get_ctx_size();
		}
	}

	return max_bdev_module_size;
}

static void
bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	int i;
	struct spdk_bdev_qos *qos = bdev->internal.qos;
	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	if (!qos) {
		return;
	}

	spdk_bdev_get_qos_rate_limits(bdev, limits);

	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_set_qos_limit");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", bdev->name);
	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		if (limits[i] > 0) {
			spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]);
		}
	}
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}

void
spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_module *bdev_module;
	struct spdk_bdev *bdev;

	assert(w != NULL);

	spdk_json_write_array_begin(w);

	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_set_options");
	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size);
	spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size);
	spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine);
	spdk_json_write_object_end(w);
	spdk_json_write_object_end(w);

	bdev_examine_allowlist_config_json(w);

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (bdev_module->config_json) {
			bdev_module->config_json(w);
		}
	}

	spdk_spin_lock(&g_bdev_mgr.spinlock);

	TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) {
		if (bdev->fn_table->write_config_json) {
			bdev->fn_table->write_config_json(bdev, w);
		}

		bdev_qos_config_json(bdev, w);
	}

	spdk_spin_unlock(&g_bdev_mgr.spinlock);

	/* This has to be last RPC in array to make sure all bdevs finished examine */
	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_wait_for_examine");
	spdk_json_write_object_end(w);

	spdk_json_write_array_end(w);
}

static void
bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;

	spdk_iobuf_channel_fini(&ch->iobuf);

	while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
		ch->per_thread_cache_count--;
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}

	assert(ch->per_thread_cache_count == 0);
}

static int
bdev_mgmt_channel_create(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;
	uint32_t i;
	int rc;

	rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", BUF_SMALL_CACHE_SIZE, BUF_LARGE_CACHE_SIZE);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc));
		return -1;
	}

	STAILQ_INIT(&ch->per_thread_cache);
	ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;

	/* Pre-populate bdev_io cache to ensure this thread cannot be starved. */
	ch->per_thread_cache_count = 0;
	for (i = 0; i < ch->bdev_io_cache_size; i++) {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
		if (bdev_io == NULL) {
			SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n");
			assert(false);
			bdev_mgmt_channel_destroy(io_device, ctx_buf);
			return -1;
		}
		ch->per_thread_cache_count++;
		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
	}

	TAILQ_INIT(&ch->shared_resources);
	TAILQ_INIT(&ch->io_wait_queue);

	return 0;
}

static void
bdev_init_complete(int rc)
{
	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
	void *cb_arg = g_init_cb_arg;
	struct spdk_bdev_module *m;

	g_bdev_mgr.init_complete = true;
	g_init_cb_fn = NULL;
	g_init_cb_arg = NULL;

	/*
	 * For modules that need to know when subsystem init is complete,
	 * inform them now.
	 */
	if (rc == 0) {
		TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (m->init_complete) {
				m->init_complete();
			}
		}
	}

	cb_fn(cb_arg, rc);
}

static bool
bdev_module_all_actions_completed(void)
{
	struct spdk_bdev_module *m;

	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (m->internal.action_in_progress > 0) {
			return false;
		}
	}
	return true;
}

static void
bdev_module_action_complete(void)
{
	/*
	 * Don't finish bdev subsystem initialization if
	 * module pre-initialization is still in progress, or
	 * the subsystem has already been initialized.
	 */
	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
		return;
	}

	/*
	 * Check all bdev modules for inits/examinations in progress. If any
	 * exist, return immediately since we cannot finish bdev subsystem
	 * initialization until all are completed.
	 */
	if (!bdev_module_all_actions_completed()) {
		return;
	}

	/*
	 * Modules already finished initialization - now that all
	 * the bdev modules have finished their asynchronous I/O
	 * processing, the entire bdev layer can be marked as complete.
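	 * bdev_init_complete() will then invoke each module's init_complete()
	 * callback before calling the application's init callback.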
1821 */ 1822 bdev_init_complete(0); 1823 } 1824 1825 static void 1826 bdev_module_action_done(struct spdk_bdev_module *module) 1827 { 1828 spdk_spin_lock(&module->internal.spinlock); 1829 assert(module->internal.action_in_progress > 0); 1830 module->internal.action_in_progress--; 1831 spdk_spin_unlock(&module->internal.spinlock); 1832 bdev_module_action_complete(); 1833 } 1834 1835 void 1836 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 1837 { 1838 assert(module->async_init); 1839 bdev_module_action_done(module); 1840 } 1841 1842 void 1843 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 1844 { 1845 bdev_module_action_done(module); 1846 } 1847 1848 /** The last initialized bdev module */ 1849 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 1850 1851 static void 1852 bdev_init_failed(void *cb_arg) 1853 { 1854 struct spdk_bdev_module *module = cb_arg; 1855 1856 spdk_spin_lock(&module->internal.spinlock); 1857 assert(module->internal.action_in_progress > 0); 1858 module->internal.action_in_progress--; 1859 spdk_spin_unlock(&module->internal.spinlock); 1860 bdev_init_complete(-1); 1861 } 1862 1863 static int 1864 bdev_modules_init(void) 1865 { 1866 struct spdk_bdev_module *module; 1867 int rc = 0; 1868 1869 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1870 g_resume_bdev_module = module; 1871 if (module->async_init) { 1872 spdk_spin_lock(&module->internal.spinlock); 1873 module->internal.action_in_progress = 1; 1874 spdk_spin_unlock(&module->internal.spinlock); 1875 } 1876 rc = module->module_init(); 1877 if (rc != 0) { 1878 /* Bump action_in_progress to prevent other modules from completion of modules_init 1879 * Send message to defer application shutdown until resources are cleaned up */ 1880 spdk_spin_lock(&module->internal.spinlock); 1881 module->internal.action_in_progress = 1; 1882 spdk_spin_unlock(&module->internal.spinlock); 1883 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 1884 return rc; 1885 } 1886 } 1887 1888 g_resume_bdev_module = NULL; 1889 return 0; 1890 } 1891 1892 void 1893 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 1894 { 1895 int rc = 0; 1896 char mempool_name[32]; 1897 1898 assert(cb_fn != NULL); 1899 1900 g_init_cb_fn = cb_fn; 1901 g_init_cb_arg = cb_arg; 1902 1903 spdk_notify_type_register("bdev_register"); 1904 spdk_notify_type_register("bdev_unregister"); 1905 1906 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 1907 1908 rc = spdk_iobuf_register_module("bdev"); 1909 if (rc != 0) { 1910 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 1911 bdev_init_complete(-1); 1912 return; 1913 } 1914 1915 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 1916 g_bdev_opts.bdev_io_pool_size, 1917 sizeof(struct spdk_bdev_io) + 1918 bdev_module_get_max_ctx_size(), 1919 0, 1920 SPDK_ENV_SOCKET_ID_ANY); 1921 1922 if (g_bdev_mgr.bdev_io_pool == NULL) { 1923 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 1924 bdev_init_complete(-1); 1925 return; 1926 } 1927 1928 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 1929 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 1930 if (!g_bdev_mgr.zero_buffer) { 1931 SPDK_ERRLOG("create bdev zero buffer failed\n"); 1932 bdev_init_complete(-1); 1933 return; 1934 } 1935 1936 #ifdef SPDK_CONFIG_VTUNE 1937 SPDK_LOG_DEPRECATED(vtune_support); 1938 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 1939 #endif 1940 1941 spdk_io_device_register(&g_bdev_mgr, 
bdev_mgmt_channel_create, 1942 bdev_mgmt_channel_destroy, 1943 sizeof(struct spdk_bdev_mgmt_channel), 1944 "bdev_mgr"); 1945 1946 rc = bdev_modules_init(); 1947 g_bdev_mgr.module_init_complete = true; 1948 if (rc != 0) { 1949 SPDK_ERRLOG("bdev modules init failed\n"); 1950 return; 1951 } 1952 1953 bdev_module_action_complete(); 1954 } 1955 1956 static void 1957 bdev_mgr_unregister_cb(void *io_device) 1958 { 1959 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 1960 1961 if (g_bdev_mgr.bdev_io_pool) { 1962 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 1963 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 1964 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 1965 g_bdev_opts.bdev_io_pool_size); 1966 } 1967 1968 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 1969 } 1970 1971 spdk_free(g_bdev_mgr.zero_buffer); 1972 1973 bdev_examine_allowlist_free(); 1974 1975 cb_fn(g_fini_cb_arg); 1976 g_fini_cb_fn = NULL; 1977 g_fini_cb_arg = NULL; 1978 g_bdev_mgr.init_complete = false; 1979 g_bdev_mgr.module_init_complete = false; 1980 } 1981 1982 static void 1983 bdev_module_fini_iter(void *arg) 1984 { 1985 struct spdk_bdev_module *bdev_module; 1986 1987 /* FIXME: Handling initialization failures is broken now, 1988 * so we won't even try cleaning up after successfully 1989 * initialized modules. if module_init_complete is false, 1990 * just call spdk_bdev_mgr_unregister_cb 1991 */ 1992 if (!g_bdev_mgr.module_init_complete) { 1993 bdev_mgr_unregister_cb(NULL); 1994 return; 1995 } 1996 1997 /* Start iterating from the last touched module */ 1998 if (!g_resume_bdev_module) { 1999 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2000 } else { 2001 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2002 internal.tailq); 2003 } 2004 2005 while (bdev_module) { 2006 if (bdev_module->async_fini) { 2007 /* Save our place so we can resume later. We must 2008 * save the variable here, before calling module_fini() 2009 * below, because in some cases the module may immediately 2010 * call spdk_bdev_module_fini_done() and re-enter 2011 * this function to continue iterating. */ 2012 g_resume_bdev_module = bdev_module; 2013 } 2014 2015 if (bdev_module->module_fini) { 2016 bdev_module->module_fini(); 2017 } 2018 2019 if (bdev_module->async_fini) { 2020 return; 2021 } 2022 2023 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2024 internal.tailq); 2025 } 2026 2027 g_resume_bdev_module = NULL; 2028 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2029 } 2030 2031 void 2032 spdk_bdev_module_fini_done(void) 2033 { 2034 if (spdk_get_thread() != g_fini_thread) { 2035 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2036 } else { 2037 bdev_module_fini_iter(NULL); 2038 } 2039 } 2040 2041 static void 2042 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2043 { 2044 struct spdk_bdev *bdev = cb_arg; 2045 2046 if (bdeverrno && bdev) { 2047 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2048 bdev->name); 2049 2050 /* 2051 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2052 * bdev; try to continue by manually removing this bdev from the list and continue 2053 * with the next bdev in the list. 
2054 */ 2055 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2056 } 2057 2058 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2059 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2060 /* 2061 * Bdev module finish need to be deferred as we might be in the middle of some context 2062 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2063 * after returning. 2064 */ 2065 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2066 return; 2067 } 2068 2069 /* 2070 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 2071 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2072 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2073 * base bdevs. 2074 * 2075 * Also, walk the list in the reverse order. 2076 */ 2077 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2078 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2079 spdk_spin_lock(&bdev->internal.spinlock); 2080 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2081 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2082 spdk_spin_unlock(&bdev->internal.spinlock); 2083 continue; 2084 } 2085 spdk_spin_unlock(&bdev->internal.spinlock); 2086 2087 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2088 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2089 return; 2090 } 2091 2092 /* 2093 * If any bdev fails to unclaim underlying bdev properly, we may face the 2094 * case of bdev list consisting of claimed bdevs only (if claims are managed 2095 * correctly, this would mean there's a loop in the claims graph which is 2096 * clearly impossible). Warn and unregister last bdev on the list then. 2097 */ 2098 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2099 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2100 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2101 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2102 return; 2103 } 2104 } 2105 2106 static void 2107 bdev_module_fini_start_iter(void *arg) 2108 { 2109 struct spdk_bdev_module *bdev_module; 2110 2111 if (!g_resume_bdev_module) { 2112 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2113 } else { 2114 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2115 } 2116 2117 while (bdev_module) { 2118 if (bdev_module->async_fini_start) { 2119 /* Save our place so we can resume later. We must 2120 * save the variable here, before calling fini_start() 2121 * below, because in some cases the module may immediately 2122 * call spdk_bdev_module_fini_start_done() and re-enter 2123 * this function to continue iterating. 
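 *
 * In other words, this iteration must tolerate synchronous completion: a
 * module flagged async_fini_start may still finish its work inline and call
 * spdk_bdev_module_fini_start_done() before fini_start() returns, which
 * re-enters this function and resumes at the saved module's predecessor.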
*/ 2124 g_resume_bdev_module = bdev_module; 2125 } 2126 2127 if (bdev_module->fini_start) { 2128 bdev_module->fini_start(); 2129 } 2130 2131 if (bdev_module->async_fini_start) { 2132 return; 2133 } 2134 2135 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2136 } 2137 2138 g_resume_bdev_module = NULL; 2139 2140 bdev_finish_unregister_bdevs_iter(NULL, 0); 2141 } 2142 2143 void 2144 spdk_bdev_module_fini_start_done(void) 2145 { 2146 if (spdk_get_thread() != g_fini_thread) { 2147 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2148 } else { 2149 bdev_module_fini_start_iter(NULL); 2150 } 2151 } 2152 2153 static void 2154 bdev_finish_wait_for_examine_done(void *cb_arg) 2155 { 2156 bdev_module_fini_start_iter(NULL); 2157 } 2158 2159 void 2160 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2161 { 2162 int rc; 2163 2164 assert(cb_fn != NULL); 2165 2166 g_fini_thread = spdk_get_thread(); 2167 2168 g_fini_cb_fn = cb_fn; 2169 g_fini_cb_arg = cb_arg; 2170 2171 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2172 if (rc != 0) { 2173 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2174 bdev_finish_wait_for_examine_done(NULL); 2175 } 2176 } 2177 2178 struct spdk_bdev_io * 2179 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2180 { 2181 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2182 struct spdk_bdev_io *bdev_io; 2183 2184 if (ch->per_thread_cache_count > 0) { 2185 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2186 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2187 ch->per_thread_cache_count--; 2188 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2189 /* 2190 * Don't try to look for bdev_ios in the global pool if there are 2191 * waiters on bdev_ios - we don't want this caller to jump the line. 2192 */ 2193 bdev_io = NULL; 2194 } else { 2195 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2196 } 2197 2198 return bdev_io; 2199 } 2200 2201 void 2202 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2203 { 2204 struct spdk_bdev_mgmt_channel *ch; 2205 2206 assert(bdev_io != NULL); 2207 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2208 2209 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2210 2211 if (bdev_io->internal.buf != NULL) { 2212 bdev_io_put_buf(bdev_io); 2213 } 2214 2215 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2216 ch->per_thread_cache_count++; 2217 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2218 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2219 struct spdk_bdev_io_wait_entry *entry; 2220 2221 entry = TAILQ_FIRST(&ch->io_wait_queue); 2222 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2223 entry->cb_fn(entry->cb_arg); 2224 } 2225 } else { 2226 /* We should never have a full cache with entries on the io wait queue. 
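 * Waiters are only queued when bdev_channel_get_io() could not produce a
 * bdev_io, i.e. when this cache was already empty, and every free that lands
 * in the branch above immediately drains the wait queue again, so a full
 * cache coexisting with waiters would indicate a bookkeeping bug.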
*/ 2227 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2228 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2229 } 2230 } 2231 2232 static bool 2233 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2234 { 2235 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2236 2237 switch (limit) { 2238 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2239 return true; 2240 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2241 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2242 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2243 return false; 2244 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2245 default: 2246 return false; 2247 } 2248 } 2249 2250 static bool 2251 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2252 { 2253 switch (bdev_io->type) { 2254 case SPDK_BDEV_IO_TYPE_NVME_IO: 2255 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2256 case SPDK_BDEV_IO_TYPE_READ: 2257 case SPDK_BDEV_IO_TYPE_WRITE: 2258 return true; 2259 case SPDK_BDEV_IO_TYPE_ZCOPY: 2260 if (bdev_io->u.bdev.zcopy.start) { 2261 return true; 2262 } else { 2263 return false; 2264 } 2265 default: 2266 return false; 2267 } 2268 } 2269 2270 static bool 2271 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2272 { 2273 switch (bdev_io->type) { 2274 case SPDK_BDEV_IO_TYPE_NVME_IO: 2275 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2276 /* Bit 1 (0x2) set for read operation */ 2277 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2278 return true; 2279 } else { 2280 return false; 2281 } 2282 case SPDK_BDEV_IO_TYPE_READ: 2283 return true; 2284 case SPDK_BDEV_IO_TYPE_ZCOPY: 2285 /* Populate to read from disk */ 2286 if (bdev_io->u.bdev.zcopy.populate) { 2287 return true; 2288 } else { 2289 return false; 2290 } 2291 default: 2292 return false; 2293 } 2294 } 2295 2296 static uint64_t 2297 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2298 { 2299 struct spdk_bdev *bdev = bdev_io->bdev; 2300 2301 switch (bdev_io->type) { 2302 case SPDK_BDEV_IO_TYPE_NVME_IO: 2303 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2304 return bdev_io->u.nvme_passthru.nbytes; 2305 case SPDK_BDEV_IO_TYPE_READ: 2306 case SPDK_BDEV_IO_TYPE_WRITE: 2307 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2308 case SPDK_BDEV_IO_TYPE_ZCOPY: 2309 /* Track the data in the start phase only */ 2310 if (bdev_io->u.bdev.zcopy.start) { 2311 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2312 } else { 2313 return 0; 2314 } 2315 default: 2316 return 0; 2317 } 2318 } 2319 2320 static bool 2321 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2322 { 2323 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 2324 return true; 2325 } else { 2326 return false; 2327 } 2328 } 2329 2330 static bool 2331 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2332 { 2333 if (bdev_is_read_io(io) == false) { 2334 return false; 2335 } 2336 2337 return bdev_qos_rw_queue_io(limit, io); 2338 } 2339 2340 static bool 2341 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2342 { 2343 if (bdev_is_read_io(io) == true) { 2344 return false; 2345 } 2346 2347 return bdev_qos_rw_queue_io(limit, io); 2348 } 2349 2350 static void 2351 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2352 { 2353 limit->remaining_this_timeslice--; 2354 } 2355 2356 static void 2357 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2358 { 2359 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 2360 } 2361 2362 static void 2363 
bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2364 { 2365 if (bdev_is_read_io(io) == false) { 2366 return; 2367 } 2368 2369 return bdev_qos_rw_bps_update_quota(limit, io); 2370 } 2371 2372 static void 2373 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2374 { 2375 if (bdev_is_read_io(io) == true) { 2376 return; 2377 } 2378 2379 return bdev_qos_rw_bps_update_quota(limit, io); 2380 } 2381 2382 static void 2383 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2384 { 2385 int i; 2386 2387 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2388 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2389 qos->rate_limits[i].queue_io = NULL; 2390 qos->rate_limits[i].update_quota = NULL; 2391 continue; 2392 } 2393 2394 switch (i) { 2395 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2396 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2397 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2398 break; 2399 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2400 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2401 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2402 break; 2403 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2404 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2405 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2406 break; 2407 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2408 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2409 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2410 break; 2411 default: 2412 break; 2413 } 2414 } 2415 } 2416 2417 static void 2418 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2419 struct spdk_bdev_io *bdev_io, 2420 enum spdk_bdev_io_status status) 2421 { 2422 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2423 2424 bdev_io->internal.in_submit_request = true; 2425 bdev_ch->io_outstanding++; 2426 shared_resource->io_outstanding++; 2427 spdk_bdev_io_complete(bdev_io, status); 2428 bdev_io->internal.in_submit_request = false; 2429 } 2430 2431 static inline void 2432 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2433 { 2434 struct spdk_bdev *bdev = bdev_io->bdev; 2435 struct spdk_io_channel *ch = bdev_ch->channel; 2436 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2437 2438 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2439 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2440 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2441 2442 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2443 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2444 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2445 SPDK_BDEV_IO_STATUS_SUCCESS); 2446 return; 2447 } 2448 } 2449 2450 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2451 bdev_io->bdev->split_on_write_unit && 2452 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2453 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2454 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2455 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2456 return; 2457 } 2458 2459 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2460 bdev_ch->io_outstanding++; 2461 shared_resource->io_outstanding++; 2462 bdev_io->internal.in_submit_request = true; 2463 bdev_submit_request(bdev, ch, bdev_io); 2464 
bdev_io->internal.in_submit_request = false; 2465 } else { 2466 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 2467 } 2468 } 2469 2470 static bool 2471 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2472 { 2473 int i; 2474 2475 if (bdev_qos_io_to_limit(bdev_io) == true) { 2476 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2477 if (!qos->rate_limits[i].queue_io) { 2478 continue; 2479 } 2480 2481 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2482 bdev_io) == true) { 2483 return true; 2484 } 2485 } 2486 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2487 if (!qos->rate_limits[i].update_quota) { 2488 continue; 2489 } 2490 2491 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2492 } 2493 } 2494 2495 return false; 2496 } 2497 2498 static inline void 2499 _bdev_io_do_submit(void *ctx) 2500 { 2501 struct spdk_bdev_io *bdev_io = ctx; 2502 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2503 2504 bdev_io_do_submit(ch, bdev_io); 2505 } 2506 2507 static int 2508 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2509 { 2510 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2511 int submitted_ios = 0; 2512 2513 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2514 if (!bdev_qos_queue_io(qos, bdev_io)) { 2515 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2516 2517 if (bdev_io->internal.io_submit_ch) { 2518 /* Send back the IO to the original thread for the actual processing. */ 2519 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 2520 bdev_io->internal.io_submit_ch = NULL; 2521 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 2522 _bdev_io_do_submit, bdev_io); 2523 } else { 2524 bdev_io_do_submit(ch, bdev_io); 2525 } 2526 2527 submitted_ios++; 2528 } 2529 } 2530 2531 return submitted_ios; 2532 } 2533 2534 static void 2535 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2536 { 2537 int rc; 2538 2539 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2540 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2541 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2542 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2543 &bdev_io->internal.waitq_entry); 2544 if (rc != 0) { 2545 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2546 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2547 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2548 } 2549 } 2550 2551 static bool 2552 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2553 { 2554 uint32_t io_boundary; 2555 struct spdk_bdev *bdev = bdev_io->bdev; 2556 uint32_t max_size = bdev->max_segment_size; 2557 int max_segs = bdev->max_num_segments; 2558 2559 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2560 io_boundary = bdev->write_unit_size; 2561 } else if (bdev->split_on_optimal_io_boundary) { 2562 io_boundary = bdev->optimal_io_boundary; 2563 } else { 2564 io_boundary = 0; 2565 } 2566 2567 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2568 return false; 2569 } 2570 2571 if (io_boundary) { 2572 uint64_t start_stripe, end_stripe; 2573 2574 start_stripe = bdev_io->u.bdev.offset_blocks; 2575 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2576 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
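 * Worked example (added for clarity; numbers are arbitrary): with
 * io_boundary = 8 blocks, spdk_u32_is_pow2(8) holds and spdk_u32log2(8) == 3,
 * so an I/O covering blocks 6..9 yields start_stripe = 6 >> 3 = 0 and
 * end_stripe = 9 >> 3 = 1; the stripes differ, so the I/O crosses the
 * boundary and must be split.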
*/ 2577 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2578 start_stripe >>= spdk_u32log2(io_boundary); 2579 end_stripe >>= spdk_u32log2(io_boundary); 2580 } else { 2581 start_stripe /= io_boundary; 2582 end_stripe /= io_boundary; 2583 } 2584 2585 if (start_stripe != end_stripe) { 2586 return true; 2587 } 2588 } 2589 2590 if (max_segs) { 2591 if (bdev_io->u.bdev.iovcnt > max_segs) { 2592 return true; 2593 } 2594 } 2595 2596 if (max_size) { 2597 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2598 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2599 return true; 2600 } 2601 } 2602 } 2603 2604 return false; 2605 } 2606 2607 static bool 2608 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2609 { 2610 uint32_t num_unmap_segments; 2611 2612 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2613 return false; 2614 } 2615 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2616 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2617 return true; 2618 } 2619 2620 return false; 2621 } 2622 2623 static bool 2624 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2625 { 2626 if (!bdev_io->bdev->max_write_zeroes) { 2627 return false; 2628 } 2629 2630 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2631 return true; 2632 } 2633 2634 return false; 2635 } 2636 2637 static bool 2638 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2639 { 2640 if (bdev_io->bdev->max_copy != 0 && 2641 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2642 return true; 2643 } 2644 2645 return false; 2646 } 2647 2648 static bool 2649 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2650 { 2651 switch (bdev_io->type) { 2652 case SPDK_BDEV_IO_TYPE_READ: 2653 case SPDK_BDEV_IO_TYPE_WRITE: 2654 return bdev_rw_should_split(bdev_io); 2655 case SPDK_BDEV_IO_TYPE_UNMAP: 2656 return bdev_unmap_should_split(bdev_io); 2657 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2658 return bdev_write_zeroes_should_split(bdev_io); 2659 case SPDK_BDEV_IO_TYPE_COPY: 2660 return bdev_copy_should_split(bdev_io); 2661 default: 2662 return false; 2663 } 2664 } 2665 2666 static uint32_t 2667 _to_next_boundary(uint64_t offset, uint32_t boundary) 2668 { 2669 return (boundary - (offset % boundary)); 2670 } 2671 2672 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2673 2674 static void _bdev_rw_split(void *_bdev_io); 2675 2676 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2677 2678 static void 2679 _bdev_unmap_split(void *_bdev_io) 2680 { 2681 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2682 } 2683 2684 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2685 2686 static void 2687 _bdev_write_zeroes_split(void *_bdev_io) 2688 { 2689 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2690 } 2691 2692 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 2693 2694 static void 2695 _bdev_copy_split(void *_bdev_io) 2696 { 2697 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 2698 } 2699 2700 static int 2701 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2702 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2703 { 2704 int rc; 2705 uint64_t current_offset, current_remaining, current_src_offset; 2706 spdk_bdev_io_wait_cb io_wait_fn; 2707 2708 current_offset = *offset; 2709 current_remaining = *remaining; 2710 2711 bdev_io->u.bdev.split_outstanding++; 2712 2713 io_wait_fn = 
_bdev_rw_split; 2714 switch (bdev_io->type) { 2715 case SPDK_BDEV_IO_TYPE_READ: 2716 assert(bdev_io->u.bdev.accel_sequence == NULL); 2717 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2718 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2719 iov, iovcnt, md_buf, current_offset, 2720 num_blocks, bdev_io->internal.memory_domain, 2721 bdev_io->internal.memory_domain_ctx, NULL, 2722 bdev_io_split_done, bdev_io); 2723 break; 2724 case SPDK_BDEV_IO_TYPE_WRITE: 2725 assert(bdev_io->u.bdev.accel_sequence == NULL); 2726 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2727 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2728 iov, iovcnt, md_buf, current_offset, 2729 num_blocks, bdev_io->internal.memory_domain, 2730 bdev_io->internal.memory_domain_ctx, NULL, 2731 bdev_io_split_done, bdev_io); 2732 break; 2733 case SPDK_BDEV_IO_TYPE_UNMAP: 2734 io_wait_fn = _bdev_unmap_split; 2735 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2736 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2737 current_offset, num_blocks, 2738 bdev_io_split_done, bdev_io); 2739 break; 2740 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2741 io_wait_fn = _bdev_write_zeroes_split; 2742 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2743 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2744 current_offset, num_blocks, 2745 bdev_io_split_done, bdev_io); 2746 break; 2747 case SPDK_BDEV_IO_TYPE_COPY: 2748 io_wait_fn = _bdev_copy_split; 2749 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 2750 (current_offset - bdev_io->u.bdev.offset_blocks); 2751 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 2752 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2753 current_offset, current_src_offset, num_blocks, 2754 bdev_io_split_done, bdev_io); 2755 break; 2756 default: 2757 assert(false); 2758 rc = -EINVAL; 2759 break; 2760 } 2761 2762 if (rc == 0) { 2763 current_offset += num_blocks; 2764 current_remaining -= num_blocks; 2765 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2766 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2767 *offset = current_offset; 2768 *remaining = current_remaining; 2769 } else { 2770 bdev_io->u.bdev.split_outstanding--; 2771 if (rc == -ENOMEM) { 2772 if (bdev_io->u.bdev.split_outstanding == 0) { 2773 /* No I/O is outstanding. Hence we should wait here. */ 2774 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2775 } 2776 } else { 2777 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2778 if (bdev_io->u.bdev.split_outstanding == 0) { 2779 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2780 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2781 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2782 } 2783 } 2784 } 2785 2786 return rc; 2787 } 2788 2789 static void 2790 _bdev_rw_split(void *_bdev_io) 2791 { 2792 struct iovec *parent_iov, *iov; 2793 struct spdk_bdev_io *bdev_io = _bdev_io; 2794 struct spdk_bdev *bdev = bdev_io->bdev; 2795 uint64_t parent_offset, current_offset, remaining; 2796 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2797 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2798 uint32_t iovcnt, iov_len, child_iovsize; 2799 uint32_t blocklen = bdev->blocklen; 2800 uint32_t io_boundary; 2801 uint32_t max_segment_size = bdev->max_segment_size; 2802 uint32_t max_child_iovcnt = bdev->max_num_segments; 2803 void *md_buf = NULL; 2804 int rc; 2805 2806 max_segment_size = max_segment_size ? 
max_segment_size : UINT32_MAX; 2807 max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 2808 SPDK_BDEV_IO_NUM_CHILD_IOV; 2809 2810 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2811 io_boundary = bdev->write_unit_size; 2812 } else if (bdev->split_on_optimal_io_boundary) { 2813 io_boundary = bdev->optimal_io_boundary; 2814 } else { 2815 io_boundary = UINT32_MAX; 2816 } 2817 2818 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2819 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 2820 parent_offset = bdev_io->u.bdev.offset_blocks; 2821 parent_iov_offset = (current_offset - parent_offset) * blocklen; 2822 parent_iovcnt = bdev_io->u.bdev.iovcnt; 2823 2824 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 2825 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2826 if (parent_iov_offset < parent_iov->iov_len) { 2827 break; 2828 } 2829 parent_iov_offset -= parent_iov->iov_len; 2830 } 2831 2832 child_iovcnt = 0; 2833 while (remaining > 0 && parent_iovpos < parent_iovcnt && 2834 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 2835 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 2836 to_next_boundary = spdk_min(remaining, to_next_boundary); 2837 to_next_boundary_bytes = to_next_boundary * blocklen; 2838 2839 iov = &bdev_io->child_iov[child_iovcnt]; 2840 iovcnt = 0; 2841 2842 if (bdev_io->u.bdev.md_buf) { 2843 md_buf = (char *)bdev_io->u.bdev.md_buf + 2844 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 2845 } 2846 2847 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 2848 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 2849 iovcnt < child_iovsize) { 2850 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2851 iov_len = parent_iov->iov_len - parent_iov_offset; 2852 2853 iov_len = spdk_min(iov_len, max_segment_size); 2854 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 2855 to_next_boundary_bytes -= iov_len; 2856 2857 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 2858 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 2859 2860 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 2861 parent_iov_offset += iov_len; 2862 } else { 2863 parent_iovpos++; 2864 parent_iov_offset = 0; 2865 } 2866 child_iovcnt++; 2867 iovcnt++; 2868 } 2869 2870 if (to_next_boundary_bytes > 0) { 2871 /* We had to stop this child I/O early because we ran out of 2872 * child_iov space or were limited by max_num_segments. 2873 * Ensure the iovs to be aligned with block size and 2874 * then adjust to_next_boundary before starting the 2875 * child I/O. 
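 *
 * Worked example (added for clarity; numbers are arbitrary): with
 * blocklen = 512 and 700 bytes of the boundary still uncovered when the
 * child iovs run out, to_last_block_bytes = 700 % 512 = 188, so
 * 512 - 188 = 324 bytes are trimmed from the tail of the child iovs to land
 * on a block boundary; to_next_boundary_bytes becomes 1024 and
 * to_next_boundary below shrinks by 1024 / 512 = 2 blocks.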
2876 */ 2877 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || 2878 iovcnt == child_iovsize); 2879 to_last_block_bytes = to_next_boundary_bytes % blocklen; 2880 if (to_last_block_bytes != 0) { 2881 uint32_t child_iovpos = child_iovcnt - 1; 2882 /* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV 2883 * so the loop will naturally end 2884 */ 2885 2886 to_last_block_bytes = blocklen - to_last_block_bytes; 2887 to_next_boundary_bytes += to_last_block_bytes; 2888 while (to_last_block_bytes > 0 && iovcnt > 0) { 2889 iov_len = spdk_min(to_last_block_bytes, 2890 bdev_io->child_iov[child_iovpos].iov_len); 2891 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 2892 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 2893 child_iovpos--; 2894 if (--iovcnt == 0) { 2895 /* If the child IO is less than a block size just return. 2896 * If the first child IO of any split round is less than 2897 * a block size, exit with an error. 2898 */ 2899 if (bdev_io->u.bdev.split_outstanding == 0) { 2900 SPDK_ERRLOG("The first child io was less than a block size\n"); 2901 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2902 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2903 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2904 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2905 } 2906 2907 return; 2908 } 2909 } 2910 2911 to_last_block_bytes -= iov_len; 2912 2913 if (parent_iov_offset == 0) { 2914 parent_iovpos--; 2915 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 2916 } 2917 parent_iov_offset -= iov_len; 2918 } 2919 2920 assert(to_last_block_bytes == 0); 2921 } 2922 to_next_boundary -= to_next_boundary_bytes / blocklen; 2923 } 2924 2925 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 2926 &current_offset, &remaining); 2927 if (spdk_unlikely(rc)) { 2928 return; 2929 } 2930 } 2931 } 2932 2933 static void 2934 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 2935 { 2936 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 2937 uint32_t num_children_reqs = 0; 2938 int rc; 2939 2940 offset = bdev_io->u.bdev.split_current_offset_blocks; 2941 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2942 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 2943 2944 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2945 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 2946 2947 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 2948 &offset, &remaining); 2949 if (spdk_likely(rc == 0)) { 2950 num_children_reqs++; 2951 } else { 2952 return; 2953 } 2954 } 2955 } 2956 2957 static void 2958 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 2959 { 2960 uint64_t offset, write_zeroes_blocks, remaining; 2961 uint32_t num_children_reqs = 0; 2962 int rc; 2963 2964 offset = bdev_io->u.bdev.split_current_offset_blocks; 2965 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2966 2967 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2968 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 2969 2970 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 2971 &offset, &remaining); 2972 if (spdk_likely(rc == 0)) { 2973 num_children_reqs++; 2974 } else { 2975 return; 2976 } 2977 } 2978 } 2979 2980 static void 2981 bdev_copy_split(struct spdk_bdev_io *bdev_io) 2982 { 2983 uint64_t offset,
copy_blocks, remaining; 2984 uint32_t num_children_reqs = 0; 2985 int rc; 2986 2987 offset = bdev_io->u.bdev.split_current_offset_blocks; 2988 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2989 2990 assert(bdev_io->bdev->max_copy != 0); 2991 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 2992 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 2993 2994 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 2995 &offset, &remaining); 2996 if (spdk_likely(rc == 0)) { 2997 num_children_reqs++; 2998 } else { 2999 return; 3000 } 3001 } 3002 } 3003 3004 static void 3005 parent_bdev_io_complete(void *ctx, int rc) 3006 { 3007 struct spdk_bdev_io *parent_io = ctx; 3008 3009 if (rc) { 3010 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3011 } 3012 3013 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3014 parent_io->internal.caller_ctx); 3015 } 3016 3017 static void 3018 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3019 { 3020 struct spdk_bdev_io *bdev_io = ctx; 3021 3022 /* u.bdev.accel_sequence should have already been cleared at this point */ 3023 assert(bdev_io->u.bdev.accel_sequence == NULL); 3024 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3025 3026 TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 3027 bdev_io->internal.accel_sequence = NULL; 3028 3029 if (spdk_unlikely(status != 0)) { 3030 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3031 } 3032 3033 parent_bdev_io_complete(bdev_io, status); 3034 } 3035 3036 static void 3037 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3038 { 3039 struct spdk_bdev_io *parent_io = cb_arg; 3040 3041 spdk_bdev_free_io(bdev_io); 3042 3043 if (!success) { 3044 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3045 /* If any child I/O failed, stop further splitting process. */ 3046 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 3047 parent_io->u.bdev.split_remaining_num_blocks = 0; 3048 } 3049 parent_io->u.bdev.split_outstanding--; 3050 if (parent_io->u.bdev.split_outstanding != 0) { 3051 return; 3052 } 3053 3054 /* 3055 * Parent I/O finishes when all blocks are consumed. 3056 */ 3057 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 3058 assert(parent_io->internal.cb != bdev_io_split_done); 3059 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 3060 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 3061 3062 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io) && 3063 spdk_likely(success)) { 3064 bdev_io_exec_sequence(bdev_io, bdev_io_complete_parent_sequence_cb); 3065 } else if (parent_io->internal.orig_iovcnt != 0) { 3066 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3067 /* bdev IO will be completed in the callback */ 3068 } else { 3069 parent_bdev_io_complete(parent_io, 0); 3070 } 3071 return; 3072 } 3073 3074 /* 3075 * Continue with the splitting process. This function will complete the parent I/O if the 3076 * splitting is done. 
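 * Each *_split() helper below simply resumes from
 * split_current_offset_blocks / split_remaining_num_blocks, which
 * bdev_io_split_submit() advanced as the previous children were issued, so
 * no additional bookkeeping is needed here.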
3077 */ 3078 switch (parent_io->type) { 3079 case SPDK_BDEV_IO_TYPE_READ: 3080 case SPDK_BDEV_IO_TYPE_WRITE: 3081 _bdev_rw_split(parent_io); 3082 break; 3083 case SPDK_BDEV_IO_TYPE_UNMAP: 3084 bdev_unmap_split(parent_io); 3085 break; 3086 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3087 bdev_write_zeroes_split(parent_io); 3088 break; 3089 case SPDK_BDEV_IO_TYPE_COPY: 3090 bdev_copy_split(parent_io); 3091 break; 3092 default: 3093 assert(false); 3094 break; 3095 } 3096 } 3097 3098 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3099 bool success); 3100 3101 static void 3102 bdev_io_split(struct spdk_bdev_io *bdev_io) 3103 { 3104 assert(bdev_io_should_split(bdev_io)); 3105 3106 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3107 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3108 bdev_io->u.bdev.split_outstanding = 0; 3109 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3110 3111 switch (bdev_io->type) { 3112 case SPDK_BDEV_IO_TYPE_READ: 3113 case SPDK_BDEV_IO_TYPE_WRITE: 3114 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3115 _bdev_rw_split(bdev_io); 3116 } else { 3117 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3118 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3119 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3120 } 3121 break; 3122 case SPDK_BDEV_IO_TYPE_UNMAP: 3123 bdev_unmap_split(bdev_io); 3124 break; 3125 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3126 bdev_write_zeroes_split(bdev_io); 3127 break; 3128 case SPDK_BDEV_IO_TYPE_COPY: 3129 bdev_copy_split(bdev_io); 3130 break; 3131 default: 3132 assert(false); 3133 break; 3134 } 3135 } 3136 3137 static void 3138 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3139 { 3140 if (!success) { 3141 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3142 return; 3143 } 3144 3145 _bdev_rw_split(bdev_io); 3146 } 3147 3148 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 3149 * be inlined, at least on some compilers. 
3150 */ 3151 static inline void 3152 _bdev_io_submit(void *ctx) 3153 { 3154 struct spdk_bdev_io *bdev_io = ctx; 3155 struct spdk_bdev *bdev = bdev_io->bdev; 3156 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3157 3158 if (spdk_likely(bdev_ch->flags == 0)) { 3159 bdev_io_do_submit(bdev_ch, bdev_io); 3160 return; 3161 } 3162 3163 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3164 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3165 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3166 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3167 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 3168 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3169 } else { 3170 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 3171 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3172 } 3173 } else { 3174 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3175 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3176 } 3177 } 3178 3179 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3180 3181 bool 3182 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3183 { 3184 if (range1->length == 0 || range2->length == 0) { 3185 return false; 3186 } 3187 3188 if (range1->offset + range1->length <= range2->offset) { 3189 return false; 3190 } 3191 3192 if (range2->offset + range2->length <= range1->offset) { 3193 return false; 3194 } 3195 3196 return true; 3197 } 3198 3199 static bool 3200 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3201 { 3202 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3203 struct lba_range r; 3204 3205 switch (bdev_io->type) { 3206 case SPDK_BDEV_IO_TYPE_NVME_IO: 3207 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3208 /* Don't try to decode the NVMe command - just assume worst-case and that 3209 * it overlaps a locked range. 3210 */ 3211 return true; 3212 case SPDK_BDEV_IO_TYPE_WRITE: 3213 case SPDK_BDEV_IO_TYPE_UNMAP: 3214 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3215 case SPDK_BDEV_IO_TYPE_ZCOPY: 3216 case SPDK_BDEV_IO_TYPE_COPY: 3217 r.offset = bdev_io->u.bdev.offset_blocks; 3218 r.length = bdev_io->u.bdev.num_blocks; 3219 if (!bdev_lba_range_overlapped(range, &r)) { 3220 /* This I/O doesn't overlap the specified LBA range. */ 3221 return false; 3222 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3223 /* This I/O overlaps, but the I/O is on the same channel that locked this 3224 * range, and the caller_ctx is the same as the locked_ctx. This means 3225 * that this I/O is associated with the lock, and is allowed to execute. 
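 * For example, the holder of a range lock can keep writing to the locked
 * LBAs from the owning channel with the locking context, while the same
 * write submitted from another channel (or with a different caller_ctx) is
 * parked on ch->io_locked in bdev_io_submit() until the range is unlocked.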
3226 */ 3227 return false; 3228 } else { 3229 return true; 3230 } 3231 default: 3232 return false; 3233 } 3234 } 3235 3236 void 3237 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3238 { 3239 struct spdk_bdev *bdev = bdev_io->bdev; 3240 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 3241 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3242 3243 assert(thread != NULL); 3244 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3245 3246 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3247 struct lba_range *range; 3248 3249 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3250 if (bdev_io_range_is_locked(bdev_io, range)) { 3251 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3252 return; 3253 } 3254 } 3255 } 3256 3257 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 3258 3259 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3260 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 3261 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3262 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3263 spdk_bdev_get_name(bdev)); 3264 3265 if (bdev_io->internal.split) { 3266 bdev_io_split(bdev_io); 3267 return; 3268 } 3269 3270 if (ch->flags & BDEV_CH_QOS_ENABLED) { 3271 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 3272 _bdev_io_submit(bdev_io); 3273 } else { 3274 bdev_io->internal.io_submit_ch = ch; 3275 bdev_io->internal.ch = bdev->internal.qos->ch; 3276 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 3277 } 3278 } else { 3279 _bdev_io_submit(bdev_io); 3280 } 3281 } 3282 3283 static inline void 3284 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3285 { 3286 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3287 3288 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 3289 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3290 * For write operation we need to pull buffers from memory domain before submitting IO. 3291 * Once read operation completes, we need to use memory_domain push functionality to 3292 * update data in original memory domain IO buffer 3293 * This IO request will go through a regular IO flow, so clear memory domains pointers */ 3294 bdev_io->u.bdev.memory_domain = NULL; 3295 bdev_io->u.bdev.memory_domain_ctx = NULL; 3296 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 3297 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3298 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3299 } 3300 3301 static inline void 3302 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3303 { 3304 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3305 bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io); 3306 3307 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3308 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3309 bdev_io_complete_unsubmitted(bdev_io); 3310 return; 3311 } 3312 3313 /* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does 3314 * support them, but we need to execute an accel sequence and the data buffer is from accel 3315 * memory domain (to avoid doing a push/pull from that domain). 
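 * Put differently, the bounce path below is taken when either (a) the I/O
 * carries a memory domain but the descriptor reports no memory-domain
 * support, or (b) an accel sequence must be executed and the buffer lives in
 * spdk_accel_get_memory_domain(); every other case proceeds to the sequence
 * execution or straight to bdev_io_submit().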
3316 */ 3317 if ((bdev_io->internal.memory_domain && !desc->memory_domains_supported) || 3318 (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3319 _bdev_io_ext_use_bounce_buffer(bdev_io); 3320 return; 3321 } 3322 3323 if (needs_exec) { 3324 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3325 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3326 return; 3327 } 3328 /* For reads we'll execute the sequence after the data is read, so, for now, only 3329 * clear out accel_sequence pointer and submit the IO */ 3330 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3331 bdev_io->u.bdev.accel_sequence = NULL; 3332 } 3333 3334 bdev_io_submit(bdev_io); 3335 } 3336 3337 static void 3338 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3339 { 3340 struct spdk_bdev *bdev = bdev_io->bdev; 3341 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3342 struct spdk_io_channel *ch = bdev_ch->channel; 3343 3344 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3345 3346 bdev_io->internal.in_submit_request = true; 3347 bdev_submit_request(bdev, ch, bdev_io); 3348 bdev_io->internal.in_submit_request = false; 3349 } 3350 3351 void 3352 bdev_io_init(struct spdk_bdev_io *bdev_io, 3353 struct spdk_bdev *bdev, void *cb_arg, 3354 spdk_bdev_io_completion_cb cb) 3355 { 3356 bdev_io->bdev = bdev; 3357 bdev_io->internal.caller_ctx = cb_arg; 3358 bdev_io->internal.cb = cb; 3359 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3360 bdev_io->internal.in_submit_request = false; 3361 bdev_io->internal.buf = NULL; 3362 bdev_io->internal.io_submit_ch = NULL; 3363 bdev_io->internal.orig_iovs = NULL; 3364 bdev_io->internal.orig_iovcnt = 0; 3365 bdev_io->internal.orig_md_iov.iov_base = NULL; 3366 bdev_io->internal.error.nvme.cdw0 = 0; 3367 bdev_io->num_retries = 0; 3368 bdev_io->internal.get_buf_cb = NULL; 3369 bdev_io->internal.get_aux_buf_cb = NULL; 3370 bdev_io->internal.memory_domain = NULL; 3371 bdev_io->internal.memory_domain_ctx = NULL; 3372 bdev_io->internal.data_transfer_cpl = NULL; 3373 bdev_io->internal.split = bdev_io_should_split(bdev_io); 3374 bdev_io->internal.accel_sequence = NULL; 3375 } 3376 3377 static bool 3378 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3379 { 3380 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3381 } 3382 3383 bool 3384 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3385 { 3386 bool supported; 3387 3388 supported = bdev_io_type_supported(bdev, io_type); 3389 3390 if (!supported) { 3391 switch (io_type) { 3392 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3393 /* The bdev layer will emulate write zeroes as long as write is supported. 
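 * (The emulation, implemented later in this file, issues regular writes of
 * the shared zero buffer allocated in spdk_bdev_initialize(), so only WRITE
 * support is required from the module.)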
*/ 3394 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3395 break; 3396 default: 3397 break; 3398 } 3399 } 3400 3401 return supported; 3402 } 3403 3404 uint64_t 3405 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3406 { 3407 return bdev_io->internal.submit_tsc; 3408 } 3409 3410 int 3411 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3412 { 3413 if (bdev->fn_table->dump_info_json) { 3414 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3415 } 3416 3417 return 0; 3418 } 3419 3420 static void 3421 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3422 { 3423 uint32_t max_per_timeslice = 0; 3424 int i; 3425 3426 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3427 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3428 qos->rate_limits[i].max_per_timeslice = 0; 3429 continue; 3430 } 3431 3432 max_per_timeslice = qos->rate_limits[i].limit * 3433 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3434 3435 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3436 qos->rate_limits[i].min_per_timeslice); 3437 3438 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 3439 } 3440 3441 bdev_qos_set_ops(qos); 3442 } 3443 3444 static int 3445 bdev_channel_poll_qos(void *arg) 3446 { 3447 struct spdk_bdev_qos *qos = arg; 3448 uint64_t now = spdk_get_ticks(); 3449 int i; 3450 3451 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3452 /* We received our callback earlier than expected - return 3453 * immediately and wait to do accounting until at least one 3454 * timeslice has actually expired. This should never happen 3455 * with a well-behaved timer implementation. 3456 */ 3457 return SPDK_POLLER_IDLE; 3458 } 3459 3460 /* Reset for next round of rate limiting */ 3461 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3462 /* We may have allowed the IOs or bytes to slightly overrun in the last 3463 * timeslice. remaining_this_timeslice is signed, so if it's negative 3464 * here, we'll account for the overrun so that the next timeslice will 3465 * be appropriately reduced. 
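 *
 * Worked example (added for clarity; numbers are arbitrary): if 4096 bytes
 * of byte quota remained and an 8192-byte I/O was admitted, the slice ends
 * with remaining_this_timeslice == -4096. Positive leftovers are zeroed just
 * below, but negative values are kept, so after max_per_timeslice is added
 * back the next slice effectively starts 4096 bytes short.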
3466 */ 3467 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 3468 qos->rate_limits[i].remaining_this_timeslice = 0; 3469 } 3470 } 3471 3472 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3473 qos->last_timeslice += qos->timeslice_size; 3474 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3475 qos->rate_limits[i].remaining_this_timeslice += 3476 qos->rate_limits[i].max_per_timeslice; 3477 } 3478 } 3479 3480 return bdev_qos_io_submit(qos->ch, qos); 3481 } 3482 3483 static void 3484 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3485 { 3486 struct spdk_bdev_shared_resource *shared_resource; 3487 struct lba_range *range; 3488 3489 bdev_free_io_stat(ch->stat); 3490 #ifdef SPDK_CONFIG_VTUNE 3491 bdev_free_io_stat(ch->prev_stat); 3492 #endif 3493 3494 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3495 range = TAILQ_FIRST(&ch->locked_ranges); 3496 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3497 free(range); 3498 } 3499 3500 spdk_put_io_channel(ch->channel); 3501 spdk_put_io_channel(ch->accel_channel); 3502 3503 shared_resource = ch->shared_resource; 3504 3505 assert(TAILQ_EMPTY(&ch->io_locked)); 3506 assert(TAILQ_EMPTY(&ch->io_submitted)); 3507 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 3508 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 3509 assert(ch->io_outstanding == 0); 3510 assert(shared_resource->ref > 0); 3511 shared_resource->ref--; 3512 if (shared_resource->ref == 0) { 3513 assert(shared_resource->io_outstanding == 0); 3514 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3515 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3516 free(shared_resource); 3517 } 3518 } 3519 3520 static void 3521 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3522 { 3523 struct spdk_bdev_qos *qos = bdev->internal.qos; 3524 int i; 3525 3526 assert(spdk_spin_held(&bdev->internal.spinlock)); 3527 3528 /* Rate limiting on this bdev enabled */ 3529 if (qos) { 3530 if (qos->ch == NULL) { 3531 struct spdk_io_channel *io_ch; 3532 3533 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3534 bdev->name, spdk_get_thread()); 3535 3536 /* No qos channel has been selected, so set one up */ 3537 3538 /* Take another reference to ch */ 3539 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3540 assert(io_ch != NULL); 3541 qos->ch = ch; 3542 3543 qos->thread = spdk_io_channel_get_thread(io_ch); 3544 3545 TAILQ_INIT(&qos->queued); 3546 3547 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3548 if (bdev_qos_is_iops_rate_limit(i) == true) { 3549 qos->rate_limits[i].min_per_timeslice = 3550 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3551 } else { 3552 qos->rate_limits[i].min_per_timeslice = 3553 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3554 } 3555 3556 if (qos->rate_limits[i].limit == 0) { 3557 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3558 } 3559 } 3560 bdev_qos_update_max_quota_per_timeslice(qos); 3561 qos->timeslice_size = 3562 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3563 qos->last_timeslice = spdk_get_ticks(); 3564 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3565 qos, 3566 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3567 } 3568 3569 ch->flags |= BDEV_CH_QOS_ENABLED; 3570 } 3571 } 3572 3573 struct poll_timeout_ctx { 3574 struct spdk_bdev_desc *desc; 3575 uint64_t timeout_in_sec; 3576 spdk_bdev_io_timeout_cb cb_fn; 3577 void *cb_arg; 3578 }; 3579 3580 static void 3581 bdev_desc_free(struct spdk_bdev_desc 
*desc) 3582 { 3583 spdk_spin_destroy(&desc->spinlock); 3584 free(desc->media_events_buffer); 3585 free(desc); 3586 } 3587 3588 static void 3589 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 3590 { 3591 struct poll_timeout_ctx *ctx = _ctx; 3592 struct spdk_bdev_desc *desc = ctx->desc; 3593 3594 free(ctx); 3595 3596 spdk_spin_lock(&desc->spinlock); 3597 desc->refs--; 3598 if (desc->closed == true && desc->refs == 0) { 3599 spdk_spin_unlock(&desc->spinlock); 3600 bdev_desc_free(desc); 3601 return; 3602 } 3603 spdk_spin_unlock(&desc->spinlock); 3604 } 3605 3606 static void 3607 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3608 struct spdk_io_channel *io_ch, void *_ctx) 3609 { 3610 struct poll_timeout_ctx *ctx = _ctx; 3611 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3612 struct spdk_bdev_desc *desc = ctx->desc; 3613 struct spdk_bdev_io *bdev_io; 3614 uint64_t now; 3615 3616 spdk_spin_lock(&desc->spinlock); 3617 if (desc->closed == true) { 3618 spdk_spin_unlock(&desc->spinlock); 3619 spdk_bdev_for_each_channel_continue(i, -1); 3620 return; 3621 } 3622 spdk_spin_unlock(&desc->spinlock); 3623 3624 now = spdk_get_ticks(); 3625 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3626 /* Exclude any I/O that are generated via splitting. */ 3627 if (bdev_io->internal.cb == bdev_io_split_done) { 3628 continue; 3629 } 3630 3631 /* Once we find an I/O that has not timed out, we can immediately 3632 * exit the loop. 3633 */ 3634 if (now < (bdev_io->internal.submit_tsc + 3635 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3636 goto end; 3637 } 3638 3639 if (bdev_io->internal.desc == desc) { 3640 ctx->cb_fn(ctx->cb_arg, bdev_io); 3641 } 3642 } 3643 3644 end: 3645 spdk_bdev_for_each_channel_continue(i, 0); 3646 } 3647 3648 static int 3649 bdev_poll_timeout_io(void *arg) 3650 { 3651 struct spdk_bdev_desc *desc = arg; 3652 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3653 struct poll_timeout_ctx *ctx; 3654 3655 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3656 if (!ctx) { 3657 SPDK_ERRLOG("failed to allocate memory\n"); 3658 return SPDK_POLLER_BUSY; 3659 } 3660 ctx->desc = desc; 3661 ctx->cb_arg = desc->cb_arg; 3662 ctx->cb_fn = desc->cb_fn; 3663 ctx->timeout_in_sec = desc->timeout_in_sec; 3664 3665 /* Take a ref on the descriptor in case it gets closed while we are checking 3666 * all of the channels. 
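 * The matching decrement happens in bdev_channel_poll_timeout_io_done();
 * if the descriptor was closed while the per-channel walk was in flight,
 * that callback also performs the final bdev_desc_free().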
3667 */ 3668 spdk_spin_lock(&desc->spinlock); 3669 desc->refs++; 3670 spdk_spin_unlock(&desc->spinlock); 3671 3672 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 3673 bdev_channel_poll_timeout_io_done); 3674 3675 return SPDK_POLLER_BUSY; 3676 } 3677 3678 int 3679 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3680 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3681 { 3682 assert(desc->thread == spdk_get_thread()); 3683 3684 spdk_poller_unregister(&desc->io_timeout_poller); 3685 3686 if (timeout_in_sec) { 3687 assert(cb_fn != NULL); 3688 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3689 desc, 3690 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3691 1000); 3692 if (desc->io_timeout_poller == NULL) { 3693 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3694 return -1; 3695 } 3696 } 3697 3698 desc->cb_fn = cb_fn; 3699 desc->cb_arg = cb_arg; 3700 desc->timeout_in_sec = timeout_in_sec; 3701 3702 return 0; 3703 } 3704 3705 static int 3706 bdev_channel_create(void *io_device, void *ctx_buf) 3707 { 3708 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3709 struct spdk_bdev_channel *ch = ctx_buf; 3710 struct spdk_io_channel *mgmt_io_ch; 3711 struct spdk_bdev_mgmt_channel *mgmt_ch; 3712 struct spdk_bdev_shared_resource *shared_resource; 3713 struct lba_range *range; 3714 3715 ch->bdev = bdev; 3716 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3717 if (!ch->channel) { 3718 return -1; 3719 } 3720 3721 ch->accel_channel = spdk_accel_get_io_channel(); 3722 if (!ch->accel_channel) { 3723 spdk_put_io_channel(ch->channel); 3724 return -1; 3725 } 3726 3727 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3728 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3729 3730 assert(ch->histogram == NULL); 3731 if (bdev->internal.histogram_enabled) { 3732 ch->histogram = spdk_histogram_data_alloc(); 3733 if (ch->histogram == NULL) { 3734 SPDK_ERRLOG("Could not allocate histogram\n"); 3735 } 3736 } 3737 3738 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3739 if (!mgmt_io_ch) { 3740 spdk_put_io_channel(ch->channel); 3741 spdk_put_io_channel(ch->accel_channel); 3742 return -1; 3743 } 3744 3745 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 3746 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3747 if (shared_resource->shared_ch == ch->channel) { 3748 spdk_put_io_channel(mgmt_io_ch); 3749 shared_resource->ref++; 3750 break; 3751 } 3752 } 3753 3754 if (shared_resource == NULL) { 3755 shared_resource = calloc(1, sizeof(*shared_resource)); 3756 if (shared_resource == NULL) { 3757 spdk_put_io_channel(ch->channel); 3758 spdk_put_io_channel(ch->accel_channel); 3759 spdk_put_io_channel(mgmt_io_ch); 3760 return -1; 3761 } 3762 3763 shared_resource->mgmt_ch = mgmt_ch; 3764 shared_resource->io_outstanding = 0; 3765 TAILQ_INIT(&shared_resource->nomem_io); 3766 shared_resource->nomem_threshold = 0; 3767 shared_resource->shared_ch = ch->channel; 3768 shared_resource->ref = 1; 3769 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3770 } 3771 3772 ch->io_outstanding = 0; 3773 TAILQ_INIT(&ch->queued_resets); 3774 TAILQ_INIT(&ch->locked_ranges); 3775 ch->flags = 0; 3776 ch->shared_resource = shared_resource; 3777 3778 TAILQ_INIT(&ch->io_submitted); 3779 TAILQ_INIT(&ch->io_locked); 3780 TAILQ_INIT(&ch->io_accel_exec); 3781 TAILQ_INIT(&ch->io_memory_domain); 3782 3783 ch->stat = bdev_alloc_io_stat(false); 3784 if (ch->stat == NULL) { 3785 
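/* Stat allocation failed: release the I/O channels and the shared resource acquired above. */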
bdev_channel_destroy_resource(ch); 3786 return -1; 3787 } 3788 3789 ch->stat->ticks_rate = spdk_get_ticks_hz(); 3790 3791 #ifdef SPDK_CONFIG_VTUNE 3792 { 3793 char *name; 3794 __itt_init_ittlib(NULL, 0); 3795 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3796 if (!name) { 3797 bdev_channel_destroy_resource(ch); 3798 return -1; 3799 } 3800 ch->handle = __itt_string_handle_create(name); 3801 free(name); 3802 ch->start_tsc = spdk_get_ticks(); 3803 ch->interval_tsc = spdk_get_ticks_hz() / 100; 3804 ch->prev_stat = bdev_alloc_io_stat(false); 3805 if (ch->prev_stat == NULL) { 3806 bdev_channel_destroy_resource(ch); 3807 return -1; 3808 } 3809 } 3810 #endif 3811 3812 spdk_spin_lock(&bdev->internal.spinlock); 3813 bdev_enable_qos(bdev, ch); 3814 3815 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 3816 struct lba_range *new_range; 3817 3818 new_range = calloc(1, sizeof(*new_range)); 3819 if (new_range == NULL) { 3820 spdk_spin_unlock(&bdev->internal.spinlock); 3821 bdev_channel_destroy_resource(ch); 3822 return -1; 3823 } 3824 new_range->length = range->length; 3825 new_range->offset = range->offset; 3826 new_range->locked_ctx = range->locked_ctx; 3827 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 3828 } 3829 3830 spdk_spin_unlock(&bdev->internal.spinlock); 3831 3832 return 0; 3833 } 3834 3835 static int 3836 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 3837 void *cb_ctx) 3838 { 3839 struct spdk_bdev_channel *bdev_ch = cb_ctx; 3840 struct spdk_bdev_io *bdev_io; 3841 uint64_t buf_len; 3842 3843 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 3844 if (bdev_io->internal.ch == bdev_ch) { 3845 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 3846 spdk_iobuf_entry_abort(ch, entry, buf_len); 3847 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3848 } 3849 3850 return 0; 3851 } 3852 3853 /* 3854 * Abort I/O that are waiting on a data buffer. 3855 */ 3856 static void 3857 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 3858 { 3859 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 3860 bdev_abort_all_buf_io_cb, ch); 3861 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 3862 bdev_abort_all_buf_io_cb, ch); 3863 } 3864 3865 /* 3866 * Abort I/O that are queued waiting for submission. These types of I/O are 3867 * linked using the spdk_bdev_io link TAILQ_ENTRY. 3868 */ 3869 static void 3870 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 3871 { 3872 struct spdk_bdev_io *bdev_io, *tmp; 3873 3874 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 3875 if (bdev_io->internal.ch == ch) { 3876 TAILQ_REMOVE(queue, bdev_io, internal.link); 3877 /* 3878 * spdk_bdev_io_complete() assumes that the completed I/O had 3879 * been submitted to the bdev module. Since in this case it 3880 * hadn't, bump io_outstanding to account for the decrement 3881 * that spdk_bdev_io_complete() will do. 
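 * Resets are excluded below because io_outstanding is neither incremented when a reset
 * is submitted nor decremented when it completes.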
3882 */ 3883 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 3884 ch->io_outstanding++; 3885 ch->shared_resource->io_outstanding++; 3886 } 3887 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3888 } 3889 } 3890 } 3891 3892 static bool 3893 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3894 { 3895 struct spdk_bdev_io *bdev_io; 3896 3897 TAILQ_FOREACH(bdev_io, queue, internal.link) { 3898 if (bdev_io == bio_to_abort) { 3899 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 3900 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3901 return true; 3902 } 3903 } 3904 3905 return false; 3906 } 3907 3908 static int 3909 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 3910 { 3911 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 3912 uint64_t buf_len; 3913 3914 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 3915 if (bdev_io == bio_to_abort) { 3916 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 3917 spdk_iobuf_entry_abort(ch, entry, buf_len); 3918 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3919 return 1; 3920 } 3921 3922 return 0; 3923 } 3924 3925 static bool 3926 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 3927 { 3928 int rc; 3929 3930 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 3931 bdev_abort_buf_io_cb, bio_to_abort); 3932 if (rc == 1) { 3933 return true; 3934 } 3935 3936 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 3937 bdev_abort_buf_io_cb, bio_to_abort); 3938 return rc == 1; 3939 } 3940 3941 static void 3942 bdev_qos_channel_destroy(void *cb_arg) 3943 { 3944 struct spdk_bdev_qos *qos = cb_arg; 3945 3946 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 3947 spdk_poller_unregister(&qos->poller); 3948 3949 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 3950 3951 free(qos); 3952 } 3953 3954 static int 3955 bdev_qos_destroy(struct spdk_bdev *bdev) 3956 { 3957 int i; 3958 3959 /* 3960 * Cleanly shutting down the QoS poller is tricky, because 3961 * during the asynchronous operation the user could open 3962 * a new descriptor and create a new channel, spawning 3963 * a new QoS poller. 3964 * 3965 * The strategy is to create a new QoS structure here and swap it 3966 * in. The shutdown path then continues to refer to the old one 3967 * until it completes and then releases it. 3968 */ 3969 struct spdk_bdev_qos *new_qos, *old_qos; 3970 3971 old_qos = bdev->internal.qos; 3972 3973 new_qos = calloc(1, sizeof(*new_qos)); 3974 if (!new_qos) { 3975 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 3976 return -ENOMEM; 3977 } 3978 3979 /* Copy the old QoS data into the newly allocated structure */ 3980 memcpy(new_qos, old_qos, sizeof(*new_qos)); 3981 3982 /* Zero out the key parts of the QoS structure */ 3983 new_qos->ch = NULL; 3984 new_qos->thread = NULL; 3985 new_qos->poller = NULL; 3986 TAILQ_INIT(&new_qos->queued); 3987 /* 3988 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 3989 * It will be used later for the new QoS structure. 
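 * For example, a previously configured rw_ios_per_sec limit carries over unchanged,
 * while the per-timeslice counters below restart from zero.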
3990 */ 3991 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3992 new_qos->rate_limits[i].remaining_this_timeslice = 0; 3993 new_qos->rate_limits[i].min_per_timeslice = 0; 3994 new_qos->rate_limits[i].max_per_timeslice = 0; 3995 } 3996 3997 bdev->internal.qos = new_qos; 3998 3999 if (old_qos->thread == NULL) { 4000 free(old_qos); 4001 } else { 4002 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4003 } 4004 4005 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4006 * been destroyed yet. The destruction path will end up waiting for the final 4007 * channel to be put before it releases resources. */ 4008 4009 return 0; 4010 } 4011 4012 void 4013 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4014 { 4015 total->bytes_read += add->bytes_read; 4016 total->num_read_ops += add->num_read_ops; 4017 total->bytes_written += add->bytes_written; 4018 total->num_write_ops += add->num_write_ops; 4019 total->bytes_unmapped += add->bytes_unmapped; 4020 total->num_unmap_ops += add->num_unmap_ops; 4021 total->bytes_copied += add->bytes_copied; 4022 total->num_copy_ops += add->num_copy_ops; 4023 total->read_latency_ticks += add->read_latency_ticks; 4024 total->write_latency_ticks += add->write_latency_ticks; 4025 total->unmap_latency_ticks += add->unmap_latency_ticks; 4026 total->copy_latency_ticks += add->copy_latency_ticks; 4027 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4028 total->max_read_latency_ticks = add->max_read_latency_ticks; 4029 } 4030 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4031 total->min_read_latency_ticks = add->min_read_latency_ticks; 4032 } 4033 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4034 total->max_write_latency_ticks = add->max_write_latency_ticks; 4035 } 4036 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4037 total->min_write_latency_ticks = add->min_write_latency_ticks; 4038 } 4039 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4040 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4041 } 4042 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4043 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4044 } 4045 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4046 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4047 } 4048 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4049 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4050 } 4051 } 4052 4053 static void 4054 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4055 { 4056 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4057 4058 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4059 memcpy(to_stat->io_error, from_stat->io_error, 4060 sizeof(struct spdk_bdev_io_error_stat)); 4061 } 4062 } 4063 4064 void 4065 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4066 { 4067 stat->max_read_latency_ticks = 0; 4068 stat->min_read_latency_ticks = UINT64_MAX; 4069 stat->max_write_latency_ticks = 0; 4070 stat->min_write_latency_ticks = UINT64_MAX; 4071 stat->max_unmap_latency_ticks = 0; 4072 stat->min_unmap_latency_ticks = UINT64_MAX; 4073 stat->max_copy_latency_ticks = 0; 4074 stat->min_copy_latency_ticks = UINT64_MAX; 4075 4076 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4077 return; 4078 } 
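/* SPDK_BDEV_RESET_STAT_ALL: clear the cumulative byte/op counters, latency totals and per-status error counts as well. */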
4079 4080 stat->bytes_read = 0; 4081 stat->num_read_ops = 0; 4082 stat->bytes_written = 0; 4083 stat->num_write_ops = 0; 4084 stat->bytes_unmapped = 0; 4085 stat->num_unmap_ops = 0; 4086 stat->bytes_copied = 0; 4087 stat->num_copy_ops = 0; 4088 stat->read_latency_ticks = 0; 4089 stat->write_latency_ticks = 0; 4090 stat->unmap_latency_ticks = 0; 4091 stat->copy_latency_ticks = 0; 4092 4093 if (stat->io_error != NULL) { 4094 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 4095 } 4096 } 4097 4098 struct spdk_bdev_io_stat * 4099 bdev_alloc_io_stat(bool io_error_stat) 4100 { 4101 struct spdk_bdev_io_stat *stat; 4102 4103 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4104 if (stat == NULL) { 4105 return NULL; 4106 } 4107 4108 if (io_error_stat) { 4109 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4110 if (stat->io_error == NULL) { 4111 free(stat); 4112 return NULL; 4113 } 4114 } else { 4115 stat->io_error = NULL; 4116 } 4117 4118 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4119 4120 return stat; 4121 } 4122 4123 void 4124 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4125 { 4126 if (stat != NULL) { 4127 free(stat->io_error); 4128 free(stat); 4129 } 4130 } 4131 4132 void 4133 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4134 { 4135 int i; 4136 4137 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4138 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4139 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4140 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4141 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4142 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4143 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4144 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4145 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4146 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4147 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4148 stat->min_read_latency_ticks != UINT64_MAX ? 4149 stat->min_read_latency_ticks : 0); 4150 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4151 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4152 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4153 stat->min_write_latency_ticks != UINT64_MAX ? 4154 stat->min_write_latency_ticks : 0); 4155 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4156 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4157 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4158 stat->min_unmap_latency_ticks != UINT64_MAX ? 4159 stat->min_unmap_latency_ticks : 0); 4160 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4161 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4162 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4163 stat->min_copy_latency_ticks != UINT64_MAX ? 
4164 stat->min_copy_latency_ticks : 0); 4165 4166 if (stat->io_error != NULL) { 4167 spdk_json_write_named_object_begin(w, "io_error"); 4168 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4169 if (stat->io_error->error_status[i] != 0) { 4170 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4171 stat->io_error->error_status[i]); 4172 } 4173 } 4174 spdk_json_write_object_end(w); 4175 } 4176 } 4177 4178 static void 4179 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4180 { 4181 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4182 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4183 4184 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4185 bdev_abort_all_buf_io(mgmt_ch, ch); 4187 } 4188 4189 static void 4190 bdev_channel_destroy(void *io_device, void *ctx_buf) 4191 { 4192 struct spdk_bdev_channel *ch = ctx_buf; 4193 4194 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4195 spdk_get_thread()); 4196 4197 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 4198 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4199 4200 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 4201 spdk_spin_lock(&ch->bdev->internal.spinlock); 4202 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4203 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4204 4205 bdev_abort_all_queued_io(&ch->queued_resets, ch); 4206 4207 bdev_channel_abort_queued_ios(ch); 4208 4209 if (ch->histogram) { 4210 spdk_histogram_data_free(ch->histogram); 4211 } 4212 4213 bdev_channel_destroy_resource(ch); 4214 } 4215 4216 /* 4217 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4218 * to it. Hence we do not have to call bdev_get_by_name() when using this function.
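 * RB_INSERT() is performed under g_bdev_mgr.spinlock and reports an already-existing node
 * via its return value, so a racing registration of the same name cannot create a duplicate entry.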
4219 */ 4220 static int 4221 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4222 { 4223 struct spdk_bdev_name *tmp; 4224 4225 bdev_name->name = strdup(name); 4226 if (bdev_name->name == NULL) { 4227 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4228 return -ENOMEM; 4229 } 4230 4231 bdev_name->bdev = bdev; 4232 4233 spdk_spin_lock(&g_bdev_mgr.spinlock); 4234 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4235 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4236 4237 if (tmp != NULL) { 4238 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4239 free(bdev_name->name); 4240 return -EEXIST; 4241 } 4242 4243 return 0; 4244 } 4245 4246 static void 4247 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4248 { 4249 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4250 free(bdev_name->name); 4251 } 4252 4253 static void 4254 bdev_name_del(struct spdk_bdev_name *bdev_name) 4255 { 4256 spdk_spin_lock(&g_bdev_mgr.spinlock); 4257 bdev_name_del_unsafe(bdev_name); 4258 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4259 } 4260 4261 int 4262 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4263 { 4264 struct spdk_bdev_alias *tmp; 4265 int ret; 4266 4267 if (alias == NULL) { 4268 SPDK_ERRLOG("Empty alias passed\n"); 4269 return -EINVAL; 4270 } 4271 4272 tmp = calloc(1, sizeof(*tmp)); 4273 if (tmp == NULL) { 4274 SPDK_ERRLOG("Unable to allocate alias\n"); 4275 return -ENOMEM; 4276 } 4277 4278 ret = bdev_name_add(&tmp->alias, bdev, alias); 4279 if (ret != 0) { 4280 free(tmp); 4281 return ret; 4282 } 4283 4284 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4285 4286 return 0; 4287 } 4288 4289 static int 4290 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4291 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4292 { 4293 struct spdk_bdev_alias *tmp; 4294 4295 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4296 if (strcmp(alias, tmp->alias.name) == 0) { 4297 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4298 alias_del_fn(&tmp->alias); 4299 free(tmp); 4300 return 0; 4301 } 4302 } 4303 4304 return -ENOENT; 4305 } 4306 4307 int 4308 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4309 { 4310 int rc; 4311 4312 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4313 if (rc == -ENOENT) { 4314 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4315 } 4316 4317 return rc; 4318 } 4319 4320 void 4321 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4322 { 4323 struct spdk_bdev_alias *p, *tmp; 4324 4325 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4326 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4327 bdev_name_del(&p->alias); 4328 free(p); 4329 } 4330 } 4331 4332 struct spdk_io_channel * 4333 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4334 { 4335 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4336 } 4337 4338 void * 4339 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4340 { 4341 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4342 void *ctx = NULL; 4343 4344 if (bdev->fn_table->get_module_ctx) { 4345 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4346 } 4347 4348 return ctx; 4349 } 4350 4351 const char * 4352 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4353 { 4354 return bdev->module->name; 4355 } 4356 4357 const char * 4358 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4359 { 4360 return bdev->name; 4361 } 4362 4363 const char * 4364 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4365 { 4366 return bdev->product_name; 4367 } 4368 4369 
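/*
 * Illustrative sketch (not part of the upstream code): typical use of the alias helpers
 * above from a bdev module, assuming a bdev that is already registered.
 *
 *	if (spdk_bdev_alias_add(bdev, "my_alias") == 0) {
 *		// "my_alias" now resolves in the global bdev name tree
 *		spdk_bdev_alias_del(bdev, "my_alias");
 *	}
 */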
const struct spdk_bdev_aliases_list * 4370 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4371 { 4372 return &bdev->aliases; 4373 } 4374 4375 uint32_t 4376 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4377 { 4378 return bdev->blocklen; 4379 } 4380 4381 uint32_t 4382 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4383 { 4384 return bdev->write_unit_size; 4385 } 4386 4387 uint64_t 4388 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4389 { 4390 return bdev->blockcnt; 4391 } 4392 4393 const char * 4394 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4395 { 4396 return qos_rpc_type[type]; 4397 } 4398 4399 void 4400 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4401 { 4402 int i; 4403 4404 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4405 4406 spdk_spin_lock(&bdev->internal.spinlock); 4407 if (bdev->internal.qos) { 4408 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4409 if (bdev->internal.qos->rate_limits[i].limit != 4410 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4411 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4412 if (bdev_qos_is_iops_rate_limit(i) == false) { 4413 /* Change from Byte to Megabyte which is user visible. */ 4414 limits[i] = limits[i] / 1024 / 1024; 4415 } 4416 } 4417 } 4418 } 4419 spdk_spin_unlock(&bdev->internal.spinlock); 4420 } 4421 4422 size_t 4423 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4424 { 4425 return 1 << bdev->required_alignment; 4426 } 4427 4428 uint32_t 4429 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4430 { 4431 return bdev->optimal_io_boundary; 4432 } 4433 4434 bool 4435 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4436 { 4437 return bdev->write_cache; 4438 } 4439 4440 const struct spdk_uuid * 4441 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4442 { 4443 return &bdev->uuid; 4444 } 4445 4446 uint16_t 4447 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4448 { 4449 return bdev->acwu; 4450 } 4451 4452 uint32_t 4453 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4454 { 4455 return bdev->md_len; 4456 } 4457 4458 bool 4459 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4460 { 4461 return (bdev->md_len != 0) && bdev->md_interleave; 4462 } 4463 4464 bool 4465 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4466 { 4467 return (bdev->md_len != 0) && !bdev->md_interleave; 4468 } 4469 4470 bool 4471 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4472 { 4473 return bdev->zoned; 4474 } 4475 4476 uint32_t 4477 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4478 { 4479 if (spdk_bdev_is_md_interleaved(bdev)) { 4480 return bdev->blocklen - bdev->md_len; 4481 } else { 4482 return bdev->blocklen; 4483 } 4484 } 4485 4486 uint32_t 4487 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4488 { 4489 return bdev->phys_blocklen; 4490 } 4491 4492 static uint32_t 4493 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4494 { 4495 if (!spdk_bdev_is_md_interleaved(bdev)) { 4496 return bdev->blocklen + bdev->md_len; 4497 } else { 4498 return bdev->blocklen; 4499 } 4500 } 4501 4502 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4503 typedef enum spdk_dif_type spdk_dif_type_t; 4504 4505 spdk_dif_type_t 4506 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4507 { 4508 if (bdev->md_len != 0) { 4509 return bdev->dif_type; 4510 } else { 4511 return SPDK_DIF_DISABLE; 4512 } 4513 } 4514 4515 bool 4516 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4517 { 4518 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4519 return bdev->dif_is_head_of_md; 4520 } else { 4521 return false; 4522 } 4523 } 4524 4525 bool 4526 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4527 enum spdk_dif_check_type check_type) 4528 { 4529 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4530 return false; 4531 } 4532 4533 switch (check_type) { 4534 case SPDK_DIF_CHECK_TYPE_REFTAG: 4535 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4536 case SPDK_DIF_CHECK_TYPE_APPTAG: 4537 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4538 case SPDK_DIF_CHECK_TYPE_GUARD: 4539 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4540 default: 4541 return false; 4542 } 4543 } 4544 4545 uint32_t 4546 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4547 { 4548 uint64_t aligned_length; 4549 uint64_t max_copy_blocks; 4550 uint64_t temp_max_copy_blocks; 4551 struct spdk_iobuf_opts opts; 4552 4553 if (spdk_bdev_io_type_supported((struct spdk_bdev *)bdev, SPDK_BDEV_IO_TYPE_COPY)) { 4554 return bdev->max_copy; 4555 } else { 4556 spdk_iobuf_get_opts(&opts); 4557 aligned_length = opts.large_bufsize - spdk_bdev_get_buf_align(bdev); 4558 temp_max_copy_blocks = spdk_bdev_is_md_separate(bdev) ? 4559 aligned_length / (bdev->blocklen + bdev->md_len) : 4560 aligned_length / bdev->blocklen; 4561 max_copy_blocks = 1 << spdk_u64log2(temp_max_copy_blocks); 4562 return max_copy_blocks; 4563 } 4564 } 4565 4566 uint64_t 4567 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4568 { 4569 return bdev->internal.measured_queue_depth; 4570 } 4571 4572 uint64_t 4573 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 4574 { 4575 return bdev->internal.period; 4576 } 4577 4578 uint64_t 4579 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 4580 { 4581 return bdev->internal.weighted_io_time; 4582 } 4583 4584 uint64_t 4585 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 4586 { 4587 return bdev->internal.io_time; 4588 } 4589 4590 static void bdev_update_qd_sampling_period(void *ctx); 4591 4592 static void 4593 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 4594 { 4595 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 4596 4597 if (bdev->internal.measured_queue_depth) { 4598 bdev->internal.io_time += bdev->internal.period; 4599 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 4600 } 4601 4602 bdev->internal.qd_poll_in_progress = false; 4603 4604 bdev_update_qd_sampling_period(bdev); 4605 } 4606 4607 static void 4608 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4609 struct spdk_io_channel *io_ch, void *_ctx) 4610 { 4611 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 4612 4613 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4614 spdk_bdev_for_each_channel_continue(i, 0); 4615 } 4616 4617 static int 4618 bdev_calculate_measured_queue_depth(void *ctx) 4619 { 4620 struct spdk_bdev *bdev = ctx; 4621 4622 bdev->internal.qd_poll_in_progress = true; 4623 bdev->internal.temporary_queue_depth = 0; 4624
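/* Sum io_outstanding across every channel; _calculate_measured_qd_cpl() then publishes the total as measured_queue_depth. */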
spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 4625 return SPDK_POLLER_BUSY; 4626 } 4627 4628 static void 4629 bdev_update_qd_sampling_period(void *ctx) 4630 { 4631 struct spdk_bdev *bdev = ctx; 4632 4633 if (bdev->internal.period == bdev->internal.new_period) { 4634 return; 4635 } 4636 4637 if (bdev->internal.qd_poll_in_progress) { 4638 return; 4639 } 4640 4641 bdev->internal.period = bdev->internal.new_period; 4642 4643 spdk_poller_unregister(&bdev->internal.qd_poller); 4644 if (bdev->internal.period != 0) { 4645 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4646 bdev, bdev->internal.period); 4647 } else { 4648 spdk_bdev_close(bdev->internal.qd_desc); 4649 bdev->internal.qd_desc = NULL; 4650 } 4651 } 4652 4653 static void 4654 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4655 { 4656 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4657 } 4658 4659 void 4660 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4661 { 4662 int rc; 4663 4664 if (bdev->internal.new_period == period) { 4665 return; 4666 } 4667 4668 bdev->internal.new_period = period; 4669 4670 if (bdev->internal.qd_desc != NULL) { 4671 assert(bdev->internal.period != 0); 4672 4673 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4674 bdev_update_qd_sampling_period, bdev); 4675 return; 4676 } 4677 4678 assert(bdev->internal.period == 0); 4679 4680 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4681 NULL, &bdev->internal.qd_desc); 4682 if (rc != 0) { 4683 return; 4684 } 4685 4686 bdev->internal.period = period; 4687 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4688 bdev, period); 4689 } 4690 4691 struct bdev_get_current_qd_ctx { 4692 uint64_t current_qd; 4693 spdk_bdev_get_current_qd_cb cb_fn; 4694 void *cb_arg; 4695 }; 4696 4697 static void 4698 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 4699 { 4700 struct bdev_get_current_qd_ctx *ctx = _ctx; 4701 4702 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 4703 4704 free(ctx); 4705 } 4706 4707 static void 4708 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4709 struct spdk_io_channel *io_ch, void *_ctx) 4710 { 4711 struct bdev_get_current_qd_ctx *ctx = _ctx; 4712 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4713 4714 ctx->current_qd += bdev_ch->io_outstanding; 4715 4716 spdk_bdev_for_each_channel_continue(i, 0); 4717 } 4718 4719 void 4720 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 4721 void *cb_arg) 4722 { 4723 struct bdev_get_current_qd_ctx *ctx; 4724 4725 assert(cb_fn != NULL); 4726 4727 ctx = calloc(1, sizeof(*ctx)); 4728 if (ctx == NULL) { 4729 cb_fn(bdev, 0, cb_arg, -ENOMEM); 4730 return; 4731 } 4732 4733 ctx->cb_fn = cb_fn; 4734 ctx->cb_arg = cb_arg; 4735 4736 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 4737 } 4738 4739 static void 4740 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 4741 { 4742 assert(desc->thread == spdk_get_thread()); 4743 4744 spdk_spin_lock(&desc->spinlock); 4745 desc->refs--; 4746 if (!desc->closed) { 4747 spdk_spin_unlock(&desc->spinlock); 4748 desc->callback.event_fn(type, 4749 desc->bdev, 4750 desc->callback.ctx); 4751 return; 4752 } else if (desc->refs == 0) { 4753 /* This descriptor was closed after this event_notify message was sent. 
4754 * spdk_bdev_close() could not free the descriptor since this message was 4755 * in flight, so we free it now using bdev_desc_free(). 4756 */ 4757 spdk_spin_unlock(&desc->spinlock); 4758 bdev_desc_free(desc); 4759 return; 4760 } 4761 spdk_spin_unlock(&desc->spinlock); 4762 } 4763 4764 static void 4765 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 4766 { 4767 spdk_spin_lock(&desc->spinlock); 4768 desc->refs++; 4769 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 4770 spdk_spin_unlock(&desc->spinlock); 4771 } 4772 4773 static void 4774 _resize_notify(void *ctx) 4775 { 4776 struct spdk_bdev_desc *desc = ctx; 4777 4778 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 4779 } 4780 4781 int 4782 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 4783 { 4784 struct spdk_bdev_desc *desc; 4785 int ret; 4786 4787 if (size == bdev->blockcnt) { 4788 return 0; 4789 } 4790 4791 spdk_spin_lock(&bdev->internal.spinlock); 4792 4793 /* bdev has open descriptors */ 4794 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 4795 bdev->blockcnt > size) { 4796 ret = -EBUSY; 4797 } else { 4798 bdev->blockcnt = size; 4799 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 4800 event_notify(desc, _resize_notify); 4801 } 4802 ret = 0; 4803 } 4804 4805 spdk_spin_unlock(&bdev->internal.spinlock); 4806 4807 return ret; 4808 } 4809 4810 /* 4811 * Convert I/O offset and length from bytes to blocks. 4812 * 4813 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 4814 */ 4815 static uint64_t 4816 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 4817 uint64_t num_bytes, uint64_t *num_blocks) 4818 { 4819 uint32_t block_size = bdev->blocklen; 4820 uint8_t shift_cnt; 4821 4822 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
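 * For example, with a 4096-byte block size shift_cnt is 12, so offset_bytes 8192 maps to
 * block 2, and any remainder in either value makes the OR-ed return value non-zero.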
*/ 4823 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 4824 shift_cnt = spdk_u32log2(block_size); 4825 *offset_blocks = offset_bytes >> shift_cnt; 4826 *num_blocks = num_bytes >> shift_cnt; 4827 return (offset_bytes - (*offset_blocks << shift_cnt)) | 4828 (num_bytes - (*num_blocks << shift_cnt)); 4829 } else { 4830 *offset_blocks = offset_bytes / block_size; 4831 *num_blocks = num_bytes / block_size; 4832 return (offset_bytes % block_size) | (num_bytes % block_size); 4833 } 4834 } 4835 4836 static bool 4837 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 4838 { 4839 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 4840 * has been an overflow and hence the offset has been wrapped around */ 4841 if (offset_blocks + num_blocks < offset_blocks) { 4842 return false; 4843 } 4844 4845 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 4846 if (offset_blocks + num_blocks > bdev->blockcnt) { 4847 return false; 4848 } 4849 4850 return true; 4851 } 4852 4853 static void 4854 bdev_seek_complete_cb(void *ctx) 4855 { 4856 struct spdk_bdev_io *bdev_io = ctx; 4857 4858 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4859 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 4860 } 4861 4862 static int 4863 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4864 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 4865 spdk_bdev_io_completion_cb cb, void *cb_arg) 4866 { 4867 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4868 struct spdk_bdev_io *bdev_io; 4869 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4870 4871 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 4872 4873 /* Check if offset_blocks is valid looking at the validity of one block */ 4874 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 4875 return -EINVAL; 4876 } 4877 4878 bdev_io = bdev_channel_get_io(channel); 4879 if (!bdev_io) { 4880 return -ENOMEM; 4881 } 4882 4883 bdev_io->internal.ch = channel; 4884 bdev_io->internal.desc = desc; 4885 bdev_io->type = io_type; 4886 bdev_io->u.bdev.offset_blocks = offset_blocks; 4887 bdev_io->u.bdev.memory_domain = NULL; 4888 bdev_io->u.bdev.memory_domain_ctx = NULL; 4889 bdev_io->u.bdev.accel_sequence = NULL; 4890 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4891 4892 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 4893 /* In case bdev doesn't support seek to next data/hole offset, 4894 * it is assumed that only data and no holes are present */ 4895 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 4896 bdev_io->u.bdev.seek.offset = offset_blocks; 4897 } else { 4898 bdev_io->u.bdev.seek.offset = UINT64_MAX; 4899 } 4900 4901 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 4902 return 0; 4903 } 4904 4905 bdev_io_submit(bdev_io); 4906 return 0; 4907 } 4908 4909 int 4910 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4911 uint64_t offset_blocks, 4912 spdk_bdev_io_completion_cb cb, void *cb_arg) 4913 { 4914 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 4915 } 4916 4917 int 4918 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4919 uint64_t offset_blocks, 4920 spdk_bdev_io_completion_cb cb, void *cb_arg) 4921 { 4922 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 4923 } 4924 4925 uint64_t 4926 spdk_bdev_io_get_seek_offset(const struct 
spdk_bdev_io *bdev_io) 4927 { 4928 return bdev_io->u.bdev.seek.offset; 4929 } 4930 4931 static int 4932 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 4933 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4934 spdk_bdev_io_completion_cb cb, void *cb_arg) 4935 { 4936 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4937 struct spdk_bdev_io *bdev_io; 4938 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4939 4940 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4941 return -EINVAL; 4942 } 4943 4944 bdev_io = bdev_channel_get_io(channel); 4945 if (!bdev_io) { 4946 return -ENOMEM; 4947 } 4948 4949 bdev_io->internal.ch = channel; 4950 bdev_io->internal.desc = desc; 4951 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4952 bdev_io->u.bdev.iovs = &bdev_io->iov; 4953 bdev_io->u.bdev.iovs[0].iov_base = buf; 4954 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4955 bdev_io->u.bdev.iovcnt = 1; 4956 bdev_io->u.bdev.md_buf = md_buf; 4957 bdev_io->u.bdev.num_blocks = num_blocks; 4958 bdev_io->u.bdev.offset_blocks = offset_blocks; 4959 bdev_io->u.bdev.memory_domain = NULL; 4960 bdev_io->u.bdev.memory_domain_ctx = NULL; 4961 bdev_io->u.bdev.accel_sequence = NULL; 4962 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4963 4964 bdev_io_submit(bdev_io); 4965 return 0; 4966 } 4967 4968 int 4969 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4970 void *buf, uint64_t offset, uint64_t nbytes, 4971 spdk_bdev_io_completion_cb cb, void *cb_arg) 4972 { 4973 uint64_t offset_blocks, num_blocks; 4974 4975 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4976 nbytes, &num_blocks) != 0) { 4977 return -EINVAL; 4978 } 4979 4980 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4981 } 4982 4983 int 4984 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4985 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4986 spdk_bdev_io_completion_cb cb, void *cb_arg) 4987 { 4988 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 4989 } 4990 4991 int 4992 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4993 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4994 spdk_bdev_io_completion_cb cb, void *cb_arg) 4995 { 4996 struct iovec iov = { 4997 .iov_base = buf, 4998 }; 4999 5000 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5001 return -EINVAL; 5002 } 5003 5004 if (md_buf && !_is_buf_allocated(&iov)) { 5005 return -EINVAL; 5006 } 5007 5008 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5009 cb, cb_arg); 5010 } 5011 5012 int 5013 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5014 struct iovec *iov, int iovcnt, 5015 uint64_t offset, uint64_t nbytes, 5016 spdk_bdev_io_completion_cb cb, void *cb_arg) 5017 { 5018 uint64_t offset_blocks, num_blocks; 5019 5020 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5021 nbytes, &num_blocks) != 0) { 5022 return -EINVAL; 5023 } 5024 5025 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5026 } 5027 5028 static int 5029 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5030 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 5031 uint64_t num_blocks, struct spdk_memory_domain *domain, void 
*domain_ctx, 5032 struct spdk_accel_sequence *seq, 5033 spdk_bdev_io_completion_cb cb, void *cb_arg) 5034 { 5035 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5036 struct spdk_bdev_io *bdev_io; 5037 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5038 5039 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5040 return -EINVAL; 5041 } 5042 5043 bdev_io = bdev_channel_get_io(channel); 5044 if (!bdev_io) { 5045 return -ENOMEM; 5046 } 5047 5048 bdev_io->internal.ch = channel; 5049 bdev_io->internal.desc = desc; 5050 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5051 bdev_io->u.bdev.iovs = iov; 5052 bdev_io->u.bdev.iovcnt = iovcnt; 5053 bdev_io->u.bdev.md_buf = md_buf; 5054 bdev_io->u.bdev.num_blocks = num_blocks; 5055 bdev_io->u.bdev.offset_blocks = offset_blocks; 5056 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5057 bdev_io->internal.memory_domain = domain; 5058 bdev_io->internal.memory_domain_ctx = domain_ctx; 5059 bdev_io->internal.accel_sequence = seq; 5060 bdev_io->u.bdev.memory_domain = domain; 5061 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5062 bdev_io->u.bdev.accel_sequence = seq; 5063 5064 _bdev_io_submit_ext(desc, bdev_io); 5065 5066 return 0; 5067 } 5068 5069 int 5070 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5071 struct iovec *iov, int iovcnt, 5072 uint64_t offset_blocks, uint64_t num_blocks, 5073 spdk_bdev_io_completion_cb cb, void *cb_arg) 5074 { 5075 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5076 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5077 } 5078 5079 int 5080 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5081 struct iovec *iov, int iovcnt, void *md_buf, 5082 uint64_t offset_blocks, uint64_t num_blocks, 5083 spdk_bdev_io_completion_cb cb, void *cb_arg) 5084 { 5085 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5086 return -EINVAL; 5087 } 5088 5089 if (md_buf && !_is_buf_allocated(iov)) { 5090 return -EINVAL; 5091 } 5092 5093 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5094 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5095 } 5096 5097 static inline bool 5098 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 5099 { 5100 /* 5101 * We check if opts size is at least of size when we first introduced 5102 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 5103 * are not checked internal. 
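 * In other words, any size between that original layout and the current struct is accepted,
 * so older callers keep working and newer fields are only consulted when the caller's
 * struct is large enough to contain them.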
5104 */ 5105 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5106 sizeof(opts->metadata) && 5107 opts->size <= sizeof(*opts) && 5108 /* When memory domain is used, the user must provide data buffers */ 5109 (!opts->memory_domain || (iov && iov[0].iov_base)); 5110 } 5111 5112 int 5113 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5114 struct iovec *iov, int iovcnt, 5115 uint64_t offset_blocks, uint64_t num_blocks, 5116 spdk_bdev_io_completion_cb cb, void *cb_arg, 5117 struct spdk_bdev_ext_io_opts *opts) 5118 { 5119 void *md = NULL; 5120 5121 if (opts) { 5122 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5123 return -EINVAL; 5124 } 5125 md = opts->metadata; 5126 } 5127 5128 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5129 return -EINVAL; 5130 } 5131 5132 if (md && !_is_buf_allocated(iov)) { 5133 return -EINVAL; 5134 } 5135 5136 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5137 num_blocks, 5138 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5139 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5140 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5141 cb, cb_arg); 5142 } 5143 5144 static int 5145 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5146 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5147 spdk_bdev_io_completion_cb cb, void *cb_arg) 5148 { 5149 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5150 struct spdk_bdev_io *bdev_io; 5151 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5152 5153 if (!desc->write) { 5154 return -EBADF; 5155 } 5156 5157 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5158 return -EINVAL; 5159 } 5160 5161 bdev_io = bdev_channel_get_io(channel); 5162 if (!bdev_io) { 5163 return -ENOMEM; 5164 } 5165 5166 bdev_io->internal.ch = channel; 5167 bdev_io->internal.desc = desc; 5168 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5169 bdev_io->u.bdev.iovs = &bdev_io->iov; 5170 bdev_io->u.bdev.iovs[0].iov_base = buf; 5171 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5172 bdev_io->u.bdev.iovcnt = 1; 5173 bdev_io->u.bdev.md_buf = md_buf; 5174 bdev_io->u.bdev.num_blocks = num_blocks; 5175 bdev_io->u.bdev.offset_blocks = offset_blocks; 5176 bdev_io->u.bdev.memory_domain = NULL; 5177 bdev_io->u.bdev.memory_domain_ctx = NULL; 5178 bdev_io->u.bdev.accel_sequence = NULL; 5179 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5180 5181 bdev_io_submit(bdev_io); 5182 return 0; 5183 } 5184 5185 int 5186 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5187 void *buf, uint64_t offset, uint64_t nbytes, 5188 spdk_bdev_io_completion_cb cb, void *cb_arg) 5189 { 5190 uint64_t offset_blocks, num_blocks; 5191 5192 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5193 nbytes, &num_blocks) != 0) { 5194 return -EINVAL; 5195 } 5196 5197 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5198 } 5199 5200 int 5201 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5202 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5203 spdk_bdev_io_completion_cb cb, void *cb_arg) 5204 { 5205 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5206 cb, cb_arg); 5207 } 5208 5209 int 5210 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5211 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t 
num_blocks, 5212 spdk_bdev_io_completion_cb cb, void *cb_arg) 5213 { 5214 struct iovec iov = { 5215 .iov_base = buf, 5216 }; 5217 5218 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5219 return -EINVAL; 5220 } 5221 5222 if (md_buf && !_is_buf_allocated(&iov)) { 5223 return -EINVAL; 5224 } 5225 5226 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5227 cb, cb_arg); 5228 } 5229 5230 static int 5231 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5232 struct iovec *iov, int iovcnt, void *md_buf, 5233 uint64_t offset_blocks, uint64_t num_blocks, 5234 struct spdk_memory_domain *domain, void *domain_ctx, 5235 struct spdk_accel_sequence *seq, 5236 spdk_bdev_io_completion_cb cb, void *cb_arg) 5237 { 5238 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5239 struct spdk_bdev_io *bdev_io; 5240 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5241 5242 if (!desc->write) { 5243 return -EBADF; 5244 } 5245 5246 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5247 return -EINVAL; 5248 } 5249 5250 bdev_io = bdev_channel_get_io(channel); 5251 if (!bdev_io) { 5252 return -ENOMEM; 5253 } 5254 5255 bdev_io->internal.ch = channel; 5256 bdev_io->internal.desc = desc; 5257 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5258 bdev_io->u.bdev.iovs = iov; 5259 bdev_io->u.bdev.iovcnt = iovcnt; 5260 bdev_io->u.bdev.md_buf = md_buf; 5261 bdev_io->u.bdev.num_blocks = num_blocks; 5262 bdev_io->u.bdev.offset_blocks = offset_blocks; 5263 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5264 bdev_io->internal.memory_domain = domain; 5265 bdev_io->internal.memory_domain_ctx = domain_ctx; 5266 bdev_io->internal.accel_sequence = seq; 5267 bdev_io->u.bdev.memory_domain = domain; 5268 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5269 bdev_io->u.bdev.accel_sequence = seq; 5270 5271 _bdev_io_submit_ext(desc, bdev_io); 5272 5273 return 0; 5274 } 5275 5276 int 5277 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5278 struct iovec *iov, int iovcnt, 5279 uint64_t offset, uint64_t len, 5280 spdk_bdev_io_completion_cb cb, void *cb_arg) 5281 { 5282 uint64_t offset_blocks, num_blocks; 5283 5284 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5285 len, &num_blocks) != 0) { 5286 return -EINVAL; 5287 } 5288 5289 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5290 } 5291 5292 int 5293 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5294 struct iovec *iov, int iovcnt, 5295 uint64_t offset_blocks, uint64_t num_blocks, 5296 spdk_bdev_io_completion_cb cb, void *cb_arg) 5297 { 5298 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5299 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5300 } 5301 5302 int 5303 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5304 struct iovec *iov, int iovcnt, void *md_buf, 5305 uint64_t offset_blocks, uint64_t num_blocks, 5306 spdk_bdev_io_completion_cb cb, void *cb_arg) 5307 { 5308 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5309 return -EINVAL; 5310 } 5311 5312 if (md_buf && !_is_buf_allocated(iov)) { 5313 return -EINVAL; 5314 } 5315 5316 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5317 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5318 } 5319 5320 int 5321 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel 
*ch, 5322 struct iovec *iov, int iovcnt, 5323 uint64_t offset_blocks, uint64_t num_blocks, 5324 spdk_bdev_io_completion_cb cb, void *cb_arg, 5325 struct spdk_bdev_ext_io_opts *opts) 5326 { 5327 void *md = NULL; 5328 5329 if (opts) { 5330 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5331 return -EINVAL; 5332 } 5333 md = opts->metadata; 5334 } 5335 5336 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5337 return -EINVAL; 5338 } 5339 5340 if (md && !_is_buf_allocated(iov)) { 5341 return -EINVAL; 5342 } 5343 5344 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5345 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5346 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5347 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5348 cb, cb_arg); 5349 } 5350 5351 static void 5352 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5353 { 5354 struct spdk_bdev_io *parent_io = cb_arg; 5355 struct spdk_bdev *bdev = parent_io->bdev; 5356 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5357 int i, rc = 0; 5358 5359 if (!success) { 5360 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5361 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5362 spdk_bdev_free_io(bdev_io); 5363 return; 5364 } 5365 5366 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5367 rc = memcmp(read_buf, 5368 parent_io->u.bdev.iovs[i].iov_base, 5369 parent_io->u.bdev.iovs[i].iov_len); 5370 if (rc) { 5371 break; 5372 } 5373 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5374 } 5375 5376 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 5377 rc = memcmp(bdev_io->u.bdev.md_buf, 5378 parent_io->u.bdev.md_buf, 5379 spdk_bdev_get_md_size(bdev)); 5380 } 5381 5382 spdk_bdev_free_io(bdev_io); 5383 5384 if (rc == 0) { 5385 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5386 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5387 } else { 5388 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5389 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5390 } 5391 } 5392 5393 static void 5394 bdev_compare_do_read(void *_bdev_io) 5395 { 5396 struct spdk_bdev_io *bdev_io = _bdev_io; 5397 int rc; 5398 5399 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5400 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5401 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5402 bdev_compare_do_read_done, bdev_io); 5403 5404 if (rc == -ENOMEM) { 5405 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5406 } else if (rc != 0) { 5407 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5408 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5409 } 5410 } 5411 5412 static int 5413 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5414 struct iovec *iov, int iovcnt, void *md_buf, 5415 uint64_t offset_blocks, uint64_t num_blocks, 5416 spdk_bdev_io_completion_cb cb, void *cb_arg) 5417 { 5418 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5419 struct spdk_bdev_io *bdev_io; 5420 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5421 5422 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5423 return -EINVAL; 5424 } 5425 5426 bdev_io = bdev_channel_get_io(channel); 5427 if (!bdev_io) { 5428 return -ENOMEM; 5429 } 5430 5431 bdev_io->internal.ch = channel; 5432 bdev_io->internal.desc = desc; 5433 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 
5434 bdev_io->u.bdev.iovs = iov; 5435 bdev_io->u.bdev.iovcnt = iovcnt; 5436 bdev_io->u.bdev.md_buf = md_buf; 5437 bdev_io->u.bdev.num_blocks = num_blocks; 5438 bdev_io->u.bdev.offset_blocks = offset_blocks; 5439 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5440 bdev_io->u.bdev.memory_domain = NULL; 5441 bdev_io->u.bdev.memory_domain_ctx = NULL; 5442 bdev_io->u.bdev.accel_sequence = NULL; 5443 5444 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5445 bdev_io_submit(bdev_io); 5446 return 0; 5447 } 5448 5449 bdev_compare_do_read(bdev_io); 5450 5451 return 0; 5452 } 5453 5454 int 5455 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5456 struct iovec *iov, int iovcnt, 5457 uint64_t offset_blocks, uint64_t num_blocks, 5458 spdk_bdev_io_completion_cb cb, void *cb_arg) 5459 { 5460 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5461 num_blocks, cb, cb_arg); 5462 } 5463 5464 int 5465 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5466 struct iovec *iov, int iovcnt, void *md_buf, 5467 uint64_t offset_blocks, uint64_t num_blocks, 5468 spdk_bdev_io_completion_cb cb, void *cb_arg) 5469 { 5470 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5471 return -EINVAL; 5472 } 5473 5474 if (md_buf && !_is_buf_allocated(iov)) { 5475 return -EINVAL; 5476 } 5477 5478 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5479 num_blocks, cb, cb_arg); 5480 } 5481 5482 static int 5483 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5484 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5485 spdk_bdev_io_completion_cb cb, void *cb_arg) 5486 { 5487 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5488 struct spdk_bdev_io *bdev_io; 5489 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5490 5491 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5492 return -EINVAL; 5493 } 5494 5495 bdev_io = bdev_channel_get_io(channel); 5496 if (!bdev_io) { 5497 return -ENOMEM; 5498 } 5499 5500 bdev_io->internal.ch = channel; 5501 bdev_io->internal.desc = desc; 5502 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5503 bdev_io->u.bdev.iovs = &bdev_io->iov; 5504 bdev_io->u.bdev.iovs[0].iov_base = buf; 5505 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5506 bdev_io->u.bdev.iovcnt = 1; 5507 bdev_io->u.bdev.md_buf = md_buf; 5508 bdev_io->u.bdev.num_blocks = num_blocks; 5509 bdev_io->u.bdev.offset_blocks = offset_blocks; 5510 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5511 bdev_io->u.bdev.memory_domain = NULL; 5512 bdev_io->u.bdev.memory_domain_ctx = NULL; 5513 bdev_io->u.bdev.accel_sequence = NULL; 5514 5515 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5516 bdev_io_submit(bdev_io); 5517 return 0; 5518 } 5519 5520 bdev_compare_do_read(bdev_io); 5521 5522 return 0; 5523 } 5524 5525 int 5526 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5527 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5528 spdk_bdev_io_completion_cb cb, void *cb_arg) 5529 { 5530 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5531 cb, cb_arg); 5532 } 5533 5534 int 5535 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5536 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5537 spdk_bdev_io_completion_cb cb, void *cb_arg) 5538 { 5539 struct iovec iov = { 
5540 .iov_base = buf, 5541 }; 5542 5543 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5544 return -EINVAL; 5545 } 5546 5547 if (md_buf && !_is_buf_allocated(&iov)) { 5548 return -EINVAL; 5549 } 5550 5551 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5552 cb, cb_arg); 5553 } 5554 5555 static void 5556 bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) 5557 { 5558 struct spdk_bdev_io *bdev_io = ctx; 5559 5560 if (unlock_status) { 5561 SPDK_ERRLOG("LBA range unlock failed\n"); 5562 } 5563 5564 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 5565 false, bdev_io->internal.caller_ctx); 5566 } 5567 5568 static void 5569 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 5570 { 5571 bdev_io->internal.status = status; 5572 5573 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 5574 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5575 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 5576 } 5577 5578 static void 5579 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5580 { 5581 struct spdk_bdev_io *parent_io = cb_arg; 5582 5583 if (!success) { 5584 SPDK_ERRLOG("Compare and write operation failed\n"); 5585 } 5586 5587 spdk_bdev_free_io(bdev_io); 5588 5589 bdev_comparev_and_writev_blocks_unlock(parent_io, 5590 success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 5591 } 5592 5593 static void 5594 bdev_compare_and_write_do_write(void *_bdev_io) 5595 { 5596 struct spdk_bdev_io *bdev_io = _bdev_io; 5597 int rc; 5598 5599 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 5600 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5601 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 5602 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5603 bdev_compare_and_write_do_write_done, bdev_io); 5604 5605 5606 if (rc == -ENOMEM) { 5607 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 5608 } else if (rc != 0) { 5609 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5610 } 5611 } 5612 5613 static void 5614 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5615 { 5616 struct spdk_bdev_io *parent_io = cb_arg; 5617 5618 spdk_bdev_free_io(bdev_io); 5619 5620 if (!success) { 5621 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 5622 return; 5623 } 5624 5625 bdev_compare_and_write_do_write(parent_io); 5626 } 5627 5628 static void 5629 bdev_compare_and_write_do_compare(void *_bdev_io) 5630 { 5631 struct spdk_bdev_io *bdev_io = _bdev_io; 5632 int rc; 5633 5634 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 5635 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 5636 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5637 bdev_compare_and_write_do_compare_done, bdev_io); 5638 5639 if (rc == -ENOMEM) { 5640 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 5641 } else if (rc != 0) { 5642 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 5643 } 5644 } 5645 5646 static void 5647 bdev_comparev_and_writev_blocks_locked(void *ctx, int status) 5648 { 5649 struct spdk_bdev_io *bdev_io = ctx; 5650 5651 if (status) { 5652 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 5653 
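	/* The LBA range could not be locked, so neither the compare nor the write phase
	 * was started; complete the parent I/O directly with the fused-failure status. */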
bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5654 return; 5655 } 5656 5657 bdev_compare_and_write_do_compare(bdev_io); 5658 } 5659 5660 int 5661 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5662 struct iovec *compare_iov, int compare_iovcnt, 5663 struct iovec *write_iov, int write_iovcnt, 5664 uint64_t offset_blocks, uint64_t num_blocks, 5665 spdk_bdev_io_completion_cb cb, void *cb_arg) 5666 { 5667 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5668 struct spdk_bdev_io *bdev_io; 5669 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5670 5671 if (!desc->write) { 5672 return -EBADF; 5673 } 5674 5675 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5676 return -EINVAL; 5677 } 5678 5679 if (num_blocks > bdev->acwu) { 5680 return -EINVAL; 5681 } 5682 5683 bdev_io = bdev_channel_get_io(channel); 5684 if (!bdev_io) { 5685 return -ENOMEM; 5686 } 5687 5688 bdev_io->internal.ch = channel; 5689 bdev_io->internal.desc = desc; 5690 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 5691 bdev_io->u.bdev.iovs = compare_iov; 5692 bdev_io->u.bdev.iovcnt = compare_iovcnt; 5693 bdev_io->u.bdev.fused_iovs = write_iov; 5694 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 5695 bdev_io->u.bdev.md_buf = NULL; 5696 bdev_io->u.bdev.num_blocks = num_blocks; 5697 bdev_io->u.bdev.offset_blocks = offset_blocks; 5698 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5699 bdev_io->u.bdev.memory_domain = NULL; 5700 bdev_io->u.bdev.memory_domain_ctx = NULL; 5701 bdev_io->u.bdev.accel_sequence = NULL; 5702 5703 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 5704 bdev_io_submit(bdev_io); 5705 return 0; 5706 } 5707 5708 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 5709 bdev_comparev_and_writev_blocks_locked, bdev_io); 5710 } 5711 5712 int 5713 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5714 struct iovec *iov, int iovcnt, 5715 uint64_t offset_blocks, uint64_t num_blocks, 5716 bool populate, 5717 spdk_bdev_io_completion_cb cb, void *cb_arg) 5718 { 5719 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5720 struct spdk_bdev_io *bdev_io; 5721 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5722 5723 if (!desc->write) { 5724 return -EBADF; 5725 } 5726 5727 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5728 return -EINVAL; 5729 } 5730 5731 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 5732 return -ENOTSUP; 5733 } 5734 5735 bdev_io = bdev_channel_get_io(channel); 5736 if (!bdev_io) { 5737 return -ENOMEM; 5738 } 5739 5740 bdev_io->internal.ch = channel; 5741 bdev_io->internal.desc = desc; 5742 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 5743 bdev_io->u.bdev.num_blocks = num_blocks; 5744 bdev_io->u.bdev.offset_blocks = offset_blocks; 5745 bdev_io->u.bdev.iovs = iov; 5746 bdev_io->u.bdev.iovcnt = iovcnt; 5747 bdev_io->u.bdev.md_buf = NULL; 5748 bdev_io->u.bdev.zcopy.populate = populate ? 
1 : 0; 5749 bdev_io->u.bdev.zcopy.commit = 0; 5750 bdev_io->u.bdev.zcopy.start = 1; 5751 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5752 bdev_io->u.bdev.memory_domain = NULL; 5753 bdev_io->u.bdev.memory_domain_ctx = NULL; 5754 bdev_io->u.bdev.accel_sequence = NULL; 5755 5756 bdev_io_submit(bdev_io); 5757 5758 return 0; 5759 } 5760 5761 int 5762 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 5763 spdk_bdev_io_completion_cb cb, void *cb_arg) 5764 { 5765 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 5766 return -EINVAL; 5767 } 5768 5769 bdev_io->u.bdev.zcopy.commit = commit ? 1 : 0; 5770 bdev_io->u.bdev.zcopy.start = 0; 5771 bdev_io->internal.caller_ctx = cb_arg; 5772 bdev_io->internal.cb = cb; 5773 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 5774 5775 bdev_io_submit(bdev_io); 5776 5777 return 0; 5778 } 5779 5780 int 5781 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5782 uint64_t offset, uint64_t len, 5783 spdk_bdev_io_completion_cb cb, void *cb_arg) 5784 { 5785 uint64_t offset_blocks, num_blocks; 5786 5787 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5788 len, &num_blocks) != 0) { 5789 return -EINVAL; 5790 } 5791 5792 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5793 } 5794 5795 int 5796 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5797 uint64_t offset_blocks, uint64_t num_blocks, 5798 spdk_bdev_io_completion_cb cb, void *cb_arg) 5799 { 5800 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5801 struct spdk_bdev_io *bdev_io; 5802 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5803 5804 if (!desc->write) { 5805 return -EBADF; 5806 } 5807 5808 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5809 return -EINVAL; 5810 } 5811 5812 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 5813 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 5814 return -ENOTSUP; 5815 } 5816 5817 bdev_io = bdev_channel_get_io(channel); 5818 5819 if (!bdev_io) { 5820 return -ENOMEM; 5821 } 5822 5823 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 5824 bdev_io->internal.ch = channel; 5825 bdev_io->internal.desc = desc; 5826 bdev_io->u.bdev.offset_blocks = offset_blocks; 5827 bdev_io->u.bdev.num_blocks = num_blocks; 5828 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5829 bdev_io->u.bdev.memory_domain = NULL; 5830 bdev_io->u.bdev.memory_domain_ctx = NULL; 5831 bdev_io->u.bdev.accel_sequence = NULL; 5832 5833 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 5834 bdev_io_submit(bdev_io); 5835 return 0; 5836 } 5837 5838 assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); 5839 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 5840 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 5841 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 5842 bdev_write_zero_buffer_next(bdev_io); 5843 5844 return 0; 5845 } 5846 5847 int 5848 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5849 uint64_t offset, uint64_t nbytes, 5850 spdk_bdev_io_completion_cb cb, void *cb_arg) 5851 { 5852 uint64_t offset_blocks, num_blocks; 5853 5854 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5855 nbytes, &num_blocks) != 0) { 5856 return -EINVAL; 5857 } 5858 5859 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5860 } 5861 5862 int 5863 spdk_bdev_unmap_blocks(struct spdk_bdev_desc 
*desc, struct spdk_io_channel *ch, 5864 uint64_t offset_blocks, uint64_t num_blocks, 5865 spdk_bdev_io_completion_cb cb, void *cb_arg) 5866 { 5867 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5868 struct spdk_bdev_io *bdev_io; 5869 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5870 5871 if (!desc->write) { 5872 return -EBADF; 5873 } 5874 5875 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5876 return -EINVAL; 5877 } 5878 5879 if (num_blocks == 0) { 5880 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 5881 return -EINVAL; 5882 } 5883 5884 bdev_io = bdev_channel_get_io(channel); 5885 if (!bdev_io) { 5886 return -ENOMEM; 5887 } 5888 5889 bdev_io->internal.ch = channel; 5890 bdev_io->internal.desc = desc; 5891 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 5892 5893 bdev_io->u.bdev.iovs = &bdev_io->iov; 5894 bdev_io->u.bdev.iovs[0].iov_base = NULL; 5895 bdev_io->u.bdev.iovs[0].iov_len = 0; 5896 bdev_io->u.bdev.iovcnt = 1; 5897 5898 bdev_io->u.bdev.offset_blocks = offset_blocks; 5899 bdev_io->u.bdev.num_blocks = num_blocks; 5900 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5901 bdev_io->u.bdev.memory_domain = NULL; 5902 bdev_io->u.bdev.memory_domain_ctx = NULL; 5903 bdev_io->u.bdev.accel_sequence = NULL; 5904 5905 bdev_io_submit(bdev_io); 5906 return 0; 5907 } 5908 5909 int 5910 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5911 uint64_t offset, uint64_t length, 5912 spdk_bdev_io_completion_cb cb, void *cb_arg) 5913 { 5914 uint64_t offset_blocks, num_blocks; 5915 5916 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5917 length, &num_blocks) != 0) { 5918 return -EINVAL; 5919 } 5920 5921 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5922 } 5923 5924 int 5925 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5926 uint64_t offset_blocks, uint64_t num_blocks, 5927 spdk_bdev_io_completion_cb cb, void *cb_arg) 5928 { 5929 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5930 struct spdk_bdev_io *bdev_io; 5931 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5932 5933 if (!desc->write) { 5934 return -EBADF; 5935 } 5936 5937 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5938 return -EINVAL; 5939 } 5940 5941 bdev_io = bdev_channel_get_io(channel); 5942 if (!bdev_io) { 5943 return -ENOMEM; 5944 } 5945 5946 bdev_io->internal.ch = channel; 5947 bdev_io->internal.desc = desc; 5948 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 5949 bdev_io->u.bdev.iovs = NULL; 5950 bdev_io->u.bdev.iovcnt = 0; 5951 bdev_io->u.bdev.offset_blocks = offset_blocks; 5952 bdev_io->u.bdev.num_blocks = num_blocks; 5953 bdev_io->u.bdev.memory_domain = NULL; 5954 bdev_io->u.bdev.memory_domain_ctx = NULL; 5955 bdev_io->u.bdev.accel_sequence = NULL; 5956 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5957 5958 bdev_io_submit(bdev_io); 5959 return 0; 5960 } 5961 5962 static int bdev_reset_poll_for_outstanding_io(void *ctx); 5963 5964 static void 5965 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 5966 { 5967 struct spdk_bdev_channel *ch = _ctx; 5968 struct spdk_bdev_io *bdev_io; 5969 5970 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5971 5972 if (status == -EBUSY) { 5973 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 5974 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 5975 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 5976 } else { 5977 TAILQ_REMOVE(&ch->queued_resets, 
bdev_io, internal.link);

			if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) {
				/* If outstanding IOs are still present and reset_io_drain_timeout
				 * seconds passed, start the reset. */
				bdev_io_submit_reset(bdev_io);
			} else {
				/* We still have in progress memory domain pull/push or we're
				 * executing accel sequence. Since we cannot abort either of those
				 * operations, fail the reset request. */
				spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
			}
		}
	} else {
		TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);
		SPDK_DEBUGLOG(bdev,
			      "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n",
			      ch->bdev->name);
		/* Mark the completion status as a SUCCESS and complete the reset. */
		spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS);
	}
}

static void
bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				struct spdk_io_channel *io_ch, void *_ctx)
{
	struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch);
	int status = 0;

	if (cur_ch->io_outstanding > 0 ||
	    !TAILQ_EMPTY(&cur_ch->io_memory_domain) ||
	    !TAILQ_EMPTY(&cur_ch->io_accel_exec)) {
		/* If a channel has outstanding IO, set status to -EBUSY code. This will stop
		 * further iteration over the rest of the channels and pass non-zero status
		 * to the callback function. */
		status = -EBUSY;
	}
	spdk_bdev_for_each_channel_continue(i, status);
}

static int
bdev_reset_poll_for_outstanding_io(void *ctx)
{
	struct spdk_bdev_channel *ch = ctx;
	struct spdk_bdev_io *bdev_io;

	bdev_io = TAILQ_FIRST(&ch->queued_resets);

	spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller);
	spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch,
				   bdev_reset_check_outstanding_io_done);

	return SPDK_POLLER_BUSY;
}

static void
bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct spdk_bdev_channel *ch = _ctx;
	struct spdk_bdev_io *bdev_io;

	bdev_io = TAILQ_FIRST(&ch->queued_resets);

	if (bdev->reset_io_drain_timeout == 0) {
		TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link);

		bdev_io_submit_reset(bdev_io);
		return;
	}

	bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() +
			(ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz());

	/* In case bdev->reset_io_drain_timeout is not equal to zero,
	 * submit the reset to the underlying module only if outstanding I/O
	 * remain after reset_io_drain_timeout seconds have passed.
	 */
	spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch,
				   bdev_reset_check_outstanding_io_done);
}

static void
bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
			  struct spdk_io_channel *ch, void *_ctx)
{
	struct spdk_bdev_channel *channel;
	struct spdk_bdev_mgmt_channel *mgmt_channel;
	struct spdk_bdev_shared_resource *shared_resource;
	bdev_io_tailq_t tmp_queued;

	TAILQ_INIT(&tmp_queued);

	channel = __io_ch_to_bdev_ch(ch);
	shared_resource = channel->shared_resource;
	mgmt_channel = shared_resource->mgmt_ch;

	channel->flags |= BDEV_CH_RESET_IN_PROGRESS;

	if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) {
		/* The QoS object is always valid and readable while
		 * the channel flag is set, so the lock here should not
		 * be necessary. We're not in the fast path though, so
		 * just take it anyway. */
		spdk_spin_lock(&channel->bdev->internal.spinlock);
		if (channel->bdev->internal.qos->ch == channel) {
			TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link);
		}
		spdk_spin_unlock(&channel->bdev->internal.spinlock);
	}

	bdev_abort_all_queued_io(&shared_resource->nomem_io, channel);
	bdev_abort_all_buf_io(mgmt_channel, channel);
	bdev_abort_all_queued_io(&tmp_queued, channel);

	spdk_bdev_for_each_channel_continue(i, 0);
}

static void
bdev_start_reset(void *ctx)
{
	struct spdk_bdev_channel *ch = ctx;

	spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch,
				   bdev_reset_freeze_channel_done);
}

static void
bdev_channel_start_reset(struct spdk_bdev_channel *ch)
{
	struct spdk_bdev *bdev = ch->bdev;

	assert(!TAILQ_EMPTY(&ch->queued_resets));

	spdk_spin_lock(&bdev->internal.spinlock);
	if (bdev->internal.reset_in_progress == NULL) {
		bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets);
		/*
		 * Take a channel reference for the target bdev for the life of this
		 * reset. This guards against the channel getting destroyed while
		 * spdk_bdev_for_each_channel() calls related to this reset IO are in
		 * progress. We will release the reference when this reset is
		 * completed.
6120 */ 6121 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6122 bdev_start_reset(ch); 6123 } 6124 spdk_spin_unlock(&bdev->internal.spinlock); 6125 } 6126 6127 int 6128 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6129 spdk_bdev_io_completion_cb cb, void *cb_arg) 6130 { 6131 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6132 struct spdk_bdev_io *bdev_io; 6133 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6134 6135 bdev_io = bdev_channel_get_io(channel); 6136 if (!bdev_io) { 6137 return -ENOMEM; 6138 } 6139 6140 bdev_io->internal.ch = channel; 6141 bdev_io->internal.desc = desc; 6142 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6143 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6144 bdev_io->u.reset.ch_ref = NULL; 6145 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6146 6147 spdk_spin_lock(&bdev->internal.spinlock); 6148 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 6149 spdk_spin_unlock(&bdev->internal.spinlock); 6150 6151 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 6152 internal.ch_link); 6153 6154 bdev_channel_start_reset(channel); 6155 6156 return 0; 6157 } 6158 6159 void 6160 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6161 struct spdk_bdev_io_stat *stat) 6162 { 6163 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6164 6165 bdev_get_io_stat(stat, channel->stat); 6166 } 6167 6168 static void 6169 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6170 { 6171 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6172 6173 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6174 bdev_iostat_ctx->cb_arg, 0); 6175 free(bdev_iostat_ctx); 6176 } 6177 6178 static void 6179 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6180 struct spdk_io_channel *ch, void *_ctx) 6181 { 6182 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6183 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6184 6185 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6186 spdk_bdev_for_each_channel_continue(i, 0); 6187 } 6188 6189 void 6190 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6191 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6192 { 6193 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6194 6195 assert(bdev != NULL); 6196 assert(stat != NULL); 6197 assert(cb != NULL); 6198 6199 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6200 if (bdev_iostat_ctx == NULL) { 6201 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6202 cb(bdev, stat, cb_arg, -ENOMEM); 6203 return; 6204 } 6205 6206 bdev_iostat_ctx->stat = stat; 6207 bdev_iostat_ctx->cb = cb; 6208 bdev_iostat_ctx->cb_arg = cb_arg; 6209 6210 /* Start with the statistics from previously deleted channels. */ 6211 spdk_spin_lock(&bdev->internal.spinlock); 6212 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6213 spdk_spin_unlock(&bdev->internal.spinlock); 6214 6215 /* Then iterate and add the statistics from each existing channel. 
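	 * Each channel's counters are accumulated on that channel's thread by
	 * bdev_get_each_channel_stat(), and the caller's callback fires from
	 * bdev_get_device_stat_done() once every channel has been visited.
	 *
	 * Illustrative usage (my_stat_done_cb and my_ctx are hypothetical names):
	 *
	 *   spdk_bdev_get_device_stat(bdev, stat, my_stat_done_cb, my_ctx);
	 *
	 * The stat buffer must remain valid until the callback runs.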
*/ 6216 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6217 bdev_get_device_stat_done); 6218 } 6219 6220 struct bdev_iostat_reset_ctx { 6221 enum spdk_bdev_reset_stat_mode mode; 6222 bdev_reset_device_stat_cb cb; 6223 void *cb_arg; 6224 }; 6225 6226 static void 6227 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6228 { 6229 struct bdev_iostat_reset_ctx *ctx = _ctx; 6230 6231 ctx->cb(bdev, ctx->cb_arg, 0); 6232 6233 free(ctx); 6234 } 6235 6236 static void 6237 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6238 struct spdk_io_channel *ch, void *_ctx) 6239 { 6240 struct bdev_iostat_reset_ctx *ctx = _ctx; 6241 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6242 6243 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6244 6245 spdk_bdev_for_each_channel_continue(i, 0); 6246 } 6247 6248 void 6249 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6250 bdev_reset_device_stat_cb cb, void *cb_arg) 6251 { 6252 struct bdev_iostat_reset_ctx *ctx; 6253 6254 assert(bdev != NULL); 6255 assert(cb != NULL); 6256 6257 ctx = calloc(1, sizeof(*ctx)); 6258 if (ctx == NULL) { 6259 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6260 cb(bdev, cb_arg, -ENOMEM); 6261 return; 6262 } 6263 6264 ctx->mode = mode; 6265 ctx->cb = cb; 6266 ctx->cb_arg = cb_arg; 6267 6268 spdk_spin_lock(&bdev->internal.spinlock); 6269 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6270 spdk_spin_unlock(&bdev->internal.spinlock); 6271 6272 spdk_bdev_for_each_channel(bdev, 6273 bdev_reset_each_channel_stat, 6274 ctx, 6275 bdev_reset_device_stat_done); 6276 } 6277 6278 int 6279 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6280 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6281 spdk_bdev_io_completion_cb cb, void *cb_arg) 6282 { 6283 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6284 struct spdk_bdev_io *bdev_io; 6285 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6286 6287 if (!desc->write) { 6288 return -EBADF; 6289 } 6290 6291 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6292 return -ENOTSUP; 6293 } 6294 6295 bdev_io = bdev_channel_get_io(channel); 6296 if (!bdev_io) { 6297 return -ENOMEM; 6298 } 6299 6300 bdev_io->internal.ch = channel; 6301 bdev_io->internal.desc = desc; 6302 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6303 bdev_io->u.nvme_passthru.cmd = *cmd; 6304 bdev_io->u.nvme_passthru.buf = buf; 6305 bdev_io->u.nvme_passthru.nbytes = nbytes; 6306 bdev_io->u.nvme_passthru.md_buf = NULL; 6307 bdev_io->u.nvme_passthru.md_len = 0; 6308 6309 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6310 6311 bdev_io_submit(bdev_io); 6312 return 0; 6313 } 6314 6315 int 6316 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6317 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6318 spdk_bdev_io_completion_cb cb, void *cb_arg) 6319 { 6320 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6321 struct spdk_bdev_io *bdev_io; 6322 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6323 6324 if (!desc->write) { 6325 /* 6326 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6327 * to easily determine if the command is a read or write, but for now just 6328 * do not allow io_passthru with a read-only descriptor. 
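		 * The same restriction is applied by the admin and metadata
		 * pass-through variants as well.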
6329 */ 6330 return -EBADF; 6331 } 6332 6333 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6334 return -ENOTSUP; 6335 } 6336 6337 bdev_io = bdev_channel_get_io(channel); 6338 if (!bdev_io) { 6339 return -ENOMEM; 6340 } 6341 6342 bdev_io->internal.ch = channel; 6343 bdev_io->internal.desc = desc; 6344 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6345 bdev_io->u.nvme_passthru.cmd = *cmd; 6346 bdev_io->u.nvme_passthru.buf = buf; 6347 bdev_io->u.nvme_passthru.nbytes = nbytes; 6348 bdev_io->u.nvme_passthru.md_buf = NULL; 6349 bdev_io->u.nvme_passthru.md_len = 0; 6350 6351 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6352 6353 bdev_io_submit(bdev_io); 6354 return 0; 6355 } 6356 6357 int 6358 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6359 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6360 spdk_bdev_io_completion_cb cb, void *cb_arg) 6361 { 6362 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6363 struct spdk_bdev_io *bdev_io; 6364 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6365 6366 if (!desc->write) { 6367 /* 6368 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6369 * to easily determine if the command is a read or write, but for now just 6370 * do not allow io_passthru with a read-only descriptor. 6371 */ 6372 return -EBADF; 6373 } 6374 6375 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6376 return -ENOTSUP; 6377 } 6378 6379 bdev_io = bdev_channel_get_io(channel); 6380 if (!bdev_io) { 6381 return -ENOMEM; 6382 } 6383 6384 bdev_io->internal.ch = channel; 6385 bdev_io->internal.desc = desc; 6386 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6387 bdev_io->u.nvme_passthru.cmd = *cmd; 6388 bdev_io->u.nvme_passthru.buf = buf; 6389 bdev_io->u.nvme_passthru.nbytes = nbytes; 6390 bdev_io->u.nvme_passthru.md_buf = md_buf; 6391 bdev_io->u.nvme_passthru.md_len = md_len; 6392 6393 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6394 6395 bdev_io_submit(bdev_io); 6396 return 0; 6397 } 6398 6399 static void bdev_abort_retry(void *ctx); 6400 static void bdev_abort(struct spdk_bdev_io *parent_io); 6401 6402 static void 6403 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6404 { 6405 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6406 struct spdk_bdev_io *parent_io = cb_arg; 6407 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6408 6409 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6410 6411 spdk_bdev_free_io(bdev_io); 6412 6413 if (!success) { 6414 /* Check if the target I/O completed in the meantime. */ 6415 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6416 if (tmp_io == bio_to_abort) { 6417 break; 6418 } 6419 } 6420 6421 /* If the target I/O still exists, set the parent to failed. 
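		 * If it is no longer on the io_submitted list, it completed on its own
		 * while the abort was in flight, so the failed abort is not treated as
		 * an error.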
*/ 6422 if (tmp_io != NULL) { 6423 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6424 } 6425 } 6426 6427 parent_io->u.bdev.split_outstanding--; 6428 if (parent_io->u.bdev.split_outstanding == 0) { 6429 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6430 bdev_abort_retry(parent_io); 6431 } else { 6432 bdev_io_complete(parent_io); 6433 } 6434 } 6435 } 6436 6437 static int 6438 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6439 struct spdk_bdev_io *bio_to_abort, 6440 spdk_bdev_io_completion_cb cb, void *cb_arg) 6441 { 6442 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6443 struct spdk_bdev_io *bdev_io; 6444 6445 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 6446 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 6447 /* TODO: Abort reset or abort request. */ 6448 return -ENOTSUP; 6449 } 6450 6451 bdev_io = bdev_channel_get_io(channel); 6452 if (bdev_io == NULL) { 6453 return -ENOMEM; 6454 } 6455 6456 bdev_io->internal.ch = channel; 6457 bdev_io->internal.desc = desc; 6458 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6459 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6460 6461 if (bdev->split_on_optimal_io_boundary && bio_to_abort->internal.split) { 6462 assert(bdev_io_should_split(bio_to_abort)); 6463 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 6464 6465 /* Parent abort request is not submitted directly, but to manage its 6466 * execution add it to the submitted list here. 6467 */ 6468 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6469 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6470 6471 bdev_abort(bdev_io); 6472 6473 return 0; 6474 } 6475 6476 bdev_io->u.abort.bio_to_abort = bio_to_abort; 6477 6478 /* Submit the abort request to the underlying bdev module. */ 6479 bdev_io_submit(bdev_io); 6480 6481 return 0; 6482 } 6483 6484 static bool 6485 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq) 6486 { 6487 struct spdk_bdev_io *iter; 6488 6489 TAILQ_FOREACH(iter, tailq, internal.link) { 6490 if (iter == bdev_io) { 6491 return true; 6492 } 6493 } 6494 6495 return false; 6496 } 6497 6498 static uint32_t 6499 _bdev_abort(struct spdk_bdev_io *parent_io) 6500 { 6501 struct spdk_bdev_desc *desc = parent_io->internal.desc; 6502 struct spdk_bdev_channel *channel = parent_io->internal.ch; 6503 void *bio_cb_arg; 6504 struct spdk_bdev_io *bio_to_abort; 6505 uint32_t matched_ios; 6506 int rc; 6507 6508 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 6509 6510 /* matched_ios is returned and will be kept by the caller. 6511 * 6512 * This function will be used for two cases, 1) the same cb_arg is used for 6513 * multiple I/Os, 2) a single large I/O is split into smaller ones. 6514 * Incrementing split_outstanding directly here may confuse readers especially 6515 * for the 1st case. 6516 * 6517 * Completion of I/O abort is processed after stack unwinding. Hence this trick 6518 * works as expected. 6519 */ 6520 matched_ios = 0; 6521 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6522 6523 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 6524 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 6525 continue; 6526 } 6527 6528 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 6529 /* Any I/O which was submitted after this abort command should be excluded. 
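			 * The submit_tsc comparison in the enclosing condition provides that ordering.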
*/ 6530 continue; 6531 } 6532 6533 /* We can't abort a request that's being pushed/pulled or executed by accel */ 6534 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) || 6535 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) { 6536 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6537 break; 6538 } 6539 6540 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 6541 if (rc != 0) { 6542 if (rc == -ENOMEM) { 6543 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 6544 } else { 6545 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6546 } 6547 break; 6548 } 6549 matched_ios++; 6550 } 6551 6552 return matched_ios; 6553 } 6554 6555 static void 6556 bdev_abort_retry(void *ctx) 6557 { 6558 struct spdk_bdev_io *parent_io = ctx; 6559 uint32_t matched_ios; 6560 6561 matched_ios = _bdev_abort(parent_io); 6562 6563 if (matched_ios == 0) { 6564 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6565 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6566 } else { 6567 /* For retry, the case that no target I/O was found is success 6568 * because it means target I/Os completed in the meantime. 6569 */ 6570 bdev_io_complete(parent_io); 6571 } 6572 return; 6573 } 6574 6575 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6576 parent_io->u.bdev.split_outstanding = matched_ios; 6577 } 6578 6579 static void 6580 bdev_abort(struct spdk_bdev_io *parent_io) 6581 { 6582 uint32_t matched_ios; 6583 6584 matched_ios = _bdev_abort(parent_io); 6585 6586 if (matched_ios == 0) { 6587 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6588 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6589 } else { 6590 /* The case the no target I/O was found is failure. */ 6591 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6592 bdev_io_complete(parent_io); 6593 } 6594 return; 6595 } 6596 6597 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6598 parent_io->u.bdev.split_outstanding = matched_ios; 6599 } 6600 6601 int 6602 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6603 void *bio_cb_arg, 6604 spdk_bdev_io_completion_cb cb, void *cb_arg) 6605 { 6606 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6607 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6608 struct spdk_bdev_io *bdev_io; 6609 6610 if (bio_cb_arg == NULL) { 6611 return -EINVAL; 6612 } 6613 6614 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 6615 return -ENOTSUP; 6616 } 6617 6618 bdev_io = bdev_channel_get_io(channel); 6619 if (bdev_io == NULL) { 6620 return -ENOMEM; 6621 } 6622 6623 bdev_io->internal.ch = channel; 6624 bdev_io->internal.desc = desc; 6625 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6626 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6627 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6628 6629 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 6630 6631 /* Parent abort request is not submitted directly, but to manage its execution, 6632 * add it to the submitted list here. 
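	 * bdev_abort() below fans out one child abort per matching I/O and completes
	 * this parent once every child has finished.
	 *
	 * Illustrative usage (my_io_ctx and my_abort_done_cb are hypothetical names):
	 * a caller that tagged several submissions with the same context pointer can
	 * cancel them all in one call:
	 *
	 *   rc = spdk_bdev_abort(desc, ch, my_io_ctx, my_abort_done_cb, NULL);
	 *
	 * where my_io_ctx was the cb_arg passed to the original read/write calls.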
6633 */ 6634 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6635 6636 bdev_abort(bdev_io); 6637 6638 return 0; 6639 } 6640 6641 int 6642 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6643 struct spdk_bdev_io_wait_entry *entry) 6644 { 6645 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6646 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 6647 6648 if (bdev != entry->bdev) { 6649 SPDK_ERRLOG("bdevs do not match\n"); 6650 return -EINVAL; 6651 } 6652 6653 if (mgmt_ch->per_thread_cache_count > 0) { 6654 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 6655 return -EINVAL; 6656 } 6657 6658 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 6659 return 0; 6660 } 6661 6662 static inline void 6663 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 6664 { 6665 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 6666 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 6667 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 6668 uint32_t blocklen = bdev_io->bdev->blocklen; 6669 6670 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 6671 switch (bdev_io->type) { 6672 case SPDK_BDEV_IO_TYPE_READ: 6673 io_stat->bytes_read += num_blocks * blocklen; 6674 io_stat->num_read_ops++; 6675 io_stat->read_latency_ticks += tsc_diff; 6676 if (io_stat->max_read_latency_ticks < tsc_diff) { 6677 io_stat->max_read_latency_ticks = tsc_diff; 6678 } 6679 if (io_stat->min_read_latency_ticks > tsc_diff) { 6680 io_stat->min_read_latency_ticks = tsc_diff; 6681 } 6682 break; 6683 case SPDK_BDEV_IO_TYPE_WRITE: 6684 io_stat->bytes_written += num_blocks * blocklen; 6685 io_stat->num_write_ops++; 6686 io_stat->write_latency_ticks += tsc_diff; 6687 if (io_stat->max_write_latency_ticks < tsc_diff) { 6688 io_stat->max_write_latency_ticks = tsc_diff; 6689 } 6690 if (io_stat->min_write_latency_ticks > tsc_diff) { 6691 io_stat->min_write_latency_ticks = tsc_diff; 6692 } 6693 break; 6694 case SPDK_BDEV_IO_TYPE_UNMAP: 6695 io_stat->bytes_unmapped += num_blocks * blocklen; 6696 io_stat->num_unmap_ops++; 6697 io_stat->unmap_latency_ticks += tsc_diff; 6698 if (io_stat->max_unmap_latency_ticks < tsc_diff) { 6699 io_stat->max_unmap_latency_ticks = tsc_diff; 6700 } 6701 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 6702 io_stat->min_unmap_latency_ticks = tsc_diff; 6703 } 6704 break; 6705 case SPDK_BDEV_IO_TYPE_ZCOPY: 6706 /* Track the data in the start phase only */ 6707 if (bdev_io->u.bdev.zcopy.start) { 6708 if (bdev_io->u.bdev.zcopy.populate) { 6709 io_stat->bytes_read += num_blocks * blocklen; 6710 io_stat->num_read_ops++; 6711 io_stat->read_latency_ticks += tsc_diff; 6712 if (io_stat->max_read_latency_ticks < tsc_diff) { 6713 io_stat->max_read_latency_ticks = tsc_diff; 6714 } 6715 if (io_stat->min_read_latency_ticks > tsc_diff) { 6716 io_stat->min_read_latency_ticks = tsc_diff; 6717 } 6718 } else { 6719 io_stat->bytes_written += num_blocks * blocklen; 6720 io_stat->num_write_ops++; 6721 io_stat->write_latency_ticks += tsc_diff; 6722 if (io_stat->max_write_latency_ticks < tsc_diff) { 6723 io_stat->max_write_latency_ticks = tsc_diff; 6724 } 6725 if (io_stat->min_write_latency_ticks > tsc_diff) { 6726 io_stat->min_write_latency_ticks = tsc_diff; 6727 } 6728 } 6729 } 6730 break; 6731 case SPDK_BDEV_IO_TYPE_COPY: 6732 io_stat->bytes_copied += num_blocks * blocklen; 6733 io_stat->num_copy_ops++; 6734 bdev_io->internal.ch->stat->copy_latency_ticks += 
tsc_diff; 6735 if (io_stat->max_copy_latency_ticks < tsc_diff) { 6736 io_stat->max_copy_latency_ticks = tsc_diff; 6737 } 6738 if (io_stat->min_copy_latency_ticks > tsc_diff) { 6739 io_stat->min_copy_latency_ticks = tsc_diff; 6740 } 6741 break; 6742 default: 6743 break; 6744 } 6745 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 6746 io_stat = bdev_io->bdev->internal.stat; 6747 assert(io_stat->io_error != NULL); 6748 6749 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 6750 io_stat->io_error->error_status[-io_status - 1]++; 6751 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 6752 } 6753 6754 #ifdef SPDK_CONFIG_VTUNE 6755 uint64_t now_tsc = spdk_get_ticks(); 6756 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 6757 uint64_t data[5]; 6758 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 6759 6760 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 6761 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 6762 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 6763 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 6764 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 6765 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 6766 6767 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 6768 __itt_metadata_u64, 5, data); 6769 6770 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 6771 bdev_io->internal.ch->start_tsc = now_tsc; 6772 } 6773 #endif 6774 } 6775 6776 static inline void 6777 _bdev_io_complete(void *ctx) 6778 { 6779 struct spdk_bdev_io *bdev_io = ctx; 6780 6781 if (spdk_unlikely(bdev_io->internal.accel_sequence != NULL)) { 6782 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 6783 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 6784 } 6785 6786 assert(bdev_io->internal.cb != NULL); 6787 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 6788 6789 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 6790 bdev_io->internal.caller_ctx); 6791 } 6792 6793 static inline void 6794 bdev_io_complete(void *ctx) 6795 { 6796 struct spdk_bdev_io *bdev_io = ctx; 6797 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6798 uint64_t tsc, tsc_diff; 6799 6800 if (spdk_unlikely(bdev_io->internal.in_submit_request)) { 6801 /* 6802 * Defer completion to avoid potential infinite recursion if the 6803 * user's completion callback issues a new I/O. 6804 */ 6805 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6806 bdev_io_complete, bdev_io); 6807 return; 6808 } 6809 6810 tsc = spdk_get_ticks(); 6811 tsc_diff = tsc - bdev_io->internal.submit_tsc; 6812 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 6813 bdev_io->internal.caller_ctx); 6814 6815 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 6816 6817 if (bdev_io->internal.ch->histogram) { 6818 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 6819 } 6820 6821 bdev_io_update_io_stat(bdev_io, tsc_diff); 6822 _bdev_io_complete(bdev_io); 6823 } 6824 6825 /* The difference between this function and bdev_io_complete() is that this should be called to 6826 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 6827 * io_submitted list and don't have submit_tsc updated. 
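 * It is only valid for I/Os that are being failed before reaching the underlying
 * module, which the assert below enforces.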
6828 */ 6829 static inline void 6830 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 6831 { 6832 /* Since the IO hasn't been submitted it's bound to be failed */ 6833 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 6834 6835 /* At this point we don't know if the IO is completed from submission context or not, but, 6836 * since this is an error path, we can always do an spdk_thread_send_msg(). */ 6837 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6838 _bdev_io_complete, bdev_io); 6839 } 6840 6841 static void bdev_destroy_cb(void *io_device); 6842 6843 static void 6844 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 6845 { 6846 struct spdk_bdev_io *bdev_io = _ctx; 6847 6848 if (bdev_io->u.reset.ch_ref != NULL) { 6849 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 6850 bdev_io->u.reset.ch_ref = NULL; 6851 } 6852 6853 bdev_io_complete(bdev_io); 6854 6855 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 6856 TAILQ_EMPTY(&bdev->internal.open_descs)) { 6857 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6858 } 6859 } 6860 6861 static void 6862 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6863 struct spdk_io_channel *_ch, void *_ctx) 6864 { 6865 struct spdk_bdev_io *bdev_io = _ctx; 6866 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 6867 struct spdk_bdev_io *queued_reset; 6868 6869 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 6870 while (!TAILQ_EMPTY(&ch->queued_resets)) { 6871 queued_reset = TAILQ_FIRST(&ch->queued_resets); 6872 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 6873 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 6874 } 6875 6876 spdk_bdev_for_each_channel_continue(i, 0); 6877 } 6878 6879 static void 6880 bdev_io_complete_sequence_cb(void *ctx, int status) 6881 { 6882 struct spdk_bdev_io *bdev_io = ctx; 6883 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6884 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 6885 6886 /* u.bdev.accel_sequence should have already been cleared at this point */ 6887 assert(bdev_io->u.bdev.accel_sequence == NULL); 6888 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 6889 6890 TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 6891 bdev_io->internal.accel_sequence = NULL; 6892 6893 if (spdk_unlikely(status != 0)) { 6894 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 6895 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6896 } 6897 6898 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 6899 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 6900 return; 6901 } 6902 6903 bdev_io_complete(bdev_io); 6904 } 6905 6906 void 6907 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 6908 { 6909 struct spdk_bdev *bdev = bdev_io->bdev; 6910 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6911 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 6912 6913 if (bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING) { 6914 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 6915 spdk_bdev_get_module_name(bdev), 6916 bdev_io_status_get_string(bdev_io->internal.status)); 6917 assert(false); 6918 } 6919 bdev_io->internal.status = status; 6920 6921 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 6922 bool unlock_channels = false; 6923 6924 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 6925 
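		/* Unlike normal I/O, a reset is not re-queued on NOMEM here; log it
		 * loudly and complete the reset with the status as given. */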
SPDK_ERRLOG("NOMEM returned for reset\n"); 6926 } 6927 spdk_spin_lock(&bdev->internal.spinlock); 6928 if (bdev_io == bdev->internal.reset_in_progress) { 6929 bdev->internal.reset_in_progress = NULL; 6930 unlock_channels = true; 6931 } 6932 spdk_spin_unlock(&bdev->internal.spinlock); 6933 6934 if (unlock_channels) { 6935 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 6936 bdev_reset_complete); 6937 return; 6938 } 6939 } else { 6940 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io) && 6941 spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 6942 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 6943 return; 6944 } else if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0)) { 6945 _bdev_io_push_bounce_data_buffer(bdev_io, _bdev_io_complete_push_bounce_done); 6946 /* bdev IO will be completed in the callback */ 6947 return; 6948 } 6949 6950 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 6951 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 6952 return; 6953 } 6954 } 6955 6956 bdev_io_complete(bdev_io); 6957 } 6958 6959 void 6960 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 6961 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 6962 { 6963 enum spdk_bdev_io_status status; 6964 6965 if (sc == SPDK_SCSI_STATUS_GOOD) { 6966 status = SPDK_BDEV_IO_STATUS_SUCCESS; 6967 } else { 6968 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 6969 bdev_io->internal.error.scsi.sc = sc; 6970 bdev_io->internal.error.scsi.sk = sk; 6971 bdev_io->internal.error.scsi.asc = asc; 6972 bdev_io->internal.error.scsi.ascq = ascq; 6973 } 6974 6975 spdk_bdev_io_complete(bdev_io, status); 6976 } 6977 6978 void 6979 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 6980 int *sc, int *sk, int *asc, int *ascq) 6981 { 6982 assert(sc != NULL); 6983 assert(sk != NULL); 6984 assert(asc != NULL); 6985 assert(ascq != NULL); 6986 6987 switch (bdev_io->internal.status) { 6988 case SPDK_BDEV_IO_STATUS_SUCCESS: 6989 *sc = SPDK_SCSI_STATUS_GOOD; 6990 *sk = SPDK_SCSI_SENSE_NO_SENSE; 6991 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 6992 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 6993 break; 6994 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 6995 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 6996 break; 6997 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 6998 *sc = bdev_io->internal.error.scsi.sc; 6999 *sk = bdev_io->internal.error.scsi.sk; 7000 *asc = bdev_io->internal.error.scsi.asc; 7001 *ascq = bdev_io->internal.error.scsi.ascq; 7002 break; 7003 default: 7004 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7005 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7006 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7007 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7008 break; 7009 } 7010 } 7011 7012 void 7013 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7014 { 7015 enum spdk_bdev_io_status status; 7016 7017 if (aio_result == 0) { 7018 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7019 } else { 7020 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7021 } 7022 7023 bdev_io->internal.error.aio_result = aio_result; 7024 7025 spdk_bdev_io_complete(bdev_io, status); 7026 } 7027 7028 void 7029 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7030 { 7031 assert(aio_result != NULL); 7032 7033 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7034 *aio_result = bdev_io->internal.error.aio_result; 7035 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7036 *aio_result = 0; 7037 } else { 7038 
*aio_result = -EIO; 7039 } 7040 } 7041 7042 void 7043 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7044 { 7045 enum spdk_bdev_io_status status; 7046 7047 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 7048 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7049 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7050 status = SPDK_BDEV_IO_STATUS_ABORTED; 7051 } else { 7052 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7053 } 7054 7055 bdev_io->internal.error.nvme.cdw0 = cdw0; 7056 bdev_io->internal.error.nvme.sct = sct; 7057 bdev_io->internal.error.nvme.sc = sc; 7058 7059 spdk_bdev_io_complete(bdev_io, status); 7060 } 7061 7062 void 7063 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7064 { 7065 assert(sct != NULL); 7066 assert(sc != NULL); 7067 assert(cdw0 != NULL); 7068 7069 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7070 *sct = SPDK_NVME_SCT_GENERIC; 7071 *sc = SPDK_NVME_SC_SUCCESS; 7072 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7073 *cdw0 = 0; 7074 } else { 7075 *cdw0 = 1U; 7076 } 7077 return; 7078 } 7079 7080 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7081 *sct = bdev_io->internal.error.nvme.sct; 7082 *sc = bdev_io->internal.error.nvme.sc; 7083 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7084 *sct = SPDK_NVME_SCT_GENERIC; 7085 *sc = SPDK_NVME_SC_SUCCESS; 7086 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7087 *sct = SPDK_NVME_SCT_GENERIC; 7088 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7089 } else { 7090 *sct = SPDK_NVME_SCT_GENERIC; 7091 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7092 } 7093 7094 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7095 } 7096 7097 void 7098 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7099 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7100 { 7101 assert(first_sct != NULL); 7102 assert(first_sc != NULL); 7103 assert(second_sct != NULL); 7104 assert(second_sc != NULL); 7105 assert(cdw0 != NULL); 7106 7107 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7108 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7109 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7110 *first_sct = bdev_io->internal.error.nvme.sct; 7111 *first_sc = bdev_io->internal.error.nvme.sc; 7112 *second_sct = SPDK_NVME_SCT_GENERIC; 7113 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7114 } else { 7115 *first_sct = SPDK_NVME_SCT_GENERIC; 7116 *first_sc = SPDK_NVME_SC_SUCCESS; 7117 *second_sct = bdev_io->internal.error.nvme.sct; 7118 *second_sc = bdev_io->internal.error.nvme.sc; 7119 } 7120 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7121 *first_sct = SPDK_NVME_SCT_GENERIC; 7122 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7123 *second_sct = SPDK_NVME_SCT_GENERIC; 7124 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7125 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7126 *first_sct = SPDK_NVME_SCT_GENERIC; 7127 *first_sc = SPDK_NVME_SC_SUCCESS; 7128 *second_sct = SPDK_NVME_SCT_GENERIC; 7129 *second_sc = SPDK_NVME_SC_SUCCESS; 7130 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7131 *first_sct = SPDK_NVME_SCT_GENERIC; 7132 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7133 *second_sct = SPDK_NVME_SCT_GENERIC; 7134 *second_sc = 
SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7135 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7136 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7137 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 7138 *second_sct = SPDK_NVME_SCT_GENERIC; 7139 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7140 } else { 7141 *first_sct = SPDK_NVME_SCT_GENERIC; 7142 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7143 *second_sct = SPDK_NVME_SCT_GENERIC; 7144 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7145 } 7146 7147 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7148 } 7149 7150 struct spdk_thread * 7151 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7152 { 7153 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7154 } 7155 7156 struct spdk_io_channel * 7157 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7158 { 7159 return bdev_io->internal.ch->channel; 7160 } 7161 7162 static int 7163 bdev_register(struct spdk_bdev *bdev) 7164 { 7165 char *bdev_name; 7166 char uuid[SPDK_UUID_STRING_LEN]; 7167 int ret, i; 7168 7169 assert(bdev->module != NULL); 7170 7171 if (!bdev->name) { 7172 SPDK_ERRLOG("Bdev name is NULL\n"); 7173 return -EINVAL; 7174 } 7175 7176 if (!strlen(bdev->name)) { 7177 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7178 return -EINVAL; 7179 } 7180 7181 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7182 if (bdev->fn_table->accel_sequence_supported == NULL) { 7183 continue; 7184 } 7185 if (!bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7186 (enum spdk_bdev_io_type)i)) { 7187 continue; 7188 } 7189 7190 if (spdk_bdev_get_memory_domains(bdev, NULL, 0) <= 0) { 7191 SPDK_ERRLOG("bdev supporting accel sequence is required to support " 7192 "memory domains\n"); 7193 return -EINVAL; 7194 } 7195 7196 if (spdk_bdev_is_md_separate(bdev)) { 7197 SPDK_ERRLOG("Separate metadata is currently unsupported for bdevs with " 7198 "accel sequence support\n"); 7199 return -EINVAL; 7200 } 7201 } 7202 7203 /* Users often register their own I/O devices using the bdev name. In 7204 * order to avoid conflicts, prepend bdev_. */ 7205 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7206 if (!bdev_name) { 7207 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7208 return -ENOMEM; 7209 } 7210 7211 bdev->internal.stat = bdev_alloc_io_stat(true); 7212 if (!bdev->internal.stat) { 7213 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7214 free(bdev_name); 7215 return -ENOMEM; 7216 } 7217 7218 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7219 bdev->internal.measured_queue_depth = UINT64_MAX; 7220 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7221 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7222 bdev->internal.qd_poller = NULL; 7223 bdev->internal.qos = NULL; 7224 7225 TAILQ_INIT(&bdev->internal.open_descs); 7226 TAILQ_INIT(&bdev->internal.locked_ranges); 7227 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7228 TAILQ_INIT(&bdev->aliases); 7229 7230 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 7231 if (ret != 0) { 7232 bdev_free_io_stat(bdev->internal.stat); 7233 free(bdev_name); 7234 return ret; 7235 } 7236 7237 /* UUID may be specified by the user or defined by bdev itself. 7238 * Otherwise it will be generated here, so this field will never be empty. 
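	 * The lower-case string form of the UUID is also registered as an alias below,
	 * unless it is identical to the bdev name.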
*/ 7239 if (spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { 7240 spdk_uuid_generate(&bdev->uuid); 7241 } 7242 7243 /* Add the UUID alias only if it's different than the name */ 7244 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7245 if (strcmp(bdev->name, uuid) != 0) { 7246 ret = spdk_bdev_alias_add(bdev, uuid); 7247 if (ret != 0) { 7248 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7249 bdev_name_del(&bdev->internal.bdev_name); 7250 bdev_free_io_stat(bdev->internal.stat); 7251 free(bdev_name); 7252 return ret; 7253 } 7254 } 7255 7256 if (spdk_bdev_get_buf_align(bdev) > 1) { 7257 if (bdev->split_on_optimal_io_boundary) { 7258 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 7259 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 7260 } else { 7261 bdev->split_on_optimal_io_boundary = true; 7262 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 7263 } 7264 } 7265 7266 /* If the user didn't specify a write unit size, set it to one. */ 7267 if (bdev->write_unit_size == 0) { 7268 bdev->write_unit_size = 1; 7269 } 7270 7271 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7272 if (bdev->acwu == 0) { 7273 bdev->acwu = bdev->write_unit_size; 7274 } 7275 7276 if (bdev->phys_blocklen == 0) { 7277 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7278 } 7279 7280 bdev->internal.reset_in_progress = NULL; 7281 bdev->internal.qd_poll_in_progress = false; 7282 bdev->internal.period = 0; 7283 bdev->internal.new_period = 0; 7284 7285 spdk_io_device_register(__bdev_to_io_dev(bdev), 7286 bdev_channel_create, bdev_channel_destroy, 7287 sizeof(struct spdk_bdev_channel), 7288 bdev_name); 7289 7290 free(bdev_name); 7291 7292 spdk_spin_init(&bdev->internal.spinlock); 7293 7294 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 7295 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 7296 7297 return 0; 7298 } 7299 7300 static void 7301 bdev_destroy_cb(void *io_device) 7302 { 7303 int rc; 7304 struct spdk_bdev *bdev; 7305 spdk_bdev_unregister_cb cb_fn; 7306 void *cb_arg; 7307 7308 bdev = __bdev_from_io_dev(io_device); 7309 7310 if (bdev->internal.unregister_td != spdk_get_thread()) { 7311 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 7312 return; 7313 } 7314 7315 cb_fn = bdev->internal.unregister_cb; 7316 cb_arg = bdev->internal.unregister_ctx; 7317 7318 spdk_spin_destroy(&bdev->internal.spinlock); 7319 free(bdev->internal.qos); 7320 bdev_free_io_stat(bdev->internal.stat); 7321 7322 rc = bdev->fn_table->destruct(bdev->ctxt); 7323 if (rc < 0) { 7324 SPDK_ERRLOG("destruct failed\n"); 7325 } 7326 if (rc <= 0 && cb_fn != NULL) { 7327 cb_fn(cb_arg, rc); 7328 } 7329 } 7330 7331 void 7332 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 7333 { 7334 if (bdev->internal.unregister_cb != NULL) { 7335 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 7336 } 7337 } 7338 7339 static void 7340 _remove_notify(void *arg) 7341 { 7342 struct spdk_bdev_desc *desc = arg; 7343 7344 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 7345 } 7346 7347 /* returns: 0 - bdev removed and ready to be destructed. 7348 * -EBUSY - bdev can't be destructed yet. 
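 * Caller must hold both g_bdev_mgr.spinlock and bdev->internal.spinlock.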
*/ 7349 static int 7350 bdev_unregister_unsafe(struct spdk_bdev *bdev) 7351 { 7352 struct spdk_bdev_desc *desc, *tmp; 7353 int rc = 0; 7354 char uuid[SPDK_UUID_STRING_LEN]; 7355 7356 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 7357 assert(spdk_spin_held(&bdev->internal.spinlock)); 7358 7359 /* Notify each descriptor about hotremoval */ 7360 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 7361 rc = -EBUSY; 7362 /* 7363 * Defer invocation of the event_cb to a separate message that will 7364 * run later on its thread. This ensures this context unwinds and 7365 * we don't recursively unregister this bdev again if the event_cb 7366 * immediately closes its descriptor. 7367 */ 7368 event_notify(desc, _remove_notify); 7369 } 7370 7371 /* If there are no descriptors, proceed removing the bdev */ 7372 if (rc == 0) { 7373 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 7374 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 7375 7376 /* Delete the name and the UUID alias */ 7377 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7378 bdev_name_del_unsafe(&bdev->internal.bdev_name); 7379 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 7380 7381 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 7382 7383 if (bdev->internal.reset_in_progress != NULL) { 7384 /* If reset is in progress, let the completion callback for reset 7385 * unregister the bdev. 7386 */ 7387 rc = -EBUSY; 7388 } 7389 } 7390 7391 return rc; 7392 } 7393 7394 static void 7395 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7396 struct spdk_io_channel *io_ch, void *_ctx) 7397 { 7398 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 7399 7400 bdev_channel_abort_queued_ios(bdev_ch); 7401 spdk_bdev_for_each_channel_continue(i, 0); 7402 } 7403 7404 static void 7405 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 7406 { 7407 int rc; 7408 7409 spdk_spin_lock(&g_bdev_mgr.spinlock); 7410 spdk_spin_lock(&bdev->internal.spinlock); 7411 /* 7412 * Set the status to REMOVING after completing to abort channels. Otherwise, 7413 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 7414 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 7415 * may fail. 7416 */ 7417 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 7418 rc = bdev_unregister_unsafe(bdev); 7419 spdk_spin_unlock(&bdev->internal.spinlock); 7420 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7421 7422 if (rc == 0) { 7423 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7424 } 7425 } 7426 7427 void 7428 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7429 { 7430 struct spdk_thread *thread; 7431 7432 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 7433 7434 thread = spdk_get_thread(); 7435 if (!thread) { 7436 /* The user called this from a non-SPDK thread. 
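 * Unregistration must run on an SPDK thread. A caller in that situation can
 * instead hand the request off to the application thread, e.g. (a minimal
 * sketch; unregister_msg and ctx are hypothetical helpers that carry the bdev
 * and the completion callback):
 *
 *	spdk_thread_send_msg(spdk_thread_get_app_thread(), unregister_msg, ctx);
 *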
*/ 7437 if (cb_fn != NULL) { 7438 cb_fn(cb_arg, -ENOTSUP); 7439 } 7440 return; 7441 } 7442 7443 spdk_spin_lock(&g_bdev_mgr.spinlock); 7444 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7445 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7446 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7447 if (cb_fn) { 7448 cb_fn(cb_arg, -EBUSY); 7449 } 7450 return; 7451 } 7452 7453 spdk_spin_lock(&bdev->internal.spinlock); 7454 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 7455 bdev->internal.unregister_cb = cb_fn; 7456 bdev->internal.unregister_ctx = cb_arg; 7457 bdev->internal.unregister_td = thread; 7458 spdk_spin_unlock(&bdev->internal.spinlock); 7459 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7460 7461 spdk_bdev_set_qd_sampling_period(bdev, 0); 7462 7463 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 7464 bdev_unregister); 7465 } 7466 7467 int 7468 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 7469 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7470 { 7471 struct spdk_bdev_desc *desc; 7472 struct spdk_bdev *bdev; 7473 int rc; 7474 7475 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 7476 if (rc != 0) { 7477 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 7478 return rc; 7479 } 7480 7481 bdev = spdk_bdev_desc_get_bdev(desc); 7482 7483 if (bdev->module != module) { 7484 spdk_bdev_close(desc); 7485 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 7486 bdev_name); 7487 return -ENODEV; 7488 } 7489 7490 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 7491 7492 spdk_bdev_close(desc); 7493 7494 return 0; 7495 } 7496 7497 static int 7498 bdev_start_qos(struct spdk_bdev *bdev) 7499 { 7500 struct set_qos_limit_ctx *ctx; 7501 7502 /* Enable QoS */ 7503 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 7504 ctx = calloc(1, sizeof(*ctx)); 7505 if (ctx == NULL) { 7506 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 7507 return -ENOMEM; 7508 } 7509 ctx->bdev = bdev; 7510 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 7511 } 7512 7513 return 0; 7514 } 7515 7516 static void 7517 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 7518 struct spdk_bdev *bdev) 7519 { 7520 enum spdk_bdev_claim_type type; 7521 const char *typename, *modname; 7522 extern struct spdk_log_flag SPDK_LOG_bdev; 7523 7524 assert(spdk_spin_held(&bdev->internal.spinlock)); 7525 7526 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 7527 return; 7528 } 7529 7530 type = bdev->internal.claim_type; 7531 typename = spdk_bdev_claim_get_name(type); 7532 7533 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 7534 modname = bdev->internal.claim.v1.module->name; 7535 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7536 bdev->name, detail, typename, modname); 7537 return; 7538 } 7539 7540 if (claim_type_is_v2(type)) { 7541 struct spdk_bdev_module_claim *claim; 7542 7543 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 7544 modname = claim->module->name; 7545 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7546 bdev->name, detail, typename, modname); 7547 } 7548 return; 7549 } 7550 7551 assert(false); 7552 } 7553 7554 static int 7555 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 7556 { 7557 struct spdk_thread *thread; 7558 int rc = 0; 7559 7560 thread = spdk_get_thread(); 7561 if (!thread) { 7562 
SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 7563 return -ENOTSUP; 7564 } 7565 7566 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7567 spdk_get_thread()); 7568 7569 desc->bdev = bdev; 7570 desc->thread = thread; 7571 desc->write = write; 7572 7573 spdk_spin_lock(&bdev->internal.spinlock); 7574 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7575 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7576 spdk_spin_unlock(&bdev->internal.spinlock); 7577 return -ENODEV; 7578 } 7579 7580 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7581 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7582 spdk_spin_unlock(&bdev->internal.spinlock); 7583 return -EPERM; 7584 } 7585 7586 rc = bdev_start_qos(bdev); 7587 if (rc != 0) { 7588 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 7589 spdk_spin_unlock(&bdev->internal.spinlock); 7590 return rc; 7591 } 7592 7593 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 7594 7595 spdk_spin_unlock(&bdev->internal.spinlock); 7596 7597 return 0; 7598 } 7599 7600 static int 7601 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 7602 struct spdk_bdev_desc **_desc) 7603 { 7604 struct spdk_bdev_desc *desc; 7605 unsigned int i; 7606 7607 desc = calloc(1, sizeof(*desc)); 7608 if (desc == NULL) { 7609 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 7610 return -ENOMEM; 7611 } 7612 7613 TAILQ_INIT(&desc->pending_media_events); 7614 TAILQ_INIT(&desc->free_media_events); 7615 7616 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 7617 desc->callback.event_fn = event_cb; 7618 desc->callback.ctx = event_ctx; 7619 spdk_spin_init(&desc->spinlock); 7620 7621 if (bdev->media_events) { 7622 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 7623 sizeof(*desc->media_events_buffer)); 7624 if (desc->media_events_buffer == NULL) { 7625 SPDK_ERRLOG("Failed to initialize media event pool\n"); 7626 bdev_desc_free(desc); 7627 return -ENOMEM; 7628 } 7629 7630 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 7631 TAILQ_INSERT_TAIL(&desc->free_media_events, 7632 &desc->media_events_buffer[i], tailq); 7633 } 7634 } 7635 7636 if (bdev->fn_table->accel_sequence_supported != NULL) { 7637 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7638 desc->accel_sequence_supported[i] = 7639 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7640 (enum spdk_bdev_io_type)i); 7641 } 7642 } 7643 7644 *_desc = desc; 7645 7646 return 0; 7647 } 7648 7649 int 7650 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7651 void *event_ctx, struct spdk_bdev_desc **_desc) 7652 { 7653 struct spdk_bdev_desc *desc; 7654 struct spdk_bdev *bdev; 7655 int rc; 7656 7657 if (event_cb == NULL) { 7658 SPDK_ERRLOG("Missing event callback function\n"); 7659 return -EINVAL; 7660 } 7661 7662 spdk_spin_lock(&g_bdev_mgr.spinlock); 7663 7664 bdev = bdev_get_by_name(bdev_name); 7665 7666 if (bdev == NULL) { 7667 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 7668 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7669 return -ENODEV; 7670 } 7671 7672 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 7673 if (rc != 0) { 7674 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7675 return rc; 7676 } 7677 7678 rc = bdev_open(bdev, write, desc); 7679 if (rc != 0) { 7680 bdev_desc_free(desc); 7681 desc = NULL; 7682 } 7683 7684 *_desc = desc; 7685 7686 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7687 
7688 return rc; 7689 } 7690 7691 static void 7692 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 7693 { 7694 int rc; 7695 7696 spdk_spin_lock(&bdev->internal.spinlock); 7697 spdk_spin_lock(&desc->spinlock); 7698 7699 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 7700 7701 desc->closed = true; 7702 7703 if (desc->claim != NULL) { 7704 bdev_desc_release_claims(desc); 7705 } 7706 7707 if (0 == desc->refs) { 7708 spdk_spin_unlock(&desc->spinlock); 7709 bdev_desc_free(desc); 7710 } else { 7711 spdk_spin_unlock(&desc->spinlock); 7712 } 7713 7714 /* If no more descriptors, kill QoS channel */ 7715 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7716 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 7717 bdev->name, spdk_get_thread()); 7718 7719 if (bdev_qos_destroy(bdev)) { 7720 /* There isn't anything we can do to recover here. Just let the 7721 * old QoS poller keep running. The QoS handling won't change 7722 * cores when the user allocates a new channel, but it won't break. */ 7723 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 7724 } 7725 } 7726 7727 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7728 rc = bdev_unregister_unsafe(bdev); 7729 spdk_spin_unlock(&bdev->internal.spinlock); 7730 7731 if (rc == 0) { 7732 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7733 } 7734 } else { 7735 spdk_spin_unlock(&bdev->internal.spinlock); 7736 } 7737 } 7738 7739 void 7740 spdk_bdev_close(struct spdk_bdev_desc *desc) 7741 { 7742 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7743 7744 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7745 spdk_get_thread()); 7746 7747 assert(desc->thread == spdk_get_thread()); 7748 7749 spdk_poller_unregister(&desc->io_timeout_poller); 7750 7751 spdk_spin_lock(&g_bdev_mgr.spinlock); 7752 7753 bdev_close(bdev, desc); 7754 7755 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7756 } 7757 7758 static void 7759 bdev_register_finished(void *arg) 7760 { 7761 struct spdk_bdev_desc *desc = arg; 7762 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7763 7764 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 7765 7766 spdk_spin_lock(&g_bdev_mgr.spinlock); 7767 7768 bdev_close(bdev, desc); 7769 7770 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7771 } 7772 7773 int 7774 spdk_bdev_register(struct spdk_bdev *bdev) 7775 { 7776 struct spdk_bdev_desc *desc; 7777 struct spdk_thread *thread = spdk_get_thread(); 7778 int rc; 7779 7780 if (spdk_unlikely(spdk_thread_get_app_thread() != spdk_get_thread())) { 7781 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", bdev->name, thread, 7782 thread ? 
spdk_thread_get_name(thread) : "null"); 7783 return -EINVAL; 7784 } 7785 7786 rc = bdev_register(bdev); 7787 if (rc != 0) { 7788 return rc; 7789 } 7790 7791 /* A descriptor is opened to prevent bdev deletion during examination */ 7792 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7793 if (rc != 0) { 7794 spdk_bdev_unregister(bdev, NULL, NULL); 7795 return rc; 7796 } 7797 7798 rc = bdev_open(bdev, false, desc); 7799 if (rc != 0) { 7800 bdev_desc_free(desc); 7801 spdk_bdev_unregister(bdev, NULL, NULL); 7802 return rc; 7803 } 7804 7805 /* Examine configuration before initializing I/O */ 7806 bdev_examine(bdev); 7807 7808 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 7809 if (rc != 0) { 7810 bdev_close(bdev, desc); 7811 spdk_bdev_unregister(bdev, NULL, NULL); 7812 } 7813 7814 return rc; 7815 } 7816 7817 int 7818 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 7819 struct spdk_bdev_module *module) 7820 { 7821 spdk_spin_lock(&bdev->internal.spinlock); 7822 7823 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7824 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7825 spdk_spin_unlock(&bdev->internal.spinlock); 7826 return -EPERM; 7827 } 7828 7829 if (desc && !desc->write) { 7830 desc->write = true; 7831 } 7832 7833 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 7834 bdev->internal.claim.v1.module = module; 7835 7836 spdk_spin_unlock(&bdev->internal.spinlock); 7837 return 0; 7838 } 7839 7840 void 7841 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 7842 { 7843 spdk_spin_lock(&bdev->internal.spinlock); 7844 7845 assert(bdev->internal.claim.v1.module != NULL); 7846 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 7847 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7848 bdev->internal.claim.v1.module = NULL; 7849 7850 spdk_spin_unlock(&bdev->internal.spinlock); 7851 } 7852 7853 /* 7854 * Start claims v2 7855 */ 7856 7857 const char * 7858 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 7859 { 7860 switch (type) { 7861 case SPDK_BDEV_CLAIM_NONE: 7862 return "not_claimed"; 7863 case SPDK_BDEV_CLAIM_EXCL_WRITE: 7864 return "exclusive_write"; 7865 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 7866 return "read_many_write_one"; 7867 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 7868 return "read_many_write_none"; 7869 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 7870 return "read_many_write_many"; 7871 default: 7872 break; 7873 } 7874 return "invalid_claim"; 7875 } 7876 7877 static bool 7878 claim_type_is_v2(enum spdk_bdev_claim_type type) 7879 { 7880 switch (type) { 7881 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 7882 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 7883 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 7884 return true; 7885 default: 7886 break; 7887 } 7888 return false; 7889 } 7890 7891 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
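 * For example, a read-only descriptor that takes a read_many_write_one or
 * read_many_write_many (shared) claim is promoted to writable, while a
 * read_many_write_none claim leaves the descriptor read-only.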
*/ 7892 static bool 7893 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 7894 { 7895 switch (type) { 7896 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 7897 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 7898 return true; 7899 default: 7900 break; 7901 } 7902 return false; 7903 } 7904 7905 void 7906 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 7907 { 7908 if (opts == NULL) { 7909 SPDK_ERRLOG("opts should not be NULL\n"); 7910 assert(opts != NULL); 7911 return; 7912 } 7913 if (size == 0) { 7914 SPDK_ERRLOG("size should not be zero\n"); 7915 assert(size != 0); 7916 return; 7917 } 7918 7919 memset(opts, 0, size); 7920 opts->opts_size = size; 7921 7922 #define FIELD_OK(field) \ 7923 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 7924 7925 #define SET_FIELD(field, value) \ 7926 if (FIELD_OK(field)) { \ 7927 opts->field = value; \ 7928 } \ 7929 7930 SET_FIELD(shared_claim_key, 0); 7931 7932 #undef FIELD_OK 7933 #undef SET_FIELD 7934 } 7935 7936 static int 7937 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 7938 { 7939 if (src->opts_size == 0) { 7940 SPDK_ERRLOG("size should not be zero\n"); 7941 return -1; 7942 } 7943 7944 memset(dst, 0, sizeof(*dst)); 7945 dst->opts_size = src->opts_size; 7946 7947 #define FIELD_OK(field) \ 7948 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 7949 7950 #define SET_FIELD(field) \ 7951 if (FIELD_OK(field)) { \ 7952 dst->field = src->field; \ 7953 } \ 7954 7955 if (FIELD_OK(name)) { 7956 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 7957 } 7958 7959 SET_FIELD(shared_claim_key); 7960 7961 /* You should not remove this statement, but need to update the assert statement 7962 * if you add a new field, and also add a corresponding SET_FIELD statement */ 7963 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 7964 7965 #undef FIELD_OK 7966 #undef SET_FIELD 7967 return 0; 7968 } 7969 7970 /* Returns 0 if a read-write-once claim can be taken. */ 7971 static int 7972 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 7973 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 7974 { 7975 struct spdk_bdev *bdev = desc->bdev; 7976 struct spdk_bdev_desc *open_desc; 7977 7978 assert(spdk_spin_held(&bdev->internal.spinlock)); 7979 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 7980 7981 if (opts->shared_claim_key != 0) { 7982 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 7983 bdev->name); 7984 return -EINVAL; 7985 } 7986 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7987 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7988 return -EPERM; 7989 } 7990 if (desc->claim != NULL) { 7991 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 7992 bdev->name, desc->claim->module->name); 7993 return -EPERM; 7994 } 7995 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 7996 if (desc != open_desc && open_desc->write) { 7997 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 7998 "another descriptor is open for writing\n", 7999 bdev->name); 8000 return -EPERM; 8001 } 8002 } 8003 8004 return 0; 8005 } 8006 8007 /* Returns 0 if a read-only-many claim can be taken. 
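 *
 * A module typically requests this claim on a descriptor it opened read-only,
 * e.g. (a minimal sketch; my_module is a placeholder and error handling is
 * omitted):
 *
 *	struct spdk_bdev_claim_opts opts;
 *
 *	spdk_bdev_claim_opts_init(&opts, sizeof(opts));
 *	snprintf(opts.name, sizeof(opts.name), "my_claim");
 *	spdk_bdev_module_claim_bdev_desc(desc, SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE,
 *					 &opts, &my_module);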
 */
static int
claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_desc *open_desc;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE);
	assert(desc->claim == NULL);

	if (desc->write) {
		SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n",
			    bdev->name);
		return -EINVAL;
	}
	if (opts->shared_claim_key != 0) {
		SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name);
		return -EINVAL;
	}
	if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
			if (open_desc->write) {
				SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while "
					       "another descriptor is open for writing\n",
					       bdev->name);
				return -EPERM;
			}
		}
	}

	return 0;
}

/* Returns 0 if a read-write-many claim can be taken. */
static int
claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_desc *open_desc;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED);
	assert(desc->claim == NULL);

	if (opts->shared_claim_key == 0) {
		SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n",
			    bdev->name);
		return -EINVAL;
	}
	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
			if (open_desc == desc) {
				continue;
			}
			if (open_desc->write) {
				SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while "
					       "another descriptor is open for writing without a "
					       "claim\n", bdev->name);
				return -EPERM;
			}
		}
		break;
	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
		if (opts->shared_claim_key != bdev->internal.claim.v2.key) {
			LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev);
			return -EPERM;
		}
		break;
	default:
		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
		return -EBUSY;
	}

	return 0;
}

/* Updates desc and its bdev with a v2 claim.
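 * If this is the first v2 claim on the bdev, the claim type and the shared key
 * are recorded on the bdev; additional claims of the same type are appended to
 * the v2 claim list. All holders of a read_many_write_many (shared) claim must
 * therefore pass the same shared_claim_key, e.g. (a minimal sketch; MY_SHARED_KEY
 * and my_module are placeholders):
 *
 *	spdk_bdev_claim_opts_init(&opts, sizeof(opts));
 *	opts.shared_claim_key = MY_SHARED_KEY;
 *	spdk_bdev_module_claim_bdev_desc(desc, SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED,
 *					 &opts, &my_module);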
*/ 8088 static int 8089 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8090 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8091 { 8092 struct spdk_bdev *bdev = desc->bdev; 8093 struct spdk_bdev_module_claim *claim; 8094 8095 assert(spdk_spin_held(&bdev->internal.spinlock)); 8096 assert(claim_type_is_v2(type)); 8097 assert(desc->claim == NULL); 8098 8099 claim = calloc(1, sizeof(*desc->claim)); 8100 if (claim == NULL) { 8101 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 8102 return -ENOMEM; 8103 } 8104 claim->module = module; 8105 claim->desc = desc; 8106 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 8107 memcpy(claim->name, opts->name, sizeof(claim->name)); 8108 desc->claim = claim; 8109 8110 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8111 bdev->internal.claim_type = type; 8112 TAILQ_INIT(&bdev->internal.claim.v2.claims); 8113 bdev->internal.claim.v2.key = opts->shared_claim_key; 8114 } 8115 assert(type == bdev->internal.claim_type); 8116 8117 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 8118 8119 if (!desc->write && claim_type_promotes_to_write(type)) { 8120 desc->write = true; 8121 } 8122 8123 return 0; 8124 } 8125 8126 int 8127 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8128 struct spdk_bdev_claim_opts *_opts, 8129 struct spdk_bdev_module *module) 8130 { 8131 struct spdk_bdev *bdev; 8132 struct spdk_bdev_claim_opts opts; 8133 int rc = 0; 8134 8135 if (desc == NULL) { 8136 SPDK_ERRLOG("descriptor must not be NULL\n"); 8137 return -EINVAL; 8138 } 8139 8140 bdev = desc->bdev; 8141 8142 if (_opts == NULL) { 8143 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 8144 } else if (claim_opts_copy(_opts, &opts) != 0) { 8145 return -EINVAL; 8146 } 8147 8148 spdk_spin_lock(&bdev->internal.spinlock); 8149 8150 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 8151 bdev->internal.claim_type != type) { 8152 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8153 spdk_spin_unlock(&bdev->internal.spinlock); 8154 return -EPERM; 8155 } 8156 8157 if (claim_type_is_v2(type) && desc->claim != NULL) { 8158 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 8159 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 8160 spdk_spin_unlock(&bdev->internal.spinlock); 8161 return -EPERM; 8162 } 8163 8164 switch (type) { 8165 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8166 spdk_spin_unlock(&bdev->internal.spinlock); 8167 return spdk_bdev_module_claim_bdev(bdev, desc, module); 8168 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8169 rc = claim_verify_rwo(desc, type, &opts, module); 8170 break; 8171 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8172 rc = claim_verify_rom(desc, type, &opts, module); 8173 break; 8174 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8175 rc = claim_verify_rwm(desc, type, &opts, module); 8176 break; 8177 default: 8178 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 8179 rc = -ENOTSUP; 8180 } 8181 8182 if (rc == 0) { 8183 rc = claim_bdev(desc, type, &opts, module); 8184 } 8185 8186 spdk_spin_unlock(&bdev->internal.spinlock); 8187 return rc; 8188 } 8189 8190 static void 8191 claim_reset(struct spdk_bdev *bdev) 8192 { 8193 assert(spdk_spin_held(&bdev->internal.spinlock)); 8194 assert(claim_type_is_v2(bdev->internal.claim_type)); 8195 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 8196 8197 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 8198 
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8199 } 8200 8201 static void 8202 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 8203 { 8204 struct spdk_bdev *bdev = desc->bdev; 8205 8206 assert(spdk_spin_held(&bdev->internal.spinlock)); 8207 assert(claim_type_is_v2(bdev->internal.claim_type)); 8208 8209 if (bdev->internal.examine_in_progress == 0) { 8210 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 8211 free(desc->claim); 8212 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 8213 claim_reset(bdev); 8214 } 8215 } else { 8216 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 8217 desc->claim->module = NULL; 8218 desc->claim->desc = NULL; 8219 } 8220 desc->claim = NULL; 8221 } 8222 8223 /* 8224 * End claims v2 8225 */ 8226 8227 struct spdk_bdev * 8228 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 8229 { 8230 assert(desc != NULL); 8231 return desc->bdev; 8232 } 8233 8234 int 8235 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 8236 { 8237 struct spdk_bdev *bdev, *tmp; 8238 struct spdk_bdev_desc *desc; 8239 int rc = 0; 8240 8241 assert(fn != NULL); 8242 8243 spdk_spin_lock(&g_bdev_mgr.spinlock); 8244 bdev = spdk_bdev_first(); 8245 while (bdev != NULL) { 8246 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8247 if (rc != 0) { 8248 break; 8249 } 8250 rc = bdev_open(bdev, false, desc); 8251 if (rc != 0) { 8252 bdev_desc_free(desc); 8253 if (rc == -ENODEV) { 8254 /* Ignore the error and move to the next bdev. */ 8255 rc = 0; 8256 bdev = spdk_bdev_next(bdev); 8257 continue; 8258 } 8259 break; 8260 } 8261 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8262 8263 rc = fn(ctx, bdev); 8264 8265 spdk_spin_lock(&g_bdev_mgr.spinlock); 8266 tmp = spdk_bdev_next(bdev); 8267 bdev_close(bdev, desc); 8268 if (rc != 0) { 8269 break; 8270 } 8271 bdev = tmp; 8272 } 8273 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8274 8275 return rc; 8276 } 8277 8278 int 8279 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 8280 { 8281 struct spdk_bdev *bdev, *tmp; 8282 struct spdk_bdev_desc *desc; 8283 int rc = 0; 8284 8285 assert(fn != NULL); 8286 8287 spdk_spin_lock(&g_bdev_mgr.spinlock); 8288 bdev = spdk_bdev_first_leaf(); 8289 while (bdev != NULL) { 8290 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8291 if (rc != 0) { 8292 break; 8293 } 8294 rc = bdev_open(bdev, false, desc); 8295 if (rc != 0) { 8296 bdev_desc_free(desc); 8297 if (rc == -ENODEV) { 8298 /* Ignore the error and move to the next bdev. 
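 * -ENODEV here means the bdev started unregistering between the lookup and
 * the open, so it is skipped rather than treated as a fatal error.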
*/ 8299 rc = 0; 8300 bdev = spdk_bdev_next_leaf(bdev); 8301 continue; 8302 } 8303 break; 8304 } 8305 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8306 8307 rc = fn(ctx, bdev); 8308 8309 spdk_spin_lock(&g_bdev_mgr.spinlock); 8310 tmp = spdk_bdev_next_leaf(bdev); 8311 bdev_close(bdev, desc); 8312 if (rc != 0) { 8313 break; 8314 } 8315 bdev = tmp; 8316 } 8317 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8318 8319 return rc; 8320 } 8321 8322 void 8323 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 8324 { 8325 struct iovec *iovs; 8326 int iovcnt; 8327 8328 if (bdev_io == NULL) { 8329 return; 8330 } 8331 8332 switch (bdev_io->type) { 8333 case SPDK_BDEV_IO_TYPE_READ: 8334 case SPDK_BDEV_IO_TYPE_WRITE: 8335 case SPDK_BDEV_IO_TYPE_ZCOPY: 8336 iovs = bdev_io->u.bdev.iovs; 8337 iovcnt = bdev_io->u.bdev.iovcnt; 8338 break; 8339 default: 8340 iovs = NULL; 8341 iovcnt = 0; 8342 break; 8343 } 8344 8345 if (iovp) { 8346 *iovp = iovs; 8347 } 8348 if (iovcntp) { 8349 *iovcntp = iovcnt; 8350 } 8351 } 8352 8353 void * 8354 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 8355 { 8356 if (bdev_io == NULL) { 8357 return NULL; 8358 } 8359 8360 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 8361 return NULL; 8362 } 8363 8364 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 8365 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 8366 return bdev_io->u.bdev.md_buf; 8367 } 8368 8369 return NULL; 8370 } 8371 8372 void * 8373 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 8374 { 8375 if (bdev_io == NULL) { 8376 assert(false); 8377 return NULL; 8378 } 8379 8380 return bdev_io->internal.caller_ctx; 8381 } 8382 8383 void 8384 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 8385 { 8386 8387 if (spdk_bdev_module_list_find(bdev_module->name)) { 8388 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 8389 assert(false); 8390 } 8391 8392 spdk_spin_init(&bdev_module->internal.spinlock); 8393 8394 /* 8395 * Modules with examine callbacks must be initialized first, so they are 8396 * ready to handle examine callbacks from later modules that will 8397 * register physical bdevs. 
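 *
 * Bdev modules normally do not call spdk_bdev_module_list_add() directly; they
 * use the SPDK_BDEV_MODULE_REGISTER() macro, which registers the module at
 * startup via a constructor, e.g. (a minimal sketch; names are placeholders):
 *
 *	static struct spdk_bdev_module my_if = {
 *		.name		= "my_bdev",
 *		.module_init	= my_bdev_init,
 *		.module_fini	= my_bdev_fini,
 *	};
 *
 *	SPDK_BDEV_MODULE_REGISTER(my_bdev, &my_if)
 *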
8398 */ 8399 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 8400 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8401 } else { 8402 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8403 } 8404 } 8405 8406 struct spdk_bdev_module * 8407 spdk_bdev_module_list_find(const char *name) 8408 { 8409 struct spdk_bdev_module *bdev_module; 8410 8411 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 8412 if (strcmp(name, bdev_module->name) == 0) { 8413 break; 8414 } 8415 } 8416 8417 return bdev_module; 8418 } 8419 8420 static void 8421 bdev_write_zero_buffer_next(void *_bdev_io) 8422 { 8423 struct spdk_bdev_io *bdev_io = _bdev_io; 8424 uint64_t num_bytes, num_blocks; 8425 void *md_buf = NULL; 8426 int rc; 8427 8428 num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * 8429 bdev_io->u.bdev.split_remaining_num_blocks, 8430 ZERO_BUFFER_SIZE); 8431 num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); 8432 num_blocks -= num_blocks % bdev_io->bdev->write_unit_size; 8433 8434 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 8435 md_buf = (char *)g_bdev_mgr.zero_buffer + 8436 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 8437 } 8438 8439 rc = bdev_write_blocks_with_md(bdev_io->internal.desc, 8440 spdk_io_channel_from_ctx(bdev_io->internal.ch), 8441 g_bdev_mgr.zero_buffer, md_buf, 8442 bdev_io->u.bdev.split_current_offset_blocks, num_blocks, 8443 bdev_write_zero_buffer_done, bdev_io); 8444 if (rc == 0) { 8445 bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; 8446 bdev_io->u.bdev.split_current_offset_blocks += num_blocks; 8447 } else if (rc == -ENOMEM) { 8448 bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); 8449 } else { 8450 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 8451 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 8452 } 8453 } 8454 8455 static void 8456 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 8457 { 8458 struct spdk_bdev_io *parent_io = cb_arg; 8459 8460 spdk_bdev_free_io(bdev_io); 8461 8462 if (!success) { 8463 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 8464 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 8465 return; 8466 } 8467 8468 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 8469 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 8470 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 8471 return; 8472 } 8473 8474 bdev_write_zero_buffer_next(parent_io); 8475 } 8476 8477 static void 8478 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 8479 { 8480 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8481 ctx->bdev->internal.qos_mod_in_progress = false; 8482 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8483 8484 if (ctx->cb_fn) { 8485 ctx->cb_fn(ctx->cb_arg, status); 8486 } 8487 free(ctx); 8488 } 8489 8490 static void 8491 bdev_disable_qos_done(void *cb_arg) 8492 { 8493 struct set_qos_limit_ctx *ctx = cb_arg; 8494 struct spdk_bdev *bdev = ctx->bdev; 8495 struct spdk_bdev_io *bdev_io; 8496 struct spdk_bdev_qos *qos; 8497 8498 spdk_spin_lock(&bdev->internal.spinlock); 8499 qos = bdev->internal.qos; 8500 bdev->internal.qos = NULL; 8501 spdk_spin_unlock(&bdev->internal.spinlock); 8502 8503 while (!TAILQ_EMPTY(&qos->queued)) { 8504 /* Send queued I/O back to their original thread for resubmission. 
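 * An I/O that was redirected to the QoS thread had its channel swapped at
 * submission time; that is undone below so the I/O is resubmitted on the
 * thread that originally issued it.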
*/ 8505 bdev_io = TAILQ_FIRST(&qos->queued); 8506 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 8507 8508 if (bdev_io->internal.io_submit_ch) { 8509 /* 8510 * Channel was changed when sending it to the QoS thread - change it back 8511 * before sending it back to the original thread. 8512 */ 8513 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 8514 bdev_io->internal.io_submit_ch = NULL; 8515 } 8516 8517 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 8518 _bdev_io_submit, bdev_io); 8519 } 8520 8521 if (qos->thread != NULL) { 8522 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 8523 spdk_poller_unregister(&qos->poller); 8524 } 8525 8526 free(qos); 8527 8528 bdev_set_qos_limit_done(ctx, 0); 8529 } 8530 8531 static void 8532 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 8533 { 8534 struct set_qos_limit_ctx *ctx = _ctx; 8535 struct spdk_thread *thread; 8536 8537 spdk_spin_lock(&bdev->internal.spinlock); 8538 thread = bdev->internal.qos->thread; 8539 spdk_spin_unlock(&bdev->internal.spinlock); 8540 8541 if (thread != NULL) { 8542 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 8543 } else { 8544 bdev_disable_qos_done(ctx); 8545 } 8546 } 8547 8548 static void 8549 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8550 struct spdk_io_channel *ch, void *_ctx) 8551 { 8552 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8553 8554 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 8555 8556 spdk_bdev_for_each_channel_continue(i, 0); 8557 } 8558 8559 static void 8560 bdev_update_qos_rate_limit_msg(void *cb_arg) 8561 { 8562 struct set_qos_limit_ctx *ctx = cb_arg; 8563 struct spdk_bdev *bdev = ctx->bdev; 8564 8565 spdk_spin_lock(&bdev->internal.spinlock); 8566 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 8567 spdk_spin_unlock(&bdev->internal.spinlock); 8568 8569 bdev_set_qos_limit_done(ctx, 0); 8570 } 8571 8572 static void 8573 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8574 struct spdk_io_channel *ch, void *_ctx) 8575 { 8576 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8577 8578 spdk_spin_lock(&bdev->internal.spinlock); 8579 bdev_enable_qos(bdev, bdev_ch); 8580 spdk_spin_unlock(&bdev->internal.spinlock); 8581 spdk_bdev_for_each_channel_continue(i, 0); 8582 } 8583 8584 static void 8585 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 8586 { 8587 struct set_qos_limit_ctx *ctx = _ctx; 8588 8589 bdev_set_qos_limit_done(ctx, status); 8590 } 8591 8592 static void 8593 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 8594 { 8595 int i; 8596 8597 assert(bdev->internal.qos != NULL); 8598 8599 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8600 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 8601 bdev->internal.qos->rate_limits[i].limit = limits[i]; 8602 8603 if (limits[i] == 0) { 8604 bdev->internal.qos->rate_limits[i].limit = 8605 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 8606 } 8607 } 8608 } 8609 } 8610 8611 void 8612 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 8613 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 8614 { 8615 struct set_qos_limit_ctx *ctx; 8616 uint32_t limit_set_complement; 8617 uint64_t min_limit_per_sec; 8618 int i; 8619 bool disable_rate_limit = true; 8620 8621 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8622 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 8623 continue; 8624 } 8625 8626 if (limits[i] > 0) { 8627 disable_rate_limit = 
false; 8628 } 8629 8630 if (bdev_qos_is_iops_rate_limit(i) == true) { 8631 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 8632 } else { 8633 /* Change from megabyte to byte rate limit */ 8634 limits[i] = limits[i] * 1024 * 1024; 8635 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 8636 } 8637 8638 limit_set_complement = limits[i] % min_limit_per_sec; 8639 if (limit_set_complement) { 8640 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 8641 limits[i], min_limit_per_sec); 8642 limits[i] += min_limit_per_sec - limit_set_complement; 8643 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 8644 } 8645 } 8646 8647 ctx = calloc(1, sizeof(*ctx)); 8648 if (ctx == NULL) { 8649 cb_fn(cb_arg, -ENOMEM); 8650 return; 8651 } 8652 8653 ctx->cb_fn = cb_fn; 8654 ctx->cb_arg = cb_arg; 8655 ctx->bdev = bdev; 8656 8657 spdk_spin_lock(&bdev->internal.spinlock); 8658 if (bdev->internal.qos_mod_in_progress) { 8659 spdk_spin_unlock(&bdev->internal.spinlock); 8660 free(ctx); 8661 cb_fn(cb_arg, -EAGAIN); 8662 return; 8663 } 8664 bdev->internal.qos_mod_in_progress = true; 8665 8666 if (disable_rate_limit == true && bdev->internal.qos) { 8667 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8668 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 8669 (bdev->internal.qos->rate_limits[i].limit > 0 && 8670 bdev->internal.qos->rate_limits[i].limit != 8671 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 8672 disable_rate_limit = false; 8673 break; 8674 } 8675 } 8676 } 8677 8678 if (disable_rate_limit == false) { 8679 if (bdev->internal.qos == NULL) { 8680 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 8681 if (!bdev->internal.qos) { 8682 spdk_spin_unlock(&bdev->internal.spinlock); 8683 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 8684 bdev_set_qos_limit_done(ctx, -ENOMEM); 8685 return; 8686 } 8687 } 8688 8689 if (bdev->internal.qos->thread == NULL) { 8690 /* Enabling */ 8691 bdev_set_qos_rate_limits(bdev, limits); 8692 8693 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 8694 bdev_enable_qos_done); 8695 } else { 8696 /* Updating */ 8697 bdev_set_qos_rate_limits(bdev, limits); 8698 8699 spdk_thread_send_msg(bdev->internal.qos->thread, 8700 bdev_update_qos_rate_limit_msg, ctx); 8701 } 8702 } else { 8703 if (bdev->internal.qos != NULL) { 8704 bdev_set_qos_rate_limits(bdev, limits); 8705 8706 /* Disabling */ 8707 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 8708 bdev_disable_qos_msg_done); 8709 } else { 8710 spdk_spin_unlock(&bdev->internal.spinlock); 8711 bdev_set_qos_limit_done(ctx, 0); 8712 return; 8713 } 8714 } 8715 8716 spdk_spin_unlock(&bdev->internal.spinlock); 8717 } 8718 8719 struct spdk_bdev_histogram_ctx { 8720 spdk_bdev_histogram_status_cb cb_fn; 8721 void *cb_arg; 8722 struct spdk_bdev *bdev; 8723 int status; 8724 }; 8725 8726 static void 8727 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8728 { 8729 struct spdk_bdev_histogram_ctx *ctx = _ctx; 8730 8731 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8732 ctx->bdev->internal.histogram_in_progress = false; 8733 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8734 ctx->cb_fn(ctx->cb_arg, ctx->status); 8735 free(ctx); 8736 } 8737 8738 static void 8739 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8740 struct spdk_io_channel *_ch, void *_ctx) 8741 { 8742 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8743 8744 if (ch->histogram != NULL) { 8745 
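		/* Drop this channel's histogram data; until histograms are re-enabled,
		 * per-channel queries report -EFAULT for this channel. */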
spdk_histogram_data_free(ch->histogram); 8746 ch->histogram = NULL; 8747 } 8748 spdk_bdev_for_each_channel_continue(i, 0); 8749 } 8750 8751 static void 8752 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8753 { 8754 struct spdk_bdev_histogram_ctx *ctx = _ctx; 8755 8756 if (status != 0) { 8757 ctx->status = status; 8758 ctx->bdev->internal.histogram_enabled = false; 8759 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 8760 bdev_histogram_disable_channel_cb); 8761 } else { 8762 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8763 ctx->bdev->internal.histogram_in_progress = false; 8764 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8765 ctx->cb_fn(ctx->cb_arg, ctx->status); 8766 free(ctx); 8767 } 8768 } 8769 8770 static void 8771 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8772 struct spdk_io_channel *_ch, void *_ctx) 8773 { 8774 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8775 int status = 0; 8776 8777 if (ch->histogram == NULL) { 8778 ch->histogram = spdk_histogram_data_alloc(); 8779 if (ch->histogram == NULL) { 8780 status = -ENOMEM; 8781 } 8782 } 8783 8784 spdk_bdev_for_each_channel_continue(i, status); 8785 } 8786 8787 void 8788 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 8789 void *cb_arg, bool enable) 8790 { 8791 struct spdk_bdev_histogram_ctx *ctx; 8792 8793 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 8794 if (ctx == NULL) { 8795 cb_fn(cb_arg, -ENOMEM); 8796 return; 8797 } 8798 8799 ctx->bdev = bdev; 8800 ctx->status = 0; 8801 ctx->cb_fn = cb_fn; 8802 ctx->cb_arg = cb_arg; 8803 8804 spdk_spin_lock(&bdev->internal.spinlock); 8805 if (bdev->internal.histogram_in_progress) { 8806 spdk_spin_unlock(&bdev->internal.spinlock); 8807 free(ctx); 8808 cb_fn(cb_arg, -EAGAIN); 8809 return; 8810 } 8811 8812 bdev->internal.histogram_in_progress = true; 8813 spdk_spin_unlock(&bdev->internal.spinlock); 8814 8815 bdev->internal.histogram_enabled = enable; 8816 8817 if (enable) { 8818 /* Allocate histogram for each channel */ 8819 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 8820 bdev_histogram_enable_channel_cb); 8821 } else { 8822 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 8823 bdev_histogram_disable_channel_cb); 8824 } 8825 } 8826 8827 struct spdk_bdev_histogram_data_ctx { 8828 spdk_bdev_histogram_data_cb cb_fn; 8829 void *cb_arg; 8830 struct spdk_bdev *bdev; 8831 /** merged histogram data from all channels */ 8832 struct spdk_histogram_data *histogram; 8833 }; 8834 8835 static void 8836 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8837 { 8838 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 8839 8840 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 8841 free(ctx); 8842 } 8843 8844 static void 8845 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8846 struct spdk_io_channel *_ch, void *_ctx) 8847 { 8848 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8849 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 8850 int status = 0; 8851 8852 if (ch->histogram == NULL) { 8853 status = -EFAULT; 8854 } else { 8855 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 8856 } 8857 8858 spdk_bdev_for_each_channel_continue(i, status); 8859 } 8860 8861 void 8862 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 8863 spdk_bdev_histogram_data_cb cb_fn, 8864 void *cb_arg) 8865 { 
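	/* Merge the per-channel histograms into the caller-provided histogram and
	 * report completion through cb_fn. A typical caller allocates the target
	 * with spdk_histogram_data_alloc() and frees it after the callback runs,
	 * e.g. (a minimal sketch; histogram_done_cb is a placeholder):
	 *
	 *	struct spdk_histogram_data *h = spdk_histogram_data_alloc();
	 *
	 *	spdk_bdev_histogram_get(bdev, h, histogram_done_cb, h);
	 */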
8866 struct spdk_bdev_histogram_data_ctx *ctx; 8867 8868 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 8869 if (ctx == NULL) { 8870 cb_fn(cb_arg, -ENOMEM, NULL); 8871 return; 8872 } 8873 8874 ctx->bdev = bdev; 8875 ctx->cb_fn = cb_fn; 8876 ctx->cb_arg = cb_arg; 8877 8878 ctx->histogram = histogram; 8879 8880 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 8881 bdev_histogram_get_channel_cb); 8882 } 8883 8884 void 8885 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 8886 void *cb_arg) 8887 { 8888 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8889 int status = 0; 8890 8891 assert(cb_fn != NULL); 8892 8893 if (bdev_ch->histogram == NULL) { 8894 status = -EFAULT; 8895 } 8896 cb_fn(cb_arg, status, bdev_ch->histogram); 8897 } 8898 8899 size_t 8900 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 8901 size_t max_events) 8902 { 8903 struct media_event_entry *entry; 8904 size_t num_events = 0; 8905 8906 for (; num_events < max_events; ++num_events) { 8907 entry = TAILQ_FIRST(&desc->pending_media_events); 8908 if (entry == NULL) { 8909 break; 8910 } 8911 8912 events[num_events] = entry->event; 8913 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 8914 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 8915 } 8916 8917 return num_events; 8918 } 8919 8920 int 8921 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 8922 size_t num_events) 8923 { 8924 struct spdk_bdev_desc *desc; 8925 struct media_event_entry *entry; 8926 size_t event_id; 8927 int rc = 0; 8928 8929 assert(bdev->media_events); 8930 8931 spdk_spin_lock(&bdev->internal.spinlock); 8932 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 8933 if (desc->write) { 8934 break; 8935 } 8936 } 8937 8938 if (desc == NULL || desc->media_events_buffer == NULL) { 8939 rc = -ENODEV; 8940 goto out; 8941 } 8942 8943 for (event_id = 0; event_id < num_events; ++event_id) { 8944 entry = TAILQ_FIRST(&desc->free_media_events); 8945 if (entry == NULL) { 8946 break; 8947 } 8948 8949 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 8950 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 8951 entry->event = events[event_id]; 8952 } 8953 8954 rc = event_id; 8955 out: 8956 spdk_spin_unlock(&bdev->internal.spinlock); 8957 return rc; 8958 } 8959 8960 static void 8961 _media_management_notify(void *arg) 8962 { 8963 struct spdk_bdev_desc *desc = arg; 8964 8965 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 8966 } 8967 8968 void 8969 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 8970 { 8971 struct spdk_bdev_desc *desc; 8972 8973 spdk_spin_lock(&bdev->internal.spinlock); 8974 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 8975 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 8976 event_notify(desc, _media_management_notify); 8977 } 8978 } 8979 spdk_spin_unlock(&bdev->internal.spinlock); 8980 } 8981 8982 struct locked_lba_range_ctx { 8983 struct lba_range range; 8984 struct spdk_bdev *bdev; 8985 struct lba_range *current_range; 8986 struct lba_range *owner_range; 8987 struct spdk_poller *poller; 8988 lock_range_cb cb_fn; 8989 void *cb_arg; 8990 }; 8991 8992 static void 8993 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8994 { 8995 struct locked_lba_range_ctx *ctx = _ctx; 8996 8997 ctx->cb_fn(ctx->cb_arg, -ENOMEM); 8998 free(ctx); 8999 } 9000 9001 static void bdev_unlock_lba_range_get_channel(struct 
spdk_bdev_channel_iter *i, 9002 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 9003 9004 static void 9005 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9006 { 9007 struct locked_lba_range_ctx *ctx = _ctx; 9008 9009 if (status == -ENOMEM) { 9010 /* One of the channels could not allocate a range object. 9011 * So we have to go back and clean up any ranges that were 9012 * allocated successfully before we return error status to 9013 * the caller. We can reuse the unlock function to do that 9014 * clean up. 9015 */ 9016 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9017 bdev_lock_error_cleanup_cb); 9018 return; 9019 } 9020 9021 /* All channels have locked this range and no I/O overlapping the range 9022 * are outstanding! Set the owner_ch for the range object for the 9023 * locking channel, so that this channel will know that it is allowed 9024 * to write to this range. 9025 */ 9026 ctx->owner_range->owner_ch = ctx->range.owner_ch; 9027 ctx->cb_fn(ctx->cb_arg, status); 9028 9029 /* Don't free the ctx here. Its range is in the bdev's global list of 9030 * locked ranges still, and will be removed and freed when this range 9031 * is later unlocked. 9032 */ 9033 } 9034 9035 static int 9036 bdev_lock_lba_range_check_io(void *_i) 9037 { 9038 struct spdk_bdev_channel_iter *i = _i; 9039 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 9040 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9041 struct locked_lba_range_ctx *ctx = i->ctx; 9042 struct lba_range *range = ctx->current_range; 9043 struct spdk_bdev_io *bdev_io; 9044 9045 spdk_poller_unregister(&ctx->poller); 9046 9047 /* The range is now in the locked_ranges, so no new IO can be submitted to this 9048 * range. But we need to wait until any outstanding IO overlapping with this range 9049 * are completed. 9050 */ 9051 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 9052 if (bdev_io_range_is_locked(bdev_io, range)) { 9053 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 9054 return SPDK_POLLER_BUSY; 9055 } 9056 } 9057 9058 spdk_bdev_for_each_channel_continue(i, 0); 9059 return SPDK_POLLER_BUSY; 9060 } 9061 9062 static void 9063 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9064 struct spdk_io_channel *_ch, void *_ctx) 9065 { 9066 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9067 struct locked_lba_range_ctx *ctx = _ctx; 9068 struct lba_range *range; 9069 9070 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9071 if (range->length == ctx->range.length && 9072 range->offset == ctx->range.offset && 9073 range->locked_ctx == ctx->range.locked_ctx) { 9074 /* This range already exists on this channel, so don't add 9075 * it again. This can happen when a new channel is created 9076 * while the for_each_channel operation is in progress. 9077 * Do not check for outstanding I/O in that case, since the 9078 * range was locked before any I/O could be submitted to the 9079 * new channel. 
9080 */ 9081 spdk_bdev_for_each_channel_continue(i, 0); 9082 return; 9083 } 9084 } 9085 9086 range = calloc(1, sizeof(*range)); 9087 if (range == NULL) { 9088 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 9089 return; 9090 } 9091 9092 range->length = ctx->range.length; 9093 range->offset = ctx->range.offset; 9094 range->locked_ctx = ctx->range.locked_ctx; 9095 ctx->current_range = range; 9096 if (ctx->range.owner_ch == ch) { 9097 /* This is the range object for the channel that will hold 9098 * the lock. Store it in the ctx object so that we can easily 9099 * set its owner_ch after the lock is finally acquired. 9100 */ 9101 ctx->owner_range = range; 9102 } 9103 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 9104 bdev_lock_lba_range_check_io(i); 9105 } 9106 9107 static void 9108 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 9109 { 9110 assert(spdk_get_thread() == spdk_io_channel_get_thread(ctx->range.owner_ch->channel)); 9111 9112 /* We will add a copy of this range to each channel now. */ 9113 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 9114 bdev_lock_lba_range_cb); 9115 } 9116 9117 static bool 9118 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 9119 { 9120 struct lba_range *r; 9121 9122 TAILQ_FOREACH(r, tailq, tailq) { 9123 if (bdev_lba_range_overlapped(range, r)) { 9124 return true; 9125 } 9126 } 9127 return false; 9128 } 9129 9130 static int 9131 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 9132 uint64_t offset, uint64_t length, 9133 lock_range_cb cb_fn, void *cb_arg) 9134 { 9135 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9136 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9137 struct locked_lba_range_ctx *ctx; 9138 9139 if (cb_arg == NULL) { 9140 SPDK_ERRLOG("cb_arg must not be NULL\n"); 9141 return -EINVAL; 9142 } 9143 9144 ctx = calloc(1, sizeof(*ctx)); 9145 if (ctx == NULL) { 9146 return -ENOMEM; 9147 } 9148 9149 ctx->range.offset = offset; 9150 ctx->range.length = length; 9151 ctx->range.owner_ch = ch; 9152 ctx->range.locked_ctx = cb_arg; 9153 ctx->bdev = bdev; 9154 ctx->cb_fn = cb_fn; 9155 ctx->cb_arg = cb_arg; 9156 9157 spdk_spin_lock(&bdev->internal.spinlock); 9158 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 9159 /* There is an active lock overlapping with this range. 9160 * Put it on the pending list until this range no 9161 * longer overlaps with another. 9162 */ 9163 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 9164 } else { 9165 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 9166 bdev_lock_lba_range_ctx(bdev, ctx); 9167 } 9168 spdk_spin_unlock(&bdev->internal.spinlock); 9169 return 0; 9170 } 9171 9172 static void 9173 bdev_lock_lba_range_ctx_msg(void *_ctx) 9174 { 9175 struct locked_lba_range_ctx *ctx = _ctx; 9176 9177 bdev_lock_lba_range_ctx(ctx->bdev, ctx); 9178 } 9179 9180 static void 9181 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9182 { 9183 struct locked_lba_range_ctx *ctx = _ctx; 9184 struct locked_lba_range_ctx *pending_ctx; 9185 struct lba_range *range, *tmp; 9186 9187 spdk_spin_lock(&bdev->internal.spinlock); 9188 /* Check if there are any pending locked ranges that overlap with this range 9189 * that was just unlocked. If there are, check that it doesn't overlap with any 9190 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 9191 * the lock process. 
9192 */ 9193 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 9194 if (bdev_lba_range_overlapped(range, &ctx->range) && 9195 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 9196 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 9197 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 9198 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 9199 spdk_thread_send_msg(spdk_io_channel_get_thread(pending_ctx->range.owner_ch->channel), 9200 bdev_lock_lba_range_ctx_msg, pending_ctx); 9201 } 9202 } 9203 spdk_spin_unlock(&bdev->internal.spinlock); 9204 9205 ctx->cb_fn(ctx->cb_arg, status); 9206 free(ctx); 9207 } 9208 9209 static void 9210 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9211 struct spdk_io_channel *_ch, void *_ctx) 9212 { 9213 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9214 struct locked_lba_range_ctx *ctx = _ctx; 9215 TAILQ_HEAD(, spdk_bdev_io) io_locked; 9216 struct spdk_bdev_io *bdev_io; 9217 struct lba_range *range; 9218 9219 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9220 if (ctx->range.offset == range->offset && 9221 ctx->range.length == range->length && 9222 ctx->range.locked_ctx == range->locked_ctx) { 9223 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 9224 free(range); 9225 break; 9226 } 9227 } 9228 9229 /* Note: we should almost always be able to assert that the range specified 9230 * was found. But there are some very rare corner cases where a new channel 9231 * gets created simultaneously with a range unlock, where this function 9232 * would execute on that new channel and wouldn't have the range. 9233 * We also use this to clean up range allocations when a later allocation 9234 * fails in the locking path. 9235 * So we can't actually assert() here. 9236 */ 9237 9238 /* Swap the locked IO into a temporary list, and then try to submit them again. 9239 * We could hyper-optimize this to only resubmit locked I/O that overlap 9240 * with the range that was just unlocked, but this isn't a performance path so 9241 * we go for simplicity here. 9242 */ 9243 TAILQ_INIT(&io_locked); 9244 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 9245 while (!TAILQ_EMPTY(&io_locked)) { 9246 bdev_io = TAILQ_FIRST(&io_locked); 9247 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 9248 bdev_io_submit(bdev_io); 9249 } 9250 9251 spdk_bdev_for_each_channel_continue(i, 0); 9252 } 9253 9254 static int 9255 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 9256 uint64_t offset, uint64_t length, 9257 lock_range_cb cb_fn, void *cb_arg) 9258 { 9259 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9260 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9261 struct locked_lba_range_ctx *ctx; 9262 struct lba_range *range; 9263 bool range_found = false; 9264 9265 /* Let's make sure the specified channel actually has a lock on 9266 * the specified range. Note that the range must match exactly. 9267 */ 9268 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9269 if (range->offset == offset && range->length == length && 9270 range->owner_ch == ch && range->locked_ctx == cb_arg) { 9271 range_found = true; 9272 break; 9273 } 9274 } 9275 9276 if (!range_found) { 9277 return -EINVAL; 9278 } 9279 9280 spdk_spin_lock(&bdev->internal.spinlock); 9281 /* We confirmed that this channel has locked the specified range. 
To
9282 * start the unlock process, we find the range in the bdev's locked_ranges
9283 * and remove it. This ensures new channels don't inherit the locked range.
9284 * Then we will send a message to each channel (including the one specified
9285 * here) to remove the range from its per-channel list.
9286 */
9287 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
9288 if (range->offset == offset && range->length == length &&
9289 range->locked_ctx == cb_arg) {
9290 break;
9291 }
9292 }
9293 if (range == NULL) {
9294 assert(false);
9295 spdk_spin_unlock(&bdev->internal.spinlock);
9296 return -EINVAL;
9297 }
9298 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq);
9299 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
9300 spdk_spin_unlock(&bdev->internal.spinlock);
9301
9302 ctx->cb_fn = cb_fn;
9303 ctx->cb_arg = cb_arg;
9304
9305 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
9306 bdev_unlock_lba_range_cb);
9307 return 0;
9308 }
9309
9310 int
9311 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains,
9312 int array_size)
9313 {
9314 if (!bdev) {
9315 return -EINVAL;
9316 }
9317
9318 if (bdev->fn_table->get_memory_domains) {
9319 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size);
9320 }
9321
9322 return 0;
9323 }
9324
9325 struct spdk_bdev_for_each_io_ctx {
9326 void *ctx;
9327 spdk_bdev_io_fn fn;
9328 spdk_bdev_for_each_io_cb cb;
9329 };
9330
9331 static void
9332 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
9333 struct spdk_io_channel *io_ch, void *_ctx)
9334 {
9335 struct spdk_bdev_for_each_io_ctx *ctx = _ctx;
9336 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
9337 struct spdk_bdev_io *bdev_io;
9338 int rc = 0;
9339
9340 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
9341 rc = ctx->fn(ctx->ctx, bdev_io);
9342 if (rc != 0) {
9343 break;
9344 }
9345 }
9346
9347 spdk_bdev_for_each_channel_continue(i, rc);
9348 }
9349
9350 static void
9351 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
9352 {
9353 struct spdk_bdev_for_each_io_ctx *ctx = _ctx;
9354
9355 ctx->cb(ctx->ctx, status);
9356
9357 free(ctx);
9358 }
9359
9360 void
9361 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn,
9362 spdk_bdev_for_each_io_cb cb)
9363 {
9364 struct spdk_bdev_for_each_io_ctx *ctx;
9365
9366 assert(fn != NULL && cb != NULL);
9367
9368 ctx = calloc(1, sizeof(*ctx));
9369 if (ctx == NULL) {
9370 SPDK_ERRLOG("Failed to allocate context.\n");
9371 cb(_ctx, -ENOMEM);
9372 return;
9373 }
9374
9375 ctx->ctx = _ctx;
9376 ctx->fn = fn;
9377 ctx->cb = cb;
9378
9379 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx,
9380 bdev_for_each_io_done);
9381 }
9382
9383 void
9384 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status)
9385 {
9386 spdk_for_each_channel_continue(iter->i, status);
9387 }
9388
9389 static struct spdk_bdev *
9390 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i)
9391 {
9392 void *io_device = spdk_io_channel_iter_get_io_device(i);
9393
9394 return __bdev_from_io_dev(io_device);
9395 }
9396
9397 static void
9398 bdev_each_channel_msg(struct spdk_io_channel_iter *i)
9399 {
9400 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
9401 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);
9402 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);
9403
9404 iter->i = i;
9405
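/* Invoke the per-channel callback. It must eventually call
 * spdk_bdev_for_each_channel_continue() with the iterator stashed above
 * to advance to the next channel.
 */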
iter->fn(iter, bdev, ch, iter->ctx); 9406 } 9407 9408 static void 9409 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 9410 { 9411 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 9412 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 9413 9414 iter->i = i; 9415 iter->cpl(bdev, iter->ctx, status); 9416 9417 free(iter); 9418 } 9419 9420 void 9421 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 9422 void *ctx, spdk_bdev_for_each_channel_done cpl) 9423 { 9424 struct spdk_bdev_channel_iter *iter; 9425 9426 assert(bdev != NULL && fn != NULL && ctx != NULL); 9427 9428 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 9429 if (iter == NULL) { 9430 SPDK_ERRLOG("Unable to allocate iterator\n"); 9431 assert(false); 9432 return; 9433 } 9434 9435 iter->fn = fn; 9436 iter->cpl = cpl; 9437 iter->ctx = ctx; 9438 9439 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 9440 iter, bdev_each_channel_cpl); 9441 } 9442 9443 static void 9444 bdev_copy_do_write_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9445 { 9446 struct spdk_bdev_io *parent_io = cb_arg; 9447 9448 /* Check return status of write */ 9449 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 9450 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 9451 spdk_bdev_free_io(bdev_io); 9452 } 9453 9454 static void 9455 bdev_copy_do_write(void *_bdev_io) 9456 { 9457 struct spdk_bdev_io *bdev_io = _bdev_io; 9458 int rc; 9459 9460 /* Write blocks */ 9461 rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc, 9462 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs[0].iov_base, 9463 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks, 9464 bdev_io->u.bdev.num_blocks, bdev_copy_do_write_complete, bdev_io); 9465 9466 if (rc == -ENOMEM) { 9467 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write); 9468 } else if (rc != 0) { 9469 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 9470 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 9471 } 9472 } 9473 9474 static void 9475 bdev_copy_do_read_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9476 { 9477 struct spdk_bdev_io *parent_io = cb_arg; 9478 9479 /* Check return status of read */ 9480 if (!success) { 9481 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 9482 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 9483 spdk_bdev_free_io(bdev_io); 9484 return; 9485 } 9486 9487 spdk_bdev_free_io(bdev_io); 9488 9489 /* Do write */ 9490 bdev_copy_do_write(parent_io); 9491 } 9492 9493 static void 9494 bdev_copy_do_read(void *_bdev_io) 9495 { 9496 struct spdk_bdev_io *bdev_io = _bdev_io; 9497 int rc; 9498 9499 /* Read blocks */ 9500 rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc, 9501 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs[0].iov_base, 9502 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks, 9503 bdev_io->u.bdev.num_blocks, bdev_copy_do_read_complete, bdev_io); 9504 9505 if (rc == -ENOMEM) { 9506 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read); 9507 } else if (rc != 0) { 9508 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 9509 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 9510 } 9511 } 9512 9513 static void 9514 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 9515 { 9516 if (!success) { 9517 
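/* Could not get a bounce buffer for the copy fallback; fail the copy I/O. */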
bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
9518 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
9519 return;
9520 }
9521
9522 bdev_copy_do_read(bdev_io);
9523 }
9524
9525 int
9526 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
9527 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks,
9528 spdk_bdev_io_completion_cb cb, void *cb_arg)
9529 {
9530 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
9531 struct spdk_bdev_io *bdev_io;
9532 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
9533
9534 if (!desc->write) {
9535 return -EBADF;
9536 }
9537
9538 if (num_blocks == 0) {
9539 SPDK_ERRLOG("Can't copy 0 blocks\n");
9540 return -EINVAL;
9541 }
9542
9543 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) ||
9544 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) {
9545 SPDK_DEBUGLOG(bdev,
9546 "Invalid offset or number of blocks: dst %" PRIu64 ", src %" PRIu64 ", count %" PRIu64 "\n",
9547 dst_offset_blocks, src_offset_blocks, num_blocks);
9548 return -EINVAL;
9549 }
9550
9551 bdev_io = bdev_channel_get_io(channel);
9552 if (!bdev_io) {
9553 return -ENOMEM;
9554 }
9555
9556 bdev_io->internal.ch = channel;
9557 bdev_io->internal.desc = desc;
9558 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY;
9559
9560 bdev_io->u.bdev.offset_blocks = dst_offset_blocks;
9561 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks;
9562 bdev_io->u.bdev.num_blocks = num_blocks;
9563 bdev_io->u.bdev.memory_domain = NULL;
9564 bdev_io->u.bdev.memory_domain_ctx = NULL;
9565 bdev_io->u.bdev.iovs = NULL;
9566 bdev_io->u.bdev.iovcnt = 0;
9567 bdev_io->u.bdev.md_buf = NULL;
9568 bdev_io->u.bdev.accel_sequence = NULL;
9569 bdev_io_init(bdev_io, bdev, cb_arg, cb);
9570
9571 if (dst_offset_blocks == src_offset_blocks) {
9572 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
9573 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx);
9574
9575 return 0;
9576 }
9577
9578 /* If the backing bdev supports the copy command directly, pass the I/O down to it.
9579 * Otherwise, do the general processing in the bdev layer.
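 * The general path gets a bounce buffer covering the whole range, reads the source
 * blocks into it, and then writes them to the destination, re-queueing on the bdev
 * I/O wait queue if either step returns -ENOMEM.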
9580 */ 9581 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 9582 bdev_io_submit(bdev_io); 9583 return 0; 9584 } 9585 9586 spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev)); 9587 9588 return 0; 9589 } 9590 9591 SPDK_LOG_REGISTER_COMPONENT(bdev) 9592 9593 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 9594 { 9595 struct spdk_trace_tpoint_opts opts[] = { 9596 { 9597 "BDEV_IO_START", TRACE_BDEV_IO_START, 9598 OWNER_BDEV, OBJECT_BDEV_IO, 1, 9599 { 9600 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 9601 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 9602 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 9603 { "len", SPDK_TRACE_ARG_TYPE_INT, 8 }, 9604 { "name", SPDK_TRACE_ARG_TYPE_STR, 40} 9605 } 9606 }, 9607 { 9608 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 9609 OWNER_BDEV, OBJECT_BDEV_IO, 0, 9610 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 9611 }, 9612 { 9613 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 9614 OWNER_BDEV, OBJECT_NONE, 1, 9615 { 9616 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 9617 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 9618 } 9619 }, 9620 { 9621 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 9622 OWNER_BDEV, OBJECT_NONE, 0, 9623 { 9624 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 9625 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 9626 } 9627 }, 9628 }; 9629 9630 9631 spdk_trace_register_owner(OWNER_BDEV, 'b'); 9632 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 9633 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 9634 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 9635 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 9636 } 9637
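/*
 * Illustrative sketch only (not compiled as part of the library): one way an
 * application might submit a copy with spdk_bdev_copy_blocks() and retry through
 * the bdev I/O wait queue when the bdev_io pool is exhausted. The copy_job
 * structure, the submit_copy()/copy_complete() helpers, and the offsets/block
 * count are hypothetical examples; the descriptor is assumed to have been opened
 * read-write and the channel obtained on the calling thread. Only the SPDK calls
 * themselves are existing API.
 *
 *	struct copy_job {
 *		struct spdk_bdev_desc		*desc;
 *		struct spdk_io_channel		*ch;
 *		struct spdk_bdev_io_wait_entry	wait_entry;
 *	};
 *
 *	static void
 *	copy_complete(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		// Runs on the submitting thread once the copy (or its fallback) finishes.
 *		SPDK_NOTICELOG("copy %s\n", success ? "succeeded" : "failed");
 *		spdk_bdev_free_io(bdev_io);
 *	}
 *
 *	static void
 *	submit_copy(void *arg)
 *	{
 *		struct copy_job *job = arg;
 *		int rc;
 *
 *		// Copy 8 blocks from LBA 0 to LBA 1024 (arbitrary example values).
 *		rc = spdk_bdev_copy_blocks(job->desc, job->ch, 1024, 0, 8,
 *					   copy_complete, job);
 *		if (rc == -ENOMEM) {
 *			// No bdev_io available right now; retry when one is freed.
 *			job->wait_entry.bdev = spdk_bdev_desc_get_bdev(job->desc);
 *			job->wait_entry.cb_fn = submit_copy;
 *			job->wait_entry.cb_arg = job;
 *			spdk_bdev_queue_io_wait(job->wait_entry.bdev, job->ch,
 *						&job->wait_entry);
 *		} else if (rc != 0) {
 *			SPDK_ERRLOG("copy submission failed: %d\n", rc);
 *		}
 *	}
 */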