/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/accel.h"
#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"
#include "spdk/dma.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"
#include "spdk_internal/assert.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define SPDK_BDEV_AUTO_EXAMINE			true
#define BUF_SMALL_POOL_SIZE			8191
#define BUF_LARGE_POOL_SIZE			1023
#define BUF_SMALL_CACHE_SIZE			128
#define BUF_LARGE_CACHE_SIZE			16
#define NOMEM_THRESHOLD_COUNT			8

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000

/* The maximum number of children requests for a UNMAP or WRITE ZEROES command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000
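
/*
 * Illustrative note (added commentary, not upstream text): the QoS limits are configured
 * per second, while the QoS poller accounts for them in timeslices of
 * SPDK_BDEV_QOS_TIMESLICE_IN_USEC (1000 usec, i.e. 1 ms, so 1000 timeslices per second).
 * As a rough sketch of the arithmetic, a limit of 10000 rw_ios_per_sec corresponds to
 * roughly 10000 / 1000 = 10 I/O allowed per timeslice, never less than
 * SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE.
 */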

/* The maximum number of children requests for a COPY command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8)

#define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \
	log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev)
#ifdef DEBUG
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \
	log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev)
#else
#define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0)
#endif

static void log_already_claimed(enum spdk_log_level level, const int line, const char *func,
				const char *detail, struct spdk_bdev *bdev);

SPDK_LOG_DEPRECATION_REGISTER(vtune_support, "Intel(R) VTune integration", "SPDK 23.05", 0);

static const char *qos_rpc_type[] = {"rw_ios_per_sec",
				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
				    };

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

RB_HEAD(bdev_name_tree, spdk_bdev_name);

static int
bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
{
	return strcmp(name1->name, name2->name);
}

RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	void *zero_buffer;

	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

	struct spdk_bdev_list bdevs;
	struct bdev_name_tree bdev_names;

	bool init_complete;
	bool module_init_complete;

	struct spdk_spinlock spinlock;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain *domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
	.init_complete = false,
	.module_init_complete = false,
};

static void
__attribute__((constructor))
_bdev_init(void)
{
	spdk_spin_init(&g_bdev_mgr.spinlock);
}

typedef void (*lock_range_cb)(void *ctx, int status);

typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);

struct lba_range {
	uint64_t			offset;
	uint64_t			length;
	void				*locked_ctx;
	struct spdk_bdev_channel	*owner_ch;
	TAILQ_ENTRY(lba_range)		tailq;
};

static struct spdk_bdev_opts g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
	.small_buf_pool_size = BUF_SMALL_POOL_SIZE,
	.large_buf_pool_size = BUF_LARGE_POOL_SIZE,
};

static spdk_bdev_init_cb	g_init_cb_fn = NULL;
static void			*g_init_cb_arg = NULL;

static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
static void			*g_fini_cb_arg = NULL;
static struct spdk_thread	*g_fini_thread = NULL;

struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
	 * For remaining bytes, allowed to run negative if an I/O is submitted when
	 * some bytes are remaining, but the I/O is bigger than that amount. The
	 * excess will be deducted from the next timeslice.
	 */
	int64_t remaining_this_timeslice;

	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO. */
	bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to update for the submitted IO. */
	void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

struct spdk_bdev_qos {
	/** Types of structure of rate limits. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Queue of I/O waiting to be issued. */
	bdev_io_tailq_t queued;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache. Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t per_thread_cache_count;
	uint32_t bdev_io_cache_size;

	struct spdk_iobuf_channel iobuf;

	TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * will queue here their IO that awaits retry. It makes it possible to retry sending
 * IO to one bdev after IO from other bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel *shared_ch;

	/* Refcount of bdev channels using this resource */
	uint32_t ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev *bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel *channel;

	/* Accel channel */
	struct spdk_io_channel *accel_channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat *stat;

	/*
	 * Count of I/O submitted to the underlying dev module through this channel
	 * and waiting for completion.
	 */
	uint64_t io_outstanding;

	/*
	 * List of all submitted I/Os including I/O that are generated via splitting.
	 */
	bdev_io_tailq_t io_submitted;

	/*
	 * List of spdk_bdev_io that are currently queued because they write to a locked
	 * LBA range.
	 */
	bdev_io_tailq_t io_locked;

	/* List of I/Os with accel sequence being currently executed */
	bdev_io_tailq_t io_accel_exec;

	/* List of I/Os doing memory domain pull/push */
	bdev_io_tailq_t io_memory_domain;

	uint32_t flags;

	struct spdk_histogram_data *histogram;

#ifdef SPDK_CONFIG_VTUNE
	uint64_t		start_tsc;
	uint64_t		interval_tsc;
	__itt_string_handle	*handle;
	struct spdk_bdev_io_stat *prev_stat;
#endif

	bdev_io_tailq_t queued_resets;

	lba_range_tailq_t locked_ranges;
};

struct media_event_entry {
	struct spdk_bdev_media_event	event;
	TAILQ_ENTRY(media_event_entry)	tailq;
};

#define MEDIA_EVENT_POOL_SIZE 64

struct spdk_bdev_desc {
	struct spdk_bdev		*bdev;
	struct spdk_thread		*thread;
	struct {
		spdk_bdev_event_cb_t event_fn;
		void *ctx;
	} callback;
	bool				closed;
	bool				write;
	bool				memory_domains_supported;
	bool				accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES];
	struct spdk_spinlock		spinlock;
	uint32_t			refs;
	TAILQ_HEAD(, media_event_entry)	pending_media_events;
	TAILQ_HEAD(, media_event_entry)	free_media_events;
	struct media_event_entry	*media_events_buffer;
	TAILQ_ENTRY(spdk_bdev_desc)	link;

	uint64_t			timeout_in_sec;
	spdk_bdev_io_timeout_cb		cb_fn;
	void				*cb_arg;
	struct spdk_poller		*io_timeout_poller;
	struct spdk_bdev_module_claim	*claim;
};

struct spdk_bdev_iostat_ctx {
	struct spdk_bdev_io_stat *stat;
	spdk_bdev_get_device_stat_cb cb;
	void *cb_arg;
};

struct set_qos_limit_ctx {
	void (*cb_fn)(void *cb_arg, int status);
	void *cb_arg;
	struct spdk_bdev *bdev;
};

struct spdk_bdev_channel_iter {
	spdk_bdev_for_each_channel_msg fn;
	spdk_bdev_for_each_channel_done cpl;
	struct spdk_io_channel_iter *i;
	void *ctx;
};

struct spdk_bdev_io_error_stat {
	uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS];
};

enum bdev_io_retry_state {
	BDEV_IO_RETRY_STATE_INVALID,
	BDEV_IO_RETRY_STATE_PULL,
	BDEV_IO_RETRY_STATE_PULL_MD,
	BDEV_IO_RETRY_STATE_SUBMIT,
	BDEV_IO_RETRY_STATE_PUSH,
	BDEV_IO_RETRY_STATE_PUSH_MD,
};

#define __bdev_to_io_dev(bdev)		(((char *)bdev) + 1)
#define __bdev_from_io_dev(io_dev)	((struct spdk_bdev *)(((char *)io_dev) - 1))
#define __io_ch_to_bdev_ch(io_ch)	((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch))
#define __io_ch_to_bdev_mgmt_ch(io_ch)	((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch))
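
/*
 * Explanatory note (added commentary, not upstream text): __bdev_to_io_dev() produces the key
 * used when the bdev is registered as an io_device by offsetting the bdev pointer by one byte,
 * and __bdev_from_io_dev() undoes that offset. The offset presumably exists so the bdev layer's
 * io_device key can never collide with the spdk_bdev pointer itself, which a bdev module may
 * already be using as its own io_device.
 */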

static inline void bdev_io_complete(void *ctx);
static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io);
static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io);
static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io);

static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg);
static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io);

static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				struct spdk_io_channel *ch, void *_ctx);
static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status);

static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				     struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks,
				     uint64_t num_blocks,
				     struct spdk_memory_domain *domain, void *domain_ctx,
				     struct spdk_accel_sequence *seq,
				     spdk_bdev_io_completion_cb cb, void *cb_arg);
static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
				      struct iovec *iov, int iovcnt, void *md_buf,
				      uint64_t offset_blocks, uint64_t num_blocks,
				      struct spdk_memory_domain *domain, void *domain_ctx,
				      struct spdk_accel_sequence *seq,
				      spdk_bdev_io_completion_cb cb, void *cb_arg);

static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
			       uint64_t offset, uint64_t length,
			       lock_range_cb cb_fn, void *cb_arg);

static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
				 uint64_t offset, uint64_t length,
				 lock_range_cb cb_fn, void *cb_arg);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);

static bool claim_type_is_v2(enum spdk_bdev_claim_type type);
static void bdev_desc_release_claims(struct spdk_bdev_desc *desc);
static void claim_reset(struct spdk_bdev *bdev);

static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch);

#define bdev_get_ext_io_opt(opts, field, defval) \
	(((opts) != NULL && offsetof(struct spdk_bdev_ext_io_opts, field) + \
	  sizeof((opts)->field) <= sizeof(*(opts))) ? (opts)->field : (defval))

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(small_buf_pool_size);
	SET_FIELD(large_buf_pool_size);

	/* Do not remove this statement, you should always update this statement when you add a new field,
	 * and do not forget to add the SET_FIELD statement for your added field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}

SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_small_buf_pool_size, "spdk_bdev_opts.small_buf_pool_size",
			      "v23.05", 0);
SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_large_buf_pool_size, "spdk_bdev_opts.large_buf_pool_size",
			      "v23.05", 0);
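
/*
 * Usage sketch (added commentary, not upstream code): a typical caller overrides the bdev
 * options before subsystem initialization by reading the defaults first, so that any fields
 * it does not touch keep their current values. Note that spdk_bdev_set_opts() below rejects a
 * bdev_io_pool_size smaller than bdev_io_cache_size * (spdk_thread_get_count() + 1); e.g. with
 * the default cache of 256 and 4 threads the pool must hold at least 256 * 5 = 1280 entries.
 *
 *	struct spdk_bdev_opts opts = {};
 *
 *	spdk_bdev_get_opts(&opts, sizeof(opts));
 *	opts.bdev_auto_examine = false;
 *	if (spdk_bdev_set_opts(&opts) != 0) {
 *		// handle invalid options
 *	}
 */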

int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	struct spdk_iobuf_opts iobuf_opts;
	uint32_t min_pool_size;
	int rc;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
	 */
	min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1);
	if (opts->bdev_io_pool_size < min_pool_size) {
		SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32
			    " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size,
			    spdk_thread_get_count());
		SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size);
		return -1;
	}

	if (opts->small_buf_pool_size != BUF_SMALL_POOL_SIZE) {
		SPDK_LOG_DEPRECATED(bdev_opts_small_buf_pool_size);
	}
	if (opts->large_buf_pool_size != BUF_LARGE_POOL_SIZE) {
		SPDK_LOG_DEPRECATED(bdev_opts_large_buf_pool_size);
	}

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \
		g_bdev_opts.field = opts->field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(small_buf_pool_size);
	SET_FIELD(large_buf_pool_size);

	spdk_iobuf_get_opts(&iobuf_opts);
	iobuf_opts.small_pool_count = opts->small_buf_pool_size;
	iobuf_opts.large_pool_count = opts->large_buf_pool_size;

	rc = spdk_iobuf_set_opts(&iobuf_opts);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to set iobuf opts\n");
		return -1;
	}

	g_bdev_opts.opts_size = opts->opts_size;

#undef SET_FIELD

	return 0;
}

static struct spdk_bdev *
bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev_name find;
	struct spdk_bdev_name *res;

	find.name = (char *)bdev_name;
	res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find);
	if (res != NULL) {
		return res->bdev;
	}

	return NULL;
}

struct spdk_bdev *
spdk_bdev_get_by_name(const char *bdev_name)
{
	struct spdk_bdev *bdev;

	spdk_spin_lock(&g_bdev_mgr.spinlock);
	bdev = bdev_get_by_name(bdev_name);
	spdk_spin_unlock(&g_bdev_mgr.spinlock);

	return bdev;
}

struct bdev_io_status_string {
	enum spdk_bdev_io_status status;
	const char *str;
};

static const struct bdev_io_status_string bdev_io_status_strings[] = {
	{ SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" },
	{ SPDK_BDEV_IO_STATUS_ABORTED, "aborted" },
	{ SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" },
	{ SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" },
	{ SPDK_BDEV_IO_STATUS_NOMEM, "nomem" },
	{ SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" },
	{ SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" },
	{ SPDK_BDEV_IO_STATUS_FAILED, "failed" },
	{ SPDK_BDEV_IO_STATUS_PENDING, "pending" },
	{ SPDK_BDEV_IO_STATUS_SUCCESS, "success" },
};

static const char *
bdev_io_status_get_string(enum spdk_bdev_io_status status)
{
	uint32_t i;

	for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) {
		if (bdev_io_status_strings[i].status == status) {
			return bdev_io_status_strings[i].str;
		}
	}

	return "reserved";
}

struct spdk_bdev_wait_for_examine_ctx {
	struct spdk_poller		*poller;
	spdk_bdev_wait_for_examine_cb	cb_fn;
	void				*cb_arg;
};

static bool bdev_module_all_actions_completed(void);

static int
bdev_wait_for_examine_cb(void *arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx = arg;

	if (!bdev_module_all_actions_completed()) {
		return SPDK_POLLER_IDLE;
	}

	spdk_poller_unregister(&ctx->poller);
	ctx->cb_fn(ctx->cb_arg);
	free(ctx);

	return SPDK_POLLER_BUSY;
}

int
spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev_wait_for_examine_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;
	ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0);

	return 0;
}

struct spdk_bdev_examine_item {
	char *name;
	TAILQ_ENTRY(spdk_bdev_examine_item) link;
};

TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item);

struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER(
			g_bdev_examine_allowlist);

static inline bool
bdev_examine_allowlist_check(const char *name)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		if (strcmp(name, item->name) == 0) {
			return true;
		}
	}
	return false;
}

static inline void
bdev_examine_allowlist_free(void)
{
	struct spdk_bdev_examine_item *item;
	while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) {
		item = TAILQ_FIRST(&g_bdev_examine_allowlist);
		TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link);
		free(item->name);
		free(item);
	}
}

static inline bool
bdev_in_examine_allowlist(struct spdk_bdev *bdev)
{
	struct spdk_bdev_alias *tmp;
	if (bdev_examine_allowlist_check(bdev->name)) {
		return true;
	}
	TAILQ_FOREACH(tmp, &bdev->aliases, tailq) {
		if (bdev_examine_allowlist_check(tmp->alias.name)) {
			return true;
		}
	}
	return false;
}

static inline bool
bdev_ok_to_examine(struct spdk_bdev *bdev)
{
	if (g_bdev_opts.bdev_auto_examine) {
		return true;
	} else {
		return bdev_in_examine_allowlist(bdev);
	}
}
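
/*
 * Usage sketch (added commentary, not upstream code): with bdev_auto_examine disabled, an
 * application asks for a specific bdev to be examined and then waits until every module has
 * finished its asynchronous examine work, e.g.:
 *
 *	static void
 *	examine_done(void *ctx)
 *	{
 *		// all registered bdev modules have completed examination
 *	}
 *
 *	spdk_bdev_examine("Nvme0n1");                    // hypothetical bdev name
 *	spdk_bdev_wait_for_examine(examine_done, NULL);
 */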

static void
bdev_examine(struct spdk_bdev *bdev)
{
	struct spdk_bdev_module *module;
	struct spdk_bdev_module_claim *claim, *tmpclaim;
	uint32_t action;

	if (!bdev_ok_to_examine(bdev)) {
		return;
	}

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (module->examine_config) {
			spdk_spin_lock(&module->internal.spinlock);
			action = module->internal.action_in_progress;
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			module->examine_config(bdev);
			if (action != module->internal.action_in_progress) {
				SPDK_ERRLOG("examine_config for module %s did not call "
					    "spdk_bdev_module_examine_done()\n", module->name);
			}
		}
	}

	spdk_spin_lock(&bdev->internal.spinlock);

	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		/* Examine by all bdev modules */
		TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (module->examine_disk) {
				spdk_spin_lock(&module->internal.spinlock);
				module->internal.action_in_progress++;
				spdk_spin_unlock(&module->internal.spinlock);
				spdk_spin_unlock(&bdev->internal.spinlock);
				module->examine_disk(bdev);
				spdk_spin_lock(&bdev->internal.spinlock);
			}
		}
		break;
	case SPDK_BDEV_CLAIM_EXCL_WRITE:
		/* Examine by the one bdev module with a v1 claim */
		module = bdev->internal.claim.v1.module;
		if (module->examine_disk) {
			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			return;
		}
		break;
	default:
		/* Examine by all bdev modules with a v2 claim */
		assert(claim_type_is_v2(bdev->internal.claim_type));
		/*
		 * Removal of tailq nodes while iterating can cause the iteration to jump out of the
		 * list, perhaps accessing freed memory. Without protection, this could happen
		 * while the lock is dropped during the examine callback.
		 */
		bdev->internal.examine_in_progress++;

		TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) {
			module = claim->module;

			if (module == NULL) {
				/* This is a vestigial claim, held by examine_count */
				continue;
			}

			if (module->examine_disk == NULL) {
				continue;
			}

			spdk_spin_lock(&module->internal.spinlock);
			module->internal.action_in_progress++;
			spdk_spin_unlock(&module->internal.spinlock);

			/* Call examine_disk without holding internal.spinlock. */
			spdk_spin_unlock(&bdev->internal.spinlock);
			module->examine_disk(bdev);
			spdk_spin_lock(&bdev->internal.spinlock);
		}

		assert(bdev->internal.examine_in_progress > 0);
		bdev->internal.examine_in_progress--;
		if (bdev->internal.examine_in_progress == 0) {
			/* Remove any claims that were released during examine_disk */
			TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) {
				if (claim->desc != NULL) {
					continue;
				}

				TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link);
				free(claim);
			}
			if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
				claim_reset(bdev);
			}
		}
	}

	spdk_spin_unlock(&bdev->internal.spinlock);
}

int
spdk_bdev_examine(const char *name)
{
	struct spdk_bdev *bdev;
	struct spdk_bdev_examine_item *item;
	struct spdk_thread *thread = spdk_get_thread();

	if (spdk_unlikely(spdk_thread_get_app_thread() != thread)) {
		SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread,
			    thread ? spdk_thread_get_name(thread) : "null");
		return -EINVAL;
	}

	if (g_bdev_opts.bdev_auto_examine) {
		SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled\n");
		return -EINVAL;
	}

	if (bdev_examine_allowlist_check(name)) {
		SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name);
		return -EEXIST;
	}

	item = calloc(1, sizeof(*item));
	if (!item) {
		return -ENOMEM;
	}
	item->name = strdup(name);
	if (!item->name) {
		free(item);
		return -ENOMEM;
	}
	TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link);

	bdev = spdk_bdev_get_by_name(name);
	if (bdev) {
		bdev_examine(bdev);
	}
	return 0;
}

static inline void
bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w)
{
	struct spdk_bdev_examine_item *item;
	TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) {
		spdk_json_write_object_begin(w);
		spdk_json_write_named_string(w, "method", "bdev_examine");
		spdk_json_write_named_object_begin(w, "params");
		spdk_json_write_named_string(w, "name", item->name);
		spdk_json_write_object_end(w);
		spdk_json_write_object_end(w);
	}
}

struct spdk_bdev *
spdk_bdev_first(void)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = TAILQ_NEXT(prev, internal.link);
	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static struct spdk_bdev *
_bdev_next_leaf(struct spdk_bdev *bdev)
{
	while (bdev != NULL) {
		if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
			return bdev;
		} else {
			bdev = TAILQ_NEXT(bdev, internal.link);
		}
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_first_leaf(void)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

struct spdk_bdev *
spdk_bdev_next_leaf(struct spdk_bdev *prev)
{
	struct spdk_bdev *bdev;

	bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link));

	if (bdev) {
		SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name);
	}

	return bdev;
}

static inline bool
bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.memory_domain;
}

static inline bool
bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io)
{
	return bdev_io->internal.accel_sequence;
}

static inline void
bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource,
			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	/* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io.
	 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth
	 * channels we will instead wait for half to complete.
	 */
	shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2,
					   (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT);

	assert(state != BDEV_IO_RETRY_STATE_INVALID);
	bdev_io->internal.retry_state = state;
	TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link);
}
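
/*
 * Worked example (added commentary, not upstream text): with NOMEM_THRESHOLD_COUNT == 8,
 * a channel with 100 outstanding I/O gets nomem_threshold = max(100 / 2, 100 - 8) = 92,
 * i.e. it waits for 8 completions before retrying; a low queue depth channel with 10
 * outstanding I/O gets max(10 / 2, 10 - 8) = 5, i.e. it waits for half to complete.
 */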

static inline void
bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource,
			 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	/* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while
	 * the queue isn't empty, so we don't need to update the nomem_threshold here */
	assert(!TAILQ_EMPTY(&shared_resource->nomem_io));

	assert(state != BDEV_IO_RETRY_STATE_INVALID);
	bdev_io->internal.retry_state = state;
	TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link);
}

void
spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len)
{
	struct iovec *iovs;

	if (bdev_io->u.bdev.iovs == NULL) {
		bdev_io->u.bdev.iovs = &bdev_io->iov;
		bdev_io->u.bdev.iovcnt = 1;
	}

	iovs = bdev_io->u.bdev.iovs;

	assert(iovs != NULL);
	assert(bdev_io->u.bdev.iovcnt >= 1);

	iovs[0].iov_base = buf;
	iovs[0].iov_len = len;
}

void
spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks);
	bdev_io->u.bdev.md_buf = md_buf;
}

static bool
_is_buf_allocated(const struct iovec *iovs)
{
	if (iovs == NULL) {
		return false;
	}

	return iovs[0].iov_base != NULL;
}

static bool
_are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment)
{
	int i;
	uintptr_t iov_base;

	if (spdk_likely(alignment == 1)) {
		return true;
	}

	for (i = 0; i < iovcnt; i++) {
		iov_base = (uintptr_t)iovs[i].iov_base;
		if ((iov_base & (alignment - 1)) != 0) {
			return false;
		}
	}

	return true;
}

static inline bool
bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io)
{
	if (!bdev_io_use_accel_sequence(bdev_io)) {
		return false;
	}

	/* For now, we don't allow splitting IOs with an accel sequence and will treat them as if
	 * bdev module didn't support accel sequences */
	return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.split;
}

static inline void
bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	bdev_ch->io_outstanding++;
	shared_resource->io_outstanding++;
}

static inline void
bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch,
			      struct spdk_bdev_shared_resource *shared_resource)
{
	assert(bdev_ch->io_outstanding > 0);
	assert(shared_resource->io_outstanding > 0);
	bdev_ch->io_outstanding--;
	shared_resource->io_outstanding--;
}

static void
bdev_io_submit_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;

	bdev_io->u.bdev.accel_sequence = NULL;
	bdev_io->internal.accel_sequence = NULL;

	if (spdk_unlikely(status != 0)) {
		SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io_complete_unsubmitted(bdev_io);
		return;
	}

	bdev_io_submit(bdev_io);
}

static void
bdev_io_exec_sequence_cb(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io->internal.data_transfer_cpl(bdev_io, status);
}

static void
bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status))
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
	assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ);

	/* Since the operations are appended during submission, they're in the opposite order than
	 * how we want to execute them for reads (i.e. we need to execute the most recently added
	 * operation first), so reverse the sequence before executing it.
	 */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence);
	}

	TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link);
	bdev_io_increment_outstanding(ch, ch->shared_resource);
	bdev_io->internal.data_transfer_cpl = cb_fn;

	spdk_accel_sequence_finish(bdev_io->internal.accel_sequence,
				   bdev_io_exec_sequence_cb, bdev_io);
}

static void
bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status)
{
	struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io);
	void *buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		buf = bdev_io->internal.buf;
		bdev_io->internal.buf = NULL;
		bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf);
		bdev_io->internal.get_aux_buf_cb = NULL;
	} else {
		assert(bdev_io->internal.get_buf_cb != NULL);
		bdev_io->internal.get_buf_cb(ch, bdev_io, status);
		bdev_io->internal.get_buf_cb = NULL;
	}
}

static void
_bdev_io_pull_buffer_cpl(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;

	if (rc) {
		SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc);
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	bdev_io_get_buf_complete(bdev_io, !rc);
}

static void
bdev_io_pull_md_buf_done(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	assert(bdev_io->internal.data_transfer_cpl);
	bdev_io->internal.data_transfer_cpl(bdev_io, status);
}

static void
bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  &bdev_io->internal.orig_md_iov, 1,
							  &bdev_io->internal.bounce_md_iov, 1,
							  bdev_io_pull_md_buf_done, bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain), rc);
			}
		} else {
			memcpy(bdev_io->internal.bounce_md_iov.iov_base,
			       bdev_io->internal.orig_md_iov.iov_base,
			       bdev_io->internal.orig_md_iov.iov_len);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD);
	} else {
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
	}
}

static void
_bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len)
{
	/* save original md_buf */
	bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf;
	bdev_io->internal.orig_md_iov.iov_len = len;
	bdev_io->internal.bounce_md_iov.iov_base = md_buf;
	bdev_io->internal.bounce_md_iov.iov_len = len;
	/* set bounce md_buf */
	bdev_io->u.bdev.md_buf = md_buf;

	bdev_io_pull_md_buf(bdev_io);
}

static void
_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len;
	void *buf;

	if (spdk_bdev_is_md_separate(bdev)) {
		assert(!bdev_io_use_accel_sequence(bdev_io));

		buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len;
		md_len = bdev_io->u.bdev.num_blocks * bdev->md_len;

		assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0);

		if (bdev_io->u.bdev.md_buf != NULL) {
			_bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len);
			return;
		} else {
			spdk_bdev_io_set_md_buf(bdev_io, buf, md_len);
		}
	}

	bdev_io_get_buf_complete(bdev_io, true);
}

static inline void
bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc)
{
	if (rc) {
		SPDK_ERRLOG("Failed to get data buffer\n");
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
		return;
	}

	_bdev_io_set_md_buf(bdev_io);
}

static void
bdev_io_pull_data_done_and_track(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io_pull_data_done(bdev_io, status);
}

static void
bdev_io_pull_data(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	/* If we need to exec an accel sequence, append a copy operation making accel change the
	 * src/dst buffers of the previous operation */
	if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) {
		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL,
						    bdev_io->internal.orig_iovs,
						    bdev_io->internal.orig_iovcnt,
						    bdev_io->internal.memory_domain,
						    bdev_io->internal.memory_domain_ctx,
						    0, NULL, NULL);
		} else {
			/* We need to reverse the src/dst for reads */
			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->internal.orig_iovs,
						    bdev_io->internal.orig_iovcnt,
						    bdev_io->internal.memory_domain,
						    bdev_io->internal.memory_domain_ctx,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL, 0, NULL, NULL);
		}

		if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
			SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n",
				    bdev_io->internal.accel_sequence);
		}
	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		/* if this is write path, copy data from original buffer to bounce buffer */
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  bdev_io->internal.orig_iovs,
							  (uint32_t) bdev_io->internal.orig_iovcnt,
							  bdev_io->u.bdev.iovs, 1,
							  bdev_io_pull_data_done_and_track,
							  bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to pull data from memory domain %s\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain));
			}
		} else {
			assert(bdev_io->u.bdev.iovcnt == 1);
			spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base,
					      bdev_io->u.bdev.iovs[0].iov_len,
					      bdev_io->internal.orig_iovs,
					      bdev_io->internal.orig_iovcnt);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
	} else {
		bdev_io_pull_data_done(bdev_io, rc);
	}
}

static void
_bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len,
			      bdev_copy_bounce_buffer_cpl cpl_cb)
{
	struct spdk_bdev_shared_resource *shared_resource = bdev_io->internal.ch->shared_resource;

	bdev_io->internal.data_transfer_cpl = cpl_cb;
	/* save original iovec */
	bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs;
	bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt;
	/* set bounce iov */
	bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov;
	bdev_io->u.bdev.iovcnt = 1;
	/* set bounce buffer for this operation */
	bdev_io->u.bdev.iovs[0].iov_base = buf;
	bdev_io->u.bdev.iovs[0].iov_len = len;

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
	} else {
		bdev_io_pull_data(bdev_io);
	}
}
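
/*
 * Added summary comment (not upstream text): _bdev_io_pull_bounce_data_buf() above swaps the
 * caller's iovec for the single bounce iovec. For writes the payload is then "pulled" from the
 * original buffers (or memory domain) into the bounce buffer before submission; for reads the
 * data lands in the bounce buffer first and is "pushed" back to the original buffers by
 * bdev_io_push_bounce_data() on the completion path.
 */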

static void
_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	bool buf_allocated;
	uint64_t alignment;
	void *aligned_buf;

	bdev_io->internal.buf = buf;

	if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) {
		bdev_io_get_buf_complete(bdev_io, true);
		return;
	}

	alignment = spdk_bdev_get_buf_align(bdev);
	buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs);
	aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1));

	if (buf_allocated) {
		_bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl);
		/* Continue in completion callback */
		return;
	} else {
		spdk_bdev_io_set_buf(bdev_io, aligned_buf, len);
	}

	_bdev_io_set_md_buf(bdev_io);
}

static inline uint64_t
bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t md_len, alignment;

	md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0;

	/* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. */
	alignment = spdk_bdev_get_buf_align(bdev) - 1;

	return len + alignment + md_len;
}

static void
_bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len)
{
	struct spdk_bdev_mgmt_channel *ch;

	ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
	spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len));
}

static void
bdev_io_put_buf(struct spdk_bdev_io *bdev_io)
{
	assert(bdev_io->internal.buf != NULL);
	_bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len);
	bdev_io->internal.buf = NULL;
}

void
spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(buf != NULL);
	_bdev_io_put_buf(bdev_io, buf, len);
}

static inline void
bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch,
		    struct spdk_bdev_io *bdev_io)
{
	/* After a request is submitted to a bdev module, the ownership of an accel sequence
	 * associated with that bdev_io is transferred to the bdev module. So, clear the internal
	 * sequence pointer to make sure we won't touch it anymore.
	 */
	if ((bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE ||
	     bdev_io->type == SPDK_BDEV_IO_TYPE_READ) && bdev_io->u.bdev.accel_sequence != NULL) {
		assert(!bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io));
		bdev_io->internal.accel_sequence = NULL;
	}

	bdev->fn_table->submit_request(ioch, bdev_io);
}

static inline void
bdev_ch_resubmit_io(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev *bdev = bdev_ch->bdev;

	bdev_io_increment_outstanding(bdev_io->internal.ch, bdev_ch->shared_resource);
	bdev_io->internal.error.nvme.cdw0 = 0;
	bdev_io->num_retries++;
	bdev_submit_request(bdev, spdk_bdev_io_get_io_channel(bdev_io), bdev_io);
}

static void
bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
	struct spdk_bdev_io *bdev_io;

	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 * Some drivers (such as nvme) cannot immediately take a new I/O in
		 * the context of a completion, because the resources for the I/O are
		 * not released until control returns to the bdev poller. Also, we
		 * may require several small I/O to complete before a larger I/O
		 * (that requires splitting) can be submitted.
		 */
		return;
	}

	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);

		switch (bdev_io->internal.retry_state) {
		case BDEV_IO_RETRY_STATE_SUBMIT:
			bdev_ch_resubmit_io(bdev_ch, bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL:
			bdev_io_pull_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL_MD:
			bdev_io_pull_md_buf(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH:
			bdev_io_push_bounce_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH_MD:
			bdev_io_push_bounce_md_buf(bdev_io);
			break;
		default:
			assert(0 && "invalid retry state");
			break;
		}

		if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) {
			/* This IO completed again with NOMEM status, so break the loop and
			 * don't try anymore. Note that a bdev_io that fails with NOMEM
			 * always gets requeued at the front of the list, to maintain
			 * ordering.
			 */
			break;
		}
	}
}

static inline bool
_bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state)
{
	struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch;
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;

	if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING;
		bdev_queue_nomem_io_head(shared_resource, bdev_io, state);

		/* If bdev module completed an I/O that has an accel sequence with NOMEM status, the
		 * ownership of that sequence is transferred back to the bdev layer, so we need to
		 * restore internal.accel_sequence to make sure that the sequence is handled
		 * correctly in case the I/O is later aborted.
		 */
		if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ ||
		     bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) {
			assert(bdev_io->internal.accel_sequence == NULL);
			bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence;
		}

		return true;
	}

	if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) {
		bdev_ch_retry_io(bdev_ch);
	}

	return false;
}

static void
_bdev_io_complete_push_bounce_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;

	if (rc) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
	}
	/* We want to free the bounce buffer here since we know we're done with it (as opposed
	 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()).
	 */
	bdev_io_put_buf(bdev_io);

	/* Continue with IO completion flow */
	bdev_io_complete(bdev_io);
}

static void
bdev_io_push_bounce_md_buf_done(void *ctx, int rc)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io->internal.data_transfer_cpl(bdev_io, rc);
}

static inline void
bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
	/* do the same for metadata buffer */
	if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) {
		assert(spdk_bdev_is_md_separate(bdev_io->bdev));

		if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
			if (bdev_io_use_memory_domain(bdev_io)) {
				TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
				bdev_io_increment_outstanding(ch, ch->shared_resource);
				/* If memory domain is used then we need to call async push function */
				rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
								  bdev_io->internal.memory_domain_ctx,
								  &bdev_io->internal.orig_md_iov,
								  (uint32_t)bdev_io->internal.orig_iovcnt,
								  &bdev_io->internal.bounce_md_iov, 1,
								  bdev_io_push_bounce_md_buf_done,
								  bdev_io);
				if (rc == 0) {
					/* Continue IO completion in async callback */
					return;
				}
				TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
				bdev_io_decrement_outstanding(ch, ch->shared_resource);
				if (rc != -ENOMEM) {
					SPDK_ERRLOG("Failed to push md to memory domain %s\n",
						    spdk_memory_domain_get_dma_device_id(
							    bdev_io->internal.memory_domain));
				}
			} else {
				memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf,
				       bdev_io->internal.orig_md_iov.iov_len);
			}
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD);
	} else {
		assert(bdev_io->internal.data_transfer_cpl);
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
	}
}

static inline void
bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc)
{
	assert(bdev_io->internal.data_transfer_cpl);
	if (rc) {
		bdev_io->internal.data_transfer_cpl(bdev_io, rc);
		return;
	}

	/* set original buffer for this io */
	bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt;
	bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs;
	/* disable bouncing buffer for this io */
	bdev_io->internal.orig_iovcnt = 0;
	bdev_io->internal.orig_iovs = NULL;

	bdev_io_push_bounce_md_buf(bdev_io);
}

static void
bdev_io_push_bounce_data_done_and_track(void *ctx, int status)
{
	struct spdk_bdev_io *bdev_io = ctx;
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;

	TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
	bdev_io_decrement_outstanding(ch, ch->shared_resource);

	if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) {
		bdev_ch_retry_io(ch);
	}

	bdev_io_push_bounce_data_done(bdev_io, status);
}

static inline void
bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS);
	/* if this is read path, copy data from bounce buffer to original buffer */
	if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) {
		if (bdev_io_use_memory_domain(bdev_io)) {
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			/* If memory domain is used then we need to call async push function */
			rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  bdev_io->internal.orig_iovs,
							  (uint32_t)bdev_io->internal.orig_iovcnt,
							  &bdev_io->internal.bounce_iov, 1,
							  bdev_io_push_bounce_data_done_and_track,
							  bdev_io);
			if (rc == 0) {
				/* Continue IO completion in async callback */
				return;
			}

			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to push data to memory domain %s\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain));
			}
		} else {
			spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs,
					      bdev_io->internal.orig_iovcnt,
					      bdev_io->internal.bounce_iov.iov_base,
					      bdev_io->internal.bounce_iov.iov_len);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH);
	} else {
		bdev_io_push_bounce_data_done(bdev_io, rc);
	}
}

static inline void
_bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb)
{
	bdev_io->internal.data_transfer_cpl = cpl_cb;
	bdev_io_push_bounce_data(bdev_io);
}

static void
bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf)
{
	struct spdk_bdev_io *bdev_io;

	bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf);
	_bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len);
}

static void
bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len)
{
	struct spdk_bdev_mgmt_channel *mgmt_ch;
	uint64_t max_len;
	void *buf;

	assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread());
	mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch;
	max_len = bdev_io_get_max_buf_len(bdev_io, len);

	if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) {
		SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len);
		bdev_io_get_buf_complete(bdev_io, false);
		return;
	}

	bdev_io->internal.buf_len = len;
	buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf,
			     bdev_io_get_iobuf_cb);
	if (buf != NULL) {
		_bdev_io_set_buf(bdev_io, buf, len);
	}
}

void
spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len)
{
	struct spdk_bdev *bdev = bdev_io->bdev;
	uint64_t alignment;

	assert(cb != NULL);
	bdev_io->internal.get_buf_cb = cb;

	alignment = spdk_bdev_get_buf_align(bdev);

	if (_is_buf_allocated(bdev_io->u.bdev.iovs) &&
	    _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) {
		/* Buffer already present and aligned */
		cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true);
		return;
	}

	bdev_io_get_buf(bdev_io, len);
}
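
/*
 * Usage sketch (added commentary, not upstream code): a bdev module that needs a data buffer,
 * e.g. for a READ whose iovecs were not provided by the caller, typically calls
 * spdk_bdev_io_get_buf() from its submit_request() path and continues in the callback once
 * bdev_io->u.bdev.iovs points at usable memory:
 *
 *	static void
 *	my_read_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
 *	{
 *		if (!success) {
 *			spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED);
 *			return;
 *		}
 *		// issue the actual read into bdev_io->u.bdev.iovs here
 *	}
 *
 *	spdk_bdev_io_get_buf(bdev_io, my_read_get_buf_cb,
 *			     bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen);
 *
 * my_read_get_buf_cb is a hypothetical module callback; spdk_bdev_io_complete() is the regular
 * bdev completion API.
 */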

static void
_bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
			      bool success)
{
	if (!success) {
		SPDK_ERRLOG("Failed to get data buffer, completing IO\n");
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io_complete_unsubmitted(bdev_io);
		return;
	}

	if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) {
		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
			bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb);
			return;
		}
		/* For reads we'll execute the sequence after the data is read, so, for now, only
		 * clear out accel_sequence pointer and submit the IO */
		assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
		bdev_io->u.bdev.accel_sequence = NULL;
	}

	bdev_io_submit(bdev_io);
}

static void
_bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb,
			       uint64_t len)
{
	assert(cb != NULL);
	bdev_io->internal.get_buf_cb = cb;

	bdev_io_get_buf(bdev_io, len);
}

void
spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb)
{
	uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen;

	assert(cb != NULL);
	assert(bdev_io->internal.get_aux_buf_cb == NULL);
	bdev_io->internal.get_aux_buf_cb = cb;
	bdev_io_get_buf(bdev_io, len);
}

static int
bdev_module_get_max_ctx_size(void)
{
	struct spdk_bdev_module *bdev_module;
	int max_bdev_module_size = 0;

	TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) {
			max_bdev_module_size = bdev_module->get_ctx_size();
		}
	}

	return max_bdev_module_size;
}

static void
bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w)
{
	int i;
	struct spdk_bdev_qos *qos = bdev->internal.qos;
	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	if (!qos) {
		return;
	}

	spdk_bdev_get_qos_rate_limits(bdev, limits);

	spdk_json_write_object_begin(w);
	spdk_json_write_named_string(w, "method", "bdev_set_qos_limit");

	spdk_json_write_named_object_begin(w, "params");
	spdk_json_write_named_string(w, "name", bdev->name);
	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		if (limits[i] > 0) {
			spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]);
		}
	}
	spdk_json_write_object_end(w);

	spdk_json_write_object_end(w);
}
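
/*
 * Added illustration (not upstream text): for a bdev with an IOPS limit set, the function above
 * emits an RPC object of roughly this shape into the subsystem config JSON:
 *
 *	{
 *		"method": "bdev_set_qos_limit",
 *		"params": { "name": "Malloc0", "rw_ios_per_sec": 10000 }
 *	}
 *
 * "Malloc0" and the limit value are placeholders; only limits greater than zero are written,
 * using the keys from qos_rpc_type[].
 */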
SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1845 if (limits[i] > 0) { 1846 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1847 } 1848 } 1849 spdk_json_write_object_end(w); 1850 1851 spdk_json_write_object_end(w); 1852 } 1853 1854 void 1855 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1856 { 1857 struct spdk_bdev_module *bdev_module; 1858 struct spdk_bdev *bdev; 1859 1860 assert(w != NULL); 1861 1862 spdk_json_write_array_begin(w); 1863 1864 spdk_json_write_object_begin(w); 1865 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1866 spdk_json_write_named_object_begin(w, "params"); 1867 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1868 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1869 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1870 spdk_json_write_object_end(w); 1871 spdk_json_write_object_end(w); 1872 1873 bdev_examine_allowlist_config_json(w); 1874 1875 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1876 if (bdev_module->config_json) { 1877 bdev_module->config_json(w); 1878 } 1879 } 1880 1881 spdk_spin_lock(&g_bdev_mgr.spinlock); 1882 1883 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1884 if (bdev->fn_table->write_config_json) { 1885 bdev->fn_table->write_config_json(bdev, w); 1886 } 1887 1888 bdev_qos_config_json(bdev, w); 1889 } 1890 1891 spdk_spin_unlock(&g_bdev_mgr.spinlock); 1892 1893 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1894 spdk_json_write_object_begin(w); 1895 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1896 spdk_json_write_object_end(w); 1897 1898 spdk_json_write_array_end(w); 1899 } 1900 1901 static void 1902 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1903 { 1904 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1905 struct spdk_bdev_io *bdev_io; 1906 1907 spdk_iobuf_channel_fini(&ch->iobuf); 1908 1909 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1910 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1911 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1912 ch->per_thread_cache_count--; 1913 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1914 } 1915 1916 assert(ch->per_thread_cache_count == 0); 1917 } 1918 1919 static int 1920 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1921 { 1922 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1923 struct spdk_bdev_io *bdev_io; 1924 uint32_t i; 1925 int rc; 1926 1927 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", BUF_SMALL_CACHE_SIZE, BUF_LARGE_CACHE_SIZE); 1928 if (rc != 0) { 1929 SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 1930 return -1; 1931 } 1932 1933 STAILQ_INIT(&ch->per_thread_cache); 1934 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 1935 1936 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. 
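 * The loop below moves bdev_io_cache_size entries from the global
 * bdev_io_pool into this channel's per_thread_cache, so this thread keeps a
 * private reserve of spdk_bdev_io structures even if other threads drain the
 * shared mempool.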
*/ 1937 ch->per_thread_cache_count = 0; 1938 for (i = 0; i < ch->bdev_io_cache_size; i++) { 1939 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1940 if (bdev_io == NULL) { 1941 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 1942 assert(false); 1943 bdev_mgmt_channel_destroy(io_device, ctx_buf); 1944 return -1; 1945 } 1946 ch->per_thread_cache_count++; 1947 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1948 } 1949 1950 TAILQ_INIT(&ch->shared_resources); 1951 TAILQ_INIT(&ch->io_wait_queue); 1952 1953 return 0; 1954 } 1955 1956 static void 1957 bdev_init_complete(int rc) 1958 { 1959 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 1960 void *cb_arg = g_init_cb_arg; 1961 struct spdk_bdev_module *m; 1962 1963 g_bdev_mgr.init_complete = true; 1964 g_init_cb_fn = NULL; 1965 g_init_cb_arg = NULL; 1966 1967 /* 1968 * For modules that need to know when subsystem init is complete, 1969 * inform them now. 1970 */ 1971 if (rc == 0) { 1972 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1973 if (m->init_complete) { 1974 m->init_complete(); 1975 } 1976 } 1977 } 1978 1979 cb_fn(cb_arg, rc); 1980 } 1981 1982 static bool 1983 bdev_module_all_actions_completed(void) 1984 { 1985 struct spdk_bdev_module *m; 1986 1987 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1988 if (m->internal.action_in_progress > 0) { 1989 return false; 1990 } 1991 } 1992 return true; 1993 } 1994 1995 static void 1996 bdev_module_action_complete(void) 1997 { 1998 /* 1999 * Don't finish bdev subsystem initialization if 2000 * module pre-initialization is still in progress, or 2001 * the subsystem been already initialized. 2002 */ 2003 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 2004 return; 2005 } 2006 2007 /* 2008 * Check all bdev modules for inits/examinations in progress. If any 2009 * exist, return immediately since we cannot finish bdev subsystem 2010 * initialization until all are completed. 2011 */ 2012 if (!bdev_module_all_actions_completed()) { 2013 return; 2014 } 2015 2016 /* 2017 * Modules already finished initialization - now that all 2018 * the bdev modules have finished their asynchronous I/O 2019 * processing, the entire bdev layer can be marked as complete. 
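 * bdev_init_complete(0) below calls the init_complete() hook of every module
 * that provides one and then invokes the callback that was passed to
 * spdk_bdev_initialize().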
2020 */ 2021 bdev_init_complete(0); 2022 } 2023 2024 static void 2025 bdev_module_action_done(struct spdk_bdev_module *module) 2026 { 2027 spdk_spin_lock(&module->internal.spinlock); 2028 assert(module->internal.action_in_progress > 0); 2029 module->internal.action_in_progress--; 2030 spdk_spin_unlock(&module->internal.spinlock); 2031 bdev_module_action_complete(); 2032 } 2033 2034 void 2035 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 2036 { 2037 assert(module->async_init); 2038 bdev_module_action_done(module); 2039 } 2040 2041 void 2042 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 2043 { 2044 bdev_module_action_done(module); 2045 } 2046 2047 /** The last initialized bdev module */ 2048 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 2049 2050 static void 2051 bdev_init_failed(void *cb_arg) 2052 { 2053 struct spdk_bdev_module *module = cb_arg; 2054 2055 spdk_spin_lock(&module->internal.spinlock); 2056 assert(module->internal.action_in_progress > 0); 2057 module->internal.action_in_progress--; 2058 spdk_spin_unlock(&module->internal.spinlock); 2059 bdev_init_complete(-1); 2060 } 2061 2062 static int 2063 bdev_modules_init(void) 2064 { 2065 struct spdk_bdev_module *module; 2066 int rc = 0; 2067 2068 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2069 g_resume_bdev_module = module; 2070 if (module->async_init) { 2071 spdk_spin_lock(&module->internal.spinlock); 2072 module->internal.action_in_progress = 1; 2073 spdk_spin_unlock(&module->internal.spinlock); 2074 } 2075 rc = module->module_init(); 2076 if (rc != 0) { 2077 /* Bump action_in_progress to prevent other modules from completion of modules_init 2078 * Send message to defer application shutdown until resources are cleaned up */ 2079 spdk_spin_lock(&module->internal.spinlock); 2080 module->internal.action_in_progress = 1; 2081 spdk_spin_unlock(&module->internal.spinlock); 2082 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 2083 return rc; 2084 } 2085 } 2086 2087 g_resume_bdev_module = NULL; 2088 return 0; 2089 } 2090 2091 void 2092 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 2093 { 2094 int rc = 0; 2095 char mempool_name[32]; 2096 2097 assert(cb_fn != NULL); 2098 2099 g_init_cb_fn = cb_fn; 2100 g_init_cb_arg = cb_arg; 2101 2102 spdk_notify_type_register("bdev_register"); 2103 spdk_notify_type_register("bdev_unregister"); 2104 2105 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2106 2107 rc = spdk_iobuf_register_module("bdev"); 2108 if (rc != 0) { 2109 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2110 bdev_init_complete(-1); 2111 return; 2112 } 2113 2114 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2115 g_bdev_opts.bdev_io_pool_size, 2116 sizeof(struct spdk_bdev_io) + 2117 bdev_module_get_max_ctx_size(), 2118 0, 2119 SPDK_ENV_SOCKET_ID_ANY); 2120 2121 if (g_bdev_mgr.bdev_io_pool == NULL) { 2122 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2123 bdev_init_complete(-1); 2124 return; 2125 } 2126 2127 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2128 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2129 if (!g_bdev_mgr.zero_buffer) { 2130 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2131 bdev_init_complete(-1); 2132 return; 2133 } 2134 2135 #ifdef SPDK_CONFIG_VTUNE 2136 SPDK_LOG_DEPRECATED(vtune_support); 2137 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2138 #endif 2139 2140 spdk_io_device_register(&g_bdev_mgr, 
bdev_mgmt_channel_create, 2141 bdev_mgmt_channel_destroy, 2142 sizeof(struct spdk_bdev_mgmt_channel), 2143 "bdev_mgr"); 2144 2145 rc = bdev_modules_init(); 2146 g_bdev_mgr.module_init_complete = true; 2147 if (rc != 0) { 2148 SPDK_ERRLOG("bdev modules init failed\n"); 2149 return; 2150 } 2151 2152 bdev_module_action_complete(); 2153 } 2154 2155 static void 2156 bdev_mgr_unregister_cb(void *io_device) 2157 { 2158 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2159 2160 if (g_bdev_mgr.bdev_io_pool) { 2161 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2162 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2163 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2164 g_bdev_opts.bdev_io_pool_size); 2165 } 2166 2167 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2168 } 2169 2170 spdk_free(g_bdev_mgr.zero_buffer); 2171 2172 bdev_examine_allowlist_free(); 2173 2174 cb_fn(g_fini_cb_arg); 2175 g_fini_cb_fn = NULL; 2176 g_fini_cb_arg = NULL; 2177 g_bdev_mgr.init_complete = false; 2178 g_bdev_mgr.module_init_complete = false; 2179 } 2180 2181 static void 2182 bdev_module_fini_iter(void *arg) 2183 { 2184 struct spdk_bdev_module *bdev_module; 2185 2186 /* FIXME: Handling initialization failures is broken now, 2187 * so we won't even try cleaning up after successfully 2188 * initialized modules. if module_init_complete is false, 2189 * just call spdk_bdev_mgr_unregister_cb 2190 */ 2191 if (!g_bdev_mgr.module_init_complete) { 2192 bdev_mgr_unregister_cb(NULL); 2193 return; 2194 } 2195 2196 /* Start iterating from the last touched module */ 2197 if (!g_resume_bdev_module) { 2198 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2199 } else { 2200 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2201 internal.tailq); 2202 } 2203 2204 while (bdev_module) { 2205 if (bdev_module->async_fini) { 2206 /* Save our place so we can resume later. We must 2207 * save the variable here, before calling module_fini() 2208 * below, because in some cases the module may immediately 2209 * call spdk_bdev_module_fini_done() and re-enter 2210 * this function to continue iterating. */ 2211 g_resume_bdev_module = bdev_module; 2212 } 2213 2214 if (bdev_module->module_fini) { 2215 bdev_module->module_fini(); 2216 } 2217 2218 if (bdev_module->async_fini) { 2219 return; 2220 } 2221 2222 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2223 internal.tailq); 2224 } 2225 2226 g_resume_bdev_module = NULL; 2227 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2228 } 2229 2230 void 2231 spdk_bdev_module_fini_done(void) 2232 { 2233 if (spdk_get_thread() != g_fini_thread) { 2234 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2235 } else { 2236 bdev_module_fini_iter(NULL); 2237 } 2238 } 2239 2240 static void 2241 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2242 { 2243 struct spdk_bdev *bdev = cb_arg; 2244 2245 if (bdeverrno && bdev) { 2246 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2247 bdev->name); 2248 2249 /* 2250 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2251 * bdev; try to continue by manually removing this bdev from the list and continue 2252 * with the next bdev in the list. 
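 * Leaking the bdev is accepted here, since the alternative would be to stall
 * shutdown indefinitely on an unregister that has already failed.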
2253 */ 2254 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2255 } 2256 2257 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2258 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2259 /* 2260 * Bdev module finish need to be deferred as we might be in the middle of some context 2261 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2262 * after returning. 2263 */ 2264 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2265 return; 2266 } 2267 2268 /* 2269 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 2270 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2271 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2272 * base bdevs. 2273 * 2274 * Also, walk the list in the reverse order. 2275 */ 2276 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2277 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2278 spdk_spin_lock(&bdev->internal.spinlock); 2279 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2280 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2281 spdk_spin_unlock(&bdev->internal.spinlock); 2282 continue; 2283 } 2284 spdk_spin_unlock(&bdev->internal.spinlock); 2285 2286 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2287 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2288 return; 2289 } 2290 2291 /* 2292 * If any bdev fails to unclaim underlying bdev properly, we may face the 2293 * case of bdev list consisting of claimed bdevs only (if claims are managed 2294 * correctly, this would mean there's a loop in the claims graph which is 2295 * clearly impossible). Warn and unregister last bdev on the list then. 2296 */ 2297 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2298 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2299 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2300 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2301 return; 2302 } 2303 } 2304 2305 static void 2306 bdev_module_fini_start_iter(void *arg) 2307 { 2308 struct spdk_bdev_module *bdev_module; 2309 2310 if (!g_resume_bdev_module) { 2311 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2312 } else { 2313 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2314 } 2315 2316 while (bdev_module) { 2317 if (bdev_module->async_fini_start) { 2318 /* Save our place so we can resume later. We must 2319 * save the variable here, before calling fini_start() 2320 * below, because in some cases the module may immediately 2321 * call spdk_bdev_module_fini_start_done() and re-enter 2322 * this function to continue iterating. 
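 * Modules that finish synchronously leave async_fini_start unset, so for them
 * the loop below simply keeps walking the module list backwards.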
*/ 2323 g_resume_bdev_module = bdev_module; 2324 } 2325 2326 if (bdev_module->fini_start) { 2327 bdev_module->fini_start(); 2328 } 2329 2330 if (bdev_module->async_fini_start) { 2331 return; 2332 } 2333 2334 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2335 } 2336 2337 g_resume_bdev_module = NULL; 2338 2339 bdev_finish_unregister_bdevs_iter(NULL, 0); 2340 } 2341 2342 void 2343 spdk_bdev_module_fini_start_done(void) 2344 { 2345 if (spdk_get_thread() != g_fini_thread) { 2346 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2347 } else { 2348 bdev_module_fini_start_iter(NULL); 2349 } 2350 } 2351 2352 static void 2353 bdev_finish_wait_for_examine_done(void *cb_arg) 2354 { 2355 bdev_module_fini_start_iter(NULL); 2356 } 2357 2358 void 2359 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2360 { 2361 int rc; 2362 2363 assert(cb_fn != NULL); 2364 2365 g_fini_thread = spdk_get_thread(); 2366 2367 g_fini_cb_fn = cb_fn; 2368 g_fini_cb_arg = cb_arg; 2369 2370 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2371 if (rc != 0) { 2372 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2373 bdev_finish_wait_for_examine_done(NULL); 2374 } 2375 } 2376 2377 struct spdk_bdev_io * 2378 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2379 { 2380 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2381 struct spdk_bdev_io *bdev_io; 2382 2383 if (ch->per_thread_cache_count > 0) { 2384 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2385 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2386 ch->per_thread_cache_count--; 2387 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2388 /* 2389 * Don't try to look for bdev_ios in the global pool if there are 2390 * waiters on bdev_ios - we don't want this caller to jump the line. 2391 */ 2392 bdev_io = NULL; 2393 } else { 2394 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2395 } 2396 2397 return bdev_io; 2398 } 2399 2400 void 2401 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2402 { 2403 struct spdk_bdev_mgmt_channel *ch; 2404 2405 assert(bdev_io != NULL); 2406 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2407 2408 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2409 2410 if (bdev_io->internal.buf != NULL) { 2411 bdev_io_put_buf(bdev_io); 2412 } 2413 2414 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2415 ch->per_thread_cache_count++; 2416 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2417 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2418 struct spdk_bdev_io_wait_entry *entry; 2419 2420 entry = TAILQ_FIRST(&ch->io_wait_queue); 2421 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2422 entry->cb_fn(entry->cb_arg); 2423 } 2424 } else { 2425 /* We should never have a full cache with entries on the io wait queue. 
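 * Waiters are popped and serviced above whenever an entry is returned to a
 * cache that is not yet full, so a full cache implies the wait queue has
 * already been drained.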
*/ 2426 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2427 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2428 } 2429 } 2430 2431 static bool 2432 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2433 { 2434 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2435 2436 switch (limit) { 2437 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2438 return true; 2439 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2440 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2441 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2442 return false; 2443 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2444 default: 2445 return false; 2446 } 2447 } 2448 2449 static bool 2450 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2451 { 2452 switch (bdev_io->type) { 2453 case SPDK_BDEV_IO_TYPE_NVME_IO: 2454 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2455 case SPDK_BDEV_IO_TYPE_READ: 2456 case SPDK_BDEV_IO_TYPE_WRITE: 2457 return true; 2458 case SPDK_BDEV_IO_TYPE_ZCOPY: 2459 if (bdev_io->u.bdev.zcopy.start) { 2460 return true; 2461 } else { 2462 return false; 2463 } 2464 default: 2465 return false; 2466 } 2467 } 2468 2469 static bool 2470 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2471 { 2472 switch (bdev_io->type) { 2473 case SPDK_BDEV_IO_TYPE_NVME_IO: 2474 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2475 /* Bit 1 (0x2) set for read operation */ 2476 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2477 return true; 2478 } else { 2479 return false; 2480 } 2481 case SPDK_BDEV_IO_TYPE_READ: 2482 return true; 2483 case SPDK_BDEV_IO_TYPE_ZCOPY: 2484 /* Populate to read from disk */ 2485 if (bdev_io->u.bdev.zcopy.populate) { 2486 return true; 2487 } else { 2488 return false; 2489 } 2490 default: 2491 return false; 2492 } 2493 } 2494 2495 static uint64_t 2496 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2497 { 2498 struct spdk_bdev *bdev = bdev_io->bdev; 2499 2500 switch (bdev_io->type) { 2501 case SPDK_BDEV_IO_TYPE_NVME_IO: 2502 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2503 return bdev_io->u.nvme_passthru.nbytes; 2504 case SPDK_BDEV_IO_TYPE_READ: 2505 case SPDK_BDEV_IO_TYPE_WRITE: 2506 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2507 case SPDK_BDEV_IO_TYPE_ZCOPY: 2508 /* Track the data in the start phase only */ 2509 if (bdev_io->u.bdev.zcopy.start) { 2510 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2511 } else { 2512 return 0; 2513 } 2514 default: 2515 return 0; 2516 } 2517 } 2518 2519 static bool 2520 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2521 { 2522 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 2523 return true; 2524 } else { 2525 return false; 2526 } 2527 } 2528 2529 static bool 2530 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2531 { 2532 if (bdev_is_read_io(io) == false) { 2533 return false; 2534 } 2535 2536 return bdev_qos_rw_queue_io(limit, io); 2537 } 2538 2539 static bool 2540 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2541 { 2542 if (bdev_is_read_io(io) == true) { 2543 return false; 2544 } 2545 2546 return bdev_qos_rw_queue_io(limit, io); 2547 } 2548 2549 static void 2550 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2551 { 2552 limit->remaining_this_timeslice--; 2553 } 2554 2555 static void 2556 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2557 { 2558 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 2559 } 2560 2561 static void 2562 
bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2563 { 2564 if (bdev_is_read_io(io) == false) { 2565 return; 2566 } 2567 2568 return bdev_qos_rw_bps_update_quota(limit, io); 2569 } 2570 2571 static void 2572 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2573 { 2574 if (bdev_is_read_io(io) == true) { 2575 return; 2576 } 2577 2578 return bdev_qos_rw_bps_update_quota(limit, io); 2579 } 2580 2581 static void 2582 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2583 { 2584 int i; 2585 2586 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2587 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2588 qos->rate_limits[i].queue_io = NULL; 2589 qos->rate_limits[i].update_quota = NULL; 2590 continue; 2591 } 2592 2593 switch (i) { 2594 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2595 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2596 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2597 break; 2598 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2599 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2600 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2601 break; 2602 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2603 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2604 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2605 break; 2606 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2607 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2608 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2609 break; 2610 default: 2611 break; 2612 } 2613 } 2614 } 2615 2616 static void 2617 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2618 struct spdk_bdev_io *bdev_io, 2619 enum spdk_bdev_io_status status) 2620 { 2621 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2622 2623 bdev_io->internal.in_submit_request = true; 2624 bdev_ch->io_outstanding++; 2625 shared_resource->io_outstanding++; 2626 spdk_bdev_io_complete(bdev_io, status); 2627 bdev_io->internal.in_submit_request = false; 2628 } 2629 2630 static inline void 2631 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2632 { 2633 struct spdk_bdev *bdev = bdev_io->bdev; 2634 struct spdk_io_channel *ch = bdev_ch->channel; 2635 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2636 2637 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2638 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2639 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2640 2641 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2642 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2643 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2644 SPDK_BDEV_IO_STATUS_SUCCESS); 2645 return; 2646 } 2647 } 2648 2649 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2650 bdev_io->bdev->split_on_write_unit && 2651 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2652 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2653 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2654 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2655 return; 2656 } 2657 2658 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2659 bdev_ch->io_outstanding++; 2660 shared_resource->io_outstanding++; 2661 bdev_io->internal.in_submit_request = true; 2662 bdev_submit_request(bdev, ch, bdev_io); 2663 
bdev_io->internal.in_submit_request = false; 2664 } else { 2665 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT); 2666 } 2667 } 2668 2669 static bool 2670 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2671 { 2672 int i; 2673 2674 if (bdev_qos_io_to_limit(bdev_io) == true) { 2675 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2676 if (!qos->rate_limits[i].queue_io) { 2677 continue; 2678 } 2679 2680 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2681 bdev_io) == true) { 2682 return true; 2683 } 2684 } 2685 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2686 if (!qos->rate_limits[i].update_quota) { 2687 continue; 2688 } 2689 2690 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2691 } 2692 } 2693 2694 return false; 2695 } 2696 2697 static inline void 2698 _bdev_io_do_submit(void *ctx) 2699 { 2700 struct spdk_bdev_io *bdev_io = ctx; 2701 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2702 2703 bdev_io_do_submit(ch, bdev_io); 2704 } 2705 2706 static int 2707 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2708 { 2709 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2710 int submitted_ios = 0; 2711 2712 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2713 if (!bdev_qos_queue_io(qos, bdev_io)) { 2714 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2715 2716 if (bdev_io->internal.io_submit_ch) { 2717 /* Send back the IO to the original thread for the actual processing. */ 2718 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 2719 bdev_io->internal.io_submit_ch = NULL; 2720 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 2721 _bdev_io_do_submit, bdev_io); 2722 } else { 2723 bdev_io_do_submit(ch, bdev_io); 2724 } 2725 2726 submitted_ios++; 2727 } 2728 } 2729 2730 return submitted_ios; 2731 } 2732 2733 static void 2734 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2735 { 2736 int rc; 2737 2738 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2739 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2740 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2741 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2742 &bdev_io->internal.waitq_entry); 2743 if (rc != 0) { 2744 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2745 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2746 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2747 } 2748 } 2749 2750 static bool 2751 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2752 { 2753 uint32_t io_boundary; 2754 struct spdk_bdev *bdev = bdev_io->bdev; 2755 uint32_t max_size = bdev->max_segment_size; 2756 int max_segs = bdev->max_num_segments; 2757 2758 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2759 io_boundary = bdev->write_unit_size; 2760 } else if (bdev->split_on_optimal_io_boundary) { 2761 io_boundary = bdev->optimal_io_boundary; 2762 } else { 2763 io_boundary = 0; 2764 } 2765 2766 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2767 return false; 2768 } 2769 2770 if (io_boundary) { 2771 uint64_t start_stripe, end_stripe; 2772 2773 start_stripe = bdev_io->u.bdev.offset_blocks; 2774 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2775 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
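 * For a power-of-two boundary the stripe index is derived with a right shift
 * by spdk_u32log2(io_boundary); only non-power-of-two boundaries fall back to
 * 64-bit division.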
*/ 2776 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2777 start_stripe >>= spdk_u32log2(io_boundary); 2778 end_stripe >>= spdk_u32log2(io_boundary); 2779 } else { 2780 start_stripe /= io_boundary; 2781 end_stripe /= io_boundary; 2782 } 2783 2784 if (start_stripe != end_stripe) { 2785 return true; 2786 } 2787 } 2788 2789 if (max_segs) { 2790 if (bdev_io->u.bdev.iovcnt > max_segs) { 2791 return true; 2792 } 2793 } 2794 2795 if (max_size) { 2796 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2797 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2798 return true; 2799 } 2800 } 2801 } 2802 2803 return false; 2804 } 2805 2806 static bool 2807 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2808 { 2809 uint32_t num_unmap_segments; 2810 2811 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2812 return false; 2813 } 2814 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2815 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2816 return true; 2817 } 2818 2819 return false; 2820 } 2821 2822 static bool 2823 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2824 { 2825 if (!bdev_io->bdev->max_write_zeroes) { 2826 return false; 2827 } 2828 2829 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2830 return true; 2831 } 2832 2833 return false; 2834 } 2835 2836 static bool 2837 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2838 { 2839 if (bdev_io->bdev->max_copy != 0 && 2840 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2841 return true; 2842 } 2843 2844 return false; 2845 } 2846 2847 static bool 2848 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2849 { 2850 switch (bdev_io->type) { 2851 case SPDK_BDEV_IO_TYPE_READ: 2852 case SPDK_BDEV_IO_TYPE_WRITE: 2853 return bdev_rw_should_split(bdev_io); 2854 case SPDK_BDEV_IO_TYPE_UNMAP: 2855 return bdev_unmap_should_split(bdev_io); 2856 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2857 return bdev_write_zeroes_should_split(bdev_io); 2858 case SPDK_BDEV_IO_TYPE_COPY: 2859 return bdev_copy_should_split(bdev_io); 2860 default: 2861 return false; 2862 } 2863 } 2864 2865 static uint32_t 2866 _to_next_boundary(uint64_t offset, uint32_t boundary) 2867 { 2868 return (boundary - (offset % boundary)); 2869 } 2870 2871 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2872 2873 static void _bdev_rw_split(void *_bdev_io); 2874 2875 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2876 2877 static void 2878 _bdev_unmap_split(void *_bdev_io) 2879 { 2880 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2881 } 2882 2883 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2884 2885 static void 2886 _bdev_write_zeroes_split(void *_bdev_io) 2887 { 2888 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2889 } 2890 2891 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 2892 2893 static void 2894 _bdev_copy_split(void *_bdev_io) 2895 { 2896 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 2897 } 2898 2899 static int 2900 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2901 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2902 { 2903 int rc; 2904 uint64_t current_offset, current_remaining, current_src_offset; 2905 spdk_bdev_io_wait_cb io_wait_fn; 2906 2907 current_offset = *offset; 2908 current_remaining = *remaining; 2909 2910 bdev_io->u.bdev.split_outstanding++; 2911 2912 io_wait_fn = 
_bdev_rw_split; 2913 switch (bdev_io->type) { 2914 case SPDK_BDEV_IO_TYPE_READ: 2915 assert(bdev_io->u.bdev.accel_sequence == NULL); 2916 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2917 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2918 iov, iovcnt, md_buf, current_offset, 2919 num_blocks, bdev_io->internal.memory_domain, 2920 bdev_io->internal.memory_domain_ctx, NULL, 2921 bdev_io_split_done, bdev_io); 2922 break; 2923 case SPDK_BDEV_IO_TYPE_WRITE: 2924 assert(bdev_io->u.bdev.accel_sequence == NULL); 2925 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2926 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2927 iov, iovcnt, md_buf, current_offset, 2928 num_blocks, bdev_io->internal.memory_domain, 2929 bdev_io->internal.memory_domain_ctx, NULL, 2930 bdev_io_split_done, bdev_io); 2931 break; 2932 case SPDK_BDEV_IO_TYPE_UNMAP: 2933 io_wait_fn = _bdev_unmap_split; 2934 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2935 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2936 current_offset, num_blocks, 2937 bdev_io_split_done, bdev_io); 2938 break; 2939 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2940 io_wait_fn = _bdev_write_zeroes_split; 2941 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2942 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2943 current_offset, num_blocks, 2944 bdev_io_split_done, bdev_io); 2945 break; 2946 case SPDK_BDEV_IO_TYPE_COPY: 2947 io_wait_fn = _bdev_copy_split; 2948 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 2949 (current_offset - bdev_io->u.bdev.offset_blocks); 2950 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 2951 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2952 current_offset, current_src_offset, num_blocks, 2953 bdev_io_split_done, bdev_io); 2954 break; 2955 default: 2956 assert(false); 2957 rc = -EINVAL; 2958 break; 2959 } 2960 2961 if (rc == 0) { 2962 current_offset += num_blocks; 2963 current_remaining -= num_blocks; 2964 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2965 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2966 *offset = current_offset; 2967 *remaining = current_remaining; 2968 } else { 2969 bdev_io->u.bdev.split_outstanding--; 2970 if (rc == -ENOMEM) { 2971 if (bdev_io->u.bdev.split_outstanding == 0) { 2972 /* No I/O is outstanding. Hence we should wait here. */ 2973 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2974 } 2975 } else { 2976 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2977 if (bdev_io->u.bdev.split_outstanding == 0) { 2978 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2979 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2980 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2981 } 2982 } 2983 } 2984 2985 return rc; 2986 } 2987 2988 static void 2989 _bdev_rw_split(void *_bdev_io) 2990 { 2991 struct iovec *parent_iov, *iov; 2992 struct spdk_bdev_io *bdev_io = _bdev_io; 2993 struct spdk_bdev *bdev = bdev_io->bdev; 2994 uint64_t parent_offset, current_offset, remaining; 2995 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2996 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2997 uint32_t iovcnt, iov_len, child_iovsize; 2998 uint32_t blocklen = bdev->blocklen; 2999 uint32_t io_boundary; 3000 uint32_t max_segment_size = bdev->max_segment_size; 3001 uint32_t max_child_iovcnt = bdev->max_num_segments; 3002 void *md_buf = NULL; 3003 int rc; 3004 3005 max_segment_size = max_segment_size ? 
max_segment_size : UINT32_MAX; 3006 max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 3007 SPDK_BDEV_IO_NUM_CHILD_IOV; 3008 3009 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 3010 io_boundary = bdev->write_unit_size; 3011 } else if (bdev->split_on_optimal_io_boundary) { 3012 io_boundary = bdev->optimal_io_boundary; 3013 } else { 3014 io_boundary = UINT32_MAX; 3015 } 3016 3017 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3018 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 3019 parent_offset = bdev_io->u.bdev.offset_blocks; 3020 parent_iov_offset = (current_offset - parent_offset) * blocklen; 3021 parent_iovcnt = bdev_io->u.bdev.iovcnt; 3022 3023 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 3024 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3025 if (parent_iov_offset < parent_iov->iov_len) { 3026 break; 3027 } 3028 parent_iov_offset -= parent_iov->iov_len; 3029 } 3030 3031 child_iovcnt = 0; 3032 while (remaining > 0 && parent_iovpos < parent_iovcnt && 3033 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 3034 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3035 to_next_boundary = spdk_min(remaining, to_next_boundary); 3036 to_next_boundary_bytes = to_next_boundary * blocklen; 3037 3038 iov = &bdev_io->child_iov[child_iovcnt]; 3039 iovcnt = 0; 3040 3041 if (bdev_io->u.bdev.md_buf) { 3042 md_buf = (char *)bdev_io->u.bdev.md_buf + 3043 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3044 } 3045 3046 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3047 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3048 iovcnt < child_iovsize) { 3049 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3050 iov_len = parent_iov->iov_len - parent_iov_offset; 3051 3052 iov_len = spdk_min(iov_len, max_segment_size); 3053 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 3054 to_next_boundary_bytes -= iov_len; 3055 3056 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 3057 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 3058 3059 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 3060 parent_iov_offset += iov_len; 3061 } else { 3062 parent_iovpos++; 3063 parent_iov_offset = 0; 3064 } 3065 child_iovcnt++; 3066 iovcnt++; 3067 } 3068 3069 if (to_next_boundary_bytes > 0) { 3070 /* We had to stop this child I/O early because we ran out of 3071 * child_iov space or were limited by max_num_segments. 3072 * Ensure the iovs to be aligned with block size and 3073 * then adjust to_next_boundary before starting the 3074 * child I/O. 
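 * The trailing partial block is trimmed from the child iovs below and the
 * corresponding blocks are subtracted from to_next_boundary, so they are
 * carried over to the next child I/O of this split.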
3075 */
3076 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV ||
3077 iovcnt == child_iovsize);
3078 to_last_block_bytes = to_next_boundary_bytes % blocklen;
3079 if (to_last_block_bytes != 0) {
3080 uint32_t child_iovpos = child_iovcnt - 1;
3081 /* don't decrease child_iovcnt when it equals SPDK_BDEV_IO_NUM_CHILD_IOV,
3082 * so the loop will naturally end
3083 */
3084 
3085 to_last_block_bytes = blocklen - to_last_block_bytes;
3086 to_next_boundary_bytes += to_last_block_bytes;
3087 while (to_last_block_bytes > 0 && iovcnt > 0) {
3088 iov_len = spdk_min(to_last_block_bytes,
3089 bdev_io->child_iov[child_iovpos].iov_len);
3090 bdev_io->child_iov[child_iovpos].iov_len -= iov_len;
3091 if (bdev_io->child_iov[child_iovpos].iov_len == 0) {
3092 child_iovpos--;
3093 if (--iovcnt == 0) {
3094 /* If this child IO covers less than a block, just return; it is
3095 * retried when outstanding children complete. If it is the first
3096 * child IO of the split round, fail the parent IO instead.
3097 */
3098 if (bdev_io->u.bdev.split_outstanding == 0) {
3099 SPDK_ERRLOG("The first child io was less than a block size\n");
3100 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
3101 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx);
3102 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
3103 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
3104 }
3105 
3106 return;
3107 }
3108 }
3109 
3110 to_last_block_bytes -= iov_len;
3111 
3112 if (parent_iov_offset == 0) {
3113 parent_iovpos--;
3114 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len;
3115 }
3116 parent_iov_offset -= iov_len;
3117 }
3118 
3119 assert(to_last_block_bytes == 0);
3120 }
3121 to_next_boundary -= to_next_boundary_bytes / blocklen;
3122 }
3123 
3124 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary,
3125 &current_offset, &remaining);
3126 if (spdk_unlikely(rc)) {
3127 return;
3128 }
3129 }
3130 }
3131 
3132 static void
3133 bdev_unmap_split(struct spdk_bdev_io *bdev_io)
3134 {
3135 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks;
3136 uint32_t num_children_reqs = 0;
3137 int rc;
3138 
3139 offset = bdev_io->u.bdev.split_current_offset_blocks;
3140 remaining = bdev_io->u.bdev.split_remaining_num_blocks;
3141 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments;
3142 
3143 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
3144 unmap_blocks = spdk_min(remaining, max_unmap_blocks);
3145 
3146 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks,
3147 &offset, &remaining);
3148 if (spdk_likely(rc == 0)) {
3149 num_children_reqs++;
3150 } else {
3151 return;
3152 }
3153 }
3154 }
3155 
3156 static void
3157 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io)
3158 {
3159 uint64_t offset, write_zeroes_blocks, remaining;
3160 uint32_t num_children_reqs = 0;
3161 int rc;
3162 
3163 offset = bdev_io->u.bdev.split_current_offset_blocks;
3164 remaining = bdev_io->u.bdev.split_remaining_num_blocks;
3165 
3166 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
3167 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes);
3168 
3169 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks,
3170 &offset, &remaining);
3171 if (spdk_likely(rc == 0)) {
3172 num_children_reqs++;
3173 } else {
3174 return;
3175 }
3176 }
3177 }
3178 
3179 static void
3180 bdev_copy_split(struct spdk_bdev_io *bdev_io)
3181 {
3182 uint64_t offset,
copy_blocks, remaining; 3183 uint32_t num_children_reqs = 0; 3184 int rc; 3185 3186 offset = bdev_io->u.bdev.split_current_offset_blocks; 3187 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3188 3189 assert(bdev_io->bdev->max_copy != 0); 3190 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3191 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3192 3193 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3194 &offset, &remaining); 3195 if (spdk_likely(rc == 0)) { 3196 num_children_reqs++; 3197 } else { 3198 return; 3199 } 3200 } 3201 } 3202 3203 static void 3204 parent_bdev_io_complete(void *ctx, int rc) 3205 { 3206 struct spdk_bdev_io *parent_io = ctx; 3207 3208 if (rc) { 3209 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3210 } 3211 3212 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3213 parent_io->internal.caller_ctx); 3214 } 3215 3216 static void 3217 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3218 { 3219 struct spdk_bdev_io *bdev_io = ctx; 3220 3221 /* u.bdev.accel_sequence should have already been cleared at this point */ 3222 assert(bdev_io->u.bdev.accel_sequence == NULL); 3223 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3224 bdev_io->internal.accel_sequence = NULL; 3225 3226 if (spdk_unlikely(status != 0)) { 3227 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3228 } 3229 3230 parent_bdev_io_complete(bdev_io, status); 3231 } 3232 3233 static void 3234 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3235 { 3236 struct spdk_bdev_io *parent_io = cb_arg; 3237 3238 spdk_bdev_free_io(bdev_io); 3239 3240 if (!success) { 3241 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3242 /* If any child I/O failed, stop further splitting process. */ 3243 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 3244 parent_io->u.bdev.split_remaining_num_blocks = 0; 3245 } 3246 parent_io->u.bdev.split_outstanding--; 3247 if (parent_io->u.bdev.split_outstanding != 0) { 3248 return; 3249 } 3250 3251 /* 3252 * Parent I/O finishes when all blocks are consumed. 3253 */ 3254 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 3255 assert(parent_io->internal.cb != bdev_io_split_done); 3256 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 3257 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 3258 3259 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3260 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3261 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3262 return; 3263 } else if (parent_io->internal.orig_iovcnt != 0) { 3264 /* bdev IO will be completed in the callback */ 3265 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3266 return; 3267 } 3268 } 3269 3270 parent_bdev_io_complete(parent_io, 0); 3271 return; 3272 } 3273 3274 /* 3275 * Continue with the splitting process. This function will complete the parent I/O if the 3276 * splitting is done. 
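 * Each *_split() helper below submits further children with
 * bdev_io_split_done() as their completion callback, so this function is
 * re-entered until split_remaining_num_blocks reaches zero.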
3277 */ 3278 switch (parent_io->type) { 3279 case SPDK_BDEV_IO_TYPE_READ: 3280 case SPDK_BDEV_IO_TYPE_WRITE: 3281 _bdev_rw_split(parent_io); 3282 break; 3283 case SPDK_BDEV_IO_TYPE_UNMAP: 3284 bdev_unmap_split(parent_io); 3285 break; 3286 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3287 bdev_write_zeroes_split(parent_io); 3288 break; 3289 case SPDK_BDEV_IO_TYPE_COPY: 3290 bdev_copy_split(parent_io); 3291 break; 3292 default: 3293 assert(false); 3294 break; 3295 } 3296 } 3297 3298 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3299 bool success); 3300 3301 static void 3302 bdev_io_split(struct spdk_bdev_io *bdev_io) 3303 { 3304 assert(bdev_io_should_split(bdev_io)); 3305 3306 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3307 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3308 bdev_io->u.bdev.split_outstanding = 0; 3309 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3310 3311 switch (bdev_io->type) { 3312 case SPDK_BDEV_IO_TYPE_READ: 3313 case SPDK_BDEV_IO_TYPE_WRITE: 3314 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3315 _bdev_rw_split(bdev_io); 3316 } else { 3317 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3318 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3319 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3320 } 3321 break; 3322 case SPDK_BDEV_IO_TYPE_UNMAP: 3323 bdev_unmap_split(bdev_io); 3324 break; 3325 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3326 bdev_write_zeroes_split(bdev_io); 3327 break; 3328 case SPDK_BDEV_IO_TYPE_COPY: 3329 bdev_copy_split(bdev_io); 3330 break; 3331 default: 3332 assert(false); 3333 break; 3334 } 3335 } 3336 3337 static void 3338 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3339 { 3340 if (!success) { 3341 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3342 return; 3343 } 3344 3345 _bdev_rw_split(bdev_io); 3346 } 3347 3348 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 3349 * be inlined, at least on some compilers. 
3350 */ 3351 static inline void 3352 _bdev_io_submit(void *ctx) 3353 { 3354 struct spdk_bdev_io *bdev_io = ctx; 3355 struct spdk_bdev *bdev = bdev_io->bdev; 3356 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3357 3358 if (spdk_likely(bdev_ch->flags == 0)) { 3359 bdev_io_do_submit(bdev_ch, bdev_io); 3360 return; 3361 } 3362 3363 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3364 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3365 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3366 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3367 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 3368 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3369 } else { 3370 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 3371 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3372 } 3373 } else { 3374 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3375 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3376 } 3377 } 3378 3379 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3380 3381 bool 3382 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3383 { 3384 if (range1->length == 0 || range2->length == 0) { 3385 return false; 3386 } 3387 3388 if (range1->offset + range1->length <= range2->offset) { 3389 return false; 3390 } 3391 3392 if (range2->offset + range2->length <= range1->offset) { 3393 return false; 3394 } 3395 3396 return true; 3397 } 3398 3399 static bool 3400 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3401 { 3402 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3403 struct lba_range r; 3404 3405 switch (bdev_io->type) { 3406 case SPDK_BDEV_IO_TYPE_NVME_IO: 3407 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3408 /* Don't try to decode the NVMe command - just assume worst-case and that 3409 * it overlaps a locked range. 3410 */ 3411 return true; 3412 case SPDK_BDEV_IO_TYPE_WRITE: 3413 case SPDK_BDEV_IO_TYPE_UNMAP: 3414 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3415 case SPDK_BDEV_IO_TYPE_ZCOPY: 3416 case SPDK_BDEV_IO_TYPE_COPY: 3417 r.offset = bdev_io->u.bdev.offset_blocks; 3418 r.length = bdev_io->u.bdev.num_blocks; 3419 if (!bdev_lba_range_overlapped(range, &r)) { 3420 /* This I/O doesn't overlap the specified LBA range. */ 3421 return false; 3422 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3423 /* This I/O overlaps, but the I/O is on the same channel that locked this 3424 * range, and the caller_ctx is the same as the locked_ctx. This means 3425 * that this I/O is associated with the lock, and is allowed to execute. 
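 * Any other overlapping I/O is parked on the channel's io_locked queue by
 * bdev_io_submit() until the locked range is released.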
3426 */ 3427 return false; 3428 } else { 3429 return true; 3430 } 3431 default: 3432 return false; 3433 } 3434 } 3435 3436 void 3437 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3438 { 3439 struct spdk_bdev *bdev = bdev_io->bdev; 3440 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 3441 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3442 3443 assert(thread != NULL); 3444 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3445 3446 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3447 struct lba_range *range; 3448 3449 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3450 if (bdev_io_range_is_locked(bdev_io, range)) { 3451 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3452 return; 3453 } 3454 } 3455 } 3456 3457 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 3458 3459 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3460 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 3461 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3462 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 3463 spdk_bdev_get_name(bdev)); 3464 3465 if (bdev_io->internal.split) { 3466 bdev_io_split(bdev_io); 3467 return; 3468 } 3469 3470 if (ch->flags & BDEV_CH_QOS_ENABLED) { 3471 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 3472 _bdev_io_submit(bdev_io); 3473 } else { 3474 bdev_io->internal.io_submit_ch = ch; 3475 bdev_io->internal.ch = bdev->internal.qos->ch; 3476 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 3477 } 3478 } else { 3479 _bdev_io_submit(bdev_io); 3480 } 3481 } 3482 3483 static inline void 3484 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3485 { 3486 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 3487 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3488 * For write operation we need to pull buffers from memory domain before submitting IO. 3489 * Once read operation completes, we need to use memory_domain push functionality to 3490 * update data in original memory domain IO buffer 3491 * This IO request will go through a regular IO flow, so clear memory domains pointers */ 3492 bdev_io->u.bdev.memory_domain = NULL; 3493 bdev_io->u.bdev.memory_domain_ctx = NULL; 3494 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3495 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3496 } 3497 3498 static inline void 3499 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3500 { 3501 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3502 bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io); 3503 3504 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3505 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3506 bdev_io_complete_unsubmitted(bdev_io); 3507 return; 3508 } 3509 3510 /* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does 3511 * support them, but we need to execute an accel sequence and the data buffer is from accel 3512 * memory domain (to avoid doing a push/pull from that domain). 
3513 */ 3514 if ((bdev_io->internal.memory_domain && !desc->memory_domains_supported) || 3515 (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3516 _bdev_io_ext_use_bounce_buffer(bdev_io); 3517 return; 3518 } 3519 3520 if (needs_exec) { 3521 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3522 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3523 return; 3524 } 3525 /* For reads we'll execute the sequence after the data is read, so, for now, only 3526 * clear out accel_sequence pointer and submit the IO */ 3527 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3528 bdev_io->u.bdev.accel_sequence = NULL; 3529 } 3530 3531 bdev_io_submit(bdev_io); 3532 } 3533 3534 static void 3535 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3536 { 3537 struct spdk_bdev *bdev = bdev_io->bdev; 3538 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3539 struct spdk_io_channel *ch = bdev_ch->channel; 3540 3541 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3542 3543 bdev_io->internal.in_submit_request = true; 3544 bdev_submit_request(bdev, ch, bdev_io); 3545 bdev_io->internal.in_submit_request = false; 3546 } 3547 3548 void 3549 bdev_io_init(struct spdk_bdev_io *bdev_io, 3550 struct spdk_bdev *bdev, void *cb_arg, 3551 spdk_bdev_io_completion_cb cb) 3552 { 3553 bdev_io->bdev = bdev; 3554 bdev_io->internal.caller_ctx = cb_arg; 3555 bdev_io->internal.cb = cb; 3556 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3557 bdev_io->internal.in_submit_request = false; 3558 bdev_io->internal.buf = NULL; 3559 bdev_io->internal.io_submit_ch = NULL; 3560 bdev_io->internal.orig_iovs = NULL; 3561 bdev_io->internal.orig_iovcnt = 0; 3562 bdev_io->internal.orig_md_iov.iov_base = NULL; 3563 bdev_io->internal.error.nvme.cdw0 = 0; 3564 bdev_io->num_retries = 0; 3565 bdev_io->internal.get_buf_cb = NULL; 3566 bdev_io->internal.get_aux_buf_cb = NULL; 3567 bdev_io->internal.memory_domain = NULL; 3568 bdev_io->internal.memory_domain_ctx = NULL; 3569 bdev_io->internal.data_transfer_cpl = NULL; 3570 bdev_io->internal.split = bdev_io_should_split(bdev_io); 3571 bdev_io->internal.accel_sequence = NULL; 3572 } 3573 3574 static bool 3575 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3576 { 3577 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3578 } 3579 3580 bool 3581 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3582 { 3583 bool supported; 3584 3585 supported = bdev_io_type_supported(bdev, io_type); 3586 3587 if (!supported) { 3588 switch (io_type) { 3589 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3590 /* The bdev layer will emulate write zeroes as long as write is supported. 
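 * (The emulation path is expected to issue plain writes backed by the
 * preallocated zero_buffer; this check only reports the capability.)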
*/ 3591 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3592 break; 3593 default: 3594 break; 3595 } 3596 } 3597 3598 return supported; 3599 } 3600 3601 uint64_t 3602 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3603 { 3604 return bdev_io->internal.submit_tsc; 3605 } 3606 3607 int 3608 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3609 { 3610 if (bdev->fn_table->dump_info_json) { 3611 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3612 } 3613 3614 return 0; 3615 } 3616 3617 static void 3618 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3619 { 3620 uint32_t max_per_timeslice = 0; 3621 int i; 3622 3623 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3624 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3625 qos->rate_limits[i].max_per_timeslice = 0; 3626 continue; 3627 } 3628 3629 max_per_timeslice = qos->rate_limits[i].limit * 3630 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3631 3632 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3633 qos->rate_limits[i].min_per_timeslice); 3634 3635 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 3636 } 3637 3638 bdev_qos_set_ops(qos); 3639 } 3640 3641 static int 3642 bdev_channel_poll_qos(void *arg) 3643 { 3644 struct spdk_bdev_qos *qos = arg; 3645 uint64_t now = spdk_get_ticks(); 3646 int i; 3647 3648 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3649 /* We received our callback earlier than expected - return 3650 * immediately and wait to do accounting until at least one 3651 * timeslice has actually expired. This should never happen 3652 * with a well-behaved timer implementation. 3653 */ 3654 return SPDK_POLLER_IDLE; 3655 } 3656 3657 /* Reset for next round of rate limiting */ 3658 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3659 /* We may have allowed the IOs or bytes to slightly overrun in the last 3660 * timeslice. remaining_this_timeslice is signed, so if it's negative 3661 * here, we'll account for the overrun so that the next timeslice will 3662 * be appropriately reduced. 
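 * Illustration with assumed numbers: if max_per_timeslice is 100 and the last
 * I/O left remaining_this_timeslice at -30, the refill below yields
 * 100 + (-30) = 70 for the next timeslice, amortizing the overrun.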
3663 */ 3664 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 3665 qos->rate_limits[i].remaining_this_timeslice = 0; 3666 } 3667 } 3668 3669 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3670 qos->last_timeslice += qos->timeslice_size; 3671 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3672 qos->rate_limits[i].remaining_this_timeslice += 3673 qos->rate_limits[i].max_per_timeslice; 3674 } 3675 } 3676 3677 return bdev_qos_io_submit(qos->ch, qos); 3678 } 3679 3680 static void 3681 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3682 { 3683 struct spdk_bdev_shared_resource *shared_resource; 3684 struct lba_range *range; 3685 3686 bdev_free_io_stat(ch->stat); 3687 #ifdef SPDK_CONFIG_VTUNE 3688 bdev_free_io_stat(ch->prev_stat); 3689 #endif 3690 3691 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3692 range = TAILQ_FIRST(&ch->locked_ranges); 3693 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3694 free(range); 3695 } 3696 3697 spdk_put_io_channel(ch->channel); 3698 spdk_put_io_channel(ch->accel_channel); 3699 3700 shared_resource = ch->shared_resource; 3701 3702 assert(TAILQ_EMPTY(&ch->io_locked)); 3703 assert(TAILQ_EMPTY(&ch->io_submitted)); 3704 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 3705 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 3706 assert(ch->io_outstanding == 0); 3707 assert(shared_resource->ref > 0); 3708 shared_resource->ref--; 3709 if (shared_resource->ref == 0) { 3710 assert(shared_resource->io_outstanding == 0); 3711 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3712 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3713 free(shared_resource); 3714 } 3715 } 3716 3717 static void 3718 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3719 { 3720 struct spdk_bdev_qos *qos = bdev->internal.qos; 3721 int i; 3722 3723 assert(spdk_spin_held(&bdev->internal.spinlock)); 3724 3725 /* Rate limiting on this bdev enabled */ 3726 if (qos) { 3727 if (qos->ch == NULL) { 3728 struct spdk_io_channel *io_ch; 3729 3730 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3731 bdev->name, spdk_get_thread()); 3732 3733 /* No qos channel has been selected, so set one up */ 3734 3735 /* Take another reference to ch */ 3736 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3737 assert(io_ch != NULL); 3738 qos->ch = ch; 3739 3740 qos->thread = spdk_io_channel_get_thread(io_ch); 3741 3742 TAILQ_INIT(&qos->queued); 3743 3744 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3745 if (bdev_qos_is_iops_rate_limit(i) == true) { 3746 qos->rate_limits[i].min_per_timeslice = 3747 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3748 } else { 3749 qos->rate_limits[i].min_per_timeslice = 3750 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3751 } 3752 3753 if (qos->rate_limits[i].limit == 0) { 3754 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3755 } 3756 } 3757 bdev_qos_update_max_quota_per_timeslice(qos); 3758 qos->timeslice_size = 3759 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3760 qos->last_timeslice = spdk_get_ticks(); 3761 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3762 qos, 3763 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3764 } 3765 3766 ch->flags |= BDEV_CH_QOS_ENABLED; 3767 } 3768 } 3769 3770 struct poll_timeout_ctx { 3771 struct spdk_bdev_desc *desc; 3772 uint64_t timeout_in_sec; 3773 spdk_bdev_io_timeout_cb cb_fn; 3774 void *cb_arg; 3775 }; 3776 3777 static void 3778 bdev_desc_free(struct spdk_bdev_desc 
*desc) 3779 { 3780 spdk_spin_destroy(&desc->spinlock); 3781 free(desc->media_events_buffer); 3782 free(desc); 3783 } 3784 3785 static void 3786 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 3787 { 3788 struct poll_timeout_ctx *ctx = _ctx; 3789 struct spdk_bdev_desc *desc = ctx->desc; 3790 3791 free(ctx); 3792 3793 spdk_spin_lock(&desc->spinlock); 3794 desc->refs--; 3795 if (desc->closed == true && desc->refs == 0) { 3796 spdk_spin_unlock(&desc->spinlock); 3797 bdev_desc_free(desc); 3798 return; 3799 } 3800 spdk_spin_unlock(&desc->spinlock); 3801 } 3802 3803 static void 3804 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3805 struct spdk_io_channel *io_ch, void *_ctx) 3806 { 3807 struct poll_timeout_ctx *ctx = _ctx; 3808 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3809 struct spdk_bdev_desc *desc = ctx->desc; 3810 struct spdk_bdev_io *bdev_io; 3811 uint64_t now; 3812 3813 spdk_spin_lock(&desc->spinlock); 3814 if (desc->closed == true) { 3815 spdk_spin_unlock(&desc->spinlock); 3816 spdk_bdev_for_each_channel_continue(i, -1); 3817 return; 3818 } 3819 spdk_spin_unlock(&desc->spinlock); 3820 3821 now = spdk_get_ticks(); 3822 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3823 /* Exclude any I/O that are generated via splitting. */ 3824 if (bdev_io->internal.cb == bdev_io_split_done) { 3825 continue; 3826 } 3827 3828 /* Once we find an I/O that has not timed out, we can immediately 3829 * exit the loop. 3830 */ 3831 if (now < (bdev_io->internal.submit_tsc + 3832 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3833 goto end; 3834 } 3835 3836 if (bdev_io->internal.desc == desc) { 3837 ctx->cb_fn(ctx->cb_arg, bdev_io); 3838 } 3839 } 3840 3841 end: 3842 spdk_bdev_for_each_channel_continue(i, 0); 3843 } 3844 3845 static int 3846 bdev_poll_timeout_io(void *arg) 3847 { 3848 struct spdk_bdev_desc *desc = arg; 3849 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3850 struct poll_timeout_ctx *ctx; 3851 3852 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3853 if (!ctx) { 3854 SPDK_ERRLOG("failed to allocate memory\n"); 3855 return SPDK_POLLER_BUSY; 3856 } 3857 ctx->desc = desc; 3858 ctx->cb_arg = desc->cb_arg; 3859 ctx->cb_fn = desc->cb_fn; 3860 ctx->timeout_in_sec = desc->timeout_in_sec; 3861 3862 /* Take a ref on the descriptor in case it gets closed while we are checking 3863 * all of the channels. 
3864 */ 3865 spdk_spin_lock(&desc->spinlock); 3866 desc->refs++; 3867 spdk_spin_unlock(&desc->spinlock); 3868 3869 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 3870 bdev_channel_poll_timeout_io_done); 3871 3872 return SPDK_POLLER_BUSY; 3873 } 3874 3875 int 3876 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3877 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3878 { 3879 assert(desc->thread == spdk_get_thread()); 3880 3881 spdk_poller_unregister(&desc->io_timeout_poller); 3882 3883 if (timeout_in_sec) { 3884 assert(cb_fn != NULL); 3885 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3886 desc, 3887 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3888 1000); 3889 if (desc->io_timeout_poller == NULL) { 3890 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3891 return -1; 3892 } 3893 } 3894 3895 desc->cb_fn = cb_fn; 3896 desc->cb_arg = cb_arg; 3897 desc->timeout_in_sec = timeout_in_sec; 3898 3899 return 0; 3900 } 3901 3902 static int 3903 bdev_channel_create(void *io_device, void *ctx_buf) 3904 { 3905 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3906 struct spdk_bdev_channel *ch = ctx_buf; 3907 struct spdk_io_channel *mgmt_io_ch; 3908 struct spdk_bdev_mgmt_channel *mgmt_ch; 3909 struct spdk_bdev_shared_resource *shared_resource; 3910 struct lba_range *range; 3911 3912 ch->bdev = bdev; 3913 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3914 if (!ch->channel) { 3915 return -1; 3916 } 3917 3918 ch->accel_channel = spdk_accel_get_io_channel(); 3919 if (!ch->accel_channel) { 3920 spdk_put_io_channel(ch->channel); 3921 return -1; 3922 } 3923 3924 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3925 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3926 3927 assert(ch->histogram == NULL); 3928 if (bdev->internal.histogram_enabled) { 3929 ch->histogram = spdk_histogram_data_alloc(); 3930 if (ch->histogram == NULL) { 3931 SPDK_ERRLOG("Could not allocate histogram\n"); 3932 } 3933 } 3934 3935 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3936 if (!mgmt_io_ch) { 3937 spdk_put_io_channel(ch->channel); 3938 spdk_put_io_channel(ch->accel_channel); 3939 return -1; 3940 } 3941 3942 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 3943 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3944 if (shared_resource->shared_ch == ch->channel) { 3945 spdk_put_io_channel(mgmt_io_ch); 3946 shared_resource->ref++; 3947 break; 3948 } 3949 } 3950 3951 if (shared_resource == NULL) { 3952 shared_resource = calloc(1, sizeof(*shared_resource)); 3953 if (shared_resource == NULL) { 3954 spdk_put_io_channel(ch->channel); 3955 spdk_put_io_channel(ch->accel_channel); 3956 spdk_put_io_channel(mgmt_io_ch); 3957 return -1; 3958 } 3959 3960 shared_resource->mgmt_ch = mgmt_ch; 3961 shared_resource->io_outstanding = 0; 3962 TAILQ_INIT(&shared_resource->nomem_io); 3963 shared_resource->nomem_threshold = 0; 3964 shared_resource->shared_ch = ch->channel; 3965 shared_resource->ref = 1; 3966 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3967 } 3968 3969 ch->io_outstanding = 0; 3970 TAILQ_INIT(&ch->queued_resets); 3971 TAILQ_INIT(&ch->locked_ranges); 3972 ch->flags = 0; 3973 ch->shared_resource = shared_resource; 3974 3975 TAILQ_INIT(&ch->io_submitted); 3976 TAILQ_INIT(&ch->io_locked); 3977 TAILQ_INIT(&ch->io_accel_exec); 3978 TAILQ_INIT(&ch->io_memory_domain); 3979 3980 ch->stat = bdev_alloc_io_stat(false); 3981 if (ch->stat == NULL) { 3982 
bdev_channel_destroy_resource(ch); 3983 return -1; 3984 } 3985 3986 ch->stat->ticks_rate = spdk_get_ticks_hz(); 3987 3988 #ifdef SPDK_CONFIG_VTUNE 3989 { 3990 char *name; 3991 __itt_init_ittlib(NULL, 0); 3992 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3993 if (!name) { 3994 bdev_channel_destroy_resource(ch); 3995 return -1; 3996 } 3997 ch->handle = __itt_string_handle_create(name); 3998 free(name); 3999 ch->start_tsc = spdk_get_ticks(); 4000 ch->interval_tsc = spdk_get_ticks_hz() / 100; 4001 ch->prev_stat = bdev_alloc_io_stat(false); 4002 if (ch->prev_stat == NULL) { 4003 bdev_channel_destroy_resource(ch); 4004 return -1; 4005 } 4006 } 4007 #endif 4008 4009 spdk_spin_lock(&bdev->internal.spinlock); 4010 bdev_enable_qos(bdev, ch); 4011 4012 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 4013 struct lba_range *new_range; 4014 4015 new_range = calloc(1, sizeof(*new_range)); 4016 if (new_range == NULL) { 4017 spdk_spin_unlock(&bdev->internal.spinlock); 4018 bdev_channel_destroy_resource(ch); 4019 return -1; 4020 } 4021 new_range->length = range->length; 4022 new_range->offset = range->offset; 4023 new_range->locked_ctx = range->locked_ctx; 4024 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 4025 } 4026 4027 spdk_spin_unlock(&bdev->internal.spinlock); 4028 4029 return 0; 4030 } 4031 4032 static int 4033 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 4034 void *cb_ctx) 4035 { 4036 struct spdk_bdev_channel *bdev_ch = cb_ctx; 4037 struct spdk_bdev_io *bdev_io; 4038 uint64_t buf_len; 4039 4040 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4041 if (bdev_io->internal.ch == bdev_ch) { 4042 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4043 spdk_iobuf_entry_abort(ch, entry, buf_len); 4044 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4045 } 4046 4047 return 0; 4048 } 4049 4050 /* 4051 * Abort I/O that are waiting on a data buffer. 4052 */ 4053 static void 4054 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4055 { 4056 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4057 bdev_abort_all_buf_io_cb, ch); 4058 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4059 bdev_abort_all_buf_io_cb, ch); 4060 } 4061 4062 /* 4063 * Abort I/O that are queued waiting for submission. These types of I/O are 4064 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4065 */ 4066 static void 4067 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4068 { 4069 struct spdk_bdev_io *bdev_io, *tmp; 4070 4071 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4072 if (bdev_io->internal.ch == ch) { 4073 TAILQ_REMOVE(queue, bdev_io, internal.link); 4074 /* 4075 * spdk_bdev_io_complete() assumes that the completed I/O had 4076 * been submitted to the bdev module. Since in this case it 4077 * hadn't, bump io_outstanding to account for the decrement 4078 * that spdk_bdev_io_complete() will do. 
4079 */ 4080 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4081 ch->io_outstanding++; 4082 ch->shared_resource->io_outstanding++; 4083 } 4084 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4085 } 4086 } 4087 } 4088 4089 static bool 4090 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4091 { 4092 struct spdk_bdev_io *bdev_io; 4093 4094 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4095 if (bdev_io == bio_to_abort) { 4096 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4097 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4098 return true; 4099 } 4100 } 4101 4102 return false; 4103 } 4104 4105 static int 4106 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4107 { 4108 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4109 uint64_t buf_len; 4110 4111 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4112 if (bdev_io == bio_to_abort) { 4113 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4114 spdk_iobuf_entry_abort(ch, entry, buf_len); 4115 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4116 return 1; 4117 } 4118 4119 return 0; 4120 } 4121 4122 static bool 4123 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4124 { 4125 int rc; 4126 4127 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4128 bdev_abort_buf_io_cb, bio_to_abort); 4129 if (rc == 1) { 4130 return true; 4131 } 4132 4133 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4134 bdev_abort_buf_io_cb, bio_to_abort); 4135 return rc == 1; 4136 } 4137 4138 static void 4139 bdev_qos_channel_destroy(void *cb_arg) 4140 { 4141 struct spdk_bdev_qos *qos = cb_arg; 4142 4143 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4144 spdk_poller_unregister(&qos->poller); 4145 4146 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4147 4148 free(qos); 4149 } 4150 4151 static int 4152 bdev_qos_destroy(struct spdk_bdev *bdev) 4153 { 4154 int i; 4155 4156 /* 4157 * Cleanly shutting down the QoS poller is tricky, because 4158 * during the asynchronous operation the user could open 4159 * a new descriptor and create a new channel, spawning 4160 * a new QoS poller. 4161 * 4162 * The strategy is to create a new QoS structure here and swap it 4163 * in. The shutdown path then continues to refer to the old one 4164 * until it completes and then releases it. 4165 */ 4166 struct spdk_bdev_qos *new_qos, *old_qos; 4167 4168 old_qos = bdev->internal.qos; 4169 4170 new_qos = calloc(1, sizeof(*new_qos)); 4171 if (!new_qos) { 4172 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4173 return -ENOMEM; 4174 } 4175 4176 /* Copy the old QoS data into the newly allocated structure */ 4177 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4178 4179 /* Zero out the key parts of the QoS structure */ 4180 new_qos->ch = NULL; 4181 new_qos->thread = NULL; 4182 new_qos->poller = NULL; 4183 TAILQ_INIT(&new_qos->queued); 4184 /* 4185 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4186 * It will be used later for the new QoS structure. 
4187 */ 4188 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4189 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4190 new_qos->rate_limits[i].min_per_timeslice = 0; 4191 new_qos->rate_limits[i].max_per_timeslice = 0; 4192 } 4193 4194 bdev->internal.qos = new_qos; 4195 4196 if (old_qos->thread == NULL) { 4197 free(old_qos); 4198 } else { 4199 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4200 } 4201 4202 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4203 * been destroyed yet. The destruction path will end up waiting for the final 4204 * channel to be put before it releases resources. */ 4205 4206 return 0; 4207 } 4208 4209 void 4210 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4211 { 4212 total->bytes_read += add->bytes_read; 4213 total->num_read_ops += add->num_read_ops; 4214 total->bytes_written += add->bytes_written; 4215 total->num_write_ops += add->num_write_ops; 4216 total->bytes_unmapped += add->bytes_unmapped; 4217 total->num_unmap_ops += add->num_unmap_ops; 4218 total->bytes_copied += add->bytes_copied; 4219 total->num_copy_ops += add->num_copy_ops; 4220 total->read_latency_ticks += add->read_latency_ticks; 4221 total->write_latency_ticks += add->write_latency_ticks; 4222 total->unmap_latency_ticks += add->unmap_latency_ticks; 4223 total->copy_latency_ticks += add->copy_latency_ticks; 4224 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4225 total->max_read_latency_ticks = add->max_read_latency_ticks; 4226 } 4227 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4228 total->min_read_latency_ticks = add->min_read_latency_ticks; 4229 } 4230 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4231 total->max_write_latency_ticks = add->max_write_latency_ticks; 4232 } 4233 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4234 total->min_write_latency_ticks = add->min_write_latency_ticks; 4235 } 4236 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4237 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4238 } 4239 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4240 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4241 } 4242 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4243 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4244 } 4245 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4246 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4247 } 4248 } 4249 4250 static void 4251 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4252 { 4253 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4254 4255 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4256 memcpy(to_stat->io_error, from_stat->io_error, 4257 sizeof(struct spdk_bdev_io_error_stat)); 4258 } 4259 } 4260 4261 void 4262 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4263 { 4264 stat->max_read_latency_ticks = 0; 4265 stat->min_read_latency_ticks = UINT64_MAX; 4266 stat->max_write_latency_ticks = 0; 4267 stat->min_write_latency_ticks = UINT64_MAX; 4268 stat->max_unmap_latency_ticks = 0; 4269 stat->min_unmap_latency_ticks = UINT64_MAX; 4270 stat->max_copy_latency_ticks = 0; 4271 stat->min_copy_latency_ticks = UINT64_MAX; 4272 4273 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4274 return; 4275 } 
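	/* Only SPDK_BDEV_RESET_STAT_ALL reaches this point: in addition to the
	 * latency watermarks reset above, clear the cumulative byte/op counters
	 * and the per-status error counts below. Other modes return early and
	 * keep those totals intact. */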
4276 4277 stat->bytes_read = 0; 4278 stat->num_read_ops = 0; 4279 stat->bytes_written = 0; 4280 stat->num_write_ops = 0; 4281 stat->bytes_unmapped = 0; 4282 stat->num_unmap_ops = 0; 4283 stat->bytes_copied = 0; 4284 stat->num_copy_ops = 0; 4285 stat->read_latency_ticks = 0; 4286 stat->write_latency_ticks = 0; 4287 stat->unmap_latency_ticks = 0; 4288 stat->copy_latency_ticks = 0; 4289 4290 if (stat->io_error != NULL) { 4291 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 4292 } 4293 } 4294 4295 struct spdk_bdev_io_stat * 4296 bdev_alloc_io_stat(bool io_error_stat) 4297 { 4298 struct spdk_bdev_io_stat *stat; 4299 4300 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4301 if (stat == NULL) { 4302 return NULL; 4303 } 4304 4305 if (io_error_stat) { 4306 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4307 if (stat->io_error == NULL) { 4308 free(stat); 4309 return NULL; 4310 } 4311 } else { 4312 stat->io_error = NULL; 4313 } 4314 4315 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4316 4317 return stat; 4318 } 4319 4320 void 4321 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 4322 { 4323 if (stat != NULL) { 4324 free(stat->io_error); 4325 free(stat); 4326 } 4327 } 4328 4329 void 4330 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4331 { 4332 int i; 4333 4334 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4335 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4336 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4337 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4338 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4339 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4340 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4341 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4342 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4343 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4344 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4345 stat->min_read_latency_ticks != UINT64_MAX ? 4346 stat->min_read_latency_ticks : 0); 4347 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4348 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4349 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4350 stat->min_write_latency_ticks != UINT64_MAX ? 4351 stat->min_write_latency_ticks : 0); 4352 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4353 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4354 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4355 stat->min_unmap_latency_ticks != UINT64_MAX ? 4356 stat->min_unmap_latency_ticks : 0); 4357 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4358 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4359 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4360 stat->min_copy_latency_ticks != UINT64_MAX ? 
					     stat->min_copy_latency_ticks : 0);

	if (stat->io_error != NULL) {
		spdk_json_write_named_object_begin(w, "io_error");
		for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) {
			if (stat->io_error->error_status[i] != 0) {
				spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)),
							     stat->io_error->error_status[i]);
			}
		}
		spdk_json_write_object_end(w);
	}
}

static void
bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch)
{
	struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;
	struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch;

	bdev_abort_all_queued_io(&shared_resource->nomem_io, ch);
	bdev_abort_all_buf_io(mgmt_ch, ch);
}

static void
bdev_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_channel *ch = ctx_buf;

	SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name,
		      spdk_get_thread());

	spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name,
			  spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel)));

	/* This channel is going away, so add its statistics into the bdev so that they don't get lost. */
	spdk_spin_lock(&ch->bdev->internal.spinlock);
	spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat);
	spdk_spin_unlock(&ch->bdev->internal.spinlock);

	bdev_abort_all_queued_io(&ch->queued_resets, ch);

	bdev_channel_abort_queued_ios(ch);

	if (ch->histogram) {
		spdk_histogram_data_free(ch->histogram);
	}

	bdev_channel_destroy_resource(ch);
}

/*
 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer
 * to it. Hence we do not have to call bdev_get_by_name() when using this function.
4416 */ 4417 static int 4418 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4419 { 4420 struct spdk_bdev_name *tmp; 4421 4422 bdev_name->name = strdup(name); 4423 if (bdev_name->name == NULL) { 4424 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4425 return -ENOMEM; 4426 } 4427 4428 bdev_name->bdev = bdev; 4429 4430 spdk_spin_lock(&g_bdev_mgr.spinlock); 4431 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4432 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4433 4434 if (tmp != NULL) { 4435 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4436 free(bdev_name->name); 4437 return -EEXIST; 4438 } 4439 4440 return 0; 4441 } 4442 4443 static void 4444 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4445 { 4446 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4447 free(bdev_name->name); 4448 } 4449 4450 static void 4451 bdev_name_del(struct spdk_bdev_name *bdev_name) 4452 { 4453 spdk_spin_lock(&g_bdev_mgr.spinlock); 4454 bdev_name_del_unsafe(bdev_name); 4455 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4456 } 4457 4458 int 4459 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4460 { 4461 struct spdk_bdev_alias *tmp; 4462 int ret; 4463 4464 if (alias == NULL) { 4465 SPDK_ERRLOG("Empty alias passed\n"); 4466 return -EINVAL; 4467 } 4468 4469 tmp = calloc(1, sizeof(*tmp)); 4470 if (tmp == NULL) { 4471 SPDK_ERRLOG("Unable to allocate alias\n"); 4472 return -ENOMEM; 4473 } 4474 4475 ret = bdev_name_add(&tmp->alias, bdev, alias); 4476 if (ret != 0) { 4477 free(tmp); 4478 return ret; 4479 } 4480 4481 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4482 4483 return 0; 4484 } 4485 4486 static int 4487 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4488 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4489 { 4490 struct spdk_bdev_alias *tmp; 4491 4492 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4493 if (strcmp(alias, tmp->alias.name) == 0) { 4494 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4495 alias_del_fn(&tmp->alias); 4496 free(tmp); 4497 return 0; 4498 } 4499 } 4500 4501 return -ENOENT; 4502 } 4503 4504 int 4505 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4506 { 4507 int rc; 4508 4509 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4510 if (rc == -ENOENT) { 4511 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4512 } 4513 4514 return rc; 4515 } 4516 4517 void 4518 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4519 { 4520 struct spdk_bdev_alias *p, *tmp; 4521 4522 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4523 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4524 bdev_name_del(&p->alias); 4525 free(p); 4526 } 4527 } 4528 4529 struct spdk_io_channel * 4530 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4531 { 4532 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4533 } 4534 4535 void * 4536 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4537 { 4538 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4539 void *ctx = NULL; 4540 4541 if (bdev->fn_table->get_module_ctx) { 4542 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4543 } 4544 4545 return ctx; 4546 } 4547 4548 const char * 4549 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4550 { 4551 return bdev->module->name; 4552 } 4553 4554 const char * 4555 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4556 { 4557 return bdev->name; 4558 } 4559 4560 const char * 4561 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4562 { 4563 return bdev->product_name; 4564 } 4565 4566 
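/*
 * Illustrative sketch of how a caller might drive the alias helpers above; the
 * bdev pointer and the alias string are assumptions made up for this example.
 * spdk_bdev_alias_add() shares the global name tree with bdev names, so it
 * fails with -EEXIST when the string is already taken by any bdev or alias.
 *
 *	int rc = spdk_bdev_alias_add(bdev, "mydisk-alias0");
 *	if (rc == -EEXIST) {
 *		SPDK_ERRLOG("Name mydisk-alias0 is already registered\n");
 *	} else if (rc != 0) {
 *		return rc;
 *	}
 *
 *	// ... later, during teardown, drop a single alias or all of them:
 *	spdk_bdev_alias_del(bdev, "mydisk-alias0");
 *	spdk_bdev_alias_del_all(bdev);
 */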
const struct spdk_bdev_aliases_list * 4567 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4568 { 4569 return &bdev->aliases; 4570 } 4571 4572 uint32_t 4573 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4574 { 4575 return bdev->blocklen; 4576 } 4577 4578 uint32_t 4579 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4580 { 4581 return bdev->write_unit_size; 4582 } 4583 4584 uint64_t 4585 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4586 { 4587 return bdev->blockcnt; 4588 } 4589 4590 const char * 4591 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4592 { 4593 return qos_rpc_type[type]; 4594 } 4595 4596 void 4597 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4598 { 4599 int i; 4600 4601 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4602 4603 spdk_spin_lock(&bdev->internal.spinlock); 4604 if (bdev->internal.qos) { 4605 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4606 if (bdev->internal.qos->rate_limits[i].limit != 4607 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4608 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4609 if (bdev_qos_is_iops_rate_limit(i) == false) { 4610 /* Change from Byte to Megabyte which is user visible. */ 4611 limits[i] = limits[i] / 1024 / 1024; 4612 } 4613 } 4614 } 4615 } 4616 spdk_spin_unlock(&bdev->internal.spinlock); 4617 } 4618 4619 size_t 4620 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4621 { 4622 return 1 << bdev->required_alignment; 4623 } 4624 4625 uint32_t 4626 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4627 { 4628 return bdev->optimal_io_boundary; 4629 } 4630 4631 bool 4632 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4633 { 4634 return bdev->write_cache; 4635 } 4636 4637 const struct spdk_uuid * 4638 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4639 { 4640 return &bdev->uuid; 4641 } 4642 4643 uint16_t 4644 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4645 { 4646 return bdev->acwu; 4647 } 4648 4649 uint32_t 4650 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4651 { 4652 return bdev->md_len; 4653 } 4654 4655 bool 4656 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4657 { 4658 return (bdev->md_len != 0) && bdev->md_interleave; 4659 } 4660 4661 bool 4662 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4663 { 4664 return (bdev->md_len != 0) && !bdev->md_interleave; 4665 } 4666 4667 bool 4668 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4669 { 4670 return bdev->zoned; 4671 } 4672 4673 uint32_t 4674 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4675 { 4676 if (spdk_bdev_is_md_interleaved(bdev)) { 4677 return bdev->blocklen - bdev->md_len; 4678 } else { 4679 return bdev->blocklen; 4680 } 4681 } 4682 4683 uint32_t 4684 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4685 { 4686 return bdev->phys_blocklen; 4687 } 4688 4689 static uint32_t 4690 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4691 { 4692 if (!spdk_bdev_is_md_interleaved(bdev)) { 4693 return bdev->blocklen + bdev->md_len; 4694 } else { 4695 return bdev->blocklen; 4696 } 4697 } 4698 4699 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4700 typedef enum spdk_dif_type spdk_dif_type_t; 4701 4702 spdk_dif_type_t 4703 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4704 { 4705 if (bdev->md_len != 0) { 4706 return bdev->dif_type; 4707 } else { 4708 return SPDK_DIF_DISABLE; 4709 } 4710 } 4711 4712 bool 4713 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4714 { 4715 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4716 return bdev->dif_is_head_of_md; 4717 } else { 4718 return false; 4719 } 4720 } 4721 4722 bool 4723 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4724 enum spdk_dif_check_type check_type) 4725 { 4726 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4727 return false; 4728 } 4729 4730 switch (check_type) { 4731 case SPDK_DIF_CHECK_TYPE_REFTAG: 4732 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4733 case SPDK_DIF_CHECK_TYPE_APPTAG: 4734 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4735 case SPDK_DIF_CHECK_TYPE_GUARD: 4736 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4737 default: 4738 return false; 4739 } 4740 } 4741 4742 static uint32_t 4743 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 4744 { 4745 uint64_t aligned_length, max_write_blocks; 4746 4747 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 4748 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 4749 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 4750 4751 return max_write_blocks; 4752 } 4753 4754 uint32_t 4755 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4756 { 4757 return bdev->max_copy; 4758 } 4759 4760 uint64_t 4761 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4762 { 4763 return bdev->internal.measured_queue_depth; 4764 } 4765 4766 uint64_t 4767 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 4768 { 4769 return bdev->internal.period; 4770 } 4771 4772 uint64_t 4773 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 4774 { 4775 return bdev->internal.weighted_io_time; 4776 } 4777 4778 uint64_t 4779 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 4780 { 4781 return bdev->internal.io_time; 4782 } 4783 4784 static void bdev_update_qd_sampling_period(void *ctx); 4785 4786 static void 4787 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 4788 { 4789 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 4790 4791 if (bdev->internal.measured_queue_depth) { 4792 bdev->internal.io_time += bdev->internal.period; 4793 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 4794 } 4795 4796 bdev->internal.qd_poll_in_progress = false; 4797 4798 bdev_update_qd_sampling_period(bdev); 4799 } 4800 4801 static void 4802 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4803 struct spdk_io_channel *io_ch, void *_ctx) 4804 { 4805 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 4806 4807 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4808 spdk_bdev_for_each_channel_continue(i, 0); 4809 } 4810 4811 static int 4812 bdev_calculate_measured_queue_depth(void *ctx) 4813 { 4814 struct spdk_bdev *bdev = ctx; 4815 4816 bdev->internal.qd_poll_in_progress = true; 4817 bdev->internal.temporary_queue_depth = 0; 4818 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 4819 return SPDK_POLLER_BUSY; 4820 } 4821 4822 static void 4823 bdev_update_qd_sampling_period(void *ctx) 4824 { 4825 
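	/* Apply a pending change to the queue-depth sampling period. If a QD
	 * poll is still in flight, bail out for now; _calculate_measured_qd_cpl()
	 * calls this function again once the poll completes. A new period of 0
	 * tears down the poller and closes the internal descriptor. */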
struct spdk_bdev *bdev = ctx; 4826 4827 if (bdev->internal.period == bdev->internal.new_period) { 4828 return; 4829 } 4830 4831 if (bdev->internal.qd_poll_in_progress) { 4832 return; 4833 } 4834 4835 bdev->internal.period = bdev->internal.new_period; 4836 4837 spdk_poller_unregister(&bdev->internal.qd_poller); 4838 if (bdev->internal.period != 0) { 4839 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4840 bdev, bdev->internal.period); 4841 } else { 4842 spdk_bdev_close(bdev->internal.qd_desc); 4843 bdev->internal.qd_desc = NULL; 4844 } 4845 } 4846 4847 static void 4848 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4849 { 4850 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4851 } 4852 4853 void 4854 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4855 { 4856 int rc; 4857 4858 if (bdev->internal.new_period == period) { 4859 return; 4860 } 4861 4862 bdev->internal.new_period = period; 4863 4864 if (bdev->internal.qd_desc != NULL) { 4865 assert(bdev->internal.period != 0); 4866 4867 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4868 bdev_update_qd_sampling_period, bdev); 4869 return; 4870 } 4871 4872 assert(bdev->internal.period == 0); 4873 4874 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4875 NULL, &bdev->internal.qd_desc); 4876 if (rc != 0) { 4877 return; 4878 } 4879 4880 bdev->internal.period = period; 4881 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4882 bdev, period); 4883 } 4884 4885 struct bdev_get_current_qd_ctx { 4886 uint64_t current_qd; 4887 spdk_bdev_get_current_qd_cb cb_fn; 4888 void *cb_arg; 4889 }; 4890 4891 static void 4892 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 4893 { 4894 struct bdev_get_current_qd_ctx *ctx = _ctx; 4895 4896 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 4897 4898 free(ctx); 4899 } 4900 4901 static void 4902 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4903 struct spdk_io_channel *io_ch, void *_ctx) 4904 { 4905 struct bdev_get_current_qd_ctx *ctx = _ctx; 4906 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4907 4908 ctx->current_qd += bdev_ch->io_outstanding; 4909 4910 spdk_bdev_for_each_channel_continue(i, 0); 4911 } 4912 4913 void 4914 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 4915 void *cb_arg) 4916 { 4917 struct bdev_get_current_qd_ctx *ctx; 4918 4919 assert(cb_fn != NULL); 4920 4921 ctx = calloc(1, sizeof(*ctx)); 4922 if (ctx == NULL) { 4923 cb_fn(bdev, 0, cb_arg, -ENOMEM); 4924 return; 4925 } 4926 4927 ctx->cb_fn = cb_fn; 4928 ctx->cb_arg = cb_arg; 4929 4930 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 4931 } 4932 4933 static void 4934 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 4935 { 4936 assert(desc->thread == spdk_get_thread()); 4937 4938 spdk_spin_lock(&desc->spinlock); 4939 desc->refs--; 4940 if (!desc->closed) { 4941 spdk_spin_unlock(&desc->spinlock); 4942 desc->callback.event_fn(type, 4943 desc->bdev, 4944 desc->callback.ctx); 4945 return; 4946 } else if (desc->refs == 0) { 4947 /* This descriptor was closed after this event_notify message was sent. 4948 * spdk_bdev_close() could not free the descriptor since this message was 4949 * in flight, so we free it now using bdev_desc_free(). 
4950 */ 4951 spdk_spin_unlock(&desc->spinlock); 4952 bdev_desc_free(desc); 4953 return; 4954 } 4955 spdk_spin_unlock(&desc->spinlock); 4956 } 4957 4958 static void 4959 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 4960 { 4961 spdk_spin_lock(&desc->spinlock); 4962 desc->refs++; 4963 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 4964 spdk_spin_unlock(&desc->spinlock); 4965 } 4966 4967 static void 4968 _resize_notify(void *ctx) 4969 { 4970 struct spdk_bdev_desc *desc = ctx; 4971 4972 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 4973 } 4974 4975 int 4976 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 4977 { 4978 struct spdk_bdev_desc *desc; 4979 int ret; 4980 4981 if (size == bdev->blockcnt) { 4982 return 0; 4983 } 4984 4985 spdk_spin_lock(&bdev->internal.spinlock); 4986 4987 /* bdev has open descriptors */ 4988 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 4989 bdev->blockcnt > size) { 4990 ret = -EBUSY; 4991 } else { 4992 bdev->blockcnt = size; 4993 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 4994 event_notify(desc, _resize_notify); 4995 } 4996 ret = 0; 4997 } 4998 4999 spdk_spin_unlock(&bdev->internal.spinlock); 5000 5001 return ret; 5002 } 5003 5004 /* 5005 * Convert I/O offset and length from bytes to blocks. 5006 * 5007 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 5008 */ 5009 static uint64_t 5010 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 5011 uint64_t num_bytes, uint64_t *num_blocks) 5012 { 5013 uint32_t block_size = bdev->blocklen; 5014 uint8_t shift_cnt; 5015 5016 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 5017 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 5018 shift_cnt = spdk_u32log2(block_size); 5019 *offset_blocks = offset_bytes >> shift_cnt; 5020 *num_blocks = num_bytes >> shift_cnt; 5021 return (offset_bytes - (*offset_blocks << shift_cnt)) | 5022 (num_bytes - (*num_blocks << shift_cnt)); 5023 } else { 5024 *offset_blocks = offset_bytes / block_size; 5025 *num_blocks = num_bytes / block_size; 5026 return (offset_bytes % block_size) | (num_bytes % block_size); 5027 } 5028 } 5029 5030 static bool 5031 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 5032 { 5033 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 5034 * has been an overflow and hence the offset has been wrapped around */ 5035 if (offset_blocks + num_blocks < offset_blocks) { 5036 return false; 5037 } 5038 5039 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5040 if (offset_blocks + num_blocks > bdev->blockcnt) { 5041 return false; 5042 } 5043 5044 return true; 5045 } 5046 5047 static void 5048 bdev_seek_complete_cb(void *ctx) 5049 { 5050 struct spdk_bdev_io *bdev_io = ctx; 5051 5052 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5053 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5054 } 5055 5056 static int 5057 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5058 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5059 spdk_bdev_io_completion_cb cb, void *cb_arg) 5060 { 5061 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5062 struct spdk_bdev_io *bdev_io; 5063 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5064 5065 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == 
SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5066 5067 /* Check if offset_blocks is valid looking at the validity of one block */ 5068 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5069 return -EINVAL; 5070 } 5071 5072 bdev_io = bdev_channel_get_io(channel); 5073 if (!bdev_io) { 5074 return -ENOMEM; 5075 } 5076 5077 bdev_io->internal.ch = channel; 5078 bdev_io->internal.desc = desc; 5079 bdev_io->type = io_type; 5080 bdev_io->u.bdev.offset_blocks = offset_blocks; 5081 bdev_io->u.bdev.memory_domain = NULL; 5082 bdev_io->u.bdev.memory_domain_ctx = NULL; 5083 bdev_io->u.bdev.accel_sequence = NULL; 5084 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5085 5086 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5087 /* In case bdev doesn't support seek to next data/hole offset, 5088 * it is assumed that only data and no holes are present */ 5089 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5090 bdev_io->u.bdev.seek.offset = offset_blocks; 5091 } else { 5092 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5093 } 5094 5095 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5096 return 0; 5097 } 5098 5099 bdev_io_submit(bdev_io); 5100 return 0; 5101 } 5102 5103 int 5104 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5105 uint64_t offset_blocks, 5106 spdk_bdev_io_completion_cb cb, void *cb_arg) 5107 { 5108 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5109 } 5110 5111 int 5112 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5113 uint64_t offset_blocks, 5114 spdk_bdev_io_completion_cb cb, void *cb_arg) 5115 { 5116 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5117 } 5118 5119 uint64_t 5120 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io) 5121 { 5122 return bdev_io->u.bdev.seek.offset; 5123 } 5124 5125 static int 5126 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5127 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5128 spdk_bdev_io_completion_cb cb, void *cb_arg) 5129 { 5130 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5131 struct spdk_bdev_io *bdev_io; 5132 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5133 5134 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5135 return -EINVAL; 5136 } 5137 5138 bdev_io = bdev_channel_get_io(channel); 5139 if (!bdev_io) { 5140 return -ENOMEM; 5141 } 5142 5143 bdev_io->internal.ch = channel; 5144 bdev_io->internal.desc = desc; 5145 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5146 bdev_io->u.bdev.iovs = &bdev_io->iov; 5147 bdev_io->u.bdev.iovs[0].iov_base = buf; 5148 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5149 bdev_io->u.bdev.iovcnt = 1; 5150 bdev_io->u.bdev.md_buf = md_buf; 5151 bdev_io->u.bdev.num_blocks = num_blocks; 5152 bdev_io->u.bdev.offset_blocks = offset_blocks; 5153 bdev_io->u.bdev.memory_domain = NULL; 5154 bdev_io->u.bdev.memory_domain_ctx = NULL; 5155 bdev_io->u.bdev.accel_sequence = NULL; 5156 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5157 5158 bdev_io_submit(bdev_io); 5159 return 0; 5160 } 5161 5162 int 5163 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5164 void *buf, uint64_t offset, uint64_t nbytes, 5165 spdk_bdev_io_completion_cb cb, void *cb_arg) 5166 { 5167 uint64_t offset_blocks, num_blocks; 5168 5169 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5170 nbytes, &num_blocks) != 0) { 5171 return -EINVAL; 5172 } 5173 5174 
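	/* offset and nbytes are byte-based and must be exact multiples of the
	 * block size (e.g. with a 512-byte block size, offset 4096 and nbytes
	 * 8192 map to blocks 8 and 16, while offset 4097 fails the check above
	 * with -EINVAL). Everything past this point works in blocks. */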
return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5175 } 5176 5177 int 5178 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5179 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5180 spdk_bdev_io_completion_cb cb, void *cb_arg) 5181 { 5182 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5183 } 5184 5185 int 5186 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5187 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5188 spdk_bdev_io_completion_cb cb, void *cb_arg) 5189 { 5190 struct iovec iov = { 5191 .iov_base = buf, 5192 }; 5193 5194 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5195 return -EINVAL; 5196 } 5197 5198 if (md_buf && !_is_buf_allocated(&iov)) { 5199 return -EINVAL; 5200 } 5201 5202 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5203 cb, cb_arg); 5204 } 5205 5206 int 5207 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5208 struct iovec *iov, int iovcnt, 5209 uint64_t offset, uint64_t nbytes, 5210 spdk_bdev_io_completion_cb cb, void *cb_arg) 5211 { 5212 uint64_t offset_blocks, num_blocks; 5213 5214 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5215 nbytes, &num_blocks) != 0) { 5216 return -EINVAL; 5217 } 5218 5219 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5220 } 5221 5222 static int 5223 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5224 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 5225 uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx, 5226 struct spdk_accel_sequence *seq, 5227 spdk_bdev_io_completion_cb cb, void *cb_arg) 5228 { 5229 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5230 struct spdk_bdev_io *bdev_io; 5231 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5232 5233 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5234 return -EINVAL; 5235 } 5236 5237 bdev_io = bdev_channel_get_io(channel); 5238 if (!bdev_io) { 5239 return -ENOMEM; 5240 } 5241 5242 bdev_io->internal.ch = channel; 5243 bdev_io->internal.desc = desc; 5244 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5245 bdev_io->u.bdev.iovs = iov; 5246 bdev_io->u.bdev.iovcnt = iovcnt; 5247 bdev_io->u.bdev.md_buf = md_buf; 5248 bdev_io->u.bdev.num_blocks = num_blocks; 5249 bdev_io->u.bdev.offset_blocks = offset_blocks; 5250 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5251 bdev_io->internal.memory_domain = domain; 5252 bdev_io->internal.memory_domain_ctx = domain_ctx; 5253 bdev_io->internal.accel_sequence = seq; 5254 bdev_io->u.bdev.memory_domain = domain; 5255 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5256 bdev_io->u.bdev.accel_sequence = seq; 5257 5258 _bdev_io_submit_ext(desc, bdev_io); 5259 5260 return 0; 5261 } 5262 5263 int 5264 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5265 struct iovec *iov, int iovcnt, 5266 uint64_t offset_blocks, uint64_t num_blocks, 5267 spdk_bdev_io_completion_cb cb, void *cb_arg) 5268 { 5269 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5270 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5271 } 5272 5273 int 5274 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5275 struct iovec *iov, int iovcnt, void *md_buf, 5276 uint64_t 
offset_blocks, uint64_t num_blocks, 5277 spdk_bdev_io_completion_cb cb, void *cb_arg) 5278 { 5279 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5280 return -EINVAL; 5281 } 5282 5283 if (md_buf && !_is_buf_allocated(iov)) { 5284 return -EINVAL; 5285 } 5286 5287 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5288 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5289 } 5290 5291 static inline bool 5292 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 5293 { 5294 /* 5295 * We check if opts size is at least of size when we first introduced 5296 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 5297 * are not checked internal. 5298 */ 5299 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5300 sizeof(opts->metadata) && 5301 opts->size <= sizeof(*opts) && 5302 /* When memory domain is used, the user must provide data buffers */ 5303 (!opts->memory_domain || (iov && iov[0].iov_base)); 5304 } 5305 5306 int 5307 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5308 struct iovec *iov, int iovcnt, 5309 uint64_t offset_blocks, uint64_t num_blocks, 5310 spdk_bdev_io_completion_cb cb, void *cb_arg, 5311 struct spdk_bdev_ext_io_opts *opts) 5312 { 5313 void *md = NULL; 5314 5315 if (opts) { 5316 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5317 return -EINVAL; 5318 } 5319 md = opts->metadata; 5320 } 5321 5322 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5323 return -EINVAL; 5324 } 5325 5326 if (md && !_is_buf_allocated(iov)) { 5327 return -EINVAL; 5328 } 5329 5330 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5331 num_blocks, 5332 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5333 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5334 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5335 cb, cb_arg); 5336 } 5337 5338 static int 5339 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5340 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5341 spdk_bdev_io_completion_cb cb, void *cb_arg) 5342 { 5343 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5344 struct spdk_bdev_io *bdev_io; 5345 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5346 5347 if (!desc->write) { 5348 return -EBADF; 5349 } 5350 5351 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5352 return -EINVAL; 5353 } 5354 5355 bdev_io = bdev_channel_get_io(channel); 5356 if (!bdev_io) { 5357 return -ENOMEM; 5358 } 5359 5360 bdev_io->internal.ch = channel; 5361 bdev_io->internal.desc = desc; 5362 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5363 bdev_io->u.bdev.iovs = &bdev_io->iov; 5364 bdev_io->u.bdev.iovs[0].iov_base = buf; 5365 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5366 bdev_io->u.bdev.iovcnt = 1; 5367 bdev_io->u.bdev.md_buf = md_buf; 5368 bdev_io->u.bdev.num_blocks = num_blocks; 5369 bdev_io->u.bdev.offset_blocks = offset_blocks; 5370 bdev_io->u.bdev.memory_domain = NULL; 5371 bdev_io->u.bdev.memory_domain_ctx = NULL; 5372 bdev_io->u.bdev.accel_sequence = NULL; 5373 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5374 5375 bdev_io_submit(bdev_io); 5376 return 0; 5377 } 5378 5379 int 5380 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5381 void *buf, uint64_t offset, uint64_t nbytes, 5382 spdk_bdev_io_completion_cb cb, void *cb_arg) 5383 { 5384 uint64_t offset_blocks, num_blocks; 5385 5386 if 
(bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5387 nbytes, &num_blocks) != 0) { 5388 return -EINVAL; 5389 } 5390 5391 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5392 } 5393 5394 int 5395 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5396 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5397 spdk_bdev_io_completion_cb cb, void *cb_arg) 5398 { 5399 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5400 cb, cb_arg); 5401 } 5402 5403 int 5404 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5405 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5406 spdk_bdev_io_completion_cb cb, void *cb_arg) 5407 { 5408 struct iovec iov = { 5409 .iov_base = buf, 5410 }; 5411 5412 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5413 return -EINVAL; 5414 } 5415 5416 if (md_buf && !_is_buf_allocated(&iov)) { 5417 return -EINVAL; 5418 } 5419 5420 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5421 cb, cb_arg); 5422 } 5423 5424 static int 5425 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5426 struct iovec *iov, int iovcnt, void *md_buf, 5427 uint64_t offset_blocks, uint64_t num_blocks, 5428 struct spdk_memory_domain *domain, void *domain_ctx, 5429 struct spdk_accel_sequence *seq, 5430 spdk_bdev_io_completion_cb cb, void *cb_arg) 5431 { 5432 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5433 struct spdk_bdev_io *bdev_io; 5434 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5435 5436 if (!desc->write) { 5437 return -EBADF; 5438 } 5439 5440 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5441 return -EINVAL; 5442 } 5443 5444 bdev_io = bdev_channel_get_io(channel); 5445 if (!bdev_io) { 5446 return -ENOMEM; 5447 } 5448 5449 bdev_io->internal.ch = channel; 5450 bdev_io->internal.desc = desc; 5451 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5452 bdev_io->u.bdev.iovs = iov; 5453 bdev_io->u.bdev.iovcnt = iovcnt; 5454 bdev_io->u.bdev.md_buf = md_buf; 5455 bdev_io->u.bdev.num_blocks = num_blocks; 5456 bdev_io->u.bdev.offset_blocks = offset_blocks; 5457 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5458 bdev_io->internal.memory_domain = domain; 5459 bdev_io->internal.memory_domain_ctx = domain_ctx; 5460 bdev_io->internal.accel_sequence = seq; 5461 bdev_io->u.bdev.memory_domain = domain; 5462 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5463 bdev_io->u.bdev.accel_sequence = seq; 5464 5465 _bdev_io_submit_ext(desc, bdev_io); 5466 5467 return 0; 5468 } 5469 5470 int 5471 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5472 struct iovec *iov, int iovcnt, 5473 uint64_t offset, uint64_t len, 5474 spdk_bdev_io_completion_cb cb, void *cb_arg) 5475 { 5476 uint64_t offset_blocks, num_blocks; 5477 5478 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5479 len, &num_blocks) != 0) { 5480 return -EINVAL; 5481 } 5482 5483 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5484 } 5485 5486 int 5487 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5488 struct iovec *iov, int iovcnt, 5489 uint64_t offset_blocks, uint64_t num_blocks, 5490 spdk_bdev_io_completion_cb cb, void *cb_arg) 5491 { 5492 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5493 
num_blocks, NULL, NULL, NULL, cb, cb_arg); 5494 } 5495 5496 int 5497 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5498 struct iovec *iov, int iovcnt, void *md_buf, 5499 uint64_t offset_blocks, uint64_t num_blocks, 5500 spdk_bdev_io_completion_cb cb, void *cb_arg) 5501 { 5502 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5503 return -EINVAL; 5504 } 5505 5506 if (md_buf && !_is_buf_allocated(iov)) { 5507 return -EINVAL; 5508 } 5509 5510 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5511 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5512 } 5513 5514 int 5515 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5516 struct iovec *iov, int iovcnt, 5517 uint64_t offset_blocks, uint64_t num_blocks, 5518 spdk_bdev_io_completion_cb cb, void *cb_arg, 5519 struct spdk_bdev_ext_io_opts *opts) 5520 { 5521 void *md = NULL; 5522 5523 if (opts) { 5524 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5525 return -EINVAL; 5526 } 5527 md = opts->metadata; 5528 } 5529 5530 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5531 return -EINVAL; 5532 } 5533 5534 if (md && !_is_buf_allocated(iov)) { 5535 return -EINVAL; 5536 } 5537 5538 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5539 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5540 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5541 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5542 cb, cb_arg); 5543 } 5544 5545 static void 5546 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5547 { 5548 struct spdk_bdev_io *parent_io = cb_arg; 5549 struct spdk_bdev *bdev = parent_io->bdev; 5550 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5551 int i, rc = 0; 5552 5553 if (!success) { 5554 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5555 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5556 spdk_bdev_free_io(bdev_io); 5557 return; 5558 } 5559 5560 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5561 rc = memcmp(read_buf, 5562 parent_io->u.bdev.iovs[i].iov_base, 5563 parent_io->u.bdev.iovs[i].iov_len); 5564 if (rc) { 5565 break; 5566 } 5567 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5568 } 5569 5570 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 5571 rc = memcmp(bdev_io->u.bdev.md_buf, 5572 parent_io->u.bdev.md_buf, 5573 spdk_bdev_get_md_size(bdev)); 5574 } 5575 5576 spdk_bdev_free_io(bdev_io); 5577 5578 if (rc == 0) { 5579 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5580 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5581 } else { 5582 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5583 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5584 } 5585 } 5586 5587 static void 5588 bdev_compare_do_read(void *_bdev_io) 5589 { 5590 struct spdk_bdev_io *bdev_io = _bdev_io; 5591 int rc; 5592 5593 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5594 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5595 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5596 bdev_compare_do_read_done, bdev_io); 5597 5598 if (rc == -ENOMEM) { 5599 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5600 } else if (rc != 0) { 5601 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5602 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5603 } 5604 } 5605 5606 
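/*
 * Caller-side sketch (illustrative only; desc, ch, iov, iovcnt, offset_blocks
 * and num_blocks are assumed to already exist) of how the emulated compare
 * above looks from the public API: when the backing bdev lacks native COMPARE
 * support, the on-disk data is read back and memcmp()'d against the caller's
 * buffers, and a mismatch completes the I/O unsuccessfully with
 * SPDK_BDEV_IO_STATUS_MISCOMPARE as its internal status.
 *
 *	static void
 *	compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		// success is false for both I/O errors and data mismatches.
 *		spdk_bdev_free_io(bdev_io);
 *	}
 *
 *	int rc = spdk_bdev_comparev_blocks(desc, ch, iov, iovcnt,
 *					   offset_blocks, num_blocks,
 *					   compare_done, NULL);
 *	if (rc == -ENOMEM) {
 *		// No spdk_bdev_io was available; retry later, e.g. via
 *		// spdk_bdev_queue_io_wait().
 *	}
 */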
static int 5607 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5608 struct iovec *iov, int iovcnt, void *md_buf, 5609 uint64_t offset_blocks, uint64_t num_blocks, 5610 spdk_bdev_io_completion_cb cb, void *cb_arg) 5611 { 5612 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5613 struct spdk_bdev_io *bdev_io; 5614 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5615 5616 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5617 return -EINVAL; 5618 } 5619 5620 bdev_io = bdev_channel_get_io(channel); 5621 if (!bdev_io) { 5622 return -ENOMEM; 5623 } 5624 5625 bdev_io->internal.ch = channel; 5626 bdev_io->internal.desc = desc; 5627 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5628 bdev_io->u.bdev.iovs = iov; 5629 bdev_io->u.bdev.iovcnt = iovcnt; 5630 bdev_io->u.bdev.md_buf = md_buf; 5631 bdev_io->u.bdev.num_blocks = num_blocks; 5632 bdev_io->u.bdev.offset_blocks = offset_blocks; 5633 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5634 bdev_io->u.bdev.memory_domain = NULL; 5635 bdev_io->u.bdev.memory_domain_ctx = NULL; 5636 bdev_io->u.bdev.accel_sequence = NULL; 5637 5638 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5639 bdev_io_submit(bdev_io); 5640 return 0; 5641 } 5642 5643 bdev_compare_do_read(bdev_io); 5644 5645 return 0; 5646 } 5647 5648 int 5649 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5650 struct iovec *iov, int iovcnt, 5651 uint64_t offset_blocks, uint64_t num_blocks, 5652 spdk_bdev_io_completion_cb cb, void *cb_arg) 5653 { 5654 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5655 num_blocks, cb, cb_arg); 5656 } 5657 5658 int 5659 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5660 struct iovec *iov, int iovcnt, void *md_buf, 5661 uint64_t offset_blocks, uint64_t num_blocks, 5662 spdk_bdev_io_completion_cb cb, void *cb_arg) 5663 { 5664 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5665 return -EINVAL; 5666 } 5667 5668 if (md_buf && !_is_buf_allocated(iov)) { 5669 return -EINVAL; 5670 } 5671 5672 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5673 num_blocks, cb, cb_arg); 5674 } 5675 5676 static int 5677 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5678 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5679 spdk_bdev_io_completion_cb cb, void *cb_arg) 5680 { 5681 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5682 struct spdk_bdev_io *bdev_io; 5683 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5684 5685 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5686 return -EINVAL; 5687 } 5688 5689 bdev_io = bdev_channel_get_io(channel); 5690 if (!bdev_io) { 5691 return -ENOMEM; 5692 } 5693 5694 bdev_io->internal.ch = channel; 5695 bdev_io->internal.desc = desc; 5696 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5697 bdev_io->u.bdev.iovs = &bdev_io->iov; 5698 bdev_io->u.bdev.iovs[0].iov_base = buf; 5699 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5700 bdev_io->u.bdev.iovcnt = 1; 5701 bdev_io->u.bdev.md_buf = md_buf; 5702 bdev_io->u.bdev.num_blocks = num_blocks; 5703 bdev_io->u.bdev.offset_blocks = offset_blocks; 5704 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5705 bdev_io->u.bdev.memory_domain = NULL; 5706 bdev_io->u.bdev.memory_domain_ctx = NULL; 5707 bdev_io->u.bdev.accel_sequence = NULL; 5708 5709 if (bdev_io_type_supported(bdev, 
SPDK_BDEV_IO_TYPE_COMPARE)) { 5710 bdev_io_submit(bdev_io); 5711 return 0; 5712 } 5713 5714 bdev_compare_do_read(bdev_io); 5715 5716 return 0; 5717 } 5718 5719 int 5720 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5721 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5722 spdk_bdev_io_completion_cb cb, void *cb_arg) 5723 { 5724 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5725 cb, cb_arg); 5726 } 5727 5728 int 5729 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5730 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5731 spdk_bdev_io_completion_cb cb, void *cb_arg) 5732 { 5733 struct iovec iov = { 5734 .iov_base = buf, 5735 }; 5736 5737 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5738 return -EINVAL; 5739 } 5740 5741 if (md_buf && !_is_buf_allocated(&iov)) { 5742 return -EINVAL; 5743 } 5744 5745 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5746 cb, cb_arg); 5747 } 5748 5749 static void 5750 bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) 5751 { 5752 struct spdk_bdev_io *bdev_io = ctx; 5753 5754 if (unlock_status) { 5755 SPDK_ERRLOG("LBA range unlock failed\n"); 5756 } 5757 5758 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 5759 false, bdev_io->internal.caller_ctx); 5760 } 5761 5762 static void 5763 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 5764 { 5765 bdev_io->internal.status = status; 5766 5767 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 5768 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5769 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 5770 } 5771 5772 static void 5773 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5774 { 5775 struct spdk_bdev_io *parent_io = cb_arg; 5776 5777 if (!success) { 5778 SPDK_ERRLOG("Compare and write operation failed\n"); 5779 } 5780 5781 spdk_bdev_free_io(bdev_io); 5782 5783 bdev_comparev_and_writev_blocks_unlock(parent_io, 5784 success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 5785 } 5786 5787 static void 5788 bdev_compare_and_write_do_write(void *_bdev_io) 5789 { 5790 struct spdk_bdev_io *bdev_io = _bdev_io; 5791 int rc; 5792 5793 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 5794 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5795 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 5796 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5797 bdev_compare_and_write_do_write_done, bdev_io); 5798 5799 5800 if (rc == -ENOMEM) { 5801 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 5802 } else if (rc != 0) { 5803 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5804 } 5805 } 5806 5807 static void 5808 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5809 { 5810 struct spdk_bdev_io *parent_io = cb_arg; 5811 5812 spdk_bdev_free_io(bdev_io); 5813 5814 if (!success) { 5815 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 5816 return; 5817 } 5818 5819 bdev_compare_and_write_do_write(parent_io); 5820 } 5821 5822 static void 5823 bdev_compare_and_write_do_compare(void *_bdev_io) 5824 { 5825 struct spdk_bdev_io *bdev_io = _bdev_io; 5826 int rc; 5827 5828 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 5829 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 5830 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5831 bdev_compare_and_write_do_compare_done, bdev_io); 5832 5833 if (rc == -ENOMEM) { 5834 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 5835 } else if (rc != 0) { 5836 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 5837 } 5838 } 5839 5840 static void 5841 bdev_comparev_and_writev_blocks_locked(void *ctx, int status) 5842 { 5843 struct spdk_bdev_io *bdev_io = ctx; 5844 5845 if (status) { 5846 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 5847 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5848 return; 5849 } 5850 5851 bdev_compare_and_write_do_compare(bdev_io); 5852 } 5853 5854 int 5855 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5856 struct iovec *compare_iov, int compare_iovcnt, 5857 struct iovec *write_iov, int write_iovcnt, 5858 uint64_t offset_blocks, uint64_t num_blocks, 5859 spdk_bdev_io_completion_cb cb, void *cb_arg) 5860 { 5861 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5862 struct spdk_bdev_io *bdev_io; 5863 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5864 5865 if (!desc->write) { 5866 return -EBADF; 5867 } 5868 5869 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5870 return -EINVAL; 5871 } 5872 5873 if (num_blocks > bdev->acwu) { 5874 return -EINVAL; 5875 } 5876 5877 bdev_io = bdev_channel_get_io(channel); 5878 if (!bdev_io) { 5879 return -ENOMEM; 5880 } 5881 5882 bdev_io->internal.ch = channel; 5883 bdev_io->internal.desc = desc; 5884 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 5885 bdev_io->u.bdev.iovs = compare_iov; 5886 bdev_io->u.bdev.iovcnt = compare_iovcnt; 5887 bdev_io->u.bdev.fused_iovs = write_iov; 5888 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 5889 bdev_io->u.bdev.md_buf = NULL; 5890 bdev_io->u.bdev.num_blocks = num_blocks; 5891 bdev_io->u.bdev.offset_blocks = offset_blocks; 5892 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5893 bdev_io->u.bdev.memory_domain = NULL; 5894 
bdev_io->u.bdev.memory_domain_ctx = NULL; 5895 bdev_io->u.bdev.accel_sequence = NULL; 5896 5897 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 5898 bdev_io_submit(bdev_io); 5899 return 0; 5900 } 5901 5902 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 5903 bdev_comparev_and_writev_blocks_locked, bdev_io); 5904 } 5905 5906 int 5907 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5908 struct iovec *iov, int iovcnt, 5909 uint64_t offset_blocks, uint64_t num_blocks, 5910 bool populate, 5911 spdk_bdev_io_completion_cb cb, void *cb_arg) 5912 { 5913 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5914 struct spdk_bdev_io *bdev_io; 5915 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5916 5917 if (!desc->write) { 5918 return -EBADF; 5919 } 5920 5921 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5922 return -EINVAL; 5923 } 5924 5925 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 5926 return -ENOTSUP; 5927 } 5928 5929 bdev_io = bdev_channel_get_io(channel); 5930 if (!bdev_io) { 5931 return -ENOMEM; 5932 } 5933 5934 bdev_io->internal.ch = channel; 5935 bdev_io->internal.desc = desc; 5936 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 5937 bdev_io->u.bdev.num_blocks = num_blocks; 5938 bdev_io->u.bdev.offset_blocks = offset_blocks; 5939 bdev_io->u.bdev.iovs = iov; 5940 bdev_io->u.bdev.iovcnt = iovcnt; 5941 bdev_io->u.bdev.md_buf = NULL; 5942 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 5943 bdev_io->u.bdev.zcopy.commit = 0; 5944 bdev_io->u.bdev.zcopy.start = 1; 5945 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5946 bdev_io->u.bdev.memory_domain = NULL; 5947 bdev_io->u.bdev.memory_domain_ctx = NULL; 5948 bdev_io->u.bdev.accel_sequence = NULL; 5949 5950 bdev_io_submit(bdev_io); 5951 5952 return 0; 5953 } 5954 5955 int 5956 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 5957 spdk_bdev_io_completion_cb cb, void *cb_arg) 5958 { 5959 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 5960 return -EINVAL; 5961 } 5962 5963 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 5964 bdev_io->u.bdev.zcopy.start = 0; 5965 bdev_io->internal.caller_ctx = cb_arg; 5966 bdev_io->internal.cb = cb; 5967 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 5968 5969 bdev_io_submit(bdev_io); 5970 5971 return 0; 5972 } 5973 5974 int
5975 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5976 uint64_t offset, uint64_t len, 5977 spdk_bdev_io_completion_cb cb, void *cb_arg) 5978 { 5979 uint64_t offset_blocks, num_blocks; 5980 5981 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5982 len, &num_blocks) != 0) { 5983 return -EINVAL; 5984 } 5985 5986 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5987 } 5988 5989 int
5990 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5991 uint64_t offset_blocks, uint64_t num_blocks, 5992 spdk_bdev_io_completion_cb cb, void *cb_arg) 5993 { 5994 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5995 struct spdk_bdev_io *bdev_io; 5996 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5997 5998 if (!desc->write) { 5999 return -EBADF; 6000 } 6001 6002 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6003 return -EINVAL; 6004 } 6005 6006 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 6007 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 6008 return -ENOTSUP; 6009 } 6010 6011 bdev_io = bdev_channel_get_io(channel); 6012 6013 if (!bdev_io) { 6014 return -ENOMEM; 6015 } 6016 6017 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 6018 bdev_io->internal.ch = channel; 6019 bdev_io->internal.desc = desc; 6020 bdev_io->u.bdev.offset_blocks = offset_blocks; 6021 bdev_io->u.bdev.num_blocks = num_blocks; 6022 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6023 bdev_io->u.bdev.memory_domain = NULL; 6024 bdev_io->u.bdev.memory_domain_ctx = NULL; 6025 bdev_io->u.bdev.accel_sequence = NULL; 6026 6027 /* If the write_zeroes size is large and should be split, use the generic split 6028 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported or not. 6029 * 6030 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported 6031 * or emulate it using a regular write request otherwise.
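 *
 * As a rough illustration (a sketch of the flow, not an exhaustive description):
 * on a bdev without native WRITE_ZEROES support, a request larger than
 * bdev->max_write_zeroes is first handled by the generic split logic, and each
 * resulting child is then emulated below via bdev_write_zero_buffer(), which
 * issues regular writes from a preallocated zero buffer of at most
 * ZERO_BUFFER_SIZE bytes.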
6032 */ 6033 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 6034 bdev_io->internal.split) { 6035 bdev_io_submit(bdev_io); 6036 return 0; 6037 } 6038 6039 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6040 6041 return bdev_write_zero_buffer(bdev_io); 6042 } 6043 6044 int 6045 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6046 uint64_t offset, uint64_t nbytes, 6047 spdk_bdev_io_completion_cb cb, void *cb_arg) 6048 { 6049 uint64_t offset_blocks, num_blocks; 6050 6051 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6052 nbytes, &num_blocks) != 0) { 6053 return -EINVAL; 6054 } 6055 6056 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6057 } 6058 6059 int 6060 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6061 uint64_t offset_blocks, uint64_t num_blocks, 6062 spdk_bdev_io_completion_cb cb, void *cb_arg) 6063 { 6064 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6065 struct spdk_bdev_io *bdev_io; 6066 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6067 6068 if (!desc->write) { 6069 return -EBADF; 6070 } 6071 6072 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6073 return -EINVAL; 6074 } 6075 6076 if (num_blocks == 0) { 6077 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 6078 return -EINVAL; 6079 } 6080 6081 bdev_io = bdev_channel_get_io(channel); 6082 if (!bdev_io) { 6083 return -ENOMEM; 6084 } 6085 6086 bdev_io->internal.ch = channel; 6087 bdev_io->internal.desc = desc; 6088 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6089 6090 bdev_io->u.bdev.iovs = &bdev_io->iov; 6091 bdev_io->u.bdev.iovs[0].iov_base = NULL; 6092 bdev_io->u.bdev.iovs[0].iov_len = 0; 6093 bdev_io->u.bdev.iovcnt = 1; 6094 6095 bdev_io->u.bdev.offset_blocks = offset_blocks; 6096 bdev_io->u.bdev.num_blocks = num_blocks; 6097 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6098 bdev_io->u.bdev.memory_domain = NULL; 6099 bdev_io->u.bdev.memory_domain_ctx = NULL; 6100 bdev_io->u.bdev.accel_sequence = NULL; 6101 6102 bdev_io_submit(bdev_io); 6103 return 0; 6104 } 6105 6106 int 6107 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6108 uint64_t offset, uint64_t length, 6109 spdk_bdev_io_completion_cb cb, void *cb_arg) 6110 { 6111 uint64_t offset_blocks, num_blocks; 6112 6113 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6114 length, &num_blocks) != 0) { 6115 return -EINVAL; 6116 } 6117 6118 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6119 } 6120 6121 int 6122 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6123 uint64_t offset_blocks, uint64_t num_blocks, 6124 spdk_bdev_io_completion_cb cb, void *cb_arg) 6125 { 6126 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6127 struct spdk_bdev_io *bdev_io; 6128 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6129 6130 if (!desc->write) { 6131 return -EBADF; 6132 } 6133 6134 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6135 return -EINVAL; 6136 } 6137 6138 bdev_io = bdev_channel_get_io(channel); 6139 if (!bdev_io) { 6140 return -ENOMEM; 6141 } 6142 6143 bdev_io->internal.ch = channel; 6144 bdev_io->internal.desc = desc; 6145 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 6146 bdev_io->u.bdev.iovs = NULL; 6147 bdev_io->u.bdev.iovcnt = 0; 6148 bdev_io->u.bdev.offset_blocks = offset_blocks; 6149 bdev_io->u.bdev.num_blocks = num_blocks; 6150 
bdev_io->u.bdev.memory_domain = NULL; 6151 bdev_io->u.bdev.memory_domain_ctx = NULL; 6152 bdev_io->u.bdev.accel_sequence = NULL; 6153 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6154 6155 bdev_io_submit(bdev_io); 6156 return 0; 6157 } 6158 6159 static int bdev_reset_poll_for_outstanding_io(void *ctx); 6160 6161 static void
6162 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 6163 { 6164 struct spdk_bdev_channel *ch = _ctx; 6165 struct spdk_bdev_io *bdev_io; 6166 6167 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6168 6169 if (status == -EBUSY) { 6170 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 6171 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 6172 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 6173 } else { 6174 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6175 6176 if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) { 6177 /* If outstanding IOs are still present and reset_io_drain_timeout 6178 * seconds have passed, start the reset. */ 6179 bdev_io_submit_reset(bdev_io); 6180 } else { 6181 /* We still have an in-progress memory domain pull/push or we're 6182 * executing an accel sequence. Since we cannot abort either of those 6183 * operations, fail the reset request. */ 6184 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6185 } 6186 } 6187 } else { 6188 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6189 SPDK_DEBUGLOG(bdev, 6190 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 6191 ch->bdev->name); 6192 /* Mark the completion status as a SUCCESS and complete the reset. */ 6193 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 6194 } 6195 } 6196 6197 static void
6198 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6199 struct spdk_io_channel *io_ch, void *_ctx) 6200 { 6201 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 6202 int status = 0; 6203 6204 if (cur_ch->io_outstanding > 0 || 6205 !TAILQ_EMPTY(&cur_ch->io_memory_domain) || 6206 !TAILQ_EMPTY(&cur_ch->io_accel_exec)) { 6207 /* If a channel has outstanding IO, set the status to -EBUSY. This will stop 6208 * further iteration over the rest of the channels and pass non-zero status 6209 * to the callback function.
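 *
 * The aggregated status is consumed by bdev_reset_check_outstanding_io_done()
 * above: on -EBUSY it either re-arms the poller or, once the drain timeout has
 * expired, submits or fails the reset; on 0 it completes the reset right away.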
*/ 6210 status = -EBUSY; 6211 } 6212 spdk_bdev_for_each_channel_continue(i, status); 6213 } 6214 6215 static int
6216 bdev_reset_poll_for_outstanding_io(void *ctx) 6217 { 6218 struct spdk_bdev_channel *ch = ctx; 6219 struct spdk_bdev_io *bdev_io; 6220 6221 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6222 6223 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6224 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6225 bdev_reset_check_outstanding_io_done); 6226 6227 return SPDK_POLLER_BUSY; 6228 } 6229 6230 static void
6231 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6232 { 6233 struct spdk_bdev_channel *ch = _ctx; 6234 struct spdk_bdev_io *bdev_io; 6235 6236 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6237 6238 if (bdev->reset_io_drain_timeout == 0) { 6239 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6240 6241 bdev_io_submit_reset(bdev_io); 6242 return; 6243 } 6244 6245 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6246 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6247 6248 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6249 * submit the reset to the underlying module only if outstanding I/O 6250 * remain after reset_io_drain_timeout seconds have passed. */ 6251 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6252 bdev_reset_check_outstanding_io_done); 6253 } 6254 6255 static void
6256 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6257 struct spdk_io_channel *ch, void *_ctx) 6258 { 6259 struct spdk_bdev_channel *channel; 6260 struct spdk_bdev_mgmt_channel *mgmt_channel; 6261 struct spdk_bdev_shared_resource *shared_resource; 6262 bdev_io_tailq_t tmp_queued; 6263 6264 TAILQ_INIT(&tmp_queued); 6265 6266 channel = __io_ch_to_bdev_ch(ch); 6267 shared_resource = channel->shared_resource; 6268 mgmt_channel = shared_resource->mgmt_ch; 6269 6270 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6271 6272 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6273 /* The QoS object is always valid and readable while 6274 * the channel flag is set, so the lock here should not 6275 * be necessary. We're not in the fast path though, so 6276 * just take it anyway. */ 6277 spdk_spin_lock(&channel->bdev->internal.spinlock); 6278 if (channel->bdev->internal.qos->ch == channel) { 6279 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 6280 } 6281 spdk_spin_unlock(&channel->bdev->internal.spinlock); 6282 } 6283 6284 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6285 bdev_abort_all_buf_io(mgmt_channel, channel); 6286 6287 bdev_abort_all_queued_io(&tmp_queued, channel); 6288 6289 spdk_bdev_for_each_channel_continue(i, 0); 6290 } 6291 6292 static void
6293 bdev_start_reset(void *ctx) 6294 { 6295 struct spdk_bdev_channel *ch = ctx; 6296 6297 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, 6298 bdev_reset_freeze_channel_done); 6299 } 6300 6301 static void
6302 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 6303 { 6304 struct spdk_bdev *bdev = ch->bdev; 6305 6306 assert(!TAILQ_EMPTY(&ch->queued_resets)); 6307 6308 spdk_spin_lock(&bdev->internal.spinlock); 6309 if (bdev->internal.reset_in_progress == NULL) { 6310 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 6311 /* 6312 * Take a channel reference for the target bdev for the life of this 6313 * reset. 
This guards against the channel getting destroyed while 6314 * spdk_bdev_for_each_channel() calls related to this reset IO are in 6315 * progress. We will release the reference when this reset is 6316 * completed. 6317 */ 6318 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6319 bdev_start_reset(ch); 6320 } 6321 spdk_spin_unlock(&bdev->internal.spinlock); 6322 } 6323 6324 int 6325 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6326 spdk_bdev_io_completion_cb cb, void *cb_arg) 6327 { 6328 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6329 struct spdk_bdev_io *bdev_io; 6330 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6331 6332 bdev_io = bdev_channel_get_io(channel); 6333 if (!bdev_io) { 6334 return -ENOMEM; 6335 } 6336 6337 bdev_io->internal.ch = channel; 6338 bdev_io->internal.desc = desc; 6339 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6340 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6341 bdev_io->u.reset.ch_ref = NULL; 6342 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6343 6344 spdk_spin_lock(&bdev->internal.spinlock); 6345 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 6346 spdk_spin_unlock(&bdev->internal.spinlock); 6347 6348 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 6349 internal.ch_link); 6350 6351 bdev_channel_start_reset(channel); 6352 6353 return 0; 6354 } 6355 6356 void 6357 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6358 struct spdk_bdev_io_stat *stat) 6359 { 6360 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6361 6362 bdev_get_io_stat(stat, channel->stat); 6363 } 6364 6365 static void 6366 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6367 { 6368 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6369 6370 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6371 bdev_iostat_ctx->cb_arg, 0); 6372 free(bdev_iostat_ctx); 6373 } 6374 6375 static void 6376 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6377 struct spdk_io_channel *ch, void *_ctx) 6378 { 6379 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6380 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6381 6382 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6383 spdk_bdev_for_each_channel_continue(i, 0); 6384 } 6385 6386 void 6387 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6388 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6389 { 6390 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6391 6392 assert(bdev != NULL); 6393 assert(stat != NULL); 6394 assert(cb != NULL); 6395 6396 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6397 if (bdev_iostat_ctx == NULL) { 6398 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6399 cb(bdev, stat, cb_arg, -ENOMEM); 6400 return; 6401 } 6402 6403 bdev_iostat_ctx->stat = stat; 6404 bdev_iostat_ctx->cb = cb; 6405 bdev_iostat_ctx->cb_arg = cb_arg; 6406 6407 /* Start with the statistics from previously deleted channels. */ 6408 spdk_spin_lock(&bdev->internal.spinlock); 6409 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6410 spdk_spin_unlock(&bdev->internal.spinlock); 6411 6412 /* Then iterate and add the statistics from each existing channel. 
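 * Each per-channel addition runs in the context of that channel via
 * spdk_bdev_for_each_channel(), and bdev_get_device_stat_done() finally reports
 * the aggregate through the user callback and frees the iteration context.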
*/ 6413 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6414 bdev_get_device_stat_done); 6415 } 6416 6417 struct bdev_iostat_reset_ctx { 6418 enum spdk_bdev_reset_stat_mode mode; 6419 bdev_reset_device_stat_cb cb; 6420 void *cb_arg; 6421 }; 6422 6423 static void 6424 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6425 { 6426 struct bdev_iostat_reset_ctx *ctx = _ctx; 6427 6428 ctx->cb(bdev, ctx->cb_arg, 0); 6429 6430 free(ctx); 6431 } 6432 6433 static void 6434 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6435 struct spdk_io_channel *ch, void *_ctx) 6436 { 6437 struct bdev_iostat_reset_ctx *ctx = _ctx; 6438 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6439 6440 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6441 6442 spdk_bdev_for_each_channel_continue(i, 0); 6443 } 6444 6445 void 6446 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6447 bdev_reset_device_stat_cb cb, void *cb_arg) 6448 { 6449 struct bdev_iostat_reset_ctx *ctx; 6450 6451 assert(bdev != NULL); 6452 assert(cb != NULL); 6453 6454 ctx = calloc(1, sizeof(*ctx)); 6455 if (ctx == NULL) { 6456 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6457 cb(bdev, cb_arg, -ENOMEM); 6458 return; 6459 } 6460 6461 ctx->mode = mode; 6462 ctx->cb = cb; 6463 ctx->cb_arg = cb_arg; 6464 6465 spdk_spin_lock(&bdev->internal.spinlock); 6466 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6467 spdk_spin_unlock(&bdev->internal.spinlock); 6468 6469 spdk_bdev_for_each_channel(bdev, 6470 bdev_reset_each_channel_stat, 6471 ctx, 6472 bdev_reset_device_stat_done); 6473 } 6474 6475 int 6476 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6477 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6478 spdk_bdev_io_completion_cb cb, void *cb_arg) 6479 { 6480 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6481 struct spdk_bdev_io *bdev_io; 6482 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6483 6484 if (!desc->write) { 6485 return -EBADF; 6486 } 6487 6488 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6489 return -ENOTSUP; 6490 } 6491 6492 bdev_io = bdev_channel_get_io(channel); 6493 if (!bdev_io) { 6494 return -ENOMEM; 6495 } 6496 6497 bdev_io->internal.ch = channel; 6498 bdev_io->internal.desc = desc; 6499 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6500 bdev_io->u.nvme_passthru.cmd = *cmd; 6501 bdev_io->u.nvme_passthru.buf = buf; 6502 bdev_io->u.nvme_passthru.nbytes = nbytes; 6503 bdev_io->u.nvme_passthru.md_buf = NULL; 6504 bdev_io->u.nvme_passthru.md_len = 0; 6505 6506 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6507 6508 bdev_io_submit(bdev_io); 6509 return 0; 6510 } 6511 6512 int 6513 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6514 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6515 spdk_bdev_io_completion_cb cb, void *cb_arg) 6516 { 6517 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6518 struct spdk_bdev_io *bdev_io; 6519 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6520 6521 if (!desc->write) { 6522 /* 6523 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6524 * to easily determine if the command is a read or write, but for now just 6525 * do not allow io_passthru with a read-only descriptor. 
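 * (In other words, callers are expected to have opened the bdev with write == true,
 * e.g. via spdk_bdev_open_ext(name, true, event_cb, NULL, &desc), before issuing
 * NVMe passthru commands; the admin and metadata passthru variants in this file
 * apply the same restriction.)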
6526 */ 6527 return -EBADF; 6528 } 6529 6530 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6531 return -ENOTSUP; 6532 } 6533 6534 bdev_io = bdev_channel_get_io(channel); 6535 if (!bdev_io) { 6536 return -ENOMEM; 6537 } 6538 6539 bdev_io->internal.ch = channel; 6540 bdev_io->internal.desc = desc; 6541 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6542 bdev_io->u.nvme_passthru.cmd = *cmd; 6543 bdev_io->u.nvme_passthru.buf = buf; 6544 bdev_io->u.nvme_passthru.nbytes = nbytes; 6545 bdev_io->u.nvme_passthru.md_buf = NULL; 6546 bdev_io->u.nvme_passthru.md_len = 0; 6547 6548 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6549 6550 bdev_io_submit(bdev_io); 6551 return 0; 6552 } 6553 6554 int 6555 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6556 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6557 spdk_bdev_io_completion_cb cb, void *cb_arg) 6558 { 6559 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6560 struct spdk_bdev_io *bdev_io; 6561 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6562 6563 if (!desc->write) { 6564 /* 6565 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6566 * to easily determine if the command is a read or write, but for now just 6567 * do not allow io_passthru with a read-only descriptor. 6568 */ 6569 return -EBADF; 6570 } 6571 6572 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6573 return -ENOTSUP; 6574 } 6575 6576 bdev_io = bdev_channel_get_io(channel); 6577 if (!bdev_io) { 6578 return -ENOMEM; 6579 } 6580 6581 bdev_io->internal.ch = channel; 6582 bdev_io->internal.desc = desc; 6583 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6584 bdev_io->u.nvme_passthru.cmd = *cmd; 6585 bdev_io->u.nvme_passthru.buf = buf; 6586 bdev_io->u.nvme_passthru.nbytes = nbytes; 6587 bdev_io->u.nvme_passthru.md_buf = md_buf; 6588 bdev_io->u.nvme_passthru.md_len = md_len; 6589 6590 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6591 6592 bdev_io_submit(bdev_io); 6593 return 0; 6594 } 6595 6596 static void bdev_abort_retry(void *ctx); 6597 static void bdev_abort(struct spdk_bdev_io *parent_io); 6598 6599 static void 6600 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6601 { 6602 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6603 struct spdk_bdev_io *parent_io = cb_arg; 6604 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6605 6606 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6607 6608 spdk_bdev_free_io(bdev_io); 6609 6610 if (!success) { 6611 /* Check if the target I/O completed in the meantime. */ 6612 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6613 if (tmp_io == bio_to_abort) { 6614 break; 6615 } 6616 } 6617 6618 /* If the target I/O still exists, set the parent to failed. 
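 * If it is no longer on the io_submitted list, it completed while the abort was in
 * flight, so the failed child abort is ignored and the parent keeps its current status.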
*/ 6619 if (tmp_io != NULL) { 6620 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6621 } 6622 } 6623 6624 parent_io->u.bdev.split_outstanding--; 6625 if (parent_io->u.bdev.split_outstanding == 0) { 6626 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6627 bdev_abort_retry(parent_io); 6628 } else { 6629 bdev_io_complete(parent_io); 6630 } 6631 } 6632 } 6633 6634 static int 6635 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6636 struct spdk_bdev_io *bio_to_abort, 6637 spdk_bdev_io_completion_cb cb, void *cb_arg) 6638 { 6639 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6640 struct spdk_bdev_io *bdev_io; 6641 6642 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 6643 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 6644 /* TODO: Abort reset or abort request. */ 6645 return -ENOTSUP; 6646 } 6647 6648 bdev_io = bdev_channel_get_io(channel); 6649 if (bdev_io == NULL) { 6650 return -ENOMEM; 6651 } 6652 6653 bdev_io->internal.ch = channel; 6654 bdev_io->internal.desc = desc; 6655 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6656 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6657 6658 if (bdev->split_on_optimal_io_boundary && bio_to_abort->internal.split) { 6659 assert(bdev_io_should_split(bio_to_abort)); 6660 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 6661 6662 /* Parent abort request is not submitted directly, but to manage its 6663 * execution add it to the submitted list here. 6664 */ 6665 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6666 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6667 6668 bdev_abort(bdev_io); 6669 6670 return 0; 6671 } 6672 6673 bdev_io->u.abort.bio_to_abort = bio_to_abort; 6674 6675 /* Submit the abort request to the underlying bdev module. */ 6676 bdev_io_submit(bdev_io); 6677 6678 return 0; 6679 } 6680 6681 static bool 6682 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq) 6683 { 6684 struct spdk_bdev_io *iter; 6685 6686 TAILQ_FOREACH(iter, tailq, internal.link) { 6687 if (iter == bdev_io) { 6688 return true; 6689 } 6690 } 6691 6692 return false; 6693 } 6694 6695 static uint32_t 6696 _bdev_abort(struct spdk_bdev_io *parent_io) 6697 { 6698 struct spdk_bdev_desc *desc = parent_io->internal.desc; 6699 struct spdk_bdev_channel *channel = parent_io->internal.ch; 6700 void *bio_cb_arg; 6701 struct spdk_bdev_io *bio_to_abort; 6702 uint32_t matched_ios; 6703 int rc; 6704 6705 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 6706 6707 /* matched_ios is returned and will be kept by the caller. 6708 * 6709 * This function will be used for two cases, 1) the same cb_arg is used for 6710 * multiple I/Os, 2) a single large I/O is split into smaller ones. 6711 * Incrementing split_outstanding directly here may confuse readers especially 6712 * for the 1st case. 6713 * 6714 * Completion of I/O abort is processed after stack unwinding. Hence this trick 6715 * works as expected. 6716 */ 6717 matched_ios = 0; 6718 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6719 6720 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 6721 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 6722 continue; 6723 } 6724 6725 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 6726 /* Any I/O which was submitted after this abort command should be excluded. 
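 * Such I/Os share the same caller_ctx but belong to commands issued after the
 * abort, so they are skipped based on the submit_tsc comparison above.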
*/ 6727 continue; 6728 } 6729 6730 /* We can't abort a request that's being pushed/pulled or executed by accel */ 6731 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) || 6732 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) { 6733 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6734 break; 6735 } 6736 6737 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 6738 if (rc != 0) { 6739 if (rc == -ENOMEM) { 6740 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 6741 } else { 6742 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6743 } 6744 break; 6745 } 6746 matched_ios++; 6747 } 6748 6749 return matched_ios; 6750 } 6751 6752 static void
6753 bdev_abort_retry(void *ctx) 6754 { 6755 struct spdk_bdev_io *parent_io = ctx; 6756 uint32_t matched_ios; 6757 6758 matched_ios = _bdev_abort(parent_io); 6759 6760 if (matched_ios == 0) { 6761 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6762 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6763 } else { 6764 /* For retry, the case that no target I/O was found is a success 6765 * because it means target I/Os completed in the meantime. 6766 */ 6767 bdev_io_complete(parent_io); 6768 } 6769 return; 6770 } 6771 6772 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6773 parent_io->u.bdev.split_outstanding = matched_ios; 6774 } 6775 6776 static void
6777 bdev_abort(struct spdk_bdev_io *parent_io) 6778 { 6779 uint32_t matched_ios; 6780 6781 matched_ios = _bdev_abort(parent_io); 6782 6783 if (matched_ios == 0) { 6784 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6785 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6786 } else { 6787 /* The case where no target I/O was found is a failure. */ 6788 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6789 bdev_io_complete(parent_io); 6790 } 6791 return; 6792 } 6793 6794 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6795 parent_io->u.bdev.split_outstanding = matched_ios; 6796 } 6797 6798 int
6799 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6800 void *bio_cb_arg, 6801 spdk_bdev_io_completion_cb cb, void *cb_arg) 6802 { 6803 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6804 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6805 struct spdk_bdev_io *bdev_io; 6806 6807 if (bio_cb_arg == NULL) { 6808 return -EINVAL; 6809 } 6810 6811 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 6812 return -ENOTSUP; 6813 } 6814 6815 bdev_io = bdev_channel_get_io(channel); 6816 if (bdev_io == NULL) { 6817 return -ENOMEM; 6818 } 6819 6820 bdev_io->internal.ch = channel; 6821 bdev_io->internal.desc = desc; 6822 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6823 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6824 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6825 6826 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 6827 6828 /* Parent abort request is not submitted directly, but to manage its execution, 6829 * add it to the submitted list here. 
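 * The parent is completed from bdev_abort_io_done() once u.bdev.split_outstanding
 * drops to zero, or retried via bdev_abort_retry() if a child abort hit -ENOMEM.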
6830 */ 6831 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6832 6833 bdev_abort(bdev_io); 6834 6835 return 0; 6836 } 6837 6838 int 6839 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6840 struct spdk_bdev_io_wait_entry *entry) 6841 { 6842 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6843 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 6844 6845 if (bdev != entry->bdev) { 6846 SPDK_ERRLOG("bdevs do not match\n"); 6847 return -EINVAL; 6848 } 6849 6850 if (mgmt_ch->per_thread_cache_count > 0) { 6851 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 6852 return -EINVAL; 6853 } 6854 6855 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 6856 return 0; 6857 } 6858 6859 static inline void 6860 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 6861 { 6862 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 6863 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 6864 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 6865 uint32_t blocklen = bdev_io->bdev->blocklen; 6866 6867 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 6868 switch (bdev_io->type) { 6869 case SPDK_BDEV_IO_TYPE_READ: 6870 io_stat->bytes_read += num_blocks * blocklen; 6871 io_stat->num_read_ops++; 6872 io_stat->read_latency_ticks += tsc_diff; 6873 if (io_stat->max_read_latency_ticks < tsc_diff) { 6874 io_stat->max_read_latency_ticks = tsc_diff; 6875 } 6876 if (io_stat->min_read_latency_ticks > tsc_diff) { 6877 io_stat->min_read_latency_ticks = tsc_diff; 6878 } 6879 break; 6880 case SPDK_BDEV_IO_TYPE_WRITE: 6881 io_stat->bytes_written += num_blocks * blocklen; 6882 io_stat->num_write_ops++; 6883 io_stat->write_latency_ticks += tsc_diff; 6884 if (io_stat->max_write_latency_ticks < tsc_diff) { 6885 io_stat->max_write_latency_ticks = tsc_diff; 6886 } 6887 if (io_stat->min_write_latency_ticks > tsc_diff) { 6888 io_stat->min_write_latency_ticks = tsc_diff; 6889 } 6890 break; 6891 case SPDK_BDEV_IO_TYPE_UNMAP: 6892 io_stat->bytes_unmapped += num_blocks * blocklen; 6893 io_stat->num_unmap_ops++; 6894 io_stat->unmap_latency_ticks += tsc_diff; 6895 if (io_stat->max_unmap_latency_ticks < tsc_diff) { 6896 io_stat->max_unmap_latency_ticks = tsc_diff; 6897 } 6898 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 6899 io_stat->min_unmap_latency_ticks = tsc_diff; 6900 } 6901 break; 6902 case SPDK_BDEV_IO_TYPE_ZCOPY: 6903 /* Track the data in the start phase only */ 6904 if (bdev_io->u.bdev.zcopy.start) { 6905 if (bdev_io->u.bdev.zcopy.populate) { 6906 io_stat->bytes_read += num_blocks * blocklen; 6907 io_stat->num_read_ops++; 6908 io_stat->read_latency_ticks += tsc_diff; 6909 if (io_stat->max_read_latency_ticks < tsc_diff) { 6910 io_stat->max_read_latency_ticks = tsc_diff; 6911 } 6912 if (io_stat->min_read_latency_ticks > tsc_diff) { 6913 io_stat->min_read_latency_ticks = tsc_diff; 6914 } 6915 } else { 6916 io_stat->bytes_written += num_blocks * blocklen; 6917 io_stat->num_write_ops++; 6918 io_stat->write_latency_ticks += tsc_diff; 6919 if (io_stat->max_write_latency_ticks < tsc_diff) { 6920 io_stat->max_write_latency_ticks = tsc_diff; 6921 } 6922 if (io_stat->min_write_latency_ticks > tsc_diff) { 6923 io_stat->min_write_latency_ticks = tsc_diff; 6924 } 6925 } 6926 } 6927 break; 6928 case SPDK_BDEV_IO_TYPE_COPY: 6929 io_stat->bytes_copied += num_blocks * blocklen; 6930 io_stat->num_copy_ops++; 6931 bdev_io->internal.ch->stat->copy_latency_ticks += 
tsc_diff; 6932 if (io_stat->max_copy_latency_ticks < tsc_diff) { 6933 io_stat->max_copy_latency_ticks = tsc_diff; 6934 } 6935 if (io_stat->min_copy_latency_ticks > tsc_diff) { 6936 io_stat->min_copy_latency_ticks = tsc_diff; 6937 } 6938 break; 6939 default: 6940 break; 6941 } 6942 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 6943 io_stat = bdev_io->bdev->internal.stat; 6944 assert(io_stat->io_error != NULL); 6945 6946 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 6947 io_stat->io_error->error_status[-io_status - 1]++; 6948 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 6949 } 6950 6951 #ifdef SPDK_CONFIG_VTUNE 6952 uint64_t now_tsc = spdk_get_ticks(); 6953 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 6954 uint64_t data[5]; 6955 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 6956 6957 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 6958 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 6959 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 6960 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 6961 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 6962 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 6963 6964 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 6965 __itt_metadata_u64, 5, data); 6966 6967 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 6968 bdev_io->internal.ch->start_tsc = now_tsc; 6969 } 6970 #endif 6971 } 6972 6973 static inline void 6974 _bdev_io_complete(void *ctx) 6975 { 6976 struct spdk_bdev_io *bdev_io = ctx; 6977 6978 if (spdk_unlikely(bdev_io->internal.accel_sequence != NULL)) { 6979 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 6980 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 6981 } 6982 6983 assert(bdev_io->internal.cb != NULL); 6984 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 6985 6986 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 6987 bdev_io->internal.caller_ctx); 6988 } 6989 6990 static inline void 6991 bdev_io_complete(void *ctx) 6992 { 6993 struct spdk_bdev_io *bdev_io = ctx; 6994 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6995 uint64_t tsc, tsc_diff; 6996 6997 if (spdk_unlikely(bdev_io->internal.in_submit_request)) { 6998 /* 6999 * Defer completion to avoid potential infinite recursion if the 7000 * user's completion callback issues a new I/O. 7001 */ 7002 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7003 bdev_io_complete, bdev_io); 7004 return; 7005 } 7006 7007 tsc = spdk_get_ticks(); 7008 tsc_diff = tsc - bdev_io->internal.submit_tsc; 7009 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 7010 bdev_io->internal.caller_ctx); 7011 7012 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 7013 7014 if (bdev_io->internal.ch->histogram) { 7015 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 7016 } 7017 7018 bdev_io_update_io_stat(bdev_io, tsc_diff); 7019 _bdev_io_complete(bdev_io); 7020 } 7021 7022 /* The difference between this function and bdev_io_complete() is that this should be called to 7023 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 7024 * io_submitted list and don't have submit_tsc updated. 
7025 */ 7026 static inline void 7027 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 7028 { 7029 /* Since the IO hasn't been submitted it's bound to be failed */ 7030 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7031 7032 /* At this point we don't know if the IO is completed from submission context or not, but, 7033 * since this is an error path, we can always do an spdk_thread_send_msg(). */ 7034 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7035 _bdev_io_complete, bdev_io); 7036 } 7037 7038 static void bdev_destroy_cb(void *io_device); 7039 7040 static void 7041 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7042 { 7043 struct spdk_bdev_io *bdev_io = _ctx; 7044 7045 if (bdev_io->u.reset.ch_ref != NULL) { 7046 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7047 bdev_io->u.reset.ch_ref = NULL; 7048 } 7049 7050 bdev_io_complete(bdev_io); 7051 7052 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7053 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7054 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7055 } 7056 } 7057 7058 static void 7059 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7060 struct spdk_io_channel *_ch, void *_ctx) 7061 { 7062 struct spdk_bdev_io *bdev_io = _ctx; 7063 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7064 struct spdk_bdev_io *queued_reset; 7065 7066 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7067 while (!TAILQ_EMPTY(&ch->queued_resets)) { 7068 queued_reset = TAILQ_FIRST(&ch->queued_resets); 7069 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 7070 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 7071 } 7072 7073 spdk_bdev_for_each_channel_continue(i, 0); 7074 } 7075 7076 static void 7077 bdev_io_complete_sequence_cb(void *ctx, int status) 7078 { 7079 struct spdk_bdev_io *bdev_io = ctx; 7080 7081 /* u.bdev.accel_sequence should have already been cleared at this point */ 7082 assert(bdev_io->u.bdev.accel_sequence == NULL); 7083 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7084 bdev_io->internal.accel_sequence = NULL; 7085 7086 if (spdk_unlikely(status != 0)) { 7087 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7088 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7089 } 7090 7091 bdev_io_complete(bdev_io); 7092 } 7093 7094 void 7095 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7096 { 7097 struct spdk_bdev *bdev = bdev_io->bdev; 7098 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7099 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7100 7101 if (bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING) { 7102 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7103 spdk_bdev_get_module_name(bdev), 7104 bdev_io_status_get_string(bdev_io->internal.status)); 7105 assert(false); 7106 } 7107 bdev_io->internal.status = status; 7108 7109 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7110 bool unlock_channels = false; 7111 7112 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 7113 SPDK_ERRLOG("NOMEM returned for reset\n"); 7114 } 7115 spdk_spin_lock(&bdev->internal.spinlock); 7116 if (bdev_io == bdev->internal.reset_in_progress) { 7117 bdev->internal.reset_in_progress = NULL; 7118 unlock_channels = true; 7119 } 7120 spdk_spin_unlock(&bdev->internal.spinlock); 7121 7122 if (unlock_channels) { 7123 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 
7124 bdev_reset_complete); 7125 return; 7126 } 7127 } else { 7128 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7129 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7130 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7131 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7132 return; 7133 } else if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0)) { 7134 _bdev_io_push_bounce_data_buffer(bdev_io, 7135 _bdev_io_complete_push_bounce_done); 7136 /* bdev IO will be completed in the callback */ 7137 return; 7138 } 7139 } 7140 7141 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7142 return; 7143 } 7144 } 7145 7146 bdev_io_complete(bdev_io); 7147 } 7148 7149 void 7150 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 7151 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7152 { 7153 enum spdk_bdev_io_status status; 7154 7155 if (sc == SPDK_SCSI_STATUS_GOOD) { 7156 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7157 } else { 7158 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7159 bdev_io->internal.error.scsi.sc = sc; 7160 bdev_io->internal.error.scsi.sk = sk; 7161 bdev_io->internal.error.scsi.asc = asc; 7162 bdev_io->internal.error.scsi.ascq = ascq; 7163 } 7164 7165 spdk_bdev_io_complete(bdev_io, status); 7166 } 7167 7168 void 7169 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7170 int *sc, int *sk, int *asc, int *ascq) 7171 { 7172 assert(sc != NULL); 7173 assert(sk != NULL); 7174 assert(asc != NULL); 7175 assert(ascq != NULL); 7176 7177 switch (bdev_io->internal.status) { 7178 case SPDK_BDEV_IO_STATUS_SUCCESS: 7179 *sc = SPDK_SCSI_STATUS_GOOD; 7180 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7181 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7182 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7183 break; 7184 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7185 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7186 break; 7187 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7188 *sc = bdev_io->internal.error.scsi.sc; 7189 *sk = bdev_io->internal.error.scsi.sk; 7190 *asc = bdev_io->internal.error.scsi.asc; 7191 *ascq = bdev_io->internal.error.scsi.ascq; 7192 break; 7193 default: 7194 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7195 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7196 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7197 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7198 break; 7199 } 7200 } 7201 7202 void 7203 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7204 { 7205 enum spdk_bdev_io_status status; 7206 7207 if (aio_result == 0) { 7208 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7209 } else { 7210 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7211 } 7212 7213 bdev_io->internal.error.aio_result = aio_result; 7214 7215 spdk_bdev_io_complete(bdev_io, status); 7216 } 7217 7218 void 7219 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7220 { 7221 assert(aio_result != NULL); 7222 7223 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7224 *aio_result = bdev_io->internal.error.aio_result; 7225 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7226 *aio_result = 0; 7227 } else { 7228 *aio_result = -EIO; 7229 } 7230 } 7231 7232 void 7233 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7234 { 7235 enum spdk_bdev_io_status status; 7236 7237 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 7238 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7239 } else if (sct == 
SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7240 status = SPDK_BDEV_IO_STATUS_ABORTED; 7241 } else { 7242 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7243 } 7244 7245 bdev_io->internal.error.nvme.cdw0 = cdw0; 7246 bdev_io->internal.error.nvme.sct = sct; 7247 bdev_io->internal.error.nvme.sc = sc; 7248 7249 spdk_bdev_io_complete(bdev_io, status); 7250 } 7251 7252 void 7253 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7254 { 7255 assert(sct != NULL); 7256 assert(sc != NULL); 7257 assert(cdw0 != NULL); 7258 7259 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7260 *sct = SPDK_NVME_SCT_GENERIC; 7261 *sc = SPDK_NVME_SC_SUCCESS; 7262 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7263 *cdw0 = 0; 7264 } else { 7265 *cdw0 = 1U; 7266 } 7267 return; 7268 } 7269 7270 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7271 *sct = bdev_io->internal.error.nvme.sct; 7272 *sc = bdev_io->internal.error.nvme.sc; 7273 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7274 *sct = SPDK_NVME_SCT_GENERIC; 7275 *sc = SPDK_NVME_SC_SUCCESS; 7276 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7277 *sct = SPDK_NVME_SCT_GENERIC; 7278 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7279 } else { 7280 *sct = SPDK_NVME_SCT_GENERIC; 7281 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7282 } 7283 7284 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7285 } 7286 7287 void 7288 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7289 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7290 { 7291 assert(first_sct != NULL); 7292 assert(first_sc != NULL); 7293 assert(second_sct != NULL); 7294 assert(second_sc != NULL); 7295 assert(cdw0 != NULL); 7296 7297 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7298 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7299 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7300 *first_sct = bdev_io->internal.error.nvme.sct; 7301 *first_sc = bdev_io->internal.error.nvme.sc; 7302 *second_sct = SPDK_NVME_SCT_GENERIC; 7303 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7304 } else { 7305 *first_sct = SPDK_NVME_SCT_GENERIC; 7306 *first_sc = SPDK_NVME_SC_SUCCESS; 7307 *second_sct = bdev_io->internal.error.nvme.sct; 7308 *second_sc = bdev_io->internal.error.nvme.sc; 7309 } 7310 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7311 *first_sct = SPDK_NVME_SCT_GENERIC; 7312 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7313 *second_sct = SPDK_NVME_SCT_GENERIC; 7314 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7315 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7316 *first_sct = SPDK_NVME_SCT_GENERIC; 7317 *first_sc = SPDK_NVME_SC_SUCCESS; 7318 *second_sct = SPDK_NVME_SCT_GENERIC; 7319 *second_sc = SPDK_NVME_SC_SUCCESS; 7320 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7321 *first_sct = SPDK_NVME_SCT_GENERIC; 7322 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7323 *second_sct = SPDK_NVME_SCT_GENERIC; 7324 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7325 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7326 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7327 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 7328 *second_sct = SPDK_NVME_SCT_GENERIC; 7329 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7330 } else { 7331 *first_sct = SPDK_NVME_SCT_GENERIC; 7332 *first_sc 
= SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7333 *second_sct = SPDK_NVME_SCT_GENERIC; 7334 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7335 } 7336 7337 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7338 } 7339 7340 struct spdk_thread * 7341 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7342 { 7343 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7344 } 7345 7346 struct spdk_io_channel * 7347 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7348 { 7349 return bdev_io->internal.ch->channel; 7350 } 7351 7352 static int 7353 bdev_register(struct spdk_bdev *bdev) 7354 { 7355 char *bdev_name; 7356 char uuid[SPDK_UUID_STRING_LEN]; 7357 struct spdk_iobuf_opts iobuf_opts; 7358 int ret, i; 7359 7360 assert(bdev->module != NULL); 7361 7362 if (!bdev->name) { 7363 SPDK_ERRLOG("Bdev name is NULL\n"); 7364 return -EINVAL; 7365 } 7366 7367 if (!strlen(bdev->name)) { 7368 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7369 return -EINVAL; 7370 } 7371 7372 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7373 if (bdev->fn_table->accel_sequence_supported == NULL) { 7374 continue; 7375 } 7376 if (!bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7377 (enum spdk_bdev_io_type)i)) { 7378 continue; 7379 } 7380 7381 if (spdk_bdev_get_memory_domains(bdev, NULL, 0) <= 0) { 7382 SPDK_ERRLOG("bdev supporting accel sequence is required to support " 7383 "memory domains\n"); 7384 return -EINVAL; 7385 } 7386 7387 if (spdk_bdev_is_md_separate(bdev)) { 7388 SPDK_ERRLOG("Separate metadata is currently unsupported for bdevs with " 7389 "accel sequence support\n"); 7390 return -EINVAL; 7391 } 7392 } 7393 7394 /* Users often register their own I/O devices using the bdev name. In 7395 * order to avoid conflicts, prepend bdev_. */ 7396 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7397 if (!bdev_name) { 7398 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7399 return -ENOMEM; 7400 } 7401 7402 bdev->internal.stat = bdev_alloc_io_stat(true); 7403 if (!bdev->internal.stat) { 7404 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7405 free(bdev_name); 7406 return -ENOMEM; 7407 } 7408 7409 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7410 bdev->internal.measured_queue_depth = UINT64_MAX; 7411 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7412 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7413 bdev->internal.qd_poller = NULL; 7414 bdev->internal.qos = NULL; 7415 7416 TAILQ_INIT(&bdev->internal.open_descs); 7417 TAILQ_INIT(&bdev->internal.locked_ranges); 7418 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7419 TAILQ_INIT(&bdev->aliases); 7420 7421 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 7422 if (ret != 0) { 7423 bdev_free_io_stat(bdev->internal.stat); 7424 free(bdev_name); 7425 return ret; 7426 } 7427 7428 /* UUID may be specified by the user or defined by bdev itself. 7429 * Otherwise it will be generated here, so this field will never be empty. 
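 * Note that a UUID generated here is created anew at each registration, so it is
 * not stable across application restarts; a bdev module that needs a persistent
 * identity should fill in bdev->uuid itself before calling spdk_bdev_register().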
*/ 7430 if (spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { 7431 spdk_uuid_generate(&bdev->uuid); 7432 } 7433 7434 /* Add the UUID alias only if it's different than the name */ 7435 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7436 if (strcmp(bdev->name, uuid) != 0) { 7437 ret = spdk_bdev_alias_add(bdev, uuid); 7438 if (ret != 0) { 7439 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7440 bdev_name_del(&bdev->internal.bdev_name); 7441 bdev_free_io_stat(bdev->internal.stat); 7442 free(bdev_name); 7443 return ret; 7444 } 7445 } 7446 7447 if (spdk_bdev_get_buf_align(bdev) > 1) { 7448 if (bdev->split_on_optimal_io_boundary) { 7449 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 7450 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 7451 } else { 7452 bdev->split_on_optimal_io_boundary = true; 7453 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 7454 } 7455 } 7456 7457 /* If the user didn't specify a write unit size, set it to one. */ 7458 if (bdev->write_unit_size == 0) { 7459 bdev->write_unit_size = 1; 7460 } 7461 7462 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7463 if (bdev->acwu == 0) { 7464 bdev->acwu = bdev->write_unit_size; 7465 } 7466 7467 if (bdev->phys_blocklen == 0) { 7468 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7469 } 7470 7471 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 7472 spdk_iobuf_get_opts(&iobuf_opts); 7473 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 7474 } 7475 7476 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 7477 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 7478 } 7479 7480 bdev->internal.reset_in_progress = NULL; 7481 bdev->internal.qd_poll_in_progress = false; 7482 bdev->internal.period = 0; 7483 bdev->internal.new_period = 0; 7484 7485 spdk_io_device_register(__bdev_to_io_dev(bdev), 7486 bdev_channel_create, bdev_channel_destroy, 7487 sizeof(struct spdk_bdev_channel), 7488 bdev_name); 7489 7490 free(bdev_name); 7491 7492 spdk_spin_init(&bdev->internal.spinlock); 7493 7494 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 7495 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 7496 7497 return 0; 7498 } 7499 7500 static void 7501 bdev_destroy_cb(void *io_device) 7502 { 7503 int rc; 7504 struct spdk_bdev *bdev; 7505 spdk_bdev_unregister_cb cb_fn; 7506 void *cb_arg; 7507 7508 bdev = __bdev_from_io_dev(io_device); 7509 7510 if (bdev->internal.unregister_td != spdk_get_thread()) { 7511 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 7512 return; 7513 } 7514 7515 cb_fn = bdev->internal.unregister_cb; 7516 cb_arg = bdev->internal.unregister_ctx; 7517 7518 spdk_spin_destroy(&bdev->internal.spinlock); 7519 free(bdev->internal.qos); 7520 bdev_free_io_stat(bdev->internal.stat); 7521 7522 rc = bdev->fn_table->destruct(bdev->ctxt); 7523 if (rc < 0) { 7524 SPDK_ERRLOG("destruct failed\n"); 7525 } 7526 if (rc <= 0 && cb_fn != NULL) { 7527 cb_fn(cb_arg, rc); 7528 } 7529 } 7530 7531 void 7532 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 7533 { 7534 if (bdev->internal.unregister_cb != NULL) { 7535 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 7536 } 7537 } 7538 7539 static void 7540 _remove_notify(void *arg) 7541 { 7542 struct spdk_bdev_desc *desc = arg; 7543 7544 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 7545 } 7546 7547 /* returns: 0 
- bdev removed and ready to be destructed. 7548 * -EBUSY - bdev can't be destructed yet. */ 7549 static int 7550 bdev_unregister_unsafe(struct spdk_bdev *bdev) 7551 { 7552 struct spdk_bdev_desc *desc, *tmp; 7553 int rc = 0; 7554 char uuid[SPDK_UUID_STRING_LEN]; 7555 7556 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 7557 assert(spdk_spin_held(&bdev->internal.spinlock)); 7558 7559 /* Notify each descriptor about hotremoval */ 7560 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 7561 rc = -EBUSY; 7562 /* 7563 * Defer invocation of the event_cb to a separate message that will 7564 * run later on its thread. This ensures this context unwinds and 7565 * we don't recursively unregister this bdev again if the event_cb 7566 * immediately closes its descriptor. 7567 */ 7568 event_notify(desc, _remove_notify); 7569 } 7570 7571 /* If there are no descriptors, proceed removing the bdev */ 7572 if (rc == 0) { 7573 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 7574 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 7575 7576 /* Delete the name and the UUID alias */ 7577 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7578 bdev_name_del_unsafe(&bdev->internal.bdev_name); 7579 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 7580 7581 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 7582 7583 if (bdev->internal.reset_in_progress != NULL) { 7584 /* If reset is in progress, let the completion callback for reset 7585 * unregister the bdev. 7586 */ 7587 rc = -EBUSY; 7588 } 7589 } 7590 7591 return rc; 7592 } 7593 7594 static void 7595 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7596 struct spdk_io_channel *io_ch, void *_ctx) 7597 { 7598 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 7599 7600 bdev_channel_abort_queued_ios(bdev_ch); 7601 spdk_bdev_for_each_channel_continue(i, 0); 7602 } 7603 7604 static void 7605 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 7606 { 7607 int rc; 7608 7609 spdk_spin_lock(&g_bdev_mgr.spinlock); 7610 spdk_spin_lock(&bdev->internal.spinlock); 7611 /* 7612 * Set the status to REMOVING after completing to abort channels. Otherwise, 7613 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 7614 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 7615 * may fail. 7616 */ 7617 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 7618 rc = bdev_unregister_unsafe(bdev); 7619 spdk_spin_unlock(&bdev->internal.spinlock); 7620 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7621 7622 if (rc == 0) { 7623 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7624 } 7625 } 7626 7627 void 7628 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7629 { 7630 struct spdk_thread *thread; 7631 7632 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 7633 7634 thread = spdk_get_thread(); 7635 if (!thread) { 7636 /* The user called this from a non-SPDK thread. 
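 * spdk_bdev_unregister() has to iterate the bdev's channels via
 * spdk_bdev_for_each_channel(), which is only possible from an SPDK thread,
 * so report -ENOTSUP to the caller instead of proceeding.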
*/ 7637 if (cb_fn != NULL) { 7638 cb_fn(cb_arg, -ENOTSUP); 7639 } 7640 return; 7641 } 7642 7643 spdk_spin_lock(&g_bdev_mgr.spinlock); 7644 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7645 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7646 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7647 if (cb_fn) { 7648 cb_fn(cb_arg, -EBUSY); 7649 } 7650 return; 7651 } 7652 7653 spdk_spin_lock(&bdev->internal.spinlock); 7654 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 7655 bdev->internal.unregister_cb = cb_fn; 7656 bdev->internal.unregister_ctx = cb_arg; 7657 bdev->internal.unregister_td = thread; 7658 spdk_spin_unlock(&bdev->internal.spinlock); 7659 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7660 7661 spdk_bdev_set_qd_sampling_period(bdev, 0); 7662 7663 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 7664 bdev_unregister); 7665 } 7666 7667 int 7668 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 7669 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7670 { 7671 struct spdk_bdev_desc *desc; 7672 struct spdk_bdev *bdev; 7673 int rc; 7674 7675 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 7676 if (rc != 0) { 7677 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 7678 return rc; 7679 } 7680 7681 bdev = spdk_bdev_desc_get_bdev(desc); 7682 7683 if (bdev->module != module) { 7684 spdk_bdev_close(desc); 7685 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 7686 bdev_name); 7687 return -ENODEV; 7688 } 7689 7690 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 7691 7692 spdk_bdev_close(desc); 7693 7694 return 0; 7695 } 7696 7697 static int 7698 bdev_start_qos(struct spdk_bdev *bdev) 7699 { 7700 struct set_qos_limit_ctx *ctx; 7701 7702 /* Enable QoS */ 7703 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 7704 ctx = calloc(1, sizeof(*ctx)); 7705 if (ctx == NULL) { 7706 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 7707 return -ENOMEM; 7708 } 7709 ctx->bdev = bdev; 7710 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 7711 } 7712 7713 return 0; 7714 } 7715 7716 static void 7717 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 7718 struct spdk_bdev *bdev) 7719 { 7720 enum spdk_bdev_claim_type type; 7721 const char *typename, *modname; 7722 extern struct spdk_log_flag SPDK_LOG_bdev; 7723 7724 assert(spdk_spin_held(&bdev->internal.spinlock)); 7725 7726 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 7727 return; 7728 } 7729 7730 type = bdev->internal.claim_type; 7731 typename = spdk_bdev_claim_get_name(type); 7732 7733 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 7734 modname = bdev->internal.claim.v1.module->name; 7735 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7736 bdev->name, detail, typename, modname); 7737 return; 7738 } 7739 7740 if (claim_type_is_v2(type)) { 7741 struct spdk_bdev_module_claim *claim; 7742 7743 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 7744 modname = claim->module->name; 7745 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7746 bdev->name, detail, typename, modname); 7747 } 7748 return; 7749 } 7750 7751 assert(false); 7752 } 7753 7754 static int 7755 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 7756 { 7757 struct spdk_thread *thread; 7758 int rc = 0; 7759 7760 thread = spdk_get_thread(); 7761 if (!thread) { 7762 
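	/* A descriptor is bound to the opening thread (desc->thread below) and
	 * remove/media events are delivered on that thread, so an SPDK thread
	 * context is mandatory here.
	 */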
SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 7763 return -ENOTSUP; 7764 } 7765 7766 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7767 spdk_get_thread()); 7768 7769 desc->bdev = bdev; 7770 desc->thread = thread; 7771 desc->write = write; 7772 7773 spdk_spin_lock(&bdev->internal.spinlock); 7774 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7775 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7776 spdk_spin_unlock(&bdev->internal.spinlock); 7777 return -ENODEV; 7778 } 7779 7780 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7781 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7782 spdk_spin_unlock(&bdev->internal.spinlock); 7783 return -EPERM; 7784 } 7785 7786 rc = bdev_start_qos(bdev); 7787 if (rc != 0) { 7788 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 7789 spdk_spin_unlock(&bdev->internal.spinlock); 7790 return rc; 7791 } 7792 7793 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 7794 7795 spdk_spin_unlock(&bdev->internal.spinlock); 7796 7797 return 0; 7798 } 7799 7800 static int 7801 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 7802 struct spdk_bdev_desc **_desc) 7803 { 7804 struct spdk_bdev_desc *desc; 7805 unsigned int i; 7806 7807 desc = calloc(1, sizeof(*desc)); 7808 if (desc == NULL) { 7809 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 7810 return -ENOMEM; 7811 } 7812 7813 TAILQ_INIT(&desc->pending_media_events); 7814 TAILQ_INIT(&desc->free_media_events); 7815 7816 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 7817 desc->callback.event_fn = event_cb; 7818 desc->callback.ctx = event_ctx; 7819 spdk_spin_init(&desc->spinlock); 7820 7821 if (bdev->media_events) { 7822 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 7823 sizeof(*desc->media_events_buffer)); 7824 if (desc->media_events_buffer == NULL) { 7825 SPDK_ERRLOG("Failed to initialize media event pool\n"); 7826 bdev_desc_free(desc); 7827 return -ENOMEM; 7828 } 7829 7830 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 7831 TAILQ_INSERT_TAIL(&desc->free_media_events, 7832 &desc->media_events_buffer[i], tailq); 7833 } 7834 } 7835 7836 if (bdev->fn_table->accel_sequence_supported != NULL) { 7837 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7838 desc->accel_sequence_supported[i] = 7839 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7840 (enum spdk_bdev_io_type)i); 7841 } 7842 } 7843 7844 *_desc = desc; 7845 7846 return 0; 7847 } 7848 7849 int 7850 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7851 void *event_ctx, struct spdk_bdev_desc **_desc) 7852 { 7853 struct spdk_bdev_desc *desc; 7854 struct spdk_bdev *bdev; 7855 int rc; 7856 7857 if (event_cb == NULL) { 7858 SPDK_ERRLOG("Missing event callback function\n"); 7859 return -EINVAL; 7860 } 7861 7862 spdk_spin_lock(&g_bdev_mgr.spinlock); 7863 7864 bdev = bdev_get_by_name(bdev_name); 7865 7866 if (bdev == NULL) { 7867 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 7868 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7869 return -ENODEV; 7870 } 7871 7872 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 7873 if (rc != 0) { 7874 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7875 return rc; 7876 } 7877 7878 rc = bdev_open(bdev, write, desc); 7879 if (rc != 0) { 7880 bdev_desc_free(desc); 7881 desc = NULL; 7882 } 7883 7884 *_desc = desc; 7885 7886 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7887 
7888 return rc; 7889 } 7890 7891 static void 7892 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 7893 { 7894 int rc; 7895 7896 spdk_spin_lock(&bdev->internal.spinlock); 7897 spdk_spin_lock(&desc->spinlock); 7898 7899 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 7900 7901 desc->closed = true; 7902 7903 if (desc->claim != NULL) { 7904 bdev_desc_release_claims(desc); 7905 } 7906 7907 if (0 == desc->refs) { 7908 spdk_spin_unlock(&desc->spinlock); 7909 bdev_desc_free(desc); 7910 } else { 7911 spdk_spin_unlock(&desc->spinlock); 7912 } 7913 7914 /* If no more descriptors, kill QoS channel */ 7915 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7916 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 7917 bdev->name, spdk_get_thread()); 7918 7919 if (bdev_qos_destroy(bdev)) { 7920 /* There isn't anything we can do to recover here. Just let the 7921 * old QoS poller keep running. The QoS handling won't change 7922 * cores when the user allocates a new channel, but it won't break. */ 7923 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 7924 } 7925 } 7926 7927 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7928 rc = bdev_unregister_unsafe(bdev); 7929 spdk_spin_unlock(&bdev->internal.spinlock); 7930 7931 if (rc == 0) { 7932 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7933 } 7934 } else { 7935 spdk_spin_unlock(&bdev->internal.spinlock); 7936 } 7937 } 7938 7939 void 7940 spdk_bdev_close(struct spdk_bdev_desc *desc) 7941 { 7942 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7943 7944 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7945 spdk_get_thread()); 7946 7947 assert(desc->thread == spdk_get_thread()); 7948 7949 spdk_poller_unregister(&desc->io_timeout_poller); 7950 7951 spdk_spin_lock(&g_bdev_mgr.spinlock); 7952 7953 bdev_close(bdev, desc); 7954 7955 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7956 } 7957 7958 static void 7959 bdev_register_finished(void *arg) 7960 { 7961 struct spdk_bdev_desc *desc = arg; 7962 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7963 7964 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 7965 7966 spdk_spin_lock(&g_bdev_mgr.spinlock); 7967 7968 bdev_close(bdev, desc); 7969 7970 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7971 } 7972 7973 int 7974 spdk_bdev_register(struct spdk_bdev *bdev) 7975 { 7976 struct spdk_bdev_desc *desc; 7977 struct spdk_thread *thread = spdk_get_thread(); 7978 int rc; 7979 7980 if (spdk_unlikely(spdk_thread_get_app_thread() != spdk_get_thread())) { 7981 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", bdev->name, thread, 7982 thread ? 
spdk_thread_get_name(thread) : "null"); 7983 return -EINVAL; 7984 } 7985 7986 rc = bdev_register(bdev); 7987 if (rc != 0) { 7988 return rc; 7989 } 7990 7991 /* A descriptor is opened to prevent bdev deletion during examination */ 7992 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7993 if (rc != 0) { 7994 spdk_bdev_unregister(bdev, NULL, NULL); 7995 return rc; 7996 } 7997 7998 rc = bdev_open(bdev, false, desc); 7999 if (rc != 0) { 8000 bdev_desc_free(desc); 8001 spdk_bdev_unregister(bdev, NULL, NULL); 8002 return rc; 8003 } 8004 8005 /* Examine configuration before initializing I/O */ 8006 bdev_examine(bdev); 8007 8008 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 8009 if (rc != 0) { 8010 bdev_close(bdev, desc); 8011 spdk_bdev_unregister(bdev, NULL, NULL); 8012 } 8013 8014 return rc; 8015 } 8016 8017 int 8018 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 8019 struct spdk_bdev_module *module) 8020 { 8021 spdk_spin_lock(&bdev->internal.spinlock); 8022 8023 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8024 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8025 spdk_spin_unlock(&bdev->internal.spinlock); 8026 return -EPERM; 8027 } 8028 8029 if (desc && !desc->write) { 8030 desc->write = true; 8031 } 8032 8033 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 8034 bdev->internal.claim.v1.module = module; 8035 8036 spdk_spin_unlock(&bdev->internal.spinlock); 8037 return 0; 8038 } 8039 8040 void 8041 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 8042 { 8043 spdk_spin_lock(&bdev->internal.spinlock); 8044 8045 assert(bdev->internal.claim.v1.module != NULL); 8046 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 8047 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8048 bdev->internal.claim.v1.module = NULL; 8049 8050 spdk_spin_unlock(&bdev->internal.spinlock); 8051 } 8052 8053 /* 8054 * Start claims v2 8055 */ 8056 8057 const char * 8058 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 8059 { 8060 switch (type) { 8061 case SPDK_BDEV_CLAIM_NONE: 8062 return "not_claimed"; 8063 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8064 return "exclusive_write"; 8065 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8066 return "read_many_write_one"; 8067 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8068 return "read_many_write_none"; 8069 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8070 return "read_many_write_many"; 8071 default: 8072 break; 8073 } 8074 return "invalid_claim"; 8075 } 8076 8077 static bool 8078 claim_type_is_v2(enum spdk_bdev_claim_type type) 8079 { 8080 switch (type) { 8081 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8082 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8083 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8084 return true; 8085 default: 8086 break; 8087 } 8088 return false; 8089 } 8090 8091 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
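 * Only claim types that grant write access (read_many_write_one and
 * read_many_write_shared) promote the descriptor; read_many_write_none never does.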
*/ 8092 static bool 8093 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 8094 { 8095 switch (type) { 8096 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8097 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8098 return true; 8099 default: 8100 break; 8101 } 8102 return false; 8103 } 8104 8105 void 8106 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 8107 { 8108 if (opts == NULL) { 8109 SPDK_ERRLOG("opts should not be NULL\n"); 8110 assert(opts != NULL); 8111 return; 8112 } 8113 if (size == 0) { 8114 SPDK_ERRLOG("size should not be zero\n"); 8115 assert(size != 0); 8116 return; 8117 } 8118 8119 memset(opts, 0, size); 8120 opts->opts_size = size; 8121 8122 #define FIELD_OK(field) \ 8123 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 8124 8125 #define SET_FIELD(field, value) \ 8126 if (FIELD_OK(field)) { \ 8127 opts->field = value; \ 8128 } \ 8129 8130 SET_FIELD(shared_claim_key, 0); 8131 8132 #undef FIELD_OK 8133 #undef SET_FIELD 8134 } 8135 8136 static int 8137 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 8138 { 8139 if (src->opts_size == 0) { 8140 SPDK_ERRLOG("size should not be zero\n"); 8141 return -1; 8142 } 8143 8144 memset(dst, 0, sizeof(*dst)); 8145 dst->opts_size = src->opts_size; 8146 8147 #define FIELD_OK(field) \ 8148 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 8149 8150 #define SET_FIELD(field) \ 8151 if (FIELD_OK(field)) { \ 8152 dst->field = src->field; \ 8153 } \ 8154 8155 if (FIELD_OK(name)) { 8156 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 8157 } 8158 8159 SET_FIELD(shared_claim_key); 8160 8161 /* You should not remove this statement, but need to update the assert statement 8162 * if you add a new field, and also add a corresponding SET_FIELD statement */ 8163 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 8164 8165 #undef FIELD_OK 8166 #undef SET_FIELD 8167 return 0; 8168 } 8169 8170 /* Returns 0 if a read-write-once claim can be taken. */ 8171 static int 8172 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8173 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8174 { 8175 struct spdk_bdev *bdev = desc->bdev; 8176 struct spdk_bdev_desc *open_desc; 8177 8178 assert(spdk_spin_held(&bdev->internal.spinlock)); 8179 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 8180 8181 if (opts->shared_claim_key != 0) { 8182 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 8183 bdev->name); 8184 return -EINVAL; 8185 } 8186 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8187 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8188 return -EPERM; 8189 } 8190 if (desc->claim != NULL) { 8191 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 8192 bdev->name, desc->claim->module->name); 8193 return -EPERM; 8194 } 8195 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8196 if (desc != open_desc && open_desc->write) { 8197 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 8198 "another descriptor is open for writing\n", 8199 bdev->name); 8200 return -EPERM; 8201 } 8202 } 8203 8204 return 0; 8205 } 8206 8207 /* Returns 0 if a read-only-many claim can be taken. 
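 * The descriptor must be read-only, no shared_claim_key may be supplied, and if
 * the bdev is not claimed yet there must be no other descriptor open for writing.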
*/ 8208 static int 8209 claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8210 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8211 { 8212 struct spdk_bdev *bdev = desc->bdev; 8213 struct spdk_bdev_desc *open_desc; 8214 8215 assert(spdk_spin_held(&bdev->internal.spinlock)); 8216 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE); 8217 assert(desc->claim == NULL); 8218 8219 if (desc->write) { 8220 SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n", 8221 bdev->name); 8222 return -EINVAL; 8223 } 8224 if (opts->shared_claim_key != 0) { 8225 SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name); 8226 return -EINVAL; 8227 } 8228 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8229 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8230 if (open_desc->write) { 8231 SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while " 8232 "another descriptor is open for writing\n", 8233 bdev->name); 8234 return -EPERM; 8235 } 8236 } 8237 } 8238 8239 return 0; 8240 } 8241 8242 /* Returns 0 if a read-write-many claim can be taken. */ 8243 static int 8244 claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8245 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8246 { 8247 struct spdk_bdev *bdev = desc->bdev; 8248 struct spdk_bdev_desc *open_desc; 8249 8250 assert(spdk_spin_held(&bdev->internal.spinlock)); 8251 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED); 8252 assert(desc->claim == NULL); 8253 8254 if (opts->shared_claim_key == 0) { 8255 SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n", 8256 bdev->name); 8257 return -EINVAL; 8258 } 8259 switch (bdev->internal.claim_type) { 8260 case SPDK_BDEV_CLAIM_NONE: 8261 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8262 if (open_desc == desc) { 8263 continue; 8264 } 8265 if (open_desc->write) { 8266 SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while " 8267 "another descriptor is open for writing without a " 8268 "claim\n", bdev->name); 8269 return -EPERM; 8270 } 8271 } 8272 break; 8273 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8274 if (opts->shared_claim_key != bdev->internal.claim.v2.key) { 8275 LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev); 8276 return -EPERM; 8277 } 8278 break; 8279 default: 8280 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8281 return -EBUSY; 8282 } 8283 8284 return 0; 8285 } 8286 8287 /* Updates desc and its bdev with a v2 claim. 
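 * The claim is linked onto bdev->internal.claim.v2.claims and, for claim types that
 * imply write access, the descriptor is promoted to writable. A module normally gets
 * here through spdk_bdev_module_claim_bdev_desc(); an illustrative sketch (desc, rc,
 * the claim name and my_bdev_module are hypothetical):
 *
 *	struct spdk_bdev_claim_opts opts;
 *
 *	spdk_bdev_claim_opts_init(&opts, sizeof(opts));
 *	snprintf(opts.name, sizeof(opts.name), "my_claim");
 *	rc = spdk_bdev_module_claim_bdev_desc(desc, SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE,
 *					      &opts, &my_bdev_module);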
*/ 8288 static int 8289 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8290 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8291 { 8292 struct spdk_bdev *bdev = desc->bdev; 8293 struct spdk_bdev_module_claim *claim; 8294 8295 assert(spdk_spin_held(&bdev->internal.spinlock)); 8296 assert(claim_type_is_v2(type)); 8297 assert(desc->claim == NULL); 8298 8299 claim = calloc(1, sizeof(*desc->claim)); 8300 if (claim == NULL) { 8301 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 8302 return -ENOMEM; 8303 } 8304 claim->module = module; 8305 claim->desc = desc; 8306 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 8307 memcpy(claim->name, opts->name, sizeof(claim->name)); 8308 desc->claim = claim; 8309 8310 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8311 bdev->internal.claim_type = type; 8312 TAILQ_INIT(&bdev->internal.claim.v2.claims); 8313 bdev->internal.claim.v2.key = opts->shared_claim_key; 8314 } 8315 assert(type == bdev->internal.claim_type); 8316 8317 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 8318 8319 if (!desc->write && claim_type_promotes_to_write(type)) { 8320 desc->write = true; 8321 } 8322 8323 return 0; 8324 } 8325 8326 int 8327 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8328 struct spdk_bdev_claim_opts *_opts, 8329 struct spdk_bdev_module *module) 8330 { 8331 struct spdk_bdev *bdev; 8332 struct spdk_bdev_claim_opts opts; 8333 int rc = 0; 8334 8335 if (desc == NULL) { 8336 SPDK_ERRLOG("descriptor must not be NULL\n"); 8337 return -EINVAL; 8338 } 8339 8340 bdev = desc->bdev; 8341 8342 if (_opts == NULL) { 8343 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 8344 } else if (claim_opts_copy(_opts, &opts) != 0) { 8345 return -EINVAL; 8346 } 8347 8348 spdk_spin_lock(&bdev->internal.spinlock); 8349 8350 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 8351 bdev->internal.claim_type != type) { 8352 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8353 spdk_spin_unlock(&bdev->internal.spinlock); 8354 return -EPERM; 8355 } 8356 8357 if (claim_type_is_v2(type) && desc->claim != NULL) { 8358 SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n", 8359 bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name); 8360 spdk_spin_unlock(&bdev->internal.spinlock); 8361 return -EPERM; 8362 } 8363 8364 switch (type) { 8365 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8366 spdk_spin_unlock(&bdev->internal.spinlock); 8367 return spdk_bdev_module_claim_bdev(bdev, desc, module); 8368 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8369 rc = claim_verify_rwo(desc, type, &opts, module); 8370 break; 8371 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8372 rc = claim_verify_rom(desc, type, &opts, module); 8373 break; 8374 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8375 rc = claim_verify_rwm(desc, type, &opts, module); 8376 break; 8377 default: 8378 SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type); 8379 rc = -ENOTSUP; 8380 } 8381 8382 if (rc == 0) { 8383 rc = claim_bdev(desc, type, &opts, module); 8384 } 8385 8386 spdk_spin_unlock(&bdev->internal.spinlock); 8387 return rc; 8388 } 8389 8390 static void 8391 claim_reset(struct spdk_bdev *bdev) 8392 { 8393 assert(spdk_spin_held(&bdev->internal.spinlock)); 8394 assert(claim_type_is_v2(bdev->internal.claim_type)); 8395 assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims)); 8396 8397 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 8398 
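	/* internal.claim holds either a v1 or a v2 claim, so wipe it completely before
	 * the claim type is dropped back to SPDK_BDEV_CLAIM_NONE below.
	 */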
bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8399 } 8400 8401 static void 8402 bdev_desc_release_claims(struct spdk_bdev_desc *desc) 8403 { 8404 struct spdk_bdev *bdev = desc->bdev; 8405 8406 assert(spdk_spin_held(&bdev->internal.spinlock)); 8407 assert(claim_type_is_v2(bdev->internal.claim_type)); 8408 8409 if (bdev->internal.examine_in_progress == 0) { 8410 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link); 8411 free(desc->claim); 8412 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 8413 claim_reset(bdev); 8414 } 8415 } else { 8416 /* This is a dead claim that will be cleaned up when bdev_examine() is done. */ 8417 desc->claim->module = NULL; 8418 desc->claim->desc = NULL; 8419 } 8420 desc->claim = NULL; 8421 } 8422 8423 /* 8424 * End claims v2 8425 */ 8426 8427 struct spdk_bdev * 8428 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 8429 { 8430 assert(desc != NULL); 8431 return desc->bdev; 8432 } 8433 8434 int 8435 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 8436 { 8437 struct spdk_bdev *bdev, *tmp; 8438 struct spdk_bdev_desc *desc; 8439 int rc = 0; 8440 8441 assert(fn != NULL); 8442 8443 spdk_spin_lock(&g_bdev_mgr.spinlock); 8444 bdev = spdk_bdev_first(); 8445 while (bdev != NULL) { 8446 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8447 if (rc != 0) { 8448 break; 8449 } 8450 rc = bdev_open(bdev, false, desc); 8451 if (rc != 0) { 8452 bdev_desc_free(desc); 8453 if (rc == -ENODEV) { 8454 /* Ignore the error and move to the next bdev. */ 8455 rc = 0; 8456 bdev = spdk_bdev_next(bdev); 8457 continue; 8458 } 8459 break; 8460 } 8461 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8462 8463 rc = fn(ctx, bdev); 8464 8465 spdk_spin_lock(&g_bdev_mgr.spinlock); 8466 tmp = spdk_bdev_next(bdev); 8467 bdev_close(bdev, desc); 8468 if (rc != 0) { 8469 break; 8470 } 8471 bdev = tmp; 8472 } 8473 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8474 8475 return rc; 8476 } 8477 8478 int 8479 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 8480 { 8481 struct spdk_bdev *bdev, *tmp; 8482 struct spdk_bdev_desc *desc; 8483 int rc = 0; 8484 8485 assert(fn != NULL); 8486 8487 spdk_spin_lock(&g_bdev_mgr.spinlock); 8488 bdev = spdk_bdev_first_leaf(); 8489 while (bdev != NULL) { 8490 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 8491 if (rc != 0) { 8492 break; 8493 } 8494 rc = bdev_open(bdev, false, desc); 8495 if (rc != 0) { 8496 bdev_desc_free(desc); 8497 if (rc == -ENODEV) { 8498 /* Ignore the error and move to the next bdev. 
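 * -ENODEV just means this bdev is being unregistered; any other open failure
 * aborts the iteration and is returned to the caller.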
*/ 8499 rc = 0; 8500 bdev = spdk_bdev_next_leaf(bdev); 8501 continue; 8502 } 8503 break; 8504 } 8505 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8506 8507 rc = fn(ctx, bdev); 8508 8509 spdk_spin_lock(&g_bdev_mgr.spinlock); 8510 tmp = spdk_bdev_next_leaf(bdev); 8511 bdev_close(bdev, desc); 8512 if (rc != 0) { 8513 break; 8514 } 8515 bdev = tmp; 8516 } 8517 spdk_spin_unlock(&g_bdev_mgr.spinlock); 8518 8519 return rc; 8520 } 8521 8522 void 8523 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 8524 { 8525 struct iovec *iovs; 8526 int iovcnt; 8527 8528 if (bdev_io == NULL) { 8529 return; 8530 } 8531 8532 switch (bdev_io->type) { 8533 case SPDK_BDEV_IO_TYPE_READ: 8534 case SPDK_BDEV_IO_TYPE_WRITE: 8535 case SPDK_BDEV_IO_TYPE_ZCOPY: 8536 iovs = bdev_io->u.bdev.iovs; 8537 iovcnt = bdev_io->u.bdev.iovcnt; 8538 break; 8539 default: 8540 iovs = NULL; 8541 iovcnt = 0; 8542 break; 8543 } 8544 8545 if (iovp) { 8546 *iovp = iovs; 8547 } 8548 if (iovcntp) { 8549 *iovcntp = iovcnt; 8550 } 8551 } 8552 8553 void * 8554 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 8555 { 8556 if (bdev_io == NULL) { 8557 return NULL; 8558 } 8559 8560 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 8561 return NULL; 8562 } 8563 8564 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 8565 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 8566 return bdev_io->u.bdev.md_buf; 8567 } 8568 8569 return NULL; 8570 } 8571 8572 void * 8573 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 8574 { 8575 if (bdev_io == NULL) { 8576 assert(false); 8577 return NULL; 8578 } 8579 8580 return bdev_io->internal.caller_ctx; 8581 } 8582 8583 void 8584 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 8585 { 8586 8587 if (spdk_bdev_module_list_find(bdev_module->name)) { 8588 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 8589 assert(false); 8590 } 8591 8592 spdk_spin_init(&bdev_module->internal.spinlock); 8593 8594 /* 8595 * Modules with examine callbacks must be initialized first, so they are 8596 * ready to handle examine callbacks from later modules that will 8597 * register physical bdevs. 
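 * In practice these are typically virtual bdev modules (e.g. RAID, logical volume
 * or partition modules) that stack on top of bdevs registered by other modules.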
8598 */ 8599 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 8600 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8601 } else { 8602 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8603 } 8604 } 8605 8606 struct spdk_bdev_module * 8607 spdk_bdev_module_list_find(const char *name) 8608 { 8609 struct spdk_bdev_module *bdev_module; 8610 8611 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 8612 if (strcmp(name, bdev_module->name) == 0) { 8613 break; 8614 } 8615 } 8616 8617 return bdev_module; 8618 } 8619 8620 static int 8621 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 8622 { 8623 uint64_t num_blocks; 8624 void *md_buf = NULL; 8625 8626 num_blocks = bdev_io->u.bdev.num_blocks; 8627 8628 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 8629 md_buf = (char *)g_bdev_mgr.zero_buffer + 8630 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 8631 } 8632 8633 return bdev_write_blocks_with_md(bdev_io->internal.desc, 8634 spdk_io_channel_from_ctx(bdev_io->internal.ch), 8635 g_bdev_mgr.zero_buffer, md_buf, 8636 bdev_io->u.bdev.offset_blocks, num_blocks, 8637 bdev_write_zero_buffer_done, bdev_io); 8638 } 8639 8640 static void 8641 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 8642 { 8643 struct spdk_bdev_io *parent_io = cb_arg; 8644 8645 spdk_bdev_free_io(bdev_io); 8646 8647 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 8648 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 8649 } 8650 8651 static void 8652 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 8653 { 8654 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8655 ctx->bdev->internal.qos_mod_in_progress = false; 8656 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8657 8658 if (ctx->cb_fn) { 8659 ctx->cb_fn(ctx->cb_arg, status); 8660 } 8661 free(ctx); 8662 } 8663 8664 static void 8665 bdev_disable_qos_done(void *cb_arg) 8666 { 8667 struct set_qos_limit_ctx *ctx = cb_arg; 8668 struct spdk_bdev *bdev = ctx->bdev; 8669 struct spdk_bdev_io *bdev_io; 8670 struct spdk_bdev_qos *qos; 8671 8672 spdk_spin_lock(&bdev->internal.spinlock); 8673 qos = bdev->internal.qos; 8674 bdev->internal.qos = NULL; 8675 spdk_spin_unlock(&bdev->internal.spinlock); 8676 8677 while (!TAILQ_EMPTY(&qos->queued)) { 8678 /* Send queued I/O back to their original thread for resubmission. */ 8679 bdev_io = TAILQ_FIRST(&qos->queued); 8680 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 8681 8682 if (bdev_io->internal.io_submit_ch) { 8683 /* 8684 * Channel was changed when sending it to the QoS thread - change it back 8685 * before sending it back to the original thread. 
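 * internal.io_submit_ch still records the channel the I/O was originally submitted
 * on, while internal.ch currently points at the QoS channel.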
8686 */ 8687 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 8688 bdev_io->internal.io_submit_ch = NULL; 8689 } 8690 8691 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 8692 _bdev_io_submit, bdev_io); 8693 } 8694 8695 if (qos->thread != NULL) { 8696 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 8697 spdk_poller_unregister(&qos->poller); 8698 } 8699 8700 free(qos); 8701 8702 bdev_set_qos_limit_done(ctx, 0); 8703 } 8704 8705 static void 8706 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 8707 { 8708 struct set_qos_limit_ctx *ctx = _ctx; 8709 struct spdk_thread *thread; 8710 8711 spdk_spin_lock(&bdev->internal.spinlock); 8712 thread = bdev->internal.qos->thread; 8713 spdk_spin_unlock(&bdev->internal.spinlock); 8714 8715 if (thread != NULL) { 8716 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 8717 } else { 8718 bdev_disable_qos_done(ctx); 8719 } 8720 } 8721 8722 static void 8723 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8724 struct spdk_io_channel *ch, void *_ctx) 8725 { 8726 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8727 8728 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 8729 8730 spdk_bdev_for_each_channel_continue(i, 0); 8731 } 8732 8733 static void 8734 bdev_update_qos_rate_limit_msg(void *cb_arg) 8735 { 8736 struct set_qos_limit_ctx *ctx = cb_arg; 8737 struct spdk_bdev *bdev = ctx->bdev; 8738 8739 spdk_spin_lock(&bdev->internal.spinlock); 8740 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 8741 spdk_spin_unlock(&bdev->internal.spinlock); 8742 8743 bdev_set_qos_limit_done(ctx, 0); 8744 } 8745 8746 static void 8747 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8748 struct spdk_io_channel *ch, void *_ctx) 8749 { 8750 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8751 8752 spdk_spin_lock(&bdev->internal.spinlock); 8753 bdev_enable_qos(bdev, bdev_ch); 8754 spdk_spin_unlock(&bdev->internal.spinlock); 8755 spdk_bdev_for_each_channel_continue(i, 0); 8756 } 8757 8758 static void 8759 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 8760 { 8761 struct set_qos_limit_ctx *ctx = _ctx; 8762 8763 bdev_set_qos_limit_done(ctx, status); 8764 } 8765 8766 static void 8767 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 8768 { 8769 int i; 8770 8771 assert(bdev->internal.qos != NULL); 8772 8773 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8774 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 8775 bdev->internal.qos->rate_limits[i].limit = limits[i]; 8776 8777 if (limits[i] == 0) { 8778 bdev->internal.qos->rate_limits[i].limit = 8779 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 8780 } 8781 } 8782 } 8783 } 8784 8785 void 8786 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 8787 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 8788 { 8789 struct set_qos_limit_ctx *ctx; 8790 uint32_t limit_set_complement; 8791 uint64_t min_limit_per_sec; 8792 int i; 8793 bool disable_rate_limit = true; 8794 8795 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8796 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 8797 continue; 8798 } 8799 8800 if (limits[i] > 0) { 8801 disable_rate_limit = false; 8802 } 8803 8804 if (bdev_qos_is_iops_rate_limit(i) == true) { 8805 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 8806 } else { 8807 /* Change from megabyte to byte rate limit */ 8808 limits[i] = limits[i] * 1024 * 1024; 8809 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 
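		/* Byte-based limits are specified in MB/s and tracked internally in
		 * bytes/s; the converted value is rounded up below to a multiple of
		 * the minimum granularity.
		 */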
8810 } 8811 8812 limit_set_complement = limits[i] % min_limit_per_sec; 8813 if (limit_set_complement) { 8814 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 8815 limits[i], min_limit_per_sec); 8816 limits[i] += min_limit_per_sec - limit_set_complement; 8817 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 8818 } 8819 } 8820 8821 ctx = calloc(1, sizeof(*ctx)); 8822 if (ctx == NULL) { 8823 cb_fn(cb_arg, -ENOMEM); 8824 return; 8825 } 8826 8827 ctx->cb_fn = cb_fn; 8828 ctx->cb_arg = cb_arg; 8829 ctx->bdev = bdev; 8830 8831 spdk_spin_lock(&bdev->internal.spinlock); 8832 if (bdev->internal.qos_mod_in_progress) { 8833 spdk_spin_unlock(&bdev->internal.spinlock); 8834 free(ctx); 8835 cb_fn(cb_arg, -EAGAIN); 8836 return; 8837 } 8838 bdev->internal.qos_mod_in_progress = true; 8839 8840 if (disable_rate_limit == true && bdev->internal.qos) { 8841 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8842 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 8843 (bdev->internal.qos->rate_limits[i].limit > 0 && 8844 bdev->internal.qos->rate_limits[i].limit != 8845 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 8846 disable_rate_limit = false; 8847 break; 8848 } 8849 } 8850 } 8851 8852 if (disable_rate_limit == false) { 8853 if (bdev->internal.qos == NULL) { 8854 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 8855 if (!bdev->internal.qos) { 8856 spdk_spin_unlock(&bdev->internal.spinlock); 8857 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 8858 bdev_set_qos_limit_done(ctx, -ENOMEM); 8859 return; 8860 } 8861 } 8862 8863 if (bdev->internal.qos->thread == NULL) { 8864 /* Enabling */ 8865 bdev_set_qos_rate_limits(bdev, limits); 8866 8867 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 8868 bdev_enable_qos_done); 8869 } else { 8870 /* Updating */ 8871 bdev_set_qos_rate_limits(bdev, limits); 8872 8873 spdk_thread_send_msg(bdev->internal.qos->thread, 8874 bdev_update_qos_rate_limit_msg, ctx); 8875 } 8876 } else { 8877 if (bdev->internal.qos != NULL) { 8878 bdev_set_qos_rate_limits(bdev, limits); 8879 8880 /* Disabling */ 8881 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 8882 bdev_disable_qos_msg_done); 8883 } else { 8884 spdk_spin_unlock(&bdev->internal.spinlock); 8885 bdev_set_qos_limit_done(ctx, 0); 8886 return; 8887 } 8888 } 8889 8890 spdk_spin_unlock(&bdev->internal.spinlock); 8891 } 8892 8893 struct spdk_bdev_histogram_ctx { 8894 spdk_bdev_histogram_status_cb cb_fn; 8895 void *cb_arg; 8896 struct spdk_bdev *bdev; 8897 int status; 8898 }; 8899 8900 static void 8901 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8902 { 8903 struct spdk_bdev_histogram_ctx *ctx = _ctx; 8904 8905 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8906 ctx->bdev->internal.histogram_in_progress = false; 8907 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8908 ctx->cb_fn(ctx->cb_arg, ctx->status); 8909 free(ctx); 8910 } 8911 8912 static void 8913 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8914 struct spdk_io_channel *_ch, void *_ctx) 8915 { 8916 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8917 8918 if (ch->histogram != NULL) { 8919 spdk_histogram_data_free(ch->histogram); 8920 ch->histogram = NULL; 8921 } 8922 spdk_bdev_for_each_channel_continue(i, 0); 8923 } 8924 8925 static void 8926 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8927 { 8928 struct spdk_bdev_histogram_ctx *ctx = _ctx; 8929 8930 if (status 
!= 0) { 8931 ctx->status = status; 8932 ctx->bdev->internal.histogram_enabled = false; 8933 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 8934 bdev_histogram_disable_channel_cb); 8935 } else { 8936 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8937 ctx->bdev->internal.histogram_in_progress = false; 8938 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8939 ctx->cb_fn(ctx->cb_arg, ctx->status); 8940 free(ctx); 8941 } 8942 } 8943 8944 static void 8945 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8946 struct spdk_io_channel *_ch, void *_ctx) 8947 { 8948 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8949 int status = 0; 8950 8951 if (ch->histogram == NULL) { 8952 ch->histogram = spdk_histogram_data_alloc(); 8953 if (ch->histogram == NULL) { 8954 status = -ENOMEM; 8955 } 8956 } 8957 8958 spdk_bdev_for_each_channel_continue(i, status); 8959 } 8960 8961 void 8962 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 8963 void *cb_arg, bool enable) 8964 { 8965 struct spdk_bdev_histogram_ctx *ctx; 8966 8967 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 8968 if (ctx == NULL) { 8969 cb_fn(cb_arg, -ENOMEM); 8970 return; 8971 } 8972 8973 ctx->bdev = bdev; 8974 ctx->status = 0; 8975 ctx->cb_fn = cb_fn; 8976 ctx->cb_arg = cb_arg; 8977 8978 spdk_spin_lock(&bdev->internal.spinlock); 8979 if (bdev->internal.histogram_in_progress) { 8980 spdk_spin_unlock(&bdev->internal.spinlock); 8981 free(ctx); 8982 cb_fn(cb_arg, -EAGAIN); 8983 return; 8984 } 8985 8986 bdev->internal.histogram_in_progress = true; 8987 spdk_spin_unlock(&bdev->internal.spinlock); 8988 8989 bdev->internal.histogram_enabled = enable; 8990 8991 if (enable) { 8992 /* Allocate histogram for each channel */ 8993 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 8994 bdev_histogram_enable_channel_cb); 8995 } else { 8996 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 8997 bdev_histogram_disable_channel_cb); 8998 } 8999 } 9000 9001 struct spdk_bdev_histogram_data_ctx { 9002 spdk_bdev_histogram_data_cb cb_fn; 9003 void *cb_arg; 9004 struct spdk_bdev *bdev; 9005 /** merged histogram data from all channels */ 9006 struct spdk_histogram_data *histogram; 9007 }; 9008 9009 static void 9010 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9011 { 9012 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9013 9014 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 9015 free(ctx); 9016 } 9017 9018 static void 9019 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9020 struct spdk_io_channel *_ch, void *_ctx) 9021 { 9022 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9023 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9024 int status = 0; 9025 9026 if (ch->histogram == NULL) { 9027 status = -EFAULT; 9028 } else { 9029 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 9030 } 9031 9032 spdk_bdev_for_each_channel_continue(i, status); 9033 } 9034 9035 void 9036 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 9037 spdk_bdev_histogram_data_cb cb_fn, 9038 void *cb_arg) 9039 { 9040 struct spdk_bdev_histogram_data_ctx *ctx; 9041 9042 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 9043 if (ctx == NULL) { 9044 cb_fn(cb_arg, -ENOMEM, NULL); 9045 return; 9046 } 9047 9048 ctx->bdev = bdev; 9049 ctx->cb_fn = cb_fn; 9050 ctx->cb_arg = cb_arg; 9051 9052 ctx->histogram = histogram; 
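	/* The caller-supplied histogram accumulates the merged data from every channel.
	 * An illustrative sketch of a typical call sequence (my_histogram_cb is
	 * hypothetical):
	 *
	 *	struct spdk_histogram_data *h = spdk_histogram_data_alloc();
	 *
	 *	spdk_bdev_histogram_get(bdev, h, my_histogram_cb, cb_arg);
	 *	... my_histogram_cb() runs once all channels have been merged;
	 *	    free h with spdk_histogram_data_free() when done ...
	 */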
9053 9054 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 9055 bdev_histogram_get_channel_cb); 9056 } 9057 9058 void 9059 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 9060 void *cb_arg) 9061 { 9062 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9063 int status = 0; 9064 9065 assert(cb_fn != NULL); 9066 9067 if (bdev_ch->histogram == NULL) { 9068 status = -EFAULT; 9069 } 9070 cb_fn(cb_arg, status, bdev_ch->histogram); 9071 } 9072 9073 size_t 9074 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 9075 size_t max_events) 9076 { 9077 struct media_event_entry *entry; 9078 size_t num_events = 0; 9079 9080 for (; num_events < max_events; ++num_events) { 9081 entry = TAILQ_FIRST(&desc->pending_media_events); 9082 if (entry == NULL) { 9083 break; 9084 } 9085 9086 events[num_events] = entry->event; 9087 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 9088 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 9089 } 9090 9091 return num_events; 9092 } 9093 9094 int 9095 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 9096 size_t num_events) 9097 { 9098 struct spdk_bdev_desc *desc; 9099 struct media_event_entry *entry; 9100 size_t event_id; 9101 int rc = 0; 9102 9103 assert(bdev->media_events); 9104 9105 spdk_spin_lock(&bdev->internal.spinlock); 9106 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9107 if (desc->write) { 9108 break; 9109 } 9110 } 9111 9112 if (desc == NULL || desc->media_events_buffer == NULL) { 9113 rc = -ENODEV; 9114 goto out; 9115 } 9116 9117 for (event_id = 0; event_id < num_events; ++event_id) { 9118 entry = TAILQ_FIRST(&desc->free_media_events); 9119 if (entry == NULL) { 9120 break; 9121 } 9122 9123 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 9124 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 9125 entry->event = events[event_id]; 9126 } 9127 9128 rc = event_id; 9129 out: 9130 spdk_spin_unlock(&bdev->internal.spinlock); 9131 return rc; 9132 } 9133 9134 static void 9135 _media_management_notify(void *arg) 9136 { 9137 struct spdk_bdev_desc *desc = arg; 9138 9139 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 9140 } 9141 9142 void 9143 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 9144 { 9145 struct spdk_bdev_desc *desc; 9146 9147 spdk_spin_lock(&bdev->internal.spinlock); 9148 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9149 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 9150 event_notify(desc, _media_management_notify); 9151 } 9152 } 9153 spdk_spin_unlock(&bdev->internal.spinlock); 9154 } 9155 9156 struct locked_lba_range_ctx { 9157 struct lba_range range; 9158 struct spdk_bdev *bdev; 9159 struct lba_range *current_range; 9160 struct lba_range *owner_range; 9161 struct spdk_poller *poller; 9162 lock_range_cb cb_fn; 9163 void *cb_arg; 9164 }; 9165 9166 static void 9167 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9168 { 9169 struct locked_lba_range_ctx *ctx = _ctx; 9170 9171 ctx->cb_fn(ctx->cb_arg, -ENOMEM); 9172 free(ctx); 9173 } 9174 9175 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 9176 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 9177 9178 static void 9179 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9180 { 9181 struct locked_lba_range_ctx *ctx = _ctx; 9182 9183 if (status == -ENOMEM) { 9184 /* One of the channels could 
not allocate a range object. 9185 * So we have to go back and clean up any ranges that were 9186 * allocated successfully before we return error status to 9187 * the caller. We can reuse the unlock function to do that 9188 * clean up. 9189 */ 9190 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9191 bdev_lock_error_cleanup_cb); 9192 return; 9193 } 9194 9195 /* All channels have locked this range and no I/O overlapping the range 9196 * are outstanding! Set the owner_ch for the range object for the 9197 * locking channel, so that this channel will know that it is allowed 9198 * to write to this range. 9199 */ 9200 ctx->owner_range->owner_ch = ctx->range.owner_ch; 9201 ctx->cb_fn(ctx->cb_arg, status); 9202 9203 /* Don't free the ctx here. Its range is in the bdev's global list of 9204 * locked ranges still, and will be removed and freed when this range 9205 * is later unlocked. 9206 */ 9207 } 9208 9209 static int 9210 bdev_lock_lba_range_check_io(void *_i) 9211 { 9212 struct spdk_bdev_channel_iter *i = _i; 9213 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 9214 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9215 struct locked_lba_range_ctx *ctx = i->ctx; 9216 struct lba_range *range = ctx->current_range; 9217 struct spdk_bdev_io *bdev_io; 9218 9219 spdk_poller_unregister(&ctx->poller); 9220 9221 /* The range is now in the locked_ranges, so no new IO can be submitted to this 9222 * range. But we need to wait until any outstanding IO overlapping with this range 9223 * are completed. 9224 */ 9225 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 9226 if (bdev_io_range_is_locked(bdev_io, range)) { 9227 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 9228 return SPDK_POLLER_BUSY; 9229 } 9230 } 9231 9232 spdk_bdev_for_each_channel_continue(i, 0); 9233 return SPDK_POLLER_BUSY; 9234 } 9235 9236 static void 9237 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9238 struct spdk_io_channel *_ch, void *_ctx) 9239 { 9240 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9241 struct locked_lba_range_ctx *ctx = _ctx; 9242 struct lba_range *range; 9243 9244 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9245 if (range->length == ctx->range.length && 9246 range->offset == ctx->range.offset && 9247 range->locked_ctx == ctx->range.locked_ctx) { 9248 /* This range already exists on this channel, so don't add 9249 * it again. This can happen when a new channel is created 9250 * while the for_each_channel operation is in progress. 9251 * Do not check for outstanding I/O in that case, since the 9252 * range was locked before any I/O could be submitted to the 9253 * new channel. 9254 */ 9255 spdk_bdev_for_each_channel_continue(i, 0); 9256 return; 9257 } 9258 } 9259 9260 range = calloc(1, sizeof(*range)); 9261 if (range == NULL) { 9262 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 9263 return; 9264 } 9265 9266 range->length = ctx->range.length; 9267 range->offset = ctx->range.offset; 9268 range->locked_ctx = ctx->range.locked_ctx; 9269 ctx->current_range = range; 9270 if (ctx->range.owner_ch == ch) { 9271 /* This is the range object for the channel that will hold 9272 * the lock. Store it in the ctx object so that we can easily 9273 * set its owner_ch after the lock is finally acquired. 
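 * The copies created for the other channels keep owner_ch == NULL, so only the
 * locking channel is treated as allowed to write to the locked range.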

static int
bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		    uint64_t offset, uint64_t length,
		    lock_range_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
	struct locked_lba_range_ctx *ctx;

	if (cb_arg == NULL) {
		SPDK_ERRLOG("cb_arg must not be NULL\n");
		return -EINVAL;
	}

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}

	ctx->range.offset = offset;
	ctx->range.length = length;
	ctx->range.owner_ch = ch;
	ctx->range.locked_ctx = cb_arg;
	ctx->bdev = bdev;
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	spdk_spin_lock(&bdev->internal.spinlock);
	if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) {
		/* There is an active lock overlapping with this range.
		 * Put it on the pending list until this range no
		 * longer overlaps with another.
		 */
		TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq);
	} else {
		TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq);
		bdev_lock_lba_range_ctx(bdev, ctx);
	}
	spdk_spin_unlock(&bdev->internal.spinlock);
	return 0;
}
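
/*
 * Illustrative sketch (not part of the library): how code in this file is
 * expected to pair bdev_lock_lba_range() with bdev_unlock_lba_range(), which
 * is defined further below.  The callbacks and the my_* names are hypothetical.
 *
 *	static void
 *	my_unlock_done(void *ctx, int status)
 *	{
 *		// the range is unlocked; queued overlapping I/O has been resubmitted
 *	}
 *
 *	static void
 *	my_lock_done(void *ctx, int status)
 *	{
 *		struct my_op *op = ctx;
 *
 *		if (status != 0) {
 *			// e.g. -ENOMEM: the lock was rolled back, fail the operation
 *			return;
 *		}
 *		// Exclusive access to [offset, offset + length) from op->ch is now
 *		// guaranteed; do the protected work, then release the lock:
 *		bdev_unlock_lba_range(op->desc, op->ch, op->offset, op->length,
 *				      my_unlock_done, op);
 *	}
 *
 *	// cb_arg must be non-NULL and must be the same pointer on lock and
 *	// unlock, because it doubles as locked_ctx when matching the range.
 *	rc = bdev_lock_lba_range(op->desc, op->ch, op->offset, op->length,
 *				 my_lock_done, op);
 */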

static void
bdev_lock_lba_range_ctx_msg(void *_ctx)
{
	struct locked_lba_range_ctx *ctx = _ctx;

	bdev_lock_lba_range_ctx(ctx->bdev, ctx);
}

static void
bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct locked_lba_range_ctx *ctx = _ctx;
	struct locked_lba_range_ctx *pending_ctx;
	struct lba_range *range, *tmp;

	spdk_spin_lock(&bdev->internal.spinlock);
	/* Check if there are any pending locked ranges that overlap with this range
	 * that was just unlocked.  If there are, check that each one does not overlap
	 * with any other locked range before calling bdev_lock_lba_range_ctx, which
	 * will start the lock process.
	 */
	TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) {
		if (bdev_lba_range_overlapped(range, &ctx->range) &&
		    !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) {
			TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq);
			pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
			TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq);
			spdk_thread_send_msg(spdk_io_channel_get_thread(pending_ctx->range.owner_ch->channel),
					     bdev_lock_lba_range_ctx_msg, pending_ctx);
		}
	}
	spdk_spin_unlock(&bdev->internal.spinlock);

	ctx->cb_fn(ctx->cb_arg, status);
	free(ctx);
}

static void
bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				  struct spdk_io_channel *_ch, void *_ctx)
{
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
	struct locked_lba_range_ctx *ctx = _ctx;
	TAILQ_HEAD(, spdk_bdev_io) io_locked;
	struct spdk_bdev_io *bdev_io;
	struct lba_range *range;

	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (ctx->range.offset == range->offset &&
		    ctx->range.length == range->length &&
		    ctx->range.locked_ctx == range->locked_ctx) {
			TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
			free(range);
			break;
		}
	}

	/* Note: we should almost always be able to assert that the range specified
	 * was found.  But there are some very rare corner cases where a new channel
	 * gets created simultaneously with a range unlock, where this function
	 * would execute on that new channel and wouldn't have the range.
	 * We also use this to clean up range allocations when a later allocation
	 * fails in the locking path.
	 * So we can't actually assert() here.
	 */

	/* Swap the locked IO into a temporary list, and then try to submit them again.
	 * We could hyper-optimize this to only resubmit locked I/O that overlaps
	 * with the range that was just unlocked, but this isn't a performance path so
	 * we go for simplicity here.
	 */
	TAILQ_INIT(&io_locked);
	TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link);
	while (!TAILQ_EMPTY(&io_locked)) {
		bdev_io = TAILQ_FIRST(&io_locked);
		TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link);
		bdev_io_submit(bdev_io);
	}

	spdk_bdev_for_each_channel_continue(i, 0);
}

static int
bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		      uint64_t offset, uint64_t length,
		      lock_range_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
	struct locked_lba_range_ctx *ctx;
	struct lba_range *range;
	bool range_found = false;

	/* Let's make sure the specified channel actually has a lock on
	 * the specified range.  Note that the range must match exactly.
	 */
	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    range->owner_ch == ch && range->locked_ctx == cb_arg) {
			range_found = true;
			break;
		}
	}

	if (!range_found) {
		return -EINVAL;
	}

	spdk_spin_lock(&bdev->internal.spinlock);
	/* We confirmed that this channel has locked the specified range.  To
	 * start the unlock process, we find the range in the bdev's locked_ranges
	 * and remove it.  This ensures new channels don't inherit the locked range.
	 * Then we will send a message to each channel (including the one specified
	 * here) to remove the range from its per-channel list.
	 */
	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    range->locked_ctx == cb_arg) {
			break;
		}
	}
	if (range == NULL) {
		assert(false);
		spdk_spin_unlock(&bdev->internal.spinlock);
		return -EINVAL;
	}
	TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq);
	ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
	spdk_spin_unlock(&bdev->internal.spinlock);

	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
				   bdev_unlock_lba_range_cb);
	return 0;
}

int
spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains,
			     int array_size)
{
	if (!bdev) {
		return -EINVAL;
	}

	if (bdev->fn_table->get_memory_domains) {
		return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size);
	}

	return 0;
}
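
/*
 * Illustrative sketch (not part of the library): querying which memory domains
 * a bdev uses, e.g. to decide whether plain malloc()'d buffers are acceptable.
 * Assumes the usual convention that calling with (NULL, 0) returns the number
 * of domains the bdev reports.  The my_* names are hypothetical.
 *
 *	int my_cnt, my_rc;
 *	struct spdk_memory_domain **my_domains;
 *
 *	my_cnt = spdk_bdev_get_memory_domains(bdev, NULL, 0);
 *	if (my_cnt <= 0) {
 *		// <= 0: no memory domains (or error) - regular buffers are fine
 *		return my_cnt;
 *	}
 *
 *	my_domains = calloc(my_cnt, sizeof(*my_domains));
 *	if (my_domains == NULL) {
 *		return -ENOMEM;
 *	}
 *	my_rc = spdk_bdev_get_memory_domains(bdev, my_domains, my_cnt);
 *	// my_rc entries of my_domains are now valid; buffers should be allocated
 *	// or translated through one of these domains, see spdk/dma.h.
 *	free(my_domains);
 */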

struct spdk_bdev_for_each_io_ctx {
	void *ctx;
	spdk_bdev_io_fn fn;
	spdk_bdev_for_each_io_cb cb;
};

static void
bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
			 struct spdk_io_channel *io_ch, void *_ctx)
{
	struct spdk_bdev_for_each_io_ctx *ctx = _ctx;
	struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch);
	struct spdk_bdev_io *bdev_io;
	int rc = 0;

	TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
		rc = ctx->fn(ctx->ctx, bdev_io);
		if (rc != 0) {
			break;
		}
	}

	spdk_bdev_for_each_channel_continue(i, rc);
}

static void
bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct spdk_bdev_for_each_io_ctx *ctx = _ctx;

	ctx->cb(ctx->ctx, status);

	free(ctx);
}

void
spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn,
			   spdk_bdev_for_each_io_cb cb)
{
	struct spdk_bdev_for_each_io_ctx *ctx;

	assert(fn != NULL && cb != NULL);

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		SPDK_ERRLOG("Failed to allocate context.\n");
		cb(_ctx, -ENOMEM);
		return;
	}

	ctx->ctx = _ctx;
	ctx->fn = fn;
	ctx->cb = cb;

	spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx,
				   bdev_for_each_io_done);
}
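
/*
 * Illustrative sketch (not part of the library): counting every I/O currently
 * submitted to a bdev with spdk_bdev_for_each_bdev_io().  The per-I/O callback
 * runs on each channel's thread; channels are visited one at a time, so the
 * shared counter needs no locking.  Returning non-zero from the per-I/O
 * callback stops the iteration and is propagated to the completion callback.
 * The my_* names are hypothetical.
 *
 *	static int
 *	my_count_io(void *ctx, struct spdk_bdev_io *bdev_io)
 *	{
 *		uint32_t *my_count = ctx;
 *
 *		(*my_count)++;
 *		return 0;	// keep iterating
 *	}
 *
 *	static void
 *	my_count_done(void *ctx, int rc)
 *	{
 *		uint32_t *my_count = ctx;
 *
 *		SPDK_NOTICELOG("bdev has %u submitted I/Os (rc=%d)\n", *my_count, rc);
 *	}
 *
 *	spdk_bdev_for_each_bdev_io(bdev, &my_count, my_count_io, my_count_done);
 */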

void
spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status)
{
	spdk_for_each_channel_continue(iter->i, status);
}

static struct spdk_bdev *
io_channel_iter_get_bdev(struct spdk_io_channel_iter *i)
{
	void *io_device = spdk_io_channel_iter_get_io_device(i);

	return __bdev_from_io_dev(io_device);
}

static void
bdev_each_channel_msg(struct spdk_io_channel_iter *i)
{
	struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);
	struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i);

	iter->i = i;
	iter->fn(iter, bdev, ch, iter->ctx);
}

static void
bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bdev *bdev = io_channel_iter_get_bdev(i);

	iter->i = i;
	iter->cpl(bdev, iter->ctx, status);

	free(iter);
}

void
spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn,
			   void *ctx, spdk_bdev_for_each_channel_done cpl)
{
	struct spdk_bdev_channel_iter *iter;

	assert(bdev != NULL && fn != NULL && ctx != NULL);

	iter = calloc(1, sizeof(struct spdk_bdev_channel_iter));
	if (iter == NULL) {
		SPDK_ERRLOG("Unable to allocate iterator\n");
		assert(false);
		return;
	}

	iter->fn = fn;
	iter->cpl = cpl;
	iter->ctx = ctx;

	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg,
			      iter, bdev_each_channel_cpl);
}
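
/*
 * Illustrative sketch (not part of the library): visiting every channel of a
 * bdev with spdk_bdev_for_each_channel().  Each visit callback runs on the
 * thread that owns the channel; channels are visited one at a time, and the
 * done callback runs after the last channel calls
 * spdk_bdev_for_each_channel_continue() (a non-zero status ends the iteration
 * early).  Note that ctx must not be NULL.  The my_* names are hypothetical.
 *
 *	static void
 *	my_channel_visit(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
 *			 struct spdk_io_channel *ch, void *ctx)
 *	{
 *		uint32_t *my_nchannels = ctx;
 *
 *		(*my_nchannels)++;
 *		// a real user would inspect or update its per-channel state here
 *		spdk_bdev_for_each_channel_continue(i, 0);
 *	}
 *
 *	static void
 *	my_channel_visit_done(struct spdk_bdev *bdev, void *ctx, int status)
 *	{
 *		uint32_t *my_nchannels = ctx;
 *
 *		SPDK_NOTICELOG("%s: visited %u channels, status %d\n",
 *			       spdk_bdev_get_name(bdev), *my_nchannels, status);
 *	}
 *
 *	spdk_bdev_for_each_channel(bdev, my_channel_visit, &my_nchannels,
 *				   my_channel_visit_done);
 */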

static void
bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_bdev_io *parent_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	/* Check return status of write */
	parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
	parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx);
}

static void
bdev_copy_do_write(void *_bdev_io)
{
	struct spdk_bdev_io *bdev_io = _bdev_io;
	int rc;

	/* Write blocks */
	rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc,
					    spdk_io_channel_from_ctx(bdev_io->internal.ch),
					    bdev_io->u.bdev.iovs[0].iov_base,
					    bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks,
					    bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io);

	if (rc == -ENOMEM) {
		bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write);
	} else if (rc != 0) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
	}
}

static void
bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
{
	struct spdk_bdev_io *parent_io = cb_arg;

	spdk_bdev_free_io(bdev_io);

	/* Check return status of read */
	if (!success) {
		parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx);
		return;
	}

	/* Do write */
	bdev_copy_do_write(parent_io);
}

static void
bdev_copy_do_read(void *_bdev_io)
{
	struct spdk_bdev_io *bdev_io = _bdev_io;
	int rc;

	/* Read blocks */
	rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc,
					   spdk_io_channel_from_ctx(bdev_io->internal.ch),
					   bdev_io->u.bdev.iovs[0].iov_base,
					   bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks,
					   bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io);

	if (rc == -ENOMEM) {
		bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read);
	} else if (rc != 0) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
	}
}

static void
bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success)
{
	if (!success) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
		return;
	}

	bdev_copy_do_read(bdev_io);
}

int
spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		      uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks,
		      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (num_blocks == 0) {
		SPDK_ERRLOG("Can't copy 0 blocks\n");
		return -EINVAL;
	}

	if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) ||
	    !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) {
		SPDK_DEBUGLOG(bdev,
			      "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n",
			      dst_offset_blocks, src_offset_blocks, num_blocks);
		return -EINVAL;
	}

	bdev_io = bdev_channel_get_io(channel);
	if (!bdev_io) {
		return -ENOMEM;
	}

	bdev_io->internal.ch = channel;
	bdev_io->internal.desc = desc;
	bdev_io->type = SPDK_BDEV_IO_TYPE_COPY;

	bdev_io->u.bdev.offset_blocks = dst_offset_blocks;
	bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.memory_domain = NULL;
	bdev_io->u.bdev.memory_domain_ctx = NULL;
	bdev_io->u.bdev.iovs = NULL;
	bdev_io->u.bdev.iovcnt = 0;
	bdev_io->u.bdev.md_buf = NULL;
	bdev_io->u.bdev.accel_sequence = NULL;
	bdev_io_init(bdev_io, bdev, cb_arg, cb);

	if (dst_offset_blocks == src_offset_blocks) {
		bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;
		bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx);

		return 0;
	}

	/* If the copy size is large and should be split, use the generic split logic
	 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not.
	 *
	 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or
	 * emulate it using regular read and write requests otherwise.
	 */
	if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) ||
	    bdev_io->internal.split) {
		bdev_io_submit(bdev_io);
		return 0;
	}

	spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev));

	return 0;
}
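
/*
 * Illustrative sketch (not part of the library): offloading a block copy with
 * spdk_bdev_copy_blocks().  The descriptor must have been opened for writing.
 * If the bdev does not support SPDK_BDEV_IO_TYPE_COPY, the request is emulated
 * above with a read into a bounce buffer followed by a write.  The my_* names
 * are hypothetical.
 *
 *	static void
 *	my_copy_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
 *	{
 *		spdk_bdev_free_io(bdev_io);
 *		SPDK_NOTICELOG("copy %s\n", success ? "succeeded" : "failed");
 *	}
 *
 *	// Copy 256 blocks from block 0 to block 4096.
 *	int my_rc = spdk_bdev_copy_blocks(desc, io_ch, 4096, 0, 256,
 *					  my_copy_done, NULL);
 *	if (my_rc == -ENOMEM) {
 *		// no spdk_bdev_io available right now; retry later, e.g. via
 *		// spdk_bdev_queue_io_wait()
 *	} else if (my_rc != 0) {
 *		// invalid parameters or read-only descriptor
 *	}
 */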

SPDK_LOG_REGISTER_COMPONENT(bdev)

SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
{
	struct spdk_trace_tpoint_opts opts[] = {
		{
			"BDEV_IO_START", TRACE_BDEV_IO_START,
			OWNER_BDEV, OBJECT_BDEV_IO, 1,
			{
				{ "type", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
				{ "offset", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "len", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40}
			}
		},
		{
			"BDEV_IO_DONE", TRACE_BDEV_IO_DONE,
			OWNER_BDEV, OBJECT_BDEV_IO, 0,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		},
		{
			"BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE,
			OWNER_BDEV, OBJECT_NONE, 1,
			{
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8}
			}
		},
		{
			"BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY,
			OWNER_BDEV, OBJECT_NONE, 0,
			{
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8}
			}
		},
	};

	spdk_trace_register_owner(OWNER_BDEV, 'b');
	spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0);
}
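
/*
 * Illustrative note (not part of the library): SPDK_LOG_REGISTER_COMPONENT(bdev)
 * above makes the SPDK_DEBUGLOG(bdev, ...) calls in this file selectable at
 * runtime in debug builds.  An application could enable them with something
 * like the following (assuming a debug build of SPDK):
 *
 *	spdk_log_set_flag("bdev");			// enable the "bdev" log component
 *	spdk_log_set_print_level(SPDK_LOG_DEBUG);	// print DEBUG messages to the console
 *
 * The tracepoints registered by SPDK_TRACE_REGISTER_FN() become visible to the
 * tracing tools once the application is started with tracing enabled; see
 * SPDK's tracing documentation for the exact options.
 */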