1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #include "spdk/bdev.h" 10 11 #include "spdk/accel.h" 12 #include "spdk/config.h" 13 #include "spdk/env.h" 14 #include "spdk/thread.h" 15 #include "spdk/likely.h" 16 #include "spdk/queue.h" 17 #include "spdk/nvme_spec.h" 18 #include "spdk/scsi_spec.h" 19 #include "spdk/notify.h" 20 #include "spdk/util.h" 21 #include "spdk/trace.h" 22 #include "spdk/dma.h" 23 24 #include "spdk/bdev_module.h" 25 #include "spdk/log.h" 26 #include "spdk/string.h" 27 28 #include "bdev_internal.h" 29 #include "spdk_internal/trace_defs.h" 30 #include "spdk_internal/assert.h" 31 32 #ifdef SPDK_CONFIG_VTUNE 33 #include "ittnotify.h" 34 #include "ittnotify_types.h" 35 int __itt_init_ittlib(const char *, __itt_group_id); 36 #endif 37 38 #define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1) 39 #define SPDK_BDEV_IO_CACHE_SIZE 256 40 #define SPDK_BDEV_AUTO_EXAMINE true 41 #define BUF_SMALL_POOL_SIZE 8191 42 #define BUF_LARGE_POOL_SIZE 1023 43 #define BUF_SMALL_CACHE_SIZE 128 44 #define BUF_LARGE_CACHE_SIZE 16 45 #define NOMEM_THRESHOLD_COUNT 8 46 47 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 48 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 49 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512 50 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000 51 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024) 52 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX 53 #define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000 54 55 /* The maximum number of children requests for a UNMAP or WRITE ZEROES command 56 * when splitting into children requests at a time. 
57 */ 58 #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8) 59 #define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000 60 61 /* The maximum number of children requests for a COPY command 62 * when splitting into children requests at a time. 63 */ 64 #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8) 65 66 #define LOG_ALREADY_CLAIMED_ERROR(detail, bdev) \ 67 log_already_claimed(SPDK_LOG_ERROR, __LINE__, __func__, detail, bdev) 68 #ifdef DEBUG 69 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) \ 70 log_already_claimed(SPDK_LOG_DEBUG, __LINE__, __func__, detail, bdev) 71 #else 72 #define LOG_ALREADY_CLAIMED_DEBUG(detail, bdev) do {} while(0) 73 #endif 74 75 static void log_already_claimed(enum spdk_log_level level, const int line, const char *func, 76 const char *detail, struct spdk_bdev *bdev); 77 78 SPDK_LOG_DEPRECATION_REGISTER(vtune_support, "Intel(R) VTune integration", "SPDK 23.05", 0); 79 80 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 81 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 82 }; 83 84 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 85 86 RB_HEAD(bdev_name_tree, spdk_bdev_name); 87 88 static int 89 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2) 90 { 91 return strcmp(name1->name, name2->name); 92 } 93 94 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp); 95 96 struct spdk_bdev_mgr { 97 struct spdk_mempool *bdev_io_pool; 98 99 void *zero_buffer; 100 101 TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 102 103 struct spdk_bdev_list bdevs; 104 struct bdev_name_tree bdev_names; 105 106 bool init_complete; 107 bool module_init_complete; 108 109 struct spdk_spinlock spinlock; 110 111 #ifdef SPDK_CONFIG_VTUNE 112 __itt_domain *domain; 113 #endif 114 }; 115 116 static struct spdk_bdev_mgr g_bdev_mgr = { 117 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 118 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 119 .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names), 120 
.init_complete = false, 121 .module_init_complete = false, 122 }; 123 124 static void 125 __attribute__((constructor)) 126 _bdev_init(void) 127 { 128 spdk_spin_init(&g_bdev_mgr.spinlock); 129 } 130 131 typedef void (*lock_range_cb)(struct lba_range *range, void *ctx, int status); 132 133 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc); 134 135 struct lba_range { 136 struct spdk_bdev *bdev; 137 uint64_t offset; 138 uint64_t length; 139 void *locked_ctx; 140 struct spdk_thread *owner_thread; 141 struct spdk_bdev_channel *owner_ch; 142 TAILQ_ENTRY(lba_range) tailq; 143 TAILQ_ENTRY(lba_range) tailq_module; 144 }; 145 146 static struct spdk_bdev_opts g_bdev_opts = { 147 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 148 .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, 149 .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, 150 .small_buf_pool_size = BUF_SMALL_POOL_SIZE, 151 .large_buf_pool_size = BUF_LARGE_POOL_SIZE, 152 }; 153 154 static spdk_bdev_init_cb g_init_cb_fn = NULL; 155 static void *g_init_cb_arg = NULL; 156 157 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 158 static void *g_fini_cb_arg = NULL; 159 static struct spdk_thread *g_fini_thread = NULL; 160 161 struct spdk_bdev_qos_limit { 162 /** IOs or bytes allowed per second (i.e., 1s). */ 163 uint64_t limit; 164 165 /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). 166 * For remaining bytes, allowed to run negative if an I/O is submitted when 167 * some bytes are remaining, but the I/O is bigger than that amount. The 168 * excess will be deducted from the next timeslice. 169 */ 170 int64_t remaining_this_timeslice; 171 172 /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 173 uint32_t min_per_timeslice; 174 175 /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 176 uint32_t max_per_timeslice; 177 178 /** Function to check whether to queue the IO. 
*/ 179 bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 180 181 /** Function to update for the submitted IO. */ 182 void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 183 }; 184 185 struct spdk_bdev_qos { 186 /** Types of structure of rate limits. */ 187 struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 188 189 /** The channel that all I/O are funneled through. */ 190 struct spdk_bdev_channel *ch; 191 192 /** The thread on which the poller is running. */ 193 struct spdk_thread *thread; 194 195 /** Queue of I/O waiting to be issued. */ 196 bdev_io_tailq_t queued; 197 198 /** Size of a timeslice in tsc ticks. */ 199 uint64_t timeslice_size; 200 201 /** Timestamp of start of last timeslice. */ 202 uint64_t last_timeslice; 203 204 /** Poller that processes queued I/O commands each time slice. */ 205 struct spdk_poller *poller; 206 }; 207 208 struct spdk_bdev_mgmt_channel { 209 /* 210 * Each thread keeps a cache of bdev_io - this allows 211 * bdev threads which are *not* DPDK threads to still 212 * benefit from a per-thread bdev_io cache. Without 213 * this, non-DPDK threads fetching from the mempool 214 * incur a cmpxchg on get and put. 215 */ 216 bdev_io_stailq_t per_thread_cache; 217 uint32_t per_thread_cache_count; 218 uint32_t bdev_io_cache_size; 219 220 struct spdk_iobuf_channel iobuf; 221 222 TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; 223 TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; 224 }; 225 226 /* 227 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device 228 * will queue here their IO that awaits retry. It makes it possible to retry sending 229 * IO to one bdev after IO from other bdev completes. 230 */ 231 struct spdk_bdev_shared_resource { 232 /* The bdev management channel */ 233 struct spdk_bdev_mgmt_channel *mgmt_ch; 234 235 /* 236 * Count of I/O submitted to bdev module and waiting for completion. 
237 * Incremented before submit_request() is called on an spdk_bdev_io. 238 */ 239 uint64_t io_outstanding; 240 241 /* 242 * Queue of IO awaiting retry because of a previous NOMEM status returned 243 * on this channel. 244 */ 245 bdev_io_tailq_t nomem_io; 246 247 /* 248 * Threshold which io_outstanding must drop to before retrying nomem_io. 249 */ 250 uint64_t nomem_threshold; 251 252 /* I/O channel allocated by a bdev module */ 253 struct spdk_io_channel *shared_ch; 254 255 /* Refcount of bdev channels using this resource */ 256 uint32_t ref; 257 258 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 259 }; 260 261 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 262 #define BDEV_CH_QOS_ENABLED (1 << 1) 263 264 struct spdk_bdev_channel { 265 struct spdk_bdev *bdev; 266 267 /* The channel for the underlying device */ 268 struct spdk_io_channel *channel; 269 270 /* Accel channel */ 271 struct spdk_io_channel *accel_channel; 272 273 /* Per io_device per thread data */ 274 struct spdk_bdev_shared_resource *shared_resource; 275 276 struct spdk_bdev_io_stat *stat; 277 278 /* 279 * Count of I/O submitted to the underlying dev module through this channel 280 * and waiting for completion. 281 */ 282 uint64_t io_outstanding; 283 284 /* 285 * List of all submitted I/Os including I/O that are generated via splitting. 286 */ 287 bdev_io_tailq_t io_submitted; 288 289 /* 290 * List of spdk_bdev_io that are currently queued because they write to a locked 291 * LBA range. 
292 */ 293 bdev_io_tailq_t io_locked; 294 295 /* List of I/Os with accel sequence being currently executed */ 296 bdev_io_tailq_t io_accel_exec; 297 298 /* List of I/Os doing memory domain pull/push */ 299 bdev_io_tailq_t io_memory_domain; 300 301 uint32_t flags; 302 303 struct spdk_histogram_data *histogram; 304 305 #ifdef SPDK_CONFIG_VTUNE 306 uint64_t start_tsc; 307 uint64_t interval_tsc; 308 __itt_string_handle *handle; 309 struct spdk_bdev_io_stat *prev_stat; 310 #endif 311 312 bdev_io_tailq_t queued_resets; 313 314 lba_range_tailq_t locked_ranges; 315 }; 316 317 struct media_event_entry { 318 struct spdk_bdev_media_event event; 319 TAILQ_ENTRY(media_event_entry) tailq; 320 }; 321 322 #define MEDIA_EVENT_POOL_SIZE 64 323 324 struct spdk_bdev_desc { 325 struct spdk_bdev *bdev; 326 struct spdk_thread *thread; 327 struct { 328 spdk_bdev_event_cb_t event_fn; 329 void *ctx; 330 } callback; 331 bool closed; 332 bool write; 333 bool memory_domains_supported; 334 bool accel_sequence_supported[SPDK_BDEV_NUM_IO_TYPES]; 335 struct spdk_spinlock spinlock; 336 uint32_t refs; 337 TAILQ_HEAD(, media_event_entry) pending_media_events; 338 TAILQ_HEAD(, media_event_entry) free_media_events; 339 struct media_event_entry *media_events_buffer; 340 TAILQ_ENTRY(spdk_bdev_desc) link; 341 342 uint64_t timeout_in_sec; 343 spdk_bdev_io_timeout_cb cb_fn; 344 void *cb_arg; 345 struct spdk_poller *io_timeout_poller; 346 struct spdk_bdev_module_claim *claim; 347 }; 348 349 struct spdk_bdev_iostat_ctx { 350 struct spdk_bdev_io_stat *stat; 351 spdk_bdev_get_device_stat_cb cb; 352 void *cb_arg; 353 }; 354 355 struct set_qos_limit_ctx { 356 void (*cb_fn)(void *cb_arg, int status); 357 void *cb_arg; 358 struct spdk_bdev *bdev; 359 }; 360 361 struct spdk_bdev_channel_iter { 362 spdk_bdev_for_each_channel_msg fn; 363 spdk_bdev_for_each_channel_done cpl; 364 struct spdk_io_channel_iter *i; 365 void *ctx; 366 }; 367 368 struct spdk_bdev_io_error_stat { 369 uint32_t 
error_status[-SPDK_MIN_BDEV_IO_STATUS]; 370 }; 371 372 enum bdev_io_retry_state { 373 BDEV_IO_RETRY_STATE_INVALID, 374 BDEV_IO_RETRY_STATE_PULL, 375 BDEV_IO_RETRY_STATE_PULL_MD, 376 BDEV_IO_RETRY_STATE_SUBMIT, 377 BDEV_IO_RETRY_STATE_PUSH, 378 BDEV_IO_RETRY_STATE_PUSH_MD, 379 }; 380 381 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 382 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 383 #define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch)) 384 #define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch)) 385 386 static inline void bdev_io_complete(void *ctx); 387 static inline void bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io); 388 static void bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io); 389 static void bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io); 390 391 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 392 static int bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io); 393 394 static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 395 struct spdk_io_channel *ch, void *_ctx); 396 static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status); 397 398 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 399 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 400 uint64_t num_blocks, 401 struct spdk_memory_domain *domain, void *domain_ctx, 402 struct spdk_accel_sequence *seq, 403 spdk_bdev_io_completion_cb cb, void *cb_arg); 404 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 405 struct iovec *iov, int iovcnt, void *md_buf, 406 uint64_t offset_blocks, uint64_t num_blocks, 407 struct spdk_memory_domain *domain, void *domain_ctx, 408 struct spdk_accel_sequence *seq, 409 spdk_bdev_io_completion_cb cb, void 
*cb_arg); 410 411 static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 412 uint64_t offset, uint64_t length, 413 lock_range_cb cb_fn, void *cb_arg); 414 415 static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 416 uint64_t offset, uint64_t length, 417 lock_range_cb cb_fn, void *cb_arg); 418 419 static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort); 420 static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort); 421 422 static bool claim_type_is_v2(enum spdk_bdev_claim_type type); 423 static void bdev_desc_release_claims(struct spdk_bdev_desc *desc); 424 static void claim_reset(struct spdk_bdev *bdev); 425 426 static void bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch); 427 428 #define bdev_get_ext_io_opt(opts, field, defval) \ 429 (((opts) != NULL && offsetof(struct spdk_bdev_ext_io_opts, field) + \ 430 sizeof((opts)->field) <= sizeof(*(opts))) ? (opts)->field : (defval)) 431 432 void 433 spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size) 434 { 435 if (!opts) { 436 SPDK_ERRLOG("opts should not be NULL\n"); 437 return; 438 } 439 440 if (!opts_size) { 441 SPDK_ERRLOG("opts_size should not be zero value\n"); 442 return; 443 } 444 445 opts->opts_size = opts_size; 446 447 #define SET_FIELD(field) \ 448 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \ 449 opts->field = g_bdev_opts.field; \ 450 } \ 451 452 SET_FIELD(bdev_io_pool_size); 453 SET_FIELD(bdev_io_cache_size); 454 SET_FIELD(bdev_auto_examine); 455 SET_FIELD(small_buf_pool_size); 456 SET_FIELD(large_buf_pool_size); 457 458 /* Do not remove this statement, you should always update this statement when you adding a new field, 459 * and do not forget to add the SET_FIELD statement for your added field. 
*/ 460 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size"); 461 462 #undef SET_FIELD 463 } 464 465 SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_small_buf_pool_size, "spdk_bdev_opts.small_buf_pool_size", 466 "v23.05", 0); 467 SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_large_buf_pool_size, "spdk_bdev_opts.large_buf_pool_size", 468 "v23.05", 0); 469 int 470 spdk_bdev_set_opts(struct spdk_bdev_opts *opts) 471 { 472 struct spdk_iobuf_opts iobuf_opts; 473 uint32_t min_pool_size; 474 int rc; 475 476 if (!opts) { 477 SPDK_ERRLOG("opts cannot be NULL\n"); 478 return -1; 479 } 480 481 if (!opts->opts_size) { 482 SPDK_ERRLOG("opts_size inside opts cannot be zero value\n"); 483 return -1; 484 } 485 486 /* 487 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem 488 * initialization. A second mgmt_ch will be created on the same thread when the application starts 489 * but before the deferred put_io_channel event is executed for the first mgmt_ch. 
490 */ 491 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 492 if (opts->bdev_io_pool_size < min_pool_size) { 493 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 494 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 495 spdk_thread_get_count()); 496 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 497 return -1; 498 } 499 500 if (opts->small_buf_pool_size != BUF_SMALL_POOL_SIZE) { 501 SPDK_LOG_DEPRECATED(bdev_opts_small_buf_pool_size); 502 } 503 if (opts->large_buf_pool_size != BUF_LARGE_POOL_SIZE) { 504 SPDK_LOG_DEPRECATED(bdev_opts_large_buf_pool_size); 505 } 506 507 #define SET_FIELD(field) \ 508 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 509 g_bdev_opts.field = opts->field; \ 510 } \ 511 512 SET_FIELD(bdev_io_pool_size); 513 SET_FIELD(bdev_io_cache_size); 514 SET_FIELD(bdev_auto_examine); 515 SET_FIELD(small_buf_pool_size); 516 SET_FIELD(large_buf_pool_size); 517 518 spdk_iobuf_get_opts(&iobuf_opts); 519 iobuf_opts.small_pool_count = opts->small_buf_pool_size; 520 iobuf_opts.small_bufsize = SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE); 521 iobuf_opts.large_pool_count = opts->large_buf_pool_size; 522 iobuf_opts.large_bufsize = SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE); 523 524 rc = spdk_iobuf_set_opts(&iobuf_opts); 525 if (rc != 0) { 526 SPDK_ERRLOG("Failed to set iobuf opts\n"); 527 return -1; 528 } 529 530 g_bdev_opts.opts_size = opts->opts_size; 531 532 #undef SET_FIELD 533 534 return 0; 535 } 536 537 static struct spdk_bdev * 538 bdev_get_by_name(const char *bdev_name) 539 { 540 struct spdk_bdev_name find; 541 struct spdk_bdev_name *res; 542 543 find.name = (char *)bdev_name; 544 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 545 if (res != NULL) { 546 return res->bdev; 547 } 548 549 return NULL; 550 } 551 552 struct spdk_bdev * 553 
spdk_bdev_get_by_name(const char *bdev_name) 554 { 555 struct spdk_bdev *bdev; 556 557 spdk_spin_lock(&g_bdev_mgr.spinlock); 558 bdev = bdev_get_by_name(bdev_name); 559 spdk_spin_unlock(&g_bdev_mgr.spinlock); 560 561 return bdev; 562 } 563 564 struct bdev_io_status_string { 565 enum spdk_bdev_io_status status; 566 const char *str; 567 }; 568 569 static const struct bdev_io_status_string bdev_io_status_strings[] = { 570 { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" }, 571 { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" }, 572 { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" }, 573 { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" }, 574 { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" }, 575 { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" }, 576 { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" }, 577 { SPDK_BDEV_IO_STATUS_FAILED, "failed" }, 578 { SPDK_BDEV_IO_STATUS_PENDING, "pending" }, 579 { SPDK_BDEV_IO_STATUS_SUCCESS, "success" }, 580 }; 581 582 static const char * 583 bdev_io_status_get_string(enum spdk_bdev_io_status status) 584 { 585 uint32_t i; 586 587 for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) { 588 if (bdev_io_status_strings[i].status == status) { 589 return bdev_io_status_strings[i].str; 590 } 591 } 592 593 return "reserved"; 594 } 595 596 struct spdk_bdev_wait_for_examine_ctx { 597 struct spdk_poller *poller; 598 spdk_bdev_wait_for_examine_cb cb_fn; 599 void *cb_arg; 600 }; 601 602 static bool bdev_module_all_actions_completed(void); 603 604 static int 605 bdev_wait_for_examine_cb(void *arg) 606 { 607 struct spdk_bdev_wait_for_examine_ctx *ctx = arg; 608 609 if (!bdev_module_all_actions_completed()) { 610 return SPDK_POLLER_IDLE; 611 } 612 613 spdk_poller_unregister(&ctx->poller); 614 ctx->cb_fn(ctx->cb_arg); 615 free(ctx); 616 617 return SPDK_POLLER_BUSY; 618 } 619 620 int 621 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 622 { 623 struct spdk_bdev_wait_for_examine_ctx *ctx; 624 625 ctx = calloc(1, sizeof(*ctx)); 626 
if (ctx == NULL) { 627 return -ENOMEM; 628 } 629 ctx->cb_fn = cb_fn; 630 ctx->cb_arg = cb_arg; 631 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 632 633 return 0; 634 } 635 636 struct spdk_bdev_examine_item { 637 char *name; 638 TAILQ_ENTRY(spdk_bdev_examine_item) link; 639 }; 640 641 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); 642 643 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 644 g_bdev_examine_allowlist); 645 646 static inline bool 647 bdev_examine_allowlist_check(const char *name) 648 { 649 struct spdk_bdev_examine_item *item; 650 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 651 if (strcmp(name, item->name) == 0) { 652 return true; 653 } 654 } 655 return false; 656 } 657 658 static inline void 659 bdev_examine_allowlist_free(void) 660 { 661 struct spdk_bdev_examine_item *item; 662 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 663 item = TAILQ_FIRST(&g_bdev_examine_allowlist); 664 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 665 free(item->name); 666 free(item); 667 } 668 } 669 670 static inline bool 671 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 672 { 673 struct spdk_bdev_alias *tmp; 674 if (bdev_examine_allowlist_check(bdev->name)) { 675 return true; 676 } 677 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 678 if (bdev_examine_allowlist_check(tmp->alias.name)) { 679 return true; 680 } 681 } 682 return false; 683 } 684 685 static inline bool 686 bdev_ok_to_examine(struct spdk_bdev *bdev) 687 { 688 if (g_bdev_opts.bdev_auto_examine) { 689 return true; 690 } else { 691 return bdev_in_examine_allowlist(bdev); 692 } 693 } 694 695 static void 696 bdev_examine(struct spdk_bdev *bdev) 697 { 698 struct spdk_bdev_module *module; 699 struct spdk_bdev_module_claim *claim, *tmpclaim; 700 uint32_t action; 701 702 if (!bdev_ok_to_examine(bdev)) { 703 return; 704 } 705 706 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 707 if 
(module->examine_config) { 708 spdk_spin_lock(&module->internal.spinlock); 709 action = module->internal.action_in_progress; 710 module->internal.action_in_progress++; 711 spdk_spin_unlock(&module->internal.spinlock); 712 module->examine_config(bdev); 713 if (action != module->internal.action_in_progress) { 714 SPDK_ERRLOG("examine_config for module %s did not call " 715 "spdk_bdev_module_examine_done()\n", module->name); 716 } 717 } 718 } 719 720 spdk_spin_lock(&bdev->internal.spinlock); 721 722 switch (bdev->internal.claim_type) { 723 case SPDK_BDEV_CLAIM_NONE: 724 /* Examine by all bdev modules */ 725 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 726 if (module->examine_disk) { 727 spdk_spin_lock(&module->internal.spinlock); 728 module->internal.action_in_progress++; 729 spdk_spin_unlock(&module->internal.spinlock); 730 spdk_spin_unlock(&bdev->internal.spinlock); 731 module->examine_disk(bdev); 732 spdk_spin_lock(&bdev->internal.spinlock); 733 } 734 } 735 break; 736 case SPDK_BDEV_CLAIM_EXCL_WRITE: 737 /* Examine by the one bdev module with a v1 claim */ 738 module = bdev->internal.claim.v1.module; 739 if (module->examine_disk) { 740 spdk_spin_lock(&module->internal.spinlock); 741 module->internal.action_in_progress++; 742 spdk_spin_unlock(&module->internal.spinlock); 743 spdk_spin_unlock(&bdev->internal.spinlock); 744 module->examine_disk(bdev); 745 return; 746 } 747 break; 748 default: 749 /* Examine by all bdev modules with a v2 claim */ 750 assert(claim_type_is_v2(bdev->internal.claim_type)); 751 /* 752 * Removal of tailq nodes while iterating can cause the iteration to jump out of the 753 * list, perhaps accessing freed memory. Without protection, this could happen 754 * while the lock is dropped during the examine callback. 
755 */ 756 bdev->internal.examine_in_progress++; 757 758 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 759 module = claim->module; 760 761 if (module == NULL) { 762 /* This is a vestigial claim, held by examine_count */ 763 continue; 764 } 765 766 if (module->examine_disk == NULL) { 767 continue; 768 } 769 770 spdk_spin_lock(&module->internal.spinlock); 771 module->internal.action_in_progress++; 772 spdk_spin_unlock(&module->internal.spinlock); 773 774 /* Call examine_disk without holding internal.spinlock. */ 775 spdk_spin_unlock(&bdev->internal.spinlock); 776 module->examine_disk(bdev); 777 spdk_spin_lock(&bdev->internal.spinlock); 778 } 779 780 assert(bdev->internal.examine_in_progress > 0); 781 bdev->internal.examine_in_progress--; 782 if (bdev->internal.examine_in_progress == 0) { 783 /* Remove any claims that were released during examine_disk */ 784 TAILQ_FOREACH_SAFE(claim, &bdev->internal.claim.v2.claims, link, tmpclaim) { 785 if (claim->desc != NULL) { 786 continue; 787 } 788 789 TAILQ_REMOVE(&bdev->internal.claim.v2.claims, claim, link); 790 free(claim); 791 } 792 if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) { 793 claim_reset(bdev); 794 } 795 } 796 } 797 798 spdk_spin_unlock(&bdev->internal.spinlock); 799 } 800 801 int 802 spdk_bdev_examine(const char *name) 803 { 804 struct spdk_bdev *bdev; 805 struct spdk_bdev_examine_item *item; 806 struct spdk_thread *thread = spdk_get_thread(); 807 808 if (spdk_unlikely(spdk_thread_get_app_thread() != thread)) { 809 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", name, thread, 810 thread ? 
spdk_thread_get_name(thread) : "null"); 811 return -EINVAL; 812 } 813 814 if (g_bdev_opts.bdev_auto_examine) { 815 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); 816 return -EINVAL; 817 } 818 819 if (bdev_examine_allowlist_check(name)) { 820 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 821 return -EEXIST; 822 } 823 824 item = calloc(1, sizeof(*item)); 825 if (!item) { 826 return -ENOMEM; 827 } 828 item->name = strdup(name); 829 if (!item->name) { 830 free(item); 831 return -ENOMEM; 832 } 833 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 834 835 bdev = spdk_bdev_get_by_name(name); 836 if (bdev) { 837 bdev_examine(bdev); 838 } 839 return 0; 840 } 841 842 static inline void 843 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 844 { 845 struct spdk_bdev_examine_item *item; 846 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 847 spdk_json_write_object_begin(w); 848 spdk_json_write_named_string(w, "method", "bdev_examine"); 849 spdk_json_write_named_object_begin(w, "params"); 850 spdk_json_write_named_string(w, "name", item->name); 851 spdk_json_write_object_end(w); 852 spdk_json_write_object_end(w); 853 } 854 } 855 856 struct spdk_bdev * 857 spdk_bdev_first(void) 858 { 859 struct spdk_bdev *bdev; 860 861 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 862 if (bdev) { 863 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 864 } 865 866 return bdev; 867 } 868 869 struct spdk_bdev * 870 spdk_bdev_next(struct spdk_bdev *prev) 871 { 872 struct spdk_bdev *bdev; 873 874 bdev = TAILQ_NEXT(prev, internal.link); 875 if (bdev) { 876 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 877 } 878 879 return bdev; 880 } 881 882 static struct spdk_bdev * 883 _bdev_next_leaf(struct spdk_bdev *bdev) 884 { 885 while (bdev != NULL) { 886 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 887 return bdev; 888 } else { 889 bdev = TAILQ_NEXT(bdev, internal.link); 890 } 891 } 892 
893 return bdev; 894 } 895 896 struct spdk_bdev * 897 spdk_bdev_first_leaf(void) 898 { 899 struct spdk_bdev *bdev; 900 901 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 902 903 if (bdev) { 904 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 905 } 906 907 return bdev; 908 } 909 910 struct spdk_bdev * 911 spdk_bdev_next_leaf(struct spdk_bdev *prev) 912 { 913 struct spdk_bdev *bdev; 914 915 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 916 917 if (bdev) { 918 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 919 } 920 921 return bdev; 922 } 923 924 static inline bool 925 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 926 { 927 return bdev_io->internal.memory_domain; 928 } 929 930 static inline bool 931 bdev_io_use_accel_sequence(struct spdk_bdev_io *bdev_io) 932 { 933 return bdev_io->internal.accel_sequence; 934 } 935 936 static inline void 937 bdev_queue_nomem_io_head(struct spdk_bdev_shared_resource *shared_resource, 938 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 939 { 940 /* Wait for some of the outstanding I/O to complete before we retry any of the nomem_io. 941 * Normally we will wait for NOMEM_THRESHOLD_COUNT I/O to complete but for low queue depth 942 * channels we will instead wait for half to complete. 
943 */ 944 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 945 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 946 947 assert(state != BDEV_IO_RETRY_STATE_INVALID); 948 bdev_io->internal.retry_state = state; 949 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 950 } 951 952 static inline void 953 bdev_queue_nomem_io_tail(struct spdk_bdev_shared_resource *shared_resource, 954 struct spdk_bdev_io *bdev_io, enum bdev_io_retry_state state) 955 { 956 /* We only queue IOs at the end of the nomem_io queue if they're submitted by the user while 957 * the queue isn't empty, so we don't need to update the nomem_threshold here */ 958 assert(!TAILQ_EMPTY(&shared_resource->nomem_io)); 959 960 assert(state != BDEV_IO_RETRY_STATE_INVALID); 961 bdev_io->internal.retry_state = state; 962 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 963 } 964 965 void 966 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 967 { 968 struct iovec *iovs; 969 970 if (bdev_io->u.bdev.iovs == NULL) { 971 bdev_io->u.bdev.iovs = &bdev_io->iov; 972 bdev_io->u.bdev.iovcnt = 1; 973 } 974 975 iovs = bdev_io->u.bdev.iovs; 976 977 assert(iovs != NULL); 978 assert(bdev_io->u.bdev.iovcnt >= 1); 979 980 iovs[0].iov_base = buf; 981 iovs[0].iov_len = len; 982 } 983 984 void 985 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 986 { 987 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 988 bdev_io->u.bdev.md_buf = md_buf; 989 } 990 991 static bool 992 _is_buf_allocated(const struct iovec *iovs) 993 { 994 if (iovs == NULL) { 995 return false; 996 } 997 998 return iovs[0].iov_base != NULL; 999 } 1000 1001 static bool 1002 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 1003 { 1004 int i; 1005 uintptr_t iov_base; 1006 1007 if (spdk_likely(alignment == 1)) { 1008 return true; 1009 } 1010 1011 for (i = 0; i < 
iovcnt; i++) { 1012 iov_base = (uintptr_t)iovs[i].iov_base; 1013 if ((iov_base & (alignment - 1)) != 0) { 1014 return false; 1015 } 1016 } 1017 1018 return true; 1019 } 1020 1021 static inline bool 1022 bdev_io_needs_sequence_exec(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 1023 { 1024 if (!bdev_io_use_accel_sequence(bdev_io)) { 1025 return false; 1026 } 1027 1028 /* For now, we don't allow splitting IOs with an accel sequence and will treat them as if 1029 * bdev module didn't support accel sequences */ 1030 return !desc->accel_sequence_supported[bdev_io->type] || bdev_io->internal.split; 1031 } 1032 1033 static inline void 1034 bdev_io_increment_outstanding(struct spdk_bdev_channel *bdev_ch, 1035 struct spdk_bdev_shared_resource *shared_resource) 1036 { 1037 bdev_ch->io_outstanding++; 1038 shared_resource->io_outstanding++; 1039 } 1040 1041 static inline void 1042 bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1043 struct spdk_bdev_shared_resource *shared_resource) 1044 { 1045 assert(bdev_ch->io_outstanding > 0); 1046 assert(shared_resource->io_outstanding > 0); 1047 bdev_ch->io_outstanding--; 1048 shared_resource->io_outstanding--; 1049 } 1050 1051 static void 1052 bdev_io_submit_sequence_cb(void *ctx, int status) 1053 { 1054 struct spdk_bdev_io *bdev_io = ctx; 1055 1056 bdev_io->u.bdev.accel_sequence = NULL; 1057 bdev_io->internal.accel_sequence = NULL; 1058 1059 if (spdk_unlikely(status != 0)) { 1060 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 1061 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1062 bdev_io_complete_unsubmitted(bdev_io); 1063 return; 1064 } 1065 1066 bdev_io_submit(bdev_io); 1067 } 1068 1069 static void 1070 bdev_io_exec_sequence_cb(void *ctx, int status) 1071 { 1072 struct spdk_bdev_io *bdev_io = ctx; 1073 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1074 1075 TAILQ_REMOVE(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1076 
bdev_io_decrement_outstanding(ch, ch->shared_resource); 1077 1078 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1079 bdev_ch_retry_io(ch); 1080 } 1081 1082 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1083 } 1084 1085 static void 1086 bdev_io_exec_sequence(struct spdk_bdev_io *bdev_io, void (*cb_fn)(void *ctx, int status)) 1087 { 1088 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1089 1090 assert(bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)); 1091 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE || bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1092 1093 /* Since the operations are appended during submission, they're in the opposite order than 1094 * how we want to execute them for reads (i.e. we need to execute the most recently added 1095 * operation first), so reverse the sequence before executing it. 1096 */ 1097 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1098 spdk_accel_sequence_reverse(bdev_io->internal.accel_sequence); 1099 } 1100 1101 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_accel_exec, bdev_io, internal.link); 1102 bdev_io_increment_outstanding(ch, ch->shared_resource); 1103 bdev_io->internal.data_transfer_cpl = cb_fn; 1104 1105 spdk_accel_sequence_finish(bdev_io->internal.accel_sequence, 1106 bdev_io_exec_sequence_cb, bdev_io); 1107 } 1108 1109 static void 1110 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 1111 { 1112 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 1113 void *buf; 1114 1115 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1116 buf = bdev_io->internal.buf; 1117 bdev_io->internal.buf = NULL; 1118 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 1119 bdev_io->internal.get_aux_buf_cb = NULL; 1120 } else { 1121 assert(bdev_io->internal.get_buf_cb != NULL); 1122 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 1123 bdev_io->internal.get_buf_cb = NULL; 1124 } 1125 } 1126 1127 static void 1128 _bdev_io_pull_buffer_cpl(void *ctx, int 
rc) 1129 { 1130 struct spdk_bdev_io *bdev_io = ctx; 1131 1132 if (rc) { 1133 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 1134 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1135 } 1136 bdev_io_get_buf_complete(bdev_io, !rc); 1137 } 1138 1139 static void 1140 bdev_io_pull_md_buf_done(void *ctx, int status) 1141 { 1142 struct spdk_bdev_io *bdev_io = ctx; 1143 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1144 1145 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1146 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1147 1148 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1149 bdev_ch_retry_io(ch); 1150 } 1151 1152 assert(bdev_io->internal.data_transfer_cpl); 1153 bdev_io->internal.data_transfer_cpl(bdev_io, status); 1154 } 1155 1156 static void 1157 bdev_io_pull_md_buf(struct spdk_bdev_io *bdev_io) 1158 { 1159 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1160 int rc = 0; 1161 1162 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1163 if (bdev_io_use_memory_domain(bdev_io)) { 1164 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1165 bdev_io_increment_outstanding(ch, ch->shared_resource); 1166 rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain, 1167 bdev_io->internal.memory_domain_ctx, 1168 &bdev_io->internal.orig_md_iov, 1, 1169 &bdev_io->internal.bounce_md_iov, 1, 1170 bdev_io_pull_md_buf_done, bdev_io); 1171 if (rc == 0) { 1172 /* Continue to submit IO in completion callback */ 1173 return; 1174 } 1175 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1176 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1177 if (rc != -ENOMEM) { 1178 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 1179 spdk_memory_domain_get_dma_device_id( 1180 bdev_io->internal.memory_domain), rc); 1181 } 1182 } else { 1183 memcpy(bdev_io->internal.bounce_md_iov.iov_base, 1184 bdev_io->internal.orig_md_iov.iov_base, 1185 
bdev_io->internal.orig_md_iov.iov_len); 1186 } 1187 } 1188 1189 if (spdk_unlikely(rc == -ENOMEM)) { 1190 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL_MD); 1191 } else { 1192 assert(bdev_io->internal.data_transfer_cpl); 1193 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1194 } 1195 } 1196 1197 static void 1198 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 1199 { 1200 /* save original md_buf */ 1201 bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 1202 bdev_io->internal.orig_md_iov.iov_len = len; 1203 bdev_io->internal.bounce_md_iov.iov_base = md_buf; 1204 bdev_io->internal.bounce_md_iov.iov_len = len; 1205 /* set bounce md_buf */ 1206 bdev_io->u.bdev.md_buf = md_buf; 1207 1208 bdev_io_pull_md_buf(bdev_io); 1209 } 1210 1211 static void 1212 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 1213 { 1214 struct spdk_bdev *bdev = bdev_io->bdev; 1215 uint64_t md_len; 1216 void *buf; 1217 1218 if (spdk_bdev_is_md_separate(bdev)) { 1219 assert(!bdev_io_use_accel_sequence(bdev_io)); 1220 1221 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 1222 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 1223 1224 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 1225 1226 if (bdev_io->u.bdev.md_buf != NULL) { 1227 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 1228 return; 1229 } else { 1230 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 1231 } 1232 } 1233 1234 bdev_io_get_buf_complete(bdev_io, true); 1235 } 1236 1237 static inline void 1238 bdev_io_pull_data_done(struct spdk_bdev_io *bdev_io, int rc) 1239 { 1240 if (rc) { 1241 SPDK_ERRLOG("Failed to get data buffer\n"); 1242 assert(bdev_io->internal.data_transfer_cpl); 1243 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1244 return; 1245 } 1246 1247 _bdev_io_set_md_buf(bdev_io); 1248 } 1249 1250 static void 1251 bdev_io_pull_data_done_and_track(void *ctx, int status) 1252 { 1253 
/* Bring the caller's data into the bounce buffer (or arrange for accel to do so).
 *
 * Three cases are handled:
 *  - the I/O needs its accel sequence executed by the bdev layer: a copy step between
 *    the bounce iovs (u.bdev.iovs) and the original iovs is appended to the sequence;
 *  - plain WRITE with a memory domain: an asynchronous pull is started and the flow
 *    resumes in bdev_io_pull_data_done_and_track();
 *  - plain WRITE without a memory domain: the original iovs are copied synchronously.
 * For any other I/O nothing is copied here.
 *
 * On -ENOMEM the I/O is put at the head of the nomem_io queue in the PULL retry state,
 * which makes bdev_ch_retry_io() call this function again later. Any other result is
 * passed on to bdev_io_pull_data_done().
 */
static void
bdev_io_pull_data(struct spdk_bdev_io *bdev_io)
{
	struct spdk_bdev_channel *ch = bdev_io->internal.ch;
	int rc = 0;

	/* If we need to exec an accel sequence, append a copy operation making accel change the
	 * src/dst buffers of the previous operation */
	if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) {
		if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL,
						    bdev_io->internal.orig_iovs,
						    bdev_io->internal.orig_iovcnt,
						    bdev_io->internal.memory_domain,
						    bdev_io->internal.memory_domain_ctx,
						    0, NULL, NULL);
		} else {
			/* We need to reverse the src/dst for reads */
			assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ);
			rc = spdk_accel_append_copy(&bdev_io->internal.accel_sequence, ch->accel_channel,
						    bdev_io->internal.orig_iovs,
						    bdev_io->internal.orig_iovcnt,
						    bdev_io->internal.memory_domain,
						    bdev_io->internal.memory_domain_ctx,
						    bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt,
						    NULL, NULL, 0, NULL, NULL);
		}

		/* -ENOMEM is not logged: it is handled by the retry queue below */
		if (spdk_unlikely(rc != 0 && rc != -ENOMEM)) {
			SPDK_ERRLOG("Failed to append copy to accel sequence: %p\n",
				    bdev_io->internal.accel_sequence);
		}
	} else if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) {
		/* if this is write path, copy data from original buffer to bounce buffer */
		if (bdev_io_use_memory_domain(bdev_io)) {
			/* Track the I/O and count it as outstanding before starting the pull;
			 * the completion callback removes it from the list and decrements again. */
			TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_increment_outstanding(ch, ch->shared_resource);
			rc = spdk_memory_domain_pull_data(bdev_io->internal.memory_domain,
							  bdev_io->internal.memory_domain_ctx,
							  bdev_io->internal.orig_iovs,
							  (uint32_t) bdev_io->internal.orig_iovcnt,
							  bdev_io->u.bdev.iovs, 1,
							  bdev_io_pull_data_done_and_track,
							  bdev_io);
			if (rc == 0) {
				/* Continue to submit IO in completion callback */
				return;
			}
			/* The pull was not started, so the callback will not run: undo the tracking here */
			TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link);
			bdev_io_decrement_outstanding(ch, ch->shared_resource);
			if (rc != -ENOMEM) {
				SPDK_ERRLOG("Failed to pull data from memory domain %s\n",
					    spdk_memory_domain_get_dma_device_id(
						    bdev_io->internal.memory_domain));
			}
		} else {
			/* The bounce buffer is a single iovec, so a flat copy is enough */
			assert(bdev_io->u.bdev.iovcnt == 1);
			spdk_copy_iovs_to_buf(bdev_io->u.bdev.iovs[0].iov_base,
					      bdev_io->u.bdev.iovs[0].iov_len,
					      bdev_io->internal.orig_iovs,
					      bdev_io->internal.orig_iovcnt);
		}
	}

	if (spdk_unlikely(rc == -ENOMEM)) {
		bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL);
	} else {
		bdev_io_pull_data_done(bdev_io, rc);
	}
}
(spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1357 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PULL); 1358 } else { 1359 bdev_io_pull_data(bdev_io); 1360 } 1361 } 1362 1363 static void 1364 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) 1365 { 1366 struct spdk_bdev *bdev = bdev_io->bdev; 1367 bool buf_allocated; 1368 uint64_t alignment; 1369 void *aligned_buf; 1370 1371 bdev_io->internal.buf = buf; 1372 1373 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1374 bdev_io_get_buf_complete(bdev_io, true); 1375 return; 1376 } 1377 1378 alignment = spdk_bdev_get_buf_align(bdev); 1379 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 1380 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 1381 1382 if (buf_allocated) { 1383 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 1384 /* Continue in completion callback */ 1385 return; 1386 } else { 1387 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 1388 } 1389 1390 _bdev_io_set_md_buf(bdev_io); 1391 } 1392 1393 static inline uint64_t 1394 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len) 1395 { 1396 struct spdk_bdev *bdev = bdev_io->bdev; 1397 uint64_t md_len, alignment; 1398 1399 md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1400 1401 /* 1 byte alignment needs 0 byte of extra space, 64 bytes alignment needs 63 bytes of extra space, etc. 
*/ 1402 alignment = spdk_bdev_get_buf_align(bdev) - 1; 1403 1404 return len + alignment + md_len; 1405 } 1406 1407 static void 1408 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 1409 { 1410 struct spdk_bdev_mgmt_channel *ch; 1411 1412 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1413 spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len)); 1414 } 1415 1416 static void 1417 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 1418 { 1419 assert(bdev_io->internal.buf != NULL); 1420 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len); 1421 bdev_io->internal.buf = NULL; 1422 } 1423 1424 void 1425 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1426 { 1427 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1428 1429 assert(buf != NULL); 1430 _bdev_io_put_buf(bdev_io, buf, len); 1431 } 1432 1433 static inline void 1434 bdev_submit_request(struct spdk_bdev *bdev, struct spdk_io_channel *ioch, 1435 struct spdk_bdev_io *bdev_io) 1436 { 1437 /* After a request is submitted to a bdev module, the ownership of an accel sequence 1438 * associated with that bdev_io is transferred to the bdev module. So, clear the internal 1439 * sequence pointer to make sure we won't touch it anymore. 
/* Retry I/Os parked on the shared resource's nomem_io queue.
 *
 * Nothing is retried while io_outstanding is still above nomem_threshold. Otherwise
 * I/Os are taken from the head of the queue one at a time and re-dispatched according
 * to the retry state recorded when they were queued. The loop ends when the queue is
 * empty or when the I/O that was just retried is back at the head of the queue.
 */
static void
bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch)
{
	struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource;
	struct spdk_bdev_io *bdev_io;

	if (shared_resource->io_outstanding > shared_resource->nomem_threshold) {
		/*
		 * Allow some more I/O to complete before retrying the nomem_io queue.
		 * Some drivers (such as nvme) cannot immediately take a new I/O in
		 * the context of a completion, because the resources for the I/O are
		 * not released until control returns to the bdev poller. Also, we
		 * may require several small I/O to complete before a larger I/O
		 * (that requires splitting) can be submitted.
		 */
		return;
	}

	while (!TAILQ_EMPTY(&shared_resource->nomem_io)) {
		/* Dequeue first: a retry that hits NOMEM again re-inserts the I/O at the head */
		bdev_io = TAILQ_FIRST(&shared_resource->nomem_io);
		TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link);

		/* Resume the I/O at the step that originally ran out of memory */
		switch (bdev_io->internal.retry_state) {
		case BDEV_IO_RETRY_STATE_SUBMIT:
			bdev_ch_resubmit_io(bdev_ch, bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL:
			bdev_io_pull_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PULL_MD:
			bdev_io_pull_md_buf(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH:
			bdev_io_push_bounce_data(bdev_io);
			break;
		case BDEV_IO_RETRY_STATE_PUSH_MD:
			bdev_io_push_bounce_md_buf(bdev_io);
			break;
		default:
			assert(0 && "invalid retry state");
			break;
		}

		if (bdev_io == TAILQ_FIRST(&shared_resource->nomem_io)) {
			/* This IO completed again with NOMEM status, so break the loop and
			 * don't try anymore. Note that a bdev_io that fails with NOMEM
			 * always gets requeued at the front of the list, to maintain
			 * ordering.
			 */
			break;
		}
	}
}
*/ 1528 if ((bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 1529 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) && bdev_io->u.bdev.accel_sequence) { 1530 assert(bdev_io->internal.accel_sequence == NULL); 1531 bdev_io->internal.accel_sequence = bdev_io->u.bdev.accel_sequence; 1532 } 1533 1534 return true; 1535 } 1536 1537 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1538 bdev_ch_retry_io(bdev_ch); 1539 } 1540 1541 return false; 1542 } 1543 1544 static void 1545 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1546 { 1547 struct spdk_bdev_io *bdev_io = ctx; 1548 1549 if (rc) { 1550 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1551 } 1552 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1553 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). 1554 */ 1555 bdev_io_put_buf(bdev_io); 1556 1557 /* Continue with IO completion flow */ 1558 bdev_io_complete(bdev_io); 1559 } 1560 1561 static void 1562 bdev_io_push_bounce_md_buf_done(void *ctx, int rc) 1563 { 1564 struct spdk_bdev_io *bdev_io = ctx; 1565 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1566 1567 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1568 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1569 1570 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1571 bdev_ch_retry_io(ch); 1572 } 1573 1574 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1575 } 1576 1577 static inline void 1578 bdev_io_push_bounce_md_buf(struct spdk_bdev_io *bdev_io) 1579 { 1580 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1581 int rc = 0; 1582 1583 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1584 /* do the same for metadata buffer */ 1585 if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) { 1586 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1587 1588 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1589 if (bdev_io_use_memory_domain(bdev_io)) { 1590 
TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1591 bdev_io_increment_outstanding(ch, ch->shared_resource); 1592 /* If memory domain is used then we need to call async push function */ 1593 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1594 bdev_io->internal.memory_domain_ctx, 1595 &bdev_io->internal.orig_md_iov, 1596 (uint32_t)bdev_io->internal.orig_iovcnt, 1597 &bdev_io->internal.bounce_md_iov, 1, 1598 bdev_io_push_bounce_md_buf_done, 1599 bdev_io); 1600 if (rc == 0) { 1601 /* Continue IO completion in async callback */ 1602 return; 1603 } 1604 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1605 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1606 if (rc != -ENOMEM) { 1607 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1608 spdk_memory_domain_get_dma_device_id( 1609 bdev_io->internal.memory_domain)); 1610 } 1611 } else { 1612 memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1613 bdev_io->internal.orig_md_iov.iov_len); 1614 } 1615 } 1616 } 1617 1618 if (spdk_unlikely(rc == -ENOMEM)) { 1619 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH_MD); 1620 } else { 1621 assert(bdev_io->internal.data_transfer_cpl); 1622 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1623 } 1624 } 1625 1626 static inline void 1627 bdev_io_push_bounce_data_done(struct spdk_bdev_io *bdev_io, int rc) 1628 { 1629 assert(bdev_io->internal.data_transfer_cpl); 1630 if (rc) { 1631 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1632 return; 1633 } 1634 1635 /* set original buffer for this io */ 1636 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 1637 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 1638 /* disable bouncing buffer for this io */ 1639 bdev_io->internal.orig_iovcnt = 0; 1640 bdev_io->internal.orig_iovs = NULL; 1641 1642 bdev_io_push_bounce_md_buf(bdev_io); 1643 } 1644 1645 static void 1646 bdev_io_push_bounce_data_done_and_track(void *ctx, int 
status) 1647 { 1648 struct spdk_bdev_io *bdev_io = ctx; 1649 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1650 1651 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1652 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1653 1654 if (spdk_unlikely(!TAILQ_EMPTY(&ch->shared_resource->nomem_io))) { 1655 bdev_ch_retry_io(ch); 1656 } 1657 1658 bdev_io_push_bounce_data_done(bdev_io, status); 1659 } 1660 1661 static inline void 1662 bdev_io_push_bounce_data(struct spdk_bdev_io *bdev_io) 1663 { 1664 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 1665 int rc = 0; 1666 1667 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 1668 /* if this is read path, copy data from bounce buffer to original buffer */ 1669 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ) { 1670 if (bdev_io_use_memory_domain(bdev_io)) { 1671 TAILQ_INSERT_TAIL(&ch->io_memory_domain, bdev_io, internal.link); 1672 bdev_io_increment_outstanding(ch, ch->shared_resource); 1673 /* If memory domain is used then we need to call async push function */ 1674 rc = spdk_memory_domain_push_data(bdev_io->internal.memory_domain, 1675 bdev_io->internal.memory_domain_ctx, 1676 bdev_io->internal.orig_iovs, 1677 (uint32_t)bdev_io->internal.orig_iovcnt, 1678 &bdev_io->internal.bounce_iov, 1, 1679 bdev_io_push_bounce_data_done_and_track, 1680 bdev_io); 1681 if (rc == 0) { 1682 /* Continue IO completion in async callback */ 1683 return; 1684 } 1685 1686 TAILQ_REMOVE(&ch->io_memory_domain, bdev_io, internal.link); 1687 bdev_io_decrement_outstanding(ch, ch->shared_resource); 1688 if (rc != -ENOMEM) { 1689 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1690 spdk_memory_domain_get_dma_device_id( 1691 bdev_io->internal.memory_domain)); 1692 } 1693 } else { 1694 spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs, 1695 bdev_io->internal.orig_iovcnt, 1696 bdev_io->internal.bounce_iov.iov_base, 1697 bdev_io->internal.bounce_iov.iov_len); 1698 } 1699 } 1700 1701 if (spdk_unlikely(rc == 
-ENOMEM)) { 1702 bdev_queue_nomem_io_head(ch->shared_resource, bdev_io, BDEV_IO_RETRY_STATE_PUSH); 1703 } else { 1704 bdev_io_push_bounce_data_done(bdev_io, rc); 1705 } 1706 } 1707 1708 static inline void 1709 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1710 { 1711 bdev_io->internal.data_transfer_cpl = cpl_cb; 1712 bdev_io_push_bounce_data(bdev_io); 1713 } 1714 1715 static void 1716 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1717 { 1718 struct spdk_bdev_io *bdev_io; 1719 1720 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1721 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len); 1722 } 1723 1724 static void 1725 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1726 { 1727 struct spdk_bdev_mgmt_channel *mgmt_ch; 1728 uint64_t max_len; 1729 void *buf; 1730 1731 assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1732 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1733 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1734 1735 if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) { 1736 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1737 bdev_io_get_buf_complete(bdev_io, false); 1738 return; 1739 } 1740 1741 bdev_io->internal.buf_len = len; 1742 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1743 bdev_io_get_iobuf_cb); 1744 if (buf != NULL) { 1745 _bdev_io_set_buf(bdev_io, buf, len); 1746 } 1747 } 1748 1749 void 1750 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1751 { 1752 struct spdk_bdev *bdev = bdev_io->bdev; 1753 uint64_t alignment; 1754 1755 assert(cb != NULL); 1756 bdev_io->internal.get_buf_cb = cb; 1757 1758 alignment = spdk_bdev_get_buf_align(bdev); 1759 1760 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1761 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1762 /* Buffer already present and 
aligned */ 1763 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1764 return; 1765 } 1766 1767 bdev_io_get_buf(bdev_io, len); 1768 } 1769 1770 static void 1771 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1772 bool success) 1773 { 1774 if (!success) { 1775 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1776 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1777 bdev_io_complete_unsubmitted(bdev_io); 1778 return; 1779 } 1780 1781 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 1782 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 1783 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 1784 return; 1785 } 1786 /* For reads we'll execute the sequence after the data is read, so, for now, only 1787 * clear out accel_sequence pointer and submit the IO */ 1788 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 1789 bdev_io->u.bdev.accel_sequence = NULL; 1790 } 1791 1792 bdev_io_submit(bdev_io); 1793 } 1794 1795 static void 1796 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1797 uint64_t len) 1798 { 1799 assert(cb != NULL); 1800 bdev_io->internal.get_buf_cb = cb; 1801 1802 bdev_io_get_buf(bdev_io, len); 1803 } 1804 1805 void 1806 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1807 { 1808 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1809 1810 assert(cb != NULL); 1811 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1812 bdev_io->internal.get_aux_buf_cb = cb; 1813 bdev_io_get_buf(bdev_io, len); 1814 } 1815 1816 static int 1817 bdev_module_get_max_ctx_size(void) 1818 { 1819 struct spdk_bdev_module *bdev_module; 1820 int max_bdev_module_size = 0; 1821 1822 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1823 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1824 max_bdev_module_size = bdev_module->get_ctx_size(); 1825 } 
1826 } 1827 1828 return max_bdev_module_size; 1829 } 1830 1831 static void 1832 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1833 { 1834 int i; 1835 struct spdk_bdev_qos *qos = bdev->internal.qos; 1836 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1837 1838 if (!qos) { 1839 return; 1840 } 1841 1842 spdk_bdev_get_qos_rate_limits(bdev, limits); 1843 1844 spdk_json_write_object_begin(w); 1845 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1846 1847 spdk_json_write_named_object_begin(w, "params"); 1848 spdk_json_write_named_string(w, "name", bdev->name); 1849 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1850 if (limits[i] > 0) { 1851 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1852 } 1853 } 1854 spdk_json_write_object_end(w); 1855 1856 spdk_json_write_object_end(w); 1857 } 1858 1859 void 1860 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1861 { 1862 struct spdk_bdev_module *bdev_module; 1863 struct spdk_bdev *bdev; 1864 1865 assert(w != NULL); 1866 1867 spdk_json_write_array_begin(w); 1868 1869 spdk_json_write_object_begin(w); 1870 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1871 spdk_json_write_named_object_begin(w, "params"); 1872 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1873 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1874 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1875 spdk_json_write_object_end(w); 1876 spdk_json_write_object_end(w); 1877 1878 bdev_examine_allowlist_config_json(w); 1879 1880 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1881 if (bdev_module->config_json) { 1882 bdev_module->config_json(w); 1883 } 1884 } 1885 1886 spdk_spin_lock(&g_bdev_mgr.spinlock); 1887 1888 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1889 if (bdev->fn_table->write_config_json) { 1890 
bdev->fn_table->write_config_json(bdev, w); 1891 } 1892 1893 bdev_qos_config_json(bdev, w); 1894 } 1895 1896 spdk_spin_unlock(&g_bdev_mgr.spinlock); 1897 1898 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1899 spdk_json_write_object_begin(w); 1900 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1901 spdk_json_write_object_end(w); 1902 1903 spdk_json_write_array_end(w); 1904 } 1905 1906 static void 1907 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1908 { 1909 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1910 struct spdk_bdev_io *bdev_io; 1911 1912 spdk_iobuf_channel_fini(&ch->iobuf); 1913 1914 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1915 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1916 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1917 ch->per_thread_cache_count--; 1918 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1919 } 1920 1921 assert(ch->per_thread_cache_count == 0); 1922 } 1923 1924 static int 1925 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1926 { 1927 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1928 struct spdk_bdev_io *bdev_io; 1929 uint32_t i; 1930 int rc; 1931 1932 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", BUF_SMALL_CACHE_SIZE, BUF_LARGE_CACHE_SIZE); 1933 if (rc != 0) { 1934 SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 1935 return -1; 1936 } 1937 1938 STAILQ_INIT(&ch->per_thread_cache); 1939 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 1940 1941 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. 
*/ 1942 ch->per_thread_cache_count = 0; 1943 for (i = 0; i < ch->bdev_io_cache_size; i++) { 1944 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1945 if (bdev_io == NULL) { 1946 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 1947 assert(false); 1948 bdev_mgmt_channel_destroy(io_device, ctx_buf); 1949 return -1; 1950 } 1951 ch->per_thread_cache_count++; 1952 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1953 } 1954 1955 TAILQ_INIT(&ch->shared_resources); 1956 TAILQ_INIT(&ch->io_wait_queue); 1957 1958 return 0; 1959 } 1960 1961 static void 1962 bdev_init_complete(int rc) 1963 { 1964 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 1965 void *cb_arg = g_init_cb_arg; 1966 struct spdk_bdev_module *m; 1967 1968 g_bdev_mgr.init_complete = true; 1969 g_init_cb_fn = NULL; 1970 g_init_cb_arg = NULL; 1971 1972 /* 1973 * For modules that need to know when subsystem init is complete, 1974 * inform them now. 1975 */ 1976 if (rc == 0) { 1977 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1978 if (m->init_complete) { 1979 m->init_complete(); 1980 } 1981 } 1982 } 1983 1984 cb_fn(cb_arg, rc); 1985 } 1986 1987 static bool 1988 bdev_module_all_actions_completed(void) 1989 { 1990 struct spdk_bdev_module *m; 1991 1992 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1993 if (m->internal.action_in_progress > 0) { 1994 return false; 1995 } 1996 } 1997 return true; 1998 } 1999 2000 static void 2001 bdev_module_action_complete(void) 2002 { 2003 /* 2004 * Don't finish bdev subsystem initialization if 2005 * module pre-initialization is still in progress, or 2006 * the subsystem been already initialized. 2007 */ 2008 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 2009 return; 2010 } 2011 2012 /* 2013 * Check all bdev modules for inits/examinations in progress. If any 2014 * exist, return immediately since we cannot finish bdev subsystem 2015 * initialization until all are completed. 
2016 */ 2017 if (!bdev_module_all_actions_completed()) { 2018 return; 2019 } 2020 2021 /* 2022 * Modules already finished initialization - now that all 2023 * the bdev modules have finished their asynchronous I/O 2024 * processing, the entire bdev layer can be marked as complete. 2025 */ 2026 bdev_init_complete(0); 2027 } 2028 2029 static void 2030 bdev_module_action_done(struct spdk_bdev_module *module) 2031 { 2032 spdk_spin_lock(&module->internal.spinlock); 2033 assert(module->internal.action_in_progress > 0); 2034 module->internal.action_in_progress--; 2035 spdk_spin_unlock(&module->internal.spinlock); 2036 bdev_module_action_complete(); 2037 } 2038 2039 void 2040 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 2041 { 2042 assert(module->async_init); 2043 bdev_module_action_done(module); 2044 } 2045 2046 void 2047 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 2048 { 2049 bdev_module_action_done(module); 2050 } 2051 2052 /** The last initialized bdev module */ 2053 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 2054 2055 static void 2056 bdev_init_failed(void *cb_arg) 2057 { 2058 struct spdk_bdev_module *module = cb_arg; 2059 2060 spdk_spin_lock(&module->internal.spinlock); 2061 assert(module->internal.action_in_progress > 0); 2062 module->internal.action_in_progress--; 2063 spdk_spin_unlock(&module->internal.spinlock); 2064 bdev_init_complete(-1); 2065 } 2066 2067 static int 2068 bdev_modules_init(void) 2069 { 2070 struct spdk_bdev_module *module; 2071 int rc = 0; 2072 2073 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 2074 g_resume_bdev_module = module; 2075 if (module->async_init) { 2076 spdk_spin_lock(&module->internal.spinlock); 2077 module->internal.action_in_progress = 1; 2078 spdk_spin_unlock(&module->internal.spinlock); 2079 } 2080 rc = module->module_init(); 2081 if (rc != 0) { 2082 /* Bump action_in_progress to prevent other modules from completion of modules_init 2083 * Send message to 
defer application shutdown until resources are cleaned up */ 2084 spdk_spin_lock(&module->internal.spinlock); 2085 module->internal.action_in_progress = 1; 2086 spdk_spin_unlock(&module->internal.spinlock); 2087 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 2088 return rc; 2089 } 2090 } 2091 2092 g_resume_bdev_module = NULL; 2093 return 0; 2094 } 2095 2096 void 2097 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 2098 { 2099 int rc = 0; 2100 char mempool_name[32]; 2101 2102 assert(cb_fn != NULL); 2103 2104 g_init_cb_fn = cb_fn; 2105 g_init_cb_arg = cb_arg; 2106 2107 spdk_notify_type_register("bdev_register"); 2108 spdk_notify_type_register("bdev_unregister"); 2109 2110 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 2111 2112 rc = spdk_iobuf_register_module("bdev"); 2113 if (rc != 0) { 2114 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 2115 bdev_init_complete(-1); 2116 return; 2117 } 2118 2119 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 2120 g_bdev_opts.bdev_io_pool_size, 2121 sizeof(struct spdk_bdev_io) + 2122 bdev_module_get_max_ctx_size(), 2123 0, 2124 SPDK_ENV_SOCKET_ID_ANY); 2125 2126 if (g_bdev_mgr.bdev_io_pool == NULL) { 2127 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 2128 bdev_init_complete(-1); 2129 return; 2130 } 2131 2132 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 2133 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 2134 if (!g_bdev_mgr.zero_buffer) { 2135 SPDK_ERRLOG("create bdev zero buffer failed\n"); 2136 bdev_init_complete(-1); 2137 return; 2138 } 2139 2140 #ifdef SPDK_CONFIG_VTUNE 2141 SPDK_LOG_DEPRECATED(vtune_support); 2142 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 2143 #endif 2144 2145 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 2146 bdev_mgmt_channel_destroy, 2147 sizeof(struct spdk_bdev_mgmt_channel), 2148 "bdev_mgr"); 2149 2150 rc = bdev_modules_init(); 2151 
g_bdev_mgr.module_init_complete = true; 2152 if (rc != 0) { 2153 SPDK_ERRLOG("bdev modules init failed\n"); 2154 return; 2155 } 2156 2157 bdev_module_action_complete(); 2158 } 2159 2160 static void 2161 bdev_mgr_unregister_cb(void *io_device) 2162 { 2163 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 2164 2165 if (g_bdev_mgr.bdev_io_pool) { 2166 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 2167 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 2168 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 2169 g_bdev_opts.bdev_io_pool_size); 2170 } 2171 2172 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 2173 } 2174 2175 spdk_free(g_bdev_mgr.zero_buffer); 2176 2177 bdev_examine_allowlist_free(); 2178 2179 cb_fn(g_fini_cb_arg); 2180 g_fini_cb_fn = NULL; 2181 g_fini_cb_arg = NULL; 2182 g_bdev_mgr.init_complete = false; 2183 g_bdev_mgr.module_init_complete = false; 2184 } 2185 2186 static void 2187 bdev_module_fini_iter(void *arg) 2188 { 2189 struct spdk_bdev_module *bdev_module; 2190 2191 /* FIXME: Handling initialization failures is broken now, 2192 * so we won't even try cleaning up after successfully 2193 * initialized modules. if module_init_complete is false, 2194 * just call spdk_bdev_mgr_unregister_cb 2195 */ 2196 if (!g_bdev_mgr.module_init_complete) { 2197 bdev_mgr_unregister_cb(NULL); 2198 return; 2199 } 2200 2201 /* Start iterating from the last touched module */ 2202 if (!g_resume_bdev_module) { 2203 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2204 } else { 2205 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 2206 internal.tailq); 2207 } 2208 2209 while (bdev_module) { 2210 if (bdev_module->async_fini) { 2211 /* Save our place so we can resume later. We must 2212 * save the variable here, before calling module_fini() 2213 * below, because in some cases the module may immediately 2214 * call spdk_bdev_module_fini_done() and re-enter 2215 * this function to continue iterating. 
*/ 2216 g_resume_bdev_module = bdev_module; 2217 } 2218 2219 if (bdev_module->module_fini) { 2220 bdev_module->module_fini(); 2221 } 2222 2223 if (bdev_module->async_fini) { 2224 return; 2225 } 2226 2227 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 2228 internal.tailq); 2229 } 2230 2231 g_resume_bdev_module = NULL; 2232 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 2233 } 2234 2235 void 2236 spdk_bdev_module_fini_done(void) 2237 { 2238 if (spdk_get_thread() != g_fini_thread) { 2239 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 2240 } else { 2241 bdev_module_fini_iter(NULL); 2242 } 2243 } 2244 2245 static void 2246 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 2247 { 2248 struct spdk_bdev *bdev = cb_arg; 2249 2250 if (bdeverrno && bdev) { 2251 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 2252 bdev->name); 2253 2254 /* 2255 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 2256 * bdev; try to continue by manually removing this bdev from the list and continue 2257 * with the next bdev in the list. 2258 */ 2259 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 2260 } 2261 2262 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 2263 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 2264 /* 2265 * Bdev module finish need to be deferred as we might be in the middle of some context 2266 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 2267 * after returning. 2268 */ 2269 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 2270 return; 2271 } 2272 2273 /* 2274 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 2275 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 2276 * to detect clean shutdown as opposed to run-time hot removal of the underlying 2277 * base bdevs. 2278 * 2279 * Also, walk the list in the reverse order. 
2280 */ 2281 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2282 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2283 spdk_spin_lock(&bdev->internal.spinlock); 2284 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 2285 LOG_ALREADY_CLAIMED_DEBUG("claimed, skipping", bdev); 2286 spdk_spin_unlock(&bdev->internal.spinlock); 2287 continue; 2288 } 2289 spdk_spin_unlock(&bdev->internal.spinlock); 2290 2291 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 2292 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2293 return; 2294 } 2295 2296 /* 2297 * If any bdev fails to unclaim underlying bdev properly, we may face the 2298 * case of bdev list consisting of claimed bdevs only (if claims are managed 2299 * correctly, this would mean there's a loop in the claims graph which is 2300 * clearly impossible). Warn and unregister last bdev on the list then. 2301 */ 2302 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 2303 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 2304 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 2305 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 2306 return; 2307 } 2308 } 2309 2310 static void 2311 bdev_module_fini_start_iter(void *arg) 2312 { 2313 struct spdk_bdev_module *bdev_module; 2314 2315 if (!g_resume_bdev_module) { 2316 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 2317 } else { 2318 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 2319 } 2320 2321 while (bdev_module) { 2322 if (bdev_module->async_fini_start) { 2323 /* Save our place so we can resume later. We must 2324 * save the variable here, before calling fini_start() 2325 * below, because in some cases the module may immediately 2326 * call spdk_bdev_module_fini_start_done() and re-enter 2327 * this function to continue iterating. 
*/ 2328 g_resume_bdev_module = bdev_module; 2329 } 2330 2331 if (bdev_module->fini_start) { 2332 bdev_module->fini_start(); 2333 } 2334 2335 if (bdev_module->async_fini_start) { 2336 return; 2337 } 2338 2339 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 2340 } 2341 2342 g_resume_bdev_module = NULL; 2343 2344 bdev_finish_unregister_bdevs_iter(NULL, 0); 2345 } 2346 2347 void 2348 spdk_bdev_module_fini_start_done(void) 2349 { 2350 if (spdk_get_thread() != g_fini_thread) { 2351 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 2352 } else { 2353 bdev_module_fini_start_iter(NULL); 2354 } 2355 } 2356 2357 static void 2358 bdev_finish_wait_for_examine_done(void *cb_arg) 2359 { 2360 bdev_module_fini_start_iter(NULL); 2361 } 2362 2363 void 2364 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 2365 { 2366 int rc; 2367 2368 assert(cb_fn != NULL); 2369 2370 g_fini_thread = spdk_get_thread(); 2371 2372 g_fini_cb_fn = cb_fn; 2373 g_fini_cb_arg = cb_arg; 2374 2375 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 2376 if (rc != 0) { 2377 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 2378 bdev_finish_wait_for_examine_done(NULL); 2379 } 2380 } 2381 2382 struct spdk_bdev_io * 2383 bdev_channel_get_io(struct spdk_bdev_channel *channel) 2384 { 2385 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 2386 struct spdk_bdev_io *bdev_io; 2387 2388 if (ch->per_thread_cache_count > 0) { 2389 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 2390 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 2391 ch->per_thread_cache_count--; 2392 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 2393 /* 2394 * Don't try to look for bdev_ios in the global pool if there are 2395 * waiters on bdev_ios - we don't want this caller to jump the line. 
2396 */ 2397 bdev_io = NULL; 2398 } else { 2399 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 2400 } 2401 2402 return bdev_io; 2403 } 2404 2405 void 2406 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 2407 { 2408 struct spdk_bdev_mgmt_channel *ch; 2409 2410 assert(bdev_io != NULL); 2411 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 2412 2413 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 2414 2415 if (bdev_io->internal.buf != NULL) { 2416 bdev_io_put_buf(bdev_io); 2417 } 2418 2419 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 2420 ch->per_thread_cache_count++; 2421 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 2422 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 2423 struct spdk_bdev_io_wait_entry *entry; 2424 2425 entry = TAILQ_FIRST(&ch->io_wait_queue); 2426 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 2427 entry->cb_fn(entry->cb_arg); 2428 } 2429 } else { 2430 /* We should never have a full cache with entries on the io wait queue. 
*/ 2431 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 2432 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 2433 } 2434 } 2435 2436 static bool 2437 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 2438 { 2439 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 2440 2441 switch (limit) { 2442 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2443 return true; 2444 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2445 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2446 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2447 return false; 2448 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 2449 default: 2450 return false; 2451 } 2452 } 2453 2454 static bool 2455 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2456 { 2457 switch (bdev_io->type) { 2458 case SPDK_BDEV_IO_TYPE_NVME_IO: 2459 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2460 case SPDK_BDEV_IO_TYPE_READ: 2461 case SPDK_BDEV_IO_TYPE_WRITE: 2462 return true; 2463 case SPDK_BDEV_IO_TYPE_ZCOPY: 2464 if (bdev_io->u.bdev.zcopy.start) { 2465 return true; 2466 } else { 2467 return false; 2468 } 2469 default: 2470 return false; 2471 } 2472 } 2473 2474 static bool 2475 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2476 { 2477 switch (bdev_io->type) { 2478 case SPDK_BDEV_IO_TYPE_NVME_IO: 2479 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2480 /* Bit 1 (0x2) set for read operation */ 2481 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2482 return true; 2483 } else { 2484 return false; 2485 } 2486 case SPDK_BDEV_IO_TYPE_READ: 2487 return true; 2488 case SPDK_BDEV_IO_TYPE_ZCOPY: 2489 /* Populate to read from disk */ 2490 if (bdev_io->u.bdev.zcopy.populate) { 2491 return true; 2492 } else { 2493 return false; 2494 } 2495 default: 2496 return false; 2497 } 2498 } 2499 2500 static uint64_t 2501 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2502 { 2503 struct spdk_bdev *bdev = bdev_io->bdev; 2504 2505 switch (bdev_io->type) { 2506 case SPDK_BDEV_IO_TYPE_NVME_IO: 2507 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2508 return 
/*
 * update_quota handler for the read bytes-per-second limit.
 *
 * Non-read I/O is ignored. For reads the I/O is charged against the limit
 * through bdev_qos_rw_bps_update_quota().
 */
static void
bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io)
{
	if (bdev_is_read_io(io) == false) {
		return;
	}

	/* Plain call: ISO C does not permit `return <expr>;` in a void function. */
	bdev_qos_rw_bps_update_quota(limit, io);
}
bdev_qos_rw_bps_update_quota(limit, io); 2584 } 2585 2586 static void 2587 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2588 { 2589 int i; 2590 2591 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2592 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2593 qos->rate_limits[i].queue_io = NULL; 2594 qos->rate_limits[i].update_quota = NULL; 2595 continue; 2596 } 2597 2598 switch (i) { 2599 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2600 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2601 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2602 break; 2603 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2604 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2605 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2606 break; 2607 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2608 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2609 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2610 break; 2611 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2612 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2613 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2614 break; 2615 default: 2616 break; 2617 } 2618 } 2619 } 2620 2621 static void 2622 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2623 struct spdk_bdev_io *bdev_io, 2624 enum spdk_bdev_io_status status) 2625 { 2626 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2627 2628 bdev_io->internal.in_submit_request = true; 2629 bdev_ch->io_outstanding++; 2630 shared_resource->io_outstanding++; 2631 spdk_bdev_io_complete(bdev_io, status); 2632 bdev_io->internal.in_submit_request = false; 2633 } 2634 2635 static inline void 2636 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2637 { 2638 struct spdk_bdev *bdev = bdev_io->bdev; 2639 struct spdk_io_channel *ch = bdev_ch->channel; 2640 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2641 2642 if 
(spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2643 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2644 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2645 2646 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2647 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2648 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2649 SPDK_BDEV_IO_STATUS_SUCCESS); 2650 return; 2651 } 2652 } 2653 2654 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2655 bdev_io->bdev->split_on_write_unit && 2656 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2657 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2658 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2659 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2660 return; 2661 } 2662 2663 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2664 bdev_ch->io_outstanding++; 2665 shared_resource->io_outstanding++; 2666 bdev_io->internal.in_submit_request = true; 2667 bdev_submit_request(bdev, ch, bdev_io); 2668 bdev_io->internal.in_submit_request = false; 2669 } else { 2670 bdev_queue_nomem_io_tail(shared_resource, bdev_io, BDEV_IO_RETRY_STATE_SUBMIT); 2671 } 2672 } 2673 2674 static bool 2675 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2676 { 2677 int i; 2678 2679 if (bdev_qos_io_to_limit(bdev_io) == true) { 2680 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2681 if (!qos->rate_limits[i].queue_io) { 2682 continue; 2683 } 2684 2685 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2686 bdev_io) == true) { 2687 return true; 2688 } 2689 } 2690 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2691 if (!qos->rate_limits[i].update_quota) { 2692 continue; 2693 } 2694 2695 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2696 } 2697 } 2698 2699 return false; 2700 } 2701 2702 static inline void 2703 
_bdev_io_do_submit(void *ctx) 2704 { 2705 struct spdk_bdev_io *bdev_io = ctx; 2706 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2707 2708 bdev_io_do_submit(ch, bdev_io); 2709 } 2710 2711 static int 2712 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2713 { 2714 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2715 int submitted_ios = 0; 2716 2717 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2718 if (!bdev_qos_queue_io(qos, bdev_io)) { 2719 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2720 2721 if (bdev_io->internal.io_submit_ch) { 2722 /* Send back the IO to the original thread for the actual processing. */ 2723 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 2724 bdev_io->internal.io_submit_ch = NULL; 2725 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 2726 _bdev_io_do_submit, bdev_io); 2727 } else { 2728 bdev_io_do_submit(ch, bdev_io); 2729 } 2730 2731 submitted_ios++; 2732 } 2733 } 2734 2735 return submitted_ios; 2736 } 2737 2738 static void 2739 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2740 { 2741 int rc; 2742 2743 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2744 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2745 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2746 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2747 &bdev_io->internal.waitq_entry); 2748 if (rc != 0) { 2749 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2750 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2751 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2752 } 2753 } 2754 2755 static bool 2756 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2757 { 2758 uint32_t io_boundary; 2759 struct spdk_bdev *bdev = bdev_io->bdev; 2760 uint32_t max_size = bdev->max_segment_size; 2761 int max_segs = bdev->max_num_segments; 2762 2763 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) 
{ 2764 io_boundary = bdev->write_unit_size; 2765 } else if (bdev->split_on_optimal_io_boundary) { 2766 io_boundary = bdev->optimal_io_boundary; 2767 } else { 2768 io_boundary = 0; 2769 } 2770 2771 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2772 return false; 2773 } 2774 2775 if (io_boundary) { 2776 uint64_t start_stripe, end_stripe; 2777 2778 start_stripe = bdev_io->u.bdev.offset_blocks; 2779 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2780 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 2781 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2782 start_stripe >>= spdk_u32log2(io_boundary); 2783 end_stripe >>= spdk_u32log2(io_boundary); 2784 } else { 2785 start_stripe /= io_boundary; 2786 end_stripe /= io_boundary; 2787 } 2788 2789 if (start_stripe != end_stripe) { 2790 return true; 2791 } 2792 } 2793 2794 if (max_segs) { 2795 if (bdev_io->u.bdev.iovcnt > max_segs) { 2796 return true; 2797 } 2798 } 2799 2800 if (max_size) { 2801 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2802 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2803 return true; 2804 } 2805 } 2806 } 2807 2808 return false; 2809 } 2810 2811 static bool 2812 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2813 { 2814 uint32_t num_unmap_segments; 2815 2816 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2817 return false; 2818 } 2819 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2820 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2821 return true; 2822 } 2823 2824 return false; 2825 } 2826 2827 static bool 2828 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2829 { 2830 if (!bdev_io->bdev->max_write_zeroes) { 2831 return false; 2832 } 2833 2834 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2835 return true; 2836 } 2837 2838 return false; 2839 } 2840 2841 static bool 2842 
/*
 * Number of blocks from offset up to the next multiple of boundary.
 *
 * The result is in [1, boundary]: an offset that already sits on a boundary
 * yields a full boundary's worth of blocks. boundary must be non-zero.
 */
static uint32_t
_to_next_boundary(uint64_t offset, uint32_t boundary)
{
	const uint64_t past_boundary = offset % boundary;

	return (uint32_t)(boundary - past_boundary);
}
io_wait_fn; 2911 2912 current_offset = *offset; 2913 current_remaining = *remaining; 2914 2915 bdev_io->u.bdev.split_outstanding++; 2916 2917 io_wait_fn = _bdev_rw_split; 2918 switch (bdev_io->type) { 2919 case SPDK_BDEV_IO_TYPE_READ: 2920 assert(bdev_io->u.bdev.accel_sequence == NULL); 2921 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2922 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2923 iov, iovcnt, md_buf, current_offset, 2924 num_blocks, bdev_io->internal.memory_domain, 2925 bdev_io->internal.memory_domain_ctx, NULL, 2926 bdev_io_split_done, bdev_io); 2927 break; 2928 case SPDK_BDEV_IO_TYPE_WRITE: 2929 assert(bdev_io->u.bdev.accel_sequence == NULL); 2930 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2931 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2932 iov, iovcnt, md_buf, current_offset, 2933 num_blocks, bdev_io->internal.memory_domain, 2934 bdev_io->internal.memory_domain_ctx, NULL, 2935 bdev_io_split_done, bdev_io); 2936 break; 2937 case SPDK_BDEV_IO_TYPE_UNMAP: 2938 io_wait_fn = _bdev_unmap_split; 2939 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2940 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2941 current_offset, num_blocks, 2942 bdev_io_split_done, bdev_io); 2943 break; 2944 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2945 io_wait_fn = _bdev_write_zeroes_split; 2946 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2947 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2948 current_offset, num_blocks, 2949 bdev_io_split_done, bdev_io); 2950 break; 2951 case SPDK_BDEV_IO_TYPE_COPY: 2952 io_wait_fn = _bdev_copy_split; 2953 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 2954 (current_offset - bdev_io->u.bdev.offset_blocks); 2955 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 2956 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2957 current_offset, current_src_offset, num_blocks, 2958 bdev_io_split_done, bdev_io); 2959 break; 2960 default: 2961 assert(false); 2962 rc = -EINVAL; 2963 break; 2964 } 2965 
2966 if (rc == 0) { 2967 current_offset += num_blocks; 2968 current_remaining -= num_blocks; 2969 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2970 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2971 *offset = current_offset; 2972 *remaining = current_remaining; 2973 } else { 2974 bdev_io->u.bdev.split_outstanding--; 2975 if (rc == -ENOMEM) { 2976 if (bdev_io->u.bdev.split_outstanding == 0) { 2977 /* No I/O is outstanding. Hence we should wait here. */ 2978 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2979 } 2980 } else { 2981 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2982 if (bdev_io->u.bdev.split_outstanding == 0) { 2983 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2984 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2985 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2986 } 2987 } 2988 } 2989 2990 return rc; 2991 } 2992 2993 static void 2994 _bdev_rw_split(void *_bdev_io) 2995 { 2996 struct iovec *parent_iov, *iov; 2997 struct spdk_bdev_io *bdev_io = _bdev_io; 2998 struct spdk_bdev *bdev = bdev_io->bdev; 2999 uint64_t parent_offset, current_offset, remaining; 3000 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 3001 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 3002 uint32_t iovcnt, iov_len, child_iovsize; 3003 uint32_t blocklen = bdev->blocklen; 3004 uint32_t io_boundary; 3005 uint32_t max_segment_size = bdev->max_segment_size; 3006 uint32_t max_child_iovcnt = bdev->max_num_segments; 3007 void *md_buf = NULL; 3008 int rc; 3009 3010 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 3011 max_child_iovcnt = max_child_iovcnt ? 
spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 3012 SPDK_BDEV_IO_NUM_CHILD_IOV; 3013 3014 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 3015 io_boundary = bdev->write_unit_size; 3016 } else if (bdev->split_on_optimal_io_boundary) { 3017 io_boundary = bdev->optimal_io_boundary; 3018 } else { 3019 io_boundary = UINT32_MAX; 3020 } 3021 3022 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3023 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 3024 parent_offset = bdev_io->u.bdev.offset_blocks; 3025 parent_iov_offset = (current_offset - parent_offset) * blocklen; 3026 parent_iovcnt = bdev_io->u.bdev.iovcnt; 3027 3028 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 3029 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3030 if (parent_iov_offset < parent_iov->iov_len) { 3031 break; 3032 } 3033 parent_iov_offset -= parent_iov->iov_len; 3034 } 3035 3036 child_iovcnt = 0; 3037 while (remaining > 0 && parent_iovpos < parent_iovcnt && 3038 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 3039 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 3040 to_next_boundary = spdk_min(remaining, to_next_boundary); 3041 to_next_boundary_bytes = to_next_boundary * blocklen; 3042 3043 iov = &bdev_io->child_iov[child_iovcnt]; 3044 iovcnt = 0; 3045 3046 if (bdev_io->u.bdev.md_buf) { 3047 md_buf = (char *)bdev_io->u.bdev.md_buf + 3048 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 3049 } 3050 3051 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 3052 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 3053 iovcnt < child_iovsize) { 3054 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 3055 iov_len = parent_iov->iov_len - parent_iov_offset; 3056 3057 iov_len = spdk_min(iov_len, max_segment_size); 3058 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 3059 to_next_boundary_bytes -= iov_len; 3060 3061 
bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 3062 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 3063 3064 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 3065 parent_iov_offset += iov_len; 3066 } else { 3067 parent_iovpos++; 3068 parent_iov_offset = 0; 3069 } 3070 child_iovcnt++; 3071 iovcnt++; 3072 } 3073 3074 if (to_next_boundary_bytes > 0) { 3075 /* We had to stop this child I/O early because we ran out of 3076 * child_iov space or were limited by max_num_segments. 3077 * Ensure the iovs to be aligned with block size and 3078 * then adjust to_next_boundary before starting the 3079 * child I/O. 3080 */ 3081 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || 3082 iovcnt == child_iovsize); 3083 to_last_block_bytes = to_next_boundary_bytes % blocklen; 3084 if (to_last_block_bytes != 0) { 3085 uint32_t child_iovpos = child_iovcnt - 1; 3086 /* don't decrease child_iovcnt when it equals to SPDK_BDEV_IO_NUM_CHILD_IOV 3087 * so the loop will naturally end 3088 */ 3089 3090 to_last_block_bytes = blocklen - to_last_block_bytes; 3091 to_next_boundary_bytes += to_last_block_bytes; 3092 while (to_last_block_bytes > 0 && iovcnt > 0) { 3093 iov_len = spdk_min(to_last_block_bytes, 3094 bdev_io->child_iov[child_iovpos].iov_len); 3095 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 3096 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 3097 child_iovpos--; 3098 if (--iovcnt == 0) { 3099 /* If the child IO is less than a block size just return. 3100 * If the first child IO of any split round is less than 3101 * a block size, an error exit. 
3102 */ 3103 if (bdev_io->u.bdev.split_outstanding == 0) { 3104 SPDK_ERRLOG("The first child io was less than a block size\n"); 3105 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3106 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 3107 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 3108 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 3109 } 3110 3111 return; 3112 } 3113 } 3114 3115 to_last_block_bytes -= iov_len; 3116 3117 if (parent_iov_offset == 0) { 3118 parent_iovpos--; 3119 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 3120 } 3121 parent_iov_offset -= iov_len; 3122 } 3123 3124 assert(to_last_block_bytes == 0); 3125 } 3126 to_next_boundary -= to_next_boundary_bytes / blocklen; 3127 } 3128 3129 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 3130 ¤t_offset, &remaining); 3131 if (spdk_unlikely(rc)) { 3132 return; 3133 } 3134 } 3135 } 3136 3137 static void 3138 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 3139 { 3140 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 3141 uint32_t num_children_reqs = 0; 3142 int rc; 3143 3144 offset = bdev_io->u.bdev.split_current_offset_blocks; 3145 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3146 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 3147 3148 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3149 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 3150 3151 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 3152 &offset, &remaining); 3153 if (spdk_likely(rc == 0)) { 3154 num_children_reqs++; 3155 } else { 3156 return; 3157 } 3158 } 3159 } 3160 3161 static void 3162 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 3163 { 3164 uint64_t offset, write_zeroes_blocks, remaining; 3165 uint32_t num_children_reqs = 0; 3166 int rc; 3167 3168 offset = 
bdev_io->u.bdev.split_current_offset_blocks; 3169 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3170 3171 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 3172 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 3173 3174 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 3175 &offset, &remaining); 3176 if (spdk_likely(rc == 0)) { 3177 num_children_reqs++; 3178 } else { 3179 return; 3180 } 3181 } 3182 } 3183 3184 static void 3185 bdev_copy_split(struct spdk_bdev_io *bdev_io) 3186 { 3187 uint64_t offset, copy_blocks, remaining; 3188 uint32_t num_children_reqs = 0; 3189 int rc; 3190 3191 offset = bdev_io->u.bdev.split_current_offset_blocks; 3192 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 3193 3194 assert(bdev_io->bdev->max_copy != 0); 3195 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 3196 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 3197 3198 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 3199 &offset, &remaining); 3200 if (spdk_likely(rc == 0)) { 3201 num_children_reqs++; 3202 } else { 3203 return; 3204 } 3205 } 3206 } 3207 3208 static void 3209 parent_bdev_io_complete(void *ctx, int rc) 3210 { 3211 struct spdk_bdev_io *parent_io = ctx; 3212 3213 if (rc) { 3214 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3215 } 3216 3217 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 3218 parent_io->internal.caller_ctx); 3219 } 3220 3221 static void 3222 bdev_io_complete_parent_sequence_cb(void *ctx, int status) 3223 { 3224 struct spdk_bdev_io *bdev_io = ctx; 3225 3226 /* u.bdev.accel_sequence should have already been cleared at this point */ 3227 assert(bdev_io->u.bdev.accel_sequence == NULL); 3228 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 3229 bdev_io->internal.accel_sequence = NULL; 3230 3231 if (spdk_unlikely(status != 0)) { 
3232 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 3233 } 3234 3235 parent_bdev_io_complete(bdev_io, status); 3236 } 3237 3238 static void 3239 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 3240 { 3241 struct spdk_bdev_io *parent_io = cb_arg; 3242 3243 spdk_bdev_free_io(bdev_io); 3244 3245 if (!success) { 3246 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 3247 /* If any child I/O failed, stop further splitting process. */ 3248 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 3249 parent_io->u.bdev.split_remaining_num_blocks = 0; 3250 } 3251 parent_io->u.bdev.split_outstanding--; 3252 if (parent_io->u.bdev.split_outstanding != 0) { 3253 return; 3254 } 3255 3256 /* 3257 * Parent I/O finishes when all blocks are consumed. 3258 */ 3259 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 3260 assert(parent_io->internal.cb != bdev_io_split_done); 3261 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 3262 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 3263 3264 if (spdk_likely(parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 3265 if (bdev_io_needs_sequence_exec(parent_io->internal.desc, parent_io)) { 3266 bdev_io_exec_sequence(parent_io, bdev_io_complete_parent_sequence_cb); 3267 return; 3268 } else if (parent_io->internal.orig_iovcnt != 0) { 3269 /* bdev IO will be completed in the callback */ 3270 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 3271 return; 3272 } 3273 } 3274 3275 parent_bdev_io_complete(parent_io, 0); 3276 return; 3277 } 3278 3279 /* 3280 * Continue with the splitting process. This function will complete the parent I/O if the 3281 * splitting is done. 
3282 */ 3283 switch (parent_io->type) { 3284 case SPDK_BDEV_IO_TYPE_READ: 3285 case SPDK_BDEV_IO_TYPE_WRITE: 3286 _bdev_rw_split(parent_io); 3287 break; 3288 case SPDK_BDEV_IO_TYPE_UNMAP: 3289 bdev_unmap_split(parent_io); 3290 break; 3291 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3292 bdev_write_zeroes_split(parent_io); 3293 break; 3294 case SPDK_BDEV_IO_TYPE_COPY: 3295 bdev_copy_split(parent_io); 3296 break; 3297 default: 3298 assert(false); 3299 break; 3300 } 3301 } 3302 3303 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 3304 bool success); 3305 3306 static void 3307 bdev_io_split(struct spdk_bdev_io *bdev_io) 3308 { 3309 assert(bdev_io_should_split(bdev_io)); 3310 3311 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 3312 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 3313 bdev_io->u.bdev.split_outstanding = 0; 3314 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 3315 3316 switch (bdev_io->type) { 3317 case SPDK_BDEV_IO_TYPE_READ: 3318 case SPDK_BDEV_IO_TYPE_WRITE: 3319 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 3320 _bdev_rw_split(bdev_io); 3321 } else { 3322 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3323 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 3324 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3325 } 3326 break; 3327 case SPDK_BDEV_IO_TYPE_UNMAP: 3328 bdev_unmap_split(bdev_io); 3329 break; 3330 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3331 bdev_write_zeroes_split(bdev_io); 3332 break; 3333 case SPDK_BDEV_IO_TYPE_COPY: 3334 bdev_copy_split(bdev_io); 3335 break; 3336 default: 3337 assert(false); 3338 break; 3339 } 3340 } 3341 3342 static void 3343 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 3344 { 3345 if (!success) { 3346 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3347 return; 3348 } 3349 3350 _bdev_rw_split(bdev_io); 3351 } 3352 3353 /* Explicitly mark 
this inline, since it's used as a function pointer and otherwise won't 3354 * be inlined, at least on some compilers. 3355 */ 3356 static inline void 3357 _bdev_io_submit(void *ctx) 3358 { 3359 struct spdk_bdev_io *bdev_io = ctx; 3360 struct spdk_bdev *bdev = bdev_io->bdev; 3361 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3362 3363 if (spdk_likely(bdev_ch->flags == 0)) { 3364 bdev_io_do_submit(bdev_ch, bdev_io); 3365 return; 3366 } 3367 3368 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 3369 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3370 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 3371 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 3372 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 3373 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 3374 } else { 3375 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 3376 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 3377 } 3378 } else { 3379 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 3380 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 3381 } 3382 } 3383 3384 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 3385 3386 bool 3387 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 3388 { 3389 if (range1->length == 0 || range2->length == 0) { 3390 return false; 3391 } 3392 3393 if (range1->offset + range1->length <= range2->offset) { 3394 return false; 3395 } 3396 3397 if (range2->offset + range2->length <= range1->offset) { 3398 return false; 3399 } 3400 3401 return true; 3402 } 3403 3404 static bool 3405 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 3406 { 3407 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3408 struct lba_range r; 3409 3410 switch (bdev_io->type) { 3411 case SPDK_BDEV_IO_TYPE_NVME_IO: 3412 case 
SPDK_BDEV_IO_TYPE_NVME_IO_MD: 3413 /* Don't try to decode the NVMe command - just assume worst-case and that 3414 * it overlaps a locked range. 3415 */ 3416 return true; 3417 case SPDK_BDEV_IO_TYPE_WRITE: 3418 case SPDK_BDEV_IO_TYPE_UNMAP: 3419 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3420 case SPDK_BDEV_IO_TYPE_ZCOPY: 3421 case SPDK_BDEV_IO_TYPE_COPY: 3422 r.offset = bdev_io->u.bdev.offset_blocks; 3423 r.length = bdev_io->u.bdev.num_blocks; 3424 if (!bdev_lba_range_overlapped(range, &r)) { 3425 /* This I/O doesn't overlap the specified LBA range. */ 3426 return false; 3427 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 3428 /* This I/O overlaps, but the I/O is on the same channel that locked this 3429 * range, and the caller_ctx is the same as the locked_ctx. This means 3430 * that this I/O is associated with the lock, and is allowed to execute. 3431 */ 3432 return false; 3433 } else { 3434 return true; 3435 } 3436 default: 3437 return false; 3438 } 3439 } 3440 3441 void 3442 bdev_io_submit(struct spdk_bdev_io *bdev_io) 3443 { 3444 struct spdk_bdev *bdev = bdev_io->bdev; 3445 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 3446 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3447 3448 assert(thread != NULL); 3449 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3450 3451 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 3452 struct lba_range *range; 3453 3454 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 3455 if (bdev_io_range_is_locked(bdev_io, range)) { 3456 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 3457 return; 3458 } 3459 } 3460 } 3461 3462 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 3463 3464 bdev_io->internal.submit_tsc = spdk_get_ticks(); 3465 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 3466 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 3467 bdev_io->u.bdev.offset_blocks, 
bdev_io->u.bdev.num_blocks, 3468 spdk_bdev_get_name(bdev)); 3469 3470 if (bdev_io->internal.split) { 3471 bdev_io_split(bdev_io); 3472 return; 3473 } 3474 3475 if (ch->flags & BDEV_CH_QOS_ENABLED) { 3476 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 3477 _bdev_io_submit(bdev_io); 3478 } else { 3479 bdev_io->internal.io_submit_ch = ch; 3480 bdev_io->internal.ch = bdev->internal.qos->ch; 3481 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 3482 } 3483 } else { 3484 _bdev_io_submit(bdev_io); 3485 } 3486 } 3487 3488 static inline void 3489 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3490 { 3491 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 3492 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3493 * For write operation we need to pull buffers from memory domain before submitting IO. 3494 * Once read operation completes, we need to use memory_domain push functionality to 3495 * update data in original memory domain IO buffer 3496 * This IO request will go through a regular IO flow, so clear memory domains pointers */ 3497 bdev_io->u.bdev.memory_domain = NULL; 3498 bdev_io->u.bdev.memory_domain_ctx = NULL; 3499 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3500 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3501 } 3502 3503 static inline void 3504 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io) 3505 { 3506 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 3507 bool needs_exec = bdev_io_needs_sequence_exec(desc, bdev_io); 3508 3509 if (spdk_unlikely(ch->flags & BDEV_CH_RESET_IN_PROGRESS)) { 3510 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 3511 bdev_io_complete_unsubmitted(bdev_io); 3512 return; 3513 } 3514 3515 /* We need to allocate bounce buffer if bdev doesn't support memory domains, or if it does 3516 * support them, but we need to execute an 
accel sequence and the data buffer is from accel 3517 * memory domain (to avoid doing a push/pull from that domain). 3518 */ 3519 if ((bdev_io->internal.memory_domain && !desc->memory_domains_supported) || 3520 (needs_exec && bdev_io->internal.memory_domain == spdk_accel_get_memory_domain())) { 3521 _bdev_io_ext_use_bounce_buffer(bdev_io); 3522 return; 3523 } 3524 3525 if (needs_exec) { 3526 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 3527 bdev_io_exec_sequence(bdev_io, bdev_io_submit_sequence_cb); 3528 return; 3529 } 3530 /* For reads we'll execute the sequence after the data is read, so, for now, only 3531 * clear out accel_sequence pointer and submit the IO */ 3532 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 3533 bdev_io->u.bdev.accel_sequence = NULL; 3534 } 3535 3536 bdev_io_submit(bdev_io); 3537 } 3538 3539 static void 3540 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3541 { 3542 struct spdk_bdev *bdev = bdev_io->bdev; 3543 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3544 struct spdk_io_channel *ch = bdev_ch->channel; 3545 3546 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3547 3548 bdev_io->internal.in_submit_request = true; 3549 bdev_submit_request(bdev, ch, bdev_io); 3550 bdev_io->internal.in_submit_request = false; 3551 } 3552 3553 void 3554 bdev_io_init(struct spdk_bdev_io *bdev_io, 3555 struct spdk_bdev *bdev, void *cb_arg, 3556 spdk_bdev_io_completion_cb cb) 3557 { 3558 bdev_io->bdev = bdev; 3559 bdev_io->internal.caller_ctx = cb_arg; 3560 bdev_io->internal.cb = cb; 3561 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3562 bdev_io->internal.in_submit_request = false; 3563 bdev_io->internal.buf = NULL; 3564 bdev_io->internal.io_submit_ch = NULL; 3565 bdev_io->internal.orig_iovs = NULL; 3566 bdev_io->internal.orig_iovcnt = 0; 3567 bdev_io->internal.orig_md_iov.iov_base = NULL; 3568 bdev_io->internal.error.nvme.cdw0 = 0; 3569 bdev_io->num_retries = 0; 3570 bdev_io->internal.get_buf_cb = NULL; 3571 
bdev_io->internal.get_aux_buf_cb = NULL; 3572 bdev_io->internal.memory_domain = NULL; 3573 bdev_io->internal.memory_domain_ctx = NULL; 3574 bdev_io->internal.data_transfer_cpl = NULL; 3575 bdev_io->internal.split = bdev_io_should_split(bdev_io); 3576 bdev_io->internal.accel_sequence = NULL; 3577 } 3578 3579 static bool 3580 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3581 { 3582 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3583 } 3584 3585 bool 3586 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3587 { 3588 bool supported; 3589 3590 supported = bdev_io_type_supported(bdev, io_type); 3591 3592 if (!supported) { 3593 switch (io_type) { 3594 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3595 /* The bdev layer will emulate write zeroes as long as write is supported. */ 3596 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3597 break; 3598 default: 3599 break; 3600 } 3601 } 3602 3603 return supported; 3604 } 3605 3606 uint64_t 3607 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3608 { 3609 return bdev_io->internal.submit_tsc; 3610 } 3611 3612 int 3613 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3614 { 3615 if (bdev->fn_table->dump_info_json) { 3616 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3617 } 3618 3619 return 0; 3620 } 3621 3622 static void 3623 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3624 { 3625 uint32_t max_per_timeslice = 0; 3626 int i; 3627 3628 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3629 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3630 qos->rate_limits[i].max_per_timeslice = 0; 3631 continue; 3632 } 3633 3634 max_per_timeslice = qos->rate_limits[i].limit * 3635 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3636 3637 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3638 qos->rate_limits[i].min_per_timeslice); 
3639 3640 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 3641 } 3642 3643 bdev_qos_set_ops(qos); 3644 } 3645 3646 static int 3647 bdev_channel_poll_qos(void *arg) 3648 { 3649 struct spdk_bdev_qos *qos = arg; 3650 uint64_t now = spdk_get_ticks(); 3651 int i; 3652 3653 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3654 /* We received our callback earlier than expected - return 3655 * immediately and wait to do accounting until at least one 3656 * timeslice has actually expired. This should never happen 3657 * with a well-behaved timer implementation. 3658 */ 3659 return SPDK_POLLER_IDLE; 3660 } 3661 3662 /* Reset for next round of rate limiting */ 3663 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3664 /* We may have allowed the IOs or bytes to slightly overrun in the last 3665 * timeslice. remaining_this_timeslice is signed, so if it's negative 3666 * here, we'll account for the overrun so that the next timeslice will 3667 * be appropriately reduced. 
3668 */ 3669 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 3670 qos->rate_limits[i].remaining_this_timeslice = 0; 3671 } 3672 } 3673 3674 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3675 qos->last_timeslice += qos->timeslice_size; 3676 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3677 qos->rate_limits[i].remaining_this_timeslice += 3678 qos->rate_limits[i].max_per_timeslice; 3679 } 3680 } 3681 3682 return bdev_qos_io_submit(qos->ch, qos); 3683 } 3684 3685 static void 3686 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3687 { 3688 struct spdk_bdev_shared_resource *shared_resource; 3689 struct lba_range *range; 3690 3691 bdev_free_io_stat(ch->stat); 3692 #ifdef SPDK_CONFIG_VTUNE 3693 bdev_free_io_stat(ch->prev_stat); 3694 #endif 3695 3696 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3697 range = TAILQ_FIRST(&ch->locked_ranges); 3698 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3699 free(range); 3700 } 3701 3702 spdk_put_io_channel(ch->channel); 3703 spdk_put_io_channel(ch->accel_channel); 3704 3705 shared_resource = ch->shared_resource; 3706 3707 assert(TAILQ_EMPTY(&ch->io_locked)); 3708 assert(TAILQ_EMPTY(&ch->io_submitted)); 3709 assert(TAILQ_EMPTY(&ch->io_accel_exec)); 3710 assert(TAILQ_EMPTY(&ch->io_memory_domain)); 3711 assert(ch->io_outstanding == 0); 3712 assert(shared_resource->ref > 0); 3713 shared_resource->ref--; 3714 if (shared_resource->ref == 0) { 3715 assert(shared_resource->io_outstanding == 0); 3716 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3717 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3718 free(shared_resource); 3719 } 3720 } 3721 3722 static void 3723 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3724 { 3725 struct spdk_bdev_qos *qos = bdev->internal.qos; 3726 int i; 3727 3728 assert(spdk_spin_held(&bdev->internal.spinlock)); 3729 3730 /* Rate limiting on this bdev enabled */ 3731 if (qos) { 3732 if 
(qos->ch == NULL) { 3733 struct spdk_io_channel *io_ch; 3734 3735 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3736 bdev->name, spdk_get_thread()); 3737 3738 /* No qos channel has been selected, so set one up */ 3739 3740 /* Take another reference to ch */ 3741 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3742 assert(io_ch != NULL); 3743 qos->ch = ch; 3744 3745 qos->thread = spdk_io_channel_get_thread(io_ch); 3746 3747 TAILQ_INIT(&qos->queued); 3748 3749 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3750 if (bdev_qos_is_iops_rate_limit(i) == true) { 3751 qos->rate_limits[i].min_per_timeslice = 3752 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3753 } else { 3754 qos->rate_limits[i].min_per_timeslice = 3755 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3756 } 3757 3758 if (qos->rate_limits[i].limit == 0) { 3759 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3760 } 3761 } 3762 bdev_qos_update_max_quota_per_timeslice(qos); 3763 qos->timeslice_size = 3764 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3765 qos->last_timeslice = spdk_get_ticks(); 3766 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3767 qos, 3768 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3769 } 3770 3771 ch->flags |= BDEV_CH_QOS_ENABLED; 3772 } 3773 } 3774 3775 struct poll_timeout_ctx { 3776 struct spdk_bdev_desc *desc; 3777 uint64_t timeout_in_sec; 3778 spdk_bdev_io_timeout_cb cb_fn; 3779 void *cb_arg; 3780 }; 3781 3782 static void 3783 bdev_desc_free(struct spdk_bdev_desc *desc) 3784 { 3785 spdk_spin_destroy(&desc->spinlock); 3786 free(desc->media_events_buffer); 3787 free(desc); 3788 } 3789 3790 static void 3791 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 3792 { 3793 struct poll_timeout_ctx *ctx = _ctx; 3794 struct spdk_bdev_desc *desc = ctx->desc; 3795 3796 free(ctx); 3797 3798 spdk_spin_lock(&desc->spinlock); 3799 desc->refs--; 3800 if (desc->closed == true && desc->refs 
== 0) { 3801 spdk_spin_unlock(&desc->spinlock); 3802 bdev_desc_free(desc); 3803 return; 3804 } 3805 spdk_spin_unlock(&desc->spinlock); 3806 } 3807 3808 static void 3809 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3810 struct spdk_io_channel *io_ch, void *_ctx) 3811 { 3812 struct poll_timeout_ctx *ctx = _ctx; 3813 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3814 struct spdk_bdev_desc *desc = ctx->desc; 3815 struct spdk_bdev_io *bdev_io; 3816 uint64_t now; 3817 3818 spdk_spin_lock(&desc->spinlock); 3819 if (desc->closed == true) { 3820 spdk_spin_unlock(&desc->spinlock); 3821 spdk_bdev_for_each_channel_continue(i, -1); 3822 return; 3823 } 3824 spdk_spin_unlock(&desc->spinlock); 3825 3826 now = spdk_get_ticks(); 3827 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3828 /* Exclude any I/O that are generated via splitting. */ 3829 if (bdev_io->internal.cb == bdev_io_split_done) { 3830 continue; 3831 } 3832 3833 /* Once we find an I/O that has not timed out, we can immediately 3834 * exit the loop. 
3835 */ 3836 if (now < (bdev_io->internal.submit_tsc + 3837 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3838 goto end; 3839 } 3840 3841 if (bdev_io->internal.desc == desc) { 3842 ctx->cb_fn(ctx->cb_arg, bdev_io); 3843 } 3844 } 3845 3846 end: 3847 spdk_bdev_for_each_channel_continue(i, 0); 3848 } 3849 3850 static int 3851 bdev_poll_timeout_io(void *arg) 3852 { 3853 struct spdk_bdev_desc *desc = arg; 3854 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3855 struct poll_timeout_ctx *ctx; 3856 3857 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3858 if (!ctx) { 3859 SPDK_ERRLOG("failed to allocate memory\n"); 3860 return SPDK_POLLER_BUSY; 3861 } 3862 ctx->desc = desc; 3863 ctx->cb_arg = desc->cb_arg; 3864 ctx->cb_fn = desc->cb_fn; 3865 ctx->timeout_in_sec = desc->timeout_in_sec; 3866 3867 /* Take a ref on the descriptor in case it gets closed while we are checking 3868 * all of the channels. 3869 */ 3870 spdk_spin_lock(&desc->spinlock); 3871 desc->refs++; 3872 spdk_spin_unlock(&desc->spinlock); 3873 3874 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 3875 bdev_channel_poll_timeout_io_done); 3876 3877 return SPDK_POLLER_BUSY; 3878 } 3879 3880 int 3881 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3882 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3883 { 3884 assert(desc->thread == spdk_get_thread()); 3885 3886 spdk_poller_unregister(&desc->io_timeout_poller); 3887 3888 if (timeout_in_sec) { 3889 assert(cb_fn != NULL); 3890 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3891 desc, 3892 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3893 1000); 3894 if (desc->io_timeout_poller == NULL) { 3895 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3896 return -1; 3897 } 3898 } 3899 3900 desc->cb_fn = cb_fn; 3901 desc->cb_arg = cb_arg; 3902 desc->timeout_in_sec = timeout_in_sec; 3903 3904 return 0; 3905 } 3906 3907 static int 3908 bdev_channel_create(void *io_device, void 
*ctx_buf) 3909 { 3910 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3911 struct spdk_bdev_channel *ch = ctx_buf; 3912 struct spdk_io_channel *mgmt_io_ch; 3913 struct spdk_bdev_mgmt_channel *mgmt_ch; 3914 struct spdk_bdev_shared_resource *shared_resource; 3915 struct lba_range *range; 3916 3917 ch->bdev = bdev; 3918 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3919 if (!ch->channel) { 3920 return -1; 3921 } 3922 3923 ch->accel_channel = spdk_accel_get_io_channel(); 3924 if (!ch->accel_channel) { 3925 spdk_put_io_channel(ch->channel); 3926 return -1; 3927 } 3928 3929 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3930 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3931 3932 assert(ch->histogram == NULL); 3933 if (bdev->internal.histogram_enabled) { 3934 ch->histogram = spdk_histogram_data_alloc(); 3935 if (ch->histogram == NULL) { 3936 SPDK_ERRLOG("Could not allocate histogram\n"); 3937 } 3938 } 3939 3940 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3941 if (!mgmt_io_ch) { 3942 spdk_put_io_channel(ch->channel); 3943 spdk_put_io_channel(ch->accel_channel); 3944 return -1; 3945 } 3946 3947 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 3948 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3949 if (shared_resource->shared_ch == ch->channel) { 3950 spdk_put_io_channel(mgmt_io_ch); 3951 shared_resource->ref++; 3952 break; 3953 } 3954 } 3955 3956 if (shared_resource == NULL) { 3957 shared_resource = calloc(1, sizeof(*shared_resource)); 3958 if (shared_resource == NULL) { 3959 spdk_put_io_channel(ch->channel); 3960 spdk_put_io_channel(ch->accel_channel); 3961 spdk_put_io_channel(mgmt_io_ch); 3962 return -1; 3963 } 3964 3965 shared_resource->mgmt_ch = mgmt_ch; 3966 shared_resource->io_outstanding = 0; 3967 TAILQ_INIT(&shared_resource->nomem_io); 3968 shared_resource->nomem_threshold = 0; 3969 shared_resource->shared_ch = ch->channel; 3970 shared_resource->ref = 1; 3971 
TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3972 } 3973 3974 ch->io_outstanding = 0; 3975 TAILQ_INIT(&ch->queued_resets); 3976 TAILQ_INIT(&ch->locked_ranges); 3977 ch->flags = 0; 3978 ch->shared_resource = shared_resource; 3979 3980 TAILQ_INIT(&ch->io_submitted); 3981 TAILQ_INIT(&ch->io_locked); 3982 TAILQ_INIT(&ch->io_accel_exec); 3983 TAILQ_INIT(&ch->io_memory_domain); 3984 3985 ch->stat = bdev_alloc_io_stat(false); 3986 if (ch->stat == NULL) { 3987 bdev_channel_destroy_resource(ch); 3988 return -1; 3989 } 3990 3991 ch->stat->ticks_rate = spdk_get_ticks_hz(); 3992 3993 #ifdef SPDK_CONFIG_VTUNE 3994 { 3995 char *name; 3996 __itt_init_ittlib(NULL, 0); 3997 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3998 if (!name) { 3999 bdev_channel_destroy_resource(ch); 4000 return -1; 4001 } 4002 ch->handle = __itt_string_handle_create(name); 4003 free(name); 4004 ch->start_tsc = spdk_get_ticks(); 4005 ch->interval_tsc = spdk_get_ticks_hz() / 100; 4006 ch->prev_stat = bdev_alloc_io_stat(false); 4007 if (ch->prev_stat == NULL) { 4008 bdev_channel_destroy_resource(ch); 4009 return -1; 4010 } 4011 } 4012 #endif 4013 4014 spdk_spin_lock(&bdev->internal.spinlock); 4015 bdev_enable_qos(bdev, ch); 4016 4017 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 4018 struct lba_range *new_range; 4019 4020 new_range = calloc(1, sizeof(*new_range)); 4021 if (new_range == NULL) { 4022 spdk_spin_unlock(&bdev->internal.spinlock); 4023 bdev_channel_destroy_resource(ch); 4024 return -1; 4025 } 4026 new_range->length = range->length; 4027 new_range->offset = range->offset; 4028 new_range->locked_ctx = range->locked_ctx; 4029 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 4030 } 4031 4032 spdk_spin_unlock(&bdev->internal.spinlock); 4033 4034 return 0; 4035 } 4036 4037 static int 4038 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 4039 void *cb_ctx) 4040 { 4041 struct spdk_bdev_channel 
*bdev_ch = cb_ctx; 4042 struct spdk_bdev_io *bdev_io; 4043 uint64_t buf_len; 4044 4045 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4046 if (bdev_io->internal.ch == bdev_ch) { 4047 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4048 spdk_iobuf_entry_abort(ch, entry, buf_len); 4049 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4050 } 4051 4052 return 0; 4053 } 4054 4055 /* 4056 * Abort I/O that are waiting on a data buffer. 4057 */ 4058 static void 4059 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 4060 { 4061 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4062 bdev_abort_all_buf_io_cb, ch); 4063 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4064 bdev_abort_all_buf_io_cb, ch); 4065 } 4066 4067 /* 4068 * Abort I/O that are queued waiting for submission. These types of I/O are 4069 * linked using the spdk_bdev_io link TAILQ_ENTRY. 4070 */ 4071 static void 4072 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 4073 { 4074 struct spdk_bdev_io *bdev_io, *tmp; 4075 4076 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 4077 if (bdev_io->internal.ch == ch) { 4078 TAILQ_REMOVE(queue, bdev_io, internal.link); 4079 /* 4080 * spdk_bdev_io_complete() assumes that the completed I/O had 4081 * been submitted to the bdev module. Since in this case it 4082 * hadn't, bump io_outstanding to account for the decrement 4083 * that spdk_bdev_io_complete() will do. 
4084 */ 4085 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 4086 ch->io_outstanding++; 4087 ch->shared_resource->io_outstanding++; 4088 } 4089 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 4090 } 4091 } 4092 } 4093 4094 static bool 4095 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 4096 { 4097 struct spdk_bdev_io *bdev_io; 4098 4099 TAILQ_FOREACH(bdev_io, queue, internal.link) { 4100 if (bdev_io == bio_to_abort) { 4101 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 4102 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4103 return true; 4104 } 4105 } 4106 4107 return false; 4108 } 4109 4110 static int 4111 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 4112 { 4113 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 4114 uint64_t buf_len; 4115 4116 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 4117 if (bdev_io == bio_to_abort) { 4118 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 4119 spdk_iobuf_entry_abort(ch, entry, buf_len); 4120 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 4121 return 1; 4122 } 4123 4124 return 0; 4125 } 4126 4127 static bool 4128 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 4129 { 4130 int rc; 4131 4132 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 4133 bdev_abort_buf_io_cb, bio_to_abort); 4134 if (rc == 1) { 4135 return true; 4136 } 4137 4138 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 4139 bdev_abort_buf_io_cb, bio_to_abort); 4140 return rc == 1; 4141 } 4142 4143 static void 4144 bdev_qos_channel_destroy(void *cb_arg) 4145 { 4146 struct spdk_bdev_qos *qos = cb_arg; 4147 4148 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 4149 spdk_poller_unregister(&qos->poller); 4150 4151 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 4152 4153 free(qos); 4154 } 
4155 4156 static int 4157 bdev_qos_destroy(struct spdk_bdev *bdev) 4158 { 4159 int i; 4160 4161 /* 4162 * Cleanly shutting down the QoS poller is tricky, because 4163 * during the asynchronous operation the user could open 4164 * a new descriptor and create a new channel, spawning 4165 * a new QoS poller. 4166 * 4167 * The strategy is to create a new QoS structure here and swap it 4168 * in. The shutdown path then continues to refer to the old one 4169 * until it completes and then releases it. 4170 */ 4171 struct spdk_bdev_qos *new_qos, *old_qos; 4172 4173 old_qos = bdev->internal.qos; 4174 4175 new_qos = calloc(1, sizeof(*new_qos)); 4176 if (!new_qos) { 4177 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 4178 return -ENOMEM; 4179 } 4180 4181 /* Copy the old QoS data into the newly allocated structure */ 4182 memcpy(new_qos, old_qos, sizeof(*new_qos)); 4183 4184 /* Zero out the key parts of the QoS structure */ 4185 new_qos->ch = NULL; 4186 new_qos->thread = NULL; 4187 new_qos->poller = NULL; 4188 TAILQ_INIT(&new_qos->queued); 4189 /* 4190 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 4191 * It will be used later for the new QoS structure. 4192 */ 4193 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4194 new_qos->rate_limits[i].remaining_this_timeslice = 0; 4195 new_qos->rate_limits[i].min_per_timeslice = 0; 4196 new_qos->rate_limits[i].max_per_timeslice = 0; 4197 } 4198 4199 bdev->internal.qos = new_qos; 4200 4201 if (old_qos->thread == NULL) { 4202 free(old_qos); 4203 } else { 4204 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 4205 } 4206 4207 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 4208 * been destroyed yet. The destruction path will end up waiting for the final 4209 * channel to be put before it releases resources. 
*/ 4210 4211 return 0; 4212 } 4213 4214 void 4215 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 4216 { 4217 total->bytes_read += add->bytes_read; 4218 total->num_read_ops += add->num_read_ops; 4219 total->bytes_written += add->bytes_written; 4220 total->num_write_ops += add->num_write_ops; 4221 total->bytes_unmapped += add->bytes_unmapped; 4222 total->num_unmap_ops += add->num_unmap_ops; 4223 total->bytes_copied += add->bytes_copied; 4224 total->num_copy_ops += add->num_copy_ops; 4225 total->read_latency_ticks += add->read_latency_ticks; 4226 total->write_latency_ticks += add->write_latency_ticks; 4227 total->unmap_latency_ticks += add->unmap_latency_ticks; 4228 total->copy_latency_ticks += add->copy_latency_ticks; 4229 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 4230 total->max_read_latency_ticks = add->max_read_latency_ticks; 4231 } 4232 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 4233 total->min_read_latency_ticks = add->min_read_latency_ticks; 4234 } 4235 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 4236 total->max_write_latency_ticks = add->max_write_latency_ticks; 4237 } 4238 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 4239 total->min_write_latency_ticks = add->min_write_latency_ticks; 4240 } 4241 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 4242 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 4243 } 4244 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 4245 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 4246 } 4247 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 4248 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 4249 } 4250 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 4251 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 4252 } 4253 } 4254 4255 static void 4256 bdev_get_io_stat(struct 
spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 4257 { 4258 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 4259 4260 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 4261 memcpy(to_stat->io_error, from_stat->io_error, 4262 sizeof(struct spdk_bdev_io_error_stat)); 4263 } 4264 } 4265 4266 void 4267 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 4268 { 4269 stat->max_read_latency_ticks = 0; 4270 stat->min_read_latency_ticks = UINT64_MAX; 4271 stat->max_write_latency_ticks = 0; 4272 stat->min_write_latency_ticks = UINT64_MAX; 4273 stat->max_unmap_latency_ticks = 0; 4274 stat->min_unmap_latency_ticks = UINT64_MAX; 4275 stat->max_copy_latency_ticks = 0; 4276 stat->min_copy_latency_ticks = UINT64_MAX; 4277 4278 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 4279 return; 4280 } 4281 4282 stat->bytes_read = 0; 4283 stat->num_read_ops = 0; 4284 stat->bytes_written = 0; 4285 stat->num_write_ops = 0; 4286 stat->bytes_unmapped = 0; 4287 stat->num_unmap_ops = 0; 4288 stat->bytes_copied = 0; 4289 stat->num_copy_ops = 0; 4290 stat->read_latency_ticks = 0; 4291 stat->write_latency_ticks = 0; 4292 stat->unmap_latency_ticks = 0; 4293 stat->copy_latency_ticks = 0; 4294 4295 if (stat->io_error != NULL) { 4296 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 4297 } 4298 } 4299 4300 struct spdk_bdev_io_stat * 4301 bdev_alloc_io_stat(bool io_error_stat) 4302 { 4303 struct spdk_bdev_io_stat *stat; 4304 4305 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 4306 if (stat == NULL) { 4307 return NULL; 4308 } 4309 4310 if (io_error_stat) { 4311 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 4312 if (stat->io_error == NULL) { 4313 free(stat); 4314 return NULL; 4315 } 4316 } else { 4317 stat->io_error = NULL; 4318 } 4319 4320 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 4321 4322 return stat; 4323 } 4324 4325 void 4326 bdev_free_io_stat(struct 
spdk_bdev_io_stat *stat) 4327 { 4328 if (stat != NULL) { 4329 free(stat->io_error); 4330 free(stat); 4331 } 4332 } 4333 4334 void 4335 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 4336 { 4337 int i; 4338 4339 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 4340 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 4341 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 4342 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 4343 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 4344 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 4345 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 4346 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 4347 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 4348 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 4349 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 4350 stat->min_read_latency_ticks != UINT64_MAX ? 4351 stat->min_read_latency_ticks : 0); 4352 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 4353 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 4354 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 4355 stat->min_write_latency_ticks != UINT64_MAX ? 4356 stat->min_write_latency_ticks : 0); 4357 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 4358 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 4359 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 4360 stat->min_unmap_latency_ticks != UINT64_MAX ? 
4361 stat->min_unmap_latency_ticks : 0); 4362 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 4363 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 4364 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 4365 stat->min_copy_latency_ticks != UINT64_MAX ? 4366 stat->min_copy_latency_ticks : 0); 4367 4368 if (stat->io_error != NULL) { 4369 spdk_json_write_named_object_begin(w, "io_error"); 4370 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 4371 if (stat->io_error->error_status[i] != 0) { 4372 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 4373 stat->io_error->error_status[i]); 4374 } 4375 } 4376 spdk_json_write_object_end(w); 4377 } 4378 } 4379 4380 static void 4381 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 4382 { 4383 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 4384 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 4385 4386 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 4387 bdev_abort_all_buf_io(mgmt_ch, ch); 4388 } 4389 4390 static void 4391 bdev_channel_destroy(void *io_device, void *ctx_buf) 4392 { 4393 struct spdk_bdev_channel *ch = ctx_buf; 4394 4395 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 4396 spdk_get_thread()); 4397 4398 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 4399 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 4400 4401 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. 
*/ 4402 spdk_spin_lock(&ch->bdev->internal.spinlock); 4403 spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 4404 spdk_spin_unlock(&ch->bdev->internal.spinlock); 4405 4406 bdev_abort_all_queued_io(&ch->queued_resets, ch); 4407 4408 bdev_channel_abort_queued_ios(ch); 4409 4410 if (ch->histogram) { 4411 spdk_histogram_data_free(ch->histogram); 4412 } 4413 4414 bdev_channel_destroy_resource(ch); 4415 } 4416 4417 /* 4418 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 4419 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 4420 */ 4421 static int 4422 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 4423 { 4424 struct spdk_bdev_name *tmp; 4425 4426 bdev_name->name = strdup(name); 4427 if (bdev_name->name == NULL) { 4428 SPDK_ERRLOG("Unable to allocate bdev name\n"); 4429 return -ENOMEM; 4430 } 4431 4432 bdev_name->bdev = bdev; 4433 4434 spdk_spin_lock(&g_bdev_mgr.spinlock); 4435 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4436 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4437 4438 if (tmp != NULL) { 4439 SPDK_ERRLOG("Bdev name %s already exists\n", name); 4440 free(bdev_name->name); 4441 return -EEXIST; 4442 } 4443 4444 return 0; 4445 } 4446 4447 static void 4448 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 4449 { 4450 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 4451 free(bdev_name->name); 4452 } 4453 4454 static void 4455 bdev_name_del(struct spdk_bdev_name *bdev_name) 4456 { 4457 spdk_spin_lock(&g_bdev_mgr.spinlock); 4458 bdev_name_del_unsafe(bdev_name); 4459 spdk_spin_unlock(&g_bdev_mgr.spinlock); 4460 } 4461 4462 int 4463 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 4464 { 4465 struct spdk_bdev_alias *tmp; 4466 int ret; 4467 4468 if (alias == NULL) { 4469 SPDK_ERRLOG("Empty alias passed\n"); 4470 return -EINVAL; 4471 } 4472 4473 tmp = calloc(1, sizeof(*tmp)); 4474 if (tmp == NULL) 
{ 4475 SPDK_ERRLOG("Unable to allocate alias\n"); 4476 return -ENOMEM; 4477 } 4478 4479 ret = bdev_name_add(&tmp->alias, bdev, alias); 4480 if (ret != 0) { 4481 free(tmp); 4482 return ret; 4483 } 4484 4485 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 4486 4487 return 0; 4488 } 4489 4490 static int 4491 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 4492 void (*alias_del_fn)(struct spdk_bdev_name *n)) 4493 { 4494 struct spdk_bdev_alias *tmp; 4495 4496 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4497 if (strcmp(alias, tmp->alias.name) == 0) { 4498 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4499 alias_del_fn(&tmp->alias); 4500 free(tmp); 4501 return 0; 4502 } 4503 } 4504 4505 return -ENOENT; 4506 } 4507 4508 int 4509 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4510 { 4511 int rc; 4512 4513 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4514 if (rc == -ENOENT) { 4515 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4516 } 4517 4518 return rc; 4519 } 4520 4521 void 4522 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4523 { 4524 struct spdk_bdev_alias *p, *tmp; 4525 4526 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4527 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4528 bdev_name_del(&p->alias); 4529 free(p); 4530 } 4531 } 4532 4533 struct spdk_io_channel * 4534 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4535 { 4536 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4537 } 4538 4539 void * 4540 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4541 { 4542 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4543 void *ctx = NULL; 4544 4545 if (bdev->fn_table->get_module_ctx) { 4546 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4547 } 4548 4549 return ctx; 4550 } 4551 4552 const char * 4553 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4554 { 4555 return bdev->module->name; 4556 } 4557 4558 const char * 4559 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4560 { 4561 
return bdev->name; 4562 } 4563 4564 const char * 4565 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4566 { 4567 return bdev->product_name; 4568 } 4569 4570 const struct spdk_bdev_aliases_list * 4571 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4572 { 4573 return &bdev->aliases; 4574 } 4575 4576 uint32_t 4577 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4578 { 4579 return bdev->blocklen; 4580 } 4581 4582 uint32_t 4583 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4584 { 4585 return bdev->write_unit_size; 4586 } 4587 4588 uint64_t 4589 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4590 { 4591 return bdev->blockcnt; 4592 } 4593 4594 const char * 4595 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4596 { 4597 return qos_rpc_type[type]; 4598 } 4599 4600 void 4601 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4602 { 4603 int i; 4604 4605 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4606 4607 spdk_spin_lock(&bdev->internal.spinlock); 4608 if (bdev->internal.qos) { 4609 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4610 if (bdev->internal.qos->rate_limits[i].limit != 4611 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4612 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4613 if (bdev_qos_is_iops_rate_limit(i) == false) { 4614 /* Change from Byte to Megabyte which is user visible. 
*/ 4615 limits[i] = limits[i] / 1024 / 1024; 4616 } 4617 } 4618 } 4619 } 4620 spdk_spin_unlock(&bdev->internal.spinlock); 4621 } 4622 4623 size_t 4624 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4625 { 4626 return 1 << bdev->required_alignment; 4627 } 4628 4629 uint32_t 4630 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4631 { 4632 return bdev->optimal_io_boundary; 4633 } 4634 4635 bool 4636 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4637 { 4638 return bdev->write_cache; 4639 } 4640 4641 const struct spdk_uuid * 4642 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4643 { 4644 return &bdev->uuid; 4645 } 4646 4647 uint16_t 4648 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4649 { 4650 return bdev->acwu; 4651 } 4652 4653 uint32_t 4654 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4655 { 4656 return bdev->md_len; 4657 } 4658 4659 bool 4660 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4661 { 4662 return (bdev->md_len != 0) && bdev->md_interleave; 4663 } 4664 4665 bool 4666 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4667 { 4668 return (bdev->md_len != 0) && !bdev->md_interleave; 4669 } 4670 4671 bool 4672 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4673 { 4674 return bdev->zoned; 4675 } 4676 4677 uint32_t 4678 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4679 { 4680 if (spdk_bdev_is_md_interleaved(bdev)) { 4681 return bdev->blocklen - bdev->md_len; 4682 } else { 4683 return bdev->blocklen; 4684 } 4685 } 4686 4687 uint32_t 4688 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4689 { 4690 return bdev->phys_blocklen; 4691 } 4692 4693 static uint32_t 4694 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4695 { 4696 if (!spdk_bdev_is_md_interleaved(bdev)) { 4697 return bdev->blocklen + bdev->md_len; 4698 } else { 4699 return bdev->blocklen; 4700 } 4701 } 4702 4703 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4704 typedef enum spdk_dif_type spdk_dif_type_t; 4705 4706 spdk_dif_type_t 4707 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4708 { 4709 if (bdev->md_len != 0) { 4710 return bdev->dif_type; 4711 } else { 4712 return SPDK_DIF_DISABLE; 4713 } 4714 } 4715 4716 bool 4717 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4718 { 4719 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4720 return bdev->dif_is_head_of_md; 4721 } else { 4722 return false; 4723 } 4724 } 4725 4726 bool 4727 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4728 enum spdk_dif_check_type check_type) 4729 { 4730 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4731 return false; 4732 } 4733 4734 switch (check_type) { 4735 case SPDK_DIF_CHECK_TYPE_REFTAG: 4736 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4737 case SPDK_DIF_CHECK_TYPE_APPTAG: 4738 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4739 case SPDK_DIF_CHECK_TYPE_GUARD: 4740 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4741 default: 4742 return false; 4743 } 4744 } 4745 4746 static uint32_t 4747 bdev_get_max_write(const struct spdk_bdev *bdev, uint64_t num_bytes) 4748 { 4749 uint64_t aligned_length, max_write_blocks; 4750 4751 aligned_length = num_bytes - (spdk_bdev_get_buf_align(bdev) - 1); 4752 max_write_blocks = aligned_length / _bdev_get_block_size_with_md(bdev); 4753 max_write_blocks -= max_write_blocks % bdev->write_unit_size; 4754 4755 return max_write_blocks; 4756 } 4757 4758 uint32_t 4759 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4760 { 4761 return bdev->max_copy; 4762 } 4763 4764 uint64_t 4765 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4766 { 4767 return bdev->internal.measured_queue_depth; 4768 } 4769 4770 uint64_t 4771 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 4772 { 4773 return bdev->internal.period; 4774 } 4775 4776 uint64_t 4777 spdk_bdev_get_weighted_io_time(const struct spdk_bdev 
*bdev) 4778 { 4779 return bdev->internal.weighted_io_time; 4780 } 4781 4782 uint64_t 4783 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 4784 { 4785 return bdev->internal.io_time; 4786 } 4787 4788 static void bdev_update_qd_sampling_period(void *ctx); 4789 4790 static void 4791 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 4792 { 4793 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 4794 4795 if (bdev->internal.measured_queue_depth) { 4796 bdev->internal.io_time += bdev->internal.period; 4797 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 4798 } 4799 4800 bdev->internal.qd_poll_in_progress = false; 4801 4802 bdev_update_qd_sampling_period(bdev); 4803 } 4804 4805 static void 4806 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4807 struct spdk_io_channel *io_ch, void *_ctx) 4808 { 4809 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 4810 4811 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4812 spdk_bdev_for_each_channel_continue(i, 0); 4813 } 4814 4815 static int 4816 bdev_calculate_measured_queue_depth(void *ctx) 4817 { 4818 struct spdk_bdev *bdev = ctx; 4819 4820 bdev->internal.qd_poll_in_progress = true; 4821 bdev->internal.temporary_queue_depth = 0; 4822 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 4823 return SPDK_POLLER_BUSY; 4824 } 4825 4826 static void 4827 bdev_update_qd_sampling_period(void *ctx) 4828 { 4829 struct spdk_bdev *bdev = ctx; 4830 4831 if (bdev->internal.period == bdev->internal.new_period) { 4832 return; 4833 } 4834 4835 if (bdev->internal.qd_poll_in_progress) { 4836 return; 4837 } 4838 4839 bdev->internal.period = bdev->internal.new_period; 4840 4841 spdk_poller_unregister(&bdev->internal.qd_poller); 4842 if (bdev->internal.period != 0) { 4843 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4844 
bdev, bdev->internal.period); 4845 } else { 4846 spdk_bdev_close(bdev->internal.qd_desc); 4847 bdev->internal.qd_desc = NULL; 4848 } 4849 } 4850 4851 static void 4852 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4853 { 4854 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4855 } 4856 4857 void 4858 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4859 { 4860 int rc; 4861 4862 if (bdev->internal.new_period == period) { 4863 return; 4864 } 4865 4866 bdev->internal.new_period = period; 4867 4868 if (bdev->internal.qd_desc != NULL) { 4869 assert(bdev->internal.period != 0); 4870 4871 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4872 bdev_update_qd_sampling_period, bdev); 4873 return; 4874 } 4875 4876 assert(bdev->internal.period == 0); 4877 4878 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4879 NULL, &bdev->internal.qd_desc); 4880 if (rc != 0) { 4881 return; 4882 } 4883 4884 bdev->internal.period = period; 4885 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4886 bdev, period); 4887 } 4888 4889 struct bdev_get_current_qd_ctx { 4890 uint64_t current_qd; 4891 spdk_bdev_get_current_qd_cb cb_fn; 4892 void *cb_arg; 4893 }; 4894 4895 static void 4896 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 4897 { 4898 struct bdev_get_current_qd_ctx *ctx = _ctx; 4899 4900 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 4901 4902 free(ctx); 4903 } 4904 4905 static void 4906 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4907 struct spdk_io_channel *io_ch, void *_ctx) 4908 { 4909 struct bdev_get_current_qd_ctx *ctx = _ctx; 4910 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4911 4912 ctx->current_qd += bdev_ch->io_outstanding; 4913 4914 spdk_bdev_for_each_channel_continue(i, 0); 4915 } 4916 4917 void 4918 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, 
spdk_bdev_get_current_qd_cb cb_fn, 4919 void *cb_arg) 4920 { 4921 struct bdev_get_current_qd_ctx *ctx; 4922 4923 assert(cb_fn != NULL); 4924 4925 ctx = calloc(1, sizeof(*ctx)); 4926 if (ctx == NULL) { 4927 cb_fn(bdev, 0, cb_arg, -ENOMEM); 4928 return; 4929 } 4930 4931 ctx->cb_fn = cb_fn; 4932 ctx->cb_arg = cb_arg; 4933 4934 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 4935 } 4936 4937 static void 4938 _event_notify(struct spdk_bdev_desc *desc, enum spdk_bdev_event_type type) 4939 { 4940 assert(desc->thread == spdk_get_thread()); 4941 4942 spdk_spin_lock(&desc->spinlock); 4943 desc->refs--; 4944 if (!desc->closed) { 4945 spdk_spin_unlock(&desc->spinlock); 4946 desc->callback.event_fn(type, 4947 desc->bdev, 4948 desc->callback.ctx); 4949 return; 4950 } else if (desc->refs == 0) { 4951 /* This descriptor was closed after this event_notify message was sent. 4952 * spdk_bdev_close() could not free the descriptor since this message was 4953 * in flight, so we free it now using bdev_desc_free(). 
4954 */ 4955 spdk_spin_unlock(&desc->spinlock); 4956 bdev_desc_free(desc); 4957 return; 4958 } 4959 spdk_spin_unlock(&desc->spinlock); 4960 } 4961 4962 static void 4963 event_notify(struct spdk_bdev_desc *desc, spdk_msg_fn event_notify_fn) 4964 { 4965 spdk_spin_lock(&desc->spinlock); 4966 desc->refs++; 4967 spdk_thread_send_msg(desc->thread, event_notify_fn, desc); 4968 spdk_spin_unlock(&desc->spinlock); 4969 } 4970 4971 static void 4972 _resize_notify(void *ctx) 4973 { 4974 struct spdk_bdev_desc *desc = ctx; 4975 4976 _event_notify(desc, SPDK_BDEV_EVENT_RESIZE); 4977 } 4978 4979 int 4980 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 4981 { 4982 struct spdk_bdev_desc *desc; 4983 int ret; 4984 4985 if (size == bdev->blockcnt) { 4986 return 0; 4987 } 4988 4989 spdk_spin_lock(&bdev->internal.spinlock); 4990 4991 /* bdev has open descriptors */ 4992 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 4993 bdev->blockcnt > size) { 4994 ret = -EBUSY; 4995 } else { 4996 bdev->blockcnt = size; 4997 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 4998 event_notify(desc, _resize_notify); 4999 } 5000 ret = 0; 5001 } 5002 5003 spdk_spin_unlock(&bdev->internal.spinlock); 5004 5005 return ret; 5006 } 5007 5008 /* 5009 * Convert I/O offset and length from bytes to blocks. 5010 * 5011 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 5012 */ 5013 static uint64_t 5014 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 5015 uint64_t num_bytes, uint64_t *num_blocks) 5016 { 5017 uint32_t block_size = bdev->blocklen; 5018 uint8_t shift_cnt; 5019 5020 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
*/ 5021 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 5022 shift_cnt = spdk_u32log2(block_size); 5023 *offset_blocks = offset_bytes >> shift_cnt; 5024 *num_blocks = num_bytes >> shift_cnt; 5025 return (offset_bytes - (*offset_blocks << shift_cnt)) | 5026 (num_bytes - (*num_blocks << shift_cnt)); 5027 } else { 5028 *offset_blocks = offset_bytes / block_size; 5029 *num_blocks = num_bytes / block_size; 5030 return (offset_bytes % block_size) | (num_bytes % block_size); 5031 } 5032 } 5033 5034 static bool 5035 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 5036 { 5037 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 5038 * has been an overflow and hence the offset has been wrapped around */ 5039 if (offset_blocks + num_blocks < offset_blocks) { 5040 return false; 5041 } 5042 5043 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 5044 if (offset_blocks + num_blocks > bdev->blockcnt) { 5045 return false; 5046 } 5047 5048 return true; 5049 } 5050 5051 static void 5052 bdev_seek_complete_cb(void *ctx) 5053 { 5054 struct spdk_bdev_io *bdev_io = ctx; 5055 5056 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5057 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 5058 } 5059 5060 static int 5061 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5062 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 5063 spdk_bdev_io_completion_cb cb, void *cb_arg) 5064 { 5065 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5066 struct spdk_bdev_io *bdev_io; 5067 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5068 5069 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 5070 5071 /* Check if offset_blocks is valid looking at the validity of one block */ 5072 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 5073 return -EINVAL; 5074 } 5075 5076 bdev_io = 
bdev_channel_get_io(channel); 5077 if (!bdev_io) { 5078 return -ENOMEM; 5079 } 5080 5081 bdev_io->internal.ch = channel; 5082 bdev_io->internal.desc = desc; 5083 bdev_io->type = io_type; 5084 bdev_io->u.bdev.offset_blocks = offset_blocks; 5085 bdev_io->u.bdev.memory_domain = NULL; 5086 bdev_io->u.bdev.memory_domain_ctx = NULL; 5087 bdev_io->u.bdev.accel_sequence = NULL; 5088 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5089 5090 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 5091 /* In case bdev doesn't support seek to next data/hole offset, 5092 * it is assumed that only data and no holes are present */ 5093 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 5094 bdev_io->u.bdev.seek.offset = offset_blocks; 5095 } else { 5096 bdev_io->u.bdev.seek.offset = UINT64_MAX; 5097 } 5098 5099 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 5100 return 0; 5101 } 5102 5103 bdev_io_submit(bdev_io); 5104 return 0; 5105 } 5106 5107 int 5108 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5109 uint64_t offset_blocks, 5110 spdk_bdev_io_completion_cb cb, void *cb_arg) 5111 { 5112 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 5113 } 5114 5115 int 5116 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5117 uint64_t offset_blocks, 5118 spdk_bdev_io_completion_cb cb, void *cb_arg) 5119 { 5120 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 5121 } 5122 5123 uint64_t 5124 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io) 5125 { 5126 return bdev_io->u.bdev.seek.offset; 5127 } 5128 5129 static int 5130 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 5131 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5132 spdk_bdev_io_completion_cb cb, void *cb_arg) 5133 { 5134 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5135 struct spdk_bdev_io *bdev_io; 5136 struct 
spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5137 5138 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5139 return -EINVAL; 5140 } 5141 5142 bdev_io = bdev_channel_get_io(channel); 5143 if (!bdev_io) { 5144 return -ENOMEM; 5145 } 5146 5147 bdev_io->internal.ch = channel; 5148 bdev_io->internal.desc = desc; 5149 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5150 bdev_io->u.bdev.iovs = &bdev_io->iov; 5151 bdev_io->u.bdev.iovs[0].iov_base = buf; 5152 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5153 bdev_io->u.bdev.iovcnt = 1; 5154 bdev_io->u.bdev.md_buf = md_buf; 5155 bdev_io->u.bdev.num_blocks = num_blocks; 5156 bdev_io->u.bdev.offset_blocks = offset_blocks; 5157 bdev_io->u.bdev.memory_domain = NULL; 5158 bdev_io->u.bdev.memory_domain_ctx = NULL; 5159 bdev_io->u.bdev.accel_sequence = NULL; 5160 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5161 5162 bdev_io_submit(bdev_io); 5163 return 0; 5164 } 5165 5166 int 5167 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5168 void *buf, uint64_t offset, uint64_t nbytes, 5169 spdk_bdev_io_completion_cb cb, void *cb_arg) 5170 { 5171 uint64_t offset_blocks, num_blocks; 5172 5173 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5174 nbytes, &num_blocks) != 0) { 5175 return -EINVAL; 5176 } 5177 5178 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5179 } 5180 5181 int 5182 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5183 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5184 spdk_bdev_io_completion_cb cb, void *cb_arg) 5185 { 5186 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 5187 } 5188 5189 int 5190 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5191 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5192 spdk_bdev_io_completion_cb cb, void *cb_arg) 5193 { 5194 struct 
iovec iov = { 5195 .iov_base = buf, 5196 }; 5197 5198 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5199 return -EINVAL; 5200 } 5201 5202 if (md_buf && !_is_buf_allocated(&iov)) { 5203 return -EINVAL; 5204 } 5205 5206 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5207 cb, cb_arg); 5208 } 5209 5210 int 5211 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5212 struct iovec *iov, int iovcnt, 5213 uint64_t offset, uint64_t nbytes, 5214 spdk_bdev_io_completion_cb cb, void *cb_arg) 5215 { 5216 uint64_t offset_blocks, num_blocks; 5217 5218 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5219 nbytes, &num_blocks) != 0) { 5220 return -EINVAL; 5221 } 5222 5223 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 5224 } 5225 5226 static int 5227 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5228 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 5229 uint64_t num_blocks, struct spdk_memory_domain *domain, void *domain_ctx, 5230 struct spdk_accel_sequence *seq, 5231 spdk_bdev_io_completion_cb cb, void *cb_arg) 5232 { 5233 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5234 struct spdk_bdev_io *bdev_io; 5235 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5236 5237 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5238 return -EINVAL; 5239 } 5240 5241 bdev_io = bdev_channel_get_io(channel); 5242 if (!bdev_io) { 5243 return -ENOMEM; 5244 } 5245 5246 bdev_io->internal.ch = channel; 5247 bdev_io->internal.desc = desc; 5248 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 5249 bdev_io->u.bdev.iovs = iov; 5250 bdev_io->u.bdev.iovcnt = iovcnt; 5251 bdev_io->u.bdev.md_buf = md_buf; 5252 bdev_io->u.bdev.num_blocks = num_blocks; 5253 bdev_io->u.bdev.offset_blocks = offset_blocks; 5254 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5255 
bdev_io->internal.memory_domain = domain; 5256 bdev_io->internal.memory_domain_ctx = domain_ctx; 5257 bdev_io->internal.accel_sequence = seq; 5258 bdev_io->u.bdev.memory_domain = domain; 5259 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5260 bdev_io->u.bdev.accel_sequence = seq; 5261 5262 _bdev_io_submit_ext(desc, bdev_io); 5263 5264 return 0; 5265 } 5266 5267 int 5268 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5269 struct iovec *iov, int iovcnt, 5270 uint64_t offset_blocks, uint64_t num_blocks, 5271 spdk_bdev_io_completion_cb cb, void *cb_arg) 5272 { 5273 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5274 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5275 } 5276 5277 int 5278 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5279 struct iovec *iov, int iovcnt, void *md_buf, 5280 uint64_t offset_blocks, uint64_t num_blocks, 5281 spdk_bdev_io_completion_cb cb, void *cb_arg) 5282 { 5283 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5284 return -EINVAL; 5285 } 5286 5287 if (md_buf && !_is_buf_allocated(iov)) { 5288 return -EINVAL; 5289 } 5290 5291 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5292 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5293 } 5294 5295 static inline bool 5296 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 5297 { 5298 /* 5299 * We check if opts size is at least of size when we first introduced 5300 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 5301 * are not checked internal. 
5302 */ 5303 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 5304 sizeof(opts->metadata) && 5305 opts->size <= sizeof(*opts) && 5306 /* When memory domain is used, the user must provide data buffers */ 5307 (!opts->memory_domain || (iov && iov[0].iov_base)); 5308 } 5309 5310 int 5311 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5312 struct iovec *iov, int iovcnt, 5313 uint64_t offset_blocks, uint64_t num_blocks, 5314 spdk_bdev_io_completion_cb cb, void *cb_arg, 5315 struct spdk_bdev_ext_io_opts *opts) 5316 { 5317 void *md = NULL; 5318 5319 if (opts) { 5320 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5321 return -EINVAL; 5322 } 5323 md = opts->metadata; 5324 } 5325 5326 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5327 return -EINVAL; 5328 } 5329 5330 if (md && !_is_buf_allocated(iov)) { 5331 return -EINVAL; 5332 } 5333 5334 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5335 num_blocks, 5336 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5337 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5338 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5339 cb, cb_arg); 5340 } 5341 5342 static int 5343 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5344 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5345 spdk_bdev_io_completion_cb cb, void *cb_arg) 5346 { 5347 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5348 struct spdk_bdev_io *bdev_io; 5349 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5350 5351 if (!desc->write) { 5352 return -EBADF; 5353 } 5354 5355 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5356 return -EINVAL; 5357 } 5358 5359 bdev_io = bdev_channel_get_io(channel); 5360 if (!bdev_io) { 5361 return -ENOMEM; 5362 } 5363 5364 bdev_io->internal.ch = channel; 5365 bdev_io->internal.desc = desc; 5366 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 
5367 bdev_io->u.bdev.iovs = &bdev_io->iov; 5368 bdev_io->u.bdev.iovs[0].iov_base = buf; 5369 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5370 bdev_io->u.bdev.iovcnt = 1; 5371 bdev_io->u.bdev.md_buf = md_buf; 5372 bdev_io->u.bdev.num_blocks = num_blocks; 5373 bdev_io->u.bdev.offset_blocks = offset_blocks; 5374 bdev_io->u.bdev.memory_domain = NULL; 5375 bdev_io->u.bdev.memory_domain_ctx = NULL; 5376 bdev_io->u.bdev.accel_sequence = NULL; 5377 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5378 5379 bdev_io_submit(bdev_io); 5380 return 0; 5381 } 5382 5383 int 5384 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5385 void *buf, uint64_t offset, uint64_t nbytes, 5386 spdk_bdev_io_completion_cb cb, void *cb_arg) 5387 { 5388 uint64_t offset_blocks, num_blocks; 5389 5390 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5391 nbytes, &num_blocks) != 0) { 5392 return -EINVAL; 5393 } 5394 5395 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 5396 } 5397 5398 int 5399 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5400 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5401 spdk_bdev_io_completion_cb cb, void *cb_arg) 5402 { 5403 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5404 cb, cb_arg); 5405 } 5406 5407 int 5408 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5409 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5410 spdk_bdev_io_completion_cb cb, void *cb_arg) 5411 { 5412 struct iovec iov = { 5413 .iov_base = buf, 5414 }; 5415 5416 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5417 return -EINVAL; 5418 } 5419 5420 if (md_buf && !_is_buf_allocated(&iov)) { 5421 return -EINVAL; 5422 } 5423 5424 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5425 cb, cb_arg); 5426 } 5427 5428 
static int 5429 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5430 struct iovec *iov, int iovcnt, void *md_buf, 5431 uint64_t offset_blocks, uint64_t num_blocks, 5432 struct spdk_memory_domain *domain, void *domain_ctx, 5433 struct spdk_accel_sequence *seq, 5434 spdk_bdev_io_completion_cb cb, void *cb_arg) 5435 { 5436 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5437 struct spdk_bdev_io *bdev_io; 5438 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5439 5440 if (!desc->write) { 5441 return -EBADF; 5442 } 5443 5444 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5445 return -EINVAL; 5446 } 5447 5448 bdev_io = bdev_channel_get_io(channel); 5449 if (!bdev_io) { 5450 return -ENOMEM; 5451 } 5452 5453 bdev_io->internal.ch = channel; 5454 bdev_io->internal.desc = desc; 5455 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 5456 bdev_io->u.bdev.iovs = iov; 5457 bdev_io->u.bdev.iovcnt = iovcnt; 5458 bdev_io->u.bdev.md_buf = md_buf; 5459 bdev_io->u.bdev.num_blocks = num_blocks; 5460 bdev_io->u.bdev.offset_blocks = offset_blocks; 5461 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5462 bdev_io->internal.memory_domain = domain; 5463 bdev_io->internal.memory_domain_ctx = domain_ctx; 5464 bdev_io->internal.accel_sequence = seq; 5465 bdev_io->u.bdev.memory_domain = domain; 5466 bdev_io->u.bdev.memory_domain_ctx = domain_ctx; 5467 bdev_io->u.bdev.accel_sequence = seq; 5468 5469 _bdev_io_submit_ext(desc, bdev_io); 5470 5471 return 0; 5472 } 5473 5474 int 5475 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5476 struct iovec *iov, int iovcnt, 5477 uint64_t offset, uint64_t len, 5478 spdk_bdev_io_completion_cb cb, void *cb_arg) 5479 { 5480 uint64_t offset_blocks, num_blocks; 5481 5482 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5483 len, &num_blocks) != 0) { 5484 return -EINVAL; 5485 } 5486 5487 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, 
num_blocks, cb, cb_arg); 5488 } 5489 5490 int 5491 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5492 struct iovec *iov, int iovcnt, 5493 uint64_t offset_blocks, uint64_t num_blocks, 5494 spdk_bdev_io_completion_cb cb, void *cb_arg) 5495 { 5496 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5497 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5498 } 5499 5500 int 5501 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5502 struct iovec *iov, int iovcnt, void *md_buf, 5503 uint64_t offset_blocks, uint64_t num_blocks, 5504 spdk_bdev_io_completion_cb cb, void *cb_arg) 5505 { 5506 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5507 return -EINVAL; 5508 } 5509 5510 if (md_buf && !_is_buf_allocated(iov)) { 5511 return -EINVAL; 5512 } 5513 5514 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5515 num_blocks, NULL, NULL, NULL, cb, cb_arg); 5516 } 5517 5518 int 5519 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5520 struct iovec *iov, int iovcnt, 5521 uint64_t offset_blocks, uint64_t num_blocks, 5522 spdk_bdev_io_completion_cb cb, void *cb_arg, 5523 struct spdk_bdev_ext_io_opts *opts) 5524 { 5525 void *md = NULL; 5526 5527 if (opts) { 5528 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 5529 return -EINVAL; 5530 } 5531 md = opts->metadata; 5532 } 5533 5534 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5535 return -EINVAL; 5536 } 5537 5538 if (md && !_is_buf_allocated(iov)) { 5539 return -EINVAL; 5540 } 5541 5542 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, num_blocks, 5543 bdev_get_ext_io_opt(opts, memory_domain, NULL), 5544 bdev_get_ext_io_opt(opts, memory_domain_ctx, NULL), 5545 bdev_get_ext_io_opt(opts, accel_sequence, NULL), 5546 cb, cb_arg); 5547 } 5548 5549 static void 5550 bdev_compare_do_read_done(struct 
spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5551 { 5552 struct spdk_bdev_io *parent_io = cb_arg; 5553 struct spdk_bdev *bdev = parent_io->bdev; 5554 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5555 int i, rc = 0; 5556 5557 if (!success) { 5558 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5559 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5560 spdk_bdev_free_io(bdev_io); 5561 return; 5562 } 5563 5564 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5565 rc = memcmp(read_buf, 5566 parent_io->u.bdev.iovs[i].iov_base, 5567 parent_io->u.bdev.iovs[i].iov_len); 5568 if (rc) { 5569 break; 5570 } 5571 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5572 } 5573 5574 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 5575 rc = memcmp(bdev_io->u.bdev.md_buf, 5576 parent_io->u.bdev.md_buf, 5577 spdk_bdev_get_md_size(bdev)); 5578 } 5579 5580 spdk_bdev_free_io(bdev_io); 5581 5582 if (rc == 0) { 5583 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5584 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5585 } else { 5586 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5587 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5588 } 5589 } 5590 5591 static void 5592 bdev_compare_do_read(void *_bdev_io) 5593 { 5594 struct spdk_bdev_io *bdev_io = _bdev_io; 5595 int rc; 5596 5597 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5598 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5599 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5600 bdev_compare_do_read_done, bdev_io); 5601 5602 if (rc == -ENOMEM) { 5603 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5604 } else if (rc != 0) { 5605 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5606 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5607 } 5608 } 5609 5610 static int 5611 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, 
struct spdk_io_channel *ch, 5612 struct iovec *iov, int iovcnt, void *md_buf, 5613 uint64_t offset_blocks, uint64_t num_blocks, 5614 spdk_bdev_io_completion_cb cb, void *cb_arg) 5615 { 5616 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5617 struct spdk_bdev_io *bdev_io; 5618 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5619 5620 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5621 return -EINVAL; 5622 } 5623 5624 bdev_io = bdev_channel_get_io(channel); 5625 if (!bdev_io) { 5626 return -ENOMEM; 5627 } 5628 5629 bdev_io->internal.ch = channel; 5630 bdev_io->internal.desc = desc; 5631 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5632 bdev_io->u.bdev.iovs = iov; 5633 bdev_io->u.bdev.iovcnt = iovcnt; 5634 bdev_io->u.bdev.md_buf = md_buf; 5635 bdev_io->u.bdev.num_blocks = num_blocks; 5636 bdev_io->u.bdev.offset_blocks = offset_blocks; 5637 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5638 bdev_io->u.bdev.memory_domain = NULL; 5639 bdev_io->u.bdev.memory_domain_ctx = NULL; 5640 bdev_io->u.bdev.accel_sequence = NULL; 5641 5642 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5643 bdev_io_submit(bdev_io); 5644 return 0; 5645 } 5646 5647 bdev_compare_do_read(bdev_io); 5648 5649 return 0; 5650 } 5651 5652 int 5653 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5654 struct iovec *iov, int iovcnt, 5655 uint64_t offset_blocks, uint64_t num_blocks, 5656 spdk_bdev_io_completion_cb cb, void *cb_arg) 5657 { 5658 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5659 num_blocks, cb, cb_arg); 5660 } 5661 5662 int 5663 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5664 struct iovec *iov, int iovcnt, void *md_buf, 5665 uint64_t offset_blocks, uint64_t num_blocks, 5666 spdk_bdev_io_completion_cb cb, void *cb_arg) 5667 { 5668 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5669 return -EINVAL; 5670 } 5671 
5672 if (md_buf && !_is_buf_allocated(iov)) { 5673 return -EINVAL; 5674 } 5675 5676 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5677 num_blocks, cb, cb_arg); 5678 } 5679 5680 static int 5681 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5682 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5683 spdk_bdev_io_completion_cb cb, void *cb_arg) 5684 { 5685 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5686 struct spdk_bdev_io *bdev_io; 5687 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5688 5689 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5690 return -EINVAL; 5691 } 5692 5693 bdev_io = bdev_channel_get_io(channel); 5694 if (!bdev_io) { 5695 return -ENOMEM; 5696 } 5697 5698 bdev_io->internal.ch = channel; 5699 bdev_io->internal.desc = desc; 5700 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5701 bdev_io->u.bdev.iovs = &bdev_io->iov; 5702 bdev_io->u.bdev.iovs[0].iov_base = buf; 5703 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5704 bdev_io->u.bdev.iovcnt = 1; 5705 bdev_io->u.bdev.md_buf = md_buf; 5706 bdev_io->u.bdev.num_blocks = num_blocks; 5707 bdev_io->u.bdev.offset_blocks = offset_blocks; 5708 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5709 bdev_io->u.bdev.memory_domain = NULL; 5710 bdev_io->u.bdev.memory_domain_ctx = NULL; 5711 bdev_io->u.bdev.accel_sequence = NULL; 5712 5713 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5714 bdev_io_submit(bdev_io); 5715 return 0; 5716 } 5717 5718 bdev_compare_do_read(bdev_io); 5719 5720 return 0; 5721 } 5722 5723 int 5724 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5725 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5726 spdk_bdev_io_completion_cb cb, void *cb_arg) 5727 { 5728 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5729 cb, cb_arg); 5730 } 5731 5732 int 5733 
spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5734 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5735 spdk_bdev_io_completion_cb cb, void *cb_arg) 5736 { 5737 struct iovec iov = { 5738 .iov_base = buf, 5739 }; 5740 5741 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5742 return -EINVAL; 5743 } 5744 5745 if (md_buf && !_is_buf_allocated(&iov)) { 5746 return -EINVAL; 5747 } 5748 5749 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5750 cb, cb_arg); 5751 } 5752 5753 static void 5754 bdev_comparev_and_writev_blocks_unlocked(struct lba_range *range, void *ctx, int unlock_status) 5755 { 5756 struct spdk_bdev_io *bdev_io = ctx; 5757 5758 if (unlock_status) { 5759 SPDK_ERRLOG("LBA range unlock failed\n"); 5760 } 5761 5762 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 5763 false, bdev_io->internal.caller_ctx); 5764 } 5765 5766 static void 5767 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 5768 { 5769 bdev_io->internal.status = status; 5770 5771 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 5772 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5773 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 5774 } 5775 5776 static void 5777 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5778 { 5779 struct spdk_bdev_io *parent_io = cb_arg; 5780 5781 if (!success) { 5782 SPDK_ERRLOG("Compare and write operation failed\n"); 5783 } 5784 5785 spdk_bdev_free_io(bdev_io); 5786 5787 bdev_comparev_and_writev_blocks_unlock(parent_io, 5788 success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 5789 } 5790 5791 static void 5792 bdev_compare_and_write_do_write(void *_bdev_io) 5793 { 5794 struct spdk_bdev_io *bdev_io = _bdev_io; 5795 int rc; 5796 5797 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 5798 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5799 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 5800 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5801 bdev_compare_and_write_do_write_done, bdev_io); 5802 5803 5804 if (rc == -ENOMEM) { 5805 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 5806 } else if (rc != 0) { 5807 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5808 } 5809 } 5810 5811 static void 5812 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5813 { 5814 struct spdk_bdev_io *parent_io = cb_arg; 5815 5816 spdk_bdev_free_io(bdev_io); 5817 5818 if (!success) { 5819 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 5820 return; 5821 } 5822 5823 bdev_compare_and_write_do_write(parent_io); 5824 } 5825 5826 static void 5827 bdev_compare_and_write_do_compare(void *_bdev_io) 5828 { 5829 struct spdk_bdev_io *bdev_io = _bdev_io; 5830 int rc; 5831 5832 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 5833 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 5834 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5835 bdev_compare_and_write_do_compare_done, bdev_io); 5836 5837 if (rc == -ENOMEM) { 5838 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 5839 } else if (rc != 0) { 5840 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 5841 } 5842 } 5843 5844 static void 5845 bdev_comparev_and_writev_blocks_locked(struct lba_range *range, void *ctx, int status) 5846 { 5847 struct spdk_bdev_io *bdev_io = ctx; 5848 5849 if (status) 
{ 5850 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 5851 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5852 return; 5853 } 5854 5855 bdev_compare_and_write_do_compare(bdev_io); 5856 } 5857 5858 int 5859 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5860 struct iovec *compare_iov, int compare_iovcnt, 5861 struct iovec *write_iov, int write_iovcnt, 5862 uint64_t offset_blocks, uint64_t num_blocks, 5863 spdk_bdev_io_completion_cb cb, void *cb_arg) 5864 { 5865 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5866 struct spdk_bdev_io *bdev_io; 5867 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5868 5869 if (!desc->write) { 5870 return -EBADF; 5871 } 5872 5873 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5874 return -EINVAL; 5875 } 5876 5877 if (num_blocks > bdev->acwu) { 5878 return -EINVAL; 5879 } 5880 5881 bdev_io = bdev_channel_get_io(channel); 5882 if (!bdev_io) { 5883 return -ENOMEM; 5884 } 5885 5886 bdev_io->internal.ch = channel; 5887 bdev_io->internal.desc = desc; 5888 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 5889 bdev_io->u.bdev.iovs = compare_iov; 5890 bdev_io->u.bdev.iovcnt = compare_iovcnt; 5891 bdev_io->u.bdev.fused_iovs = write_iov; 5892 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 5893 bdev_io->u.bdev.md_buf = NULL; 5894 bdev_io->u.bdev.num_blocks = num_blocks; 5895 bdev_io->u.bdev.offset_blocks = offset_blocks; 5896 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5897 bdev_io->u.bdev.memory_domain = NULL; 5898 bdev_io->u.bdev.memory_domain_ctx = NULL; 5899 bdev_io->u.bdev.accel_sequence = NULL; 5900 5901 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 5902 bdev_io_submit(bdev_io); 5903 return 0; 5904 } 5905 5906 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 5907 bdev_comparev_and_writev_blocks_locked, bdev_io); 5908 } 5909 5910 int 5911 spdk_bdev_zcopy_start(struct spdk_bdev_desc 
*desc, struct spdk_io_channel *ch, 5912 struct iovec *iov, int iovcnt, 5913 uint64_t offset_blocks, uint64_t num_blocks, 5914 bool populate, 5915 spdk_bdev_io_completion_cb cb, void *cb_arg) 5916 { 5917 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5918 struct spdk_bdev_io *bdev_io; 5919 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5920 5921 if (!desc->write) { 5922 return -EBADF; 5923 } 5924 5925 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5926 return -EINVAL; 5927 } 5928 5929 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 5930 return -ENOTSUP; 5931 } 5932 5933 bdev_io = bdev_channel_get_io(channel); 5934 if (!bdev_io) { 5935 return -ENOMEM; 5936 } 5937 5938 bdev_io->internal.ch = channel; 5939 bdev_io->internal.desc = desc; 5940 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 5941 bdev_io->u.bdev.num_blocks = num_blocks; 5942 bdev_io->u.bdev.offset_blocks = offset_blocks; 5943 bdev_io->u.bdev.iovs = iov; 5944 bdev_io->u.bdev.iovcnt = iovcnt; 5945 bdev_io->u.bdev.md_buf = NULL; 5946 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 5947 bdev_io->u.bdev.zcopy.commit = 0; 5948 bdev_io->u.bdev.zcopy.start = 1; 5949 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5950 bdev_io->u.bdev.memory_domain = NULL; 5951 bdev_io->u.bdev.memory_domain_ctx = NULL; 5952 bdev_io->u.bdev.accel_sequence = NULL; 5953 5954 bdev_io_submit(bdev_io); 5955 5956 return 0; 5957 } 5958 5959 int 5960 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 5961 spdk_bdev_io_completion_cb cb, void *cb_arg) 5962 { 5963 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 5964 return -EINVAL; 5965 } 5966 5967 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 5968 bdev_io->u.bdev.zcopy.start = 0; 5969 bdev_io->internal.caller_ctx = cb_arg; 5970 bdev_io->internal.cb = cb; 5971 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 5972 5973 bdev_io_submit(bdev_io); 5974 5975 return 0; 5976 } 5977 5978 int 5979 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5980 uint64_t offset, uint64_t len, 5981 spdk_bdev_io_completion_cb cb, void *cb_arg) 5982 { 5983 uint64_t offset_blocks, num_blocks; 5984 5985 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5986 len, &num_blocks) != 0) { 5987 return -EINVAL; 5988 } 5989 5990 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5991 } 5992 5993 int 5994 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5995 uint64_t offset_blocks, uint64_t num_blocks, 5996 spdk_bdev_io_completion_cb cb, void *cb_arg) 5997 { 5998 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5999 struct spdk_bdev_io *bdev_io; 6000 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6001 6002 if (!desc->write) { 6003 return -EBADF; 6004 } 6005 6006 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6007 return -EINVAL; 6008 } 6009 6010 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 6011 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 6012 return -ENOTSUP; 6013 } 6014 6015 bdev_io = bdev_channel_get_io(channel); 6016 6017 if (!bdev_io) { 6018 return -ENOMEM; 6019 } 6020 6021 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 6022 bdev_io->internal.ch = channel; 6023 bdev_io->internal.desc = desc; 6024 bdev_io->u.bdev.offset_blocks = offset_blocks; 6025 bdev_io->u.bdev.num_blocks = num_blocks; 6026 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6027 bdev_io->u.bdev.memory_domain = NULL; 6028 bdev_io->u.bdev.memory_domain_ctx = NULL; 6029 bdev_io->u.bdev.accel_sequence = NULL; 6030 6031 /* If the write_zeroes size is large and 
should be split, use the generic split 6032 * logic regardless of whether SPDK_BDEV_IO_TYPE_WRITE_ZEREOS is supported or not. 6033 * 6034 * Then, send the write_zeroes request if SPDK_BDEV_IO_TYPE_WRITE_ZEROES is supported 6035 * or emulate it using regular write request otherwise. 6036 */ 6037 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) || 6038 bdev_io->internal.split) { 6039 bdev_io_submit(bdev_io); 6040 return 0; 6041 } 6042 6043 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 6044 6045 return bdev_write_zero_buffer(bdev_io); 6046 } 6047 6048 int 6049 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6050 uint64_t offset, uint64_t nbytes, 6051 spdk_bdev_io_completion_cb cb, void *cb_arg) 6052 { 6053 uint64_t offset_blocks, num_blocks; 6054 6055 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6056 nbytes, &num_blocks) != 0) { 6057 return -EINVAL; 6058 } 6059 6060 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6061 } 6062 6063 int 6064 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6065 uint64_t offset_blocks, uint64_t num_blocks, 6066 spdk_bdev_io_completion_cb cb, void *cb_arg) 6067 { 6068 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6069 struct spdk_bdev_io *bdev_io; 6070 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6071 6072 if (!desc->write) { 6073 return -EBADF; 6074 } 6075 6076 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6077 return -EINVAL; 6078 } 6079 6080 if (num_blocks == 0) { 6081 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 6082 return -EINVAL; 6083 } 6084 6085 bdev_io = bdev_channel_get_io(channel); 6086 if (!bdev_io) { 6087 return -ENOMEM; 6088 } 6089 6090 bdev_io->internal.ch = channel; 6091 bdev_io->internal.desc = desc; 6092 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 6093 6094 bdev_io->u.bdev.iovs = &bdev_io->iov; 6095 bdev_io->u.bdev.iovs[0].iov_base = 
NULL; 6096 bdev_io->u.bdev.iovs[0].iov_len = 0; 6097 bdev_io->u.bdev.iovcnt = 1; 6098 6099 bdev_io->u.bdev.offset_blocks = offset_blocks; 6100 bdev_io->u.bdev.num_blocks = num_blocks; 6101 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6102 bdev_io->u.bdev.memory_domain = NULL; 6103 bdev_io->u.bdev.memory_domain_ctx = NULL; 6104 bdev_io->u.bdev.accel_sequence = NULL; 6105 6106 bdev_io_submit(bdev_io); 6107 return 0; 6108 } 6109 6110 int 6111 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6112 uint64_t offset, uint64_t length, 6113 spdk_bdev_io_completion_cb cb, void *cb_arg) 6114 { 6115 uint64_t offset_blocks, num_blocks; 6116 6117 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 6118 length, &num_blocks) != 0) { 6119 return -EINVAL; 6120 } 6121 6122 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 6123 } 6124 6125 int 6126 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6127 uint64_t offset_blocks, uint64_t num_blocks, 6128 spdk_bdev_io_completion_cb cb, void *cb_arg) 6129 { 6130 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6131 struct spdk_bdev_io *bdev_io; 6132 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6133 6134 if (!desc->write) { 6135 return -EBADF; 6136 } 6137 6138 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 6139 return -EINVAL; 6140 } 6141 6142 bdev_io = bdev_channel_get_io(channel); 6143 if (!bdev_io) { 6144 return -ENOMEM; 6145 } 6146 6147 bdev_io->internal.ch = channel; 6148 bdev_io->internal.desc = desc; 6149 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 6150 bdev_io->u.bdev.iovs = NULL; 6151 bdev_io->u.bdev.iovcnt = 0; 6152 bdev_io->u.bdev.offset_blocks = offset_blocks; 6153 bdev_io->u.bdev.num_blocks = num_blocks; 6154 bdev_io->u.bdev.memory_domain = NULL; 6155 bdev_io->u.bdev.memory_domain_ctx = NULL; 6156 bdev_io->u.bdev.accel_sequence = NULL; 6157 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6158 
6159 bdev_io_submit(bdev_io); 6160 return 0; 6161 } 6162 6163 static int bdev_reset_poll_for_outstanding_io(void *ctx); 6164 6165 static void 6166 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 6167 { 6168 struct spdk_bdev_channel *ch = _ctx; 6169 struct spdk_bdev_io *bdev_io; 6170 6171 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6172 6173 if (status == -EBUSY) { 6174 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 6175 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 6176 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 6177 } else { 6178 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6179 6180 if (TAILQ_EMPTY(&ch->io_memory_domain) && TAILQ_EMPTY(&ch->io_accel_exec)) { 6181 /* If outstanding IOs are still present and reset_io_drain_timeout 6182 * seconds passed, start the reset. */ 6183 bdev_io_submit_reset(bdev_io); 6184 } else { 6185 /* We still have in progress memory domain pull/push or we're 6186 * executing accel sequence. Since we cannot abort either of those 6187 * operaions, fail the reset request. */ 6188 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 6189 } 6190 } 6191 } else { 6192 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6193 SPDK_DEBUGLOG(bdev, 6194 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 6195 ch->bdev->name); 6196 /* Mark the completion status as a SUCCESS and complete the reset. 
*/ 6197 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 6198 } 6199 } 6200 6201 static void 6202 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6203 struct spdk_io_channel *io_ch, void *_ctx) 6204 { 6205 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 6206 int status = 0; 6207 6208 if (cur_ch->io_outstanding > 0 || 6209 !TAILQ_EMPTY(&cur_ch->io_memory_domain) || 6210 !TAILQ_EMPTY(&cur_ch->io_accel_exec)) { 6211 /* If a channel has outstanding IO, set status to -EBUSY code. This will stop 6212 * further iteration over the rest of the channels and pass non-zero status 6213 * to the callback function. */ 6214 status = -EBUSY; 6215 } 6216 spdk_bdev_for_each_channel_continue(i, status); 6217 } 6218 6219 static int 6220 bdev_reset_poll_for_outstanding_io(void *ctx) 6221 { 6222 struct spdk_bdev_channel *ch = ctx; 6223 struct spdk_bdev_io *bdev_io; 6224 6225 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6226 6227 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 6228 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6229 bdev_reset_check_outstanding_io_done); 6230 6231 return SPDK_POLLER_BUSY; 6232 } 6233 6234 static void 6235 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 6236 { 6237 struct spdk_bdev_channel *ch = _ctx; 6238 struct spdk_bdev_io *bdev_io; 6239 6240 bdev_io = TAILQ_FIRST(&ch->queued_resets); 6241 6242 if (bdev->reset_io_drain_timeout == 0) { 6243 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 6244 6245 bdev_io_submit_reset(bdev_io); 6246 return; 6247 } 6248 6249 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 6250 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 6251 6252 /* In case bdev->reset_io_drain_timeout is not equal to zero, 6253 * submit the reset to the underlying module only if outstanding I/O 6254 * remain after reset_io_drain_timeout seconds have passed. 
*/ 6255 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 6256 bdev_reset_check_outstanding_io_done); 6257 } 6258 6259 static void 6260 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6261 struct spdk_io_channel *ch, void *_ctx) 6262 { 6263 struct spdk_bdev_channel *channel; 6264 struct spdk_bdev_mgmt_channel *mgmt_channel; 6265 struct spdk_bdev_shared_resource *shared_resource; 6266 bdev_io_tailq_t tmp_queued; 6267 6268 TAILQ_INIT(&tmp_queued); 6269 6270 channel = __io_ch_to_bdev_ch(ch); 6271 shared_resource = channel->shared_resource; 6272 mgmt_channel = shared_resource->mgmt_ch; 6273 6274 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 6275 6276 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 6277 /* The QoS object is always valid and readable while 6278 * the channel flag is set, so the lock here should not 6279 * be necessary. We're not in the fast path though, so 6280 * just take it anyway. */ 6281 spdk_spin_lock(&channel->bdev->internal.spinlock); 6282 if (channel->bdev->internal.qos->ch == channel) { 6283 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 6284 } 6285 spdk_spin_unlock(&channel->bdev->internal.spinlock); 6286 } 6287 6288 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 6289 bdev_abort_all_buf_io(mgmt_channel, channel); 6290 bdev_abort_all_queued_io(&tmp_queued, channel); 6291 6292 spdk_bdev_for_each_channel_continue(i, 0); 6293 } 6294 6295 static void 6296 bdev_start_reset(void *ctx) 6297 { 6298 struct spdk_bdev_channel *ch = ctx; 6299 6300 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, 6301 bdev_reset_freeze_channel_done); 6302 } 6303 6304 static void 6305 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 6306 { 6307 struct spdk_bdev *bdev = ch->bdev; 6308 6309 assert(!TAILQ_EMPTY(&ch->queued_resets)); 6310 6311 spdk_spin_lock(&bdev->internal.spinlock); 6312 if (bdev->internal.reset_in_progress == NULL) 
{ 6313 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 6314 /* 6315 * Take a channel reference for the target bdev for the life of this 6316 * reset. This guards against the channel getting destroyed while 6317 * spdk_bdev_for_each_channel() calls related to this reset IO are in 6318 * progress. We will release the reference when this reset is 6319 * completed. 6320 */ 6321 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 6322 bdev_start_reset(ch); 6323 } 6324 spdk_spin_unlock(&bdev->internal.spinlock); 6325 } 6326 6327 int 6328 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6329 spdk_bdev_io_completion_cb cb, void *cb_arg) 6330 { 6331 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6332 struct spdk_bdev_io *bdev_io; 6333 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6334 6335 bdev_io = bdev_channel_get_io(channel); 6336 if (!bdev_io) { 6337 return -ENOMEM; 6338 } 6339 6340 bdev_io->internal.ch = channel; 6341 bdev_io->internal.desc = desc; 6342 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6343 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 6344 bdev_io->u.reset.ch_ref = NULL; 6345 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6346 6347 spdk_spin_lock(&bdev->internal.spinlock); 6348 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 6349 spdk_spin_unlock(&bdev->internal.spinlock); 6350 6351 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 6352 internal.ch_link); 6353 6354 bdev_channel_start_reset(channel); 6355 6356 return 0; 6357 } 6358 6359 void 6360 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6361 struct spdk_bdev_io_stat *stat) 6362 { 6363 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6364 6365 bdev_get_io_stat(stat, channel->stat); 6366 } 6367 6368 static void 6369 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6370 { 6371 struct spdk_bdev_iostat_ctx 
*bdev_iostat_ctx = _ctx; 6372 6373 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 6374 bdev_iostat_ctx->cb_arg, 0); 6375 free(bdev_iostat_ctx); 6376 } 6377 6378 static void 6379 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6380 struct spdk_io_channel *ch, void *_ctx) 6381 { 6382 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 6383 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6384 6385 spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 6386 spdk_bdev_for_each_channel_continue(i, 0); 6387 } 6388 6389 void 6390 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 6391 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 6392 { 6393 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 6394 6395 assert(bdev != NULL); 6396 assert(stat != NULL); 6397 assert(cb != NULL); 6398 6399 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 6400 if (bdev_iostat_ctx == NULL) { 6401 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 6402 cb(bdev, stat, cb_arg, -ENOMEM); 6403 return; 6404 } 6405 6406 bdev_iostat_ctx->stat = stat; 6407 bdev_iostat_ctx->cb = cb; 6408 bdev_iostat_ctx->cb_arg = cb_arg; 6409 6410 /* Start with the statistics from previously deleted channels. */ 6411 spdk_spin_lock(&bdev->internal.spinlock); 6412 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 6413 spdk_spin_unlock(&bdev->internal.spinlock); 6414 6415 /* Then iterate and add the statistics from each existing channel. 
*/ 6416 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 6417 bdev_get_device_stat_done); 6418 } 6419 6420 struct bdev_iostat_reset_ctx { 6421 enum spdk_bdev_reset_stat_mode mode; 6422 bdev_reset_device_stat_cb cb; 6423 void *cb_arg; 6424 }; 6425 6426 static void 6427 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 6428 { 6429 struct bdev_iostat_reset_ctx *ctx = _ctx; 6430 6431 ctx->cb(bdev, ctx->cb_arg, 0); 6432 6433 free(ctx); 6434 } 6435 6436 static void 6437 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6438 struct spdk_io_channel *ch, void *_ctx) 6439 { 6440 struct bdev_iostat_reset_ctx *ctx = _ctx; 6441 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6442 6443 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 6444 6445 spdk_bdev_for_each_channel_continue(i, 0); 6446 } 6447 6448 void 6449 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 6450 bdev_reset_device_stat_cb cb, void *cb_arg) 6451 { 6452 struct bdev_iostat_reset_ctx *ctx; 6453 6454 assert(bdev != NULL); 6455 assert(cb != NULL); 6456 6457 ctx = calloc(1, sizeof(*ctx)); 6458 if (ctx == NULL) { 6459 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 6460 cb(bdev, cb_arg, -ENOMEM); 6461 return; 6462 } 6463 6464 ctx->mode = mode; 6465 ctx->cb = cb; 6466 ctx->cb_arg = cb_arg; 6467 6468 spdk_spin_lock(&bdev->internal.spinlock); 6469 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 6470 spdk_spin_unlock(&bdev->internal.spinlock); 6471 6472 spdk_bdev_for_each_channel(bdev, 6473 bdev_reset_each_channel_stat, 6474 ctx, 6475 bdev_reset_device_stat_done); 6476 } 6477 6478 int 6479 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6480 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6481 spdk_bdev_io_completion_cb cb, void *cb_arg) 6482 { 6483 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6484 struct 
spdk_bdev_io *bdev_io; 6485 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6486 6487 if (!desc->write) { 6488 return -EBADF; 6489 } 6490 6491 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 6492 return -ENOTSUP; 6493 } 6494 6495 bdev_io = bdev_channel_get_io(channel); 6496 if (!bdev_io) { 6497 return -ENOMEM; 6498 } 6499 6500 bdev_io->internal.ch = channel; 6501 bdev_io->internal.desc = desc; 6502 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 6503 bdev_io->u.nvme_passthru.cmd = *cmd; 6504 bdev_io->u.nvme_passthru.buf = buf; 6505 bdev_io->u.nvme_passthru.nbytes = nbytes; 6506 bdev_io->u.nvme_passthru.md_buf = NULL; 6507 bdev_io->u.nvme_passthru.md_len = 0; 6508 6509 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6510 6511 bdev_io_submit(bdev_io); 6512 return 0; 6513 } 6514 6515 int 6516 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6517 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 6518 spdk_bdev_io_completion_cb cb, void *cb_arg) 6519 { 6520 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6521 struct spdk_bdev_io *bdev_io; 6522 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6523 6524 if (!desc->write) { 6525 /* 6526 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6527 * to easily determine if the command is a read or write, but for now just 6528 * do not allow io_passthru with a read-only descriptor. 
6529 */ 6530 return -EBADF; 6531 } 6532 6533 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 6534 return -ENOTSUP; 6535 } 6536 6537 bdev_io = bdev_channel_get_io(channel); 6538 if (!bdev_io) { 6539 return -ENOMEM; 6540 } 6541 6542 bdev_io->internal.ch = channel; 6543 bdev_io->internal.desc = desc; 6544 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 6545 bdev_io->u.nvme_passthru.cmd = *cmd; 6546 bdev_io->u.nvme_passthru.buf = buf; 6547 bdev_io->u.nvme_passthru.nbytes = nbytes; 6548 bdev_io->u.nvme_passthru.md_buf = NULL; 6549 bdev_io->u.nvme_passthru.md_len = 0; 6550 6551 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6552 6553 bdev_io_submit(bdev_io); 6554 return 0; 6555 } 6556 6557 int 6558 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6559 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 6560 spdk_bdev_io_completion_cb cb, void *cb_arg) 6561 { 6562 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6563 struct spdk_bdev_io *bdev_io; 6564 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6565 6566 if (!desc->write) { 6567 /* 6568 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6569 * to easily determine if the command is a read or write, but for now just 6570 * do not allow io_passthru with a read-only descriptor. 
6571 */ 6572 return -EBADF; 6573 } 6574 6575 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6576 return -ENOTSUP; 6577 } 6578 6579 bdev_io = bdev_channel_get_io(channel); 6580 if (!bdev_io) { 6581 return -ENOMEM; 6582 } 6583 6584 bdev_io->internal.ch = channel; 6585 bdev_io->internal.desc = desc; 6586 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6587 bdev_io->u.nvme_passthru.cmd = *cmd; 6588 bdev_io->u.nvme_passthru.buf = buf; 6589 bdev_io->u.nvme_passthru.nbytes = nbytes; 6590 bdev_io->u.nvme_passthru.md_buf = md_buf; 6591 bdev_io->u.nvme_passthru.md_len = md_len; 6592 6593 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6594 6595 bdev_io_submit(bdev_io); 6596 return 0; 6597 } 6598 6599 static void bdev_abort_retry(void *ctx); 6600 static void bdev_abort(struct spdk_bdev_io *parent_io); 6601 6602 static void 6603 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6604 { 6605 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6606 struct spdk_bdev_io *parent_io = cb_arg; 6607 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6608 6609 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6610 6611 spdk_bdev_free_io(bdev_io); 6612 6613 if (!success) { 6614 /* Check if the target I/O completed in the meantime. */ 6615 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6616 if (tmp_io == bio_to_abort) { 6617 break; 6618 } 6619 } 6620 6621 /* If the target I/O still exists, set the parent to failed. 
*/ 6622 if (tmp_io != NULL) { 6623 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6624 } 6625 } 6626 6627 parent_io->u.bdev.split_outstanding--; 6628 if (parent_io->u.bdev.split_outstanding == 0) { 6629 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6630 bdev_abort_retry(parent_io); 6631 } else { 6632 bdev_io_complete(parent_io); 6633 } 6634 } 6635 } 6636 6637 static int 6638 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6639 struct spdk_bdev_io *bio_to_abort, 6640 spdk_bdev_io_completion_cb cb, void *cb_arg) 6641 { 6642 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6643 struct spdk_bdev_io *bdev_io; 6644 6645 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 6646 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 6647 /* TODO: Abort reset or abort request. */ 6648 return -ENOTSUP; 6649 } 6650 6651 bdev_io = bdev_channel_get_io(channel); 6652 if (bdev_io == NULL) { 6653 return -ENOMEM; 6654 } 6655 6656 bdev_io->internal.ch = channel; 6657 bdev_io->internal.desc = desc; 6658 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6659 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6660 6661 if (bdev->split_on_optimal_io_boundary && bio_to_abort->internal.split) { 6662 assert(bdev_io_should_split(bio_to_abort)); 6663 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 6664 6665 /* Parent abort request is not submitted directly, but to manage its 6666 * execution add it to the submitted list here. 6667 */ 6668 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6669 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6670 6671 bdev_abort(bdev_io); 6672 6673 return 0; 6674 } 6675 6676 bdev_io->u.abort.bio_to_abort = bio_to_abort; 6677 6678 /* Submit the abort request to the underlying bdev module. 
*/ 6679 bdev_io_submit(bdev_io); 6680 6681 return 0; 6682 } 6683 6684 static bool 6685 bdev_io_on_tailq(struct spdk_bdev_io *bdev_io, bdev_io_tailq_t *tailq) 6686 { 6687 struct spdk_bdev_io *iter; 6688 6689 TAILQ_FOREACH(iter, tailq, internal.link) { 6690 if (iter == bdev_io) { 6691 return true; 6692 } 6693 } 6694 6695 return false; 6696 } 6697 6698 static uint32_t 6699 _bdev_abort(struct spdk_bdev_io *parent_io) 6700 { 6701 struct spdk_bdev_desc *desc = parent_io->internal.desc; 6702 struct spdk_bdev_channel *channel = parent_io->internal.ch; 6703 void *bio_cb_arg; 6704 struct spdk_bdev_io *bio_to_abort; 6705 uint32_t matched_ios; 6706 int rc; 6707 6708 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 6709 6710 /* matched_ios is returned and will be kept by the caller. 6711 * 6712 * This function will be used for two cases, 1) the same cb_arg is used for 6713 * multiple I/Os, 2) a single large I/O is split into smaller ones. 6714 * Incrementing split_outstanding directly here may confuse readers especially 6715 * for the 1st case. 6716 * 6717 * Completion of I/O abort is processed after stack unwinding. Hence this trick 6718 * works as expected. 6719 */ 6720 matched_ios = 0; 6721 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6722 6723 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 6724 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 6725 continue; 6726 } 6727 6728 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 6729 /* Any I/O which was submitted after this abort command should be excluded. 
*/ 6730 continue; 6731 } 6732 6733 /* We can't abort a request that's being pushed/pulled or executed by accel */ 6734 if (bdev_io_on_tailq(bio_to_abort, &channel->io_accel_exec) || 6735 bdev_io_on_tailq(bio_to_abort, &channel->io_memory_domain)) { 6736 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6737 break; 6738 } 6739 6740 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 6741 if (rc != 0) { 6742 if (rc == -ENOMEM) { 6743 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 6744 } else { 6745 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6746 } 6747 break; 6748 } 6749 matched_ios++; 6750 } 6751 6752 return matched_ios; 6753 } 6754 6755 static void 6756 bdev_abort_retry(void *ctx) 6757 { 6758 struct spdk_bdev_io *parent_io = ctx; 6759 uint32_t matched_ios; 6760 6761 matched_ios = _bdev_abort(parent_io); 6762 6763 if (matched_ios == 0) { 6764 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6765 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6766 } else { 6767 /* For retry, the case that no target I/O was found is success 6768 * because it means target I/Os completed in the meantime. 6769 */ 6770 bdev_io_complete(parent_io); 6771 } 6772 return; 6773 } 6774 6775 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6776 parent_io->u.bdev.split_outstanding = matched_ios; 6777 } 6778 6779 static void 6780 bdev_abort(struct spdk_bdev_io *parent_io) 6781 { 6782 uint32_t matched_ios; 6783 6784 matched_ios = _bdev_abort(parent_io); 6785 6786 if (matched_ios == 0) { 6787 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6788 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6789 } else { 6790 /* The case the no target I/O was found is failure. */ 6791 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6792 bdev_io_complete(parent_io); 6793 } 6794 return; 6795 } 6796 6797 /* Use split_outstanding to manage the progress of aborting I/Os. 
*/ 6798 parent_io->u.bdev.split_outstanding = matched_ios; 6799 } 6800 6801 int 6802 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6803 void *bio_cb_arg, 6804 spdk_bdev_io_completion_cb cb, void *cb_arg) 6805 { 6806 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6807 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6808 struct spdk_bdev_io *bdev_io; 6809 6810 if (bio_cb_arg == NULL) { 6811 return -EINVAL; 6812 } 6813 6814 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 6815 return -ENOTSUP; 6816 } 6817 6818 bdev_io = bdev_channel_get_io(channel); 6819 if (bdev_io == NULL) { 6820 return -ENOMEM; 6821 } 6822 6823 bdev_io->internal.ch = channel; 6824 bdev_io->internal.desc = desc; 6825 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6826 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6827 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6828 6829 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 6830 6831 /* Parent abort request is not submitted directly, but to manage its execution, 6832 * add it to the submitted list here. 
6833 */ 6834 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6835 6836 bdev_abort(bdev_io); 6837 6838 return 0; 6839 } 6840 6841 int 6842 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6843 struct spdk_bdev_io_wait_entry *entry) 6844 { 6845 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6846 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 6847 6848 if (bdev != entry->bdev) { 6849 SPDK_ERRLOG("bdevs do not match\n"); 6850 return -EINVAL; 6851 } 6852 6853 if (mgmt_ch->per_thread_cache_count > 0) { 6854 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 6855 return -EINVAL; 6856 } 6857 6858 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 6859 return 0; 6860 } 6861 6862 static inline void 6863 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 6864 { 6865 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 6866 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 6867 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 6868 uint32_t blocklen = bdev_io->bdev->blocklen; 6869 6870 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 6871 switch (bdev_io->type) { 6872 case SPDK_BDEV_IO_TYPE_READ: 6873 io_stat->bytes_read += num_blocks * blocklen; 6874 io_stat->num_read_ops++; 6875 io_stat->read_latency_ticks += tsc_diff; 6876 if (io_stat->max_read_latency_ticks < tsc_diff) { 6877 io_stat->max_read_latency_ticks = tsc_diff; 6878 } 6879 if (io_stat->min_read_latency_ticks > tsc_diff) { 6880 io_stat->min_read_latency_ticks = tsc_diff; 6881 } 6882 break; 6883 case SPDK_BDEV_IO_TYPE_WRITE: 6884 io_stat->bytes_written += num_blocks * blocklen; 6885 io_stat->num_write_ops++; 6886 io_stat->write_latency_ticks += tsc_diff; 6887 if (io_stat->max_write_latency_ticks < tsc_diff) { 6888 io_stat->max_write_latency_ticks = tsc_diff; 6889 } 6890 if (io_stat->min_write_latency_ticks > tsc_diff) { 6891 
io_stat->min_write_latency_ticks = tsc_diff; 6892 } 6893 break; 6894 case SPDK_BDEV_IO_TYPE_UNMAP: 6895 io_stat->bytes_unmapped += num_blocks * blocklen; 6896 io_stat->num_unmap_ops++; 6897 io_stat->unmap_latency_ticks += tsc_diff; 6898 if (io_stat->max_unmap_latency_ticks < tsc_diff) { 6899 io_stat->max_unmap_latency_ticks = tsc_diff; 6900 } 6901 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 6902 io_stat->min_unmap_latency_ticks = tsc_diff; 6903 } 6904 break; 6905 case SPDK_BDEV_IO_TYPE_ZCOPY: 6906 /* Track the data in the start phase only */ 6907 if (bdev_io->u.bdev.zcopy.start) { 6908 if (bdev_io->u.bdev.zcopy.populate) { 6909 io_stat->bytes_read += num_blocks * blocklen; 6910 io_stat->num_read_ops++; 6911 io_stat->read_latency_ticks += tsc_diff; 6912 if (io_stat->max_read_latency_ticks < tsc_diff) { 6913 io_stat->max_read_latency_ticks = tsc_diff; 6914 } 6915 if (io_stat->min_read_latency_ticks > tsc_diff) { 6916 io_stat->min_read_latency_ticks = tsc_diff; 6917 } 6918 } else { 6919 io_stat->bytes_written += num_blocks * blocklen; 6920 io_stat->num_write_ops++; 6921 io_stat->write_latency_ticks += tsc_diff; 6922 if (io_stat->max_write_latency_ticks < tsc_diff) { 6923 io_stat->max_write_latency_ticks = tsc_diff; 6924 } 6925 if (io_stat->min_write_latency_ticks > tsc_diff) { 6926 io_stat->min_write_latency_ticks = tsc_diff; 6927 } 6928 } 6929 } 6930 break; 6931 case SPDK_BDEV_IO_TYPE_COPY: 6932 io_stat->bytes_copied += num_blocks * blocklen; 6933 io_stat->num_copy_ops++; 6934 bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; 6935 if (io_stat->max_copy_latency_ticks < tsc_diff) { 6936 io_stat->max_copy_latency_ticks = tsc_diff; 6937 } 6938 if (io_stat->min_copy_latency_ticks > tsc_diff) { 6939 io_stat->min_copy_latency_ticks = tsc_diff; 6940 } 6941 break; 6942 default: 6943 break; 6944 } 6945 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 6946 io_stat = bdev_io->bdev->internal.stat; 6947 
assert(io_stat->io_error != NULL); 6948 6949 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 6950 io_stat->io_error->error_status[-io_status - 1]++; 6951 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 6952 } 6953 6954 #ifdef SPDK_CONFIG_VTUNE 6955 uint64_t now_tsc = spdk_get_ticks(); 6956 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 6957 uint64_t data[5]; 6958 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 6959 6960 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 6961 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 6962 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 6963 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 6964 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 6965 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 6966 6967 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 6968 __itt_metadata_u64, 5, data); 6969 6970 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 6971 bdev_io->internal.ch->start_tsc = now_tsc; 6972 } 6973 #endif 6974 } 6975 6976 static inline void 6977 _bdev_io_complete(void *ctx) 6978 { 6979 struct spdk_bdev_io *bdev_io = ctx; 6980 6981 if (spdk_unlikely(bdev_io->internal.accel_sequence != NULL)) { 6982 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 6983 spdk_accel_sequence_abort(bdev_io->internal.accel_sequence); 6984 } 6985 6986 assert(bdev_io->internal.cb != NULL); 6987 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 6988 6989 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 6990 bdev_io->internal.caller_ctx); 6991 } 6992 6993 static inline void 6994 bdev_io_complete(void *ctx) 6995 { 6996 struct spdk_bdev_io *bdev_io = ctx; 6997 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6998 uint64_t tsc, tsc_diff; 6999 7000 if 
(spdk_unlikely(bdev_io->internal.in_submit_request)) { 7001 /* 7002 * Defer completion to avoid potential infinite recursion if the 7003 * user's completion callback issues a new I/O. 7004 */ 7005 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7006 bdev_io_complete, bdev_io); 7007 return; 7008 } 7009 7010 tsc = spdk_get_ticks(); 7011 tsc_diff = tsc - bdev_io->internal.submit_tsc; 7012 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 7013 bdev_io->internal.caller_ctx); 7014 7015 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 7016 7017 if (bdev_io->internal.ch->histogram) { 7018 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 7019 } 7020 7021 bdev_io_update_io_stat(bdev_io, tsc_diff); 7022 _bdev_io_complete(bdev_io); 7023 } 7024 7025 /* The difference between this function and bdev_io_complete() is that this should be called to 7026 * complete IOs that haven't been submitted via bdev_io_submit(), as they weren't added onto the 7027 * io_submitted list and don't have submit_tsc updated. 7028 */ 7029 static inline void 7030 bdev_io_complete_unsubmitted(struct spdk_bdev_io *bdev_io) 7031 { 7032 /* Since the IO hasn't been submitted it's bound to be failed */ 7033 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_SUCCESS); 7034 7035 /* At this point we don't know if the IO is completed from submission context or not, but, 7036 * since this is an error path, we can always do an spdk_thread_send_msg(). 
*/ 7037 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7038 _bdev_io_complete, bdev_io); 7039 } 7040 7041 static void bdev_destroy_cb(void *io_device); 7042 7043 static void 7044 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 7045 { 7046 struct spdk_bdev_io *bdev_io = _ctx; 7047 7048 if (bdev_io->u.reset.ch_ref != NULL) { 7049 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 7050 bdev_io->u.reset.ch_ref = NULL; 7051 } 7052 7053 bdev_io_complete(bdev_io); 7054 7055 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 7056 TAILQ_EMPTY(&bdev->internal.open_descs)) { 7057 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7058 } 7059 } 7060 7061 static void 7062 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7063 struct spdk_io_channel *_ch, void *_ctx) 7064 { 7065 struct spdk_bdev_io *bdev_io = _ctx; 7066 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7067 struct spdk_bdev_io *queued_reset; 7068 7069 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 7070 while (!TAILQ_EMPTY(&ch->queued_resets)) { 7071 queued_reset = TAILQ_FIRST(&ch->queued_resets); 7072 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 7073 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 7074 } 7075 7076 spdk_bdev_for_each_channel_continue(i, 0); 7077 } 7078 7079 static void 7080 bdev_io_complete_sequence_cb(void *ctx, int status) 7081 { 7082 struct spdk_bdev_io *bdev_io = ctx; 7083 7084 /* u.bdev.accel_sequence should have already been cleared at this point */ 7085 assert(bdev_io->u.bdev.accel_sequence == NULL); 7086 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS); 7087 bdev_io->internal.accel_sequence = NULL; 7088 7089 if (spdk_unlikely(status != 0)) { 7090 SPDK_ERRLOG("Failed to execute accel sequence, status=%d\n", status); 7091 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7092 } 7093 7094 bdev_io_complete(bdev_io); 7095 } 7096 7097 void 7098 
spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 7099 { 7100 struct spdk_bdev *bdev = bdev_io->bdev; 7101 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 7102 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 7103 7104 if (bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING) { 7105 SPDK_ERRLOG("Unexpected completion on IO from %s module, status was %s\n", 7106 spdk_bdev_get_module_name(bdev), 7107 bdev_io_status_get_string(bdev_io->internal.status)); 7108 assert(false); 7109 } 7110 bdev_io->internal.status = status; 7111 7112 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 7113 bool unlock_channels = false; 7114 7115 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 7116 SPDK_ERRLOG("NOMEM returned for reset\n"); 7117 } 7118 spdk_spin_lock(&bdev->internal.spinlock); 7119 if (bdev_io == bdev->internal.reset_in_progress) { 7120 bdev->internal.reset_in_progress = NULL; 7121 unlock_channels = true; 7122 } 7123 spdk_spin_unlock(&bdev->internal.spinlock); 7124 7125 if (unlock_channels) { 7126 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 7127 bdev_reset_complete); 7128 return; 7129 } 7130 } else { 7131 bdev_io_decrement_outstanding(bdev_ch, shared_resource); 7132 if (spdk_likely(status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 7133 if (bdev_io_needs_sequence_exec(bdev_io->internal.desc, bdev_io)) { 7134 bdev_io_exec_sequence(bdev_io, bdev_io_complete_sequence_cb); 7135 return; 7136 } else if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0)) { 7137 _bdev_io_push_bounce_data_buffer(bdev_io, 7138 _bdev_io_complete_push_bounce_done); 7139 /* bdev IO will be completed in the callback */ 7140 return; 7141 } 7142 } 7143 7144 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io, BDEV_IO_RETRY_STATE_SUBMIT))) { 7145 return; 7146 } 7147 } 7148 7149 bdev_io_complete(bdev_io); 7150 } 7151 7152 void 7153 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status 
sc, 7154 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 7155 { 7156 enum spdk_bdev_io_status status; 7157 7158 if (sc == SPDK_SCSI_STATUS_GOOD) { 7159 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7160 } else { 7161 status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 7162 bdev_io->internal.error.scsi.sc = sc; 7163 bdev_io->internal.error.scsi.sk = sk; 7164 bdev_io->internal.error.scsi.asc = asc; 7165 bdev_io->internal.error.scsi.ascq = ascq; 7166 } 7167 7168 spdk_bdev_io_complete(bdev_io, status); 7169 } 7170 7171 void 7172 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 7173 int *sc, int *sk, int *asc, int *ascq) 7174 { 7175 assert(sc != NULL); 7176 assert(sk != NULL); 7177 assert(asc != NULL); 7178 assert(ascq != NULL); 7179 7180 switch (bdev_io->internal.status) { 7181 case SPDK_BDEV_IO_STATUS_SUCCESS: 7182 *sc = SPDK_SCSI_STATUS_GOOD; 7183 *sk = SPDK_SCSI_SENSE_NO_SENSE; 7184 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7185 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7186 break; 7187 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 7188 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 7189 break; 7190 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 7191 *sc = bdev_io->internal.error.scsi.sc; 7192 *sk = bdev_io->internal.error.scsi.sk; 7193 *asc = bdev_io->internal.error.scsi.asc; 7194 *ascq = bdev_io->internal.error.scsi.ascq; 7195 break; 7196 default: 7197 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 7198 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 7199 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 7200 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 7201 break; 7202 } 7203 } 7204 7205 void 7206 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 7207 { 7208 enum spdk_bdev_io_status status; 7209 7210 if (aio_result == 0) { 7211 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7212 } else { 7213 status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 7214 } 7215 7216 bdev_io->internal.error.aio_result = aio_result; 7217 7218 spdk_bdev_io_complete(bdev_io, status); 7219 } 7220 7221 void 7222 
spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 7223 { 7224 assert(aio_result != NULL); 7225 7226 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 7227 *aio_result = bdev_io->internal.error.aio_result; 7228 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7229 *aio_result = 0; 7230 } else { 7231 *aio_result = -EIO; 7232 } 7233 } 7234 7235 void 7236 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 7237 { 7238 enum spdk_bdev_io_status status; 7239 7240 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 7241 status = SPDK_BDEV_IO_STATUS_SUCCESS; 7242 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 7243 status = SPDK_BDEV_IO_STATUS_ABORTED; 7244 } else { 7245 status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 7246 } 7247 7248 bdev_io->internal.error.nvme.cdw0 = cdw0; 7249 bdev_io->internal.error.nvme.sct = sct; 7250 bdev_io->internal.error.nvme.sc = sc; 7251 7252 spdk_bdev_io_complete(bdev_io, status); 7253 } 7254 7255 void 7256 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 7257 { 7258 assert(sct != NULL); 7259 assert(sc != NULL); 7260 assert(cdw0 != NULL); 7261 7262 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 7263 *sct = SPDK_NVME_SCT_GENERIC; 7264 *sc = SPDK_NVME_SC_SUCCESS; 7265 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7266 *cdw0 = 0; 7267 } else { 7268 *cdw0 = 1U; 7269 } 7270 return; 7271 } 7272 7273 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7274 *sct = bdev_io->internal.error.nvme.sct; 7275 *sc = bdev_io->internal.error.nvme.sc; 7276 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7277 *sct = SPDK_NVME_SCT_GENERIC; 7278 *sc = SPDK_NVME_SC_SUCCESS; 7279 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7280 *sct = SPDK_NVME_SCT_GENERIC; 7281 *sc = 
SPDK_NVME_SC_ABORTED_BY_REQUEST; 7282 } else { 7283 *sct = SPDK_NVME_SCT_GENERIC; 7284 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7285 } 7286 7287 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7288 } 7289 7290 void 7291 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 7292 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 7293 { 7294 assert(first_sct != NULL); 7295 assert(first_sc != NULL); 7296 assert(second_sct != NULL); 7297 assert(second_sc != NULL); 7298 assert(cdw0 != NULL); 7299 7300 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 7301 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 7302 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 7303 *first_sct = bdev_io->internal.error.nvme.sct; 7304 *first_sc = bdev_io->internal.error.nvme.sc; 7305 *second_sct = SPDK_NVME_SCT_GENERIC; 7306 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7307 } else { 7308 *first_sct = SPDK_NVME_SCT_GENERIC; 7309 *first_sc = SPDK_NVME_SC_SUCCESS; 7310 *second_sct = bdev_io->internal.error.nvme.sct; 7311 *second_sc = bdev_io->internal.error.nvme.sc; 7312 } 7313 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 7314 *first_sct = SPDK_NVME_SCT_GENERIC; 7315 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7316 *second_sct = SPDK_NVME_SCT_GENERIC; 7317 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 7318 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 7319 *first_sct = SPDK_NVME_SCT_GENERIC; 7320 *first_sc = SPDK_NVME_SC_SUCCESS; 7321 *second_sct = SPDK_NVME_SCT_GENERIC; 7322 *second_sc = SPDK_NVME_SC_SUCCESS; 7323 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 7324 *first_sct = SPDK_NVME_SCT_GENERIC; 7325 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7326 *second_sct = SPDK_NVME_SCT_GENERIC; 7327 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7328 } else if (bdev_io->internal.status == 
SPDK_BDEV_IO_STATUS_MISCOMPARE) { 7329 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 7330 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 7331 *second_sct = SPDK_NVME_SCT_GENERIC; 7332 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 7333 } else { 7334 *first_sct = SPDK_NVME_SCT_GENERIC; 7335 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7336 *second_sct = SPDK_NVME_SCT_GENERIC; 7337 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 7338 } 7339 7340 *cdw0 = bdev_io->internal.error.nvme.cdw0; 7341 } 7342 7343 struct spdk_thread * 7344 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 7345 { 7346 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 7347 } 7348 7349 struct spdk_io_channel * 7350 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 7351 { 7352 return bdev_io->internal.ch->channel; 7353 } 7354 7355 static int 7356 bdev_register(struct spdk_bdev *bdev) 7357 { 7358 char *bdev_name; 7359 char uuid[SPDK_UUID_STRING_LEN]; 7360 struct spdk_iobuf_opts iobuf_opts; 7361 int ret, i; 7362 7363 assert(bdev->module != NULL); 7364 7365 if (!bdev->name) { 7366 SPDK_ERRLOG("Bdev name is NULL\n"); 7367 return -EINVAL; 7368 } 7369 7370 if (!strlen(bdev->name)) { 7371 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 7372 return -EINVAL; 7373 } 7374 7375 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7376 if (bdev->fn_table->accel_sequence_supported == NULL) { 7377 continue; 7378 } 7379 if (!bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7380 (enum spdk_bdev_io_type)i)) { 7381 continue; 7382 } 7383 7384 if (spdk_bdev_get_memory_domains(bdev, NULL, 0) <= 0) { 7385 SPDK_ERRLOG("bdev supporting accel sequence is required to support " 7386 "memory domains\n"); 7387 return -EINVAL; 7388 } 7389 7390 if (spdk_bdev_is_md_separate(bdev)) { 7391 SPDK_ERRLOG("Separate metadata is currently unsupported for bdevs with " 7392 "accel sequence support\n"); 7393 return -EINVAL; 7394 } 7395 } 7396 7397 /* Users often register their own I/O devices using the bdev name. 
In 7398 * order to avoid conflicts, prepend bdev_. */ 7399 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 7400 if (!bdev_name) { 7401 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 7402 return -ENOMEM; 7403 } 7404 7405 bdev->internal.stat = bdev_alloc_io_stat(true); 7406 if (!bdev->internal.stat) { 7407 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 7408 free(bdev_name); 7409 return -ENOMEM; 7410 } 7411 7412 bdev->internal.status = SPDK_BDEV_STATUS_READY; 7413 bdev->internal.measured_queue_depth = UINT64_MAX; 7414 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7415 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 7416 bdev->internal.qd_poller = NULL; 7417 bdev->internal.qos = NULL; 7418 7419 TAILQ_INIT(&bdev->internal.open_descs); 7420 TAILQ_INIT(&bdev->internal.locked_ranges); 7421 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 7422 TAILQ_INIT(&bdev->aliases); 7423 7424 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 7425 if (ret != 0) { 7426 bdev_free_io_stat(bdev->internal.stat); 7427 free(bdev_name); 7428 return ret; 7429 } 7430 7431 /* UUID may be specified by the user or defined by bdev itself. 7432 * Otherwise it will be generated here, so this field will never be empty. 
*/ 7433 if (spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { 7434 spdk_uuid_generate(&bdev->uuid); 7435 } 7436 7437 /* Add the UUID alias only if it's different than the name */ 7438 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7439 if (strcmp(bdev->name, uuid) != 0) { 7440 ret = spdk_bdev_alias_add(bdev, uuid); 7441 if (ret != 0) { 7442 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 7443 bdev_name_del(&bdev->internal.bdev_name); 7444 bdev_free_io_stat(bdev->internal.stat); 7445 free(bdev_name); 7446 return ret; 7447 } 7448 } 7449 7450 if (spdk_bdev_get_buf_align(bdev) > 1) { 7451 if (bdev->split_on_optimal_io_boundary) { 7452 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 7453 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 7454 } else { 7455 bdev->split_on_optimal_io_boundary = true; 7456 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 7457 } 7458 } 7459 7460 /* If the user didn't specify a write unit size, set it to one. 
*/ 7461 if (bdev->write_unit_size == 0) { 7462 bdev->write_unit_size = 1; 7463 } 7464 7465 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 7466 if (bdev->acwu == 0) { 7467 bdev->acwu = bdev->write_unit_size; 7468 } 7469 7470 if (bdev->phys_blocklen == 0) { 7471 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 7472 } 7473 7474 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY)) { 7475 spdk_iobuf_get_opts(&iobuf_opts); 7476 bdev->max_copy = bdev_get_max_write(bdev, iobuf_opts.large_bufsize); 7477 } 7478 7479 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 7480 bdev->max_write_zeroes = bdev_get_max_write(bdev, ZERO_BUFFER_SIZE); 7481 } 7482 7483 bdev->internal.reset_in_progress = NULL; 7484 bdev->internal.qd_poll_in_progress = false; 7485 bdev->internal.period = 0; 7486 bdev->internal.new_period = 0; 7487 7488 spdk_io_device_register(__bdev_to_io_dev(bdev), 7489 bdev_channel_create, bdev_channel_destroy, 7490 sizeof(struct spdk_bdev_channel), 7491 bdev_name); 7492 7493 free(bdev_name); 7494 7495 spdk_spin_init(&bdev->internal.spinlock); 7496 7497 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 7498 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 7499 7500 return 0; 7501 } 7502 7503 static void 7504 bdev_destroy_cb(void *io_device) 7505 { 7506 int rc; 7507 struct spdk_bdev *bdev; 7508 spdk_bdev_unregister_cb cb_fn; 7509 void *cb_arg; 7510 7511 bdev = __bdev_from_io_dev(io_device); 7512 7513 if (bdev->internal.unregister_td != spdk_get_thread()) { 7514 spdk_thread_send_msg(bdev->internal.unregister_td, bdev_destroy_cb, io_device); 7515 return; 7516 } 7517 7518 cb_fn = bdev->internal.unregister_cb; 7519 cb_arg = bdev->internal.unregister_ctx; 7520 7521 spdk_spin_destroy(&bdev->internal.spinlock); 7522 free(bdev->internal.qos); 7523 bdev_free_io_stat(bdev->internal.stat); 7524 7525 rc = bdev->fn_table->destruct(bdev->ctxt); 7526 if (rc < 0) { 7527 
SPDK_ERRLOG("destruct failed\n"); 7528 } 7529 if (rc <= 0 && cb_fn != NULL) { 7530 cb_fn(cb_arg, rc); 7531 } 7532 } 7533 7534 void 7535 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 7536 { 7537 if (bdev->internal.unregister_cb != NULL) { 7538 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 7539 } 7540 } 7541 7542 static void 7543 _remove_notify(void *arg) 7544 { 7545 struct spdk_bdev_desc *desc = arg; 7546 7547 _event_notify(desc, SPDK_BDEV_EVENT_REMOVE); 7548 } 7549 7550 /* returns: 0 - bdev removed and ready to be destructed. 7551 * -EBUSY - bdev can't be destructed yet. */ 7552 static int 7553 bdev_unregister_unsafe(struct spdk_bdev *bdev) 7554 { 7555 struct spdk_bdev_desc *desc, *tmp; 7556 int rc = 0; 7557 char uuid[SPDK_UUID_STRING_LEN]; 7558 7559 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 7560 assert(spdk_spin_held(&bdev->internal.spinlock)); 7561 7562 /* Notify each descriptor about hotremoval */ 7563 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 7564 rc = -EBUSY; 7565 /* 7566 * Defer invocation of the event_cb to a separate message that will 7567 * run later on its thread. This ensures this context unwinds and 7568 * we don't recursively unregister this bdev again if the event_cb 7569 * immediately closes its descriptor. 
7570 */ 7571 event_notify(desc, _remove_notify); 7572 } 7573 7574 /* If there are no descriptors, proceed removing the bdev */ 7575 if (rc == 0) { 7576 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 7577 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 7578 7579 /* Delete the name and the UUID alias */ 7580 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 7581 bdev_name_del_unsafe(&bdev->internal.bdev_name); 7582 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 7583 7584 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 7585 7586 if (bdev->internal.reset_in_progress != NULL) { 7587 /* If reset is in progress, let the completion callback for reset 7588 * unregister the bdev. 7589 */ 7590 rc = -EBUSY; 7591 } 7592 } 7593 7594 return rc; 7595 } 7596 7597 static void 7598 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7599 struct spdk_io_channel *io_ch, void *_ctx) 7600 { 7601 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 7602 7603 bdev_channel_abort_queued_ios(bdev_ch); 7604 spdk_bdev_for_each_channel_continue(i, 0); 7605 } 7606 7607 static void 7608 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 7609 { 7610 int rc; 7611 7612 spdk_spin_lock(&g_bdev_mgr.spinlock); 7613 spdk_spin_lock(&bdev->internal.spinlock); 7614 /* 7615 * Set the status to REMOVING after completing to abort channels. Otherwise, 7616 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 7617 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 7618 * may fail. 
7619 */ 7620 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 7621 rc = bdev_unregister_unsafe(bdev); 7622 spdk_spin_unlock(&bdev->internal.spinlock); 7623 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7624 7625 if (rc == 0) { 7626 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7627 } 7628 } 7629 7630 void 7631 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7632 { 7633 struct spdk_thread *thread; 7634 7635 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 7636 7637 thread = spdk_get_thread(); 7638 if (!thread) { 7639 /* The user called this from a non-SPDK thread. */ 7640 if (cb_fn != NULL) { 7641 cb_fn(cb_arg, -ENOTSUP); 7642 } 7643 return; 7644 } 7645 7646 spdk_spin_lock(&g_bdev_mgr.spinlock); 7647 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7648 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7649 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7650 if (cb_fn) { 7651 cb_fn(cb_arg, -EBUSY); 7652 } 7653 return; 7654 } 7655 7656 spdk_spin_lock(&bdev->internal.spinlock); 7657 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 7658 bdev->internal.unregister_cb = cb_fn; 7659 bdev->internal.unregister_ctx = cb_arg; 7660 bdev->internal.unregister_td = thread; 7661 spdk_spin_unlock(&bdev->internal.spinlock); 7662 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7663 7664 spdk_bdev_set_qd_sampling_period(bdev, 0); 7665 7666 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 7667 bdev_unregister); 7668 } 7669 7670 int 7671 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 7672 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 7673 { 7674 struct spdk_bdev_desc *desc; 7675 struct spdk_bdev *bdev; 7676 int rc; 7677 7678 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 7679 if (rc != 0) { 7680 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 7681 return rc; 7682 } 7683 7684 bdev = 
spdk_bdev_desc_get_bdev(desc); 7685 7686 if (bdev->module != module) { 7687 spdk_bdev_close(desc); 7688 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 7689 bdev_name); 7690 return -ENODEV; 7691 } 7692 7693 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 7694 7695 spdk_bdev_close(desc); 7696 7697 return 0; 7698 } 7699 7700 static int 7701 bdev_start_qos(struct spdk_bdev *bdev) 7702 { 7703 struct set_qos_limit_ctx *ctx; 7704 7705 /* Enable QoS */ 7706 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 7707 ctx = calloc(1, sizeof(*ctx)); 7708 if (ctx == NULL) { 7709 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 7710 return -ENOMEM; 7711 } 7712 ctx->bdev = bdev; 7713 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 7714 } 7715 7716 return 0; 7717 } 7718 7719 static void 7720 log_already_claimed(enum spdk_log_level level, const int line, const char *func, const char *detail, 7721 struct spdk_bdev *bdev) 7722 { 7723 enum spdk_bdev_claim_type type; 7724 const char *typename, *modname; 7725 extern struct spdk_log_flag SPDK_LOG_bdev; 7726 7727 assert(spdk_spin_held(&bdev->internal.spinlock)); 7728 7729 if (level >= SPDK_LOG_INFO && !SPDK_LOG_bdev.enabled) { 7730 return; 7731 } 7732 7733 type = bdev->internal.claim_type; 7734 typename = spdk_bdev_claim_get_name(type); 7735 7736 if (type == SPDK_BDEV_CLAIM_EXCL_WRITE) { 7737 modname = bdev->internal.claim.v1.module->name; 7738 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7739 bdev->name, detail, typename, modname); 7740 return; 7741 } 7742 7743 if (claim_type_is_v2(type)) { 7744 struct spdk_bdev_module_claim *claim; 7745 7746 TAILQ_FOREACH(claim, &bdev->internal.claim.v2.claims, link) { 7747 modname = claim->module->name; 7748 spdk_log(level, __FILE__, line, func, "bdev %s %s: type %s by module %s\n", 7749 bdev->name, detail, typename, modname); 7750 } 7751 return; 7752 } 7753 7754 assert(false); 7755 } 7756 7757 static 
int 7758 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 7759 { 7760 struct spdk_thread *thread; 7761 int rc = 0; 7762 7763 thread = spdk_get_thread(); 7764 if (!thread) { 7765 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 7766 return -ENOTSUP; 7767 } 7768 7769 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7770 spdk_get_thread()); 7771 7772 desc->bdev = bdev; 7773 desc->thread = thread; 7774 desc->write = write; 7775 7776 spdk_spin_lock(&bdev->internal.spinlock); 7777 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7778 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7779 spdk_spin_unlock(&bdev->internal.spinlock); 7780 return -ENODEV; 7781 } 7782 7783 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7784 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 7785 spdk_spin_unlock(&bdev->internal.spinlock); 7786 return -EPERM; 7787 } 7788 7789 rc = bdev_start_qos(bdev); 7790 if (rc != 0) { 7791 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 7792 spdk_spin_unlock(&bdev->internal.spinlock); 7793 return rc; 7794 } 7795 7796 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 7797 7798 spdk_spin_unlock(&bdev->internal.spinlock); 7799 7800 return 0; 7801 } 7802 7803 static int 7804 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 7805 struct spdk_bdev_desc **_desc) 7806 { 7807 struct spdk_bdev_desc *desc; 7808 unsigned int i; 7809 7810 desc = calloc(1, sizeof(*desc)); 7811 if (desc == NULL) { 7812 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 7813 return -ENOMEM; 7814 } 7815 7816 TAILQ_INIT(&desc->pending_media_events); 7817 TAILQ_INIT(&desc->free_media_events); 7818 7819 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 7820 desc->callback.event_fn = event_cb; 7821 desc->callback.ctx = event_ctx; 7822 spdk_spin_init(&desc->spinlock); 7823 7824 if 
(bdev->media_events) { 7825 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 7826 sizeof(*desc->media_events_buffer)); 7827 if (desc->media_events_buffer == NULL) { 7828 SPDK_ERRLOG("Failed to initialize media event pool\n"); 7829 bdev_desc_free(desc); 7830 return -ENOMEM; 7831 } 7832 7833 for (i = 0; i < MEDIA_EVENT_POOL_SIZE; ++i) { 7834 TAILQ_INSERT_TAIL(&desc->free_media_events, 7835 &desc->media_events_buffer[i], tailq); 7836 } 7837 } 7838 7839 if (bdev->fn_table->accel_sequence_supported != NULL) { 7840 for (i = 0; i < SPDK_BDEV_NUM_IO_TYPES; ++i) { 7841 desc->accel_sequence_supported[i] = 7842 bdev->fn_table->accel_sequence_supported(bdev->ctxt, 7843 (enum spdk_bdev_io_type)i); 7844 } 7845 } 7846 7847 *_desc = desc; 7848 7849 return 0; 7850 } 7851 7852 int 7853 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7854 void *event_ctx, struct spdk_bdev_desc **_desc) 7855 { 7856 struct spdk_bdev_desc *desc; 7857 struct spdk_bdev *bdev; 7858 int rc; 7859 7860 if (event_cb == NULL) { 7861 SPDK_ERRLOG("Missing event callback function\n"); 7862 return -EINVAL; 7863 } 7864 7865 spdk_spin_lock(&g_bdev_mgr.spinlock); 7866 7867 bdev = bdev_get_by_name(bdev_name); 7868 7869 if (bdev == NULL) { 7870 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 7871 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7872 return -ENODEV; 7873 } 7874 7875 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 7876 if (rc != 0) { 7877 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7878 return rc; 7879 } 7880 7881 rc = bdev_open(bdev, write, desc); 7882 if (rc != 0) { 7883 bdev_desc_free(desc); 7884 desc = NULL; 7885 } 7886 7887 *_desc = desc; 7888 7889 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7890 7891 return rc; 7892 } 7893 7894 static void 7895 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 7896 { 7897 int rc; 7898 7899 spdk_spin_lock(&bdev->internal.spinlock); 7900 spdk_spin_lock(&desc->spinlock); 7901 7902 
TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 7903 7904 desc->closed = true; 7905 7906 if (desc->claim != NULL) { 7907 bdev_desc_release_claims(desc); 7908 } 7909 7910 if (0 == desc->refs) { 7911 spdk_spin_unlock(&desc->spinlock); 7912 bdev_desc_free(desc); 7913 } else { 7914 spdk_spin_unlock(&desc->spinlock); 7915 } 7916 7917 /* If no more descriptors, kill QoS channel */ 7918 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7919 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 7920 bdev->name, spdk_get_thread()); 7921 7922 if (bdev_qos_destroy(bdev)) { 7923 /* There isn't anything we can do to recover here. Just let the 7924 * old QoS poller keep running. The QoS handling won't change 7925 * cores when the user allocates a new channel, but it won't break. */ 7926 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 7927 } 7928 } 7929 7930 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7931 rc = bdev_unregister_unsafe(bdev); 7932 spdk_spin_unlock(&bdev->internal.spinlock); 7933 7934 if (rc == 0) { 7935 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7936 } 7937 } else { 7938 spdk_spin_unlock(&bdev->internal.spinlock); 7939 } 7940 } 7941 7942 void 7943 spdk_bdev_close(struct spdk_bdev_desc *desc) 7944 { 7945 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7946 7947 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7948 spdk_get_thread()); 7949 7950 assert(desc->thread == spdk_get_thread()); 7951 7952 spdk_poller_unregister(&desc->io_timeout_poller); 7953 7954 spdk_spin_lock(&g_bdev_mgr.spinlock); 7955 7956 bdev_close(bdev, desc); 7957 7958 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7959 } 7960 7961 static void 7962 bdev_register_finished(void *arg) 7963 { 7964 struct spdk_bdev_desc *desc = arg; 7965 struct spdk_bdev *bdev = 
spdk_bdev_desc_get_bdev(desc); 7966 7967 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 7968 7969 spdk_spin_lock(&g_bdev_mgr.spinlock); 7970 7971 bdev_close(bdev, desc); 7972 7973 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7974 } 7975 7976 int 7977 spdk_bdev_register(struct spdk_bdev *bdev) 7978 { 7979 struct spdk_bdev_desc *desc; 7980 struct spdk_thread *thread = spdk_get_thread(); 7981 int rc; 7982 7983 if (spdk_unlikely(spdk_thread_get_app_thread() != spdk_get_thread())) { 7984 SPDK_ERRLOG("Cannot examine bdev %s on thread %p (%s)\n", bdev->name, thread, 7985 thread ? spdk_thread_get_name(thread) : "null"); 7986 return -EINVAL; 7987 } 7988 7989 rc = bdev_register(bdev); 7990 if (rc != 0) { 7991 return rc; 7992 } 7993 7994 /* A descriptor is opened to prevent bdev deletion during examination */ 7995 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7996 if (rc != 0) { 7997 spdk_bdev_unregister(bdev, NULL, NULL); 7998 return rc; 7999 } 8000 8001 rc = bdev_open(bdev, false, desc); 8002 if (rc != 0) { 8003 bdev_desc_free(desc); 8004 spdk_bdev_unregister(bdev, NULL, NULL); 8005 return rc; 8006 } 8007 8008 /* Examine configuration before initializing I/O */ 8009 bdev_examine(bdev); 8010 8011 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 8012 if (rc != 0) { 8013 bdev_close(bdev, desc); 8014 spdk_bdev_unregister(bdev, NULL, NULL); 8015 } 8016 8017 return rc; 8018 } 8019 8020 int 8021 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 8022 struct spdk_bdev_module *module) 8023 { 8024 spdk_spin_lock(&bdev->internal.spinlock); 8025 8026 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8027 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8028 spdk_spin_unlock(&bdev->internal.spinlock); 8029 return -EPERM; 8030 } 8031 8032 if (desc && !desc->write) { 8033 desc->write = true; 8034 } 8035 8036 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 8037 bdev->internal.claim.v1.module = module; 
8038 8039 spdk_spin_unlock(&bdev->internal.spinlock); 8040 return 0; 8041 } 8042 8043 void 8044 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 8045 { 8046 spdk_spin_lock(&bdev->internal.spinlock); 8047 8048 assert(bdev->internal.claim.v1.module != NULL); 8049 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 8050 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 8051 bdev->internal.claim.v1.module = NULL; 8052 8053 spdk_spin_unlock(&bdev->internal.spinlock); 8054 } 8055 8056 /* 8057 * Start claims v2 8058 */ 8059 8060 const char * 8061 spdk_bdev_claim_get_name(enum spdk_bdev_claim_type type) 8062 { 8063 switch (type) { 8064 case SPDK_BDEV_CLAIM_NONE: 8065 return "not_claimed"; 8066 case SPDK_BDEV_CLAIM_EXCL_WRITE: 8067 return "exclusive_write"; 8068 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8069 return "read_many_write_one"; 8070 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8071 return "read_many_write_none"; 8072 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8073 return "read_many_write_many"; 8074 default: 8075 break; 8076 } 8077 return "invalid_claim"; 8078 } 8079 8080 static bool 8081 claim_type_is_v2(enum spdk_bdev_claim_type type) 8082 { 8083 switch (type) { 8084 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8085 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE: 8086 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8087 return true; 8088 default: 8089 break; 8090 } 8091 return false; 8092 } 8093 8094 /* Returns true if taking a claim with desc->write == false should make the descriptor writable. 
*/ 8095 static bool 8096 claim_type_promotes_to_write(enum spdk_bdev_claim_type type) 8097 { 8098 switch (type) { 8099 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE: 8100 case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED: 8101 return true; 8102 default: 8103 break; 8104 } 8105 return false; 8106 } 8107 8108 void 8109 spdk_bdev_claim_opts_init(struct spdk_bdev_claim_opts *opts, size_t size) 8110 { 8111 if (opts == NULL) { 8112 SPDK_ERRLOG("opts should not be NULL\n"); 8113 assert(opts != NULL); 8114 return; 8115 } 8116 if (size == 0) { 8117 SPDK_ERRLOG("size should not be zero\n"); 8118 assert(size != 0); 8119 return; 8120 } 8121 8122 memset(opts, 0, size); 8123 opts->opts_size = size; 8124 8125 #define FIELD_OK(field) \ 8126 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(opts->field) <= size 8127 8128 #define SET_FIELD(field, value) \ 8129 if (FIELD_OK(field)) { \ 8130 opts->field = value; \ 8131 } \ 8132 8133 SET_FIELD(shared_claim_key, 0); 8134 8135 #undef FIELD_OK 8136 #undef SET_FIELD 8137 } 8138 8139 static int 8140 claim_opts_copy(struct spdk_bdev_claim_opts *src, struct spdk_bdev_claim_opts *dst) 8141 { 8142 if (src->opts_size == 0) { 8143 SPDK_ERRLOG("size should not be zero\n"); 8144 return -1; 8145 } 8146 8147 memset(dst, 0, sizeof(*dst)); 8148 dst->opts_size = src->opts_size; 8149 8150 #define FIELD_OK(field) \ 8151 offsetof(struct spdk_bdev_claim_opts, field) + sizeof(src->field) <= src->opts_size 8152 8153 #define SET_FIELD(field) \ 8154 if (FIELD_OK(field)) { \ 8155 dst->field = src->field; \ 8156 } \ 8157 8158 if (FIELD_OK(name)) { 8159 snprintf(dst->name, sizeof(dst->name), "%s", src->name); 8160 } 8161 8162 SET_FIELD(shared_claim_key); 8163 8164 /* You should not remove this statement, but need to update the assert statement 8165 * if you add a new field, and also add a corresponding SET_FIELD statement */ 8166 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_claim_opts) == 48, "Incorrect size"); 8167 8168 #undef FIELD_OK 8169 #undef SET_FIELD 8170 return 0; 
8171 } 8172 8173 /* Returns 0 if a read-write-once claim can be taken. */ 8174 static int 8175 claim_verify_rwo(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8176 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8177 { 8178 struct spdk_bdev *bdev = desc->bdev; 8179 struct spdk_bdev_desc *open_desc; 8180 8181 assert(spdk_spin_held(&bdev->internal.spinlock)); 8182 assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE); 8183 8184 if (opts->shared_claim_key != 0) { 8185 SPDK_ERRLOG("%s: key option not supported with read-write-once claims\n", 8186 bdev->name); 8187 return -EINVAL; 8188 } 8189 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 8190 LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev); 8191 return -EPERM; 8192 } 8193 if (desc->claim != NULL) { 8194 SPDK_NOTICELOG("%s: descriptor already claimed bdev with module %s\n", 8195 bdev->name, desc->claim->module->name); 8196 return -EPERM; 8197 } 8198 TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) { 8199 if (desc != open_desc && open_desc->write) { 8200 SPDK_NOTICELOG("%s: Cannot obtain read-write-once claim while " 8201 "another descriptor is open for writing\n", 8202 bdev->name); 8203 return -EPERM; 8204 } 8205 } 8206 8207 return 0; 8208 } 8209 8210 /* Returns 0 if a read-only-many claim can be taken. 
*/
/*
 * Must be called with bdev->internal.spinlock held, for a descriptor that does
 * not hold a claim yet (both asserted). Checks, in order:
 *   - desc must not be writable                          (-EINVAL)
 *   - opts->shared_claim_key must be zero                (-EINVAL)
 *   - if the bdev is currently unclaimed, no open descriptor may be
 *     writable                                           (-EPERM)
 */
static int
claim_verify_rom(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_desc *open_desc;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE);
	assert(desc->claim == NULL);

	if (desc->write) {
		SPDK_ERRLOG("%s: Cannot obtain read-only-many claim with writable descriptor\n",
			    bdev->name);
		return -EINVAL;
	}

	if (opts->shared_claim_key != 0) {
		SPDK_ERRLOG("%s: key option not supported with read-only-many claims\n", bdev->name);
		return -EINVAL;
	}

	/* The writer scan is only performed while the bdev is still unclaimed. */
	if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) {
		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
			if (open_desc->write) {
				SPDK_NOTICELOG("%s: Cannot obtain read-only-many claim while "
					       "another descriptor is open for writing\n",
					       bdev->name);
				return -EPERM;
			}
		}
	}

	return 0;
}

/* Returns 0 if a read-write-many claim can be taken.
*/
/*
 * Must be called with bdev->internal.spinlock held, for a descriptor that does
 * not hold a claim yet (both asserted). Outcomes:
 *   - opts->shared_claim_key is zero                               -> -EINVAL
 *   - bdev unclaimed, another descriptor is open for writing       -> -EPERM
 *   - bdev has a read-write-many claim with a different key        -> -EPERM
 *   - bdev has any other kind of claim                             -> -EBUSY
 */
static int
claim_verify_rwm(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type,
		 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module)
{
	struct spdk_bdev *bdev = desc->bdev;
	struct spdk_bdev_desc *open_desc;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(type == SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED);
	assert(desc->claim == NULL);

	if (opts->shared_claim_key == 0) {
		SPDK_ERRLOG("%s: shared_claim_key option required with read-write-many claims\n",
			    bdev->name);
		return -EINVAL;
	}

	switch (bdev->internal.claim_type) {
	case SPDK_BDEV_CLAIM_NONE:
		/* First claimant: no other descriptor may already be writable. */
		TAILQ_FOREACH(open_desc, &bdev->internal.open_descs, link) {
			if (open_desc == desc) {
				continue;
			}
			if (open_desc->write) {
				SPDK_NOTICELOG("%s: Cannot obtain read-write-many claim while "
					       "another descriptor is open for writing without a "
					       "claim\n", bdev->name);
				return -EPERM;
			}
		}
		break;
	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
		/* Joining an existing shared claim requires the same key. */
		if (opts->shared_claim_key != bdev->internal.claim.v2.key) {
			LOG_ALREADY_CLAIMED_ERROR("already claimed with another key", bdev);
			return -EPERM;
		}
		break;
	default:
		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
		return -EBUSY;
	}

	return 0;
}

/* Updates desc and its bdev with a v2 claim.
*/ 8291 static int 8292 claim_bdev(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8293 struct spdk_bdev_claim_opts *opts, struct spdk_bdev_module *module) 8294 { 8295 struct spdk_bdev *bdev = desc->bdev; 8296 struct spdk_bdev_module_claim *claim; 8297 8298 assert(spdk_spin_held(&bdev->internal.spinlock)); 8299 assert(claim_type_is_v2(type)); 8300 assert(desc->claim == NULL); 8301 8302 claim = calloc(1, sizeof(*desc->claim)); 8303 if (claim == NULL) { 8304 SPDK_ERRLOG("%s: out of memory while allocating claim\n", bdev->name); 8305 return -ENOMEM; 8306 } 8307 claim->module = module; 8308 claim->desc = desc; 8309 SPDK_STATIC_ASSERT(sizeof(claim->name) == sizeof(opts->name), "sizes must match"); 8310 memcpy(claim->name, opts->name, sizeof(claim->name)); 8311 desc->claim = claim; 8312 8313 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 8314 bdev->internal.claim_type = type; 8315 TAILQ_INIT(&bdev->internal.claim.v2.claims); 8316 bdev->internal.claim.v2.key = opts->shared_claim_key; 8317 } 8318 assert(type == bdev->internal.claim_type); 8319 8320 TAILQ_INSERT_TAIL(&bdev->internal.claim.v2.claims, claim, link); 8321 8322 if (!desc->write && claim_type_promotes_to_write(type)) { 8323 desc->write = true; 8324 } 8325 8326 return 0; 8327 } 8328 8329 int 8330 spdk_bdev_module_claim_bdev_desc(struct spdk_bdev_desc *desc, enum spdk_bdev_claim_type type, 8331 struct spdk_bdev_claim_opts *_opts, 8332 struct spdk_bdev_module *module) 8333 { 8334 struct spdk_bdev *bdev; 8335 struct spdk_bdev_claim_opts opts; 8336 int rc = 0; 8337 8338 if (desc == NULL) { 8339 SPDK_ERRLOG("descriptor must not be NULL\n"); 8340 return -EINVAL; 8341 } 8342 8343 bdev = desc->bdev; 8344 8345 if (_opts == NULL) { 8346 spdk_bdev_claim_opts_init(&opts, sizeof(opts)); 8347 } else if (claim_opts_copy(_opts, &opts) != 0) { 8348 return -EINVAL; 8349 } 8350 8351 spdk_spin_lock(&bdev->internal.spinlock); 8352 8353 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE && 8354 
bdev->internal.claim_type != type) {
		/* A bdev that is already claimed only accepts further claims of the same type. */
		LOG_ALREADY_CLAIMED_ERROR("already claimed", bdev);
		spdk_spin_unlock(&bdev->internal.spinlock);
		return -EPERM;
	}

	/* A descriptor can hold at most one v2 claim. */
	if (claim_type_is_v2(type) && desc->claim != NULL) {
		SPDK_ERRLOG("%s: descriptor already has %s claim with name '%s'\n",
			    bdev->name, spdk_bdev_claim_get_name(type), desc->claim->name);
		spdk_spin_unlock(&bdev->internal.spinlock);
		return -EPERM;
	}

	switch (type) {
	case SPDK_BDEV_CLAIM_EXCL_WRITE:
		/* Exclusive-write claims are delegated to spdk_bdev_module_claim_bdev(), which is
		 * called without the bdev spinlock held.
		 */
		spdk_spin_unlock(&bdev->internal.spinlock);
		return spdk_bdev_module_claim_bdev(bdev, desc, module);
	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_ONE:
		rc = claim_verify_rwo(desc, type, &opts, module);
		break;
	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_NONE:
		rc = claim_verify_rom(desc, type, &opts, module);
		break;
	case SPDK_BDEV_CLAIM_READ_MANY_WRITE_SHARED:
		rc = claim_verify_rwm(desc, type, &opts, module);
		break;
	default:
		SPDK_ERRLOG("%s: claim type %d not supported\n", bdev->name, type);
		rc = -ENOTSUP;
	}

	/* Only record the claim once the type-specific verification has passed. */
	if (rc == 0) {
		rc = claim_bdev(desc, type, &opts, module);
	}

	spdk_spin_unlock(&bdev->internal.spinlock);
	return rc;
}

/*
 * Return bdev to the unclaimed state.
 *
 * The caller must hold bdev->internal.spinlock; the bdev must currently have a v2 claim type
 * and an empty v2 claims list (all asserted).
 */
static void
claim_reset(struct spdk_bdev *bdev)
{
	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(claim_type_is_v2(bdev->internal.claim_type));
	assert(TAILQ_EMPTY(&bdev->internal.claim.v2.claims));

	memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim));
	bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE;
}

/*
 * Drop the v2 claim held by desc.
 *
 * The caller must hold bdev->internal.spinlock and the bdev must have a v2 claim type (both
 * asserted).
 */
static void
bdev_desc_release_claims(struct spdk_bdev_desc *desc)
{
	struct spdk_bdev *bdev = desc->bdev;

	assert(spdk_spin_held(&bdev->internal.spinlock));
	assert(claim_type_is_v2(bdev->internal.claim_type));

	if (bdev->internal.examine_in_progress == 0) {
		/* No examine is running, so the claim can be unlinked right away. */
		TAILQ_REMOVE(&bdev->internal.claim.v2.claims, desc->claim, link);
free(desc->claim);
		/* Releasing the last claim returns the bdev to the unclaimed state. */
		if (TAILQ_EMPTY(&bdev->internal.claim.v2.claims)) {
			claim_reset(bdev);
		}
	} else {
		/* This is a dead claim that will be cleaned up when bdev_examine() is done. */
		desc->claim->module = NULL;
		desc->claim->desc = NULL;
	}
	desc->claim = NULL;
}

/*
 * End claims v2
 */

/* Return the bdev that desc was opened on. desc must not be NULL (asserted). */
struct spdk_bdev *
spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc)
{
	assert(desc != NULL);
	return desc->bdev;
}

/*
 * Call fn(ctx, bdev) for every bdev reached through spdk_bdev_first()/spdk_bdev_next().
 *
 * Each bdev is opened read-only on a temporary descriptor for the duration of its callback, and
 * g_bdev_mgr.spinlock is released while fn runs. Iteration stops at the first non-zero value
 * returned by fn or by descriptor allocation/open, except that a bdev whose open fails with
 * -ENODEV is skipped.
 *
 * \return 0 if every callback returned 0, otherwise the first non-zero status.
 */
int
spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn)
{
	struct spdk_bdev *bdev, *tmp;
	struct spdk_bdev_desc *desc;
	int rc = 0;

	assert(fn != NULL);

	spdk_spin_lock(&g_bdev_mgr.spinlock);
	bdev = spdk_bdev_first();
	while (bdev != NULL) {
		rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc);
		if (rc != 0) {
			break;
		}
		rc = bdev_open(bdev, false, desc);
		if (rc != 0) {
			bdev_desc_free(desc);
			if (rc == -ENODEV) {
				/* Ignore the error and move to the next bdev.
*/
				rc = 0;
				bdev = spdk_bdev_next(bdev);
				continue;
			}
			break;
		}
		/* Run the callback without the manager lock; the open descriptor stays held. */
		spdk_spin_unlock(&g_bdev_mgr.spinlock);

		rc = fn(ctx, bdev);

		spdk_spin_lock(&g_bdev_mgr.spinlock);
		/* Look up the successor before closing the descriptor on the current bdev. */
		tmp = spdk_bdev_next(bdev);
		bdev_close(bdev, desc);
		if (rc != 0) {
			break;
		}
		bdev = tmp;
	}
	spdk_spin_unlock(&g_bdev_mgr.spinlock);

	return rc;
}

/*
 * Call fn(ctx, bdev) for every bdev reached through spdk_bdev_first_leaf()/
 * spdk_bdev_next_leaf().
 *
 * Each bdev is opened read-only on a temporary descriptor for the duration of its callback, and
 * g_bdev_mgr.spinlock is released while fn runs. Iteration stops at the first non-zero value
 * returned by fn or by descriptor allocation/open, except that a bdev whose open fails with
 * -ENODEV is skipped.
 *
 * \return 0 if every callback returned 0, otherwise the first non-zero status.
 */
int
spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn)
{
	struct spdk_bdev *bdev, *tmp;
	struct spdk_bdev_desc *desc;
	int rc = 0;

	assert(fn != NULL);

	spdk_spin_lock(&g_bdev_mgr.spinlock);
	bdev = spdk_bdev_first_leaf();
	while (bdev != NULL) {
		rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc);
		if (rc != 0) {
			break;
		}
		rc = bdev_open(bdev, false, desc);
		if (rc != 0) {
			bdev_desc_free(desc);
			if (rc == -ENODEV) {
				/* Ignore the error and move to the next bdev. */
				rc = 0;
				bdev = spdk_bdev_next_leaf(bdev);
				continue;
			}
			break;
		}
		/* Run the callback without the manager lock; the open descriptor stays held. */
		spdk_spin_unlock(&g_bdev_mgr.spinlock);

		rc = fn(ctx, bdev);

		spdk_spin_lock(&g_bdev_mgr.spinlock);
		/* Look up the successor before closing the descriptor on the current bdev. */
		tmp = spdk_bdev_next_leaf(bdev);
		bdev_close(bdev, desc);
		if (rc != 0) {
			break;
		}
		bdev = tmp;
	}
	spdk_spin_unlock(&g_bdev_mgr.spinlock);

	return rc;
}

/*
 * Report the iovec array and element count of bdev_io through iovp and iovcntp.
 *
 * Only READ, WRITE and ZCOPY I/O report their iovecs; every other type reports NULL and 0.
 * Either output pointer may be NULL. Nothing is written when bdev_io is NULL.
 */
void
spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp)
{
	struct iovec *iovs;
	int iovcnt;

	if (bdev_io == NULL) {
		return;
	}

	switch (bdev_io->type) {
	case SPDK_BDEV_IO_TYPE_READ:
	case SPDK_BDEV_IO_TYPE_WRITE:
	case SPDK_BDEV_IO_TYPE_ZCOPY:
		iovs = bdev_io->u.bdev.iovs;
		iovcnt = bdev_io->u.bdev.iovcnt;
		break;
	default:
		iovs = NULL;
		iovcnt = 0;
		break;
	}

	if (iovp) {
		*iovp = iovs;
	}
	if (iovcntp) {
*iovcntp = iovcnt; 8553 } 8554 } 8555 8556 void * 8557 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 8558 { 8559 if (bdev_io == NULL) { 8560 return NULL; 8561 } 8562 8563 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 8564 return NULL; 8565 } 8566 8567 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 8568 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 8569 return bdev_io->u.bdev.md_buf; 8570 } 8571 8572 return NULL; 8573 } 8574 8575 void * 8576 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 8577 { 8578 if (bdev_io == NULL) { 8579 assert(false); 8580 return NULL; 8581 } 8582 8583 return bdev_io->internal.caller_ctx; 8584 } 8585 8586 void 8587 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 8588 { 8589 8590 if (spdk_bdev_module_list_find(bdev_module->name)) { 8591 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 8592 assert(false); 8593 } 8594 8595 spdk_spin_init(&bdev_module->internal.spinlock); 8596 TAILQ_INIT(&bdev_module->internal.quiesced_ranges); 8597 8598 /* 8599 * Modules with examine callbacks must be initialized first, so they are 8600 * ready to handle examine callbacks from later modules that will 8601 * register physical bdevs. 
8602 */ 8603 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 8604 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8605 } else { 8606 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 8607 } 8608 } 8609 8610 struct spdk_bdev_module * 8611 spdk_bdev_module_list_find(const char *name) 8612 { 8613 struct spdk_bdev_module *bdev_module; 8614 8615 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 8616 if (strcmp(name, bdev_module->name) == 0) { 8617 break; 8618 } 8619 } 8620 8621 return bdev_module; 8622 } 8623 8624 static int 8625 bdev_write_zero_buffer(struct spdk_bdev_io *bdev_io) 8626 { 8627 uint64_t num_blocks; 8628 void *md_buf = NULL; 8629 8630 num_blocks = bdev_io->u.bdev.num_blocks; 8631 8632 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 8633 md_buf = (char *)g_bdev_mgr.zero_buffer + 8634 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 8635 } 8636 8637 return bdev_write_blocks_with_md(bdev_io->internal.desc, 8638 spdk_io_channel_from_ctx(bdev_io->internal.ch), 8639 g_bdev_mgr.zero_buffer, md_buf, 8640 bdev_io->u.bdev.offset_blocks, num_blocks, 8641 bdev_write_zero_buffer_done, bdev_io); 8642 } 8643 8644 static void 8645 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 8646 { 8647 struct spdk_bdev_io *parent_io = cb_arg; 8648 8649 spdk_bdev_free_io(bdev_io); 8650 8651 parent_io->internal.status = success ? 
SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED;
	parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx);
}

/*
 * Finish a QoS modification: clear the bdev's qos_mod_in_progress flag under the bdev
 * spinlock, report status through the optional user callback and free ctx.
 */
static void
bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status)
{
	spdk_spin_lock(&ctx->bdev->internal.spinlock);
	ctx->bdev->internal.qos_mod_in_progress = false;
	spdk_spin_unlock(&ctx->bdev->internal.spinlock);

	/* The completion callback is optional. */
	if (ctx->cb_fn) {
		ctx->cb_fn(ctx->cb_arg, status);
	}
	free(ctx);
}

/*
 * Tear down the bdev's QoS object: detach it from the bdev, send every queued I/O back to its
 * own thread for resubmission, release the QoS channel and poller, and complete the request.
 */
static void
bdev_disable_qos_done(void *cb_arg)
{
	struct set_qos_limit_ctx *ctx = cb_arg;
	struct spdk_bdev *bdev = ctx->bdev;
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_qos *qos;

	/* Detach the QoS object under the lock; from here on it is only reachable locally. */
	spdk_spin_lock(&bdev->internal.spinlock);
	qos = bdev->internal.qos;
	bdev->internal.qos = NULL;
	spdk_spin_unlock(&bdev->internal.spinlock);

	while (!TAILQ_EMPTY(&qos->queued)) {
		/* Send queued I/O back to their original thread for resubmission. */
		bdev_io = TAILQ_FIRST(&qos->queued);
		TAILQ_REMOVE(&qos->queued, bdev_io, internal.link);

		if (bdev_io->internal.io_submit_ch) {
			/*
			 * Channel was changed when sending it to the QoS thread - change it back
			 * before sending it back to the original thread.
8690 */ 8691 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 8692 bdev_io->internal.io_submit_ch = NULL; 8693 } 8694 8695 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 8696 _bdev_io_submit, bdev_io); 8697 } 8698 8699 if (qos->thread != NULL) { 8700 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 8701 spdk_poller_unregister(&qos->poller); 8702 } 8703 8704 free(qos); 8705 8706 bdev_set_qos_limit_done(ctx, 0); 8707 } 8708 8709 static void 8710 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 8711 { 8712 struct set_qos_limit_ctx *ctx = _ctx; 8713 struct spdk_thread *thread; 8714 8715 spdk_spin_lock(&bdev->internal.spinlock); 8716 thread = bdev->internal.qos->thread; 8717 spdk_spin_unlock(&bdev->internal.spinlock); 8718 8719 if (thread != NULL) { 8720 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 8721 } else { 8722 bdev_disable_qos_done(ctx); 8723 } 8724 } 8725 8726 static void 8727 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8728 struct spdk_io_channel *ch, void *_ctx) 8729 { 8730 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8731 8732 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 8733 8734 spdk_bdev_for_each_channel_continue(i, 0); 8735 } 8736 8737 static void 8738 bdev_update_qos_rate_limit_msg(void *cb_arg) 8739 { 8740 struct set_qos_limit_ctx *ctx = cb_arg; 8741 struct spdk_bdev *bdev = ctx->bdev; 8742 8743 spdk_spin_lock(&bdev->internal.spinlock); 8744 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 8745 spdk_spin_unlock(&bdev->internal.spinlock); 8746 8747 bdev_set_qos_limit_done(ctx, 0); 8748 } 8749 8750 static void 8751 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8752 struct spdk_io_channel *ch, void *_ctx) 8753 { 8754 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 8755 8756 spdk_spin_lock(&bdev->internal.spinlock); 8757 bdev_enable_qos(bdev, bdev_ch); 8758 spdk_spin_unlock(&bdev->internal.spinlock); 
8759 spdk_bdev_for_each_channel_continue(i, 0); 8760 } 8761 8762 static void 8763 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 8764 { 8765 struct set_qos_limit_ctx *ctx = _ctx; 8766 8767 bdev_set_qos_limit_done(ctx, status); 8768 } 8769 8770 static void 8771 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 8772 { 8773 int i; 8774 8775 assert(bdev->internal.qos != NULL); 8776 8777 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8778 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 8779 bdev->internal.qos->rate_limits[i].limit = limits[i]; 8780 8781 if (limits[i] == 0) { 8782 bdev->internal.qos->rate_limits[i].limit = 8783 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 8784 } 8785 } 8786 } 8787 } 8788 8789 void 8790 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 8791 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 8792 { 8793 struct set_qos_limit_ctx *ctx; 8794 uint32_t limit_set_complement; 8795 uint64_t min_limit_per_sec; 8796 int i; 8797 bool disable_rate_limit = true; 8798 8799 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8800 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 8801 continue; 8802 } 8803 8804 if (limits[i] > 0) { 8805 disable_rate_limit = false; 8806 } 8807 8808 if (bdev_qos_is_iops_rate_limit(i) == true) { 8809 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 8810 } else { 8811 /* Change from megabyte to byte rate limit */ 8812 limits[i] = limits[i] * 1024 * 1024; 8813 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 8814 } 8815 8816 limit_set_complement = limits[i] % min_limit_per_sec; 8817 if (limit_set_complement) { 8818 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 8819 limits[i], min_limit_per_sec); 8820 limits[i] += min_limit_per_sec - limit_set_complement; 8821 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 8822 } 8823 } 8824 8825 ctx = calloc(1, sizeof(*ctx)); 8826 if (ctx == NULL) { 8827 
cb_fn(cb_arg, -ENOMEM); 8828 return; 8829 } 8830 8831 ctx->cb_fn = cb_fn; 8832 ctx->cb_arg = cb_arg; 8833 ctx->bdev = bdev; 8834 8835 spdk_spin_lock(&bdev->internal.spinlock); 8836 if (bdev->internal.qos_mod_in_progress) { 8837 spdk_spin_unlock(&bdev->internal.spinlock); 8838 free(ctx); 8839 cb_fn(cb_arg, -EAGAIN); 8840 return; 8841 } 8842 bdev->internal.qos_mod_in_progress = true; 8843 8844 if (disable_rate_limit == true && bdev->internal.qos) { 8845 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 8846 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 8847 (bdev->internal.qos->rate_limits[i].limit > 0 && 8848 bdev->internal.qos->rate_limits[i].limit != 8849 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 8850 disable_rate_limit = false; 8851 break; 8852 } 8853 } 8854 } 8855 8856 if (disable_rate_limit == false) { 8857 if (bdev->internal.qos == NULL) { 8858 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 8859 if (!bdev->internal.qos) { 8860 spdk_spin_unlock(&bdev->internal.spinlock); 8861 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 8862 bdev_set_qos_limit_done(ctx, -ENOMEM); 8863 return; 8864 } 8865 } 8866 8867 if (bdev->internal.qos->thread == NULL) { 8868 /* Enabling */ 8869 bdev_set_qos_rate_limits(bdev, limits); 8870 8871 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 8872 bdev_enable_qos_done); 8873 } else { 8874 /* Updating */ 8875 bdev_set_qos_rate_limits(bdev, limits); 8876 8877 spdk_thread_send_msg(bdev->internal.qos->thread, 8878 bdev_update_qos_rate_limit_msg, ctx); 8879 } 8880 } else { 8881 if (bdev->internal.qos != NULL) { 8882 bdev_set_qos_rate_limits(bdev, limits); 8883 8884 /* Disabling */ 8885 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 8886 bdev_disable_qos_msg_done); 8887 } else { 8888 spdk_spin_unlock(&bdev->internal.spinlock); 8889 bdev_set_qos_limit_done(ctx, 0); 8890 return; 8891 } 8892 } 8893 8894 spdk_spin_unlock(&bdev->internal.spinlock); 8895 } 8896 8897 struct 
spdk_bdev_histogram_ctx { 8898 spdk_bdev_histogram_status_cb cb_fn; 8899 void *cb_arg; 8900 struct spdk_bdev *bdev; 8901 int status; 8902 }; 8903 8904 static void 8905 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8906 { 8907 struct spdk_bdev_histogram_ctx *ctx = _ctx; 8908 8909 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8910 ctx->bdev->internal.histogram_in_progress = false; 8911 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8912 ctx->cb_fn(ctx->cb_arg, ctx->status); 8913 free(ctx); 8914 } 8915 8916 static void 8917 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8918 struct spdk_io_channel *_ch, void *_ctx) 8919 { 8920 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8921 8922 if (ch->histogram != NULL) { 8923 spdk_histogram_data_free(ch->histogram); 8924 ch->histogram = NULL; 8925 } 8926 spdk_bdev_for_each_channel_continue(i, 0); 8927 } 8928 8929 static void 8930 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8931 { 8932 struct spdk_bdev_histogram_ctx *ctx = _ctx; 8933 8934 if (status != 0) { 8935 ctx->status = status; 8936 ctx->bdev->internal.histogram_enabled = false; 8937 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 8938 bdev_histogram_disable_channel_cb); 8939 } else { 8940 spdk_spin_lock(&ctx->bdev->internal.spinlock); 8941 ctx->bdev->internal.histogram_in_progress = false; 8942 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 8943 ctx->cb_fn(ctx->cb_arg, ctx->status); 8944 free(ctx); 8945 } 8946 } 8947 8948 static void 8949 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8950 struct spdk_io_channel *_ch, void *_ctx) 8951 { 8952 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8953 int status = 0; 8954 8955 if (ch->histogram == NULL) { 8956 ch->histogram = spdk_histogram_data_alloc(); 8957 if (ch->histogram == NULL) { 8958 status = -ENOMEM; 8959 } 8960 } 8961 
8962 spdk_bdev_for_each_channel_continue(i, status); 8963 } 8964 8965 void 8966 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 8967 void *cb_arg, bool enable) 8968 { 8969 struct spdk_bdev_histogram_ctx *ctx; 8970 8971 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 8972 if (ctx == NULL) { 8973 cb_fn(cb_arg, -ENOMEM); 8974 return; 8975 } 8976 8977 ctx->bdev = bdev; 8978 ctx->status = 0; 8979 ctx->cb_fn = cb_fn; 8980 ctx->cb_arg = cb_arg; 8981 8982 spdk_spin_lock(&bdev->internal.spinlock); 8983 if (bdev->internal.histogram_in_progress) { 8984 spdk_spin_unlock(&bdev->internal.spinlock); 8985 free(ctx); 8986 cb_fn(cb_arg, -EAGAIN); 8987 return; 8988 } 8989 8990 bdev->internal.histogram_in_progress = true; 8991 spdk_spin_unlock(&bdev->internal.spinlock); 8992 8993 bdev->internal.histogram_enabled = enable; 8994 8995 if (enable) { 8996 /* Allocate histogram for each channel */ 8997 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 8998 bdev_histogram_enable_channel_cb); 8999 } else { 9000 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 9001 bdev_histogram_disable_channel_cb); 9002 } 9003 } 9004 9005 struct spdk_bdev_histogram_data_ctx { 9006 spdk_bdev_histogram_data_cb cb_fn; 9007 void *cb_arg; 9008 struct spdk_bdev *bdev; 9009 /** merged histogram data from all channels */ 9010 struct spdk_histogram_data *histogram; 9011 }; 9012 9013 static void 9014 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9015 { 9016 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9017 9018 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 9019 free(ctx); 9020 } 9021 9022 static void 9023 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9024 struct spdk_io_channel *_ch, void *_ctx) 9025 { 9026 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9027 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 9028 int status = 0; 9029 9030 if 
(ch->histogram == NULL) { 9031 status = -EFAULT; 9032 } else { 9033 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 9034 } 9035 9036 spdk_bdev_for_each_channel_continue(i, status); 9037 } 9038 9039 void 9040 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 9041 spdk_bdev_histogram_data_cb cb_fn, 9042 void *cb_arg) 9043 { 9044 struct spdk_bdev_histogram_data_ctx *ctx; 9045 9046 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 9047 if (ctx == NULL) { 9048 cb_fn(cb_arg, -ENOMEM, NULL); 9049 return; 9050 } 9051 9052 ctx->bdev = bdev; 9053 ctx->cb_fn = cb_fn; 9054 ctx->cb_arg = cb_arg; 9055 9056 ctx->histogram = histogram; 9057 9058 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 9059 bdev_histogram_get_channel_cb); 9060 } 9061 9062 void 9063 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 9064 void *cb_arg) 9065 { 9066 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 9067 int status = 0; 9068 9069 assert(cb_fn != NULL); 9070 9071 if (bdev_ch->histogram == NULL) { 9072 status = -EFAULT; 9073 } 9074 cb_fn(cb_arg, status, bdev_ch->histogram); 9075 } 9076 9077 size_t 9078 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 9079 size_t max_events) 9080 { 9081 struct media_event_entry *entry; 9082 size_t num_events = 0; 9083 9084 for (; num_events < max_events; ++num_events) { 9085 entry = TAILQ_FIRST(&desc->pending_media_events); 9086 if (entry == NULL) { 9087 break; 9088 } 9089 9090 events[num_events] = entry->event; 9091 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 9092 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 9093 } 9094 9095 return num_events; 9096 } 9097 9098 int 9099 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 9100 size_t num_events) 9101 { 9102 struct spdk_bdev_desc *desc; 9103 struct media_event_entry *entry; 
9104 size_t event_id; 9105 int rc = 0; 9106 9107 assert(bdev->media_events); 9108 9109 spdk_spin_lock(&bdev->internal.spinlock); 9110 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9111 if (desc->write) { 9112 break; 9113 } 9114 } 9115 9116 if (desc == NULL || desc->media_events_buffer == NULL) { 9117 rc = -ENODEV; 9118 goto out; 9119 } 9120 9121 for (event_id = 0; event_id < num_events; ++event_id) { 9122 entry = TAILQ_FIRST(&desc->free_media_events); 9123 if (entry == NULL) { 9124 break; 9125 } 9126 9127 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 9128 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 9129 entry->event = events[event_id]; 9130 } 9131 9132 rc = event_id; 9133 out: 9134 spdk_spin_unlock(&bdev->internal.spinlock); 9135 return rc; 9136 } 9137 9138 static void 9139 _media_management_notify(void *arg) 9140 { 9141 struct spdk_bdev_desc *desc = arg; 9142 9143 _event_notify(desc, SPDK_BDEV_EVENT_MEDIA_MANAGEMENT); 9144 } 9145 9146 void 9147 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 9148 { 9149 struct spdk_bdev_desc *desc; 9150 9151 spdk_spin_lock(&bdev->internal.spinlock); 9152 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 9153 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 9154 event_notify(desc, _media_management_notify); 9155 } 9156 } 9157 spdk_spin_unlock(&bdev->internal.spinlock); 9158 } 9159 9160 struct locked_lba_range_ctx { 9161 struct lba_range range; 9162 struct lba_range *current_range; 9163 struct lba_range *owner_range; 9164 struct spdk_poller *poller; 9165 lock_range_cb cb_fn; 9166 void *cb_arg; 9167 }; 9168 9169 static void 9170 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 9171 { 9172 struct locked_lba_range_ctx *ctx = _ctx; 9173 9174 ctx->cb_fn(&ctx->range, ctx->cb_arg, -ENOMEM); 9175 free(ctx); 9176 } 9177 9178 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 9179 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void 
*_ctx);

/*
 * Completion of the per-channel lock pass.
 *
 * On -ENOMEM a cleanup pass over all channels is started and the caller is notified from its
 * completion. Otherwise the owner channel's range copy is marked with its owner and the lock
 * callback is invoked with the pass status.
 */
static void
bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct locked_lba_range_ctx *ctx = _ctx;

	if (status == -ENOMEM) {
		/* One of the channels could not allocate a range object.
		 * So we have to go back and clean up any ranges that were
		 * allocated successfully before we return error status to
		 * the caller.  We can reuse the unlock function to do that
		 * clean up.
		 */
		spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx,
					   bdev_lock_error_cleanup_cb);
		return;
	}

	/* All channels have locked this range and no I/O overlapping the range
	 * are outstanding!  Set the owner_ch for the range object for the
	 * locking channel, so that this channel will know that it is allowed
	 * to write to this range.
	 */
	if (ctx->owner_range != NULL) {
		ctx->owner_range->owner_ch = ctx->range.owner_ch;
	}

	ctx->cb_fn(&ctx->range, ctx->cb_arg, status);

	/* Don't free the ctx here.  Its range is in the bdev's global list of
	 * locked ranges still, and will be removed and freed when this range
	 * is later unlocked.
	 */
}

/*
 * Poller body (also called directly) that checks whether the current channel still has
 * submitted I/O overlapping the range being locked. _i is the channel iterator of the lock
 * pass.
 */
static int
bdev_lock_lba_range_check_io(void *_i)
{
	struct spdk_bdev_channel_iter *i = _i;
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i);
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
	struct locked_lba_range_ctx *ctx = i->ctx;
	struct lba_range *range = ctx->current_range;
	struct spdk_bdev_io *bdev_io;

	/* Drop the poller from a previous attempt; it is registered again below if needed. */
	spdk_poller_unregister(&ctx->poller);

	/* The range is now in the locked_ranges, so no new IO can be submitted to this
	 * range. But we need to wait until any outstanding IO overlapping with this range
	 * are completed.
*/
	TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) {
		if (bdev_io_range_is_locked(bdev_io, range)) {
			/* Overlapping I/O is still outstanding: check again from a poller. */
			ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100);
			return SPDK_POLLER_BUSY;
		}
	}

	/* Nothing overlapping is outstanding on this channel; move on to the next one. */
	spdk_bdev_for_each_channel_continue(i, 0);
	return SPDK_POLLER_BUSY;
}

/*
 * Per-channel step of the lock pass: add a copy of the range to the channel's locked_ranges
 * list (unless it is already there) and then wait for overlapping outstanding I/O to drain.
 */
static void
bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				struct spdk_io_channel *_ch, void *_ctx)
{
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
	struct locked_lba_range_ctx *ctx = _ctx;
	struct lba_range *range;

	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (range->length == ctx->range.length &&
		    range->offset == ctx->range.offset &&
		    range->locked_ctx == ctx->range.locked_ctx) {
			/* This range already exists on this channel, so don't add
			 * it again.  This can happen when a new channel is created
			 * while the for_each_channel operation is in progress.
			 * Do not check for outstanding I/O in that case, since the
			 * range was locked before any I/O could be submitted to the
			 * new channel.
			 */
			spdk_bdev_for_each_channel_continue(i, 0);
			return;
		}
	}

	range = calloc(1, sizeof(*range));
	if (range == NULL) {
		/* Abort the pass; the completion callback handles -ENOMEM. */
		spdk_bdev_for_each_channel_continue(i, -ENOMEM);
		return;
	}

	range->length = ctx->range.length;
	range->offset = ctx->range.offset;
	range->locked_ctx = ctx->range.locked_ctx;
	ctx->current_range = range;
	if (ctx->range.owner_ch == ch) {
		/* This is the range object for the channel that will hold
		 * the lock.  Store it in the ctx object so that we can easily
		 * set its owner_ch after the lock is finally acquired.
*/
		ctx->owner_range = range;
	}
	TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq);
	/* First check runs inline; it registers a poller itself if I/O is still outstanding. */
	bdev_lock_lba_range_check_io(i);
}

/*
 * Start the per-channel lock pass for ctx. Must run on the range's owner thread, and the
 * owner channel (if any) must belong to that thread (both asserted).
 */
static void
bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx)
{
	assert(spdk_get_thread() == ctx->range.owner_thread);
	assert(ctx->range.owner_ch == NULL ||
	       spdk_io_channel_get_thread(ctx->range.owner_ch->channel) == ctx->range.owner_thread);

	/* We will add a copy of this range to each channel now. */
	spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx,
				   bdev_lock_lba_range_cb);
}

/* Return true if range overlaps any range on tailq, as decided by bdev_lba_range_overlapped(). */
static bool
bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq)
{
	struct lba_range *r;

	TAILQ_FOREACH(r, tailq, tailq) {
		if (bdev_lba_range_overlapped(range, r)) {
			return true;
		}
	}
	return false;
}

/*
 * Request a lock on [offset, offset + length) of bdev on behalf of channel ch (may be NULL).
 *
 * The calling thread becomes the range's owner thread and cb_arg doubles as the lock's
 * identifying context (locked_ctx). If the range overlaps a range that is already locked, the
 * request is parked on the pending list; otherwise the lock pass starts immediately.
 *
 * \return 0 if the request was accepted, -ENOMEM if the context could not be allocated.
 */
static int
_bdev_lock_lba_range(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch,
		     uint64_t offset, uint64_t length,
		     lock_range_cb cb_fn, void *cb_arg)
{
	struct locked_lba_range_ctx *ctx;

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}

	ctx->range.offset = offset;
	ctx->range.length = length;
	ctx->range.owner_thread = spdk_get_thread();
	ctx->range.owner_ch = ch;
	ctx->range.locked_ctx = cb_arg;
	ctx->range.bdev = bdev;
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	spdk_spin_lock(&bdev->internal.spinlock);
	if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) {
		/* There is an active lock overlapping with this range.
		 * Put it on the pending list until this range no
		 * longer overlaps with another.
*/
		TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq);
	} else {
		/* No conflict: publish the range on the bdev and start locking the channels. */
		TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq);
		bdev_lock_lba_range_ctx(bdev, ctx);
	}
	spdk_spin_unlock(&bdev->internal.spinlock);
	return 0;
}

/*
 * Lock an LBA range on the bdev behind desc on behalf of channel _ch.
 *
 * cb_arg must not be NULL; -EINVAL is returned otherwise. Apart from that the return value
 * is that of _bdev_lock_lba_range().
 */
static int
bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		    uint64_t offset, uint64_t length,
		    lock_range_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);

	if (cb_arg == NULL) {
		SPDK_ERRLOG("cb_arg must not be NULL\n");
		return -EINVAL;
	}

	return _bdev_lock_lba_range(bdev, ch, offset, length, cb_fn, cb_arg);
}

/* Thread message handler that starts the lock pass for a range taken off the pending list. */
static void
bdev_lock_lba_range_ctx_msg(void *_ctx)
{
	struct locked_lba_range_ctx *ctx = _ctx;

	bdev_lock_lba_range_ctx(ctx->range.bdev, ctx);
}

/*
 * Completion of the per-channel unlock pass: promote pending ranges that are no longer
 * blocked, then notify the caller and free the context of the range that was unlocked.
 */
static void
bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status)
{
	struct locked_lba_range_ctx *ctx = _ctx;
	struct locked_lba_range_ctx *pending_ctx;
	struct lba_range *range, *tmp;

	spdk_spin_lock(&bdev->internal.spinlock);
	/* Check if there are any pending locked ranges that overlap with this range
	 * that was just unlocked.  If there are, check that it doesn't overlap with any
	 * other locked ranges before calling bdev_lock_lba_range_ctx which will start
	 * the lock process.
*/
	TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) {
		if (bdev_lba_range_overlapped(range, &ctx->range) &&
		    !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) {
			/* Move the range from the pending list to the locked list and start
			 * its lock pass on the thread that owns it.
			 */
			TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq);
			pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
			TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq);
			spdk_thread_send_msg(pending_ctx->range.owner_thread,
					     bdev_lock_lba_range_ctx_msg, pending_ctx);
		}
	}
	spdk_spin_unlock(&bdev->internal.spinlock);

	ctx->cb_fn(&ctx->range, ctx->cb_arg, status);
	free(ctx);
}

/*
 * Per-channel step of the unlock pass: drop the channel's copy of the range, if present, and
 * resubmit every I/O that was parked on the channel's io_locked list.
 */
static void
bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
				  struct spdk_io_channel *_ch, void *_ctx)
{
	struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch);
	struct locked_lba_range_ctx *ctx = _ctx;
	TAILQ_HEAD(, spdk_bdev_io) io_locked;
	struct spdk_bdev_io *bdev_io;
	struct lba_range *range;

	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (ctx->range.offset == range->offset &&
		    ctx->range.length == range->length &&
		    ctx->range.locked_ctx == range->locked_ctx) {
			TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
			free(range);
			break;
		}
	}

	/* Note: we should almost always be able to assert that the range specified
	 * was found.  But there are some very rare corner cases where a new channel
	 * gets created simultaneously with a range unlock, where this function
	 * would execute on that new channel and wouldn't have the range.
	 * We also use this to clean up range allocations when a later allocation
	 * fails in the locking path.
	 * So we can't actually assert() here.
	 */

	/* Swap the locked IO into a temporary list, and then try to submit them again.
9431 * We could hyper-optimize this to only resubmit locked I/O that overlap 9432 * with the range that was just unlocked, but this isn't a performance path so 9433 * we go for simplicity here. 9434 */ 9435 TAILQ_INIT(&io_locked); 9436 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 9437 while (!TAILQ_EMPTY(&io_locked)) { 9438 bdev_io = TAILQ_FIRST(&io_locked); 9439 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 9440 bdev_io_submit(bdev_io); 9441 } 9442 9443 spdk_bdev_for_each_channel_continue(i, 0); 9444 } 9445 9446 static int 9447 _bdev_unlock_lba_range(struct spdk_bdev *bdev, uint64_t offset, uint64_t length, 9448 lock_range_cb cb_fn, void *cb_arg) 9449 { 9450 struct locked_lba_range_ctx *ctx; 9451 struct lba_range *range; 9452 9453 spdk_spin_lock(&bdev->internal.spinlock); 9454 /* To start the unlock the process, we find the range in the bdev's locked_ranges 9455 * and remove it. This ensures new channels don't inherit the locked range. 9456 * Then we will send a message to each channel to remove the range from its 9457 * per-channel list. 
9458 */ 9459 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 9460 if (range->offset == offset && range->length == length && 9461 (range->owner_ch == NULL || range->locked_ctx == cb_arg)) { 9462 break; 9463 } 9464 } 9465 if (range == NULL) { 9466 assert(false); 9467 spdk_spin_unlock(&bdev->internal.spinlock); 9468 return -EINVAL; 9469 } 9470 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 9471 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 9472 spdk_spin_unlock(&bdev->internal.spinlock); 9473 9474 ctx->cb_fn = cb_fn; 9475 ctx->cb_arg = cb_arg; 9476 9477 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 9478 bdev_unlock_lba_range_cb); 9479 return 0; 9480 } 9481 9482 static int 9483 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 9484 uint64_t offset, uint64_t length, 9485 lock_range_cb cb_fn, void *cb_arg) 9486 { 9487 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9488 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 9489 struct lba_range *range; 9490 bool range_found = false; 9491 9492 /* Let's make sure the specified channel actually has a lock on 9493 * the specified range. Note that the range must match exactly. 
9494 */ 9495 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 9496 if (range->offset == offset && range->length == length && 9497 range->owner_ch == ch && range->locked_ctx == cb_arg) { 9498 range_found = true; 9499 break; 9500 } 9501 } 9502 9503 if (!range_found) { 9504 return -EINVAL; 9505 } 9506 9507 return _bdev_unlock_lba_range(bdev, offset, length, cb_fn, cb_arg); 9508 } 9509 9510 struct bdev_quiesce_ctx { 9511 spdk_bdev_quiesce_cb cb_fn; 9512 void *cb_arg; 9513 }; 9514 9515 static void 9516 bdev_unquiesce_range_unlocked(struct lba_range *range, void *ctx, int status) 9517 { 9518 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 9519 9520 if (quiesce_ctx->cb_fn != NULL) { 9521 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 9522 } 9523 9524 free(quiesce_ctx); 9525 } 9526 9527 static void 9528 bdev_quiesce_range_locked(struct lba_range *range, void *ctx, int status) 9529 { 9530 struct bdev_quiesce_ctx *quiesce_ctx = ctx; 9531 struct spdk_bdev_module *module = range->bdev->module; 9532 9533 if (status != 0) { 9534 if (quiesce_ctx->cb_fn != NULL) { 9535 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 9536 } 9537 free(quiesce_ctx); 9538 return; 9539 } 9540 9541 spdk_spin_lock(&module->internal.spinlock); 9542 TAILQ_INSERT_TAIL(&module->internal.quiesced_ranges, range, tailq_module); 9543 spdk_spin_unlock(&module->internal.spinlock); 9544 9545 if (quiesce_ctx->cb_fn != NULL) { 9546 quiesce_ctx->cb_fn(quiesce_ctx->cb_arg, status); 9547 quiesce_ctx->cb_fn = NULL; 9548 quiesce_ctx->cb_arg = NULL; 9549 } 9550 /* quiesce_ctx will be freed on unquiesce */ 9551 } 9552 9553 static int 9554 _spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9555 uint64_t offset, uint64_t length, 9556 spdk_bdev_quiesce_cb cb_fn, void *cb_arg, 9557 bool unquiesce) 9558 { 9559 struct bdev_quiesce_ctx *quiesce_ctx; 9560 int rc; 9561 9562 if (module != bdev->module) { 9563 SPDK_ERRLOG("Bdev does not belong to specified module.\n"); 9564 return -EINVAL; 9565 } 9566 9567 if 
(!bdev_io_valid_blocks(bdev, offset, length)) { 9568 return -EINVAL; 9569 } 9570 9571 if (unquiesce) { 9572 struct lba_range *range; 9573 9574 /* Make sure the specified range is actually quiesced in the specified module and 9575 * then remove it from the list. Note that the range must match exactly. 9576 */ 9577 spdk_spin_lock(&module->internal.spinlock); 9578 TAILQ_FOREACH(range, &module->internal.quiesced_ranges, tailq_module) { 9579 if (range->bdev == bdev && range->offset == offset && range->length == length) { 9580 TAILQ_REMOVE(&module->internal.quiesced_ranges, range, tailq_module); 9581 break; 9582 } 9583 } 9584 spdk_spin_unlock(&module->internal.spinlock); 9585 9586 if (range == NULL) { 9587 SPDK_ERRLOG("The range to unquiesce was not found.\n"); 9588 return -EINVAL; 9589 } 9590 9591 quiesce_ctx = range->locked_ctx; 9592 quiesce_ctx->cb_fn = cb_fn; 9593 quiesce_ctx->cb_arg = cb_arg; 9594 9595 rc = _bdev_unlock_lba_range(bdev, offset, length, bdev_unquiesce_range_unlocked, quiesce_ctx); 9596 } else { 9597 quiesce_ctx = malloc(sizeof(*quiesce_ctx)); 9598 if (quiesce_ctx == NULL) { 9599 return -ENOMEM; 9600 } 9601 9602 quiesce_ctx->cb_fn = cb_fn; 9603 quiesce_ctx->cb_arg = cb_arg; 9604 9605 rc = _bdev_lock_lba_range(bdev, NULL, offset, length, bdev_quiesce_range_locked, quiesce_ctx); 9606 if (rc != 0) { 9607 free(quiesce_ctx); 9608 } 9609 } 9610 9611 return rc; 9612 } 9613 9614 int 9615 spdk_bdev_quiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9616 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 9617 { 9618 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, false); 9619 } 9620 9621 int 9622 spdk_bdev_unquiesce(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9623 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 9624 { 9625 return _spdk_bdev_quiesce(bdev, module, 0, bdev->blockcnt, cb_fn, cb_arg, true); 9626 } 9627 9628 int 9629 spdk_bdev_quiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9630 uint64_t 
offset, uint64_t length, 9631 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 9632 { 9633 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, false); 9634 } 9635 9636 int 9637 spdk_bdev_unquiesce_range(struct spdk_bdev *bdev, struct spdk_bdev_module *module, 9638 uint64_t offset, uint64_t length, 9639 spdk_bdev_quiesce_cb cb_fn, void *cb_arg) 9640 { 9641 return _spdk_bdev_quiesce(bdev, module, offset, length, cb_fn, cb_arg, true); 9642 } 9643 9644 int 9645 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 9646 int array_size) 9647 { 9648 if (!bdev) { 9649 return -EINVAL; 9650 } 9651 9652 if (bdev->fn_table->get_memory_domains) { 9653 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 9654 } 9655 9656 return 0; 9657 } 9658 9659 struct spdk_bdev_for_each_io_ctx { 9660 void *ctx; 9661 spdk_bdev_io_fn fn; 9662 spdk_bdev_for_each_io_cb cb; 9663 }; 9664 9665 static void 9666 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 9667 struct spdk_io_channel *io_ch, void *_ctx) 9668 { 9669 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 9670 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 9671 struct spdk_bdev_io *bdev_io; 9672 int rc = 0; 9673 9674 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 9675 rc = ctx->fn(ctx->ctx, bdev_io); 9676 if (rc != 0) { 9677 break; 9678 } 9679 } 9680 9681 spdk_bdev_for_each_channel_continue(i, rc); 9682 } 9683 9684 static void 9685 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 9686 { 9687 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 9688 9689 ctx->cb(ctx->ctx, status); 9690 9691 free(ctx); 9692 } 9693 9694 void 9695 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 9696 spdk_bdev_for_each_io_cb cb) 9697 { 9698 struct spdk_bdev_for_each_io_ctx *ctx; 9699 9700 assert(fn != NULL && cb != NULL); 9701 9702 ctx = calloc(1, sizeof(*ctx)); 9703 if (ctx == 
NULL) { 9704 SPDK_ERRLOG("Failed to allocate context.\n"); 9705 cb(_ctx, -ENOMEM); 9706 return; 9707 } 9708 9709 ctx->ctx = _ctx; 9710 ctx->fn = fn; 9711 ctx->cb = cb; 9712 9713 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 9714 bdev_for_each_io_done); 9715 } 9716 9717 void 9718 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 9719 { 9720 spdk_for_each_channel_continue(iter->i, status); 9721 } 9722 9723 static struct spdk_bdev * 9724 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 9725 { 9726 void *io_device = spdk_io_channel_iter_get_io_device(i); 9727 9728 return __bdev_from_io_dev(io_device); 9729 } 9730 9731 static void 9732 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 9733 { 9734 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 9735 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 9736 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 9737 9738 iter->i = i; 9739 iter->fn(iter, bdev, ch, iter->ctx); 9740 } 9741 9742 static void 9743 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 9744 { 9745 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 9746 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 9747 9748 iter->i = i; 9749 iter->cpl(bdev, iter->ctx, status); 9750 9751 free(iter); 9752 } 9753 9754 void 9755 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 9756 void *ctx, spdk_bdev_for_each_channel_done cpl) 9757 { 9758 struct spdk_bdev_channel_iter *iter; 9759 9760 assert(bdev != NULL && fn != NULL && ctx != NULL); 9761 9762 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 9763 if (iter == NULL) { 9764 SPDK_ERRLOG("Unable to allocate iterator\n"); 9765 assert(false); 9766 return; 9767 } 9768 9769 iter->fn = fn; 9770 iter->cpl = cpl; 9771 iter->ctx = ctx; 9772 9773 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 9774 iter, 
bdev_each_channel_cpl); 9775 } 9776 9777 static void 9778 bdev_copy_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9779 { 9780 struct spdk_bdev_io *parent_io = cb_arg; 9781 9782 spdk_bdev_free_io(bdev_io); 9783 9784 /* Check return status of write */ 9785 parent_io->internal.status = success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; 9786 parent_io->internal.cb(parent_io, success, parent_io->internal.caller_ctx); 9787 } 9788 9789 static void 9790 bdev_copy_do_write(void *_bdev_io) 9791 { 9792 struct spdk_bdev_io *bdev_io = _bdev_io; 9793 int rc; 9794 9795 /* Write blocks */ 9796 rc = spdk_bdev_write_blocks_with_md(bdev_io->internal.desc, 9797 spdk_io_channel_from_ctx(bdev_io->internal.ch), 9798 bdev_io->u.bdev.iovs[0].iov_base, 9799 bdev_io->u.bdev.md_buf, bdev_io->u.bdev.offset_blocks, 9800 bdev_io->u.bdev.num_blocks, bdev_copy_do_write_done, bdev_io); 9801 9802 if (rc == -ENOMEM) { 9803 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_write); 9804 } else if (rc != 0) { 9805 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 9806 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 9807 } 9808 } 9809 9810 static void 9811 bdev_copy_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 9812 { 9813 struct spdk_bdev_io *parent_io = cb_arg; 9814 9815 spdk_bdev_free_io(bdev_io); 9816 9817 /* Check return status of read */ 9818 if (!success) { 9819 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 9820 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 9821 return; 9822 } 9823 9824 /* Do write */ 9825 bdev_copy_do_write(parent_io); 9826 } 9827 9828 static void 9829 bdev_copy_do_read(void *_bdev_io) 9830 { 9831 struct spdk_bdev_io *bdev_io = _bdev_io; 9832 int rc; 9833 9834 /* Read blocks */ 9835 rc = spdk_bdev_read_blocks_with_md(bdev_io->internal.desc, 9836 spdk_io_channel_from_ctx(bdev_io->internal.ch), 9837 bdev_io->u.bdev.iovs[0].iov_base, 9838 
bdev_io->u.bdev.md_buf, bdev_io->u.bdev.copy.src_offset_blocks, 9839 bdev_io->u.bdev.num_blocks, bdev_copy_do_read_done, bdev_io); 9840 9841 if (rc == -ENOMEM) { 9842 bdev_queue_io_wait_with_cb(bdev_io, bdev_copy_do_read); 9843 } else if (rc != 0) { 9844 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 9845 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 9846 } 9847 } 9848 9849 static void 9850 bdev_copy_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 9851 { 9852 if (!success) { 9853 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 9854 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 9855 return; 9856 } 9857 9858 bdev_copy_do_read(bdev_io); 9859 } 9860 9861 int 9862 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 9863 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 9864 spdk_bdev_io_completion_cb cb, void *cb_arg) 9865 { 9866 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 9867 struct spdk_bdev_io *bdev_io; 9868 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 9869 9870 if (!desc->write) { 9871 return -EBADF; 9872 } 9873 9874 if (num_blocks == 0) { 9875 SPDK_ERRLOG("Can't copy 0 blocks\n"); 9876 return -EINVAL; 9877 } 9878 9879 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 9880 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 9881 SPDK_DEBUGLOG(bdev, 9882 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 9883 dst_offset_blocks, src_offset_blocks, num_blocks); 9884 return -EINVAL; 9885 } 9886 9887 bdev_io = bdev_channel_get_io(channel); 9888 if (!bdev_io) { 9889 return -ENOMEM; 9890 } 9891 9892 bdev_io->internal.ch = channel; 9893 bdev_io->internal.desc = desc; 9894 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 9895 9896 bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 9897 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 9898 
bdev_io->u.bdev.num_blocks = num_blocks; 9899 bdev_io->u.bdev.memory_domain = NULL; 9900 bdev_io->u.bdev.memory_domain_ctx = NULL; 9901 bdev_io->u.bdev.iovs = NULL; 9902 bdev_io->u.bdev.iovcnt = 0; 9903 bdev_io->u.bdev.md_buf = NULL; 9904 bdev_io->u.bdev.accel_sequence = NULL; 9905 bdev_io_init(bdev_io, bdev, cb_arg, cb); 9906 9907 if (dst_offset_blocks == src_offset_blocks) { 9908 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 9909 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 9910 9911 return 0; 9912 } 9913 9914 9915 /* If the copy size is large and should be split, use the generic split logic 9916 * regardless of whether SPDK_BDEV_IO_TYPE_COPY is supported or not. 9917 * 9918 * Then, send the copy request if SPDK_BDEV_IO_TYPE_COPY is supported or 9919 * emulate it using regular read and write requests otherwise. 9920 */ 9921 if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY) || 9922 bdev_io->internal.split) { 9923 bdev_io_submit(bdev_io); 9924 return 0; 9925 } 9926 9927 spdk_bdev_io_get_buf(bdev_io, bdev_copy_get_buf_cb, num_blocks * spdk_bdev_get_block_size(bdev)); 9928 9929 return 0; 9930 } 9931 9932 SPDK_LOG_REGISTER_COMPONENT(bdev) 9933 9934 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 9935 { 9936 struct spdk_trace_tpoint_opts opts[] = { 9937 { 9938 "BDEV_IO_START", TRACE_BDEV_IO_START, 9939 OWNER_BDEV, OBJECT_BDEV_IO, 1, 9940 { 9941 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 9942 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 9943 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 9944 { "len", SPDK_TRACE_ARG_TYPE_INT, 8 }, 9945 { "name", SPDK_TRACE_ARG_TYPE_STR, 40} 9946 } 9947 }, 9948 { 9949 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 9950 OWNER_BDEV, OBJECT_BDEV_IO, 0, 9951 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 9952 }, 9953 { 9954 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 9955 OWNER_BDEV, OBJECT_NONE, 1, 9956 { 9957 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 9958 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 9959 } 9960 }, 
9961 { 9962 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 9963 OWNER_BDEV, OBJECT_NONE, 0, 9964 { 9965 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 9966 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 9967 } 9968 }, 9969 }; 9970 9971 9972 spdk_trace_register_owner(OWNER_BDEV, 'b'); 9973 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 9974 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 9975 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 9976 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 9977 } 9978