1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #include "spdk/bdev.h" 10 11 #include "spdk/config.h" 12 #include "spdk/env.h" 13 #include "spdk/thread.h" 14 #include "spdk/likely.h" 15 #include "spdk/queue.h" 16 #include "spdk/nvme_spec.h" 17 #include "spdk/scsi_spec.h" 18 #include "spdk/notify.h" 19 #include "spdk/util.h" 20 #include "spdk/trace.h" 21 #include "spdk/dma.h" 22 23 #include "spdk/bdev_module.h" 24 #include "spdk/log.h" 25 #include "spdk/string.h" 26 27 #include "bdev_internal.h" 28 #include "spdk_internal/trace_defs.h" 29 30 #ifdef SPDK_CONFIG_VTUNE 31 #include "ittnotify.h" 32 #include "ittnotify_types.h" 33 int __itt_init_ittlib(const char *, __itt_group_id); 34 #endif 35 36 #define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1) 37 #define SPDK_BDEV_IO_CACHE_SIZE 256 38 #define SPDK_BDEV_AUTO_EXAMINE true 39 #define BUF_SMALL_POOL_SIZE 8191 40 #define BUF_LARGE_POOL_SIZE 1023 41 #define BUF_SMALL_CACHE_SIZE 128 42 #define BUF_LARGE_CACHE_SIZE 16 43 #define NOMEM_THRESHOLD_COUNT 8 44 45 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 46 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 47 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512 48 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000 49 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024) 50 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX 51 #define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000 52 53 /* The maximum number of children requests for a UNMAP or WRITE ZEROES command 54 * when splitting into children requests at a time. 55 */ 56 #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8) 57 #define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000 58 59 /* The maximum number of children requests for a COPY command 60 * when splitting into children requests at a time. 
61 */ 62 #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8) 63 64 SPDK_LOG_DEPRECATION_REGISTER(bdev_register_examine_thread, 65 "bdev register and examine on non-app thread", "SPDK 23.05", 0); 66 67 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 68 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 69 }; 70 71 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 72 73 RB_HEAD(bdev_name_tree, spdk_bdev_name); 74 75 static int 76 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2) 77 { 78 return strcmp(name1->name, name2->name); 79 } 80 81 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp); 82 83 struct spdk_bdev_mgr { 84 struct spdk_mempool *bdev_io_pool; 85 86 void *zero_buffer; 87 88 TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 89 90 struct spdk_bdev_list bdevs; 91 struct bdev_name_tree bdev_names; 92 93 bool init_complete; 94 bool module_init_complete; 95 96 struct spdk_spinlock spinlock; 97 98 #ifdef SPDK_CONFIG_VTUNE 99 __itt_domain *domain; 100 #endif 101 }; 102 103 static struct spdk_bdev_mgr g_bdev_mgr = { 104 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 105 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 106 .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names), 107 .init_complete = false, 108 .module_init_complete = false, 109 }; 110 111 static void 112 __attribute__((constructor)) 113 _bdev_init(void) 114 { 115 spdk_spin_init(&g_bdev_mgr.spinlock); 116 } 117 118 typedef void (*lock_range_cb)(void *ctx, int status); 119 120 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc); 121 122 struct lba_range { 123 uint64_t offset; 124 uint64_t length; 125 void *locked_ctx; 126 struct spdk_bdev_channel *owner_ch; 127 TAILQ_ENTRY(lba_range) tailq; 128 }; 129 130 static struct spdk_bdev_opts g_bdev_opts = { 131 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 132 .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, 133 .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, 134 .small_buf_pool_size = BUF_SMALL_POOL_SIZE, 135 .large_buf_pool_size = BUF_LARGE_POOL_SIZE, 136 }; 137 138 static spdk_bdev_init_cb g_init_cb_fn = NULL; 139 static void *g_init_cb_arg = NULL; 140 141 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 142 static void *g_fini_cb_arg = NULL; 143 static struct spdk_thread *g_fini_thread = NULL; 144 145 struct spdk_bdev_qos_limit { 146 /** IOs or bytes allowed per second (i.e., 1s). */ 147 uint64_t limit; 148 149 /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). 150 * For remaining bytes, allowed to run negative if an I/O is submitted when 151 * some bytes are remaining, but the I/O is bigger than that amount. The 152 * excess will be deducted from the next timeslice. 153 */ 154 int64_t remaining_this_timeslice; 155 156 /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 157 uint32_t min_per_timeslice; 158 159 /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 160 uint32_t max_per_timeslice; 161 162 /** Function to check whether to queue the IO. */ 163 bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 164 165 /** Function to update for the submitted IO. */ 166 void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 167 }; 168 169 struct spdk_bdev_qos { 170 /** Types of structure of rate limits. */ 171 struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 172 173 /** The channel that all I/O are funneled through. 
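	 *  All QoS-managed I/O is funneled through this single channel (and therefore a
	 *  single thread), so the rate-limit counters above can be updated without locking.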
*/ 174 struct spdk_bdev_channel *ch; 175 176 /** The thread on which the poller is running. */ 177 struct spdk_thread *thread; 178 179 /** Queue of I/O waiting to be issued. */ 180 bdev_io_tailq_t queued; 181 182 /** Size of a timeslice in tsc ticks. */ 183 uint64_t timeslice_size; 184 185 /** Timestamp of start of last timeslice. */ 186 uint64_t last_timeslice; 187 188 /** Poller that processes queued I/O commands each time slice. */ 189 struct spdk_poller *poller; 190 }; 191 192 struct spdk_bdev_mgmt_channel { 193 /* 194 * Each thread keeps a cache of bdev_io - this allows 195 * bdev threads which are *not* DPDK threads to still 196 * benefit from a per-thread bdev_io cache. Without 197 * this, non-DPDK threads fetching from the mempool 198 * incur a cmpxchg on get and put. 199 */ 200 bdev_io_stailq_t per_thread_cache; 201 uint32_t per_thread_cache_count; 202 uint32_t bdev_io_cache_size; 203 204 struct spdk_iobuf_channel iobuf; 205 206 TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; 207 TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; 208 }; 209 210 /* 211 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device 212 * will queue here their IO that awaits retry. It makes it possible to retry sending 213 * IO to one bdev after IO from other bdev completes. 214 */ 215 struct spdk_bdev_shared_resource { 216 /* The bdev management channel */ 217 struct spdk_bdev_mgmt_channel *mgmt_ch; 218 219 /* 220 * Count of I/O submitted to bdev module and waiting for completion. 221 * Incremented before submit_request() is called on an spdk_bdev_io. 222 */ 223 uint64_t io_outstanding; 224 225 /* 226 * Queue of IO awaiting retry because of a previous NOMEM status returned 227 * on this channel. 228 */ 229 bdev_io_tailq_t nomem_io; 230 231 /* 232 * Threshold which io_outstanding must drop to before retrying nomem_io. 233 */ 234 uint64_t nomem_threshold; 235 236 /* I/O channel allocated by a bdev module */ 237 struct spdk_io_channel *shared_ch; 238 239 /* Refcount of bdev channels using this resource */ 240 uint32_t ref; 241 242 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 243 }; 244 245 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 246 #define BDEV_CH_QOS_ENABLED (1 << 1) 247 248 struct spdk_bdev_channel { 249 struct spdk_bdev *bdev; 250 251 /* The channel for the underlying device */ 252 struct spdk_io_channel *channel; 253 254 /* Per io_device per thread data */ 255 struct spdk_bdev_shared_resource *shared_resource; 256 257 struct spdk_bdev_io_stat *stat; 258 259 /* 260 * Count of I/O submitted to the underlying dev module through this channel 261 * and waiting for completion. 262 */ 263 uint64_t io_outstanding; 264 265 /* 266 * List of all submitted I/Os including I/O that are generated via splitting. 267 */ 268 bdev_io_tailq_t io_submitted; 269 270 /* 271 * List of spdk_bdev_io that are currently queued because they write to a locked 272 * LBA range. 
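	 * They are resubmitted once the locked LBA range is released.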
273 */ 274 bdev_io_tailq_t io_locked; 275 276 uint32_t flags; 277 278 struct spdk_histogram_data *histogram; 279 280 #ifdef SPDK_CONFIG_VTUNE 281 uint64_t start_tsc; 282 uint64_t interval_tsc; 283 __itt_string_handle *handle; 284 struct spdk_bdev_io_stat *prev_stat; 285 #endif 286 287 bdev_io_tailq_t queued_resets; 288 289 lba_range_tailq_t locked_ranges; 290 }; 291 292 struct media_event_entry { 293 struct spdk_bdev_media_event event; 294 TAILQ_ENTRY(media_event_entry) tailq; 295 }; 296 297 #define MEDIA_EVENT_POOL_SIZE 64 298 299 struct spdk_bdev_desc { 300 struct spdk_bdev *bdev; 301 struct spdk_thread *thread; 302 struct { 303 spdk_bdev_event_cb_t event_fn; 304 void *ctx; 305 } callback; 306 bool closed; 307 bool write; 308 bool memory_domains_supported; 309 struct spdk_spinlock spinlock; 310 uint32_t refs; 311 TAILQ_HEAD(, media_event_entry) pending_media_events; 312 TAILQ_HEAD(, media_event_entry) free_media_events; 313 struct media_event_entry *media_events_buffer; 314 TAILQ_ENTRY(spdk_bdev_desc) link; 315 316 uint64_t timeout_in_sec; 317 spdk_bdev_io_timeout_cb cb_fn; 318 void *cb_arg; 319 struct spdk_poller *io_timeout_poller; 320 }; 321 322 struct spdk_bdev_iostat_ctx { 323 struct spdk_bdev_io_stat *stat; 324 spdk_bdev_get_device_stat_cb cb; 325 void *cb_arg; 326 }; 327 328 struct set_qos_limit_ctx { 329 void (*cb_fn)(void *cb_arg, int status); 330 void *cb_arg; 331 struct spdk_bdev *bdev; 332 }; 333 334 struct spdk_bdev_channel_iter { 335 spdk_bdev_for_each_channel_msg fn; 336 spdk_bdev_for_each_channel_done cpl; 337 struct spdk_io_channel_iter *i; 338 void *ctx; 339 }; 340 341 struct spdk_bdev_io_error_stat { 342 uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS]; 343 }; 344 345 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 346 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 347 #define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch)) 348 #define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch)) 349 350 static inline void bdev_io_complete(void *ctx); 351 352 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 353 static void bdev_write_zero_buffer_next(void *_bdev_io); 354 355 static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 356 struct spdk_io_channel *ch, void *_ctx); 357 static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status); 358 359 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 360 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 361 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 362 struct spdk_bdev_ext_io_opts *opts, bool copy_opts); 363 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 364 struct iovec *iov, int iovcnt, void *md_buf, 365 uint64_t offset_blocks, uint64_t num_blocks, 366 spdk_bdev_io_completion_cb cb, void *cb_arg, 367 struct spdk_bdev_ext_io_opts *opts, bool copy_opts); 368 369 static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 370 uint64_t offset, uint64_t length, 371 lock_range_cb cb_fn, void *cb_arg); 372 373 static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 374 uint64_t offset, uint64_t length, 375 lock_range_cb cb_fn, void *cb_arg); 376 377 static inline void bdev_io_complete(void *ctx); 378 379 static bool 
bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(small_buf_pool_size);
	SET_FIELD(large_buf_pool_size);

	/* Do not remove this statement. Always update it when adding a new field,
	 * and do not forget to add the SET_FIELD statement for the added field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}

SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_small_buf_pool_size, "spdk_bdev_opts.small_buf_pool_size",
			      "v23.05", 0);
SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_large_buf_pool_size, "spdk_bdev_opts.large_buf_pool_size",
			      "v23.05", 0);
int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	struct spdk_iobuf_opts iobuf_opts;
	uint32_t min_pool_size;
	int rc;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
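	 * For example, with bdev_io_cache_size = 256 and 4 threads, the pool must hold at
	 * least 256 * (4 + 1) = 1280 bdev_io, or the options below are rejected.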
440 */ 441 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 442 if (opts->bdev_io_pool_size < min_pool_size) { 443 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 444 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 445 spdk_thread_get_count()); 446 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 447 return -1; 448 } 449 450 if (opts->small_buf_pool_size != BUF_SMALL_POOL_SIZE) { 451 SPDK_LOG_DEPRECATED(bdev_opts_small_buf_pool_size); 452 } 453 if (opts->large_buf_pool_size != BUF_LARGE_POOL_SIZE) { 454 SPDK_LOG_DEPRECATED(bdev_opts_large_buf_pool_size); 455 } 456 457 #define SET_FIELD(field) \ 458 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 459 g_bdev_opts.field = opts->field; \ 460 } \ 461 462 SET_FIELD(bdev_io_pool_size); 463 SET_FIELD(bdev_io_cache_size); 464 SET_FIELD(bdev_auto_examine); 465 SET_FIELD(small_buf_pool_size); 466 SET_FIELD(large_buf_pool_size); 467 468 spdk_iobuf_get_opts(&iobuf_opts); 469 iobuf_opts.small_pool_count = opts->small_buf_pool_size; 470 iobuf_opts.large_pool_count = opts->large_buf_pool_size; 471 472 rc = spdk_iobuf_set_opts(&iobuf_opts); 473 if (rc != 0) { 474 SPDK_ERRLOG("Failed to set iobuf opts\n"); 475 return -1; 476 } 477 478 g_bdev_opts.opts_size = opts->opts_size; 479 480 #undef SET_FIELD 481 482 return 0; 483 } 484 485 static struct spdk_bdev * 486 bdev_get_by_name(const char *bdev_name) 487 { 488 struct spdk_bdev_name find; 489 struct spdk_bdev_name *res; 490 491 find.name = (char *)bdev_name; 492 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 493 if (res != NULL) { 494 return res->bdev; 495 } 496 497 return NULL; 498 } 499 500 struct spdk_bdev * 501 spdk_bdev_get_by_name(const char *bdev_name) 502 { 503 struct spdk_bdev *bdev; 504 505 spdk_spin_lock(&g_bdev_mgr.spinlock); 506 bdev = bdev_get_by_name(bdev_name); 507 spdk_spin_unlock(&g_bdev_mgr.spinlock); 508 509 return bdev; 510 } 511 512 struct bdev_io_status_string { 513 enum spdk_bdev_io_status status; 514 const char *str; 515 }; 516 517 static const struct bdev_io_status_string bdev_io_status_strings[] = { 518 { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" }, 519 { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" }, 520 { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" }, 521 { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" }, 522 { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" }, 523 { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" }, 524 { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" }, 525 { SPDK_BDEV_IO_STATUS_FAILED, "failed" }, 526 { SPDK_BDEV_IO_STATUS_PENDING, "pending" }, 527 { SPDK_BDEV_IO_STATUS_SUCCESS, "success" }, 528 }; 529 530 static const char * 531 bdev_io_status_get_string(enum spdk_bdev_io_status status) 532 { 533 uint32_t i; 534 535 for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) { 536 if (bdev_io_status_strings[i].status == status) { 537 return bdev_io_status_strings[i].str; 538 } 539 } 540 541 return "reserved"; 542 } 543 544 struct spdk_bdev_wait_for_examine_ctx { 545 struct spdk_poller *poller; 546 spdk_bdev_wait_for_examine_cb cb_fn; 547 void *cb_arg; 548 }; 549 550 static bool bdev_module_all_actions_completed(void); 551 552 static int 553 bdev_wait_for_examine_cb(void *arg) 554 { 555 struct spdk_bdev_wait_for_examine_ctx *ctx = arg; 556 557 if (!bdev_module_all_actions_completed()) { 558 return SPDK_POLLER_IDLE; 559 } 560 561 spdk_poller_unregister(&ctx->poller); 562 
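	/* All bdev modules have completed their init/examine actions; invoke the user
	 * callback once and free the wait context. */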
ctx->cb_fn(ctx->cb_arg); 563 free(ctx); 564 565 return SPDK_POLLER_BUSY; 566 } 567 568 int 569 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 570 { 571 struct spdk_bdev_wait_for_examine_ctx *ctx; 572 573 ctx = calloc(1, sizeof(*ctx)); 574 if (ctx == NULL) { 575 return -ENOMEM; 576 } 577 ctx->cb_fn = cb_fn; 578 ctx->cb_arg = cb_arg; 579 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 580 581 return 0; 582 } 583 584 struct spdk_bdev_examine_item { 585 char *name; 586 TAILQ_ENTRY(spdk_bdev_examine_item) link; 587 }; 588 589 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); 590 591 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 592 g_bdev_examine_allowlist); 593 594 static inline bool 595 bdev_examine_allowlist_check(const char *name) 596 { 597 struct spdk_bdev_examine_item *item; 598 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 599 if (strcmp(name, item->name) == 0) { 600 return true; 601 } 602 } 603 return false; 604 } 605 606 static inline void 607 bdev_examine_allowlist_free(void) 608 { 609 struct spdk_bdev_examine_item *item; 610 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 611 item = TAILQ_FIRST(&g_bdev_examine_allowlist); 612 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 613 free(item->name); 614 free(item); 615 } 616 } 617 618 static inline bool 619 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 620 { 621 struct spdk_bdev_alias *tmp; 622 if (bdev_examine_allowlist_check(bdev->name)) { 623 return true; 624 } 625 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 626 if (bdev_examine_allowlist_check(tmp->alias.name)) { 627 return true; 628 } 629 } 630 return false; 631 } 632 633 static inline bool 634 bdev_ok_to_examine(struct spdk_bdev *bdev) 635 { 636 if (g_bdev_opts.bdev_auto_examine) { 637 return true; 638 } else { 639 return bdev_in_examine_allowlist(bdev); 640 } 641 } 642 643 static void 644 bdev_examine(struct spdk_bdev *bdev) 645 { 646 struct spdk_bdev_module *module; 647 uint32_t action; 648 649 if (!bdev_ok_to_examine(bdev)) { 650 return; 651 } 652 653 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 654 if (module->examine_config) { 655 spdk_spin_lock(&module->internal.spinlock); 656 action = module->internal.action_in_progress; 657 module->internal.action_in_progress++; 658 spdk_spin_unlock(&module->internal.spinlock); 659 module->examine_config(bdev); 660 if (action != module->internal.action_in_progress) { 661 SPDK_ERRLOG("examine_config for module %s did not call " 662 "spdk_bdev_module_examine_done()\n", module->name); 663 } 664 } 665 } 666 667 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 668 module = bdev->internal.claim.v1.module; 669 if (module->examine_disk) { 670 spdk_spin_lock(&module->internal.spinlock); 671 module->internal.action_in_progress++; 672 spdk_spin_unlock(&module->internal.spinlock); 673 module->examine_disk(bdev); 674 } 675 return; 676 } 677 678 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 679 if (module->examine_disk) { 680 spdk_spin_lock(&module->internal.spinlock); 681 module->internal.action_in_progress++; 682 spdk_spin_unlock(&module->internal.spinlock); 683 module->examine_disk(bdev); 684 } 685 } 686 } 687 688 int 689 spdk_bdev_examine(const char *name) 690 { 691 struct spdk_bdev *bdev; 692 struct spdk_bdev_examine_item *item; 693 694 if (spdk_unlikely(spdk_thread_get_app_thread() != spdk_get_thread())) { 695 SPDK_LOG_DEPRECATED(bdev_register_examine_thread); 696 } 697 698 
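	/* Manual examine is only permitted when auto-examine is disabled; the name is
	 * also added to the allowlist below so a bdev registered later under this name
	 * gets examined as well. */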
if (g_bdev_opts.bdev_auto_examine) { 699 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); 700 return -EINVAL; 701 } 702 703 if (bdev_examine_allowlist_check(name)) { 704 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 705 return -EEXIST; 706 } 707 708 item = calloc(1, sizeof(*item)); 709 if (!item) { 710 return -ENOMEM; 711 } 712 item->name = strdup(name); 713 if (!item->name) { 714 free(item); 715 return -ENOMEM; 716 } 717 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 718 719 bdev = spdk_bdev_get_by_name(name); 720 if (bdev) { 721 bdev_examine(bdev); 722 } 723 return 0; 724 } 725 726 static inline void 727 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 728 { 729 struct spdk_bdev_examine_item *item; 730 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 731 spdk_json_write_object_begin(w); 732 spdk_json_write_named_string(w, "method", "bdev_examine"); 733 spdk_json_write_named_object_begin(w, "params"); 734 spdk_json_write_named_string(w, "name", item->name); 735 spdk_json_write_object_end(w); 736 spdk_json_write_object_end(w); 737 } 738 } 739 740 struct spdk_bdev * 741 spdk_bdev_first(void) 742 { 743 struct spdk_bdev *bdev; 744 745 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 746 if (bdev) { 747 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 748 } 749 750 return bdev; 751 } 752 753 struct spdk_bdev * 754 spdk_bdev_next(struct spdk_bdev *prev) 755 { 756 struct spdk_bdev *bdev; 757 758 bdev = TAILQ_NEXT(prev, internal.link); 759 if (bdev) { 760 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 761 } 762 763 return bdev; 764 } 765 766 static struct spdk_bdev * 767 _bdev_next_leaf(struct spdk_bdev *bdev) 768 { 769 while (bdev != NULL) { 770 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 771 return bdev; 772 } else { 773 bdev = TAILQ_NEXT(bdev, internal.link); 774 } 775 } 776 777 return bdev; 778 } 779 780 struct spdk_bdev * 781 spdk_bdev_first_leaf(void) 782 { 783 struct spdk_bdev *bdev; 784 785 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 786 787 if (bdev) { 788 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 789 } 790 791 return bdev; 792 } 793 794 struct spdk_bdev * 795 spdk_bdev_next_leaf(struct spdk_bdev *prev) 796 { 797 struct spdk_bdev *bdev; 798 799 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 800 801 if (bdev) { 802 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 803 } 804 805 return bdev; 806 } 807 808 static inline bool 809 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 810 { 811 return bdev_io->internal.ext_opts && bdev_io->internal.ext_opts->memory_domain; 812 } 813 814 void 815 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 816 { 817 struct iovec *iovs; 818 819 if (bdev_io->u.bdev.iovs == NULL) { 820 bdev_io->u.bdev.iovs = &bdev_io->iov; 821 bdev_io->u.bdev.iovcnt = 1; 822 } 823 824 iovs = bdev_io->u.bdev.iovs; 825 826 assert(iovs != NULL); 827 assert(bdev_io->u.bdev.iovcnt >= 1); 828 829 iovs[0].iov_base = buf; 830 iovs[0].iov_len = len; 831 } 832 833 void 834 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 835 { 836 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 837 bdev_io->u.bdev.md_buf = md_buf; 838 } 839 840 static bool 841 _is_buf_allocated(const struct iovec *iovs) 842 { 843 if (iovs == NULL) { 844 return false; 845 } 846 847 return iovs[0].iov_base != NULL; 848 } 849 850 static bool 
851 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 852 { 853 int i; 854 uintptr_t iov_base; 855 856 if (spdk_likely(alignment == 1)) { 857 return true; 858 } 859 860 for (i = 0; i < iovcnt; i++) { 861 iov_base = (uintptr_t)iovs[i].iov_base; 862 if ((iov_base & (alignment - 1)) != 0) { 863 return false; 864 } 865 } 866 867 return true; 868 } 869 870 static void 871 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 872 { 873 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 874 void *buf; 875 876 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 877 buf = bdev_io->internal.buf; 878 bdev_io->internal.buf = NULL; 879 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 880 bdev_io->internal.get_aux_buf_cb = NULL; 881 } else { 882 assert(bdev_io->internal.get_buf_cb != NULL); 883 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 884 bdev_io->internal.get_buf_cb = NULL; 885 } 886 } 887 888 static void 889 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 890 { 891 struct spdk_bdev_io *bdev_io = ctx; 892 893 if (rc) { 894 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 895 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 896 } 897 bdev_io_get_buf_complete(bdev_io, !rc); 898 } 899 900 static void 901 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 902 { 903 int rc = 0; 904 905 /* save original md_buf */ 906 bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 907 bdev_io->internal.orig_md_iov.iov_len = len; 908 bdev_io->internal.bounce_md_iov.iov_base = md_buf; 909 bdev_io->internal.bounce_md_iov.iov_len = len; 910 /* set bounce md_buf */ 911 bdev_io->u.bdev.md_buf = md_buf; 912 913 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 914 if (bdev_io_use_memory_domain(bdev_io)) { 915 rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain, 916 bdev_io->internal.ext_opts->memory_domain_ctx, 917 &bdev_io->internal.orig_md_iov, 1, 918 &bdev_io->internal.bounce_md_iov, 1, 919 bdev_io->internal.data_transfer_cpl, 920 bdev_io); 921 if (rc == 0) { 922 /* Continue to submit IO in completion callback */ 923 return; 924 } 925 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 926 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain), rc); 927 } else { 928 memcpy(md_buf, bdev_io->internal.orig_md_iov.iov_base, bdev_io->internal.orig_md_iov.iov_len); 929 } 930 } 931 932 assert(bdev_io->internal.data_transfer_cpl); 933 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 934 } 935 936 static void 937 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 938 { 939 struct spdk_bdev *bdev = bdev_io->bdev; 940 uint64_t md_len; 941 void *buf; 942 943 if (spdk_bdev_is_md_separate(bdev)) { 944 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 945 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 946 947 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 948 949 if (bdev_io->u.bdev.md_buf != NULL) { 950 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 951 return; 952 } else { 953 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 954 } 955 } 956 957 bdev_io_get_buf_complete(bdev_io, true); 958 } 959 960 static void 961 _bdev_io_pull_bounce_data_buf_done(void *ctx, int rc) 962 { 963 struct spdk_bdev_io *bdev_io = ctx; 964 965 if (rc) { 966 SPDK_ERRLOG("Failed to get data buffer\n"); 967 assert(bdev_io->internal.data_transfer_cpl); 968 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 969 return; 
970 } 971 972 _bdev_io_set_md_buf(bdev_io); 973 } 974 975 static void 976 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 977 bdev_copy_bounce_buffer_cpl cpl_cb) 978 { 979 int rc = 0; 980 981 bdev_io->internal.data_transfer_cpl = cpl_cb; 982 /* save original iovec */ 983 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 984 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 985 /* set bounce iov */ 986 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 987 bdev_io->u.bdev.iovcnt = 1; 988 /* set bounce buffer for this operation */ 989 bdev_io->u.bdev.iovs[0].iov_base = buf; 990 bdev_io->u.bdev.iovs[0].iov_len = len; 991 /* if this is write path, copy data from original buffer to bounce buffer */ 992 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 993 if (bdev_io_use_memory_domain(bdev_io)) { 994 rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain, 995 bdev_io->internal.ext_opts->memory_domain_ctx, 996 bdev_io->internal.orig_iovs, 997 (uint32_t) bdev_io->internal.orig_iovcnt, 998 bdev_io->u.bdev.iovs, 1, 999 _bdev_io_pull_bounce_data_buf_done, 1000 bdev_io); 1001 if (rc == 0) { 1002 /* Continue to submit IO in completion callback */ 1003 return; 1004 } 1005 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 1006 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 1007 } else { 1008 spdk_copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt); 1009 } 1010 } 1011 1012 _bdev_io_pull_bounce_data_buf_done(bdev_io, rc); 1013 } 1014 1015 static void 1016 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) 1017 { 1018 struct spdk_bdev *bdev = bdev_io->bdev; 1019 bool buf_allocated; 1020 uint64_t alignment; 1021 void *aligned_buf; 1022 1023 bdev_io->internal.buf = buf; 1024 1025 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1026 bdev_io_get_buf_complete(bdev_io, true); 1027 return; 1028 } 1029 1030 alignment = spdk_bdev_get_buf_align(bdev); 1031 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 1032 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 1033 1034 if (buf_allocated) { 1035 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 1036 /* Continue in completion callback */ 1037 return; 1038 } else { 1039 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 1040 } 1041 1042 _bdev_io_set_md_buf(bdev_io); 1043 } 1044 1045 static inline uint64_t 1046 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len) 1047 { 1048 struct spdk_bdev *bdev = bdev_io->bdev; 1049 uint64_t md_len, alignment; 1050 1051 md_len = spdk_bdev_is_md_separate(bdev) ? 
bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1052 alignment = spdk_bdev_get_buf_align(bdev); 1053 1054 return len + alignment + md_len; 1055 } 1056 1057 static void 1058 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 1059 { 1060 struct spdk_bdev_mgmt_channel *ch; 1061 1062 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1063 spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len)); 1064 } 1065 1066 static void 1067 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 1068 { 1069 assert(bdev_io->internal.buf != NULL); 1070 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len); 1071 bdev_io->internal.buf = NULL; 1072 } 1073 1074 void 1075 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1076 { 1077 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1078 1079 assert(buf != NULL); 1080 _bdev_io_put_buf(bdev_io, buf, len); 1081 } 1082 1083 static void 1084 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 1085 { 1086 struct spdk_bdev *bdev = bdev_ch->bdev; 1087 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1088 struct spdk_bdev_io *bdev_io; 1089 1090 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 1091 /* 1092 * Allow some more I/O to complete before retrying the nomem_io queue. 1093 * Some drivers (such as nvme) cannot immediately take a new I/O in 1094 * the context of a completion, because the resources for the I/O are 1095 * not released until control returns to the bdev poller. Also, we 1096 * may require several small I/O to complete before a larger I/O 1097 * (that requires splitting) can be submitted. 1098 */ 1099 return; 1100 } 1101 1102 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1103 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 1104 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 1105 bdev_io->internal.ch->io_outstanding++; 1106 shared_resource->io_outstanding++; 1107 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1108 bdev_io->internal.error.nvme.cdw0 = 0; 1109 bdev_io->num_retries++; 1110 bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 1111 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 1112 break; 1113 } 1114 } 1115 } 1116 1117 static inline void 1118 _bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1119 struct spdk_bdev_shared_resource *shared_resource) 1120 { 1121 assert(bdev_ch->io_outstanding > 0); 1122 assert(shared_resource->io_outstanding > 0); 1123 bdev_ch->io_outstanding--; 1124 shared_resource->io_outstanding--; 1125 } 1126 1127 static inline bool 1128 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io) 1129 { 1130 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1131 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1132 1133 if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) { 1134 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 1135 /* 1136 * Wait for some of the outstanding I/O to complete before we 1137 * retry any of the nomem_io. Normally we will wait for 1138 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 1139 * depth channels we will instead wait for half to complete. 
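		 * For example, with 100 outstanding I/O the threshold becomes 92 (100 - 8),
		 * while with a queue depth of 10 it becomes 5 (half of the outstanding I/O).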
1140 */ 1141 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 1142 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 1143 return true; 1144 } 1145 1146 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1147 bdev_ch_retry_io(bdev_ch); 1148 } 1149 1150 return false; 1151 } 1152 1153 static void 1154 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1155 { 1156 struct spdk_bdev_io *bdev_io = ctx; 1157 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1158 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1159 1160 if (rc) { 1161 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1162 } 1163 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1164 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). 1165 */ 1166 bdev_io_put_buf(bdev_io); 1167 1168 /* Continue with IO completion flow */ 1169 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 1170 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 1171 return; 1172 } 1173 1174 bdev_io_complete(bdev_io); 1175 } 1176 1177 static inline void 1178 _bdev_io_push_bounce_md_buffer(struct spdk_bdev_io *bdev_io) 1179 { 1180 int rc = 0; 1181 1182 /* do the same for metadata buffer */ 1183 if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) { 1184 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1185 1186 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1187 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 1188 if (bdev_io_use_memory_domain(bdev_io)) { 1189 /* If memory domain is used then we need to call async push function */ 1190 rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain, 1191 bdev_io->internal.ext_opts->memory_domain_ctx, 1192 &bdev_io->internal.orig_md_iov, 1193 (uint32_t)bdev_io->internal.orig_iovcnt, 1194 &bdev_io->internal.bounce_md_iov, 1, 1195 bdev_io->internal.data_transfer_cpl, 1196 bdev_io); 1197 if (rc == 0) { 1198 /* Continue IO completion in async callback */ 1199 return; 1200 } 1201 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1202 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 1203 } else { 1204 memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1205 bdev_io->internal.orig_md_iov.iov_len); 1206 } 1207 } 1208 } 1209 1210 assert(bdev_io->internal.data_transfer_cpl); 1211 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1212 } 1213 1214 static void 1215 _bdev_io_push_bounce_data_buffer_done(void *ctx, int rc) 1216 { 1217 struct spdk_bdev_io *bdev_io = ctx; 1218 1219 assert(bdev_io->internal.data_transfer_cpl); 1220 1221 if (rc) { 1222 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1223 return; 1224 } 1225 1226 /* set original buffer for this io */ 1227 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 1228 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 1229 /* disable bouncing buffer for this io */ 1230 bdev_io->internal.orig_iovcnt = 0; 1231 bdev_io->internal.orig_iovs = NULL; 1232 1233 _bdev_io_push_bounce_md_buffer(bdev_io); 1234 } 1235 1236 static inline void 1237 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1238 { 1239 int rc = 0; 1240 1241 bdev_io->internal.data_transfer_cpl = cpl_cb; 1242 1243 /* if this is read path, copy data from bounce buffer to original buffer */ 1244 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1245 bdev_io->internal.status == 
SPDK_BDEV_IO_STATUS_SUCCESS) { 1246 if (bdev_io_use_memory_domain(bdev_io)) { 1247 /* If memory domain is used then we need to call async push function */ 1248 rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain, 1249 bdev_io->internal.ext_opts->memory_domain_ctx, 1250 bdev_io->internal.orig_iovs, 1251 (uint32_t)bdev_io->internal.orig_iovcnt, 1252 &bdev_io->internal.bounce_iov, 1, 1253 _bdev_io_push_bounce_data_buffer_done, 1254 bdev_io); 1255 if (rc == 0) { 1256 /* Continue IO completion in async callback */ 1257 return; 1258 } 1259 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1260 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 1261 } else { 1262 spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs, 1263 bdev_io->internal.orig_iovcnt, 1264 bdev_io->internal.bounce_iov.iov_base, 1265 bdev_io->internal.bounce_iov.iov_len); 1266 } 1267 } 1268 1269 _bdev_io_push_bounce_data_buffer_done(bdev_io, rc); 1270 } 1271 1272 static void 1273 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1274 { 1275 struct spdk_bdev_io *bdev_io; 1276 1277 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1278 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len); 1279 } 1280 1281 static void 1282 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1283 { 1284 struct spdk_bdev_mgmt_channel *mgmt_ch; 1285 uint64_t max_len; 1286 void *buf; 1287 1288 assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1289 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1290 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1291 1292 if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) { 1293 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1294 bdev_io_get_buf_complete(bdev_io, false); 1295 return; 1296 } 1297 1298 bdev_io->internal.buf_len = len; 1299 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1300 bdev_io_get_iobuf_cb); 1301 if (buf != NULL) { 1302 _bdev_io_set_buf(bdev_io, buf, len); 1303 } 1304 } 1305 1306 void 1307 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1308 { 1309 struct spdk_bdev *bdev = bdev_io->bdev; 1310 uint64_t alignment; 1311 1312 assert(cb != NULL); 1313 bdev_io->internal.get_buf_cb = cb; 1314 1315 alignment = spdk_bdev_get_buf_align(bdev); 1316 1317 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1318 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1319 /* Buffer already present and aligned */ 1320 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1321 return; 1322 } 1323 1324 bdev_io_get_buf(bdev_io, len); 1325 } 1326 1327 static void 1328 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1329 bool success) 1330 { 1331 if (!success) { 1332 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1333 bdev_io_complete(bdev_io); 1334 } else { 1335 bdev_io_submit(bdev_io); 1336 } 1337 } 1338 1339 static void 1340 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1341 uint64_t len) 1342 { 1343 assert(cb != NULL); 1344 bdev_io->internal.get_buf_cb = cb; 1345 1346 bdev_io_get_buf(bdev_io, len); 1347 } 1348 1349 void 1350 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1351 { 1352 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1353 1354 assert(cb != NULL); 1355 assert(bdev_io->internal.get_aux_buf_cb == NULL); 
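	/* Only one aux buffer request may be outstanding per bdev_io; the callback is
	 * cleared again in bdev_io_get_buf_complete() once the buffer is handed over. */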
1356 bdev_io->internal.get_aux_buf_cb = cb; 1357 bdev_io_get_buf(bdev_io, len); 1358 } 1359 1360 static int 1361 bdev_module_get_max_ctx_size(void) 1362 { 1363 struct spdk_bdev_module *bdev_module; 1364 int max_bdev_module_size = 0; 1365 1366 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1367 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1368 max_bdev_module_size = bdev_module->get_ctx_size(); 1369 } 1370 } 1371 1372 return max_bdev_module_size; 1373 } 1374 1375 static void 1376 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1377 { 1378 int i; 1379 struct spdk_bdev_qos *qos = bdev->internal.qos; 1380 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1381 1382 if (!qos) { 1383 return; 1384 } 1385 1386 spdk_bdev_get_qos_rate_limits(bdev, limits); 1387 1388 spdk_json_write_object_begin(w); 1389 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1390 1391 spdk_json_write_named_object_begin(w, "params"); 1392 spdk_json_write_named_string(w, "name", bdev->name); 1393 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1394 if (limits[i] > 0) { 1395 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1396 } 1397 } 1398 spdk_json_write_object_end(w); 1399 1400 spdk_json_write_object_end(w); 1401 } 1402 1403 void 1404 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1405 { 1406 struct spdk_bdev_module *bdev_module; 1407 struct spdk_bdev *bdev; 1408 1409 assert(w != NULL); 1410 1411 spdk_json_write_array_begin(w); 1412 1413 spdk_json_write_object_begin(w); 1414 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1415 spdk_json_write_named_object_begin(w, "params"); 1416 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1417 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1418 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1419 spdk_json_write_object_end(w); 1420 spdk_json_write_object_end(w); 1421 1422 bdev_examine_allowlist_config_json(w); 1423 1424 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1425 if (bdev_module->config_json) { 1426 bdev_module->config_json(w); 1427 } 1428 } 1429 1430 spdk_spin_lock(&g_bdev_mgr.spinlock); 1431 1432 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1433 if (bdev->fn_table->write_config_json) { 1434 bdev->fn_table->write_config_json(bdev, w); 1435 } 1436 1437 bdev_qos_config_json(bdev, w); 1438 } 1439 1440 spdk_spin_unlock(&g_bdev_mgr.spinlock); 1441 1442 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1443 spdk_json_write_object_begin(w); 1444 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1445 spdk_json_write_object_end(w); 1446 1447 spdk_json_write_array_end(w); 1448 } 1449 1450 static void 1451 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1452 { 1453 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1454 struct spdk_bdev_io *bdev_io; 1455 1456 spdk_iobuf_channel_fini(&ch->iobuf); 1457 1458 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1459 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1460 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1461 ch->per_thread_cache_count--; 1462 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1463 } 1464 1465 assert(ch->per_thread_cache_count == 0); 1466 } 1467 1468 static int 1469 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1470 { 1471 
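	/* Per-thread management channel: set up an iobuf channel for data buffers and
	 * pre-populate a bdev_io cache so this thread cannot be starved by other threads
	 * hitting the global mempool. */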
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;
	uint32_t i;
	int rc;

	rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", BUF_SMALL_CACHE_SIZE, BUF_LARGE_CACHE_SIZE);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc));
		return -1;
	}

	STAILQ_INIT(&ch->per_thread_cache);
	ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;

	/* Pre-populate bdev_io cache to ensure this thread cannot be starved. */
	ch->per_thread_cache_count = 0;
	for (i = 0; i < ch->bdev_io_cache_size; i++) {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
		if (bdev_io == NULL) {
			SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n");
			assert(false);
			bdev_mgmt_channel_destroy(io_device, ctx_buf);
			return -1;
		}
		ch->per_thread_cache_count++;
		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
	}

	TAILQ_INIT(&ch->shared_resources);
	TAILQ_INIT(&ch->io_wait_queue);

	return 0;
}

static void
bdev_init_complete(int rc)
{
	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
	void *cb_arg = g_init_cb_arg;
	struct spdk_bdev_module *m;

	g_bdev_mgr.init_complete = true;
	g_init_cb_fn = NULL;
	g_init_cb_arg = NULL;

	/*
	 * For modules that need to know when subsystem init is complete,
	 * inform them now.
	 */
	if (rc == 0) {
		TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (m->init_complete) {
				m->init_complete();
			}
		}
	}

	cb_fn(cb_arg, rc);
}

static bool
bdev_module_all_actions_completed(void)
{
	struct spdk_bdev_module *m;

	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (m->internal.action_in_progress > 0) {
			return false;
		}
	}
	return true;
}

static void
bdev_module_action_complete(void)
{
	/*
	 * Don't finish bdev subsystem initialization if
	 * module pre-initialization is still in progress, or
	 * the subsystem has already been initialized.
	 */
	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
		return;
	}

	/*
	 * Check all bdev modules for inits/examinations in progress. If any
	 * exist, return immediately since we cannot finish bdev subsystem
	 * initialization until all are completed.
	 */
	if (!bdev_module_all_actions_completed()) {
		return;
	}

	/*
	 * Modules already finished initialization - now that all
	 * the bdev modules have finished their asynchronous I/O
	 * processing, the entire bdev layer can be marked as complete.
1569 */ 1570 bdev_init_complete(0); 1571 } 1572 1573 static void 1574 bdev_module_action_done(struct spdk_bdev_module *module) 1575 { 1576 spdk_spin_lock(&module->internal.spinlock); 1577 assert(module->internal.action_in_progress > 0); 1578 module->internal.action_in_progress--; 1579 spdk_spin_unlock(&module->internal.spinlock); 1580 bdev_module_action_complete(); 1581 } 1582 1583 void 1584 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 1585 { 1586 assert(module->async_init); 1587 bdev_module_action_done(module); 1588 } 1589 1590 void 1591 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 1592 { 1593 bdev_module_action_done(module); 1594 } 1595 1596 /** The last initialized bdev module */ 1597 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 1598 1599 static void 1600 bdev_init_failed(void *cb_arg) 1601 { 1602 struct spdk_bdev_module *module = cb_arg; 1603 1604 spdk_spin_lock(&module->internal.spinlock); 1605 assert(module->internal.action_in_progress > 0); 1606 module->internal.action_in_progress--; 1607 spdk_spin_unlock(&module->internal.spinlock); 1608 bdev_init_complete(-1); 1609 } 1610 1611 static int 1612 bdev_modules_init(void) 1613 { 1614 struct spdk_bdev_module *module; 1615 int rc = 0; 1616 1617 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1618 g_resume_bdev_module = module; 1619 if (module->async_init) { 1620 spdk_spin_lock(&module->internal.spinlock); 1621 module->internal.action_in_progress = 1; 1622 spdk_spin_unlock(&module->internal.spinlock); 1623 } 1624 rc = module->module_init(); 1625 if (rc != 0) { 1626 /* Bump action_in_progress to prevent other modules from completion of modules_init 1627 * Send message to defer application shutdown until resources are cleaned up */ 1628 spdk_spin_lock(&module->internal.spinlock); 1629 module->internal.action_in_progress = 1; 1630 spdk_spin_unlock(&module->internal.spinlock); 1631 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 1632 return rc; 1633 } 1634 } 1635 1636 g_resume_bdev_module = NULL; 1637 return 0; 1638 } 1639 1640 void 1641 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 1642 { 1643 int rc = 0; 1644 char mempool_name[32]; 1645 1646 assert(cb_fn != NULL); 1647 1648 g_init_cb_fn = cb_fn; 1649 g_init_cb_arg = cb_arg; 1650 1651 spdk_notify_type_register("bdev_register"); 1652 spdk_notify_type_register("bdev_unregister"); 1653 1654 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 1655 1656 rc = spdk_iobuf_register_module("bdev"); 1657 if (rc != 0) { 1658 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 1659 bdev_init_complete(-1); 1660 return; 1661 } 1662 1663 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 1664 g_bdev_opts.bdev_io_pool_size, 1665 sizeof(struct spdk_bdev_io) + 1666 bdev_module_get_max_ctx_size(), 1667 0, 1668 SPDK_ENV_SOCKET_ID_ANY); 1669 1670 if (g_bdev_mgr.bdev_io_pool == NULL) { 1671 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 1672 bdev_init_complete(-1); 1673 return; 1674 } 1675 1676 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 1677 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 1678 if (!g_bdev_mgr.zero_buffer) { 1679 SPDK_ERRLOG("create bdev zero buffer failed\n"); 1680 bdev_init_complete(-1); 1681 return; 1682 } 1683 1684 #ifdef SPDK_CONFIG_VTUNE 1685 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 1686 #endif 1687 1688 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 1689 
bdev_mgmt_channel_destroy, 1690 sizeof(struct spdk_bdev_mgmt_channel), 1691 "bdev_mgr"); 1692 1693 rc = bdev_modules_init(); 1694 g_bdev_mgr.module_init_complete = true; 1695 if (rc != 0) { 1696 SPDK_ERRLOG("bdev modules init failed\n"); 1697 return; 1698 } 1699 1700 bdev_module_action_complete(); 1701 } 1702 1703 static void 1704 bdev_mgr_unregister_cb(void *io_device) 1705 { 1706 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 1707 1708 if (g_bdev_mgr.bdev_io_pool) { 1709 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 1710 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 1711 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 1712 g_bdev_opts.bdev_io_pool_size); 1713 } 1714 1715 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 1716 } 1717 1718 spdk_free(g_bdev_mgr.zero_buffer); 1719 1720 bdev_examine_allowlist_free(); 1721 1722 cb_fn(g_fini_cb_arg); 1723 g_fini_cb_fn = NULL; 1724 g_fini_cb_arg = NULL; 1725 g_bdev_mgr.init_complete = false; 1726 g_bdev_mgr.module_init_complete = false; 1727 } 1728 1729 static void 1730 bdev_module_fini_iter(void *arg) 1731 { 1732 struct spdk_bdev_module *bdev_module; 1733 1734 /* FIXME: Handling initialization failures is broken now, 1735 * so we won't even try cleaning up after successfully 1736 * initialized modules. if module_init_complete is false, 1737 * just call spdk_bdev_mgr_unregister_cb 1738 */ 1739 if (!g_bdev_mgr.module_init_complete) { 1740 bdev_mgr_unregister_cb(NULL); 1741 return; 1742 } 1743 1744 /* Start iterating from the last touched module */ 1745 if (!g_resume_bdev_module) { 1746 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1747 } else { 1748 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 1749 internal.tailq); 1750 } 1751 1752 while (bdev_module) { 1753 if (bdev_module->async_fini) { 1754 /* Save our place so we can resume later. We must 1755 * save the variable here, before calling module_fini() 1756 * below, because in some cases the module may immediately 1757 * call spdk_bdev_module_fini_done() and re-enter 1758 * this function to continue iterating. */ 1759 g_resume_bdev_module = bdev_module; 1760 } 1761 1762 if (bdev_module->module_fini) { 1763 bdev_module->module_fini(); 1764 } 1765 1766 if (bdev_module->async_fini) { 1767 return; 1768 } 1769 1770 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 1771 internal.tailq); 1772 } 1773 1774 g_resume_bdev_module = NULL; 1775 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 1776 } 1777 1778 void 1779 spdk_bdev_module_fini_done(void) 1780 { 1781 if (spdk_get_thread() != g_fini_thread) { 1782 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 1783 } else { 1784 bdev_module_fini_iter(NULL); 1785 } 1786 } 1787 1788 static void 1789 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 1790 { 1791 struct spdk_bdev *bdev = cb_arg; 1792 1793 if (bdeverrno && bdev) { 1794 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 1795 bdev->name); 1796 1797 /* 1798 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 1799 * bdev; try to continue by manually removing this bdev from the list and continue 1800 * with the next bdev in the list. 
		 */
		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
	}

	if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
		SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n");
		/*
		 * Bdev module finish needs to be deferred as we might be in the middle of some context
		 * (like bdev part free) that will use this bdev (or private bdev driver ctx data)
		 * after returning.
		 */
		spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL);
		return;
	}

	/*
	 * Unregister the last unclaimed bdev in the list, to ensure that bdev subsystem
	 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity
	 * to detect clean shutdown as opposed to run-time hot removal of the underlying
	 * base bdevs.
	 *
	 * Also, walk the list in reverse order.
	 */
	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
		spdk_spin_lock(&bdev->internal.spinlock);
		if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) {
			SPDK_DEBUGLOG(bdev, "Skipping claimed bdev '%s'(<-'%s').\n",
				      bdev->name, bdev->internal.claim.v1.module->name);
			spdk_spin_unlock(&bdev->internal.spinlock);
			continue;
		}
		spdk_spin_unlock(&bdev->internal.spinlock);

		SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name);
		spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
		return;
	}

	/*
	 * If any bdev fails to unclaim its underlying bdev properly, we may end up with
	 * a bdev list consisting of claimed bdevs only (if claims are managed
	 * correctly, this would mean there's a loop in the claims graph which is
	 * clearly impossible). Warn and unregister the last bdev on the list in that case.
	 */
	for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list);
	     bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) {
		SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name);
		spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev);
		return;
	}
}

static void
bdev_module_fini_start_iter(void *arg)
{
	struct spdk_bdev_module *bdev_module;

	if (!g_resume_bdev_module) {
		bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
	} else {
		bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq);
	}

	while (bdev_module) {
		if (bdev_module->async_fini_start) {
			/* Save our place so we can resume later. We must
			 * save the variable here, before calling fini_start()
			 * below, because in some cases the module may immediately
			 * call spdk_bdev_module_fini_start_done() and re-enter
			 * this function to continue iterating.
*/ 1872 g_resume_bdev_module = bdev_module; 1873 } 1874 1875 if (bdev_module->fini_start) { 1876 bdev_module->fini_start(); 1877 } 1878 1879 if (bdev_module->async_fini_start) { 1880 return; 1881 } 1882 1883 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 1884 } 1885 1886 g_resume_bdev_module = NULL; 1887 1888 bdev_finish_unregister_bdevs_iter(NULL, 0); 1889 } 1890 1891 void 1892 spdk_bdev_module_fini_start_done(void) 1893 { 1894 if (spdk_get_thread() != g_fini_thread) { 1895 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 1896 } else { 1897 bdev_module_fini_start_iter(NULL); 1898 } 1899 } 1900 1901 static void 1902 bdev_finish_wait_for_examine_done(void *cb_arg) 1903 { 1904 bdev_module_fini_start_iter(NULL); 1905 } 1906 1907 void 1908 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 1909 { 1910 int rc; 1911 1912 assert(cb_fn != NULL); 1913 1914 g_fini_thread = spdk_get_thread(); 1915 1916 g_fini_cb_fn = cb_fn; 1917 g_fini_cb_arg = cb_arg; 1918 1919 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 1920 if (rc != 0) { 1921 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 1922 bdev_finish_wait_for_examine_done(NULL); 1923 } 1924 } 1925 1926 struct spdk_bdev_io * 1927 bdev_channel_get_io(struct spdk_bdev_channel *channel) 1928 { 1929 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 1930 struct spdk_bdev_io *bdev_io; 1931 1932 if (ch->per_thread_cache_count > 0) { 1933 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1934 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1935 ch->per_thread_cache_count--; 1936 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 1937 /* 1938 * Don't try to look for bdev_ios in the global pool if there are 1939 * waiters on bdev_ios - we don't want this caller to jump the line. 1940 */ 1941 bdev_io = NULL; 1942 } else { 1943 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1944 } 1945 1946 return bdev_io; 1947 } 1948 1949 void 1950 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 1951 { 1952 struct spdk_bdev_mgmt_channel *ch; 1953 1954 assert(bdev_io != NULL); 1955 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 1956 1957 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1958 1959 if (bdev_io->internal.buf != NULL) { 1960 bdev_io_put_buf(bdev_io); 1961 } 1962 1963 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 1964 ch->per_thread_cache_count++; 1965 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1966 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 1967 struct spdk_bdev_io_wait_entry *entry; 1968 1969 entry = TAILQ_FIRST(&ch->io_wait_queue); 1970 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 1971 entry->cb_fn(entry->cb_arg); 1972 } 1973 } else { 1974 /* We should never have a full cache with entries on the io wait queue. 
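 * A waiter is only queued after bdev_channel_get_io() came up empty, and the loop above hands
 * each I/O returned to the cache straight to the next waiter, so a full cache with waiters
 * still pending should not be reachable.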
*/ 1975 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 1976 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1977 } 1978 } 1979 1980 static bool 1981 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 1982 { 1983 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 1984 1985 switch (limit) { 1986 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 1987 return true; 1988 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 1989 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 1990 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 1991 return false; 1992 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 1993 default: 1994 return false; 1995 } 1996 } 1997 1998 static bool 1999 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2000 { 2001 switch (bdev_io->type) { 2002 case SPDK_BDEV_IO_TYPE_NVME_IO: 2003 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2004 case SPDK_BDEV_IO_TYPE_READ: 2005 case SPDK_BDEV_IO_TYPE_WRITE: 2006 return true; 2007 case SPDK_BDEV_IO_TYPE_ZCOPY: 2008 if (bdev_io->u.bdev.zcopy.start) { 2009 return true; 2010 } else { 2011 return false; 2012 } 2013 default: 2014 return false; 2015 } 2016 } 2017 2018 static bool 2019 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2020 { 2021 switch (bdev_io->type) { 2022 case SPDK_BDEV_IO_TYPE_NVME_IO: 2023 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2024 /* Bit 1 (0x2) set for read operation */ 2025 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2026 return true; 2027 } else { 2028 return false; 2029 } 2030 case SPDK_BDEV_IO_TYPE_READ: 2031 return true; 2032 case SPDK_BDEV_IO_TYPE_ZCOPY: 2033 /* Populate to read from disk */ 2034 if (bdev_io->u.bdev.zcopy.populate) { 2035 return true; 2036 } else { 2037 return false; 2038 } 2039 default: 2040 return false; 2041 } 2042 } 2043 2044 static uint64_t 2045 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2046 { 2047 struct spdk_bdev *bdev = bdev_io->bdev; 2048 2049 switch (bdev_io->type) { 2050 case SPDK_BDEV_IO_TYPE_NVME_IO: 2051 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2052 return bdev_io->u.nvme_passthru.nbytes; 2053 case SPDK_BDEV_IO_TYPE_READ: 2054 case SPDK_BDEV_IO_TYPE_WRITE: 2055 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2056 case SPDK_BDEV_IO_TYPE_ZCOPY: 2057 /* Track the data in the start phase only */ 2058 if (bdev_io->u.bdev.zcopy.start) { 2059 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2060 } else { 2061 return 0; 2062 } 2063 default: 2064 return 0; 2065 } 2066 } 2067 2068 static bool 2069 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2070 { 2071 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 2072 return true; 2073 } else { 2074 return false; 2075 } 2076 } 2077 2078 static bool 2079 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2080 { 2081 if (bdev_is_read_io(io) == false) { 2082 return false; 2083 } 2084 2085 return bdev_qos_rw_queue_io(limit, io); 2086 } 2087 2088 static bool 2089 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2090 { 2091 if (bdev_is_read_io(io) == true) { 2092 return false; 2093 } 2094 2095 return bdev_qos_rw_queue_io(limit, io); 2096 } 2097 2098 static void 2099 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2100 { 2101 limit->remaining_this_timeslice--; 2102 } 2103 2104 static void 2105 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2106 { 2107 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 2108 } 2109 2110 static void 2111 
bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2112 { 2113 if (bdev_is_read_io(io) == false) { 2114 return; 2115 } 2116 2117 return bdev_qos_rw_bps_update_quota(limit, io); 2118 } 2119 2120 static void 2121 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2122 { 2123 if (bdev_is_read_io(io) == true) { 2124 return; 2125 } 2126 2127 return bdev_qos_rw_bps_update_quota(limit, io); 2128 } 2129 2130 static void 2131 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2132 { 2133 int i; 2134 2135 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2136 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2137 qos->rate_limits[i].queue_io = NULL; 2138 qos->rate_limits[i].update_quota = NULL; 2139 continue; 2140 } 2141 2142 switch (i) { 2143 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2144 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2145 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2146 break; 2147 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2148 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2149 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2150 break; 2151 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2152 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2153 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2154 break; 2155 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2156 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2157 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2158 break; 2159 default: 2160 break; 2161 } 2162 } 2163 } 2164 2165 static void 2166 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2167 struct spdk_bdev_io *bdev_io, 2168 enum spdk_bdev_io_status status) 2169 { 2170 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2171 2172 bdev_io->internal.in_submit_request = true; 2173 bdev_ch->io_outstanding++; 2174 shared_resource->io_outstanding++; 2175 spdk_bdev_io_complete(bdev_io, status); 2176 bdev_io->internal.in_submit_request = false; 2177 } 2178 2179 static inline void 2180 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2181 { 2182 struct spdk_bdev *bdev = bdev_io->bdev; 2183 struct spdk_io_channel *ch = bdev_ch->channel; 2184 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2185 2186 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2187 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2188 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2189 2190 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2191 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2192 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2193 SPDK_BDEV_IO_STATUS_SUCCESS); 2194 return; 2195 } 2196 } 2197 2198 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2199 bdev_io->bdev->split_on_write_unit && 2200 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2201 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2202 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2203 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2204 return; 2205 } 2206 2207 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2208 bdev_ch->io_outstanding++; 2209 shared_resource->io_outstanding++; 2210 bdev_io->internal.in_submit_request = true; 2211 bdev->fn_table->submit_request(ch, bdev_io); 2212 
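/* in_submit_request is left set across submit_request() above so the completion path can tell
 * when an I/O was completed from inside the module's submit callback; such completions are
 * presumably deferred rather than processed inline, to avoid unbounded recursion if the
 * completion callback submits more I/O.
 */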
bdev_io->internal.in_submit_request = false; 2213 } else { 2214 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 2215 } 2216 } 2217 2218 static bool 2219 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2220 { 2221 int i; 2222 2223 if (bdev_qos_io_to_limit(bdev_io) == true) { 2224 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2225 if (!qos->rate_limits[i].queue_io) { 2226 continue; 2227 } 2228 2229 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2230 bdev_io) == true) { 2231 return true; 2232 } 2233 } 2234 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2235 if (!qos->rate_limits[i].update_quota) { 2236 continue; 2237 } 2238 2239 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2240 } 2241 } 2242 2243 return false; 2244 } 2245 2246 static inline void 2247 _bdev_io_do_submit(void *ctx) 2248 { 2249 struct spdk_bdev_io *bdev_io = ctx; 2250 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2251 2252 bdev_io_do_submit(ch, bdev_io); 2253 } 2254 2255 static int 2256 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2257 { 2258 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2259 int submitted_ios = 0; 2260 2261 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2262 if (!bdev_qos_queue_io(qos, bdev_io)) { 2263 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2264 2265 if (bdev_io->internal.io_submit_ch) { 2266 /* Send back the IO to the original thread for the actual processing. */ 2267 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 2268 bdev_io->internal.io_submit_ch = NULL; 2269 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 2270 _bdev_io_do_submit, bdev_io); 2271 } else { 2272 bdev_io_do_submit(ch, bdev_io); 2273 } 2274 2275 submitted_ios++; 2276 } 2277 } 2278 2279 return submitted_ios; 2280 } 2281 2282 static void 2283 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2284 { 2285 int rc; 2286 2287 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2288 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2289 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2290 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2291 &bdev_io->internal.waitq_entry); 2292 if (rc != 0) { 2293 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2294 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2295 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2296 } 2297 } 2298 2299 static bool 2300 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2301 { 2302 uint32_t io_boundary; 2303 struct spdk_bdev *bdev = bdev_io->bdev; 2304 uint32_t max_size = bdev->max_segment_size; 2305 int max_segs = bdev->max_num_segments; 2306 2307 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2308 io_boundary = bdev->write_unit_size; 2309 } else if (bdev->split_on_optimal_io_boundary) { 2310 io_boundary = bdev->optimal_io_boundary; 2311 } else { 2312 io_boundary = 0; 2313 } 2314 2315 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2316 return false; 2317 } 2318 2319 if (io_boundary) { 2320 uint64_t start_stripe, end_stripe; 2321 2322 start_stripe = bdev_io->u.bdev.offset_blocks; 2323 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2324 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
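 * As a rough example (made-up values): with optimal_io_boundary = 8 and an I/O covering
 * offset_blocks 6 through 9 (num_blocks = 4), start_stripe = 6 >> 3 = 0 and
 * end_stripe = 9 >> 3 = 1, so the stripes differ and the I/O must be split at the boundary.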
*/ 2325 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2326 start_stripe >>= spdk_u32log2(io_boundary); 2327 end_stripe >>= spdk_u32log2(io_boundary); 2328 } else { 2329 start_stripe /= io_boundary; 2330 end_stripe /= io_boundary; 2331 } 2332 2333 if (start_stripe != end_stripe) { 2334 return true; 2335 } 2336 } 2337 2338 if (max_segs) { 2339 if (bdev_io->u.bdev.iovcnt > max_segs) { 2340 return true; 2341 } 2342 } 2343 2344 if (max_size) { 2345 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2346 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2347 return true; 2348 } 2349 } 2350 } 2351 2352 return false; 2353 } 2354 2355 static bool 2356 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2357 { 2358 uint32_t num_unmap_segments; 2359 2360 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2361 return false; 2362 } 2363 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2364 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2365 return true; 2366 } 2367 2368 return false; 2369 } 2370 2371 static bool 2372 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2373 { 2374 if (!bdev_io->bdev->max_write_zeroes) { 2375 return false; 2376 } 2377 2378 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2379 return true; 2380 } 2381 2382 return false; 2383 } 2384 2385 static bool 2386 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2387 { 2388 if (bdev_io->bdev->max_copy != 0 && 2389 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2390 return true; 2391 } 2392 2393 return false; 2394 } 2395 2396 static bool 2397 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2398 { 2399 switch (bdev_io->type) { 2400 case SPDK_BDEV_IO_TYPE_READ: 2401 case SPDK_BDEV_IO_TYPE_WRITE: 2402 return bdev_rw_should_split(bdev_io); 2403 case SPDK_BDEV_IO_TYPE_UNMAP: 2404 return bdev_unmap_should_split(bdev_io); 2405 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2406 return bdev_write_zeroes_should_split(bdev_io); 2407 case SPDK_BDEV_IO_TYPE_COPY: 2408 return bdev_copy_should_split(bdev_io); 2409 default: 2410 return false; 2411 } 2412 } 2413 2414 static uint32_t 2415 _to_next_boundary(uint64_t offset, uint32_t boundary) 2416 { 2417 return (boundary - (offset % boundary)); 2418 } 2419 2420 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2421 2422 static void _bdev_rw_split(void *_bdev_io); 2423 2424 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2425 2426 static void 2427 _bdev_unmap_split(void *_bdev_io) 2428 { 2429 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2430 } 2431 2432 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2433 2434 static void 2435 _bdev_write_zeroes_split(void *_bdev_io) 2436 { 2437 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2438 } 2439 2440 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 2441 2442 static void 2443 _bdev_copy_split(void *_bdev_io) 2444 { 2445 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 2446 } 2447 2448 static int 2449 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2450 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2451 { 2452 int rc; 2453 uint64_t current_offset, current_remaining, current_src_offset; 2454 spdk_bdev_io_wait_cb io_wait_fn; 2455 2456 current_offset = *offset; 2457 current_remaining = *remaining; 2458 2459 bdev_io->u.bdev.split_outstanding++; 2460 2461 io_wait_fn = 
_bdev_rw_split; 2462 switch (bdev_io->type) { 2463 case SPDK_BDEV_IO_TYPE_READ: 2464 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2465 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2466 iov, iovcnt, md_buf, current_offset, 2467 num_blocks, 2468 bdev_io_split_done, bdev_io, 2469 bdev_io->internal.ext_opts, true); 2470 break; 2471 case SPDK_BDEV_IO_TYPE_WRITE: 2472 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2473 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2474 iov, iovcnt, md_buf, current_offset, 2475 num_blocks, 2476 bdev_io_split_done, bdev_io, 2477 bdev_io->internal.ext_opts, true); 2478 break; 2479 case SPDK_BDEV_IO_TYPE_UNMAP: 2480 io_wait_fn = _bdev_unmap_split; 2481 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2482 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2483 current_offset, num_blocks, 2484 bdev_io_split_done, bdev_io); 2485 break; 2486 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2487 io_wait_fn = _bdev_write_zeroes_split; 2488 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2489 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2490 current_offset, num_blocks, 2491 bdev_io_split_done, bdev_io); 2492 break; 2493 case SPDK_BDEV_IO_TYPE_COPY: 2494 io_wait_fn = _bdev_copy_split; 2495 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 2496 (current_offset - bdev_io->u.bdev.offset_blocks); 2497 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 2498 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2499 current_offset, current_src_offset, num_blocks, 2500 bdev_io_split_done, bdev_io); 2501 break; 2502 default: 2503 assert(false); 2504 rc = -EINVAL; 2505 break; 2506 } 2507 2508 if (rc == 0) { 2509 current_offset += num_blocks; 2510 current_remaining -= num_blocks; 2511 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2512 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2513 *offset = current_offset; 2514 *remaining = current_remaining; 2515 } else { 2516 bdev_io->u.bdev.split_outstanding--; 2517 if (rc == -ENOMEM) { 2518 if (bdev_io->u.bdev.split_outstanding == 0) { 2519 /* No I/O is outstanding. Hence we should wait here. */ 2520 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2521 } 2522 } else { 2523 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2524 if (bdev_io->u.bdev.split_outstanding == 0) { 2525 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2526 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2527 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2528 } 2529 } 2530 } 2531 2532 return rc; 2533 } 2534 2535 static void 2536 _bdev_rw_split(void *_bdev_io) 2537 { 2538 struct iovec *parent_iov, *iov; 2539 struct spdk_bdev_io *bdev_io = _bdev_io; 2540 struct spdk_bdev *bdev = bdev_io->bdev; 2541 uint64_t parent_offset, current_offset, remaining; 2542 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2543 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2544 uint32_t iovcnt, iov_len, child_iovsize; 2545 uint32_t blocklen = bdev->blocklen; 2546 uint32_t io_boundary; 2547 uint32_t max_segment_size = bdev->max_segment_size; 2548 uint32_t max_child_iovcnt = bdev->max_num_segments; 2549 void *md_buf = NULL; 2550 int rc; 2551 2552 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 2553 max_child_iovcnt = max_child_iovcnt ? 
spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 2554 SPDK_BDEV_IO_NUM_CHILD_IOV; 2555 2556 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2557 io_boundary = bdev->write_unit_size; 2558 } else if (bdev->split_on_optimal_io_boundary) { 2559 io_boundary = bdev->optimal_io_boundary; 2560 } else { 2561 io_boundary = UINT32_MAX; 2562 } 2563 2564 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2565 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 2566 parent_offset = bdev_io->u.bdev.offset_blocks; 2567 parent_iov_offset = (current_offset - parent_offset) * blocklen; 2568 parent_iovcnt = bdev_io->u.bdev.iovcnt; 2569 2570 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 2571 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2572 if (parent_iov_offset < parent_iov->iov_len) { 2573 break; 2574 } 2575 parent_iov_offset -= parent_iov->iov_len; 2576 } 2577 2578 child_iovcnt = 0; 2579 while (remaining > 0 && parent_iovpos < parent_iovcnt && 2580 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 2581 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 2582 to_next_boundary = spdk_min(remaining, to_next_boundary); 2583 to_next_boundary_bytes = to_next_boundary * blocklen; 2584 2585 iov = &bdev_io->child_iov[child_iovcnt]; 2586 iovcnt = 0; 2587 2588 if (bdev_io->u.bdev.md_buf) { 2589 md_buf = (char *)bdev_io->u.bdev.md_buf + 2590 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 2591 } 2592 2593 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 2594 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 2595 iovcnt < child_iovsize) { 2596 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2597 iov_len = parent_iov->iov_len - parent_iov_offset; 2598 2599 iov_len = spdk_min(iov_len, max_segment_size); 2600 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 2601 to_next_boundary_bytes -= iov_len; 2602 2603 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 2604 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 2605 2606 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 2607 parent_iov_offset += iov_len; 2608 } else { 2609 parent_iovpos++; 2610 parent_iov_offset = 0; 2611 } 2612 child_iovcnt++; 2613 iovcnt++; 2614 } 2615 2616 if (to_next_boundary_bytes > 0) { 2617 /* We had to stop this child I/O early because we ran out of 2618 * child_iov space or were limited by max_num_segments. 2619 * Ensure the iovs to be aligned with block size and 2620 * then adjust to_next_boundary before starting the 2621 * child I/O. 2622 */ 2623 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || 2624 iovcnt == child_iovsize); 2625 to_last_block_bytes = to_next_boundary_bytes % blocklen; 2626 if (to_last_block_bytes != 0) { 2627 uint32_t child_iovpos = child_iovcnt - 1; 2628 /* don't decrease child_iovcnt when it equals to SPDK_BDEV_IO_NUM_CHILD_IOV 2629 * so the loop will naturally end 2630 */ 2631 2632 to_last_block_bytes = blocklen - to_last_block_bytes; 2633 to_next_boundary_bytes += to_last_block_bytes; 2634 while (to_last_block_bytes > 0 && iovcnt > 0) { 2635 iov_len = spdk_min(to_last_block_bytes, 2636 bdev_io->child_iov[child_iovpos].iov_len); 2637 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 2638 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 2639 child_iovpos--; 2640 if (--iovcnt == 0) { 2641 /* If the child IO is less than a block size just return. 
2642 * If the first child IO of any split round is less than 2643 * a block size, exit with an error. 2644 */ 2645 if (bdev_io->u.bdev.split_outstanding == 0) { 2646 SPDK_ERRLOG("The first child io was less than a block size\n"); 2647 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2648 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2649 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2650 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2651 } 2652 2653 return; 2654 } 2655 } 2656 2657 to_last_block_bytes -= iov_len; 2658 2659 if (parent_iov_offset == 0) { 2660 parent_iovpos--; 2661 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 2662 } 2663 parent_iov_offset -= iov_len; 2664 } 2665 2666 assert(to_last_block_bytes == 0); 2667 } 2668 to_next_boundary -= to_next_boundary_bytes / blocklen; 2669 } 2670 2671 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 2672 &current_offset, &remaining); 2673 if (spdk_unlikely(rc)) { 2674 return; 2675 } 2676 } 2677 } 2678 2679 static void 2680 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 2681 { 2682 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 2683 uint32_t num_children_reqs = 0; 2684 int rc; 2685 2686 offset = bdev_io->u.bdev.split_current_offset_blocks; 2687 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2688 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 2689 2690 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2691 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 2692 2693 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 2694 &offset, &remaining); 2695 if (spdk_likely(rc == 0)) { 2696 num_children_reqs++; 2697 } else { 2698 return; 2699 } 2700 } 2701 } 2702 2703 static void 2704 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 2705 { 2706 uint64_t offset, write_zeroes_blocks, remaining; 2707 uint32_t num_children_reqs = 0; 2708 int rc; 2709 2710 offset = bdev_io->u.bdev.split_current_offset_blocks; 2711 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2712 2713 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2714 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 2715 2716 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 2717 &offset, &remaining); 2718 if (spdk_likely(rc == 0)) { 2719 num_children_reqs++; 2720 } else { 2721 return; 2722 } 2723 } 2724 } 2725 2726 static void 2727 bdev_copy_split(struct spdk_bdev_io *bdev_io) 2728 { 2729 uint64_t offset, copy_blocks, remaining; 2730 uint32_t num_children_reqs = 0; 2731 int rc; 2732 2733 offset = bdev_io->u.bdev.split_current_offset_blocks; 2734 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2735 2736 assert(bdev_io->bdev->max_copy != 0); 2737 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 2738 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 2739 2740 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 2741 &offset, &remaining); 2742 if (spdk_likely(rc == 0)) { 2743 num_children_reqs++; 2744 } else { 2745 return; 2746 } 2747 } 2748 } 2749 2750 static void 2751 parent_bdev_io_complete(void *ctx, int rc) 2752 { 2753 struct spdk_bdev_io *parent_io = ctx; 2754 2755 if (rc) { 2756 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2757 } 2758 2759 parent_io->internal.cb(parent_io,
parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 2760 parent_io->internal.caller_ctx); 2761 } 2762 2763 static void 2764 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 2765 { 2766 struct spdk_bdev_io *parent_io = cb_arg; 2767 2768 spdk_bdev_free_io(bdev_io); 2769 2770 if (!success) { 2771 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2772 /* If any child I/O failed, stop further splitting process. */ 2773 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 2774 parent_io->u.bdev.split_remaining_num_blocks = 0; 2775 } 2776 parent_io->u.bdev.split_outstanding--; 2777 if (parent_io->u.bdev.split_outstanding != 0) { 2778 return; 2779 } 2780 2781 /* 2782 * Parent I/O finishes when all blocks are consumed. 2783 */ 2784 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 2785 assert(parent_io->internal.cb != bdev_io_split_done); 2786 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 2787 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 2788 2789 if (parent_io->internal.orig_iovcnt != 0) { 2790 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 2791 /* bdev IO will be completed in the callback */ 2792 } else { 2793 parent_bdev_io_complete(parent_io, 0); 2794 } 2795 return; 2796 } 2797 2798 /* 2799 * Continue with the splitting process. This function will complete the parent I/O if the 2800 * splitting is done. 2801 */ 2802 switch (parent_io->type) { 2803 case SPDK_BDEV_IO_TYPE_READ: 2804 case SPDK_BDEV_IO_TYPE_WRITE: 2805 _bdev_rw_split(parent_io); 2806 break; 2807 case SPDK_BDEV_IO_TYPE_UNMAP: 2808 bdev_unmap_split(parent_io); 2809 break; 2810 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2811 bdev_write_zeroes_split(parent_io); 2812 break; 2813 case SPDK_BDEV_IO_TYPE_COPY: 2814 bdev_copy_split(parent_io); 2815 break; 2816 default: 2817 assert(false); 2818 break; 2819 } 2820 } 2821 2822 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2823 bool success); 2824 2825 static void 2826 bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 2827 { 2828 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 2829 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 2830 bdev_io->u.bdev.split_outstanding = 0; 2831 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 2832 2833 switch (bdev_io->type) { 2834 case SPDK_BDEV_IO_TYPE_READ: 2835 case SPDK_BDEV_IO_TYPE_WRITE: 2836 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 2837 _bdev_rw_split(bdev_io); 2838 } else { 2839 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 2840 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 2841 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 2842 } 2843 break; 2844 case SPDK_BDEV_IO_TYPE_UNMAP: 2845 bdev_unmap_split(bdev_io); 2846 break; 2847 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2848 bdev_write_zeroes_split(bdev_io); 2849 break; 2850 case SPDK_BDEV_IO_TYPE_COPY: 2851 bdev_copy_split(bdev_io); 2852 break; 2853 default: 2854 assert(false); 2855 break; 2856 } 2857 } 2858 2859 static void 2860 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 2861 { 2862 if (!success) { 2863 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2864 return; 2865 } 2866 2867 _bdev_rw_split(bdev_io); 2868 } 2869 2870 /* Explicitly mark this inline, since it's used as a function pointer and 
otherwise won't 2871 * be inlined, at least on some compilers. 2872 */ 2873 static inline void 2874 _bdev_io_submit(void *ctx) 2875 { 2876 struct spdk_bdev_io *bdev_io = ctx; 2877 struct spdk_bdev *bdev = bdev_io->bdev; 2878 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2879 2880 if (spdk_likely(bdev_ch->flags == 0)) { 2881 bdev_io_do_submit(bdev_ch, bdev_io); 2882 return; 2883 } 2884 2885 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 2886 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 2887 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 2888 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 2889 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 2890 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2891 } else { 2892 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 2893 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 2894 } 2895 } else { 2896 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 2897 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2898 } 2899 } 2900 2901 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 2902 2903 bool 2904 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 2905 { 2906 if (range1->length == 0 || range2->length == 0) { 2907 return false; 2908 } 2909 2910 if (range1->offset + range1->length <= range2->offset) { 2911 return false; 2912 } 2913 2914 if (range2->offset + range2->length <= range1->offset) { 2915 return false; 2916 } 2917 2918 return true; 2919 } 2920 2921 static bool 2922 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 2923 { 2924 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2925 struct lba_range r; 2926 2927 switch (bdev_io->type) { 2928 case SPDK_BDEV_IO_TYPE_NVME_IO: 2929 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2930 /* Don't try to decode the NVMe command - just assume worst-case and that 2931 * it overlaps a locked range. 2932 */ 2933 return true; 2934 case SPDK_BDEV_IO_TYPE_WRITE: 2935 case SPDK_BDEV_IO_TYPE_UNMAP: 2936 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2937 case SPDK_BDEV_IO_TYPE_ZCOPY: 2938 case SPDK_BDEV_IO_TYPE_COPY: 2939 r.offset = bdev_io->u.bdev.offset_blocks; 2940 r.length = bdev_io->u.bdev.num_blocks; 2941 if (!bdev_lba_range_overlapped(range, &r)) { 2942 /* This I/O doesn't overlap the specified LBA range. */ 2943 return false; 2944 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 2945 /* This I/O overlaps, but the I/O is on the same channel that locked this 2946 * range, and the caller_ctx is the same as the locked_ctx. This means 2947 * that this I/O is associated with the lock, and is allowed to execute. 
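 * As an illustration (made-up numbers): with a range locked at offset 100, length 50, a write
 * covering blocks 120-129 submitted from another channel, or with a caller_ctx that does not
 * match the locked_ctx, is treated as locked, while the same write issued on the owning channel
 * with the matching context is allowed through.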
2948 */ 2949 return false; 2950 } else { 2951 return true; 2952 } 2953 default: 2954 return false; 2955 } 2956 } 2957 2958 void 2959 bdev_io_submit(struct spdk_bdev_io *bdev_io) 2960 { 2961 struct spdk_bdev *bdev = bdev_io->bdev; 2962 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 2963 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2964 2965 assert(thread != NULL); 2966 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2967 2968 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 2969 struct lba_range *range; 2970 2971 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 2972 if (bdev_io_range_is_locked(bdev_io, range)) { 2973 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 2974 return; 2975 } 2976 } 2977 } 2978 2979 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 2980 2981 bdev_io->internal.submit_tsc = spdk_get_ticks(); 2982 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 2983 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 2984 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 2985 spdk_bdev_get_name(bdev)); 2986 2987 if (bdev_io_should_split(bdev_io)) { 2988 bdev_io_split(NULL, bdev_io); 2989 return; 2990 } 2991 2992 if (ch->flags & BDEV_CH_QOS_ENABLED) { 2993 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 2994 _bdev_io_submit(bdev_io); 2995 } else { 2996 bdev_io->internal.io_submit_ch = ch; 2997 bdev_io->internal.ch = bdev->internal.qos->ch; 2998 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 2999 } 3000 } else { 3001 _bdev_io_submit(bdev_io); 3002 } 3003 } 3004 3005 static inline void 3006 _bdev_io_copy_ext_opts(struct spdk_bdev_io *bdev_io, struct spdk_bdev_ext_io_opts *opts) 3007 { 3008 struct spdk_bdev_ext_io_opts *opts_copy = &bdev_io->internal.ext_opts_copy; 3009 3010 /* Zero part we don't copy */ 3011 memset(((char *)opts_copy) + opts->size, 0, sizeof(*opts) - opts->size); 3012 memcpy(opts_copy, opts, opts->size); 3013 opts_copy->size = sizeof(*opts_copy); 3014 opts_copy->metadata = bdev_io->u.bdev.md_buf; 3015 /* Save pointer to the copied ext_opts which will be used by bdev modules */ 3016 bdev_io->u.bdev.ext_opts = opts_copy; 3017 } 3018 3019 static inline void 3020 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3021 { 3022 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 3023 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3024 * For write operation we need to pull buffers from memory domain before submitting IO. 
3025 * Once read operation completes, we need to use memory_domain push functionality to 3026 * update data in original memory domain IO buffer 3027 * This IO request will go through a regular IO flow, so clear memory domains pointers in 3028 * the copied ext_opts */ 3029 bdev_io->internal.ext_opts_copy.memory_domain = NULL; 3030 bdev_io->internal.ext_opts_copy.memory_domain_ctx = NULL; 3031 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3032 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3033 } 3034 3035 static inline void 3036 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io, 3037 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 3038 { 3039 if (opts) { 3040 bool use_pull_push = opts->memory_domain && !desc->memory_domains_supported; 3041 assert(opts->size <= sizeof(*opts)); 3042 /* 3043 * copy if size is smaller than opts struct to avoid having to check size 3044 * on every access to bdev_io->u.bdev.ext_opts 3045 */ 3046 if (copy_opts || use_pull_push || opts->size < sizeof(*opts)) { 3047 _bdev_io_copy_ext_opts(bdev_io, opts); 3048 if (use_pull_push) { 3049 _bdev_io_ext_use_bounce_buffer(bdev_io); 3050 return; 3051 } 3052 } 3053 } 3054 bdev_io_submit(bdev_io); 3055 } 3056 3057 static void 3058 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3059 { 3060 struct spdk_bdev *bdev = bdev_io->bdev; 3061 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3062 struct spdk_io_channel *ch = bdev_ch->channel; 3063 3064 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3065 3066 bdev_io->internal.in_submit_request = true; 3067 bdev->fn_table->submit_request(ch, bdev_io); 3068 bdev_io->internal.in_submit_request = false; 3069 } 3070 3071 void 3072 bdev_io_init(struct spdk_bdev_io *bdev_io, 3073 struct spdk_bdev *bdev, void *cb_arg, 3074 spdk_bdev_io_completion_cb cb) 3075 { 3076 bdev_io->bdev = bdev; 3077 bdev_io->internal.caller_ctx = cb_arg; 3078 bdev_io->internal.cb = cb; 3079 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3080 bdev_io->internal.in_submit_request = false; 3081 bdev_io->internal.buf = NULL; 3082 bdev_io->internal.io_submit_ch = NULL; 3083 bdev_io->internal.orig_iovs = NULL; 3084 bdev_io->internal.orig_iovcnt = 0; 3085 bdev_io->internal.orig_md_iov.iov_base = NULL; 3086 bdev_io->internal.error.nvme.cdw0 = 0; 3087 bdev_io->num_retries = 0; 3088 bdev_io->internal.get_buf_cb = NULL; 3089 bdev_io->internal.get_aux_buf_cb = NULL; 3090 bdev_io->internal.ext_opts = NULL; 3091 bdev_io->internal.data_transfer_cpl = NULL; 3092 } 3093 3094 static bool 3095 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3096 { 3097 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3098 } 3099 3100 bool 3101 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3102 { 3103 bool supported; 3104 3105 supported = bdev_io_type_supported(bdev, io_type); 3106 3107 if (!supported) { 3108 switch (io_type) { 3109 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3110 /* The bdev layer will emulate write zeroes as long as write is supported. 
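 * For example, a module that only implements READ and WRITE still reports WRITE_ZEROES as
 * supported here; the generic layer presumably satisfies such requests with ordinary writes of
 * an all-zero buffer rather than passing WRITE_ZEROES down to the module.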
*/ 3111 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3112 break; 3113 default: 3114 break; 3115 } 3116 } 3117 3118 return supported; 3119 } 3120 3121 uint64_t 3122 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3123 { 3124 return bdev_io->internal.submit_tsc; 3125 } 3126 3127 int 3128 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3129 { 3130 if (bdev->fn_table->dump_info_json) { 3131 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3132 } 3133 3134 return 0; 3135 } 3136 3137 static void 3138 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3139 { 3140 uint32_t max_per_timeslice = 0; 3141 int i; 3142 3143 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3144 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3145 qos->rate_limits[i].max_per_timeslice = 0; 3146 continue; 3147 } 3148 3149 max_per_timeslice = qos->rate_limits[i].limit * 3150 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3151 3152 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3153 qos->rate_limits[i].min_per_timeslice); 3154 3155 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 3156 } 3157 3158 bdev_qos_set_ops(qos); 3159 } 3160 3161 static int 3162 bdev_channel_poll_qos(void *arg) 3163 { 3164 struct spdk_bdev_qos *qos = arg; 3165 uint64_t now = spdk_get_ticks(); 3166 int i; 3167 3168 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3169 /* We received our callback earlier than expected - return 3170 * immediately and wait to do accounting until at least one 3171 * timeslice has actually expired. This should never happen 3172 * with a well-behaved timer implementation. 3173 */ 3174 return SPDK_POLLER_IDLE; 3175 } 3176 3177 /* Reset for next round of rate limiting */ 3178 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3179 /* We may have allowed the IOs or bytes to slightly overrun in the last 3180 * timeslice. remaining_this_timeslice is signed, so if it's negative 3181 * here, we'll account for the overrun so that the next timeslice will 3182 * be appropriately reduced. 
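 * Rough example (made-up limit): with a byte limit of 100 MiB/s (104,857,600 bytes/s) and the
 * 1 ms timeslice used here, max_per_timeslice is about 104,857 bytes. If a 1 MiB write slipped
 * through at the end of a slice, remaining_this_timeslice sits near -1 MiB at this point, and
 * since the loop below adds only max_per_timeslice per elapsed slice, several slices pass
 * before I/O is allowed to flow again.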
3183 */ 3184 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 3185 qos->rate_limits[i].remaining_this_timeslice = 0; 3186 } 3187 } 3188 3189 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3190 qos->last_timeslice += qos->timeslice_size; 3191 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3192 qos->rate_limits[i].remaining_this_timeslice += 3193 qos->rate_limits[i].max_per_timeslice; 3194 } 3195 } 3196 3197 return bdev_qos_io_submit(qos->ch, qos); 3198 } 3199 3200 static void 3201 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3202 { 3203 struct spdk_bdev_shared_resource *shared_resource; 3204 struct lba_range *range; 3205 3206 bdev_free_io_stat(ch->stat); 3207 #ifdef SPDK_CONFIG_VTUNE 3208 bdev_free_io_stat(ch->prev_stat); 3209 #endif 3210 3211 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3212 range = TAILQ_FIRST(&ch->locked_ranges); 3213 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3214 free(range); 3215 } 3216 3217 spdk_put_io_channel(ch->channel); 3218 3219 shared_resource = ch->shared_resource; 3220 3221 assert(TAILQ_EMPTY(&ch->io_locked)); 3222 assert(TAILQ_EMPTY(&ch->io_submitted)); 3223 assert(ch->io_outstanding == 0); 3224 assert(shared_resource->ref > 0); 3225 shared_resource->ref--; 3226 if (shared_resource->ref == 0) { 3227 assert(shared_resource->io_outstanding == 0); 3228 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3229 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3230 free(shared_resource); 3231 } 3232 } 3233 3234 static void 3235 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3236 { 3237 struct spdk_bdev_qos *qos = bdev->internal.qos; 3238 int i; 3239 3240 assert(spdk_spin_held(&bdev->internal.spinlock)); 3241 3242 /* Rate limiting on this bdev enabled */ 3243 if (qos) { 3244 if (qos->ch == NULL) { 3245 struct spdk_io_channel *io_ch; 3246 3247 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3248 bdev->name, spdk_get_thread()); 3249 3250 /* No qos channel has been selected, so set one up */ 3251 3252 /* Take another reference to ch */ 3253 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3254 assert(io_ch != NULL); 3255 qos->ch = ch; 3256 3257 qos->thread = spdk_io_channel_get_thread(io_ch); 3258 3259 TAILQ_INIT(&qos->queued); 3260 3261 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3262 if (bdev_qos_is_iops_rate_limit(i) == true) { 3263 qos->rate_limits[i].min_per_timeslice = 3264 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3265 } else { 3266 qos->rate_limits[i].min_per_timeslice = 3267 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3268 } 3269 3270 if (qos->rate_limits[i].limit == 0) { 3271 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3272 } 3273 } 3274 bdev_qos_update_max_quota_per_timeslice(qos); 3275 qos->timeslice_size = 3276 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3277 qos->last_timeslice = spdk_get_ticks(); 3278 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3279 qos, 3280 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3281 } 3282 3283 ch->flags |= BDEV_CH_QOS_ENABLED; 3284 } 3285 } 3286 3287 struct poll_timeout_ctx { 3288 struct spdk_bdev_desc *desc; 3289 uint64_t timeout_in_sec; 3290 spdk_bdev_io_timeout_cb cb_fn; 3291 void *cb_arg; 3292 }; 3293 3294 static void 3295 bdev_desc_free(struct spdk_bdev_desc *desc) 3296 { 3297 spdk_spin_destroy(&desc->spinlock); 3298 free(desc->media_events_buffer); 3299 free(desc); 3300 } 3301 3302 static void 3303 
bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 3304 { 3305 struct poll_timeout_ctx *ctx = _ctx; 3306 struct spdk_bdev_desc *desc = ctx->desc; 3307 3308 free(ctx); 3309 3310 spdk_spin_lock(&desc->spinlock); 3311 desc->refs--; 3312 if (desc->closed == true && desc->refs == 0) { 3313 spdk_spin_unlock(&desc->spinlock); 3314 bdev_desc_free(desc); 3315 return; 3316 } 3317 spdk_spin_unlock(&desc->spinlock); 3318 } 3319 3320 static void 3321 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3322 struct spdk_io_channel *io_ch, void *_ctx) 3323 { 3324 struct poll_timeout_ctx *ctx = _ctx; 3325 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3326 struct spdk_bdev_desc *desc = ctx->desc; 3327 struct spdk_bdev_io *bdev_io; 3328 uint64_t now; 3329 3330 spdk_spin_lock(&desc->spinlock); 3331 if (desc->closed == true) { 3332 spdk_spin_unlock(&desc->spinlock); 3333 spdk_bdev_for_each_channel_continue(i, -1); 3334 return; 3335 } 3336 spdk_spin_unlock(&desc->spinlock); 3337 3338 now = spdk_get_ticks(); 3339 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3340 /* Exclude any I/O that are generated via splitting. */ 3341 if (bdev_io->internal.cb == bdev_io_split_done) { 3342 continue; 3343 } 3344 3345 /* Once we find an I/O that has not timed out, we can immediately 3346 * exit the loop. 3347 */ 3348 if (now < (bdev_io->internal.submit_tsc + 3349 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3350 goto end; 3351 } 3352 3353 if (bdev_io->internal.desc == desc) { 3354 ctx->cb_fn(ctx->cb_arg, bdev_io); 3355 } 3356 } 3357 3358 end: 3359 spdk_bdev_for_each_channel_continue(i, 0); 3360 } 3361 3362 static int 3363 bdev_poll_timeout_io(void *arg) 3364 { 3365 struct spdk_bdev_desc *desc = arg; 3366 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3367 struct poll_timeout_ctx *ctx; 3368 3369 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3370 if (!ctx) { 3371 SPDK_ERRLOG("failed to allocate memory\n"); 3372 return SPDK_POLLER_BUSY; 3373 } 3374 ctx->desc = desc; 3375 ctx->cb_arg = desc->cb_arg; 3376 ctx->cb_fn = desc->cb_fn; 3377 ctx->timeout_in_sec = desc->timeout_in_sec; 3378 3379 /* Take a ref on the descriptor in case it gets closed while we are checking 3380 * all of the channels. 
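 * The matching deref happens in bdev_channel_poll_timeout_io_done(); if the descriptor was
 * closed in the meantime, that callback is also the one that finally frees it.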
3381 */ 3382 spdk_spin_lock(&desc->spinlock); 3383 desc->refs++; 3384 spdk_spin_unlock(&desc->spinlock); 3385 3386 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 3387 bdev_channel_poll_timeout_io_done); 3388 3389 return SPDK_POLLER_BUSY; 3390 } 3391 3392 int 3393 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3394 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3395 { 3396 assert(desc->thread == spdk_get_thread()); 3397 3398 spdk_poller_unregister(&desc->io_timeout_poller); 3399 3400 if (timeout_in_sec) { 3401 assert(cb_fn != NULL); 3402 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3403 desc, 3404 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3405 1000); 3406 if (desc->io_timeout_poller == NULL) { 3407 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3408 return -1; 3409 } 3410 } 3411 3412 desc->cb_fn = cb_fn; 3413 desc->cb_arg = cb_arg; 3414 desc->timeout_in_sec = timeout_in_sec; 3415 3416 return 0; 3417 } 3418 3419 static int 3420 bdev_channel_create(void *io_device, void *ctx_buf) 3421 { 3422 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3423 struct spdk_bdev_channel *ch = ctx_buf; 3424 struct spdk_io_channel *mgmt_io_ch; 3425 struct spdk_bdev_mgmt_channel *mgmt_ch; 3426 struct spdk_bdev_shared_resource *shared_resource; 3427 struct lba_range *range; 3428 3429 ch->bdev = bdev; 3430 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3431 if (!ch->channel) { 3432 return -1; 3433 } 3434 3435 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3436 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3437 3438 assert(ch->histogram == NULL); 3439 if (bdev->internal.histogram_enabled) { 3440 ch->histogram = spdk_histogram_data_alloc(); 3441 if (ch->histogram == NULL) { 3442 SPDK_ERRLOG("Could not allocate histogram\n"); 3443 } 3444 } 3445 3446 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3447 if (!mgmt_io_ch) { 3448 spdk_put_io_channel(ch->channel); 3449 return -1; 3450 } 3451 3452 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 3453 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3454 if (shared_resource->shared_ch == ch->channel) { 3455 spdk_put_io_channel(mgmt_io_ch); 3456 shared_resource->ref++; 3457 break; 3458 } 3459 } 3460 3461 if (shared_resource == NULL) { 3462 shared_resource = calloc(1, sizeof(*shared_resource)); 3463 if (shared_resource == NULL) { 3464 spdk_put_io_channel(ch->channel); 3465 spdk_put_io_channel(mgmt_io_ch); 3466 return -1; 3467 } 3468 3469 shared_resource->mgmt_ch = mgmt_ch; 3470 shared_resource->io_outstanding = 0; 3471 TAILQ_INIT(&shared_resource->nomem_io); 3472 shared_resource->nomem_threshold = 0; 3473 shared_resource->shared_ch = ch->channel; 3474 shared_resource->ref = 1; 3475 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3476 } 3477 3478 ch->io_outstanding = 0; 3479 TAILQ_INIT(&ch->queued_resets); 3480 TAILQ_INIT(&ch->locked_ranges); 3481 ch->flags = 0; 3482 ch->shared_resource = shared_resource; 3483 3484 TAILQ_INIT(&ch->io_submitted); 3485 TAILQ_INIT(&ch->io_locked); 3486 3487 ch->stat = bdev_alloc_io_stat(false); 3488 if (ch->stat == NULL) { 3489 bdev_channel_destroy_resource(ch); 3490 return -1; 3491 } 3492 3493 ch->stat->ticks_rate = spdk_get_ticks_hz(); 3494 3495 #ifdef SPDK_CONFIG_VTUNE 3496 { 3497 char *name; 3498 __itt_init_ittlib(NULL, 0); 3499 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3500 if (!name) { 3501 bdev_channel_destroy_resource(ch); 3502 
return -1; 3503 } 3504 ch->handle = __itt_string_handle_create(name); 3505 free(name); 3506 ch->start_tsc = spdk_get_ticks(); 3507 ch->interval_tsc = spdk_get_ticks_hz() / 100; 3508 ch->prev_stat = bdev_alloc_io_stat(false); 3509 if (ch->prev_stat == NULL) { 3510 bdev_channel_destroy_resource(ch); 3511 return -1; 3512 } 3513 } 3514 #endif 3515 3516 spdk_spin_lock(&bdev->internal.spinlock); 3517 bdev_enable_qos(bdev, ch); 3518 3519 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 3520 struct lba_range *new_range; 3521 3522 new_range = calloc(1, sizeof(*new_range)); 3523 if (new_range == NULL) { 3524 spdk_spin_unlock(&bdev->internal.spinlock); 3525 bdev_channel_destroy_resource(ch); 3526 return -1; 3527 } 3528 new_range->length = range->length; 3529 new_range->offset = range->offset; 3530 new_range->locked_ctx = range->locked_ctx; 3531 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 3532 } 3533 3534 spdk_spin_unlock(&bdev->internal.spinlock); 3535 3536 return 0; 3537 } 3538 3539 static int 3540 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 3541 void *cb_ctx) 3542 { 3543 struct spdk_bdev_channel *bdev_ch = cb_ctx; 3544 struct spdk_bdev_io *bdev_io; 3545 uint64_t buf_len; 3546 3547 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 3548 if (bdev_io->internal.ch == bdev_ch) { 3549 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 3550 spdk_iobuf_entry_abort(ch, entry, buf_len); 3551 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3552 } 3553 3554 return 0; 3555 } 3556 3557 /* 3558 * Abort I/O that are waiting on a data buffer. 3559 */ 3560 static void 3561 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 3562 { 3563 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 3564 bdev_abort_all_buf_io_cb, ch); 3565 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 3566 bdev_abort_all_buf_io_cb, ch); 3567 } 3568 3569 /* 3570 * Abort I/O that are queued waiting for submission. These types of I/O are 3571 * linked using the spdk_bdev_io link TAILQ_ENTRY. 3572 */ 3573 static void 3574 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 3575 { 3576 struct spdk_bdev_io *bdev_io, *tmp; 3577 3578 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 3579 if (bdev_io->internal.ch == ch) { 3580 TAILQ_REMOVE(queue, bdev_io, internal.link); 3581 /* 3582 * spdk_bdev_io_complete() assumes that the completed I/O had 3583 * been submitted to the bdev module. Since in this case it 3584 * hadn't, bump io_outstanding to account for the decrement 3585 * that spdk_bdev_io_complete() will do. 
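 * Resets are skipped below presumably because the completion path does not decrement these
 * counters for SPDK_BDEV_IO_TYPE_RESET, so bumping them here would leave io_outstanding
 * permanently elevated.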
3586 */ 3587 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 3588 ch->io_outstanding++; 3589 ch->shared_resource->io_outstanding++; 3590 } 3591 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3592 } 3593 } 3594 } 3595 3596 static bool 3597 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3598 { 3599 struct spdk_bdev_io *bdev_io; 3600 3601 TAILQ_FOREACH(bdev_io, queue, internal.link) { 3602 if (bdev_io == bio_to_abort) { 3603 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 3604 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3605 return true; 3606 } 3607 } 3608 3609 return false; 3610 } 3611 3612 static int 3613 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 3614 { 3615 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 3616 uint64_t buf_len; 3617 3618 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 3619 if (bdev_io == bio_to_abort) { 3620 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 3621 spdk_iobuf_entry_abort(ch, entry, buf_len); 3622 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3623 return 1; 3624 } 3625 3626 return 0; 3627 } 3628 3629 static bool 3630 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 3631 { 3632 int rc; 3633 3634 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 3635 bdev_abort_buf_io_cb, bio_to_abort); 3636 if (rc == 1) { 3637 return true; 3638 } 3639 3640 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 3641 bdev_abort_buf_io_cb, bio_to_abort); 3642 return rc == 1; 3643 } 3644 3645 static void 3646 bdev_qos_channel_destroy(void *cb_arg) 3647 { 3648 struct spdk_bdev_qos *qos = cb_arg; 3649 3650 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 3651 spdk_poller_unregister(&qos->poller); 3652 3653 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 3654 3655 free(qos); 3656 } 3657 3658 static int 3659 bdev_qos_destroy(struct spdk_bdev *bdev) 3660 { 3661 int i; 3662 3663 /* 3664 * Cleanly shutting down the QoS poller is tricky, because 3665 * during the asynchronous operation the user could open 3666 * a new descriptor and create a new channel, spawning 3667 * a new QoS poller. 3668 * 3669 * The strategy is to create a new QoS structure here and swap it 3670 * in. The shutdown path then continues to refer to the old one 3671 * until it completes and then releases it. 3672 */ 3673 struct spdk_bdev_qos *new_qos, *old_qos; 3674 3675 old_qos = bdev->internal.qos; 3676 3677 new_qos = calloc(1, sizeof(*new_qos)); 3678 if (!new_qos) { 3679 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 3680 return -ENOMEM; 3681 } 3682 3683 /* Copy the old QoS data into the newly allocated structure */ 3684 memcpy(new_qos, old_qos, sizeof(*new_qos)); 3685 3686 /* Zero out the key parts of the QoS structure */ 3687 new_qos->ch = NULL; 3688 new_qos->thread = NULL; 3689 new_qos->poller = NULL; 3690 TAILQ_INIT(&new_qos->queued); 3691 /* 3692 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 3693 * It will be used later for the new QoS structure. 
3694 */ 3695 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3696 new_qos->rate_limits[i].remaining_this_timeslice = 0; 3697 new_qos->rate_limits[i].min_per_timeslice = 0; 3698 new_qos->rate_limits[i].max_per_timeslice = 0; 3699 } 3700 3701 bdev->internal.qos = new_qos; 3702 3703 if (old_qos->thread == NULL) { 3704 free(old_qos); 3705 } else { 3706 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 3707 } 3708 3709 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 3710 * been destroyed yet. The destruction path will end up waiting for the final 3711 * channel to be put before it releases resources. */ 3712 3713 return 0; 3714 } 3715 3716 static void 3717 bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 3718 { 3719 total->bytes_read += add->bytes_read; 3720 total->num_read_ops += add->num_read_ops; 3721 total->bytes_written += add->bytes_written; 3722 total->num_write_ops += add->num_write_ops; 3723 total->bytes_unmapped += add->bytes_unmapped; 3724 total->num_unmap_ops += add->num_unmap_ops; 3725 total->bytes_copied += add->bytes_copied; 3726 total->num_copy_ops += add->num_copy_ops; 3727 total->read_latency_ticks += add->read_latency_ticks; 3728 total->write_latency_ticks += add->write_latency_ticks; 3729 total->unmap_latency_ticks += add->unmap_latency_ticks; 3730 total->copy_latency_ticks += add->copy_latency_ticks; 3731 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 3732 total->max_read_latency_ticks = add->max_read_latency_ticks; 3733 } 3734 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 3735 total->min_read_latency_ticks = add->min_read_latency_ticks; 3736 } 3737 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 3738 total->max_write_latency_ticks = add->max_write_latency_ticks; 3739 } 3740 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 3741 total->min_write_latency_ticks = add->min_write_latency_ticks; 3742 } 3743 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 3744 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 3745 } 3746 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 3747 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 3748 } 3749 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 3750 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 3751 } 3752 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 3753 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 3754 } 3755 } 3756 3757 static void 3758 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 3759 { 3760 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 3761 3762 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 3763 memcpy(to_stat->io_error, from_stat->io_error, 3764 sizeof(struct spdk_bdev_io_error_stat)); 3765 } 3766 } 3767 3768 static void 3769 bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum bdev_reset_stat_mode mode) 3770 { 3771 stat->max_read_latency_ticks = 0; 3772 stat->min_read_latency_ticks = UINT64_MAX; 3773 stat->max_write_latency_ticks = 0; 3774 stat->min_write_latency_ticks = UINT64_MAX; 3775 stat->max_unmap_latency_ticks = 0; 3776 stat->min_unmap_latency_ticks = UINT64_MAX; 3777 stat->max_copy_latency_ticks = 0; 3778 stat->min_copy_latency_ticks = UINT64_MAX; 3779 3780 if (mode != BDEV_RESET_STAT_ALL) { 3781 return; 3782 } 3783 
3784 stat->bytes_read = 0; 3785 stat->num_read_ops = 0; 3786 stat->bytes_written = 0; 3787 stat->num_write_ops = 0; 3788 stat->bytes_unmapped = 0; 3789 stat->num_unmap_ops = 0; 3790 stat->bytes_copied = 0; 3791 stat->num_copy_ops = 0; 3792 stat->read_latency_ticks = 0; 3793 stat->write_latency_ticks = 0; 3794 stat->unmap_latency_ticks = 0; 3795 stat->copy_latency_ticks = 0; 3796 3797 if (stat->io_error != NULL) { 3798 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 3799 } 3800 } 3801 3802 struct spdk_bdev_io_stat * 3803 bdev_alloc_io_stat(bool io_error_stat) 3804 { 3805 struct spdk_bdev_io_stat *stat; 3806 3807 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 3808 if (stat == NULL) { 3809 return NULL; 3810 } 3811 3812 if (io_error_stat) { 3813 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 3814 if (stat->io_error == NULL) { 3815 free(stat); 3816 return NULL; 3817 } 3818 } else { 3819 stat->io_error = NULL; 3820 } 3821 3822 bdev_reset_io_stat(stat, BDEV_RESET_STAT_ALL); 3823 3824 return stat; 3825 } 3826 3827 void 3828 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 3829 { 3830 if (stat != NULL) { 3831 free(stat->io_error); 3832 free(stat); 3833 } 3834 } 3835 3836 void 3837 bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 3838 { 3839 int i; 3840 3841 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 3842 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 3843 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 3844 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 3845 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 3846 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 3847 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 3848 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 3849 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 3850 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 3851 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 3852 stat->min_read_latency_ticks != UINT64_MAX ? 3853 stat->min_read_latency_ticks : 0); 3854 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 3855 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 3856 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 3857 stat->min_write_latency_ticks != UINT64_MAX ? 3858 stat->min_write_latency_ticks : 0); 3859 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 3860 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 3861 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 3862 stat->min_unmap_latency_ticks != UINT64_MAX ? 3863 stat->min_unmap_latency_ticks : 0); 3864 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 3865 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 3866 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 3867 stat->min_copy_latency_ticks != UINT64_MAX ? 
3868 stat->min_copy_latency_ticks : 0); 3869 3870 if (stat->io_error != NULL) { 3871 spdk_json_write_named_object_begin(w, "io_error"); 3872 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 3873 if (stat->io_error->error_status[i] != 0) { 3874 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 3875 stat->io_error->error_status[i]); 3876 } 3877 } 3878 spdk_json_write_object_end(w); 3879 } 3880 } 3881 3882 static void 3883 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 3884 { 3885 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 3886 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 3887 3888 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 3889 bdev_abort_all_buf_io(mgmt_ch, ch); 3890 bdev_abort_all_buf_io(mgmt_ch, ch); 3891 } 3892 3893 static void 3894 bdev_channel_destroy(void *io_device, void *ctx_buf) 3895 { 3896 struct spdk_bdev_channel *ch = ctx_buf; 3897 3898 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 3899 spdk_get_thread()); 3900 3901 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 3902 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3903 3904 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 3905 spdk_spin_lock(&ch->bdev->internal.spinlock); 3906 bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 3907 spdk_spin_unlock(&ch->bdev->internal.spinlock); 3908 3909 bdev_abort_all_queued_io(&ch->queued_resets, ch); 3910 3911 bdev_channel_abort_queued_ios(ch); 3912 3913 if (ch->histogram) { 3914 spdk_histogram_data_free(ch->histogram); 3915 } 3916 3917 bdev_channel_destroy_resource(ch); 3918 } 3919 3920 /* 3921 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 3922 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
3923 */ 3924 static int 3925 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 3926 { 3927 struct spdk_bdev_name *tmp; 3928 3929 bdev_name->name = strdup(name); 3930 if (bdev_name->name == NULL) { 3931 SPDK_ERRLOG("Unable to allocate bdev name\n"); 3932 return -ENOMEM; 3933 } 3934 3935 bdev_name->bdev = bdev; 3936 3937 spdk_spin_lock(&g_bdev_mgr.spinlock); 3938 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3939 spdk_spin_unlock(&g_bdev_mgr.spinlock); 3940 3941 if (tmp != NULL) { 3942 SPDK_ERRLOG("Bdev name %s already exists\n", name); 3943 free(bdev_name->name); 3944 return -EEXIST; 3945 } 3946 3947 return 0; 3948 } 3949 3950 static void 3951 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 3952 { 3953 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3954 free(bdev_name->name); 3955 } 3956 3957 static void 3958 bdev_name_del(struct spdk_bdev_name *bdev_name) 3959 { 3960 spdk_spin_lock(&g_bdev_mgr.spinlock); 3961 bdev_name_del_unsafe(bdev_name); 3962 spdk_spin_unlock(&g_bdev_mgr.spinlock); 3963 } 3964 3965 int 3966 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 3967 { 3968 struct spdk_bdev_alias *tmp; 3969 int ret; 3970 3971 if (alias == NULL) { 3972 SPDK_ERRLOG("Empty alias passed\n"); 3973 return -EINVAL; 3974 } 3975 3976 tmp = calloc(1, sizeof(*tmp)); 3977 if (tmp == NULL) { 3978 SPDK_ERRLOG("Unable to allocate alias\n"); 3979 return -ENOMEM; 3980 } 3981 3982 ret = bdev_name_add(&tmp->alias, bdev, alias); 3983 if (ret != 0) { 3984 free(tmp); 3985 return ret; 3986 } 3987 3988 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 3989 3990 return 0; 3991 } 3992 3993 static int 3994 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 3995 void (*alias_del_fn)(struct spdk_bdev_name *n)) 3996 { 3997 struct spdk_bdev_alias *tmp; 3998 3999 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4000 if (strcmp(alias, tmp->alias.name) == 0) { 4001 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4002 alias_del_fn(&tmp->alias); 4003 free(tmp); 4004 return 0; 4005 } 4006 } 4007 4008 return -ENOENT; 4009 } 4010 4011 int 4012 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4013 { 4014 int rc; 4015 4016 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4017 if (rc == -ENOENT) { 4018 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4019 } 4020 4021 return rc; 4022 } 4023 4024 void 4025 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4026 { 4027 struct spdk_bdev_alias *p, *tmp; 4028 4029 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4030 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4031 bdev_name_del(&p->alias); 4032 free(p); 4033 } 4034 } 4035 4036 struct spdk_io_channel * 4037 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4038 { 4039 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4040 } 4041 4042 void * 4043 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4044 { 4045 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4046 void *ctx = NULL; 4047 4048 if (bdev->fn_table->get_module_ctx) { 4049 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4050 } 4051 4052 return ctx; 4053 } 4054 4055 const char * 4056 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4057 { 4058 return bdev->module->name; 4059 } 4060 4061 const char * 4062 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4063 { 4064 return bdev->name; 4065 } 4066 4067 const char * 4068 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4069 { 4070 return bdev->product_name; 4071 } 4072 4073 
const struct spdk_bdev_aliases_list * 4074 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4075 { 4076 return &bdev->aliases; 4077 } 4078 4079 uint32_t 4080 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4081 { 4082 return bdev->blocklen; 4083 } 4084 4085 uint32_t 4086 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4087 { 4088 return bdev->write_unit_size; 4089 } 4090 4091 uint64_t 4092 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4093 { 4094 return bdev->blockcnt; 4095 } 4096 4097 const char * 4098 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4099 { 4100 return qos_rpc_type[type]; 4101 } 4102 4103 void 4104 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4105 { 4106 int i; 4107 4108 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4109 4110 spdk_spin_lock(&bdev->internal.spinlock); 4111 if (bdev->internal.qos) { 4112 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4113 if (bdev->internal.qos->rate_limits[i].limit != 4114 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4115 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4116 if (bdev_qos_is_iops_rate_limit(i) == false) { 4117 /* Change from Byte to Megabyte which is user visible. */ 4118 limits[i] = limits[i] / 1024 / 1024; 4119 } 4120 } 4121 } 4122 } 4123 spdk_spin_unlock(&bdev->internal.spinlock); 4124 } 4125 4126 size_t 4127 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4128 { 4129 return 1 << bdev->required_alignment; 4130 } 4131 4132 uint32_t 4133 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4134 { 4135 return bdev->optimal_io_boundary; 4136 } 4137 4138 bool 4139 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4140 { 4141 return bdev->write_cache; 4142 } 4143 4144 const struct spdk_uuid * 4145 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4146 { 4147 return &bdev->uuid; 4148 } 4149 4150 uint16_t 4151 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4152 { 4153 return bdev->acwu; 4154 } 4155 4156 uint32_t 4157 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4158 { 4159 return bdev->md_len; 4160 } 4161 4162 bool 4163 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4164 { 4165 return (bdev->md_len != 0) && bdev->md_interleave; 4166 } 4167 4168 bool 4169 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4170 { 4171 return (bdev->md_len != 0) && !bdev->md_interleave; 4172 } 4173 4174 bool 4175 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4176 { 4177 return bdev->zoned; 4178 } 4179 4180 uint32_t 4181 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4182 { 4183 if (spdk_bdev_is_md_interleaved(bdev)) { 4184 return bdev->blocklen - bdev->md_len; 4185 } else { 4186 return bdev->blocklen; 4187 } 4188 } 4189 4190 uint32_t 4191 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4192 { 4193 return bdev->phys_blocklen; 4194 } 4195 4196 static uint32_t 4197 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4198 { 4199 if (!spdk_bdev_is_md_interleaved(bdev)) { 4200 return bdev->blocklen + bdev->md_len; 4201 } else { 4202 return bdev->blocklen; 4203 } 4204 } 4205 4206 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4207 typedef enum spdk_dif_type spdk_dif_type_t; 4208 4209 spdk_dif_type_t 4210 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4211 { 4212 if (bdev->md_len != 0) { 4213 return bdev->dif_type; 4214 } else { 4215 return SPDK_DIF_DISABLE; 4216 } 4217 } 4218 4219 bool 4220 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4221 { 4222 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4223 return bdev->dif_is_head_of_md; 4224 } else { 4225 return false; 4226 } 4227 } 4228 4229 bool 4230 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4231 enum spdk_dif_check_type check_type) 4232 { 4233 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4234 return false; 4235 } 4236 4237 switch (check_type) { 4238 case SPDK_DIF_CHECK_TYPE_REFTAG: 4239 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4240 case SPDK_DIF_CHECK_TYPE_APPTAG: 4241 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4242 case SPDK_DIF_CHECK_TYPE_GUARD: 4243 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4244 default: 4245 return false; 4246 } 4247 } 4248 4249 uint32_t 4250 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4251 { 4252 return bdev->max_copy; 4253 } 4254 4255 uint64_t 4256 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4257 { 4258 return bdev->internal.measured_queue_depth; 4259 } 4260 4261 uint64_t 4262 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 4263 { 4264 return bdev->internal.period; 4265 } 4266 4267 uint64_t 4268 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 4269 { 4270 return bdev->internal.weighted_io_time; 4271 } 4272 4273 uint64_t 4274 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 4275 { 4276 return bdev->internal.io_time; 4277 } 4278 4279 static void bdev_update_qd_sampling_period(void *ctx); 4280 4281 static void 4282 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 4283 { 4284 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 4285 4286 if (bdev->internal.measured_queue_depth) { 4287 bdev->internal.io_time += bdev->internal.period; 4288 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 4289 } 4290 4291 bdev->internal.qd_poll_in_progress = false; 4292 4293 bdev_update_qd_sampling_period(bdev); 4294 } 4295 4296 static void 4297 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4298 struct spdk_io_channel *io_ch, void *_ctx) 4299 { 4300 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 4301 4302 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4303 spdk_bdev_for_each_channel_continue(i, 0); 4304 } 4305 4306 static int 4307 bdev_calculate_measured_queue_depth(void *ctx) 4308 { 4309 struct spdk_bdev *bdev = ctx; 4310 4311 bdev->internal.qd_poll_in_progress = true; 4312 bdev->internal.temporary_queue_depth = 0; 4313 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 4314 return SPDK_POLLER_BUSY; 4315 } 4316 4317 static void 4318 bdev_update_qd_sampling_period(void *ctx) 4319 { 4320 struct spdk_bdev *bdev = ctx; 4321 4322 if (bdev->internal.period == bdev->internal.new_period) { 4323 return; 4324 } 4325 4326 if (bdev->internal.qd_poll_in_progress) { 4327 return; 4328 } 4329 4330 bdev->internal.period = bdev->internal.new_period; 4331 4332 spdk_poller_unregister(&bdev->internal.qd_poller); 4333 if (bdev->internal.period != 0) { 4334 bdev->internal.qd_poller = 
SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4335 bdev, bdev->internal.period); 4336 } else { 4337 spdk_bdev_close(bdev->internal.qd_desc); 4338 bdev->internal.qd_desc = NULL; 4339 } 4340 } 4341 4342 static void 4343 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4344 { 4345 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4346 } 4347 4348 void 4349 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4350 { 4351 int rc; 4352 4353 if (bdev->internal.new_period == period) { 4354 return; 4355 } 4356 4357 bdev->internal.new_period = period; 4358 4359 if (bdev->internal.qd_desc != NULL) { 4360 assert(bdev->internal.period != 0); 4361 4362 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4363 bdev_update_qd_sampling_period, bdev); 4364 return; 4365 } 4366 4367 assert(bdev->internal.period == 0); 4368 4369 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4370 NULL, &bdev->internal.qd_desc); 4371 if (rc != 0) { 4372 return; 4373 } 4374 4375 bdev->internal.period = period; 4376 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4377 bdev, period); 4378 } 4379 4380 struct bdev_get_current_qd_ctx { 4381 uint64_t current_qd; 4382 spdk_bdev_get_current_qd_cb cb_fn; 4383 void *cb_arg; 4384 }; 4385 4386 static void 4387 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 4388 { 4389 struct bdev_get_current_qd_ctx *ctx = _ctx; 4390 4391 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 4392 4393 free(ctx); 4394 } 4395 4396 static void 4397 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4398 struct spdk_io_channel *io_ch, void *_ctx) 4399 { 4400 struct bdev_get_current_qd_ctx *ctx = _ctx; 4401 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4402 4403 ctx->current_qd += bdev_ch->io_outstanding; 4404 4405 spdk_bdev_for_each_channel_continue(i, 0); 4406 } 4407 4408 void 4409 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 4410 void *cb_arg) 4411 { 4412 struct bdev_get_current_qd_ctx *ctx; 4413 4414 assert(cb_fn != NULL); 4415 4416 ctx = calloc(1, sizeof(*ctx)); 4417 if (ctx == NULL) { 4418 cb_fn(bdev, 0, cb_arg, -ENOMEM); 4419 return; 4420 } 4421 4422 ctx->cb_fn = cb_fn; 4423 ctx->cb_arg = cb_arg; 4424 4425 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 4426 } 4427 4428 static void 4429 _resize_notify(void *arg) 4430 { 4431 struct spdk_bdev_desc *desc = arg; 4432 4433 spdk_spin_lock(&desc->spinlock); 4434 desc->refs--; 4435 if (!desc->closed) { 4436 spdk_spin_unlock(&desc->spinlock); 4437 desc->callback.event_fn(SPDK_BDEV_EVENT_RESIZE, 4438 desc->bdev, 4439 desc->callback.ctx); 4440 return; 4441 } else if (0 == desc->refs) { 4442 /* This descriptor was closed after this resize_notify message was sent. 4443 * spdk_bdev_close() could not free the descriptor since this message was 4444 * in flight, so we free it now using bdev_desc_free(). 
4445 */ 4446 spdk_spin_unlock(&desc->spinlock); 4447 bdev_desc_free(desc); 4448 return; 4449 } 4450 spdk_spin_unlock(&desc->spinlock); 4451 } 4452 4453 int 4454 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 4455 { 4456 struct spdk_bdev_desc *desc; 4457 int ret; 4458 4459 if (size == bdev->blockcnt) { 4460 return 0; 4461 } 4462 4463 spdk_spin_lock(&bdev->internal.spinlock); 4464 4465 /* bdev has open descriptors */ 4466 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 4467 bdev->blockcnt > size) { 4468 ret = -EBUSY; 4469 } else { 4470 bdev->blockcnt = size; 4471 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 4472 spdk_spin_lock(&desc->spinlock); 4473 if (!desc->closed) { 4474 desc->refs++; 4475 spdk_thread_send_msg(desc->thread, _resize_notify, desc); 4476 } 4477 spdk_spin_unlock(&desc->spinlock); 4478 } 4479 ret = 0; 4480 } 4481 4482 spdk_spin_unlock(&bdev->internal.spinlock); 4483 4484 return ret; 4485 } 4486 4487 /* 4488 * Convert I/O offset and length from bytes to blocks. 4489 * 4490 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 4491 */ 4492 static uint64_t 4493 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 4494 uint64_t num_bytes, uint64_t *num_blocks) 4495 { 4496 uint32_t block_size = bdev->blocklen; 4497 uint8_t shift_cnt; 4498 4499 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 4500 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 4501 shift_cnt = spdk_u32log2(block_size); 4502 *offset_blocks = offset_bytes >> shift_cnt; 4503 *num_blocks = num_bytes >> shift_cnt; 4504 return (offset_bytes - (*offset_blocks << shift_cnt)) | 4505 (num_bytes - (*num_blocks << shift_cnt)); 4506 } else { 4507 *offset_blocks = offset_bytes / block_size; 4508 *num_blocks = num_bytes / block_size; 4509 return (offset_bytes % block_size) | (num_bytes % block_size); 4510 } 4511 } 4512 4513 static bool 4514 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 4515 { 4516 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 4517 * has been an overflow and hence the offset has been wrapped around */ 4518 if (offset_blocks + num_blocks < offset_blocks) { 4519 return false; 4520 } 4521 4522 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 4523 if (offset_blocks + num_blocks > bdev->blockcnt) { 4524 return false; 4525 } 4526 4527 return true; 4528 } 4529 4530 static void 4531 bdev_seek_complete_cb(void *ctx) 4532 { 4533 struct spdk_bdev_io *bdev_io = ctx; 4534 4535 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4536 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 4537 } 4538 4539 static int 4540 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4541 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 4542 spdk_bdev_io_completion_cb cb, void *cb_arg) 4543 { 4544 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4545 struct spdk_bdev_io *bdev_io; 4546 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4547 4548 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 4549 4550 /* Check if offset_blocks is valid looking at the validity of one block */ 4551 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 4552 return -EINVAL; 4553 } 4554 4555 bdev_io = bdev_channel_get_io(channel); 4556 if (!bdev_io) { 4557 return -ENOMEM; 4558 } 4559 4560 
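	/* Fill out the seek request below. If the module does not support
	 * SEEK_DATA/SEEK_HOLE, the result is synthesized (all data, no holes) and the
	 * completion is delivered via a message to the current thread so that the
	 * callback is never invoked from within the submission path. */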
bdev_io->internal.ch = channel; 4561 bdev_io->internal.desc = desc; 4562 bdev_io->type = io_type; 4563 bdev_io->u.bdev.offset_blocks = offset_blocks; 4564 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4565 4566 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 4567 /* In case bdev doesn't support seek to next data/hole offset, 4568 * it is assumed that only data and no holes are present */ 4569 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 4570 bdev_io->u.bdev.seek.offset = offset_blocks; 4571 } else { 4572 bdev_io->u.bdev.seek.offset = UINT64_MAX; 4573 } 4574 4575 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 4576 return 0; 4577 } 4578 4579 bdev_io_submit(bdev_io); 4580 return 0; 4581 } 4582 4583 int 4584 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4585 uint64_t offset_blocks, 4586 spdk_bdev_io_completion_cb cb, void *cb_arg) 4587 { 4588 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 4589 } 4590 4591 int 4592 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4593 uint64_t offset_blocks, 4594 spdk_bdev_io_completion_cb cb, void *cb_arg) 4595 { 4596 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 4597 } 4598 4599 uint64_t 4600 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io) 4601 { 4602 return bdev_io->u.bdev.seek.offset; 4603 } 4604 4605 static int 4606 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 4607 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4608 spdk_bdev_io_completion_cb cb, void *cb_arg) 4609 { 4610 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4611 struct spdk_bdev_io *bdev_io; 4612 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4613 4614 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4615 return -EINVAL; 4616 } 4617 4618 bdev_io = bdev_channel_get_io(channel); 4619 if (!bdev_io) { 4620 return -ENOMEM; 4621 } 4622 4623 bdev_io->internal.ch = channel; 4624 bdev_io->internal.desc = desc; 4625 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4626 bdev_io->u.bdev.iovs = &bdev_io->iov; 4627 bdev_io->u.bdev.iovs[0].iov_base = buf; 4628 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4629 bdev_io->u.bdev.iovcnt = 1; 4630 bdev_io->u.bdev.md_buf = md_buf; 4631 bdev_io->u.bdev.num_blocks = num_blocks; 4632 bdev_io->u.bdev.offset_blocks = offset_blocks; 4633 bdev_io->u.bdev.ext_opts = NULL; 4634 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4635 4636 bdev_io_submit(bdev_io); 4637 return 0; 4638 } 4639 4640 int 4641 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4642 void *buf, uint64_t offset, uint64_t nbytes, 4643 spdk_bdev_io_completion_cb cb, void *cb_arg) 4644 { 4645 uint64_t offset_blocks, num_blocks; 4646 4647 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4648 nbytes, &num_blocks) != 0) { 4649 return -EINVAL; 4650 } 4651 4652 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4653 } 4654 4655 int 4656 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4657 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4658 spdk_bdev_io_completion_cb cb, void *cb_arg) 4659 { 4660 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 4661 } 4662 4663 int 4664 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4665 void *buf, void 
*md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4666 spdk_bdev_io_completion_cb cb, void *cb_arg) 4667 { 4668 struct iovec iov = { 4669 .iov_base = buf, 4670 }; 4671 4672 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4673 return -EINVAL; 4674 } 4675 4676 if (md_buf && !_is_buf_allocated(&iov)) { 4677 return -EINVAL; 4678 } 4679 4680 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4681 cb, cb_arg); 4682 } 4683 4684 int 4685 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4686 struct iovec *iov, int iovcnt, 4687 uint64_t offset, uint64_t nbytes, 4688 spdk_bdev_io_completion_cb cb, void *cb_arg) 4689 { 4690 uint64_t offset_blocks, num_blocks; 4691 4692 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4693 nbytes, &num_blocks) != 0) { 4694 return -EINVAL; 4695 } 4696 4697 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4698 } 4699 4700 static int 4701 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4702 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 4703 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 4704 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 4705 { 4706 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4707 struct spdk_bdev_io *bdev_io; 4708 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4709 4710 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4711 return -EINVAL; 4712 } 4713 4714 bdev_io = bdev_channel_get_io(channel); 4715 if (!bdev_io) { 4716 return -ENOMEM; 4717 } 4718 4719 bdev_io->internal.ch = channel; 4720 bdev_io->internal.desc = desc; 4721 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4722 bdev_io->u.bdev.iovs = iov; 4723 bdev_io->u.bdev.iovcnt = iovcnt; 4724 bdev_io->u.bdev.md_buf = md_buf; 4725 bdev_io->u.bdev.num_blocks = num_blocks; 4726 bdev_io->u.bdev.offset_blocks = offset_blocks; 4727 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4728 bdev_io->internal.ext_opts = opts; 4729 bdev_io->u.bdev.ext_opts = opts; 4730 4731 _bdev_io_submit_ext(desc, bdev_io, opts, copy_opts); 4732 4733 return 0; 4734 } 4735 4736 int 4737 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4738 struct iovec *iov, int iovcnt, 4739 uint64_t offset_blocks, uint64_t num_blocks, 4740 spdk_bdev_io_completion_cb cb, void *cb_arg) 4741 { 4742 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4743 num_blocks, cb, cb_arg, NULL, false); 4744 } 4745 4746 int 4747 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4748 struct iovec *iov, int iovcnt, void *md_buf, 4749 uint64_t offset_blocks, uint64_t num_blocks, 4750 spdk_bdev_io_completion_cb cb, void *cb_arg) 4751 { 4752 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4753 return -EINVAL; 4754 } 4755 4756 if (md_buf && !_is_buf_allocated(iov)) { 4757 return -EINVAL; 4758 } 4759 4760 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4761 num_blocks, cb, cb_arg, NULL, false); 4762 } 4763 4764 static inline bool 4765 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 4766 { 4767 /* 4768 * We check if opts size is at least of size when we first introduced 4769 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 4770 * are not checked internal. 
4771 */ 4772 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 4773 sizeof(opts->metadata) && 4774 opts->size <= sizeof(*opts) && 4775 /* When memory domain is used, the user must provide data buffers */ 4776 (!opts->memory_domain || (iov && iov[0].iov_base)); 4777 } 4778 4779 int 4780 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4781 struct iovec *iov, int iovcnt, 4782 uint64_t offset_blocks, uint64_t num_blocks, 4783 spdk_bdev_io_completion_cb cb, void *cb_arg, 4784 struct spdk_bdev_ext_io_opts *opts) 4785 { 4786 void *md = NULL; 4787 4788 if (opts) { 4789 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 4790 return -EINVAL; 4791 } 4792 md = opts->metadata; 4793 } 4794 4795 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4796 return -EINVAL; 4797 } 4798 4799 if (md && !_is_buf_allocated(iov)) { 4800 return -EINVAL; 4801 } 4802 4803 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4804 num_blocks, cb, cb_arg, opts, false); 4805 } 4806 4807 static int 4808 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4809 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4810 spdk_bdev_io_completion_cb cb, void *cb_arg) 4811 { 4812 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4813 struct spdk_bdev_io *bdev_io; 4814 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4815 4816 if (!desc->write) { 4817 return -EBADF; 4818 } 4819 4820 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4821 return -EINVAL; 4822 } 4823 4824 bdev_io = bdev_channel_get_io(channel); 4825 if (!bdev_io) { 4826 return -ENOMEM; 4827 } 4828 4829 bdev_io->internal.ch = channel; 4830 bdev_io->internal.desc = desc; 4831 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4832 bdev_io->u.bdev.iovs = &bdev_io->iov; 4833 bdev_io->u.bdev.iovs[0].iov_base = buf; 4834 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4835 bdev_io->u.bdev.iovcnt = 1; 4836 bdev_io->u.bdev.md_buf = md_buf; 4837 bdev_io->u.bdev.num_blocks = num_blocks; 4838 bdev_io->u.bdev.offset_blocks = offset_blocks; 4839 bdev_io->u.bdev.ext_opts = NULL; 4840 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4841 4842 bdev_io_submit(bdev_io); 4843 return 0; 4844 } 4845 4846 int 4847 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4848 void *buf, uint64_t offset, uint64_t nbytes, 4849 spdk_bdev_io_completion_cb cb, void *cb_arg) 4850 { 4851 uint64_t offset_blocks, num_blocks; 4852 4853 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4854 nbytes, &num_blocks) != 0) { 4855 return -EINVAL; 4856 } 4857 4858 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4859 } 4860 4861 int 4862 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4863 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4864 spdk_bdev_io_completion_cb cb, void *cb_arg) 4865 { 4866 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4867 cb, cb_arg); 4868 } 4869 4870 int 4871 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4872 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4873 spdk_bdev_io_completion_cb cb, void *cb_arg) 4874 { 4875 struct iovec iov = { 4876 .iov_base = buf, 4877 }; 4878 4879 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4880 return -EINVAL; 4881 } 4882 4883 if 
(md_buf && !_is_buf_allocated(&iov)) { 4884 return -EINVAL; 4885 } 4886 4887 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4888 cb, cb_arg); 4889 } 4890 4891 static int 4892 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4893 struct iovec *iov, int iovcnt, void *md_buf, 4894 uint64_t offset_blocks, uint64_t num_blocks, 4895 spdk_bdev_io_completion_cb cb, void *cb_arg, 4896 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 4897 { 4898 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4899 struct spdk_bdev_io *bdev_io; 4900 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4901 4902 if (!desc->write) { 4903 return -EBADF; 4904 } 4905 4906 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4907 return -EINVAL; 4908 } 4909 4910 bdev_io = bdev_channel_get_io(channel); 4911 if (!bdev_io) { 4912 return -ENOMEM; 4913 } 4914 4915 bdev_io->internal.ch = channel; 4916 bdev_io->internal.desc = desc; 4917 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4918 bdev_io->u.bdev.iovs = iov; 4919 bdev_io->u.bdev.iovcnt = iovcnt; 4920 bdev_io->u.bdev.md_buf = md_buf; 4921 bdev_io->u.bdev.num_blocks = num_blocks; 4922 bdev_io->u.bdev.offset_blocks = offset_blocks; 4923 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4924 bdev_io->internal.ext_opts = opts; 4925 bdev_io->u.bdev.ext_opts = opts; 4926 4927 _bdev_io_submit_ext(desc, bdev_io, opts, copy_opts); 4928 4929 return 0; 4930 } 4931 4932 int 4933 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4934 struct iovec *iov, int iovcnt, 4935 uint64_t offset, uint64_t len, 4936 spdk_bdev_io_completion_cb cb, void *cb_arg) 4937 { 4938 uint64_t offset_blocks, num_blocks; 4939 4940 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4941 len, &num_blocks) != 0) { 4942 return -EINVAL; 4943 } 4944 4945 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4946 } 4947 4948 int 4949 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4950 struct iovec *iov, int iovcnt, 4951 uint64_t offset_blocks, uint64_t num_blocks, 4952 spdk_bdev_io_completion_cb cb, void *cb_arg) 4953 { 4954 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4955 num_blocks, cb, cb_arg, NULL, false); 4956 } 4957 4958 int 4959 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4960 struct iovec *iov, int iovcnt, void *md_buf, 4961 uint64_t offset_blocks, uint64_t num_blocks, 4962 spdk_bdev_io_completion_cb cb, void *cb_arg) 4963 { 4964 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4965 return -EINVAL; 4966 } 4967 4968 if (md_buf && !_is_buf_allocated(iov)) { 4969 return -EINVAL; 4970 } 4971 4972 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4973 num_blocks, cb, cb_arg, NULL, false); 4974 } 4975 4976 int 4977 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4978 struct iovec *iov, int iovcnt, 4979 uint64_t offset_blocks, uint64_t num_blocks, 4980 spdk_bdev_io_completion_cb cb, void *cb_arg, 4981 struct spdk_bdev_ext_io_opts *opts) 4982 { 4983 void *md = NULL; 4984 4985 if (opts) { 4986 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 4987 return -EINVAL; 4988 } 4989 md = opts->metadata; 4990 } 4991 4992 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4993 return -EINVAL; 4994 } 4995 4996 if (md && 
!_is_buf_allocated(iov)) { 4997 return -EINVAL; 4998 } 4999 5000 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5001 num_blocks, cb, cb_arg, opts, false); 5002 } 5003 5004 static void 5005 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5006 { 5007 struct spdk_bdev_io *parent_io = cb_arg; 5008 struct spdk_bdev *bdev = parent_io->bdev; 5009 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5010 int i, rc = 0; 5011 5012 if (!success) { 5013 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5014 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5015 spdk_bdev_free_io(bdev_io); 5016 return; 5017 } 5018 5019 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5020 rc = memcmp(read_buf, 5021 parent_io->u.bdev.iovs[i].iov_base, 5022 parent_io->u.bdev.iovs[i].iov_len); 5023 if (rc) { 5024 break; 5025 } 5026 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5027 } 5028 5029 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 5030 rc = memcmp(bdev_io->u.bdev.md_buf, 5031 parent_io->u.bdev.md_buf, 5032 spdk_bdev_get_md_size(bdev)); 5033 } 5034 5035 spdk_bdev_free_io(bdev_io); 5036 5037 if (rc == 0) { 5038 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5039 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5040 } else { 5041 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5042 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5043 } 5044 } 5045 5046 static void 5047 bdev_compare_do_read(void *_bdev_io) 5048 { 5049 struct spdk_bdev_io *bdev_io = _bdev_io; 5050 int rc; 5051 5052 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5053 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5054 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5055 bdev_compare_do_read_done, bdev_io); 5056 5057 if (rc == -ENOMEM) { 5058 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5059 } else if (rc != 0) { 5060 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5061 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5062 } 5063 } 5064 5065 static int 5066 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5067 struct iovec *iov, int iovcnt, void *md_buf, 5068 uint64_t offset_blocks, uint64_t num_blocks, 5069 spdk_bdev_io_completion_cb cb, void *cb_arg) 5070 { 5071 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5072 struct spdk_bdev_io *bdev_io; 5073 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5074 5075 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5076 return -EINVAL; 5077 } 5078 5079 bdev_io = bdev_channel_get_io(channel); 5080 if (!bdev_io) { 5081 return -ENOMEM; 5082 } 5083 5084 bdev_io->internal.ch = channel; 5085 bdev_io->internal.desc = desc; 5086 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5087 bdev_io->u.bdev.iovs = iov; 5088 bdev_io->u.bdev.iovcnt = iovcnt; 5089 bdev_io->u.bdev.md_buf = md_buf; 5090 bdev_io->u.bdev.num_blocks = num_blocks; 5091 bdev_io->u.bdev.offset_blocks = offset_blocks; 5092 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5093 bdev_io->u.bdev.ext_opts = NULL; 5094 5095 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5096 bdev_io_submit(bdev_io); 5097 return 0; 5098 } 5099 5100 bdev_compare_do_read(bdev_io); 5101 5102 return 0; 5103 } 5104 5105 int 5106 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5107 struct iovec *iov, int iovcnt, 5108 
uint64_t offset_blocks, uint64_t num_blocks, 5109 spdk_bdev_io_completion_cb cb, void *cb_arg) 5110 { 5111 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5112 num_blocks, cb, cb_arg); 5113 } 5114 5115 int 5116 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5117 struct iovec *iov, int iovcnt, void *md_buf, 5118 uint64_t offset_blocks, uint64_t num_blocks, 5119 spdk_bdev_io_completion_cb cb, void *cb_arg) 5120 { 5121 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5122 return -EINVAL; 5123 } 5124 5125 if (md_buf && !_is_buf_allocated(iov)) { 5126 return -EINVAL; 5127 } 5128 5129 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5130 num_blocks, cb, cb_arg); 5131 } 5132 5133 static int 5134 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5135 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5136 spdk_bdev_io_completion_cb cb, void *cb_arg) 5137 { 5138 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5139 struct spdk_bdev_io *bdev_io; 5140 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5141 5142 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5143 return -EINVAL; 5144 } 5145 5146 bdev_io = bdev_channel_get_io(channel); 5147 if (!bdev_io) { 5148 return -ENOMEM; 5149 } 5150 5151 bdev_io->internal.ch = channel; 5152 bdev_io->internal.desc = desc; 5153 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5154 bdev_io->u.bdev.iovs = &bdev_io->iov; 5155 bdev_io->u.bdev.iovs[0].iov_base = buf; 5156 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5157 bdev_io->u.bdev.iovcnt = 1; 5158 bdev_io->u.bdev.md_buf = md_buf; 5159 bdev_io->u.bdev.num_blocks = num_blocks; 5160 bdev_io->u.bdev.offset_blocks = offset_blocks; 5161 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5162 bdev_io->u.bdev.ext_opts = NULL; 5163 5164 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5165 bdev_io_submit(bdev_io); 5166 return 0; 5167 } 5168 5169 bdev_compare_do_read(bdev_io); 5170 5171 return 0; 5172 } 5173 5174 int 5175 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5176 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5177 spdk_bdev_io_completion_cb cb, void *cb_arg) 5178 { 5179 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5180 cb, cb_arg); 5181 } 5182 5183 int 5184 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5185 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5186 spdk_bdev_io_completion_cb cb, void *cb_arg) 5187 { 5188 struct iovec iov = { 5189 .iov_base = buf, 5190 }; 5191 5192 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5193 return -EINVAL; 5194 } 5195 5196 if (md_buf && !_is_buf_allocated(&iov)) { 5197 return -EINVAL; 5198 } 5199 5200 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5201 cb, cb_arg); 5202 } 5203 5204 static void 5205 bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) 5206 { 5207 struct spdk_bdev_io *bdev_io = ctx; 5208 5209 if (unlock_status) { 5210 SPDK_ERRLOG("LBA range unlock failed\n"); 5211 } 5212 5213 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? 
true : 5214 false, bdev_io->internal.caller_ctx); 5215 } 5216 5217 static void 5218 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 5219 { 5220 bdev_io->internal.status = status; 5221 5222 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 5223 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5224 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 5225 } 5226 5227 static void 5228 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5229 { 5230 struct spdk_bdev_io *parent_io = cb_arg; 5231 5232 if (!success) { 5233 SPDK_ERRLOG("Compare and write operation failed\n"); 5234 } 5235 5236 spdk_bdev_free_io(bdev_io); 5237 5238 bdev_comparev_and_writev_blocks_unlock(parent_io, 5239 success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 5240 } 5241 5242 static void 5243 bdev_compare_and_write_do_write(void *_bdev_io) 5244 { 5245 struct spdk_bdev_io *bdev_io = _bdev_io; 5246 int rc; 5247 5248 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 5249 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5250 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 5251 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5252 bdev_compare_and_write_do_write_done, bdev_io); 5253 5254 5255 if (rc == -ENOMEM) { 5256 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 5257 } else if (rc != 0) { 5258 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5259 } 5260 } 5261 5262 static void 5263 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5264 { 5265 struct spdk_bdev_io *parent_io = cb_arg; 5266 5267 spdk_bdev_free_io(bdev_io); 5268 5269 if (!success) { 5270 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 5271 return; 5272 } 5273 5274 bdev_compare_and_write_do_write(parent_io); 5275 } 5276 5277 static void 5278 bdev_compare_and_write_do_compare(void *_bdev_io) 5279 { 5280 struct spdk_bdev_io *bdev_io = _bdev_io; 5281 int rc; 5282 5283 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 5284 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 5285 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5286 bdev_compare_and_write_do_compare_done, bdev_io); 5287 5288 if (rc == -ENOMEM) { 5289 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 5290 } else if (rc != 0) { 5291 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 5292 } 5293 } 5294 5295 static void 5296 bdev_comparev_and_writev_blocks_locked(void *ctx, int status) 5297 { 5298 struct spdk_bdev_io *bdev_io = ctx; 5299 5300 if (status) { 5301 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 5302 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5303 return; 5304 } 5305 5306 bdev_compare_and_write_do_compare(bdev_io); 5307 } 5308 5309 int 5310 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5311 struct iovec *compare_iov, int compare_iovcnt, 5312 struct iovec *write_iov, int write_iovcnt, 5313 uint64_t offset_blocks, uint64_t num_blocks, 5314 spdk_bdev_io_completion_cb cb, void *cb_arg) 5315 { 5316 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5317 struct spdk_bdev_io *bdev_io; 5318 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5319 5320 if (!desc->write) { 5321 return 
-EBADF; 5322 } 5323 5324 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5325 return -EINVAL; 5326 } 5327 5328 if (num_blocks > bdev->acwu) { 5329 return -EINVAL; 5330 } 5331 5332 bdev_io = bdev_channel_get_io(channel); 5333 if (!bdev_io) { 5334 return -ENOMEM; 5335 } 5336 5337 bdev_io->internal.ch = channel; 5338 bdev_io->internal.desc = desc; 5339 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 5340 bdev_io->u.bdev.iovs = compare_iov; 5341 bdev_io->u.bdev.iovcnt = compare_iovcnt; 5342 bdev_io->u.bdev.fused_iovs = write_iov; 5343 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 5344 bdev_io->u.bdev.md_buf = NULL; 5345 bdev_io->u.bdev.num_blocks = num_blocks; 5346 bdev_io->u.bdev.offset_blocks = offset_blocks; 5347 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5348 bdev_io->u.bdev.ext_opts = NULL; 5349 5350 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 5351 bdev_io_submit(bdev_io); 5352 return 0; 5353 } 5354 5355 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 5356 bdev_comparev_and_writev_blocks_locked, bdev_io); 5357 } 5358 5359 int 5360 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5361 struct iovec *iov, int iovcnt, 5362 uint64_t offset_blocks, uint64_t num_blocks, 5363 bool populate, 5364 spdk_bdev_io_completion_cb cb, void *cb_arg) 5365 { 5366 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5367 struct spdk_bdev_io *bdev_io; 5368 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5369 5370 if (!desc->write) { 5371 return -EBADF; 5372 } 5373 5374 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5375 return -EINVAL; 5376 } 5377 5378 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 5379 return -ENOTSUP; 5380 } 5381 5382 bdev_io = bdev_channel_get_io(channel); 5383 if (!bdev_io) { 5384 return -ENOMEM; 5385 } 5386 5387 bdev_io->internal.ch = channel; 5388 bdev_io->internal.desc = desc; 5389 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 5390 bdev_io->u.bdev.num_blocks = num_blocks; 5391 bdev_io->u.bdev.offset_blocks = offset_blocks; 5392 bdev_io->u.bdev.iovs = iov; 5393 bdev_io->u.bdev.iovcnt = iovcnt; 5394 bdev_io->u.bdev.md_buf = NULL; 5395 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 5396 bdev_io->u.bdev.zcopy.commit = 0; 5397 bdev_io->u.bdev.zcopy.start = 1; 5398 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5399 bdev_io->u.bdev.ext_opts = NULL; 5400 5401 bdev_io_submit(bdev_io); 5402 5403 return 0; 5404 } 5405 5406 int 5407 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 5408 spdk_bdev_io_completion_cb cb, void *cb_arg) 5409 { 5410 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 5411 return -EINVAL; 5412 } 5413 5414 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 5415 bdev_io->u.bdev.zcopy.start = 0; 5416 bdev_io->internal.caller_ctx = cb_arg; 5417 bdev_io->internal.cb = cb; 5418 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 5419 5420 bdev_io_submit(bdev_io); 5421 5422 return 0; 5423 } 5424 5425 int 5426 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5427 uint64_t offset, uint64_t len, 5428 spdk_bdev_io_completion_cb cb, void *cb_arg) 5429 { 5430 uint64_t offset_blocks, num_blocks; 5431 5432 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5433 len, &num_blocks) != 0) { 5434 return -EINVAL; 5435 } 5436 5437 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5438 } 5439 5440 int 5441 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5442 uint64_t offset_blocks, uint64_t num_blocks, 5443 spdk_bdev_io_completion_cb cb, void *cb_arg) 5444 { 5445 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5446 struct spdk_bdev_io *bdev_io; 5447 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5448 5449 if (!desc->write) { 5450 return -EBADF; 5451 } 5452 5453 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5454 return -EINVAL; 5455 } 5456 5457 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 5458 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 5459 return -ENOTSUP; 5460 } 5461 5462 bdev_io = bdev_channel_get_io(channel); 5463 5464 if (!bdev_io) { 5465 return -ENOMEM; 5466 } 5467 5468 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 5469 bdev_io->internal.ch = channel; 5470 bdev_io->internal.desc = desc; 5471 bdev_io->u.bdev.offset_blocks = offset_blocks; 5472 bdev_io->u.bdev.num_blocks = num_blocks; 5473 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5474 bdev_io->u.bdev.ext_opts = NULL; 5475 5476 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 5477 bdev_io_submit(bdev_io); 5478 return 0; 5479 } 5480 5481 assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); 5482 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 5483 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 5484 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 5485 bdev_write_zero_buffer_next(bdev_io); 5486 5487 return 0; 5488 } 5489 5490 int 5491 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5492 uint64_t offset, uint64_t nbytes, 5493 spdk_bdev_io_completion_cb cb, void *cb_arg) 5494 { 5495 uint64_t offset_blocks, num_blocks; 5496 5497 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5498 nbytes, &num_blocks) != 0) { 5499 return -EINVAL; 5500 } 5501 5502 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5503 } 5504 5505 int 5506 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5507 uint64_t offset_blocks, uint64_t num_blocks, 5508 spdk_bdev_io_completion_cb cb, void *cb_arg) 5509 { 5510 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5511 struct spdk_bdev_io *bdev_io; 5512 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5513 5514 if (!desc->write) { 5515 return -EBADF; 5516 } 5517 5518 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5519 return -EINVAL; 5520 } 5521 5522 if (num_blocks == 0) { 5523 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 5524 return -EINVAL; 5525 } 5526 5527 bdev_io = bdev_channel_get_io(channel); 5528 if (!bdev_io) { 5529 return -ENOMEM; 5530 } 5531 5532 bdev_io->internal.ch 
= channel; 5533 bdev_io->internal.desc = desc; 5534 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 5535 5536 bdev_io->u.bdev.iovs = &bdev_io->iov; 5537 bdev_io->u.bdev.iovs[0].iov_base = NULL; 5538 bdev_io->u.bdev.iovs[0].iov_len = 0; 5539 bdev_io->u.bdev.iovcnt = 1; 5540 5541 bdev_io->u.bdev.offset_blocks = offset_blocks; 5542 bdev_io->u.bdev.num_blocks = num_blocks; 5543 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5544 bdev_io->u.bdev.ext_opts = NULL; 5545 5546 bdev_io_submit(bdev_io); 5547 return 0; 5548 } 5549 5550 int 5551 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5552 uint64_t offset, uint64_t length, 5553 spdk_bdev_io_completion_cb cb, void *cb_arg) 5554 { 5555 uint64_t offset_blocks, num_blocks; 5556 5557 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5558 length, &num_blocks) != 0) { 5559 return -EINVAL; 5560 } 5561 5562 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5563 } 5564 5565 int 5566 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5567 uint64_t offset_blocks, uint64_t num_blocks, 5568 spdk_bdev_io_completion_cb cb, void *cb_arg) 5569 { 5570 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5571 struct spdk_bdev_io *bdev_io; 5572 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5573 5574 if (!desc->write) { 5575 return -EBADF; 5576 } 5577 5578 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5579 return -EINVAL; 5580 } 5581 5582 bdev_io = bdev_channel_get_io(channel); 5583 if (!bdev_io) { 5584 return -ENOMEM; 5585 } 5586 5587 bdev_io->internal.ch = channel; 5588 bdev_io->internal.desc = desc; 5589 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 5590 bdev_io->u.bdev.iovs = NULL; 5591 bdev_io->u.bdev.iovcnt = 0; 5592 bdev_io->u.bdev.offset_blocks = offset_blocks; 5593 bdev_io->u.bdev.num_blocks = num_blocks; 5594 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5595 5596 bdev_io_submit(bdev_io); 5597 return 0; 5598 } 5599 5600 static int bdev_reset_poll_for_outstanding_io(void *ctx); 5601 5602 static void 5603 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 5604 { 5605 struct spdk_bdev_channel *ch = _ctx; 5606 struct spdk_bdev_io *bdev_io; 5607 5608 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5609 5610 if (status == -EBUSY) { 5611 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 5612 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 5613 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 5614 } else { 5615 /* If outstanding IOs are still present and reset_io_drain_timeout seconds passed, 5616 * start the reset. */ 5617 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5618 bdev_io_submit_reset(bdev_io); 5619 } 5620 } else { 5621 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5622 SPDK_DEBUGLOG(bdev, 5623 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 5624 ch->bdev->name); 5625 /* Mark the completion status as a SUCCESS and complete the reset. */ 5626 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 5627 } 5628 } 5629 5630 static void 5631 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5632 struct spdk_io_channel *io_ch, void *_ctx) 5633 { 5634 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 5635 int status = 0; 5636 5637 if (cur_ch->io_outstanding > 0) { 5638 /* If a channel has outstanding IO, set status to -EBUSY code. 
This will stop 5639 * further iteration over the rest of the channels and pass non-zero status 5640 * to the callback function. */ 5641 status = -EBUSY; 5642 } 5643 spdk_bdev_for_each_channel_continue(i, status); 5644 } 5645 5646 static int 5647 bdev_reset_poll_for_outstanding_io(void *ctx) 5648 { 5649 struct spdk_bdev_channel *ch = ctx; 5650 struct spdk_bdev_io *bdev_io; 5651 5652 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5653 5654 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 5655 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 5656 bdev_reset_check_outstanding_io_done); 5657 5658 return SPDK_POLLER_BUSY; 5659 } 5660 5661 static void 5662 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 5663 { 5664 struct spdk_bdev_channel *ch = _ctx; 5665 struct spdk_bdev_io *bdev_io; 5666 5667 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5668 5669 if (bdev->reset_io_drain_timeout == 0) { 5670 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5671 5672 bdev_io_submit_reset(bdev_io); 5673 return; 5674 } 5675 5676 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 5677 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 5678 5679 /* In case bdev->reset_io_drain_timeout is not equal to zero, 5680 * submit the reset to the underlying module only if outstanding I/O 5681 * remain after reset_io_drain_timeout seconds have passed. */ 5682 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 5683 bdev_reset_check_outstanding_io_done); 5684 } 5685 5686 static void 5687 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5688 struct spdk_io_channel *ch, void *_ctx) 5689 { 5690 struct spdk_bdev_channel *channel; 5691 struct spdk_bdev_mgmt_channel *mgmt_channel; 5692 struct spdk_bdev_shared_resource *shared_resource; 5693 bdev_io_tailq_t tmp_queued; 5694 5695 TAILQ_INIT(&tmp_queued); 5696 5697 channel = __io_ch_to_bdev_ch(ch); 5698 shared_resource = channel->shared_resource; 5699 mgmt_channel = shared_resource->mgmt_ch; 5700 5701 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 5702 5703 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 5704 /* The QoS object is always valid and readable while 5705 * the channel flag is set, so the lock here should not 5706 * be necessary. We're not in the fast path though, so 5707 * just take it anyway. 
*/ 5708 spdk_spin_lock(&channel->bdev->internal.spinlock); 5709 if (channel->bdev->internal.qos->ch == channel) { 5710 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 5711 } 5712 spdk_spin_unlock(&channel->bdev->internal.spinlock); 5713 } 5714 5715 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 5716 bdev_abort_all_buf_io(mgmt_channel, channel); 5718 bdev_abort_all_queued_io(&tmp_queued, channel); 5719 5720 spdk_bdev_for_each_channel_continue(i, 0); 5721 } 5722 5723 static void 5724 bdev_start_reset(void *ctx) 5725 { 5726 struct spdk_bdev_channel *ch = ctx; 5727 5728 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, 5729 bdev_reset_freeze_channel_done); 5730 } 5731 5732 static void 5733 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 5734 { 5735 struct spdk_bdev *bdev = ch->bdev; 5736 5737 assert(!TAILQ_EMPTY(&ch->queued_resets)); 5738 5739 spdk_spin_lock(&bdev->internal.spinlock); 5740 if (bdev->internal.reset_in_progress == NULL) { 5741 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 5742 /* 5743 * Take a channel reference for the target bdev for the life of this 5744 * reset. This guards against the channel getting destroyed while 5745 * spdk_bdev_for_each_channel() calls related to this reset IO are in 5746 * progress. We will release the reference when this reset is 5747 * completed. 5748 */ 5749 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 5750 bdev_start_reset(ch); 5751 } 5752 spdk_spin_unlock(&bdev->internal.spinlock); 5753 } 5754 5755 int 5756 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5757 spdk_bdev_io_completion_cb cb, void *cb_arg) 5758 { 5759 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5760 struct spdk_bdev_io *bdev_io; 5761 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5762 5763 bdev_io = bdev_channel_get_io(channel); 5764 if (!bdev_io) { 5765 return -ENOMEM; 5766 } 5767 5768 bdev_io->internal.ch = channel; 5769 bdev_io->internal.desc = desc; 5770 bdev_io->internal.submit_tsc = spdk_get_ticks(); 5771 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 5772 bdev_io->u.reset.ch_ref = NULL; 5773 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5774 5775 spdk_spin_lock(&bdev->internal.spinlock); 5776 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 5777 spdk_spin_unlock(&bdev->internal.spinlock); 5778 5779 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 5780 internal.ch_link); 5781 5782 bdev_channel_start_reset(channel); 5783 5784 return 0; 5785 } 5786 5787 void 5788 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 5789 struct spdk_bdev_io_stat *stat) 5790 { 5791 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5792 5793 bdev_get_io_stat(stat, channel->stat); 5794 } 5795 5796 static void 5797 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 5798 { 5799 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 5800 5801 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 5802 bdev_iostat_ctx->cb_arg, 0); 5803 free(bdev_iostat_ctx); 5804 } 5805 5806 static void 5807 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5808 struct spdk_io_channel *ch, void *_ctx) 5809 { 5810 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 5811 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5812 5813
bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 5814 spdk_bdev_for_each_channel_continue(i, 0); 5815 } 5816 5817 void 5818 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 5819 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 5820 { 5821 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 5822 5823 assert(bdev != NULL); 5824 assert(stat != NULL); 5825 assert(cb != NULL); 5826 5827 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 5828 if (bdev_iostat_ctx == NULL) { 5829 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 5830 cb(bdev, stat, cb_arg, -ENOMEM); 5831 return; 5832 } 5833 5834 bdev_iostat_ctx->stat = stat; 5835 bdev_iostat_ctx->cb = cb; 5836 bdev_iostat_ctx->cb_arg = cb_arg; 5837 5838 /* Start with the statistics from previously deleted channels. */ 5839 spdk_spin_lock(&bdev->internal.spinlock); 5840 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 5841 spdk_spin_unlock(&bdev->internal.spinlock); 5842 5843 /* Then iterate and add the statistics from each existing channel. */ 5844 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 5845 bdev_get_device_stat_done); 5846 } 5847 5848 struct bdev_iostat_reset_ctx { 5849 enum bdev_reset_stat_mode mode; 5850 bdev_reset_device_stat_cb cb; 5851 void *cb_arg; 5852 }; 5853 5854 static void 5855 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 5856 { 5857 struct bdev_iostat_reset_ctx *ctx = _ctx; 5858 5859 ctx->cb(bdev, ctx->cb_arg, 0); 5860 5861 free(ctx); 5862 } 5863 5864 static void 5865 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5866 struct spdk_io_channel *ch, void *_ctx) 5867 { 5868 struct bdev_iostat_reset_ctx *ctx = _ctx; 5869 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5870 5871 bdev_reset_io_stat(channel->stat, ctx->mode); 5872 5873 spdk_bdev_for_each_channel_continue(i, 0); 5874 } 5875 5876 void 5877 bdev_reset_device_stat(struct spdk_bdev *bdev, enum bdev_reset_stat_mode mode, 5878 bdev_reset_device_stat_cb cb, void *cb_arg) 5879 { 5880 struct bdev_iostat_reset_ctx *ctx; 5881 5882 assert(bdev != NULL); 5883 assert(cb != NULL); 5884 5885 ctx = calloc(1, sizeof(*ctx)); 5886 if (ctx == NULL) { 5887 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 5888 cb(bdev, cb_arg, -ENOMEM); 5889 return; 5890 } 5891 5892 ctx->mode = mode; 5893 ctx->cb = cb; 5894 ctx->cb_arg = cb_arg; 5895 5896 spdk_spin_lock(&bdev->internal.spinlock); 5897 bdev_reset_io_stat(bdev->internal.stat, mode); 5898 spdk_spin_unlock(&bdev->internal.spinlock); 5899 5900 spdk_bdev_for_each_channel(bdev, 5901 bdev_reset_each_channel_stat, 5902 ctx, 5903 bdev_reset_device_stat_done); 5904 } 5905 5906 int 5907 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5908 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 5909 spdk_bdev_io_completion_cb cb, void *cb_arg) 5910 { 5911 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5912 struct spdk_bdev_io *bdev_io; 5913 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5914 5915 if (!desc->write) { 5916 return -EBADF; 5917 } 5918 5919 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 5920 return -ENOTSUP; 5921 } 5922 5923 bdev_io = bdev_channel_get_io(channel); 5924 if (!bdev_io) { 5925 return -ENOMEM; 5926 } 5927 5928 bdev_io->internal.ch = channel; 5929 bdev_io->internal.desc = desc; 5930 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 5931 
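/* The assignments below copy the caller-provided admin command and data
 * buffer into the bdev_io. A minimal caller-side sketch, assuming hypothetical
 * desc, io_ch, id_buf, admin_done and rc names (kept inside this comment, so
 * it is illustrative only and not compiled):
 *
 *   struct spdk_nvme_cmd cmd = {};
 *
 *   cmd.opc = SPDK_NVME_OPC_IDENTIFY;
 *   cmd.cdw10 = SPDK_NVME_IDENTIFY_CTRLR;
 *   rc = spdk_bdev_nvme_admin_passthru(desc, io_ch, &cmd, id_buf,
 *                                      sizeof(struct spdk_nvme_ctrlr_data),
 *                                      admin_done, NULL);
 */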
bdev_io->u.nvme_passthru.cmd = *cmd; 5932 bdev_io->u.nvme_passthru.buf = buf; 5933 bdev_io->u.nvme_passthru.nbytes = nbytes; 5934 bdev_io->u.nvme_passthru.md_buf = NULL; 5935 bdev_io->u.nvme_passthru.md_len = 0; 5936 5937 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5938 5939 bdev_io_submit(bdev_io); 5940 return 0; 5941 } 5942 5943 int 5944 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5945 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 5946 spdk_bdev_io_completion_cb cb, void *cb_arg) 5947 { 5948 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5949 struct spdk_bdev_io *bdev_io; 5950 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5951 5952 if (!desc->write) { 5953 /* 5954 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 5955 * to easily determine if the command is a read or write, but for now just 5956 * do not allow io_passthru with a read-only descriptor. 5957 */ 5958 return -EBADF; 5959 } 5960 5961 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 5962 return -ENOTSUP; 5963 } 5964 5965 bdev_io = bdev_channel_get_io(channel); 5966 if (!bdev_io) { 5967 return -ENOMEM; 5968 } 5969 5970 bdev_io->internal.ch = channel; 5971 bdev_io->internal.desc = desc; 5972 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 5973 bdev_io->u.nvme_passthru.cmd = *cmd; 5974 bdev_io->u.nvme_passthru.buf = buf; 5975 bdev_io->u.nvme_passthru.nbytes = nbytes; 5976 bdev_io->u.nvme_passthru.md_buf = NULL; 5977 bdev_io->u.nvme_passthru.md_len = 0; 5978 5979 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5980 5981 bdev_io_submit(bdev_io); 5982 return 0; 5983 } 5984 5985 int 5986 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5987 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 5988 spdk_bdev_io_completion_cb cb, void *cb_arg) 5989 { 5990 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5991 struct spdk_bdev_io *bdev_io; 5992 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5993 5994 if (!desc->write) { 5995 /* 5996 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 5997 * to easily determine if the command is a read or write, but for now just 5998 * do not allow io_passthru with a read-only descriptor. 
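 *
 * A practical consequence, shown here as a hedged illustration with
 * hypothetical names: even for a read-type passthru command the caller must
 * have opened the descriptor writable, e.g.
 *
 *   spdk_bdev_open_ext(bdev_name, true, event_cb, event_ctx, &desc);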
5999 */ 6000 return -EBADF; 6001 } 6002 6003 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6004 return -ENOTSUP; 6005 } 6006 6007 bdev_io = bdev_channel_get_io(channel); 6008 if (!bdev_io) { 6009 return -ENOMEM; 6010 } 6011 6012 bdev_io->internal.ch = channel; 6013 bdev_io->internal.desc = desc; 6014 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6015 bdev_io->u.nvme_passthru.cmd = *cmd; 6016 bdev_io->u.nvme_passthru.buf = buf; 6017 bdev_io->u.nvme_passthru.nbytes = nbytes; 6018 bdev_io->u.nvme_passthru.md_buf = md_buf; 6019 bdev_io->u.nvme_passthru.md_len = md_len; 6020 6021 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6022 6023 bdev_io_submit(bdev_io); 6024 return 0; 6025 } 6026 6027 static void bdev_abort_retry(void *ctx); 6028 static void bdev_abort(struct spdk_bdev_io *parent_io); 6029 6030 static void 6031 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6032 { 6033 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6034 struct spdk_bdev_io *parent_io = cb_arg; 6035 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6036 6037 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6038 6039 spdk_bdev_free_io(bdev_io); 6040 6041 if (!success) { 6042 /* Check if the target I/O completed in the meantime. */ 6043 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6044 if (tmp_io == bio_to_abort) { 6045 break; 6046 } 6047 } 6048 6049 /* If the target I/O still exists, set the parent to failed. */ 6050 if (tmp_io != NULL) { 6051 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6052 } 6053 } 6054 6055 parent_io->u.bdev.split_outstanding--; 6056 if (parent_io->u.bdev.split_outstanding == 0) { 6057 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6058 bdev_abort_retry(parent_io); 6059 } else { 6060 bdev_io_complete(parent_io); 6061 } 6062 } 6063 } 6064 6065 static int 6066 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6067 struct spdk_bdev_io *bio_to_abort, 6068 spdk_bdev_io_completion_cb cb, void *cb_arg) 6069 { 6070 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6071 struct spdk_bdev_io *bdev_io; 6072 6073 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 6074 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 6075 /* TODO: Abort reset or abort request. */ 6076 return -ENOTSUP; 6077 } 6078 6079 bdev_io = bdev_channel_get_io(channel); 6080 if (bdev_io == NULL) { 6081 return -ENOMEM; 6082 } 6083 6084 bdev_io->internal.ch = channel; 6085 bdev_io->internal.desc = desc; 6086 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6087 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6088 6089 if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bio_to_abort)) { 6090 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 6091 6092 /* Parent abort request is not submitted directly, but to manage its 6093 * execution add it to the submitted list here. 6094 */ 6095 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6096 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6097 6098 bdev_abort(bdev_io); 6099 6100 return 0; 6101 } 6102 6103 bdev_io->u.abort.bio_to_abort = bio_to_abort; 6104 6105 /* Submit the abort request to the underlying bdev module. 
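 * The module completes this abort like any other I/O; bdev_abort_io_done()
 * above then re-checks ch->io_submitted to decide whether a failed abort
 * should also fail the parent request.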
*/ 6106 bdev_io_submit(bdev_io); 6107 6108 return 0; 6109 } 6110 6111 static uint32_t 6112 _bdev_abort(struct spdk_bdev_io *parent_io) 6113 { 6114 struct spdk_bdev_desc *desc = parent_io->internal.desc; 6115 struct spdk_bdev_channel *channel = parent_io->internal.ch; 6116 void *bio_cb_arg; 6117 struct spdk_bdev_io *bio_to_abort; 6118 uint32_t matched_ios; 6119 int rc; 6120 6121 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 6122 6123 /* matched_ios is returned and will be kept by the caller. 6124 * 6125 * This function will be used for two cases, 1) the same cb_arg is used for 6126 * multiple I/Os, 2) a single large I/O is split into smaller ones. 6127 * Incrementing split_outstanding directly here may confuse readers especially 6128 * for the 1st case. 6129 * 6130 * Completion of I/O abort is processed after stack unwinding. Hence this trick 6131 * works as expected. 6132 */ 6133 matched_ios = 0; 6134 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6135 6136 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 6137 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 6138 continue; 6139 } 6140 6141 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 6142 /* Any I/O which was submitted after this abort command should be excluded. */ 6143 continue; 6144 } 6145 6146 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 6147 if (rc != 0) { 6148 if (rc == -ENOMEM) { 6149 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 6150 } else { 6151 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6152 } 6153 break; 6154 } 6155 matched_ios++; 6156 } 6157 6158 return matched_ios; 6159 } 6160 6161 static void 6162 bdev_abort_retry(void *ctx) 6163 { 6164 struct spdk_bdev_io *parent_io = ctx; 6165 uint32_t matched_ios; 6166 6167 matched_ios = _bdev_abort(parent_io); 6168 6169 if (matched_ios == 0) { 6170 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6171 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6172 } else { 6173 /* For retry, the case that no target I/O was found is success 6174 * because it means target I/Os completed in the meantime. 6175 */ 6176 bdev_io_complete(parent_io); 6177 } 6178 return; 6179 } 6180 6181 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6182 parent_io->u.bdev.split_outstanding = matched_ios; 6183 } 6184 6185 static void 6186 bdev_abort(struct spdk_bdev_io *parent_io) 6187 { 6188 uint32_t matched_ios; 6189 6190 matched_ios = _bdev_abort(parent_io); 6191 6192 if (matched_ios == 0) { 6193 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6194 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6195 } else { 6196 /* The case the no target I/O was found is failure. */ 6197 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6198 bdev_io_complete(parent_io); 6199 } 6200 return; 6201 } 6202 6203 /* Use split_outstanding to manage the progress of aborting I/Os. 
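 *
 * A minimal caller-side sketch of the public spdk_bdev_abort() defined below,
 * assuming hypothetical desc, io_ch, buffers, callbacks and my_ctx names
 * (illustrative only, not compiled):
 *
 *   spdk_bdev_read_blocks(desc, io_ch, buf0, 0, 8, io_done, my_ctx);
 *   spdk_bdev_read_blocks(desc, io_ch, buf1, 8, 8, io_done, my_ctx);
 *   rc = spdk_bdev_abort(desc, io_ch, my_ctx, abort_done, NULL);
 *
 * Every I/O on this channel that is still outstanding and was submitted with
 * caller_ctx == my_ctx before the abort is matched by _bdev_abort() above.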
*/ 6204 parent_io->u.bdev.split_outstanding = matched_ios; 6205 } 6206 6207 int 6208 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6209 void *bio_cb_arg, 6210 spdk_bdev_io_completion_cb cb, void *cb_arg) 6211 { 6212 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6213 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6214 struct spdk_bdev_io *bdev_io; 6215 6216 if (bio_cb_arg == NULL) { 6217 return -EINVAL; 6218 } 6219 6220 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 6221 return -ENOTSUP; 6222 } 6223 6224 bdev_io = bdev_channel_get_io(channel); 6225 if (bdev_io == NULL) { 6226 return -ENOMEM; 6227 } 6228 6229 bdev_io->internal.ch = channel; 6230 bdev_io->internal.desc = desc; 6231 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6232 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6233 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6234 6235 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 6236 6237 /* Parent abort request is not submitted directly, but to manage its execution, 6238 * add it to the submitted list here. 6239 */ 6240 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6241 6242 bdev_abort(bdev_io); 6243 6244 return 0; 6245 } 6246 6247 int 6248 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6249 struct spdk_bdev_io_wait_entry *entry) 6250 { 6251 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6252 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 6253 6254 if (bdev != entry->bdev) { 6255 SPDK_ERRLOG("bdevs do not match\n"); 6256 return -EINVAL; 6257 } 6258 6259 if (mgmt_ch->per_thread_cache_count > 0) { 6260 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 6261 return -EINVAL; 6262 } 6263 6264 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 6265 return 0; 6266 } 6267 6268 static inline void 6269 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 6270 { 6271 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 6272 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 6273 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 6274 uint32_t blocklen = bdev_io->bdev->blocklen; 6275 6276 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 6277 switch (bdev_io->type) { 6278 case SPDK_BDEV_IO_TYPE_READ: 6279 io_stat->bytes_read += num_blocks * blocklen; 6280 io_stat->num_read_ops++; 6281 io_stat->read_latency_ticks += tsc_diff; 6282 if (io_stat->max_read_latency_ticks < tsc_diff) { 6283 io_stat->max_read_latency_ticks = tsc_diff; 6284 } 6285 if (io_stat->min_read_latency_ticks > tsc_diff) { 6286 io_stat->min_read_latency_ticks = tsc_diff; 6287 } 6288 break; 6289 case SPDK_BDEV_IO_TYPE_WRITE: 6290 io_stat->bytes_written += num_blocks * blocklen; 6291 io_stat->num_write_ops++; 6292 io_stat->write_latency_ticks += tsc_diff; 6293 if (io_stat->max_write_latency_ticks < tsc_diff) { 6294 io_stat->max_write_latency_ticks = tsc_diff; 6295 } 6296 if (io_stat->min_write_latency_ticks > tsc_diff) { 6297 io_stat->min_write_latency_ticks = tsc_diff; 6298 } 6299 break; 6300 case SPDK_BDEV_IO_TYPE_UNMAP: 6301 io_stat->bytes_unmapped += num_blocks * blocklen; 6302 io_stat->num_unmap_ops++; 6303 io_stat->unmap_latency_ticks += tsc_diff; 6304 if (io_stat->max_unmap_latency_ticks < tsc_diff) { 6305 io_stat->max_unmap_latency_ticks = tsc_diff; 6306 } 6307 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 6308 io_stat->min_unmap_latency_ticks = tsc_diff; 6309 } 6310 break; 6311 
case SPDK_BDEV_IO_TYPE_ZCOPY: 6312 /* Track the data in the start phase only */ 6313 if (bdev_io->u.bdev.zcopy.start) { 6314 if (bdev_io->u.bdev.zcopy.populate) { 6315 io_stat->bytes_read += num_blocks * blocklen; 6316 io_stat->num_read_ops++; 6317 io_stat->read_latency_ticks += tsc_diff; 6318 if (io_stat->max_read_latency_ticks < tsc_diff) { 6319 io_stat->max_read_latency_ticks = tsc_diff; 6320 } 6321 if (io_stat->min_read_latency_ticks > tsc_diff) { 6322 io_stat->min_read_latency_ticks = tsc_diff; 6323 } 6324 } else { 6325 io_stat->bytes_written += num_blocks * blocklen; 6326 io_stat->num_write_ops++; 6327 io_stat->write_latency_ticks += tsc_diff; 6328 if (io_stat->max_write_latency_ticks < tsc_diff) { 6329 io_stat->max_write_latency_ticks = tsc_diff; 6330 } 6331 if (io_stat->min_write_latency_ticks > tsc_diff) { 6332 io_stat->min_write_latency_ticks = tsc_diff; 6333 } 6334 } 6335 } 6336 break; 6337 case SPDK_BDEV_IO_TYPE_COPY: 6338 io_stat->bytes_copied += num_blocks * blocklen; 6339 io_stat->num_copy_ops++; 6340 bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; 6341 if (io_stat->max_copy_latency_ticks < tsc_diff) { 6342 io_stat->max_copy_latency_ticks = tsc_diff; 6343 } 6344 if (io_stat->min_copy_latency_ticks > tsc_diff) { 6345 io_stat->min_copy_latency_ticks = tsc_diff; 6346 } 6347 break; 6348 default: 6349 break; 6350 } 6351 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 6352 io_stat = bdev_io->bdev->internal.stat; 6353 assert(io_stat->io_error != NULL); 6354 6355 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 6356 io_stat->io_error->error_status[-io_status - 1]++; 6357 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 6358 } 6359 6360 #ifdef SPDK_CONFIG_VTUNE 6361 uint64_t now_tsc = spdk_get_ticks(); 6362 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 6363 uint64_t data[5]; 6364 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 6365 6366 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 6367 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 6368 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 6369 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 6370 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 6371 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 6372 6373 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 6374 __itt_metadata_u64, 5, data); 6375 6376 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 6377 bdev_io->internal.ch->start_tsc = now_tsc; 6378 } 6379 #endif 6380 } 6381 6382 static inline void 6383 bdev_io_complete(void *ctx) 6384 { 6385 struct spdk_bdev_io *bdev_io = ctx; 6386 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6387 uint64_t tsc, tsc_diff; 6388 6389 if (spdk_unlikely(bdev_io->internal.in_submit_request)) { 6390 /* 6391 * Defer completion to avoid potential infinite recursion if the 6392 * user's completion callback issues a new I/O. 
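 * For example, a module that completes I/O synchronously from its submit path
 * could otherwise re-enter this function from inside the user's callback on
 * every resubmission; bouncing through spdk_thread_send_msg() breaks that
 * chain into separate thread messages.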
6393 */ 6394 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6395 bdev_io_complete, bdev_io); 6396 return; 6397 } 6398 6399 tsc = spdk_get_ticks(); 6400 tsc_diff = tsc - bdev_io->internal.submit_tsc; 6401 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 6402 bdev_io->internal.caller_ctx); 6403 6404 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 6405 6406 if (bdev_io->internal.ch->histogram) { 6407 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 6408 } 6409 6410 bdev_io_update_io_stat(bdev_io, tsc_diff); 6411 6412 assert(bdev_io->internal.cb != NULL); 6413 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 6414 6415 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 6416 bdev_io->internal.caller_ctx); 6417 } 6418 6419 static void bdev_destroy_cb(void *io_device); 6420 6421 static void 6422 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 6423 { 6424 struct spdk_bdev_io *bdev_io = _ctx; 6425 6426 if (bdev_io->u.reset.ch_ref != NULL) { 6427 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 6428 bdev_io->u.reset.ch_ref = NULL; 6429 } 6430 6431 bdev_io_complete(bdev_io); 6432 6433 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 6434 TAILQ_EMPTY(&bdev->internal.open_descs)) { 6435 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6436 } 6437 } 6438 6439 static void 6440 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6441 struct spdk_io_channel *_ch, void *_ctx) 6442 { 6443 struct spdk_bdev_io *bdev_io = _ctx; 6444 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 6445 struct spdk_bdev_io *queued_reset; 6446 6447 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 6448 while (!TAILQ_EMPTY(&ch->queued_resets)) { 6449 queued_reset = TAILQ_FIRST(&ch->queued_resets); 6450 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 6451 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 6452 } 6453 6454 spdk_bdev_for_each_channel_continue(i, 0); 6455 } 6456 6457 void 6458 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 6459 { 6460 struct spdk_bdev *bdev = bdev_io->bdev; 6461 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6462 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 6463 6464 bdev_io->internal.status = status; 6465 6466 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 6467 bool unlock_channels = false; 6468 6469 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 6470 SPDK_ERRLOG("NOMEM returned for reset\n"); 6471 } 6472 spdk_spin_lock(&bdev->internal.spinlock); 6473 if (bdev_io == bdev->internal.reset_in_progress) { 6474 bdev->internal.reset_in_progress = NULL; 6475 unlock_channels = true; 6476 } 6477 spdk_spin_unlock(&bdev->internal.spinlock); 6478 6479 if (unlock_channels) { 6480 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 6481 bdev_reset_complete); 6482 return; 6483 } 6484 } else { 6485 if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0)) { 6486 _bdev_io_push_bounce_data_buffer(bdev_io, _bdev_io_complete_push_bounce_done); 6487 /* bdev IO will be completed in the callback */ 6488 return; 6489 } 6490 6491 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 6492 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 6493 return; 6494 } 6495 } 6496 6497 bdev_io_complete(bdev_io); 6498 } 6499 6500 void 6501 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum 
spdk_scsi_status sc, 6502 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 6503 { 6504 if (sc == SPDK_SCSI_STATUS_GOOD) { 6505 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6506 } else { 6507 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 6508 bdev_io->internal.error.scsi.sc = sc; 6509 bdev_io->internal.error.scsi.sk = sk; 6510 bdev_io->internal.error.scsi.asc = asc; 6511 bdev_io->internal.error.scsi.ascq = ascq; 6512 } 6513 6514 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6515 } 6516 6517 void 6518 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 6519 int *sc, int *sk, int *asc, int *ascq) 6520 { 6521 assert(sc != NULL); 6522 assert(sk != NULL); 6523 assert(asc != NULL); 6524 assert(ascq != NULL); 6525 6526 switch (bdev_io->internal.status) { 6527 case SPDK_BDEV_IO_STATUS_SUCCESS: 6528 *sc = SPDK_SCSI_STATUS_GOOD; 6529 *sk = SPDK_SCSI_SENSE_NO_SENSE; 6530 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 6531 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 6532 break; 6533 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 6534 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 6535 break; 6536 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 6537 *sc = bdev_io->internal.error.scsi.sc; 6538 *sk = bdev_io->internal.error.scsi.sk; 6539 *asc = bdev_io->internal.error.scsi.asc; 6540 *ascq = bdev_io->internal.error.scsi.ascq; 6541 break; 6542 default: 6543 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 6544 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 6545 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 6546 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 6547 break; 6548 } 6549 } 6550 6551 void 6552 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 6553 { 6554 if (aio_result == 0) { 6555 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6556 } else { 6557 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 6558 } 6559 6560 bdev_io->internal.error.aio_result = aio_result; 6561 6562 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6563 } 6564 6565 void 6566 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 6567 { 6568 assert(aio_result != NULL); 6569 6570 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 6571 *aio_result = bdev_io->internal.error.aio_result; 6572 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6573 *aio_result = 0; 6574 } else { 6575 *aio_result = -EIO; 6576 } 6577 } 6578 6579 void 6580 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 6581 { 6582 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 6583 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6584 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 6585 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 6586 } else { 6587 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 6588 } 6589 6590 bdev_io->internal.error.nvme.cdw0 = cdw0; 6591 bdev_io->internal.error.nvme.sct = sct; 6592 bdev_io->internal.error.nvme.sc = sc; 6593 6594 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6595 } 6596 6597 void 6598 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 6599 { 6600 assert(sct != NULL); 6601 assert(sc != NULL); 6602 assert(cdw0 != NULL); 6603 6604 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 6605 *sct = SPDK_NVME_SCT_GENERIC; 6606 *sc = SPDK_NVME_SC_SUCCESS; 6607 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) 
{ 6608 *cdw0 = 0; 6609 } else { 6610 *cdw0 = 1U; 6611 } 6612 return; 6613 } 6614 6615 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 6616 *sct = bdev_io->internal.error.nvme.sct; 6617 *sc = bdev_io->internal.error.nvme.sc; 6618 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6619 *sct = SPDK_NVME_SCT_GENERIC; 6620 *sc = SPDK_NVME_SC_SUCCESS; 6621 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 6622 *sct = SPDK_NVME_SCT_GENERIC; 6623 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6624 } else { 6625 *sct = SPDK_NVME_SCT_GENERIC; 6626 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6627 } 6628 6629 *cdw0 = bdev_io->internal.error.nvme.cdw0; 6630 } 6631 6632 void 6633 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 6634 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 6635 { 6636 assert(first_sct != NULL); 6637 assert(first_sc != NULL); 6638 assert(second_sct != NULL); 6639 assert(second_sc != NULL); 6640 assert(cdw0 != NULL); 6641 6642 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 6643 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 6644 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 6645 *first_sct = bdev_io->internal.error.nvme.sct; 6646 *first_sc = bdev_io->internal.error.nvme.sc; 6647 *second_sct = SPDK_NVME_SCT_GENERIC; 6648 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6649 } else { 6650 *first_sct = SPDK_NVME_SCT_GENERIC; 6651 *first_sc = SPDK_NVME_SC_SUCCESS; 6652 *second_sct = bdev_io->internal.error.nvme.sct; 6653 *second_sc = bdev_io->internal.error.nvme.sc; 6654 } 6655 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 6656 *first_sct = SPDK_NVME_SCT_GENERIC; 6657 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6658 *second_sct = SPDK_NVME_SCT_GENERIC; 6659 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6660 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6661 *first_sct = SPDK_NVME_SCT_GENERIC; 6662 *first_sc = SPDK_NVME_SC_SUCCESS; 6663 *second_sct = SPDK_NVME_SCT_GENERIC; 6664 *second_sc = SPDK_NVME_SC_SUCCESS; 6665 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 6666 *first_sct = SPDK_NVME_SCT_GENERIC; 6667 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6668 *second_sct = SPDK_NVME_SCT_GENERIC; 6669 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6670 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 6671 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 6672 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 6673 *second_sct = SPDK_NVME_SCT_GENERIC; 6674 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6675 } else { 6676 *first_sct = SPDK_NVME_SCT_GENERIC; 6677 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6678 *second_sct = SPDK_NVME_SCT_GENERIC; 6679 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6680 } 6681 6682 *cdw0 = bdev_io->internal.error.nvme.cdw0; 6683 } 6684 6685 struct spdk_thread * 6686 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 6687 { 6688 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 6689 } 6690 6691 struct spdk_io_channel * 6692 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 6693 { 6694 return bdev_io->internal.ch->channel; 6695 } 6696 6697 static int 6698 bdev_register(struct spdk_bdev *bdev) 6699 { 6700 char *bdev_name; 6701 char uuid[SPDK_UUID_STRING_LEN]; 6702 int ret; 6703 6704 assert(bdev->module != NULL); 6705 6706 if (!bdev->name) { 6707 SPDK_ERRLOG("Bdev name is 
NULL\n"); 6708 return -EINVAL; 6709 } 6710 6711 if (!strlen(bdev->name)) { 6712 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 6713 return -EINVAL; 6714 } 6715 6716 /* Users often register their own I/O devices using the bdev name. In 6717 * order to avoid conflicts, prepend bdev_. */ 6718 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 6719 if (!bdev_name) { 6720 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 6721 return -ENOMEM; 6722 } 6723 6724 bdev->internal.stat = bdev_alloc_io_stat(true); 6725 if (!bdev->internal.stat) { 6726 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 6727 free(bdev_name); 6728 return -ENOMEM; 6729 } 6730 6731 bdev->internal.status = SPDK_BDEV_STATUS_READY; 6732 bdev->internal.measured_queue_depth = UINT64_MAX; 6733 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 6734 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 6735 bdev->internal.qd_poller = NULL; 6736 bdev->internal.qos = NULL; 6737 6738 TAILQ_INIT(&bdev->internal.open_descs); 6739 TAILQ_INIT(&bdev->internal.locked_ranges); 6740 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 6741 TAILQ_INIT(&bdev->aliases); 6742 6743 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 6744 if (ret != 0) { 6745 bdev_free_io_stat(bdev->internal.stat); 6746 free(bdev_name); 6747 return ret; 6748 } 6749 6750 /* UUID has to be specified by the user or defined by bdev itself. 6751 * Otherwise this field must remain empty, to indicate that this 6752 * value cannot be depended upon. */ 6753 if (!spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { 6754 /* Add the UUID alias only if it's different than the name */ 6755 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 6756 if (strcmp(bdev->name, uuid) != 0) { 6757 ret = spdk_bdev_alias_add(bdev, uuid); 6758 if (ret != 0) { 6759 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 6760 bdev_name_del(&bdev->internal.bdev_name); 6761 bdev_free_io_stat(bdev->internal.stat); 6762 free(bdev_name); 6763 return ret; 6764 } 6765 } 6766 } 6767 6768 if (spdk_bdev_get_buf_align(bdev) > 1) { 6769 if (bdev->split_on_optimal_io_boundary) { 6770 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 6771 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 6772 } else { 6773 bdev->split_on_optimal_io_boundary = true; 6774 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 6775 } 6776 } 6777 6778 /* If the user didn't specify a write unit size, set it to one. 
*/ 6779 if (bdev->write_unit_size == 0) { 6780 bdev->write_unit_size = 1; 6781 } 6782 6783 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 6784 if (bdev->acwu == 0) { 6785 bdev->acwu = bdev->write_unit_size; 6786 } 6787 6788 if (bdev->phys_blocklen == 0) { 6789 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 6790 } 6791 6792 bdev->internal.reset_in_progress = NULL; 6793 bdev->internal.qd_poll_in_progress = false; 6794 bdev->internal.period = 0; 6795 bdev->internal.new_period = 0; 6796 6797 spdk_io_device_register(__bdev_to_io_dev(bdev), 6798 bdev_channel_create, bdev_channel_destroy, 6799 sizeof(struct spdk_bdev_channel), 6800 bdev_name); 6801 6802 free(bdev_name); 6803 6804 spdk_spin_init(&bdev->internal.spinlock); 6805 6806 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 6807 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 6808 6809 return 0; 6810 } 6811 6812 static void 6813 bdev_destroy_cb(void *io_device) 6814 { 6815 int rc; 6816 struct spdk_bdev *bdev; 6817 spdk_bdev_unregister_cb cb_fn; 6818 void *cb_arg; 6819 6820 bdev = __bdev_from_io_dev(io_device); 6821 cb_fn = bdev->internal.unregister_cb; 6822 cb_arg = bdev->internal.unregister_ctx; 6823 6824 spdk_spin_destroy(&bdev->internal.spinlock); 6825 free(bdev->internal.qos); 6826 bdev_free_io_stat(bdev->internal.stat); 6827 6828 rc = bdev->fn_table->destruct(bdev->ctxt); 6829 if (rc < 0) { 6830 SPDK_ERRLOG("destruct failed\n"); 6831 } 6832 if (rc <= 0 && cb_fn != NULL) { 6833 cb_fn(cb_arg, rc); 6834 } 6835 } 6836 6837 void 6838 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 6839 { 6840 if (bdev->internal.unregister_cb != NULL) { 6841 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 6842 } 6843 } 6844 6845 static void 6846 _remove_notify(void *arg) 6847 { 6848 struct spdk_bdev_desc *desc = arg; 6849 6850 spdk_spin_lock(&desc->spinlock); 6851 desc->refs--; 6852 6853 if (!desc->closed) { 6854 spdk_spin_unlock(&desc->spinlock); 6855 desc->callback.event_fn(SPDK_BDEV_EVENT_REMOVE, desc->bdev, desc->callback.ctx); 6856 return; 6857 } else if (0 == desc->refs) { 6858 /* This descriptor was closed after this remove_notify message was sent. 6859 * spdk_bdev_close() could not free the descriptor since this message was 6860 * in flight, so we free it now using bdev_desc_free(). 6861 */ 6862 spdk_spin_unlock(&desc->spinlock); 6863 bdev_desc_free(desc); 6864 return; 6865 } 6866 spdk_spin_unlock(&desc->spinlock); 6867 } 6868 6869 /* returns: 0 - bdev removed and ready to be destructed. 6870 * -EBUSY - bdev can't be destructed yet. */ 6871 static int 6872 bdev_unregister_unsafe(struct spdk_bdev *bdev) 6873 { 6874 struct spdk_bdev_desc *desc, *tmp; 6875 int rc = 0; 6876 char uuid[SPDK_UUID_STRING_LEN]; 6877 6878 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 6879 assert(spdk_spin_held(&bdev->internal.spinlock)); 6880 6881 /* Notify each descriptor about hotremoval */ 6882 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 6883 rc = -EBUSY; 6884 spdk_spin_lock(&desc->spinlock); 6885 /* 6886 * Defer invocation of the event_cb to a separate message that will 6887 * run later on its thread. This ensures this context unwinds and 6888 * we don't recursively unregister this bdev again if the event_cb 6889 * immediately closes its descriptor. 
6890 */ 6891 desc->refs++; 6892 spdk_thread_send_msg(desc->thread, _remove_notify, desc); 6893 spdk_spin_unlock(&desc->spinlock); 6894 } 6895 6896 /* If there are no descriptors, proceed removing the bdev */ 6897 if (rc == 0) { 6898 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 6899 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 6900 6901 /* Delete the name and the UUID alias */ 6902 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 6903 bdev_name_del_unsafe(&bdev->internal.bdev_name); 6904 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 6905 6906 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 6907 6908 if (bdev->internal.reset_in_progress != NULL) { 6909 /* If reset is in progress, let the completion callback for reset 6910 * unregister the bdev. 6911 */ 6912 rc = -EBUSY; 6913 } 6914 } 6915 6916 return rc; 6917 } 6918 6919 static void 6920 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6921 struct spdk_io_channel *io_ch, void *_ctx) 6922 { 6923 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 6924 6925 bdev_channel_abort_queued_ios(bdev_ch); 6926 spdk_bdev_for_each_channel_continue(i, 0); 6927 } 6928 6929 static void 6930 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 6931 { 6932 int rc; 6933 6934 spdk_spin_lock(&g_bdev_mgr.spinlock); 6935 spdk_spin_lock(&bdev->internal.spinlock); 6936 /* 6937 * Set the status to REMOVING after completing to abort channels. Otherwise, 6938 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 6939 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 6940 * may fail. 6941 */ 6942 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 6943 rc = bdev_unregister_unsafe(bdev); 6944 spdk_spin_unlock(&bdev->internal.spinlock); 6945 spdk_spin_unlock(&g_bdev_mgr.spinlock); 6946 6947 if (rc == 0) { 6948 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6949 } 6950 } 6951 6952 void 6953 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 6954 { 6955 struct spdk_thread *thread; 6956 6957 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 6958 6959 thread = spdk_get_thread(); 6960 if (!thread) { 6961 /* The user called this from a non-SPDK thread. 
*/ 6962 if (cb_fn != NULL) { 6963 cb_fn(cb_arg, -ENOTSUP); 6964 } 6965 return; 6966 } 6967 6968 spdk_spin_lock(&g_bdev_mgr.spinlock); 6969 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 6970 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 6971 spdk_spin_unlock(&g_bdev_mgr.spinlock); 6972 if (cb_fn) { 6973 cb_fn(cb_arg, -EBUSY); 6974 } 6975 return; 6976 } 6977 6978 spdk_spin_lock(&bdev->internal.spinlock); 6979 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 6980 bdev->internal.unregister_cb = cb_fn; 6981 bdev->internal.unregister_ctx = cb_arg; 6982 spdk_spin_unlock(&bdev->internal.spinlock); 6983 spdk_spin_unlock(&g_bdev_mgr.spinlock); 6984 6985 spdk_bdev_set_qd_sampling_period(bdev, 0); 6986 6987 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 6988 bdev_unregister); 6989 } 6990 6991 int 6992 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 6993 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 6994 { 6995 struct spdk_bdev_desc *desc; 6996 struct spdk_bdev *bdev; 6997 int rc; 6998 6999 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 7000 if (rc != 0) { 7001 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 7002 return rc; 7003 } 7004 7005 bdev = spdk_bdev_desc_get_bdev(desc); 7006 7007 if (bdev->module != module) { 7008 spdk_bdev_close(desc); 7009 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 7010 bdev_name); 7011 return -ENODEV; 7012 } 7013 7014 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 7015 7016 spdk_bdev_close(desc); 7017 7018 return 0; 7019 } 7020 7021 static int 7022 bdev_start_qos(struct spdk_bdev *bdev) 7023 { 7024 struct set_qos_limit_ctx *ctx; 7025 7026 /* Enable QoS */ 7027 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 7028 ctx = calloc(1, sizeof(*ctx)); 7029 if (ctx == NULL) { 7030 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 7031 return -ENOMEM; 7032 } 7033 ctx->bdev = bdev; 7034 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 7035 } 7036 7037 return 0; 7038 } 7039 7040 static int 7041 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 7042 { 7043 struct spdk_thread *thread; 7044 int rc = 0; 7045 7046 thread = spdk_get_thread(); 7047 if (!thread) { 7048 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 7049 return -ENOTSUP; 7050 } 7051 7052 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7053 spdk_get_thread()); 7054 7055 desc->bdev = bdev; 7056 desc->thread = thread; 7057 desc->write = write; 7058 7059 spdk_spin_lock(&bdev->internal.spinlock); 7060 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7061 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7062 spdk_spin_unlock(&bdev->internal.spinlock); 7063 return -ENODEV; 7064 } 7065 7066 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7067 SPDK_ERRLOG("Could not open %s - %s module already claimed it\n", 7068 bdev->name, bdev->internal.claim.v1.module->name); 7069 spdk_spin_unlock(&bdev->internal.spinlock); 7070 return -EPERM; 7071 } 7072 7073 rc = bdev_start_qos(bdev); 7074 if (rc != 0) { 7075 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 7076 spdk_spin_unlock(&bdev->internal.spinlock); 7077 return rc; 7078 } 7079 7080 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 7081 7082 spdk_spin_unlock(&bdev->internal.spinlock); 7083 7084 return 0; 7085 } 7086 7087 static int 7088 
bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 7089 struct spdk_bdev_desc **_desc) 7090 { 7091 struct spdk_bdev_desc *desc; 7092 unsigned int event_id; 7093 7094 desc = calloc(1, sizeof(*desc)); 7095 if (desc == NULL) { 7096 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 7097 return -ENOMEM; 7098 } 7099 7100 TAILQ_INIT(&desc->pending_media_events); 7101 TAILQ_INIT(&desc->free_media_events); 7102 7103 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 7104 desc->callback.event_fn = event_cb; 7105 desc->callback.ctx = event_ctx; 7106 spdk_spin_init(&desc->spinlock); 7107 7108 if (bdev->media_events) { 7109 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 7110 sizeof(*desc->media_events_buffer)); 7111 if (desc->media_events_buffer == NULL) { 7112 SPDK_ERRLOG("Failed to initialize media event pool\n"); 7113 bdev_desc_free(desc); 7114 return -ENOMEM; 7115 } 7116 7117 for (event_id = 0; event_id < MEDIA_EVENT_POOL_SIZE; ++event_id) { 7118 TAILQ_INSERT_TAIL(&desc->free_media_events, 7119 &desc->media_events_buffer[event_id], tailq); 7120 } 7121 } 7122 7123 *_desc = desc; 7124 7125 return 0; 7126 } 7127 7128 int 7129 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7130 void *event_ctx, struct spdk_bdev_desc **_desc) 7131 { 7132 struct spdk_bdev_desc *desc; 7133 struct spdk_bdev *bdev; 7134 int rc; 7135 7136 if (event_cb == NULL) { 7137 SPDK_ERRLOG("Missing event callback function\n"); 7138 return -EINVAL; 7139 } 7140 7141 spdk_spin_lock(&g_bdev_mgr.spinlock); 7142 7143 bdev = bdev_get_by_name(bdev_name); 7144 7145 if (bdev == NULL) { 7146 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 7147 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7148 return -ENODEV; 7149 } 7150 7151 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 7152 if (rc != 0) { 7153 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7154 return rc; 7155 } 7156 7157 rc = bdev_open(bdev, write, desc); 7158 if (rc != 0) { 7159 bdev_desc_free(desc); 7160 desc = NULL; 7161 } 7162 7163 *_desc = desc; 7164 7165 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7166 7167 return rc; 7168 } 7169 7170 static void 7171 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 7172 { 7173 int rc; 7174 7175 spdk_spin_lock(&bdev->internal.spinlock); 7176 spdk_spin_lock(&desc->spinlock); 7177 7178 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 7179 7180 desc->closed = true; 7181 7182 if (0 == desc->refs) { 7183 spdk_spin_unlock(&desc->spinlock); 7184 bdev_desc_free(desc); 7185 } else { 7186 spdk_spin_unlock(&desc->spinlock); 7187 } 7188 7189 /* If no more descriptors, kill QoS channel */ 7190 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7191 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 7192 bdev->name, spdk_get_thread()); 7193 7194 if (bdev_qos_destroy(bdev)) { 7195 /* There isn't anything we can do to recover here. Just let the 7196 * old QoS poller keep running. The QoS handling won't change 7197 * cores when the user allocates a new channel, but it won't break. */ 7198 SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); 7199 } 7200 } 7201 7202 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7203 rc = bdev_unregister_unsafe(bdev); 7204 spdk_spin_unlock(&bdev->internal.spinlock); 7205 7206 if (rc == 0) { 7207 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7208 } 7209 } else { 7210 spdk_spin_unlock(&bdev->internal.spinlock); 7211 } 7212 } 7213 7214 void 7215 spdk_bdev_close(struct spdk_bdev_desc *desc) 7216 { 7217 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7218 7219 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7220 spdk_get_thread()); 7221 7222 assert(desc->thread == spdk_get_thread()); 7223 7224 spdk_poller_unregister(&desc->io_timeout_poller); 7225 7226 spdk_spin_lock(&g_bdev_mgr.spinlock); 7227 7228 bdev_close(bdev, desc); 7229 7230 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7231 } 7232 7233 static void 7234 bdev_register_finished(void *arg) 7235 { 7236 struct spdk_bdev_desc *desc = arg; 7237 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7238 7239 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 7240 7241 spdk_spin_lock(&g_bdev_mgr.spinlock); 7242 7243 bdev_close(bdev, desc); 7244 7245 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7246 } 7247 7248 int 7249 spdk_bdev_register(struct spdk_bdev *bdev) 7250 { 7251 struct spdk_bdev_desc *desc; 7252 int rc; 7253 7254 if (spdk_unlikely(spdk_thread_get_app_thread() != spdk_get_thread())) { 7255 SPDK_LOG_DEPRECATED(bdev_register_examine_thread); 7256 } 7257 7258 rc = bdev_register(bdev); 7259 if (rc != 0) { 7260 return rc; 7261 } 7262 7263 /* A descriptor is opened to prevent bdev deletion during examination */ 7264 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7265 if (rc != 0) { 7266 spdk_bdev_unregister(bdev, NULL, NULL); 7267 return rc; 7268 } 7269 7270 rc = bdev_open(bdev, false, desc); 7271 if (rc != 0) { 7272 bdev_desc_free(desc); 7273 spdk_bdev_unregister(bdev, NULL, NULL); 7274 return rc; 7275 } 7276 7277 /* Examine configuration before initializing I/O */ 7278 bdev_examine(bdev); 7279 7280 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 7281 if (rc != 0) { 7282 bdev_close(bdev, desc); 7283 spdk_bdev_unregister(bdev, NULL, NULL); 7284 } 7285 7286 return rc; 7287 } 7288 7289 int 7290 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 7291 struct spdk_bdev_module *module) 7292 { 7293 spdk_spin_lock(&bdev->internal.spinlock); 7294 7295 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7296 SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, 7297 bdev->internal.claim.v1.module->name); 7298 spdk_spin_unlock(&bdev->internal.spinlock); 7299 return -EPERM; 7300 } 7301 7302 if (desc && !desc->write) { 7303 desc->write = true; 7304 } 7305 7306 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 7307 bdev->internal.claim.v1.module = module; 7308 7309 spdk_spin_unlock(&bdev->internal.spinlock); 7310 return 0; 7311 } 7312 7313 void 7314 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 7315 { 7316 spdk_spin_lock(&bdev->internal.spinlock); 7317 7318 assert(bdev->internal.claim.v1.module != NULL); 7319 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 7320 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7321 bdev->internal.claim.v1.module = NULL; 7322 7323 spdk_spin_unlock(&bdev->internal.spinlock); 7324 } 7325 7326 struct spdk_bdev * 7327 spdk_bdev_desc_get_bdev(struct 
spdk_bdev_desc *desc) 7328 { 7329 assert(desc != NULL); 7330 return desc->bdev; 7331 } 7332 7333 int 7334 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 7335 { 7336 struct spdk_bdev *bdev, *tmp; 7337 struct spdk_bdev_desc *desc; 7338 int rc = 0; 7339 7340 assert(fn != NULL); 7341 7342 spdk_spin_lock(&g_bdev_mgr.spinlock); 7343 bdev = spdk_bdev_first(); 7344 while (bdev != NULL) { 7345 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7346 if (rc != 0) { 7347 break; 7348 } 7349 rc = bdev_open(bdev, false, desc); 7350 if (rc != 0) { 7351 bdev_desc_free(desc); 7352 if (rc == -ENODEV) { 7353 /* Ignore the error and move to the next bdev. */ 7354 rc = 0; 7355 bdev = spdk_bdev_next(bdev); 7356 continue; 7357 } 7358 break; 7359 } 7360 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7361 7362 rc = fn(ctx, bdev); 7363 7364 spdk_spin_lock(&g_bdev_mgr.spinlock); 7365 tmp = spdk_bdev_next(bdev); 7366 bdev_close(bdev, desc); 7367 if (rc != 0) { 7368 break; 7369 } 7370 bdev = tmp; 7371 } 7372 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7373 7374 return rc; 7375 } 7376 7377 int 7378 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 7379 { 7380 struct spdk_bdev *bdev, *tmp; 7381 struct spdk_bdev_desc *desc; 7382 int rc = 0; 7383 7384 assert(fn != NULL); 7385 7386 spdk_spin_lock(&g_bdev_mgr.spinlock); 7387 bdev = spdk_bdev_first_leaf(); 7388 while (bdev != NULL) { 7389 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7390 if (rc != 0) { 7391 break; 7392 } 7393 rc = bdev_open(bdev, false, desc); 7394 if (rc != 0) { 7395 bdev_desc_free(desc); 7396 if (rc == -ENODEV) { 7397 /* Ignore the error and move to the next bdev. */ 7398 rc = 0; 7399 bdev = spdk_bdev_next_leaf(bdev); 7400 continue; 7401 } 7402 break; 7403 } 7404 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7405 7406 rc = fn(ctx, bdev); 7407 7408 spdk_spin_lock(&g_bdev_mgr.spinlock); 7409 tmp = spdk_bdev_next_leaf(bdev); 7410 bdev_close(bdev, desc); 7411 if (rc != 0) { 7412 break; 7413 } 7414 bdev = tmp; 7415 } 7416 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7417 7418 return rc; 7419 } 7420 7421 void 7422 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 7423 { 7424 struct iovec *iovs; 7425 int iovcnt; 7426 7427 if (bdev_io == NULL) { 7428 return; 7429 } 7430 7431 switch (bdev_io->type) { 7432 case SPDK_BDEV_IO_TYPE_READ: 7433 case SPDK_BDEV_IO_TYPE_WRITE: 7434 case SPDK_BDEV_IO_TYPE_ZCOPY: 7435 iovs = bdev_io->u.bdev.iovs; 7436 iovcnt = bdev_io->u.bdev.iovcnt; 7437 break; 7438 default: 7439 iovs = NULL; 7440 iovcnt = 0; 7441 break; 7442 } 7443 7444 if (iovp) { 7445 *iovp = iovs; 7446 } 7447 if (iovcntp) { 7448 *iovcntp = iovcnt; 7449 } 7450 } 7451 7452 void * 7453 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 7454 { 7455 if (bdev_io == NULL) { 7456 return NULL; 7457 } 7458 7459 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 7460 return NULL; 7461 } 7462 7463 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 7464 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 7465 return bdev_io->u.bdev.md_buf; 7466 } 7467 7468 return NULL; 7469 } 7470 7471 void * 7472 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 7473 { 7474 if (bdev_io == NULL) { 7475 assert(false); 7476 return NULL; 7477 } 7478 7479 return bdev_io->internal.caller_ctx; 7480 } 7481 7482 void 7483 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 7484 { 7485 7486 if (spdk_bdev_module_list_find(bdev_module->name)) { 7487 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 7488 assert(false); 
7489 } 7490 7491 spdk_spin_init(&bdev_module->internal.spinlock); 7492 7493 /* 7494 * Modules with examine callbacks must be initialized first, so they are 7495 * ready to handle examine callbacks from later modules that will 7496 * register physical bdevs. 7497 */ 7498 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 7499 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 7500 } else { 7501 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 7502 } 7503 } 7504 7505 struct spdk_bdev_module * 7506 spdk_bdev_module_list_find(const char *name) 7507 { 7508 struct spdk_bdev_module *bdev_module; 7509 7510 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 7511 if (strcmp(name, bdev_module->name) == 0) { 7512 break; 7513 } 7514 } 7515 7516 return bdev_module; 7517 } 7518 7519 static void 7520 bdev_write_zero_buffer_next(void *_bdev_io) 7521 { 7522 struct spdk_bdev_io *bdev_io = _bdev_io; 7523 uint64_t num_bytes, num_blocks; 7524 void *md_buf = NULL; 7525 int rc; 7526 7527 num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * 7528 bdev_io->u.bdev.split_remaining_num_blocks, 7529 ZERO_BUFFER_SIZE); 7530 num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); 7531 num_blocks -= num_blocks % bdev_io->bdev->write_unit_size; 7532 7533 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 7534 md_buf = (char *)g_bdev_mgr.zero_buffer + 7535 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 7536 } 7537 7538 rc = bdev_write_blocks_with_md(bdev_io->internal.desc, 7539 spdk_io_channel_from_ctx(bdev_io->internal.ch), 7540 g_bdev_mgr.zero_buffer, md_buf, 7541 bdev_io->u.bdev.split_current_offset_blocks, num_blocks, 7542 bdev_write_zero_buffer_done, bdev_io); 7543 if (rc == 0) { 7544 bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; 7545 bdev_io->u.bdev.split_current_offset_blocks += num_blocks; 7546 } else if (rc == -ENOMEM) { 7547 bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); 7548 } else { 7549 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7550 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 7551 } 7552 } 7553 7554 static void 7555 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 7556 { 7557 struct spdk_bdev_io *parent_io = cb_arg; 7558 7559 spdk_bdev_free_io(bdev_io); 7560 7561 if (!success) { 7562 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7563 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 7564 return; 7565 } 7566 7567 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 7568 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 7569 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 7570 return; 7571 } 7572 7573 bdev_write_zero_buffer_next(parent_io); 7574 } 7575 7576 static void 7577 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 7578 { 7579 spdk_spin_lock(&ctx->bdev->internal.spinlock); 7580 ctx->bdev->internal.qos_mod_in_progress = false; 7581 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 7582 7583 if (ctx->cb_fn) { 7584 ctx->cb_fn(ctx->cb_arg, status); 7585 } 7586 free(ctx); 7587 } 7588 7589 static void 7590 bdev_disable_qos_done(void *cb_arg) 7591 { 7592 struct set_qos_limit_ctx *ctx = cb_arg; 7593 struct spdk_bdev *bdev = ctx->bdev; 7594 struct spdk_bdev_io *bdev_io; 7595 struct spdk_bdev_qos *qos; 7596 7597 spdk_spin_lock(&bdev->internal.spinlock); 7598 qos = bdev->internal.qos; 7599 
bdev->internal.qos = NULL; 7600 spdk_spin_unlock(&bdev->internal.spinlock); 7601 7602 while (!TAILQ_EMPTY(&qos->queued)) { 7603 /* Send queued I/O back to their original thread for resubmission. */ 7604 bdev_io = TAILQ_FIRST(&qos->queued); 7605 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 7606 7607 if (bdev_io->internal.io_submit_ch) { 7608 /* 7609 * Channel was changed when sending it to the QoS thread - change it back 7610 * before sending it back to the original thread. 7611 */ 7612 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 7613 bdev_io->internal.io_submit_ch = NULL; 7614 } 7615 7616 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7617 _bdev_io_submit, bdev_io); 7618 } 7619 7620 if (qos->thread != NULL) { 7621 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 7622 spdk_poller_unregister(&qos->poller); 7623 } 7624 7625 free(qos); 7626 7627 bdev_set_qos_limit_done(ctx, 0); 7628 } 7629 7630 static void 7631 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 7632 { 7633 struct set_qos_limit_ctx *ctx = _ctx; 7634 struct spdk_thread *thread; 7635 7636 spdk_spin_lock(&bdev->internal.spinlock); 7637 thread = bdev->internal.qos->thread; 7638 spdk_spin_unlock(&bdev->internal.spinlock); 7639 7640 if (thread != NULL) { 7641 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 7642 } else { 7643 bdev_disable_qos_done(ctx); 7644 } 7645 } 7646 7647 static void 7648 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7649 struct spdk_io_channel *ch, void *_ctx) 7650 { 7651 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 7652 7653 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 7654 7655 spdk_bdev_for_each_channel_continue(i, 0); 7656 } 7657 7658 static void 7659 bdev_update_qos_rate_limit_msg(void *cb_arg) 7660 { 7661 struct set_qos_limit_ctx *ctx = cb_arg; 7662 struct spdk_bdev *bdev = ctx->bdev; 7663 7664 spdk_spin_lock(&bdev->internal.spinlock); 7665 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 7666 spdk_spin_unlock(&bdev->internal.spinlock); 7667 7668 bdev_set_qos_limit_done(ctx, 0); 7669 } 7670 7671 static void 7672 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7673 struct spdk_io_channel *ch, void *_ctx) 7674 { 7675 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 7676 7677 spdk_spin_lock(&bdev->internal.spinlock); 7678 bdev_enable_qos(bdev, bdev_ch); 7679 spdk_spin_unlock(&bdev->internal.spinlock); 7680 spdk_bdev_for_each_channel_continue(i, 0); 7681 } 7682 7683 static void 7684 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 7685 { 7686 struct set_qos_limit_ctx *ctx = _ctx; 7687 7688 bdev_set_qos_limit_done(ctx, status); 7689 } 7690 7691 static void 7692 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 7693 { 7694 int i; 7695 7696 assert(bdev->internal.qos != NULL); 7697 7698 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 7699 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 7700 bdev->internal.qos->rate_limits[i].limit = limits[i]; 7701 7702 if (limits[i] == 0) { 7703 bdev->internal.qos->rate_limits[i].limit = 7704 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 7705 } 7706 } 7707 } 7708 } 7709 7710 void 7711 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 7712 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 7713 { 7714 struct set_qos_limit_ctx *ctx; 7715 uint32_t limit_set_complement; 7716 uint64_t min_limit_per_sec; 7717 int i; 7718 bool disable_rate_limit = true; 7719 7720 
for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 7721 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 7722 continue; 7723 } 7724 7725 if (limits[i] > 0) { 7726 disable_rate_limit = false; 7727 } 7728 7729 if (bdev_qos_is_iops_rate_limit(i) == true) { 7730 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 7731 } else { 7732 /* Change from megabyte to byte rate limit */ 7733 limits[i] = limits[i] * 1024 * 1024; 7734 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 7735 } 7736 7737 limit_set_complement = limits[i] % min_limit_per_sec; 7738 if (limit_set_complement) { 7739 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 7740 limits[i], min_limit_per_sec); 7741 limits[i] += min_limit_per_sec - limit_set_complement; 7742 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 7743 } 7744 } 7745 7746 ctx = calloc(1, sizeof(*ctx)); 7747 if (ctx == NULL) { 7748 cb_fn(cb_arg, -ENOMEM); 7749 return; 7750 } 7751 7752 ctx->cb_fn = cb_fn; 7753 ctx->cb_arg = cb_arg; 7754 ctx->bdev = bdev; 7755 7756 spdk_spin_lock(&bdev->internal.spinlock); 7757 if (bdev->internal.qos_mod_in_progress) { 7758 spdk_spin_unlock(&bdev->internal.spinlock); 7759 free(ctx); 7760 cb_fn(cb_arg, -EAGAIN); 7761 return; 7762 } 7763 bdev->internal.qos_mod_in_progress = true; 7764 7765 if (disable_rate_limit == true && bdev->internal.qos) { 7766 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 7767 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 7768 (bdev->internal.qos->rate_limits[i].limit > 0 && 7769 bdev->internal.qos->rate_limits[i].limit != 7770 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 7771 disable_rate_limit = false; 7772 break; 7773 } 7774 } 7775 } 7776 7777 if (disable_rate_limit == false) { 7778 if (bdev->internal.qos == NULL) { 7779 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 7780 if (!bdev->internal.qos) { 7781 spdk_spin_unlock(&bdev->internal.spinlock); 7782 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 7783 bdev_set_qos_limit_done(ctx, -ENOMEM); 7784 return; 7785 } 7786 } 7787 7788 if (bdev->internal.qos->thread == NULL) { 7789 /* Enabling */ 7790 bdev_set_qos_rate_limits(bdev, limits); 7791 7792 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 7793 bdev_enable_qos_done); 7794 } else { 7795 /* Updating */ 7796 bdev_set_qos_rate_limits(bdev, limits); 7797 7798 spdk_thread_send_msg(bdev->internal.qos->thread, 7799 bdev_update_qos_rate_limit_msg, ctx); 7800 } 7801 } else { 7802 if (bdev->internal.qos != NULL) { 7803 bdev_set_qos_rate_limits(bdev, limits); 7804 7805 /* Disabling */ 7806 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 7807 bdev_disable_qos_msg_done); 7808 } else { 7809 spdk_spin_unlock(&bdev->internal.spinlock); 7810 bdev_set_qos_limit_done(ctx, 0); 7811 return; 7812 } 7813 } 7814 7815 spdk_spin_unlock(&bdev->internal.spinlock); 7816 } 7817 7818 struct spdk_bdev_histogram_ctx { 7819 spdk_bdev_histogram_status_cb cb_fn; 7820 void *cb_arg; 7821 struct spdk_bdev *bdev; 7822 int status; 7823 }; 7824 7825 static void 7826 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 7827 { 7828 struct spdk_bdev_histogram_ctx *ctx = _ctx; 7829 7830 spdk_spin_lock(&ctx->bdev->internal.spinlock); 7831 ctx->bdev->internal.histogram_in_progress = false; 7832 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 7833 ctx->cb_fn(ctx->cb_arg, ctx->status); 7834 free(ctx); 7835 } 7836 7837 static void 7838 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct 
spdk_bdev *bdev, 7839 struct spdk_io_channel *_ch, void *_ctx) 7840 { 7841 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7842 7843 if (ch->histogram != NULL) { 7844 spdk_histogram_data_free(ch->histogram); 7845 ch->histogram = NULL; 7846 } 7847 spdk_bdev_for_each_channel_continue(i, 0); 7848 } 7849 7850 static void 7851 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 7852 { 7853 struct spdk_bdev_histogram_ctx *ctx = _ctx; 7854 7855 if (status != 0) { 7856 ctx->status = status; 7857 ctx->bdev->internal.histogram_enabled = false; 7858 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 7859 bdev_histogram_disable_channel_cb); 7860 } else { 7861 spdk_spin_lock(&ctx->bdev->internal.spinlock); 7862 ctx->bdev->internal.histogram_in_progress = false; 7863 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 7864 ctx->cb_fn(ctx->cb_arg, ctx->status); 7865 free(ctx); 7866 } 7867 } 7868 7869 static void 7870 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7871 struct spdk_io_channel *_ch, void *_ctx) 7872 { 7873 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7874 int status = 0; 7875 7876 if (ch->histogram == NULL) { 7877 ch->histogram = spdk_histogram_data_alloc(); 7878 if (ch->histogram == NULL) { 7879 status = -ENOMEM; 7880 } 7881 } 7882 7883 spdk_bdev_for_each_channel_continue(i, status); 7884 } 7885 7886 void 7887 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 7888 void *cb_arg, bool enable) 7889 { 7890 struct spdk_bdev_histogram_ctx *ctx; 7891 7892 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 7893 if (ctx == NULL) { 7894 cb_fn(cb_arg, -ENOMEM); 7895 return; 7896 } 7897 7898 ctx->bdev = bdev; 7899 ctx->status = 0; 7900 ctx->cb_fn = cb_fn; 7901 ctx->cb_arg = cb_arg; 7902 7903 spdk_spin_lock(&bdev->internal.spinlock); 7904 if (bdev->internal.histogram_in_progress) { 7905 spdk_spin_unlock(&bdev->internal.spinlock); 7906 free(ctx); 7907 cb_fn(cb_arg, -EAGAIN); 7908 return; 7909 } 7910 7911 bdev->internal.histogram_in_progress = true; 7912 spdk_spin_unlock(&bdev->internal.spinlock); 7913 7914 bdev->internal.histogram_enabled = enable; 7915 7916 if (enable) { 7917 /* Allocate histogram for each channel */ 7918 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 7919 bdev_histogram_enable_channel_cb); 7920 } else { 7921 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 7922 bdev_histogram_disable_channel_cb); 7923 } 7924 } 7925 7926 struct spdk_bdev_histogram_data_ctx { 7927 spdk_bdev_histogram_data_cb cb_fn; 7928 void *cb_arg; 7929 struct spdk_bdev *bdev; 7930 /** merged histogram data from all channels */ 7931 struct spdk_histogram_data *histogram; 7932 }; 7933 7934 static void 7935 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 7936 { 7937 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 7938 7939 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 7940 free(ctx); 7941 } 7942 7943 static void 7944 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7945 struct spdk_io_channel *_ch, void *_ctx) 7946 { 7947 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7948 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 7949 int status = 0; 7950 7951 if (ch->histogram == NULL) { 7952 status = -EFAULT; 7953 } else { 7954 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 7955 } 7956 7957 spdk_bdev_for_each_channel_continue(i, status); 7958 } 
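/* Illustrative sketch (hypothetical caller code, not part of this file): a consumer
 * typically enables per-channel histograms first and, once the enable completes,
 * allocates an spdk_histogram_data to receive the merged result via
 * spdk_bdev_histogram_get() below. The callback names are made up for the example.
 *
 *   static void
 *   histogram_data_cb(void *cb_arg, int status, struct spdk_histogram_data *histogram)
 *   {
 *           if (status == 0) {
 *                   // histogram now holds data merged from every channel
 *           }
 *           spdk_histogram_data_free(histogram);
 *   }
 *
 *   static void
 *   histogram_enabled_cb(void *cb_arg, int status)
 *   {
 *           struct spdk_bdev *bdev = cb_arg;
 *           struct spdk_histogram_data *histogram;
 *
 *           if (status != 0) {
 *                   return;
 *           }
 *           histogram = spdk_histogram_data_alloc();
 *           if (histogram != NULL) {
 *                   spdk_bdev_histogram_get(bdev, histogram, histogram_data_cb, NULL);
 *           }
 *   }
 *
 *   In the caller: spdk_bdev_histogram_enable(bdev, histogram_enabled_cb, bdev, true);
 */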
7959 7960 void 7961 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 7962 spdk_bdev_histogram_data_cb cb_fn, 7963 void *cb_arg) 7964 { 7965 struct spdk_bdev_histogram_data_ctx *ctx; 7966 7967 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 7968 if (ctx == NULL) { 7969 cb_fn(cb_arg, -ENOMEM, NULL); 7970 return; 7971 } 7972 7973 ctx->bdev = bdev; 7974 ctx->cb_fn = cb_fn; 7975 ctx->cb_arg = cb_arg; 7976 7977 ctx->histogram = histogram; 7978 7979 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 7980 bdev_histogram_get_channel_cb); 7981 } 7982 7983 void 7984 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 7985 void *cb_arg) 7986 { 7987 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 7988 int status = 0; 7989 7990 assert(cb_fn != NULL); 7991 7992 if (bdev_ch->histogram == NULL) { 7993 status = -EFAULT; 7994 } 7995 cb_fn(cb_arg, status, bdev_ch->histogram); 7996 } 7997 7998 size_t 7999 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 8000 size_t max_events) 8001 { 8002 struct media_event_entry *entry; 8003 size_t num_events = 0; 8004 8005 for (; num_events < max_events; ++num_events) { 8006 entry = TAILQ_FIRST(&desc->pending_media_events); 8007 if (entry == NULL) { 8008 break; 8009 } 8010 8011 events[num_events] = entry->event; 8012 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 8013 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 8014 } 8015 8016 return num_events; 8017 } 8018 8019 int 8020 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 8021 size_t num_events) 8022 { 8023 struct spdk_bdev_desc *desc; 8024 struct media_event_entry *entry; 8025 size_t event_id; 8026 int rc = 0; 8027 8028 assert(bdev->media_events); 8029 8030 spdk_spin_lock(&bdev->internal.spinlock); 8031 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 8032 if (desc->write) { 8033 break; 8034 } 8035 } 8036 8037 if (desc == NULL || desc->media_events_buffer == NULL) { 8038 rc = -ENODEV; 8039 goto out; 8040 } 8041 8042 for (event_id = 0; event_id < num_events; ++event_id) { 8043 entry = TAILQ_FIRST(&desc->free_media_events); 8044 if (entry == NULL) { 8045 break; 8046 } 8047 8048 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 8049 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 8050 entry->event = events[event_id]; 8051 } 8052 8053 rc = event_id; 8054 out: 8055 spdk_spin_unlock(&bdev->internal.spinlock); 8056 return rc; 8057 } 8058 8059 void 8060 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 8061 { 8062 struct spdk_bdev_desc *desc; 8063 8064 spdk_spin_lock(&bdev->internal.spinlock); 8065 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 8066 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 8067 desc->callback.event_fn(SPDK_BDEV_EVENT_MEDIA_MANAGEMENT, bdev, 8068 desc->callback.ctx); 8069 } 8070 } 8071 spdk_spin_unlock(&bdev->internal.spinlock); 8072 } 8073 8074 struct locked_lba_range_ctx { 8075 struct lba_range range; 8076 struct spdk_bdev *bdev; 8077 struct lba_range *current_range; 8078 struct lba_range *owner_range; 8079 struct spdk_poller *poller; 8080 lock_range_cb cb_fn; 8081 void *cb_arg; 8082 }; 8083 8084 static void 8085 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8086 { 8087 struct locked_lba_range_ctx *ctx = _ctx; 8088 8089 ctx->cb_fn(ctx->cb_arg, -ENOMEM); 8090 free(ctx); 8091 } 8092 8093 static void 
bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 8094 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 8095 8096 static void 8097 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8098 { 8099 struct locked_lba_range_ctx *ctx = _ctx; 8100 8101 if (status == -ENOMEM) { 8102 /* One of the channels could not allocate a range object. 8103 * So we have to go back and clean up any ranges that were 8104 * allocated successfully before we return error status to 8105 * the caller. We can reuse the unlock function to do that 8106 * clean up. 8107 */ 8108 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 8109 bdev_lock_error_cleanup_cb); 8110 return; 8111 } 8112 8113 /* All channels have locked this range and no I/O overlapping the range 8114 * are outstanding! Set the owner_ch for the range object for the 8115 * locking channel, so that this channel will know that it is allowed 8116 * to write to this range. 8117 */ 8118 ctx->owner_range->owner_ch = ctx->range.owner_ch; 8119 ctx->cb_fn(ctx->cb_arg, status); 8120 8121 /* Don't free the ctx here. Its range is in the bdev's global list of 8122 * locked ranges still, and will be removed and freed when this range 8123 * is later unlocked. 8124 */ 8125 } 8126 8127 static int 8128 bdev_lock_lba_range_check_io(void *_i) 8129 { 8130 struct spdk_bdev_channel_iter *i = _i; 8131 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 8132 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8133 struct locked_lba_range_ctx *ctx = i->ctx; 8134 struct lba_range *range = ctx->current_range; 8135 struct spdk_bdev_io *bdev_io; 8136 8137 spdk_poller_unregister(&ctx->poller); 8138 8139 /* The range is now in the locked_ranges, so no new IO can be submitted to this 8140 * range. But we need to wait until any outstanding IO overlapping with this range 8141 * are completed. 8142 */ 8143 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 8144 if (bdev_io_range_is_locked(bdev_io, range)) { 8145 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 8146 return SPDK_POLLER_BUSY; 8147 } 8148 } 8149 8150 spdk_bdev_for_each_channel_continue(i, 0); 8151 return SPDK_POLLER_BUSY; 8152 } 8153 8154 static void 8155 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8156 struct spdk_io_channel *_ch, void *_ctx) 8157 { 8158 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8159 struct locked_lba_range_ctx *ctx = _ctx; 8160 struct lba_range *range; 8161 8162 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 8163 if (range->length == ctx->range.length && 8164 range->offset == ctx->range.offset && 8165 range->locked_ctx == ctx->range.locked_ctx) { 8166 /* This range already exists on this channel, so don't add 8167 * it again. This can happen when a new channel is created 8168 * while the for_each_channel operation is in progress. 8169 * Do not check for outstanding I/O in that case, since the 8170 * range was locked before any I/O could be submitted to the 8171 * new channel. 
8172 */ 8173 spdk_bdev_for_each_channel_continue(i, 0); 8174 return; 8175 } 8176 } 8177 8178 range = calloc(1, sizeof(*range)); 8179 if (range == NULL) { 8180 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 8181 return; 8182 } 8183 8184 range->length = ctx->range.length; 8185 range->offset = ctx->range.offset; 8186 range->locked_ctx = ctx->range.locked_ctx; 8187 ctx->current_range = range; 8188 if (ctx->range.owner_ch == ch) { 8189 /* This is the range object for the channel that will hold 8190 * the lock. Store it in the ctx object so that we can easily 8191 * set its owner_ch after the lock is finally acquired. 8192 */ 8193 ctx->owner_range = range; 8194 } 8195 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 8196 bdev_lock_lba_range_check_io(i); 8197 } 8198 8199 static void 8200 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 8201 { 8202 assert(spdk_get_thread() == spdk_io_channel_get_thread(ctx->range.owner_ch->channel)); 8203 8204 /* We will add a copy of this range to each channel now. */ 8205 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 8206 bdev_lock_lba_range_cb); 8207 } 8208 8209 static bool 8210 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 8211 { 8212 struct lba_range *r; 8213 8214 TAILQ_FOREACH(r, tailq, tailq) { 8215 if (bdev_lba_range_overlapped(range, r)) { 8216 return true; 8217 } 8218 } 8219 return false; 8220 } 8221 8222 static int 8223 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 8224 uint64_t offset, uint64_t length, 8225 lock_range_cb cb_fn, void *cb_arg) 8226 { 8227 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8228 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8229 struct locked_lba_range_ctx *ctx; 8230 8231 if (cb_arg == NULL) { 8232 SPDK_ERRLOG("cb_arg must not be NULL\n"); 8233 return -EINVAL; 8234 } 8235 8236 ctx = calloc(1, sizeof(*ctx)); 8237 if (ctx == NULL) { 8238 return -ENOMEM; 8239 } 8240 8241 ctx->range.offset = offset; 8242 ctx->range.length = length; 8243 ctx->range.owner_ch = ch; 8244 ctx->range.locked_ctx = cb_arg; 8245 ctx->bdev = bdev; 8246 ctx->cb_fn = cb_fn; 8247 ctx->cb_arg = cb_arg; 8248 8249 spdk_spin_lock(&bdev->internal.spinlock); 8250 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 8251 /* There is an active lock overlapping with this range. 8252 * Put it on the pending list until this range no 8253 * longer overlaps with another. 8254 */ 8255 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 8256 } else { 8257 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 8258 bdev_lock_lba_range_ctx(bdev, ctx); 8259 } 8260 spdk_spin_unlock(&bdev->internal.spinlock); 8261 return 0; 8262 } 8263 8264 static void 8265 bdev_lock_lba_range_ctx_msg(void *_ctx) 8266 { 8267 struct locked_lba_range_ctx *ctx = _ctx; 8268 8269 bdev_lock_lba_range_ctx(ctx->bdev, ctx); 8270 } 8271 8272 static void 8273 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8274 { 8275 struct locked_lba_range_ctx *ctx = _ctx; 8276 struct locked_lba_range_ctx *pending_ctx; 8277 struct lba_range *range, *tmp; 8278 8279 spdk_spin_lock(&bdev->internal.spinlock); 8280 /* Check if there are any pending locked ranges that overlap with this range 8281 * that was just unlocked. If there are, check that it doesn't overlap with any 8282 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 8283 * the lock process. 
8284 */ 8285 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 8286 if (bdev_lba_range_overlapped(range, &ctx->range) && 8287 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 8288 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 8289 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 8290 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 8291 spdk_thread_send_msg(spdk_io_channel_get_thread(pending_ctx->range.owner_ch->channel), 8292 bdev_lock_lba_range_ctx_msg, pending_ctx); 8293 } 8294 } 8295 spdk_spin_unlock(&bdev->internal.spinlock); 8296 8297 ctx->cb_fn(ctx->cb_arg, status); 8298 free(ctx); 8299 } 8300 8301 static void 8302 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8303 struct spdk_io_channel *_ch, void *_ctx) 8304 { 8305 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8306 struct locked_lba_range_ctx *ctx = _ctx; 8307 TAILQ_HEAD(, spdk_bdev_io) io_locked; 8308 struct spdk_bdev_io *bdev_io; 8309 struct lba_range *range; 8310 8311 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 8312 if (ctx->range.offset == range->offset && 8313 ctx->range.length == range->length && 8314 ctx->range.locked_ctx == range->locked_ctx) { 8315 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 8316 free(range); 8317 break; 8318 } 8319 } 8320 8321 /* Note: we should almost always be able to assert that the range specified 8322 * was found. But there are some very rare corner cases where a new channel 8323 * gets created simultaneously with a range unlock, where this function 8324 * would execute on that new channel and wouldn't have the range. 8325 * We also use this to clean up range allocations when a later allocation 8326 * fails in the locking path. 8327 * So we can't actually assert() here. 8328 */ 8329 8330 /* Swap the locked IO into a temporary list, and then try to submit them again. 8331 * We could hyper-optimize this to only resubmit locked I/O that overlap 8332 * with the range that was just unlocked, but this isn't a performance path so 8333 * we go for simplicity here. 8334 */ 8335 TAILQ_INIT(&io_locked); 8336 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 8337 while (!TAILQ_EMPTY(&io_locked)) { 8338 bdev_io = TAILQ_FIRST(&io_locked); 8339 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 8340 bdev_io_submit(bdev_io); 8341 } 8342 8343 spdk_bdev_for_each_channel_continue(i, 0); 8344 } 8345 8346 static int 8347 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 8348 uint64_t offset, uint64_t length, 8349 lock_range_cb cb_fn, void *cb_arg) 8350 { 8351 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8352 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8353 struct locked_lba_range_ctx *ctx; 8354 struct lba_range *range; 8355 bool range_found = false; 8356 8357 /* Let's make sure the specified channel actually has a lock on 8358 * the specified range. Note that the range must match exactly. 8359 */ 8360 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 8361 if (range->offset == offset && range->length == length && 8362 range->owner_ch == ch && range->locked_ctx == cb_arg) { 8363 range_found = true; 8364 break; 8365 } 8366 } 8367 8368 if (!range_found) { 8369 return -EINVAL; 8370 } 8371 8372 spdk_spin_lock(&bdev->internal.spinlock); 8373 /* We confirmed that this channel has locked the specified range. 
To 8374 * start the unlock the process, we find the range in the bdev's locked_ranges 8375 * and remove it. This ensures new channels don't inherit the locked range. 8376 * Then we will send a message to each channel (including the one specified 8377 * here) to remove the range from its per-channel list. 8378 */ 8379 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 8380 if (range->offset == offset && range->length == length && 8381 range->locked_ctx == cb_arg) { 8382 break; 8383 } 8384 } 8385 if (range == NULL) { 8386 assert(false); 8387 spdk_spin_unlock(&bdev->internal.spinlock); 8388 return -EINVAL; 8389 } 8390 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 8391 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 8392 spdk_spin_unlock(&bdev->internal.spinlock); 8393 8394 ctx->cb_fn = cb_fn; 8395 ctx->cb_arg = cb_arg; 8396 8397 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 8398 bdev_unlock_lba_range_cb); 8399 return 0; 8400 } 8401 8402 int 8403 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 8404 int array_size) 8405 { 8406 if (!bdev) { 8407 return -EINVAL; 8408 } 8409 8410 if (bdev->fn_table->get_memory_domains) { 8411 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 8412 } 8413 8414 return 0; 8415 } 8416 8417 struct spdk_bdev_for_each_io_ctx { 8418 void *ctx; 8419 spdk_bdev_io_fn fn; 8420 spdk_bdev_for_each_io_cb cb; 8421 }; 8422 8423 static void 8424 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8425 struct spdk_io_channel *io_ch, void *_ctx) 8426 { 8427 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 8428 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 8429 struct spdk_bdev_io *bdev_io; 8430 int rc = 0; 8431 8432 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 8433 rc = ctx->fn(ctx->ctx, bdev_io); 8434 if (rc != 0) { 8435 break; 8436 } 8437 } 8438 8439 spdk_bdev_for_each_channel_continue(i, rc); 8440 } 8441 8442 static void 8443 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 8444 { 8445 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 8446 8447 ctx->cb(ctx->ctx, status); 8448 8449 free(ctx); 8450 } 8451 8452 void 8453 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 8454 spdk_bdev_for_each_io_cb cb) 8455 { 8456 struct spdk_bdev_for_each_io_ctx *ctx; 8457 8458 assert(fn != NULL && cb != NULL); 8459 8460 ctx = calloc(1, sizeof(*ctx)); 8461 if (ctx == NULL) { 8462 SPDK_ERRLOG("Failed to allocate context.\n"); 8463 cb(_ctx, -ENOMEM); 8464 return; 8465 } 8466 8467 ctx->ctx = _ctx; 8468 ctx->fn = fn; 8469 ctx->cb = cb; 8470 8471 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 8472 bdev_for_each_io_done); 8473 } 8474 8475 void 8476 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 8477 { 8478 spdk_for_each_channel_continue(iter->i, status); 8479 } 8480 8481 static struct spdk_bdev * 8482 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 8483 { 8484 void *io_device = spdk_io_channel_iter_get_io_device(i); 8485 8486 return __bdev_from_io_dev(io_device); 8487 } 8488 8489 static void 8490 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 8491 { 8492 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 8493 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 8494 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 8495 8496 iter->i = i; 8497 
iter->fn(iter, bdev, ch, iter->ctx); 8498 } 8499 8500 static void 8501 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 8502 { 8503 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 8504 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 8505 8506 iter->i = i; 8507 iter->cpl(bdev, iter->ctx, status); 8508 8509 free(iter); 8510 } 8511 8512 void 8513 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 8514 void *ctx, spdk_bdev_for_each_channel_done cpl) 8515 { 8516 struct spdk_bdev_channel_iter *iter; 8517 8518 assert(bdev != NULL && fn != NULL && ctx != NULL); 8519 8520 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 8521 if (iter == NULL) { 8522 SPDK_ERRLOG("Unable to allocate iterator\n"); 8523 assert(false); 8524 return; 8525 } 8526 8527 iter->fn = fn; 8528 iter->cpl = cpl; 8529 iter->ctx = ctx; 8530 8531 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 8532 iter, bdev_each_channel_cpl); 8533 } 8534 8535 int 8536 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 8537 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 8538 spdk_bdev_io_completion_cb cb, void *cb_arg) 8539 { 8540 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8541 struct spdk_bdev_io *bdev_io; 8542 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 8543 8544 if (!desc->write) { 8545 return -EBADF; 8546 } 8547 8548 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY))) { 8549 SPDK_DEBUGLOG(bdev, "Copy IO type is not supported\n"); 8550 return -ENOTSUP; 8551 } 8552 8553 if (num_blocks == 0) { 8554 SPDK_ERRLOG("Can't copy 0 blocks\n"); 8555 return -EINVAL; 8556 } 8557 8558 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 8559 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 8560 SPDK_DEBUGLOG(bdev, 8561 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 8562 dst_offset_blocks, src_offset_blocks, num_blocks); 8563 return -EINVAL; 8564 } 8565 8566 bdev_io = bdev_channel_get_io(channel); 8567 if (!bdev_io) { 8568 return -ENOMEM; 8569 } 8570 8571 bdev_io->internal.ch = channel; 8572 bdev_io->internal.desc = desc; 8573 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 8574 8575 bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 8576 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 8577 bdev_io->u.bdev.num_blocks = num_blocks; 8578 bdev_io->u.bdev.ext_opts = NULL; 8579 bdev_io_init(bdev_io, bdev, cb_arg, cb); 8580 8581 bdev_io_submit(bdev_io); 8582 return 0; 8583 } 8584 8585 SPDK_LOG_REGISTER_COMPONENT(bdev) 8586 8587 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 8588 { 8589 struct spdk_trace_tpoint_opts opts[] = { 8590 { 8591 "BDEV_IO_START", TRACE_BDEV_IO_START, 8592 OWNER_BDEV, OBJECT_BDEV_IO, 1, 8593 { 8594 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 8595 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 8596 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 8597 { "len", SPDK_TRACE_ARG_TYPE_INT, 8 }, 8598 { "name", SPDK_TRACE_ARG_TYPE_STR, 40} 8599 } 8600 }, 8601 { 8602 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 8603 OWNER_BDEV, OBJECT_BDEV_IO, 0, 8604 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 8605 }, 8606 { 8607 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 8608 OWNER_BDEV, OBJECT_NONE, 1, 8609 { 8610 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 8611 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 8612 } 8613 }, 8614 { 8615 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 8616 OWNER_BDEV, OBJECT_NONE, 0, 8617 { 
8618 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 8619 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 8620 } 8621 }, 8622 }; 8623 8624 8625 spdk_trace_register_owner(OWNER_BDEV, 'b'); 8626 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 8627 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 8628 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 8629 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 8630 } 8631