1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #include "spdk/bdev.h" 10 11 #include "spdk/config.h" 12 #include "spdk/env.h" 13 #include "spdk/thread.h" 14 #include "spdk/likely.h" 15 #include "spdk/queue.h" 16 #include "spdk/nvme_spec.h" 17 #include "spdk/scsi_spec.h" 18 #include "spdk/notify.h" 19 #include "spdk/util.h" 20 #include "spdk/trace.h" 21 #include "spdk/dma.h" 22 23 #include "spdk/bdev_module.h" 24 #include "spdk/log.h" 25 #include "spdk/string.h" 26 27 #include "bdev_internal.h" 28 #include "spdk_internal/trace_defs.h" 29 30 #ifdef SPDK_CONFIG_VTUNE 31 #include "ittnotify.h" 32 #include "ittnotify_types.h" 33 int __itt_init_ittlib(const char *, __itt_group_id); 34 #endif 35 36 #define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1) 37 #define SPDK_BDEV_IO_CACHE_SIZE 256 38 #define SPDK_BDEV_AUTO_EXAMINE true 39 #define BUF_SMALL_POOL_SIZE 8191 40 #define BUF_LARGE_POOL_SIZE 1023 41 #define BUF_SMALL_CACHE_SIZE 128 42 #define BUF_LARGE_CACHE_SIZE 16 43 #define NOMEM_THRESHOLD_COUNT 8 44 45 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 46 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 47 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512 48 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000 49 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024) 50 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX 51 #define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000 52 53 /* The maximum number of children requests for a UNMAP or WRITE ZEROES command 54 * when splitting into children requests at a time. 55 */ 56 #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8) 57 #define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000 58 59 /* The maximum number of children requests for a COPY command 60 * when splitting into children requests at a time. 61 */ 62 #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8) 63 64 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 65 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 66 }; 67 68 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 69 70 RB_HEAD(bdev_name_tree, spdk_bdev_name); 71 72 static int 73 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2) 74 { 75 return strcmp(name1->name, name2->name); 76 } 77 78 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp); 79 80 struct spdk_bdev_mgr { 81 struct spdk_mempool *bdev_io_pool; 82 83 void *zero_buffer; 84 85 TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 86 87 struct spdk_bdev_list bdevs; 88 struct bdev_name_tree bdev_names; 89 90 bool init_complete; 91 bool module_init_complete; 92 93 struct spdk_spinlock spinlock; 94 95 #ifdef SPDK_CONFIG_VTUNE 96 __itt_domain *domain; 97 #endif 98 }; 99 100 static struct spdk_bdev_mgr g_bdev_mgr = { 101 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 102 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 103 .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names), 104 .init_complete = false, 105 .module_init_complete = false, 106 }; 107 108 static void 109 __attribute__((constructor)) 110 _bdev_init(void) 111 { 112 spdk_spin_init(&g_bdev_mgr.spinlock); 113 } 114 115 typedef void (*lock_range_cb)(void *ctx, int status); 116 117 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc); 118 119 struct lba_range { 120 uint64_t offset; 121 uint64_t length; 122 void *locked_ctx; 123 struct spdk_bdev_channel *owner_ch; 124 TAILQ_ENTRY(lba_range) tailq; 125 }; 126 127 static struct spdk_bdev_opts g_bdev_opts = { 128 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 129 .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, 130 .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, 131 .small_buf_pool_size = BUF_SMALL_POOL_SIZE, 132 .large_buf_pool_size = BUF_LARGE_POOL_SIZE, 133 }; 134 135 static spdk_bdev_init_cb g_init_cb_fn = NULL; 136 static void *g_init_cb_arg = NULL; 137 138 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 139 static void *g_fini_cb_arg = NULL; 140 static struct spdk_thread *g_fini_thread = NULL; 141 142 struct spdk_bdev_qos_limit { 143 /** IOs or bytes allowed per second (i.e., 1s). */ 144 uint64_t limit; 145 146 /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). 147 * For remaining bytes, allowed to run negative if an I/O is submitted when 148 * some bytes are remaining, but the I/O is bigger than that amount. The 149 * excess will be deducted from the next timeslice. 150 */ 151 int64_t remaining_this_timeslice; 152 153 /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 154 uint32_t min_per_timeslice; 155 156 /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 157 uint32_t max_per_timeslice; 158 159 /** Function to check whether to queue the IO. */ 160 bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 161 162 /** Function to update for the submitted IO. */ 163 void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 164 }; 165 166 struct spdk_bdev_qos { 167 /** Types of structure of rate limits. */ 168 struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 169 170 /** The channel that all I/O are funneled through. */ 171 struct spdk_bdev_channel *ch; 172 173 /** The thread on which the poller is running. */ 174 struct spdk_thread *thread; 175 176 /** Queue of I/O waiting to be issued. */ 177 bdev_io_tailq_t queued; 178 179 /** Size of a timeslice in tsc ticks. */ 180 uint64_t timeslice_size; 181 182 /** Timestamp of start of last timeslice. */ 183 uint64_t last_timeslice; 184 185 /** Poller that processes queued I/O commands each time slice. */ 186 struct spdk_poller *poller; 187 }; 188 189 struct spdk_bdev_mgmt_channel { 190 /* 191 * Each thread keeps a cache of bdev_io - this allows 192 * bdev threads which are *not* DPDK threads to still 193 * benefit from a per-thread bdev_io cache. Without 194 * this, non-DPDK threads fetching from the mempool 195 * incur a cmpxchg on get and put. 196 */ 197 bdev_io_stailq_t per_thread_cache; 198 uint32_t per_thread_cache_count; 199 uint32_t bdev_io_cache_size; 200 201 struct spdk_iobuf_channel iobuf; 202 203 TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; 204 TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; 205 }; 206 207 /* 208 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device 209 * will queue here their IO that awaits retry. It makes it possible to retry sending 210 * IO to one bdev after IO from other bdev completes. 211 */ 212 struct spdk_bdev_shared_resource { 213 /* The bdev management channel */ 214 struct spdk_bdev_mgmt_channel *mgmt_ch; 215 216 /* 217 * Count of I/O submitted to bdev module and waiting for completion. 218 * Incremented before submit_request() is called on an spdk_bdev_io. 219 */ 220 uint64_t io_outstanding; 221 222 /* 223 * Queue of IO awaiting retry because of a previous NOMEM status returned 224 * on this channel. 225 */ 226 bdev_io_tailq_t nomem_io; 227 228 /* 229 * Threshold which io_outstanding must drop to before retrying nomem_io. 230 */ 231 uint64_t nomem_threshold; 232 233 /* I/O channel allocated by a bdev module */ 234 struct spdk_io_channel *shared_ch; 235 236 /* Refcount of bdev channels using this resource */ 237 uint32_t ref; 238 239 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 240 }; 241 242 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 243 #define BDEV_CH_QOS_ENABLED (1 << 1) 244 245 struct spdk_bdev_channel { 246 struct spdk_bdev *bdev; 247 248 /* The channel for the underlying device */ 249 struct spdk_io_channel *channel; 250 251 /* Per io_device per thread data */ 252 struct spdk_bdev_shared_resource *shared_resource; 253 254 struct spdk_bdev_io_stat *stat; 255 256 /* 257 * Count of I/O submitted to the underlying dev module through this channel 258 * and waiting for completion. 259 */ 260 uint64_t io_outstanding; 261 262 /* 263 * List of all submitted I/Os including I/O that are generated via splitting. 264 */ 265 bdev_io_tailq_t io_submitted; 266 267 /* 268 * List of spdk_bdev_io that are currently queued because they write to a locked 269 * LBA range. 270 */ 271 bdev_io_tailq_t io_locked; 272 273 uint32_t flags; 274 275 struct spdk_histogram_data *histogram; 276 277 #ifdef SPDK_CONFIG_VTUNE 278 uint64_t start_tsc; 279 uint64_t interval_tsc; 280 __itt_string_handle *handle; 281 struct spdk_bdev_io_stat *prev_stat; 282 #endif 283 284 bdev_io_tailq_t queued_resets; 285 286 lba_range_tailq_t locked_ranges; 287 }; 288 289 struct media_event_entry { 290 struct spdk_bdev_media_event event; 291 TAILQ_ENTRY(media_event_entry) tailq; 292 }; 293 294 #define MEDIA_EVENT_POOL_SIZE 64 295 296 struct spdk_bdev_desc { 297 struct spdk_bdev *bdev; 298 struct spdk_thread *thread; 299 struct { 300 spdk_bdev_event_cb_t event_fn; 301 void *ctx; 302 } callback; 303 bool closed; 304 bool write; 305 bool memory_domains_supported; 306 struct spdk_spinlock spinlock; 307 uint32_t refs; 308 TAILQ_HEAD(, media_event_entry) pending_media_events; 309 TAILQ_HEAD(, media_event_entry) free_media_events; 310 struct media_event_entry *media_events_buffer; 311 TAILQ_ENTRY(spdk_bdev_desc) link; 312 313 uint64_t timeout_in_sec; 314 spdk_bdev_io_timeout_cb cb_fn; 315 void *cb_arg; 316 struct spdk_poller *io_timeout_poller; 317 }; 318 319 struct spdk_bdev_iostat_ctx { 320 struct spdk_bdev_io_stat *stat; 321 spdk_bdev_get_device_stat_cb cb; 322 void *cb_arg; 323 }; 324 325 struct set_qos_limit_ctx { 326 void (*cb_fn)(void *cb_arg, int status); 327 void *cb_arg; 328 struct spdk_bdev *bdev; 329 }; 330 331 struct spdk_bdev_channel_iter { 332 spdk_bdev_for_each_channel_msg fn; 333 spdk_bdev_for_each_channel_done cpl; 334 struct spdk_io_channel_iter *i; 335 void *ctx; 336 }; 337 338 struct spdk_bdev_io_error_stat { 339 uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS]; 340 }; 341 342 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 343 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 344 #define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch)) 345 #define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch)) 346 347 static inline void bdev_io_complete(void *ctx); 348 349 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 350 static void bdev_write_zero_buffer_next(void *_bdev_io); 351 352 static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 353 struct spdk_io_channel *ch, void *_ctx); 354 static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status); 355 356 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 357 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 358 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 359 struct spdk_bdev_ext_io_opts *opts, bool copy_opts); 360 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 361 struct iovec *iov, int iovcnt, void *md_buf, 362 uint64_t offset_blocks, uint64_t num_blocks, 363 spdk_bdev_io_completion_cb cb, void *cb_arg, 364 struct spdk_bdev_ext_io_opts *opts, bool copy_opts); 365 366 static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 367 uint64_t offset, uint64_t length, 368 lock_range_cb cb_fn, void *cb_arg); 369 370 static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 371 uint64_t offset, uint64_t length, 372 lock_range_cb cb_fn, void *cb_arg); 373 374 static inline void bdev_io_complete(void *ctx); 375 376 static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort); 377 static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort); 378 379 void 380 spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size) 381 { 382 if (!opts) { 383 SPDK_ERRLOG("opts should not be NULL\n"); 384 return; 385 } 386 387 if (!opts_size) { 388 SPDK_ERRLOG("opts_size should not be zero value\n"); 389 return; 390 } 391 392 opts->opts_size = opts_size; 393 394 #define SET_FIELD(field) \ 395 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \ 396 opts->field = g_bdev_opts.field; \ 397 } \ 398 399 SET_FIELD(bdev_io_pool_size); 400 SET_FIELD(bdev_io_cache_size); 401 SET_FIELD(bdev_auto_examine); 402 SET_FIELD(small_buf_pool_size); 403 SET_FIELD(large_buf_pool_size); 404 405 /* Do not remove this statement, you should always update this statement when you adding a new field, 406 * and do not forget to add the SET_FIELD statement for your added field. */ 407 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size"); 408 409 #undef SET_FIELD 410 } 411 412 SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_small_buf_pool_size, "spdk_bdev_opts.small_buf_pool_size", 413 "v23.05", 0); 414 SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_large_buf_pool_size, "spdk_bdev_opts.large_buf_pool_size", 415 "v23.05", 0); 416 int 417 spdk_bdev_set_opts(struct spdk_bdev_opts *opts) 418 { 419 struct spdk_iobuf_opts iobuf_opts; 420 uint32_t min_pool_size; 421 int rc; 422 423 if (!opts) { 424 SPDK_ERRLOG("opts cannot be NULL\n"); 425 return -1; 426 } 427 428 if (!opts->opts_size) { 429 SPDK_ERRLOG("opts_size inside opts cannot be zero value\n"); 430 return -1; 431 } 432 433 /* 434 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem 435 * initialization. A second mgmt_ch will be created on the same thread when the application starts 436 * but before the deferred put_io_channel event is executed for the first mgmt_ch. 437 */ 438 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 439 if (opts->bdev_io_pool_size < min_pool_size) { 440 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 441 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 442 spdk_thread_get_count()); 443 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 444 return -1; 445 } 446 447 if (opts->small_buf_pool_size != BUF_SMALL_POOL_SIZE) { 448 SPDK_LOG_DEPRECATED(bdev_opts_small_buf_pool_size); 449 } 450 if (opts->large_buf_pool_size != BUF_LARGE_POOL_SIZE) { 451 SPDK_LOG_DEPRECATED(bdev_opts_large_buf_pool_size); 452 } 453 454 #define SET_FIELD(field) \ 455 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 456 g_bdev_opts.field = opts->field; \ 457 } \ 458 459 SET_FIELD(bdev_io_pool_size); 460 SET_FIELD(bdev_io_cache_size); 461 SET_FIELD(bdev_auto_examine); 462 SET_FIELD(small_buf_pool_size); 463 SET_FIELD(large_buf_pool_size); 464 465 spdk_iobuf_get_opts(&iobuf_opts); 466 iobuf_opts.small_pool_count = opts->small_buf_pool_size; 467 iobuf_opts.large_pool_count = opts->large_buf_pool_size; 468 469 rc = spdk_iobuf_set_opts(&iobuf_opts); 470 if (rc != 0) { 471 SPDK_ERRLOG("Failed to set iobuf opts\n"); 472 return -1; 473 } 474 475 g_bdev_opts.opts_size = opts->opts_size; 476 477 #undef SET_FIELD 478 479 return 0; 480 } 481 482 static struct spdk_bdev * 483 bdev_get_by_name(const char *bdev_name) 484 { 485 struct spdk_bdev_name find; 486 struct spdk_bdev_name *res; 487 488 find.name = (char *)bdev_name; 489 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 490 if (res != NULL) { 491 return res->bdev; 492 } 493 494 return NULL; 495 } 496 497 struct spdk_bdev * 498 spdk_bdev_get_by_name(const char *bdev_name) 499 { 500 struct spdk_bdev *bdev; 501 502 spdk_spin_lock(&g_bdev_mgr.spinlock); 503 bdev = bdev_get_by_name(bdev_name); 504 spdk_spin_unlock(&g_bdev_mgr.spinlock); 505 506 return bdev; 507 } 508 509 struct bdev_io_status_string { 510 enum spdk_bdev_io_status status; 511 const char *str; 512 }; 513 514 static const struct bdev_io_status_string bdev_io_status_strings[] = { 515 { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" }, 516 { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" }, 517 { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" }, 518 { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" }, 519 { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" }, 520 { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" }, 521 { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" }, 522 { SPDK_BDEV_IO_STATUS_FAILED, "failed" }, 523 { SPDK_BDEV_IO_STATUS_PENDING, "pending" }, 524 { SPDK_BDEV_IO_STATUS_SUCCESS, "success" }, 525 }; 526 527 static const char * 528 bdev_io_status_get_string(enum spdk_bdev_io_status status) 529 { 530 uint32_t i; 531 532 for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) { 533 if (bdev_io_status_strings[i].status == status) { 534 return bdev_io_status_strings[i].str; 535 } 536 } 537 538 return "reserved"; 539 } 540 541 struct spdk_bdev_wait_for_examine_ctx { 542 struct spdk_poller *poller; 543 spdk_bdev_wait_for_examine_cb cb_fn; 544 void *cb_arg; 545 }; 546 547 static bool bdev_module_all_actions_completed(void); 548 549 static int 550 bdev_wait_for_examine_cb(void *arg) 551 { 552 struct spdk_bdev_wait_for_examine_ctx *ctx = arg; 553 554 if (!bdev_module_all_actions_completed()) { 555 return SPDK_POLLER_IDLE; 556 } 557 558 spdk_poller_unregister(&ctx->poller); 559 ctx->cb_fn(ctx->cb_arg); 560 free(ctx); 561 562 return SPDK_POLLER_BUSY; 563 } 564 565 int 566 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 567 { 568 struct spdk_bdev_wait_for_examine_ctx *ctx; 569 570 ctx = calloc(1, sizeof(*ctx)); 571 if (ctx == NULL) { 572 return -ENOMEM; 573 } 574 ctx->cb_fn = cb_fn; 575 ctx->cb_arg = cb_arg; 576 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 577 578 return 0; 579 } 580 581 struct spdk_bdev_examine_item { 582 char *name; 583 TAILQ_ENTRY(spdk_bdev_examine_item) link; 584 }; 585 586 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); 587 588 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 589 g_bdev_examine_allowlist); 590 591 static inline bool 592 bdev_examine_allowlist_check(const char *name) 593 { 594 struct spdk_bdev_examine_item *item; 595 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 596 if (strcmp(name, item->name) == 0) { 597 return true; 598 } 599 } 600 return false; 601 } 602 603 static inline void 604 bdev_examine_allowlist_free(void) 605 { 606 struct spdk_bdev_examine_item *item; 607 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 608 item = TAILQ_FIRST(&g_bdev_examine_allowlist); 609 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 610 free(item->name); 611 free(item); 612 } 613 } 614 615 static inline bool 616 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 617 { 618 struct spdk_bdev_alias *tmp; 619 if (bdev_examine_allowlist_check(bdev->name)) { 620 return true; 621 } 622 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 623 if (bdev_examine_allowlist_check(tmp->alias.name)) { 624 return true; 625 } 626 } 627 return false; 628 } 629 630 static inline bool 631 bdev_ok_to_examine(struct spdk_bdev *bdev) 632 { 633 if (g_bdev_opts.bdev_auto_examine) { 634 return true; 635 } else { 636 return bdev_in_examine_allowlist(bdev); 637 } 638 } 639 640 static void 641 bdev_examine(struct spdk_bdev *bdev) 642 { 643 struct spdk_bdev_module *module; 644 uint32_t action; 645 646 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 647 if (module->examine_config && bdev_ok_to_examine(bdev)) { 648 action = module->internal.action_in_progress; 649 module->internal.action_in_progress++; 650 module->examine_config(bdev); 651 if (action != module->internal.action_in_progress) { 652 SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n", 653 module->name); 654 } 655 } 656 } 657 658 if (bdev->internal.claim_module && bdev_ok_to_examine(bdev)) { 659 if (bdev->internal.claim_module->examine_disk) { 660 bdev->internal.claim_module->internal.action_in_progress++; 661 bdev->internal.claim_module->examine_disk(bdev); 662 } 663 return; 664 } 665 666 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 667 if (module->examine_disk && bdev_ok_to_examine(bdev)) { 668 module->internal.action_in_progress++; 669 module->examine_disk(bdev); 670 } 671 } 672 } 673 674 int 675 spdk_bdev_examine(const char *name) 676 { 677 struct spdk_bdev *bdev; 678 struct spdk_bdev_examine_item *item; 679 680 if (g_bdev_opts.bdev_auto_examine) { 681 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); 682 return -EINVAL; 683 } 684 685 if (bdev_examine_allowlist_check(name)) { 686 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 687 return -EEXIST; 688 } 689 690 item = calloc(1, sizeof(*item)); 691 if (!item) { 692 return -ENOMEM; 693 } 694 item->name = strdup(name); 695 if (!item->name) { 696 free(item); 697 return -ENOMEM; 698 } 699 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 700 701 bdev = spdk_bdev_get_by_name(name); 702 if (bdev) { 703 bdev_examine(bdev); 704 } 705 return 0; 706 } 707 708 static inline void 709 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 710 { 711 struct spdk_bdev_examine_item *item; 712 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 713 spdk_json_write_object_begin(w); 714 spdk_json_write_named_string(w, "method", "bdev_examine"); 715 spdk_json_write_named_object_begin(w, "params"); 716 spdk_json_write_named_string(w, "name", item->name); 717 spdk_json_write_object_end(w); 718 spdk_json_write_object_end(w); 719 } 720 } 721 722 struct spdk_bdev * 723 spdk_bdev_first(void) 724 { 725 struct spdk_bdev *bdev; 726 727 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 728 if (bdev) { 729 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 730 } 731 732 return bdev; 733 } 734 735 struct spdk_bdev * 736 spdk_bdev_next(struct spdk_bdev *prev) 737 { 738 struct spdk_bdev *bdev; 739 740 bdev = TAILQ_NEXT(prev, internal.link); 741 if (bdev) { 742 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 743 } 744 745 return bdev; 746 } 747 748 static struct spdk_bdev * 749 _bdev_next_leaf(struct spdk_bdev *bdev) 750 { 751 while (bdev != NULL) { 752 if (bdev->internal.claim_module == NULL) { 753 return bdev; 754 } else { 755 bdev = TAILQ_NEXT(bdev, internal.link); 756 } 757 } 758 759 return bdev; 760 } 761 762 struct spdk_bdev * 763 spdk_bdev_first_leaf(void) 764 { 765 struct spdk_bdev *bdev; 766 767 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 768 769 if (bdev) { 770 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 771 } 772 773 return bdev; 774 } 775 776 struct spdk_bdev * 777 spdk_bdev_next_leaf(struct spdk_bdev *prev) 778 { 779 struct spdk_bdev *bdev; 780 781 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 782 783 if (bdev) { 784 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 785 } 786 787 return bdev; 788 } 789 790 static inline bool 791 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 792 { 793 return bdev_io->internal.ext_opts && bdev_io->internal.ext_opts->memory_domain; 794 } 795 796 void 797 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 798 { 799 struct iovec *iovs; 800 801 if (bdev_io->u.bdev.iovs == NULL) { 802 bdev_io->u.bdev.iovs = &bdev_io->iov; 803 bdev_io->u.bdev.iovcnt = 1; 804 } 805 806 iovs = bdev_io->u.bdev.iovs; 807 808 assert(iovs != NULL); 809 assert(bdev_io->u.bdev.iovcnt >= 1); 810 811 iovs[0].iov_base = buf; 812 iovs[0].iov_len = len; 813 } 814 815 void 816 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 817 { 818 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 819 bdev_io->u.bdev.md_buf = md_buf; 820 } 821 822 static bool 823 _is_buf_allocated(const struct iovec *iovs) 824 { 825 if (iovs == NULL) { 826 return false; 827 } 828 829 return iovs[0].iov_base != NULL; 830 } 831 832 static bool 833 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 834 { 835 int i; 836 uintptr_t iov_base; 837 838 if (spdk_likely(alignment == 1)) { 839 return true; 840 } 841 842 for (i = 0; i < iovcnt; i++) { 843 iov_base = (uintptr_t)iovs[i].iov_base; 844 if ((iov_base & (alignment - 1)) != 0) { 845 return false; 846 } 847 } 848 849 return true; 850 } 851 852 static void 853 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 854 { 855 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 856 void *buf; 857 858 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 859 buf = bdev_io->internal.buf; 860 bdev_io->internal.buf = NULL; 861 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 862 bdev_io->internal.get_aux_buf_cb = NULL; 863 } else { 864 assert(bdev_io->internal.get_buf_cb != NULL); 865 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 866 bdev_io->internal.get_buf_cb = NULL; 867 } 868 } 869 870 static void 871 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 872 { 873 struct spdk_bdev_io *bdev_io = ctx; 874 875 if (rc) { 876 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 877 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 878 } 879 bdev_io_get_buf_complete(bdev_io, !rc); 880 } 881 882 static void 883 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 884 { 885 int rc = 0; 886 887 /* save original md_buf */ 888 bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 889 bdev_io->internal.orig_md_iov.iov_len = len; 890 bdev_io->internal.bounce_md_iov.iov_base = md_buf; 891 bdev_io->internal.bounce_md_iov.iov_len = len; 892 /* set bounce md_buf */ 893 bdev_io->u.bdev.md_buf = md_buf; 894 895 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 896 if (bdev_io_use_memory_domain(bdev_io)) { 897 rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain, 898 bdev_io->internal.ext_opts->memory_domain_ctx, 899 &bdev_io->internal.orig_md_iov, 1, 900 &bdev_io->internal.bounce_md_iov, 1, 901 bdev_io->internal.data_transfer_cpl, 902 bdev_io); 903 if (rc == 0) { 904 /* Continue to submit IO in completion callback */ 905 return; 906 } 907 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 908 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain), rc); 909 } else { 910 memcpy(md_buf, bdev_io->internal.orig_md_iov.iov_base, bdev_io->internal.orig_md_iov.iov_len); 911 } 912 } 913 914 assert(bdev_io->internal.data_transfer_cpl); 915 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 916 } 917 918 static void 919 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 920 { 921 struct spdk_bdev *bdev = bdev_io->bdev; 922 uint64_t md_len; 923 void *buf; 924 925 if (spdk_bdev_is_md_separate(bdev)) { 926 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 927 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 928 929 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 930 931 if (bdev_io->u.bdev.md_buf != NULL) { 932 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 933 return; 934 } else { 935 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 936 } 937 } 938 939 bdev_io_get_buf_complete(bdev_io, true); 940 } 941 942 static void 943 _bdev_io_pull_bounce_data_buf_done(void *ctx, int rc) 944 { 945 struct spdk_bdev_io *bdev_io = ctx; 946 947 if (rc) { 948 SPDK_ERRLOG("Failed to get data buffer\n"); 949 assert(bdev_io->internal.data_transfer_cpl); 950 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 951 return; 952 } 953 954 _bdev_io_set_md_buf(bdev_io); 955 } 956 957 static void 958 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 959 bdev_copy_bounce_buffer_cpl cpl_cb) 960 { 961 int rc = 0; 962 963 bdev_io->internal.data_transfer_cpl = cpl_cb; 964 /* save original iovec */ 965 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 966 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 967 /* set bounce iov */ 968 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 969 bdev_io->u.bdev.iovcnt = 1; 970 /* set bounce buffer for this operation */ 971 bdev_io->u.bdev.iovs[0].iov_base = buf; 972 bdev_io->u.bdev.iovs[0].iov_len = len; 973 /* if this is write path, copy data from original buffer to bounce buffer */ 974 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 975 if (bdev_io_use_memory_domain(bdev_io)) { 976 rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain, 977 bdev_io->internal.ext_opts->memory_domain_ctx, 978 bdev_io->internal.orig_iovs, 979 (uint32_t) bdev_io->internal.orig_iovcnt, 980 bdev_io->u.bdev.iovs, 1, 981 _bdev_io_pull_bounce_data_buf_done, 982 bdev_io); 983 if (rc == 0) { 984 /* Continue to submit IO in completion callback */ 985 return; 986 } 987 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 988 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 989 } else { 990 spdk_copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt); 991 } 992 } 993 994 _bdev_io_pull_bounce_data_buf_done(bdev_io, rc); 995 } 996 997 static void 998 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) 999 { 1000 struct spdk_bdev *bdev = bdev_io->bdev; 1001 bool buf_allocated; 1002 uint64_t alignment; 1003 void *aligned_buf; 1004 1005 bdev_io->internal.buf = buf; 1006 1007 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1008 bdev_io_get_buf_complete(bdev_io, true); 1009 return; 1010 } 1011 1012 alignment = spdk_bdev_get_buf_align(bdev); 1013 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 1014 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 1015 1016 if (buf_allocated) { 1017 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 1018 /* Continue in completion callback */ 1019 return; 1020 } else { 1021 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 1022 } 1023 1024 _bdev_io_set_md_buf(bdev_io); 1025 } 1026 1027 static inline uint64_t 1028 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len) 1029 { 1030 struct spdk_bdev *bdev = bdev_io->bdev; 1031 uint64_t md_len, alignment; 1032 1033 md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1034 alignment = spdk_bdev_get_buf_align(bdev); 1035 1036 return len + alignment + md_len; 1037 } 1038 1039 static void 1040 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 1041 { 1042 struct spdk_bdev_mgmt_channel *ch; 1043 1044 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1045 spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len)); 1046 } 1047 1048 static void 1049 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 1050 { 1051 assert(bdev_io->internal.buf != NULL); 1052 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len); 1053 bdev_io->internal.buf = NULL; 1054 } 1055 1056 void 1057 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1058 { 1059 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1060 1061 assert(buf != NULL); 1062 _bdev_io_put_buf(bdev_io, buf, len); 1063 } 1064 1065 static void 1066 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 1067 { 1068 struct spdk_bdev *bdev = bdev_ch->bdev; 1069 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1070 struct spdk_bdev_io *bdev_io; 1071 1072 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 1073 /* 1074 * Allow some more I/O to complete before retrying the nomem_io queue. 1075 * Some drivers (such as nvme) cannot immediately take a new I/O in 1076 * the context of a completion, because the resources for the I/O are 1077 * not released until control returns to the bdev poller. Also, we 1078 * may require several small I/O to complete before a larger I/O 1079 * (that requires splitting) can be submitted. 1080 */ 1081 return; 1082 } 1083 1084 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1085 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 1086 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 1087 bdev_io->internal.ch->io_outstanding++; 1088 shared_resource->io_outstanding++; 1089 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1090 bdev_io->internal.error.nvme.cdw0 = 0; 1091 bdev_io->num_retries++; 1092 bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 1093 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 1094 break; 1095 } 1096 } 1097 } 1098 1099 static inline void 1100 _bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1101 struct spdk_bdev_shared_resource *shared_resource) 1102 { 1103 assert(bdev_ch->io_outstanding > 0); 1104 assert(shared_resource->io_outstanding > 0); 1105 bdev_ch->io_outstanding--; 1106 shared_resource->io_outstanding--; 1107 } 1108 1109 static inline bool 1110 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io) 1111 { 1112 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1113 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1114 1115 if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) { 1116 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 1117 /* 1118 * Wait for some of the outstanding I/O to complete before we 1119 * retry any of the nomem_io. Normally we will wait for 1120 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 1121 * depth channels we will instead wait for half to complete. 1122 */ 1123 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 1124 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 1125 return true; 1126 } 1127 1128 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1129 bdev_ch_retry_io(bdev_ch); 1130 } 1131 1132 return false; 1133 } 1134 1135 static void 1136 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1137 { 1138 struct spdk_bdev_io *bdev_io = ctx; 1139 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1140 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1141 1142 if (rc) { 1143 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1144 } 1145 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1146 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). 1147 */ 1148 bdev_io_put_buf(bdev_io); 1149 1150 /* Continue with IO completion flow */ 1151 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 1152 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 1153 return; 1154 } 1155 1156 bdev_io_complete(bdev_io); 1157 } 1158 1159 static inline void 1160 _bdev_io_push_bounce_md_buffer(struct spdk_bdev_io *bdev_io) 1161 { 1162 int rc = 0; 1163 1164 /* do the same for metadata buffer */ 1165 if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) { 1166 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1167 1168 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1169 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 1170 if (bdev_io_use_memory_domain(bdev_io)) { 1171 /* If memory domain is used then we need to call async push function */ 1172 rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain, 1173 bdev_io->internal.ext_opts->memory_domain_ctx, 1174 &bdev_io->internal.orig_md_iov, 1175 (uint32_t)bdev_io->internal.orig_iovcnt, 1176 &bdev_io->internal.bounce_md_iov, 1, 1177 bdev_io->internal.data_transfer_cpl, 1178 bdev_io); 1179 if (rc == 0) { 1180 /* Continue IO completion in async callback */ 1181 return; 1182 } 1183 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1184 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 1185 } else { 1186 memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1187 bdev_io->internal.orig_md_iov.iov_len); 1188 } 1189 } 1190 } 1191 1192 assert(bdev_io->internal.data_transfer_cpl); 1193 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1194 } 1195 1196 static void 1197 _bdev_io_push_bounce_data_buffer_done(void *ctx, int rc) 1198 { 1199 struct spdk_bdev_io *bdev_io = ctx; 1200 1201 assert(bdev_io->internal.data_transfer_cpl); 1202 1203 if (rc) { 1204 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1205 return; 1206 } 1207 1208 /* set original buffer for this io */ 1209 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 1210 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 1211 /* disable bouncing buffer for this io */ 1212 bdev_io->internal.orig_iovcnt = 0; 1213 bdev_io->internal.orig_iovs = NULL; 1214 1215 _bdev_io_push_bounce_md_buffer(bdev_io); 1216 } 1217 1218 static inline void 1219 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1220 { 1221 int rc = 0; 1222 1223 bdev_io->internal.data_transfer_cpl = cpl_cb; 1224 1225 /* if this is read path, copy data from bounce buffer to original buffer */ 1226 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1227 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 1228 if (bdev_io_use_memory_domain(bdev_io)) { 1229 /* If memory domain is used then we need to call async push function */ 1230 rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain, 1231 bdev_io->internal.ext_opts->memory_domain_ctx, 1232 bdev_io->internal.orig_iovs, 1233 (uint32_t)bdev_io->internal.orig_iovcnt, 1234 &bdev_io->internal.bounce_iov, 1, 1235 _bdev_io_push_bounce_data_buffer_done, 1236 bdev_io); 1237 if (rc == 0) { 1238 /* Continue IO completion in async callback */ 1239 return; 1240 } 1241 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1242 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 1243 } else { 1244 spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs, 1245 bdev_io->internal.orig_iovcnt, 1246 bdev_io->internal.bounce_iov.iov_base, 1247 bdev_io->internal.bounce_iov.iov_len); 1248 } 1249 } 1250 1251 _bdev_io_push_bounce_data_buffer_done(bdev_io, rc); 1252 } 1253 1254 static void 1255 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1256 { 1257 struct spdk_bdev_io *bdev_io; 1258 1259 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1260 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len); 1261 } 1262 1263 static void 1264 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1265 { 1266 struct spdk_bdev_mgmt_channel *mgmt_ch; 1267 uint64_t max_len; 1268 void *buf; 1269 1270 assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1271 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1272 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1273 1274 if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) { 1275 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1276 bdev_io_get_buf_complete(bdev_io, false); 1277 return; 1278 } 1279 1280 bdev_io->internal.buf_len = len; 1281 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1282 bdev_io_get_iobuf_cb); 1283 if (buf != NULL) { 1284 _bdev_io_set_buf(bdev_io, buf, len); 1285 } 1286 } 1287 1288 void 1289 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1290 { 1291 struct spdk_bdev *bdev = bdev_io->bdev; 1292 uint64_t alignment; 1293 1294 assert(cb != NULL); 1295 bdev_io->internal.get_buf_cb = cb; 1296 1297 alignment = spdk_bdev_get_buf_align(bdev); 1298 1299 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1300 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1301 /* Buffer already present and aligned */ 1302 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1303 return; 1304 } 1305 1306 bdev_io_get_buf(bdev_io, len); 1307 } 1308 1309 static void 1310 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1311 bool success) 1312 { 1313 if (!success) { 1314 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1315 bdev_io_complete(bdev_io); 1316 } else { 1317 bdev_io_submit(bdev_io); 1318 } 1319 } 1320 1321 static void 1322 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1323 uint64_t len) 1324 { 1325 assert(cb != NULL); 1326 bdev_io->internal.get_buf_cb = cb; 1327 1328 bdev_io_get_buf(bdev_io, len); 1329 } 1330 1331 void 1332 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1333 { 1334 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1335 1336 assert(cb != NULL); 1337 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1338 bdev_io->internal.get_aux_buf_cb = cb; 1339 bdev_io_get_buf(bdev_io, len); 1340 } 1341 1342 static int 1343 bdev_module_get_max_ctx_size(void) 1344 { 1345 struct spdk_bdev_module *bdev_module; 1346 int max_bdev_module_size = 0; 1347 1348 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1349 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1350 max_bdev_module_size = bdev_module->get_ctx_size(); 1351 } 1352 } 1353 1354 return max_bdev_module_size; 1355 } 1356 1357 static void 1358 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1359 { 1360 int i; 1361 struct spdk_bdev_qos *qos = bdev->internal.qos; 1362 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1363 1364 if (!qos) { 1365 return; 1366 } 1367 1368 spdk_bdev_get_qos_rate_limits(bdev, limits); 1369 1370 spdk_json_write_object_begin(w); 1371 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1372 1373 spdk_json_write_named_object_begin(w, "params"); 1374 spdk_json_write_named_string(w, "name", bdev->name); 1375 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1376 if (limits[i] > 0) { 1377 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1378 } 1379 } 1380 spdk_json_write_object_end(w); 1381 1382 spdk_json_write_object_end(w); 1383 } 1384 1385 void 1386 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1387 { 1388 struct spdk_bdev_module *bdev_module; 1389 struct spdk_bdev *bdev; 1390 1391 assert(w != NULL); 1392 1393 spdk_json_write_array_begin(w); 1394 1395 spdk_json_write_object_begin(w); 1396 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1397 spdk_json_write_named_object_begin(w, "params"); 1398 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1399 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1400 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1401 spdk_json_write_object_end(w); 1402 spdk_json_write_object_end(w); 1403 1404 bdev_examine_allowlist_config_json(w); 1405 1406 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1407 if (bdev_module->config_json) { 1408 bdev_module->config_json(w); 1409 } 1410 } 1411 1412 spdk_spin_lock(&g_bdev_mgr.spinlock); 1413 1414 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1415 if (bdev->fn_table->write_config_json) { 1416 bdev->fn_table->write_config_json(bdev, w); 1417 } 1418 1419 bdev_qos_config_json(bdev, w); 1420 } 1421 1422 spdk_spin_unlock(&g_bdev_mgr.spinlock); 1423 1424 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1425 spdk_json_write_object_begin(w); 1426 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1427 spdk_json_write_object_end(w); 1428 1429 spdk_json_write_array_end(w); 1430 } 1431 1432 static void 1433 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1434 { 1435 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1436 struct spdk_bdev_io *bdev_io; 1437 1438 spdk_iobuf_channel_fini(&ch->iobuf); 1439 1440 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1441 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1442 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1443 ch->per_thread_cache_count--; 1444 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1445 } 1446 1447 assert(ch->per_thread_cache_count == 0); 1448 } 1449 1450 static int 1451 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1452 { 1453 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1454 struct spdk_bdev_io *bdev_io; 1455 uint32_t i; 1456 int rc; 1457 1458 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", BUF_SMALL_CACHE_SIZE, BUF_LARGE_CACHE_SIZE); 1459 if (rc != 0) { 1460 SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 1461 return -1; 1462 } 1463 1464 STAILQ_INIT(&ch->per_thread_cache); 1465 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 1466 1467 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 1468 ch->per_thread_cache_count = 0; 1469 for (i = 0; i < ch->bdev_io_cache_size; i++) { 1470 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1471 if (bdev_io == NULL) { 1472 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 1473 assert(false); 1474 bdev_mgmt_channel_destroy(io_device, ctx_buf); 1475 return -1; 1476 } 1477 ch->per_thread_cache_count++; 1478 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1479 } 1480 1481 TAILQ_INIT(&ch->shared_resources); 1482 TAILQ_INIT(&ch->io_wait_queue); 1483 1484 return 0; 1485 } 1486 1487 static void 1488 bdev_init_complete(int rc) 1489 { 1490 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 1491 void *cb_arg = g_init_cb_arg; 1492 struct spdk_bdev_module *m; 1493 1494 g_bdev_mgr.init_complete = true; 1495 g_init_cb_fn = NULL; 1496 g_init_cb_arg = NULL; 1497 1498 /* 1499 * For modules that need to know when subsystem init is complete, 1500 * inform them now. 1501 */ 1502 if (rc == 0) { 1503 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1504 if (m->init_complete) { 1505 m->init_complete(); 1506 } 1507 } 1508 } 1509 1510 cb_fn(cb_arg, rc); 1511 } 1512 1513 static bool 1514 bdev_module_all_actions_completed(void) 1515 { 1516 struct spdk_bdev_module *m; 1517 1518 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1519 if (m->internal.action_in_progress > 0) { 1520 return false; 1521 } 1522 } 1523 return true; 1524 } 1525 1526 static void 1527 bdev_module_action_complete(void) 1528 { 1529 /* 1530 * Don't finish bdev subsystem initialization if 1531 * module pre-initialization is still in progress, or 1532 * the subsystem been already initialized. 1533 */ 1534 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 1535 return; 1536 } 1537 1538 /* 1539 * Check all bdev modules for inits/examinations in progress. If any 1540 * exist, return immediately since we cannot finish bdev subsystem 1541 * initialization until all are completed. 1542 */ 1543 if (!bdev_module_all_actions_completed()) { 1544 return; 1545 } 1546 1547 /* 1548 * Modules already finished initialization - now that all 1549 * the bdev modules have finished their asynchronous I/O 1550 * processing, the entire bdev layer can be marked as complete. 1551 */ 1552 bdev_init_complete(0); 1553 } 1554 1555 static void 1556 bdev_module_action_done(struct spdk_bdev_module *module) 1557 { 1558 assert(module->internal.action_in_progress > 0); 1559 module->internal.action_in_progress--; 1560 bdev_module_action_complete(); 1561 } 1562 1563 void 1564 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 1565 { 1566 bdev_module_action_done(module); 1567 } 1568 1569 void 1570 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 1571 { 1572 bdev_module_action_done(module); 1573 } 1574 1575 /** The last initialized bdev module */ 1576 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 1577 1578 static void 1579 bdev_init_failed(void *cb_arg) 1580 { 1581 struct spdk_bdev_module *module = cb_arg; 1582 1583 module->internal.action_in_progress--; 1584 bdev_init_complete(-1); 1585 } 1586 1587 static int 1588 bdev_modules_init(void) 1589 { 1590 struct spdk_bdev_module *module; 1591 int rc = 0; 1592 1593 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1594 g_resume_bdev_module = module; 1595 if (module->async_init) { 1596 module->internal.action_in_progress = 1; 1597 } 1598 rc = module->module_init(); 1599 if (rc != 0) { 1600 /* Bump action_in_progress to prevent other modules from completion of modules_init 1601 * Send message to defer application shutdown until resources are cleaned up */ 1602 module->internal.action_in_progress = 1; 1603 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 1604 return rc; 1605 } 1606 } 1607 1608 g_resume_bdev_module = NULL; 1609 return 0; 1610 } 1611 1612 void 1613 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 1614 { 1615 int rc = 0; 1616 char mempool_name[32]; 1617 1618 assert(cb_fn != NULL); 1619 1620 g_init_cb_fn = cb_fn; 1621 g_init_cb_arg = cb_arg; 1622 1623 spdk_notify_type_register("bdev_register"); 1624 spdk_notify_type_register("bdev_unregister"); 1625 1626 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 1627 1628 rc = spdk_iobuf_register_module("bdev"); 1629 if (rc != 0) { 1630 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 1631 bdev_init_complete(-1); 1632 return; 1633 } 1634 1635 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 1636 g_bdev_opts.bdev_io_pool_size, 1637 sizeof(struct spdk_bdev_io) + 1638 bdev_module_get_max_ctx_size(), 1639 0, 1640 SPDK_ENV_SOCKET_ID_ANY); 1641 1642 if (g_bdev_mgr.bdev_io_pool == NULL) { 1643 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 1644 bdev_init_complete(-1); 1645 return; 1646 } 1647 1648 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 1649 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 1650 if (!g_bdev_mgr.zero_buffer) { 1651 SPDK_ERRLOG("create bdev zero buffer failed\n"); 1652 bdev_init_complete(-1); 1653 return; 1654 } 1655 1656 #ifdef SPDK_CONFIG_VTUNE 1657 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 1658 #endif 1659 1660 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 1661 bdev_mgmt_channel_destroy, 1662 sizeof(struct spdk_bdev_mgmt_channel), 1663 "bdev_mgr"); 1664 1665 rc = bdev_modules_init(); 1666 g_bdev_mgr.module_init_complete = true; 1667 if (rc != 0) { 1668 SPDK_ERRLOG("bdev modules init failed\n"); 1669 return; 1670 } 1671 1672 bdev_module_action_complete(); 1673 } 1674 1675 static void 1676 bdev_mgr_unregister_cb(void *io_device) 1677 { 1678 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 1679 1680 if (g_bdev_mgr.bdev_io_pool) { 1681 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 1682 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 1683 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 1684 g_bdev_opts.bdev_io_pool_size); 1685 } 1686 1687 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 1688 } 1689 1690 spdk_free(g_bdev_mgr.zero_buffer); 1691 1692 bdev_examine_allowlist_free(); 1693 1694 cb_fn(g_fini_cb_arg); 1695 g_fini_cb_fn = NULL; 1696 g_fini_cb_arg = NULL; 1697 g_bdev_mgr.init_complete = false; 1698 g_bdev_mgr.module_init_complete = false; 1699 } 1700 1701 static void 1702 bdev_module_fini_iter(void *arg) 1703 { 1704 struct spdk_bdev_module *bdev_module; 1705 1706 /* FIXME: Handling initialization failures is broken now, 1707 * so we won't even try cleaning up after successfully 1708 * initialized modules. if module_init_complete is false, 1709 * just call spdk_bdev_mgr_unregister_cb 1710 */ 1711 if (!g_bdev_mgr.module_init_complete) { 1712 bdev_mgr_unregister_cb(NULL); 1713 return; 1714 } 1715 1716 /* Start iterating from the last touched module */ 1717 if (!g_resume_bdev_module) { 1718 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1719 } else { 1720 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 1721 internal.tailq); 1722 } 1723 1724 while (bdev_module) { 1725 if (bdev_module->async_fini) { 1726 /* Save our place so we can resume later. We must 1727 * save the variable here, before calling module_fini() 1728 * below, because in some cases the module may immediately 1729 * call spdk_bdev_module_fini_done() and re-enter 1730 * this function to continue iterating. */ 1731 g_resume_bdev_module = bdev_module; 1732 } 1733 1734 if (bdev_module->module_fini) { 1735 bdev_module->module_fini(); 1736 } 1737 1738 if (bdev_module->async_fini) { 1739 return; 1740 } 1741 1742 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 1743 internal.tailq); 1744 } 1745 1746 g_resume_bdev_module = NULL; 1747 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 1748 } 1749 1750 void 1751 spdk_bdev_module_fini_done(void) 1752 { 1753 if (spdk_get_thread() != g_fini_thread) { 1754 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 1755 } else { 1756 bdev_module_fini_iter(NULL); 1757 } 1758 } 1759 1760 static void 1761 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 1762 { 1763 struct spdk_bdev *bdev = cb_arg; 1764 1765 if (bdeverrno && bdev) { 1766 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 1767 bdev->name); 1768 1769 /* 1770 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 1771 * bdev; try to continue by manually removing this bdev from the list and continue 1772 * with the next bdev in the list. 1773 */ 1774 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 1775 } 1776 1777 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 1778 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 1779 /* 1780 * Bdev module finish need to be deferred as we might be in the middle of some context 1781 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 1782 * after returning. 1783 */ 1784 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 1785 return; 1786 } 1787 1788 /* 1789 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 1790 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 1791 * to detect clean shutdown as opposed to run-time hot removal of the underlying 1792 * base bdevs. 1793 * 1794 * Also, walk the list in the reverse order. 1795 */ 1796 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1797 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1798 if (bdev->internal.claim_module != NULL) { 1799 SPDK_DEBUGLOG(bdev, "Skipping claimed bdev '%s'(<-'%s').\n", 1800 bdev->name, bdev->internal.claim_module->name); 1801 continue; 1802 } 1803 1804 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 1805 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 1806 return; 1807 } 1808 1809 /* 1810 * If any bdev fails to unclaim underlying bdev properly, we may face the 1811 * case of bdev list consisting of claimed bdevs only (if claims are managed 1812 * correctly, this would mean there's a loop in the claims graph which is 1813 * clearly impossible). Warn and unregister last bdev on the list then. 1814 */ 1815 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1816 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1817 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 1818 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 1819 return; 1820 } 1821 } 1822 1823 static void 1824 bdev_module_fini_start_iter(void *arg) 1825 { 1826 struct spdk_bdev_module *bdev_module; 1827 1828 if (!g_resume_bdev_module) { 1829 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1830 } else { 1831 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 1832 } 1833 1834 while (bdev_module) { 1835 if (bdev_module->async_fini_start) { 1836 /* Save our place so we can resume later. We must 1837 * save the variable here, before calling fini_start() 1838 * below, because in some cases the module may immediately 1839 * call spdk_bdev_module_fini_start_done() and re-enter 1840 * this function to continue iterating. */ 1841 g_resume_bdev_module = bdev_module; 1842 } 1843 1844 if (bdev_module->fini_start) { 1845 bdev_module->fini_start(); 1846 } 1847 1848 if (bdev_module->async_fini_start) { 1849 return; 1850 } 1851 1852 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 1853 } 1854 1855 g_resume_bdev_module = NULL; 1856 1857 bdev_finish_unregister_bdevs_iter(NULL, 0); 1858 } 1859 1860 void 1861 spdk_bdev_module_fini_start_done(void) 1862 { 1863 if (spdk_get_thread() != g_fini_thread) { 1864 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 1865 } else { 1866 bdev_module_fini_start_iter(NULL); 1867 } 1868 } 1869 1870 static void 1871 bdev_finish_wait_for_examine_done(void *cb_arg) 1872 { 1873 bdev_module_fini_start_iter(NULL); 1874 } 1875 1876 void 1877 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 1878 { 1879 int rc; 1880 1881 assert(cb_fn != NULL); 1882 1883 g_fini_thread = spdk_get_thread(); 1884 1885 g_fini_cb_fn = cb_fn; 1886 g_fini_cb_arg = cb_arg; 1887 1888 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 1889 if (rc != 0) { 1890 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 1891 bdev_finish_wait_for_examine_done(NULL); 1892 } 1893 } 1894 1895 struct spdk_bdev_io * 1896 bdev_channel_get_io(struct spdk_bdev_channel *channel) 1897 { 1898 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 1899 struct spdk_bdev_io *bdev_io; 1900 1901 if (ch->per_thread_cache_count > 0) { 1902 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1903 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1904 ch->per_thread_cache_count--; 1905 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 1906 /* 1907 * Don't try to look for bdev_ios in the global pool if there are 1908 * waiters on bdev_ios - we don't want this caller to jump the line. 1909 */ 1910 bdev_io = NULL; 1911 } else { 1912 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1913 } 1914 1915 return bdev_io; 1916 } 1917 1918 void 1919 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 1920 { 1921 struct spdk_bdev_mgmt_channel *ch; 1922 1923 assert(bdev_io != NULL); 1924 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 1925 1926 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1927 1928 if (bdev_io->internal.buf != NULL) { 1929 bdev_io_put_buf(bdev_io); 1930 } 1931 1932 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 1933 ch->per_thread_cache_count++; 1934 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1935 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 1936 struct spdk_bdev_io_wait_entry *entry; 1937 1938 entry = TAILQ_FIRST(&ch->io_wait_queue); 1939 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 1940 entry->cb_fn(entry->cb_arg); 1941 } 1942 } else { 1943 /* We should never have a full cache with entries on the io wait queue. */ 1944 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 1945 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1946 } 1947 } 1948 1949 static bool 1950 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 1951 { 1952 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 1953 1954 switch (limit) { 1955 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 1956 return true; 1957 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 1958 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 1959 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 1960 return false; 1961 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 1962 default: 1963 return false; 1964 } 1965 } 1966 1967 static bool 1968 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 1969 { 1970 switch (bdev_io->type) { 1971 case SPDK_BDEV_IO_TYPE_NVME_IO: 1972 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1973 case SPDK_BDEV_IO_TYPE_READ: 1974 case SPDK_BDEV_IO_TYPE_WRITE: 1975 return true; 1976 case SPDK_BDEV_IO_TYPE_ZCOPY: 1977 if (bdev_io->u.bdev.zcopy.start) { 1978 return true; 1979 } else { 1980 return false; 1981 } 1982 default: 1983 return false; 1984 } 1985 } 1986 1987 static bool 1988 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 1989 { 1990 switch (bdev_io->type) { 1991 case SPDK_BDEV_IO_TYPE_NVME_IO: 1992 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1993 /* Bit 1 (0x2) set for read operation */ 1994 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 1995 return true; 1996 } else { 1997 return false; 1998 } 1999 case SPDK_BDEV_IO_TYPE_READ: 2000 return true; 2001 case SPDK_BDEV_IO_TYPE_ZCOPY: 2002 /* Populate to read from disk */ 2003 if (bdev_io->u.bdev.zcopy.populate) { 2004 return true; 2005 } else { 2006 return false; 2007 } 2008 default: 2009 return false; 2010 } 2011 } 2012 2013 static uint64_t 2014 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2015 { 2016 struct spdk_bdev *bdev = bdev_io->bdev; 2017 2018 switch (bdev_io->type) { 2019 case SPDK_BDEV_IO_TYPE_NVME_IO: 2020 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2021 return bdev_io->u.nvme_passthru.nbytes; 2022 case SPDK_BDEV_IO_TYPE_READ: 2023 case SPDK_BDEV_IO_TYPE_WRITE: 2024 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2025 case SPDK_BDEV_IO_TYPE_ZCOPY: 2026 /* Track the data in the start phase only */ 2027 if (bdev_io->u.bdev.zcopy.start) { 2028 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2029 } else { 2030 return 0; 2031 } 2032 default: 2033 return 0; 2034 } 2035 } 2036 2037 static bool 2038 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2039 { 2040 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 2041 return true; 2042 } else { 2043 return false; 2044 } 2045 } 2046 2047 static bool 2048 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2049 { 2050 if (bdev_is_read_io(io) == false) { 2051 return false; 2052 } 2053 2054 return bdev_qos_rw_queue_io(limit, io); 2055 } 2056 2057 static bool 2058 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2059 { 2060 if (bdev_is_read_io(io) == true) { 2061 return false; 2062 } 2063 2064 return bdev_qos_rw_queue_io(limit, io); 2065 } 2066 2067 static void 2068 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2069 { 2070 limit->remaining_this_timeslice--; 2071 } 2072 2073 static void 2074 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2075 { 2076 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 2077 } 2078 2079 static void 2080 bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2081 { 2082 if (bdev_is_read_io(io) == false) { 2083 return; 2084 } 2085 2086 return bdev_qos_rw_bps_update_quota(limit, io); 2087 } 2088 2089 static void 2090 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2091 { 2092 if (bdev_is_read_io(io) == true) { 2093 return; 2094 } 2095 2096 return bdev_qos_rw_bps_update_quota(limit, io); 2097 } 2098 2099 static void 2100 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2101 { 2102 int i; 2103 2104 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2105 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2106 qos->rate_limits[i].queue_io = NULL; 2107 qos->rate_limits[i].update_quota = NULL; 2108 continue; 2109 } 2110 2111 switch (i) { 2112 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2113 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2114 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2115 break; 2116 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2117 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2118 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2119 break; 2120 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2121 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2122 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2123 break; 2124 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2125 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2126 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2127 break; 2128 default: 2129 break; 2130 } 2131 } 2132 } 2133 2134 static void 2135 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2136 struct spdk_bdev_io *bdev_io, 2137 enum spdk_bdev_io_status status) 2138 { 2139 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2140 2141 bdev_io->internal.in_submit_request = true; 2142 bdev_ch->io_outstanding++; 2143 shared_resource->io_outstanding++; 2144 spdk_bdev_io_complete(bdev_io, status); 2145 bdev_io->internal.in_submit_request = false; 2146 } 2147 2148 static inline void 2149 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2150 { 2151 struct spdk_bdev *bdev = bdev_io->bdev; 2152 struct spdk_io_channel *ch = bdev_ch->channel; 2153 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2154 2155 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2156 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2157 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2158 2159 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2160 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2161 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2162 SPDK_BDEV_IO_STATUS_SUCCESS); 2163 return; 2164 } 2165 } 2166 2167 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2168 bdev_io->bdev->split_on_write_unit && 2169 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2170 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2171 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2172 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2173 return; 2174 } 2175 2176 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2177 bdev_ch->io_outstanding++; 2178 shared_resource->io_outstanding++; 2179 bdev_io->internal.in_submit_request = true; 2180 bdev->fn_table->submit_request(ch, bdev_io); 2181 bdev_io->internal.in_submit_request = false; 2182 } else { 2183 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 2184 } 2185 } 2186 2187 static bool 2188 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2189 { 2190 int i; 2191 2192 if (bdev_qos_io_to_limit(bdev_io) == true) { 2193 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2194 if (!qos->rate_limits[i].queue_io) { 2195 continue; 2196 } 2197 2198 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2199 bdev_io) == true) { 2200 return true; 2201 } 2202 } 2203 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2204 if (!qos->rate_limits[i].update_quota) { 2205 continue; 2206 } 2207 2208 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2209 } 2210 } 2211 2212 return false; 2213 } 2214 2215 static inline void 2216 _bdev_io_do_submit(void *ctx) 2217 { 2218 struct spdk_bdev_io *bdev_io = ctx; 2219 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2220 2221 bdev_io_do_submit(ch, bdev_io); 2222 } 2223 2224 static int 2225 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2226 { 2227 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2228 int submitted_ios = 0; 2229 2230 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2231 if (!bdev_qos_queue_io(qos, bdev_io)) { 2232 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2233 2234 if (bdev_io->internal.io_submit_ch) { 2235 /* Send back the IO to the original thread for the actual processing. */ 2236 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 2237 bdev_io->internal.io_submit_ch = NULL; 2238 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 2239 _bdev_io_do_submit, bdev_io); 2240 } else { 2241 bdev_io_do_submit(ch, bdev_io); 2242 } 2243 2244 submitted_ios++; 2245 } 2246 } 2247 2248 return submitted_ios; 2249 } 2250 2251 static void 2252 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2253 { 2254 int rc; 2255 2256 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2257 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2258 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2259 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2260 &bdev_io->internal.waitq_entry); 2261 if (rc != 0) { 2262 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2263 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2264 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2265 } 2266 } 2267 2268 static bool 2269 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2270 { 2271 uint32_t io_boundary; 2272 struct spdk_bdev *bdev = bdev_io->bdev; 2273 uint32_t max_size = bdev->max_segment_size; 2274 int max_segs = bdev->max_num_segments; 2275 2276 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2277 io_boundary = bdev->write_unit_size; 2278 } else if (bdev->split_on_optimal_io_boundary) { 2279 io_boundary = bdev->optimal_io_boundary; 2280 } else { 2281 io_boundary = 0; 2282 } 2283 2284 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2285 return false; 2286 } 2287 2288 if (io_boundary) { 2289 uint64_t start_stripe, end_stripe; 2290 2291 start_stripe = bdev_io->u.bdev.offset_blocks; 2292 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2293 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 2294 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2295 start_stripe >>= spdk_u32log2(io_boundary); 2296 end_stripe >>= spdk_u32log2(io_boundary); 2297 } else { 2298 start_stripe /= io_boundary; 2299 end_stripe /= io_boundary; 2300 } 2301 2302 if (start_stripe != end_stripe) { 2303 return true; 2304 } 2305 } 2306 2307 if (max_segs) { 2308 if (bdev_io->u.bdev.iovcnt > max_segs) { 2309 return true; 2310 } 2311 } 2312 2313 if (max_size) { 2314 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2315 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2316 return true; 2317 } 2318 } 2319 } 2320 2321 return false; 2322 } 2323 2324 static bool 2325 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2326 { 2327 uint32_t num_unmap_segments; 2328 2329 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2330 return false; 2331 } 2332 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2333 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2334 return true; 2335 } 2336 2337 return false; 2338 } 2339 2340 static bool 2341 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2342 { 2343 if (!bdev_io->bdev->max_write_zeroes) { 2344 return false; 2345 } 2346 2347 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2348 return true; 2349 } 2350 2351 return false; 2352 } 2353 2354 static bool 2355 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2356 { 2357 if (bdev_io->bdev->max_copy != 0 && 2358 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2359 return true; 2360 } 2361 2362 return false; 2363 } 2364 2365 static bool 2366 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2367 { 2368 switch (bdev_io->type) { 2369 case SPDK_BDEV_IO_TYPE_READ: 2370 case SPDK_BDEV_IO_TYPE_WRITE: 2371 return bdev_rw_should_split(bdev_io); 2372 case SPDK_BDEV_IO_TYPE_UNMAP: 2373 return bdev_unmap_should_split(bdev_io); 2374 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2375 return bdev_write_zeroes_should_split(bdev_io); 2376 case SPDK_BDEV_IO_TYPE_COPY: 2377 return bdev_copy_should_split(bdev_io); 2378 default: 2379 return false; 2380 } 2381 } 2382 2383 static uint32_t 2384 _to_next_boundary(uint64_t offset, uint32_t boundary) 2385 { 2386 return (boundary - (offset % boundary)); 2387 } 2388 2389 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2390 2391 static void _bdev_rw_split(void *_bdev_io); 2392 2393 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2394 2395 static void 2396 _bdev_unmap_split(void *_bdev_io) 2397 { 2398 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2399 } 2400 2401 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2402 2403 static void 2404 _bdev_write_zeroes_split(void *_bdev_io) 2405 { 2406 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2407 } 2408 2409 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 2410 2411 static void 2412 _bdev_copy_split(void *_bdev_io) 2413 { 2414 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 2415 } 2416 2417 static int 2418 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2419 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2420 { 2421 int rc; 2422 uint64_t current_offset, current_remaining, current_src_offset; 2423 spdk_bdev_io_wait_cb io_wait_fn; 2424 2425 current_offset = *offset; 2426 current_remaining = *remaining; 2427 2428 bdev_io->u.bdev.split_outstanding++; 2429 2430 io_wait_fn = _bdev_rw_split; 2431 switch (bdev_io->type) { 2432 case SPDK_BDEV_IO_TYPE_READ: 2433 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2434 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2435 iov, iovcnt, md_buf, current_offset, 2436 num_blocks, 2437 bdev_io_split_done, bdev_io, 2438 bdev_io->internal.ext_opts, true); 2439 break; 2440 case SPDK_BDEV_IO_TYPE_WRITE: 2441 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2442 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2443 iov, iovcnt, md_buf, current_offset, 2444 num_blocks, 2445 bdev_io_split_done, bdev_io, 2446 bdev_io->internal.ext_opts, true); 2447 break; 2448 case SPDK_BDEV_IO_TYPE_UNMAP: 2449 io_wait_fn = _bdev_unmap_split; 2450 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2451 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2452 current_offset, num_blocks, 2453 bdev_io_split_done, bdev_io); 2454 break; 2455 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2456 io_wait_fn = _bdev_write_zeroes_split; 2457 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2458 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2459 current_offset, num_blocks, 2460 bdev_io_split_done, bdev_io); 2461 break; 2462 case SPDK_BDEV_IO_TYPE_COPY: 2463 io_wait_fn = _bdev_copy_split; 2464 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 2465 (current_offset - bdev_io->u.bdev.offset_blocks); 2466 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 2467 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2468 current_offset, current_src_offset, num_blocks, 2469 bdev_io_split_done, bdev_io); 2470 break; 2471 default: 2472 assert(false); 2473 rc = -EINVAL; 2474 break; 2475 } 2476 2477 if (rc == 0) { 2478 current_offset += num_blocks; 2479 current_remaining -= num_blocks; 2480 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2481 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2482 *offset = current_offset; 2483 *remaining = current_remaining; 2484 } else { 2485 bdev_io->u.bdev.split_outstanding--; 2486 if (rc == -ENOMEM) { 2487 if (bdev_io->u.bdev.split_outstanding == 0) { 2488 /* No I/O is outstanding. Hence we should wait here. */ 2489 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2490 } 2491 } else { 2492 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2493 if (bdev_io->u.bdev.split_outstanding == 0) { 2494 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2495 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2496 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2497 } 2498 } 2499 } 2500 2501 return rc; 2502 } 2503 2504 static void 2505 _bdev_rw_split(void *_bdev_io) 2506 { 2507 struct iovec *parent_iov, *iov; 2508 struct spdk_bdev_io *bdev_io = _bdev_io; 2509 struct spdk_bdev *bdev = bdev_io->bdev; 2510 uint64_t parent_offset, current_offset, remaining; 2511 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2512 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2513 uint32_t iovcnt, iov_len, child_iovsize; 2514 uint32_t blocklen = bdev->blocklen; 2515 uint32_t io_boundary; 2516 uint32_t max_segment_size = bdev->max_segment_size; 2517 uint32_t max_child_iovcnt = bdev->max_num_segments; 2518 void *md_buf = NULL; 2519 int rc; 2520 2521 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 2522 max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 2523 SPDK_BDEV_IO_NUM_CHILD_IOV; 2524 2525 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2526 io_boundary = bdev->write_unit_size; 2527 } else if (bdev->split_on_optimal_io_boundary) { 2528 io_boundary = bdev->optimal_io_boundary; 2529 } else { 2530 io_boundary = UINT32_MAX; 2531 } 2532 2533 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2534 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 2535 parent_offset = bdev_io->u.bdev.offset_blocks; 2536 parent_iov_offset = (current_offset - parent_offset) * blocklen; 2537 parent_iovcnt = bdev_io->u.bdev.iovcnt; 2538 2539 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 2540 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2541 if (parent_iov_offset < parent_iov->iov_len) { 2542 break; 2543 } 2544 parent_iov_offset -= parent_iov->iov_len; 2545 } 2546 2547 child_iovcnt = 0; 2548 while (remaining > 0 && parent_iovpos < parent_iovcnt && 2549 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 2550 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 2551 to_next_boundary = spdk_min(remaining, to_next_boundary); 2552 to_next_boundary_bytes = to_next_boundary * blocklen; 2553 2554 iov = &bdev_io->child_iov[child_iovcnt]; 2555 iovcnt = 0; 2556 2557 if (bdev_io->u.bdev.md_buf) { 2558 md_buf = (char *)bdev_io->u.bdev.md_buf + 2559 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 2560 } 2561 2562 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 2563 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 2564 iovcnt < child_iovsize) { 2565 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2566 iov_len = parent_iov->iov_len - parent_iov_offset; 2567 2568 iov_len = spdk_min(iov_len, max_segment_size); 2569 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 2570 to_next_boundary_bytes -= iov_len; 2571 2572 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 2573 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 2574 2575 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 2576 parent_iov_offset += iov_len; 2577 } else { 2578 parent_iovpos++; 2579 parent_iov_offset = 0; 2580 } 2581 child_iovcnt++; 2582 iovcnt++; 2583 } 2584 2585 if (to_next_boundary_bytes > 0) { 2586 /* We had to stop this child I/O early because we ran out of 2587 * child_iov space or were limited by max_num_segments. 2588 * Ensure the iovs to be aligned with block size and 2589 * then adjust to_next_boundary before starting the 2590 * child I/O. 2591 */ 2592 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || 2593 iovcnt == child_iovsize); 2594 to_last_block_bytes = to_next_boundary_bytes % blocklen; 2595 if (to_last_block_bytes != 0) { 2596 uint32_t child_iovpos = child_iovcnt - 1; 2597 /* don't decrease child_iovcnt when it equals to SPDK_BDEV_IO_NUM_CHILD_IOV 2598 * so the loop will naturally end 2599 */ 2600 2601 to_last_block_bytes = blocklen - to_last_block_bytes; 2602 to_next_boundary_bytes += to_last_block_bytes; 2603 while (to_last_block_bytes > 0 && iovcnt > 0) { 2604 iov_len = spdk_min(to_last_block_bytes, 2605 bdev_io->child_iov[child_iovpos].iov_len); 2606 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 2607 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 2608 child_iovpos--; 2609 if (--iovcnt == 0) { 2610 /* If the child IO is less than a block size just return. 2611 * If the first child IO of any split round is less than 2612 * a block size, an error exit. 2613 */ 2614 if (bdev_io->u.bdev.split_outstanding == 0) { 2615 SPDK_ERRLOG("The first child io was less than a block size\n"); 2616 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2617 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2618 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2619 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2620 } 2621 2622 return; 2623 } 2624 } 2625 2626 to_last_block_bytes -= iov_len; 2627 2628 if (parent_iov_offset == 0) { 2629 parent_iovpos--; 2630 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 2631 } 2632 parent_iov_offset -= iov_len; 2633 } 2634 2635 assert(to_last_block_bytes == 0); 2636 } 2637 to_next_boundary -= to_next_boundary_bytes / blocklen; 2638 } 2639 2640 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 2641 ¤t_offset, &remaining); 2642 if (spdk_unlikely(rc)) { 2643 return; 2644 } 2645 } 2646 } 2647 2648 static void 2649 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 2650 { 2651 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 2652 uint32_t num_children_reqs = 0; 2653 int rc; 2654 2655 offset = bdev_io->u.bdev.split_current_offset_blocks; 2656 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2657 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 2658 2659 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2660 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 2661 2662 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 2663 &offset, &remaining); 2664 if (spdk_likely(rc == 0)) { 2665 num_children_reqs++; 2666 } else { 2667 return; 2668 } 2669 } 2670 } 2671 2672 static void 2673 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 2674 { 2675 uint64_t offset, write_zeroes_blocks, remaining; 2676 uint32_t num_children_reqs = 0; 2677 int rc; 2678 2679 offset = bdev_io->u.bdev.split_current_offset_blocks; 2680 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2681 2682 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2683 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 2684 2685 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 2686 &offset, &remaining); 2687 if (spdk_likely(rc == 0)) { 2688 num_children_reqs++; 2689 } else { 2690 return; 2691 } 2692 } 2693 } 2694 2695 static void 2696 bdev_copy_split(struct spdk_bdev_io *bdev_io) 2697 { 2698 uint64_t offset, copy_blocks, remaining; 2699 uint32_t num_children_reqs = 0; 2700 int rc; 2701 2702 offset = bdev_io->u.bdev.split_current_offset_blocks; 2703 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2704 2705 assert(bdev_io->bdev->max_copy != 0); 2706 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 2707 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 2708 2709 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 2710 &offset, &remaining); 2711 if (spdk_likely(rc == 0)) { 2712 num_children_reqs++; 2713 } else { 2714 return; 2715 } 2716 } 2717 } 2718 2719 static void 2720 parent_bdev_io_complete(void *ctx, int rc) 2721 { 2722 struct spdk_bdev_io *parent_io = ctx; 2723 2724 if (rc) { 2725 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2726 } 2727 2728 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 2729 parent_io->internal.caller_ctx); 2730 } 2731 2732 static void 2733 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 2734 { 2735 struct spdk_bdev_io *parent_io = cb_arg; 2736 2737 spdk_bdev_free_io(bdev_io); 2738 2739 if (!success) { 2740 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2741 /* If any child I/O failed, stop further splitting process. */ 2742 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 2743 parent_io->u.bdev.split_remaining_num_blocks = 0; 2744 } 2745 parent_io->u.bdev.split_outstanding--; 2746 if (parent_io->u.bdev.split_outstanding != 0) { 2747 return; 2748 } 2749 2750 /* 2751 * Parent I/O finishes when all blocks are consumed. 2752 */ 2753 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 2754 assert(parent_io->internal.cb != bdev_io_split_done); 2755 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 2756 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 2757 2758 if (parent_io->internal.orig_iovcnt != 0) { 2759 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 2760 /* bdev IO will be completed in the callback */ 2761 } else { 2762 parent_bdev_io_complete(parent_io, 0); 2763 } 2764 return; 2765 } 2766 2767 /* 2768 * Continue with the splitting process. This function will complete the parent I/O if the 2769 * splitting is done. 2770 */ 2771 switch (parent_io->type) { 2772 case SPDK_BDEV_IO_TYPE_READ: 2773 case SPDK_BDEV_IO_TYPE_WRITE: 2774 _bdev_rw_split(parent_io); 2775 break; 2776 case SPDK_BDEV_IO_TYPE_UNMAP: 2777 bdev_unmap_split(parent_io); 2778 break; 2779 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2780 bdev_write_zeroes_split(parent_io); 2781 break; 2782 case SPDK_BDEV_IO_TYPE_COPY: 2783 bdev_copy_split(parent_io); 2784 break; 2785 default: 2786 assert(false); 2787 break; 2788 } 2789 } 2790 2791 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2792 bool success); 2793 2794 static void 2795 bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 2796 { 2797 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 2798 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 2799 bdev_io->u.bdev.split_outstanding = 0; 2800 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 2801 2802 switch (bdev_io->type) { 2803 case SPDK_BDEV_IO_TYPE_READ: 2804 case SPDK_BDEV_IO_TYPE_WRITE: 2805 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 2806 _bdev_rw_split(bdev_io); 2807 } else { 2808 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 2809 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 2810 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 2811 } 2812 break; 2813 case SPDK_BDEV_IO_TYPE_UNMAP: 2814 bdev_unmap_split(bdev_io); 2815 break; 2816 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2817 bdev_write_zeroes_split(bdev_io); 2818 break; 2819 case SPDK_BDEV_IO_TYPE_COPY: 2820 bdev_copy_split(bdev_io); 2821 break; 2822 default: 2823 assert(false); 2824 break; 2825 } 2826 } 2827 2828 static void 2829 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 2830 { 2831 if (!success) { 2832 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2833 return; 2834 } 2835 2836 _bdev_rw_split(bdev_io); 2837 } 2838 2839 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 2840 * be inlined, at least on some compilers. 2841 */ 2842 static inline void 2843 _bdev_io_submit(void *ctx) 2844 { 2845 struct spdk_bdev_io *bdev_io = ctx; 2846 struct spdk_bdev *bdev = bdev_io->bdev; 2847 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2848 2849 if (spdk_likely(bdev_ch->flags == 0)) { 2850 bdev_io_do_submit(bdev_ch, bdev_io); 2851 return; 2852 } 2853 2854 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 2855 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 2856 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 2857 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 2858 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 2859 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2860 } else { 2861 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 2862 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 2863 } 2864 } else { 2865 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 2866 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2867 } 2868 } 2869 2870 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 2871 2872 bool 2873 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 2874 { 2875 if (range1->length == 0 || range2->length == 0) { 2876 return false; 2877 } 2878 2879 if (range1->offset + range1->length <= range2->offset) { 2880 return false; 2881 } 2882 2883 if (range2->offset + range2->length <= range1->offset) { 2884 return false; 2885 } 2886 2887 return true; 2888 } 2889 2890 static bool 2891 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 2892 { 2893 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2894 struct lba_range r; 2895 2896 switch (bdev_io->type) { 2897 case SPDK_BDEV_IO_TYPE_NVME_IO: 2898 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2899 /* Don't try to decode the NVMe command - just assume worst-case and that 2900 * it overlaps a locked range. 2901 */ 2902 return true; 2903 case SPDK_BDEV_IO_TYPE_WRITE: 2904 case SPDK_BDEV_IO_TYPE_UNMAP: 2905 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2906 case SPDK_BDEV_IO_TYPE_ZCOPY: 2907 case SPDK_BDEV_IO_TYPE_COPY: 2908 r.offset = bdev_io->u.bdev.offset_blocks; 2909 r.length = bdev_io->u.bdev.num_blocks; 2910 if (!bdev_lba_range_overlapped(range, &r)) { 2911 /* This I/O doesn't overlap the specified LBA range. */ 2912 return false; 2913 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 2914 /* This I/O overlaps, but the I/O is on the same channel that locked this 2915 * range, and the caller_ctx is the same as the locked_ctx. This means 2916 * that this I/O is associated with the lock, and is allowed to execute. 2917 */ 2918 return false; 2919 } else { 2920 return true; 2921 } 2922 default: 2923 return false; 2924 } 2925 } 2926 2927 void 2928 bdev_io_submit(struct spdk_bdev_io *bdev_io) 2929 { 2930 struct spdk_bdev *bdev = bdev_io->bdev; 2931 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 2932 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2933 2934 assert(thread != NULL); 2935 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2936 2937 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 2938 struct lba_range *range; 2939 2940 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 2941 if (bdev_io_range_is_locked(bdev_io, range)) { 2942 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 2943 return; 2944 } 2945 } 2946 } 2947 2948 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 2949 2950 bdev_io->internal.submit_tsc = spdk_get_ticks(); 2951 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 2952 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 2953 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 2954 spdk_bdev_get_name(bdev)); 2955 2956 if (bdev_io_should_split(bdev_io)) { 2957 bdev_io_split(NULL, bdev_io); 2958 return; 2959 } 2960 2961 if (ch->flags & BDEV_CH_QOS_ENABLED) { 2962 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 2963 _bdev_io_submit(bdev_io); 2964 } else { 2965 bdev_io->internal.io_submit_ch = ch; 2966 bdev_io->internal.ch = bdev->internal.qos->ch; 2967 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 2968 } 2969 } else { 2970 _bdev_io_submit(bdev_io); 2971 } 2972 } 2973 2974 static inline void 2975 _bdev_io_copy_ext_opts(struct spdk_bdev_io *bdev_io, struct spdk_bdev_ext_io_opts *opts) 2976 { 2977 struct spdk_bdev_ext_io_opts *opts_copy = &bdev_io->internal.ext_opts_copy; 2978 2979 /* Zero part we don't copy */ 2980 memset(((char *)opts_copy) + opts->size, 0, sizeof(*opts) - opts->size); 2981 memcpy(opts_copy, opts, opts->size); 2982 opts_copy->size = sizeof(*opts_copy); 2983 opts_copy->metadata = bdev_io->u.bdev.md_buf; 2984 /* Save pointer to the copied ext_opts which will be used by bdev modules */ 2985 bdev_io->u.bdev.ext_opts = opts_copy; 2986 } 2987 2988 static inline void 2989 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 2990 { 2991 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 2992 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 2993 * For write operation we need to pull buffers from memory domain before submitting IO. 2994 * Once read operation completes, we need to use memory_domain push functionality to 2995 * update data in original memory domain IO buffer 2996 * This IO request will go through a regular IO flow, so clear memory domains pointers in 2997 * the copied ext_opts */ 2998 bdev_io->internal.ext_opts_copy.memory_domain = NULL; 2999 bdev_io->internal.ext_opts_copy.memory_domain_ctx = NULL; 3000 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3001 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3002 } 3003 3004 static inline void 3005 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io, 3006 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 3007 { 3008 if (opts) { 3009 bool use_pull_push = opts->memory_domain && !desc->memory_domains_supported; 3010 assert(opts->size <= sizeof(*opts)); 3011 /* 3012 * copy if size is smaller than opts struct to avoid having to check size 3013 * on every access to bdev_io->u.bdev.ext_opts 3014 */ 3015 if (copy_opts || use_pull_push || opts->size < sizeof(*opts)) { 3016 _bdev_io_copy_ext_opts(bdev_io, opts); 3017 if (use_pull_push) { 3018 _bdev_io_ext_use_bounce_buffer(bdev_io); 3019 return; 3020 } 3021 } 3022 } 3023 bdev_io_submit(bdev_io); 3024 } 3025 3026 static void 3027 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3028 { 3029 struct spdk_bdev *bdev = bdev_io->bdev; 3030 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3031 struct spdk_io_channel *ch = bdev_ch->channel; 3032 3033 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3034 3035 bdev_io->internal.in_submit_request = true; 3036 bdev->fn_table->submit_request(ch, bdev_io); 3037 bdev_io->internal.in_submit_request = false; 3038 } 3039 3040 void 3041 bdev_io_init(struct spdk_bdev_io *bdev_io, 3042 struct spdk_bdev *bdev, void *cb_arg, 3043 spdk_bdev_io_completion_cb cb) 3044 { 3045 bdev_io->bdev = bdev; 3046 bdev_io->internal.caller_ctx = cb_arg; 3047 bdev_io->internal.cb = cb; 3048 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3049 bdev_io->internal.in_submit_request = false; 3050 bdev_io->internal.buf = NULL; 3051 bdev_io->internal.io_submit_ch = NULL; 3052 bdev_io->internal.orig_iovs = NULL; 3053 bdev_io->internal.orig_iovcnt = 0; 3054 bdev_io->internal.orig_md_iov.iov_base = NULL; 3055 bdev_io->internal.error.nvme.cdw0 = 0; 3056 bdev_io->num_retries = 0; 3057 bdev_io->internal.get_buf_cb = NULL; 3058 bdev_io->internal.get_aux_buf_cb = NULL; 3059 bdev_io->internal.ext_opts = NULL; 3060 bdev_io->internal.data_transfer_cpl = NULL; 3061 } 3062 3063 static bool 3064 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3065 { 3066 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3067 } 3068 3069 bool 3070 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3071 { 3072 bool supported; 3073 3074 supported = bdev_io_type_supported(bdev, io_type); 3075 3076 if (!supported) { 3077 switch (io_type) { 3078 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3079 /* The bdev layer will emulate write zeroes as long as write is supported. */ 3080 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3081 break; 3082 default: 3083 break; 3084 } 3085 } 3086 3087 return supported; 3088 } 3089 3090 uint64_t 3091 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3092 { 3093 return bdev_io->internal.submit_tsc; 3094 } 3095 3096 int 3097 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3098 { 3099 if (bdev->fn_table->dump_info_json) { 3100 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3101 } 3102 3103 return 0; 3104 } 3105 3106 static void 3107 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3108 { 3109 uint32_t max_per_timeslice = 0; 3110 int i; 3111 3112 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3113 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3114 qos->rate_limits[i].max_per_timeslice = 0; 3115 continue; 3116 } 3117 3118 max_per_timeslice = qos->rate_limits[i].limit * 3119 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3120 3121 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3122 qos->rate_limits[i].min_per_timeslice); 3123 3124 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 3125 } 3126 3127 bdev_qos_set_ops(qos); 3128 } 3129 3130 static int 3131 bdev_channel_poll_qos(void *arg) 3132 { 3133 struct spdk_bdev_qos *qos = arg; 3134 uint64_t now = spdk_get_ticks(); 3135 int i; 3136 3137 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3138 /* We received our callback earlier than expected - return 3139 * immediately and wait to do accounting until at least one 3140 * timeslice has actually expired. This should never happen 3141 * with a well-behaved timer implementation. 3142 */ 3143 return SPDK_POLLER_IDLE; 3144 } 3145 3146 /* Reset for next round of rate limiting */ 3147 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3148 /* We may have allowed the IOs or bytes to slightly overrun in the last 3149 * timeslice. remaining_this_timeslice is signed, so if it's negative 3150 * here, we'll account for the overrun so that the next timeslice will 3151 * be appropriately reduced. 3152 */ 3153 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 3154 qos->rate_limits[i].remaining_this_timeslice = 0; 3155 } 3156 } 3157 3158 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3159 qos->last_timeslice += qos->timeslice_size; 3160 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3161 qos->rate_limits[i].remaining_this_timeslice += 3162 qos->rate_limits[i].max_per_timeslice; 3163 } 3164 } 3165 3166 return bdev_qos_io_submit(qos->ch, qos); 3167 } 3168 3169 static void 3170 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3171 { 3172 struct spdk_bdev_shared_resource *shared_resource; 3173 struct lba_range *range; 3174 3175 bdev_free_io_stat(ch->stat); 3176 #ifdef SPDK_CONFIG_VTUNE 3177 bdev_free_io_stat(ch->prev_stat); 3178 #endif 3179 3180 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3181 range = TAILQ_FIRST(&ch->locked_ranges); 3182 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3183 free(range); 3184 } 3185 3186 spdk_put_io_channel(ch->channel); 3187 3188 shared_resource = ch->shared_resource; 3189 3190 assert(TAILQ_EMPTY(&ch->io_locked)); 3191 assert(TAILQ_EMPTY(&ch->io_submitted)); 3192 assert(ch->io_outstanding == 0); 3193 assert(shared_resource->ref > 0); 3194 shared_resource->ref--; 3195 if (shared_resource->ref == 0) { 3196 assert(shared_resource->io_outstanding == 0); 3197 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3198 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3199 free(shared_resource); 3200 } 3201 } 3202 3203 static void 3204 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3205 { 3206 struct spdk_bdev_qos *qos = bdev->internal.qos; 3207 int i; 3208 3209 assert(spdk_spin_held(&bdev->internal.spinlock)); 3210 3211 /* Rate limiting on this bdev enabled */ 3212 if (qos) { 3213 if (qos->ch == NULL) { 3214 struct spdk_io_channel *io_ch; 3215 3216 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3217 bdev->name, spdk_get_thread()); 3218 3219 /* No qos channel has been selected, so set one up */ 3220 3221 /* Take another reference to ch */ 3222 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3223 assert(io_ch != NULL); 3224 qos->ch = ch; 3225 3226 qos->thread = spdk_io_channel_get_thread(io_ch); 3227 3228 TAILQ_INIT(&qos->queued); 3229 3230 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3231 if (bdev_qos_is_iops_rate_limit(i) == true) { 3232 qos->rate_limits[i].min_per_timeslice = 3233 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3234 } else { 3235 qos->rate_limits[i].min_per_timeslice = 3236 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3237 } 3238 3239 if (qos->rate_limits[i].limit == 0) { 3240 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3241 } 3242 } 3243 bdev_qos_update_max_quota_per_timeslice(qos); 3244 qos->timeslice_size = 3245 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3246 qos->last_timeslice = spdk_get_ticks(); 3247 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3248 qos, 3249 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3250 } 3251 3252 ch->flags |= BDEV_CH_QOS_ENABLED; 3253 } 3254 } 3255 3256 struct poll_timeout_ctx { 3257 struct spdk_bdev_desc *desc; 3258 uint64_t timeout_in_sec; 3259 spdk_bdev_io_timeout_cb cb_fn; 3260 void *cb_arg; 3261 }; 3262 3263 static void 3264 bdev_desc_free(struct spdk_bdev_desc *desc) 3265 { 3266 spdk_spin_destroy(&desc->spinlock); 3267 free(desc->media_events_buffer); 3268 free(desc); 3269 } 3270 3271 static void 3272 bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 3273 { 3274 struct poll_timeout_ctx *ctx = _ctx; 3275 struct spdk_bdev_desc *desc = ctx->desc; 3276 3277 free(ctx); 3278 3279 spdk_spin_lock(&desc->spinlock); 3280 desc->refs--; 3281 if (desc->closed == true && desc->refs == 0) { 3282 spdk_spin_unlock(&desc->spinlock); 3283 bdev_desc_free(desc); 3284 return; 3285 } 3286 spdk_spin_unlock(&desc->spinlock); 3287 } 3288 3289 static void 3290 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3291 struct spdk_io_channel *io_ch, void *_ctx) 3292 { 3293 struct poll_timeout_ctx *ctx = _ctx; 3294 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3295 struct spdk_bdev_desc *desc = ctx->desc; 3296 struct spdk_bdev_io *bdev_io; 3297 uint64_t now; 3298 3299 spdk_spin_lock(&desc->spinlock); 3300 if (desc->closed == true) { 3301 spdk_spin_unlock(&desc->spinlock); 3302 spdk_bdev_for_each_channel_continue(i, -1); 3303 return; 3304 } 3305 spdk_spin_unlock(&desc->spinlock); 3306 3307 now = spdk_get_ticks(); 3308 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3309 /* Exclude any I/O that are generated via splitting. */ 3310 if (bdev_io->internal.cb == bdev_io_split_done) { 3311 continue; 3312 } 3313 3314 /* Once we find an I/O that has not timed out, we can immediately 3315 * exit the loop. 3316 */ 3317 if (now < (bdev_io->internal.submit_tsc + 3318 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3319 goto end; 3320 } 3321 3322 if (bdev_io->internal.desc == desc) { 3323 ctx->cb_fn(ctx->cb_arg, bdev_io); 3324 } 3325 } 3326 3327 end: 3328 spdk_bdev_for_each_channel_continue(i, 0); 3329 } 3330 3331 static int 3332 bdev_poll_timeout_io(void *arg) 3333 { 3334 struct spdk_bdev_desc *desc = arg; 3335 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3336 struct poll_timeout_ctx *ctx; 3337 3338 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3339 if (!ctx) { 3340 SPDK_ERRLOG("failed to allocate memory\n"); 3341 return SPDK_POLLER_BUSY; 3342 } 3343 ctx->desc = desc; 3344 ctx->cb_arg = desc->cb_arg; 3345 ctx->cb_fn = desc->cb_fn; 3346 ctx->timeout_in_sec = desc->timeout_in_sec; 3347 3348 /* Take a ref on the descriptor in case it gets closed while we are checking 3349 * all of the channels. 3350 */ 3351 spdk_spin_lock(&desc->spinlock); 3352 desc->refs++; 3353 spdk_spin_unlock(&desc->spinlock); 3354 3355 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 3356 bdev_channel_poll_timeout_io_done); 3357 3358 return SPDK_POLLER_BUSY; 3359 } 3360 3361 int 3362 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3363 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3364 { 3365 assert(desc->thread == spdk_get_thread()); 3366 3367 spdk_poller_unregister(&desc->io_timeout_poller); 3368 3369 if (timeout_in_sec) { 3370 assert(cb_fn != NULL); 3371 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3372 desc, 3373 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3374 1000); 3375 if (desc->io_timeout_poller == NULL) { 3376 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3377 return -1; 3378 } 3379 } 3380 3381 desc->cb_fn = cb_fn; 3382 desc->cb_arg = cb_arg; 3383 desc->timeout_in_sec = timeout_in_sec; 3384 3385 return 0; 3386 } 3387 3388 static int 3389 bdev_channel_create(void *io_device, void *ctx_buf) 3390 { 3391 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3392 struct spdk_bdev_channel *ch = ctx_buf; 3393 struct spdk_io_channel *mgmt_io_ch; 3394 struct spdk_bdev_mgmt_channel *mgmt_ch; 3395 struct spdk_bdev_shared_resource *shared_resource; 3396 struct lba_range *range; 3397 3398 ch->bdev = bdev; 3399 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3400 if (!ch->channel) { 3401 return -1; 3402 } 3403 3404 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3405 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3406 3407 assert(ch->histogram == NULL); 3408 if (bdev->internal.histogram_enabled) { 3409 ch->histogram = spdk_histogram_data_alloc(); 3410 if (ch->histogram == NULL) { 3411 SPDK_ERRLOG("Could not allocate histogram\n"); 3412 } 3413 } 3414 3415 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3416 if (!mgmt_io_ch) { 3417 spdk_put_io_channel(ch->channel); 3418 return -1; 3419 } 3420 3421 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 3422 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3423 if (shared_resource->shared_ch == ch->channel) { 3424 spdk_put_io_channel(mgmt_io_ch); 3425 shared_resource->ref++; 3426 break; 3427 } 3428 } 3429 3430 if (shared_resource == NULL) { 3431 shared_resource = calloc(1, sizeof(*shared_resource)); 3432 if (shared_resource == NULL) { 3433 spdk_put_io_channel(ch->channel); 3434 spdk_put_io_channel(mgmt_io_ch); 3435 return -1; 3436 } 3437 3438 shared_resource->mgmt_ch = mgmt_ch; 3439 shared_resource->io_outstanding = 0; 3440 TAILQ_INIT(&shared_resource->nomem_io); 3441 shared_resource->nomem_threshold = 0; 3442 shared_resource->shared_ch = ch->channel; 3443 shared_resource->ref = 1; 3444 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3445 } 3446 3447 ch->io_outstanding = 0; 3448 TAILQ_INIT(&ch->queued_resets); 3449 TAILQ_INIT(&ch->locked_ranges); 3450 ch->flags = 0; 3451 ch->shared_resource = shared_resource; 3452 3453 TAILQ_INIT(&ch->io_submitted); 3454 TAILQ_INIT(&ch->io_locked); 3455 3456 ch->stat = bdev_alloc_io_stat(false); 3457 if (ch->stat == NULL) { 3458 bdev_channel_destroy_resource(ch); 3459 return -1; 3460 } 3461 3462 ch->stat->ticks_rate = spdk_get_ticks_hz(); 3463 3464 #ifdef SPDK_CONFIG_VTUNE 3465 { 3466 char *name; 3467 __itt_init_ittlib(NULL, 0); 3468 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3469 if (!name) { 3470 bdev_channel_destroy_resource(ch); 3471 return -1; 3472 } 3473 ch->handle = __itt_string_handle_create(name); 3474 free(name); 3475 ch->start_tsc = spdk_get_ticks(); 3476 ch->interval_tsc = spdk_get_ticks_hz() / 100; 3477 ch->prev_stat = bdev_alloc_io_stat(false); 3478 if (ch->prev_stat == NULL) { 3479 bdev_channel_destroy_resource(ch); 3480 return -1; 3481 } 3482 } 3483 #endif 3484 3485 spdk_spin_lock(&bdev->internal.spinlock); 3486 bdev_enable_qos(bdev, ch); 3487 3488 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 3489 struct lba_range *new_range; 3490 3491 new_range = calloc(1, sizeof(*new_range)); 3492 if (new_range == NULL) { 3493 spdk_spin_unlock(&bdev->internal.spinlock); 3494 bdev_channel_destroy_resource(ch); 3495 return -1; 3496 } 3497 new_range->length = range->length; 3498 new_range->offset = range->offset; 3499 new_range->locked_ctx = range->locked_ctx; 3500 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 3501 } 3502 3503 spdk_spin_unlock(&bdev->internal.spinlock); 3504 3505 return 0; 3506 } 3507 3508 static int 3509 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 3510 void *cb_ctx) 3511 { 3512 struct spdk_bdev_channel *bdev_ch = cb_ctx; 3513 struct spdk_bdev_io *bdev_io; 3514 uint64_t buf_len; 3515 3516 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 3517 if (bdev_io->internal.ch == bdev_ch) { 3518 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 3519 spdk_iobuf_entry_abort(ch, entry, buf_len); 3520 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3521 } 3522 3523 return 0; 3524 } 3525 3526 /* 3527 * Abort I/O that are waiting on a data buffer. 3528 */ 3529 static void 3530 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 3531 { 3532 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 3533 bdev_abort_all_buf_io_cb, ch); 3534 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 3535 bdev_abort_all_buf_io_cb, ch); 3536 } 3537 3538 /* 3539 * Abort I/O that are queued waiting for submission. These types of I/O are 3540 * linked using the spdk_bdev_io link TAILQ_ENTRY. 3541 */ 3542 static void 3543 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 3544 { 3545 struct spdk_bdev_io *bdev_io, *tmp; 3546 3547 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 3548 if (bdev_io->internal.ch == ch) { 3549 TAILQ_REMOVE(queue, bdev_io, internal.link); 3550 /* 3551 * spdk_bdev_io_complete() assumes that the completed I/O had 3552 * been submitted to the bdev module. Since in this case it 3553 * hadn't, bump io_outstanding to account for the decrement 3554 * that spdk_bdev_io_complete() will do. 3555 */ 3556 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 3557 ch->io_outstanding++; 3558 ch->shared_resource->io_outstanding++; 3559 } 3560 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3561 } 3562 } 3563 } 3564 3565 static bool 3566 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3567 { 3568 struct spdk_bdev_io *bdev_io; 3569 3570 TAILQ_FOREACH(bdev_io, queue, internal.link) { 3571 if (bdev_io == bio_to_abort) { 3572 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 3573 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3574 return true; 3575 } 3576 } 3577 3578 return false; 3579 } 3580 3581 static int 3582 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 3583 { 3584 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 3585 uint64_t buf_len; 3586 3587 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 3588 if (bdev_io == bio_to_abort) { 3589 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 3590 spdk_iobuf_entry_abort(ch, entry, buf_len); 3591 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3592 return 1; 3593 } 3594 3595 return 0; 3596 } 3597 3598 static bool 3599 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 3600 { 3601 int rc; 3602 3603 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 3604 bdev_abort_buf_io_cb, bio_to_abort); 3605 if (rc == 1) { 3606 return true; 3607 } 3608 3609 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 3610 bdev_abort_buf_io_cb, bio_to_abort); 3611 return rc == 1; 3612 } 3613 3614 static void 3615 bdev_qos_channel_destroy(void *cb_arg) 3616 { 3617 struct spdk_bdev_qos *qos = cb_arg; 3618 3619 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 3620 spdk_poller_unregister(&qos->poller); 3621 3622 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 3623 3624 free(qos); 3625 } 3626 3627 static int 3628 bdev_qos_destroy(struct spdk_bdev *bdev) 3629 { 3630 int i; 3631 3632 /* 3633 * Cleanly shutting down the QoS poller is tricky, because 3634 * during the asynchronous operation the user could open 3635 * a new descriptor and create a new channel, spawning 3636 * a new QoS poller. 3637 * 3638 * The strategy is to create a new QoS structure here and swap it 3639 * in. The shutdown path then continues to refer to the old one 3640 * until it completes and then releases it. 3641 */ 3642 struct spdk_bdev_qos *new_qos, *old_qos; 3643 3644 old_qos = bdev->internal.qos; 3645 3646 new_qos = calloc(1, sizeof(*new_qos)); 3647 if (!new_qos) { 3648 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 3649 return -ENOMEM; 3650 } 3651 3652 /* Copy the old QoS data into the newly allocated structure */ 3653 memcpy(new_qos, old_qos, sizeof(*new_qos)); 3654 3655 /* Zero out the key parts of the QoS structure */ 3656 new_qos->ch = NULL; 3657 new_qos->thread = NULL; 3658 new_qos->poller = NULL; 3659 TAILQ_INIT(&new_qos->queued); 3660 /* 3661 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 3662 * It will be used later for the new QoS structure. 3663 */ 3664 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3665 new_qos->rate_limits[i].remaining_this_timeslice = 0; 3666 new_qos->rate_limits[i].min_per_timeslice = 0; 3667 new_qos->rate_limits[i].max_per_timeslice = 0; 3668 } 3669 3670 bdev->internal.qos = new_qos; 3671 3672 if (old_qos->thread == NULL) { 3673 free(old_qos); 3674 } else { 3675 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 3676 } 3677 3678 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 3679 * been destroyed yet. The destruction path will end up waiting for the final 3680 * channel to be put before it releases resources. */ 3681 3682 return 0; 3683 } 3684 3685 static void 3686 bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 3687 { 3688 total->bytes_read += add->bytes_read; 3689 total->num_read_ops += add->num_read_ops; 3690 total->bytes_written += add->bytes_written; 3691 total->num_write_ops += add->num_write_ops; 3692 total->bytes_unmapped += add->bytes_unmapped; 3693 total->num_unmap_ops += add->num_unmap_ops; 3694 total->bytes_copied += add->bytes_copied; 3695 total->num_copy_ops += add->num_copy_ops; 3696 total->read_latency_ticks += add->read_latency_ticks; 3697 total->write_latency_ticks += add->write_latency_ticks; 3698 total->unmap_latency_ticks += add->unmap_latency_ticks; 3699 total->copy_latency_ticks += add->copy_latency_ticks; 3700 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 3701 total->max_read_latency_ticks = add->max_read_latency_ticks; 3702 } 3703 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 3704 total->min_read_latency_ticks = add->min_read_latency_ticks; 3705 } 3706 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 3707 total->max_write_latency_ticks = add->max_write_latency_ticks; 3708 } 3709 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 3710 total->min_write_latency_ticks = add->min_write_latency_ticks; 3711 } 3712 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 3713 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 3714 } 3715 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 3716 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 3717 } 3718 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 3719 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 3720 } 3721 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 3722 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 3723 } 3724 } 3725 3726 static void 3727 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 3728 { 3729 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 3730 3731 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 3732 memcpy(to_stat->io_error, from_stat->io_error, 3733 sizeof(struct spdk_bdev_io_error_stat)); 3734 } 3735 } 3736 3737 static void 3738 bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum bdev_reset_stat_mode mode) 3739 { 3740 stat->max_read_latency_ticks = 0; 3741 stat->min_read_latency_ticks = UINT64_MAX; 3742 stat->max_write_latency_ticks = 0; 3743 stat->min_write_latency_ticks = UINT64_MAX; 3744 stat->max_unmap_latency_ticks = 0; 3745 stat->min_unmap_latency_ticks = UINT64_MAX; 3746 stat->max_copy_latency_ticks = 0; 3747 stat->min_copy_latency_ticks = UINT64_MAX; 3748 3749 if (mode != BDEV_RESET_STAT_ALL) { 3750 return; 3751 } 3752 3753 stat->bytes_read = 0; 3754 stat->num_read_ops = 0; 3755 stat->bytes_written = 0; 3756 stat->num_write_ops = 0; 3757 stat->bytes_unmapped = 0; 3758 stat->num_unmap_ops = 0; 3759 stat->read_latency_ticks = 0; 3760 stat->write_latency_ticks = 0; 3761 stat->unmap_latency_ticks = 0; 3762 3763 if (stat->io_error != NULL) { 3764 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 3765 } 3766 } 3767 3768 struct spdk_bdev_io_stat * 3769 bdev_alloc_io_stat(bool io_error_stat) 3770 { 3771 struct spdk_bdev_io_stat *stat; 3772 3773 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 3774 if (stat == NULL) { 3775 return NULL; 3776 } 3777 3778 if (io_error_stat) { 3779 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 3780 if (stat->io_error == NULL) { 3781 free(stat); 3782 return NULL; 3783 } 3784 } else { 3785 stat->io_error = NULL; 3786 } 3787 3788 bdev_reset_io_stat(stat, BDEV_RESET_STAT_ALL); 3789 3790 return stat; 3791 } 3792 3793 void 3794 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 3795 { 3796 free(stat->io_error); 3797 free(stat); 3798 } 3799 3800 void 3801 bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 3802 { 3803 int i; 3804 3805 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 3806 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 3807 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 3808 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 3809 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 3810 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 3811 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 3812 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 3813 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 3814 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 3815 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 3816 stat->min_read_latency_ticks != UINT64_MAX ? 3817 stat->min_read_latency_ticks : 0); 3818 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 3819 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 3820 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 3821 stat->min_write_latency_ticks != UINT64_MAX ? 3822 stat->min_write_latency_ticks : 0); 3823 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 3824 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 3825 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 3826 stat->min_unmap_latency_ticks != UINT64_MAX ? 3827 stat->min_unmap_latency_ticks : 0); 3828 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 3829 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 3830 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 3831 stat->min_copy_latency_ticks != UINT64_MAX ? 3832 stat->min_copy_latency_ticks : 0); 3833 3834 if (stat->io_error != NULL) { 3835 spdk_json_write_named_object_begin(w, "io_error"); 3836 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 3837 if (stat->io_error->error_status[i] != 0) { 3838 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 3839 stat->io_error->error_status[i]); 3840 } 3841 } 3842 spdk_json_write_object_end(w); 3843 } 3844 } 3845 3846 static void 3847 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 3848 { 3849 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 3850 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 3851 3852 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 3853 bdev_abort_all_buf_io(mgmt_ch, ch); 3854 bdev_abort_all_buf_io(mgmt_ch, ch); 3855 } 3856 3857 static void 3858 bdev_channel_destroy(void *io_device, void *ctx_buf) 3859 { 3860 struct spdk_bdev_channel *ch = ctx_buf; 3861 3862 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 3863 spdk_get_thread()); 3864 3865 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 3866 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3867 3868 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 3869 spdk_spin_lock(&ch->bdev->internal.spinlock); 3870 bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 3871 spdk_spin_unlock(&ch->bdev->internal.spinlock); 3872 3873 bdev_abort_all_queued_io(&ch->queued_resets, ch); 3874 3875 bdev_channel_abort_queued_ios(ch); 3876 3877 if (ch->histogram) { 3878 spdk_histogram_data_free(ch->histogram); 3879 } 3880 3881 bdev_channel_destroy_resource(ch); 3882 } 3883 3884 /* 3885 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 3886 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 3887 */ 3888 static int 3889 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 3890 { 3891 struct spdk_bdev_name *tmp; 3892 3893 bdev_name->name = strdup(name); 3894 if (bdev_name->name == NULL) { 3895 SPDK_ERRLOG("Unable to allocate bdev name\n"); 3896 return -ENOMEM; 3897 } 3898 3899 bdev_name->bdev = bdev; 3900 3901 spdk_spin_lock(&g_bdev_mgr.spinlock); 3902 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3903 spdk_spin_unlock(&g_bdev_mgr.spinlock); 3904 3905 if (tmp != NULL) { 3906 SPDK_ERRLOG("Bdev name %s already exists\n", name); 3907 free(bdev_name->name); 3908 return -EEXIST; 3909 } 3910 3911 return 0; 3912 } 3913 3914 static void 3915 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 3916 { 3917 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3918 free(bdev_name->name); 3919 } 3920 3921 static void 3922 bdev_name_del(struct spdk_bdev_name *bdev_name) 3923 { 3924 spdk_spin_lock(&g_bdev_mgr.spinlock); 3925 bdev_name_del_unsafe(bdev_name); 3926 spdk_spin_unlock(&g_bdev_mgr.spinlock); 3927 } 3928 3929 int 3930 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 3931 { 3932 struct spdk_bdev_alias *tmp; 3933 int ret; 3934 3935 if (alias == NULL) { 3936 SPDK_ERRLOG("Empty alias passed\n"); 3937 return -EINVAL; 3938 } 3939 3940 tmp = calloc(1, sizeof(*tmp)); 3941 if (tmp == NULL) { 3942 SPDK_ERRLOG("Unable to allocate alias\n"); 3943 return -ENOMEM; 3944 } 3945 3946 ret = bdev_name_add(&tmp->alias, bdev, alias); 3947 if (ret != 0) { 3948 free(tmp); 3949 return ret; 3950 } 3951 3952 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 3953 3954 return 0; 3955 } 3956 3957 static int 3958 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 3959 void (*alias_del_fn)(struct spdk_bdev_name *n)) 3960 { 3961 struct spdk_bdev_alias *tmp; 3962 3963 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 3964 if (strcmp(alias, tmp->alias.name) == 0) { 3965 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 3966 alias_del_fn(&tmp->alias); 3967 free(tmp); 3968 return 0; 3969 } 3970 } 3971 3972 return -ENOENT; 3973 } 3974 3975 int 3976 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 3977 { 3978 int rc; 3979 3980 rc = bdev_alias_del(bdev, alias, bdev_name_del); 3981 if (rc == -ENOENT) { 3982 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 3983 } 3984 3985 return rc; 3986 } 3987 3988 void 3989 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 3990 { 3991 struct spdk_bdev_alias *p, *tmp; 3992 3993 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 3994 TAILQ_REMOVE(&bdev->aliases, p, tailq); 3995 bdev_name_del(&p->alias); 3996 free(p); 3997 } 3998 } 3999 4000 struct spdk_io_channel * 4001 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4002 { 4003 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4004 } 4005 4006 void * 4007 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4008 { 4009 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4010 void *ctx = NULL; 4011 4012 if (bdev->fn_table->get_module_ctx) { 4013 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4014 } 4015 4016 return ctx; 4017 } 4018 4019 const char * 4020 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4021 { 4022 return bdev->module->name; 4023 } 4024 4025 const char * 4026 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4027 { 4028 return bdev->name; 4029 } 4030 4031 const char * 4032 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4033 { 4034 return bdev->product_name; 4035 } 4036 4037 const struct spdk_bdev_aliases_list * 4038 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4039 { 4040 return &bdev->aliases; 4041 } 4042 4043 uint32_t 4044 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4045 { 4046 return bdev->blocklen; 4047 } 4048 4049 uint32_t 4050 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4051 { 4052 return bdev->write_unit_size; 4053 } 4054 4055 uint64_t 4056 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4057 { 4058 return bdev->blockcnt; 4059 } 4060 4061 const char * 4062 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4063 { 4064 return qos_rpc_type[type]; 4065 } 4066 4067 void 4068 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4069 { 4070 int i; 4071 4072 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4073 4074 spdk_spin_lock(&bdev->internal.spinlock); 4075 if (bdev->internal.qos) { 4076 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4077 if (bdev->internal.qos->rate_limits[i].limit != 4078 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4079 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4080 if (bdev_qos_is_iops_rate_limit(i) == false) { 4081 /* Change from Byte to Megabyte which is user visible. */ 4082 limits[i] = limits[i] / 1024 / 1024; 4083 } 4084 } 4085 } 4086 } 4087 spdk_spin_unlock(&bdev->internal.spinlock); 4088 } 4089 4090 size_t 4091 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4092 { 4093 return 1 << bdev->required_alignment; 4094 } 4095 4096 uint32_t 4097 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4098 { 4099 return bdev->optimal_io_boundary; 4100 } 4101 4102 bool 4103 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4104 { 4105 return bdev->write_cache; 4106 } 4107 4108 const struct spdk_uuid * 4109 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4110 { 4111 return &bdev->uuid; 4112 } 4113 4114 uint16_t 4115 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4116 { 4117 return bdev->acwu; 4118 } 4119 4120 uint32_t 4121 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4122 { 4123 return bdev->md_len; 4124 } 4125 4126 bool 4127 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4128 { 4129 return (bdev->md_len != 0) && bdev->md_interleave; 4130 } 4131 4132 bool 4133 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4134 { 4135 return (bdev->md_len != 0) && !bdev->md_interleave; 4136 } 4137 4138 bool 4139 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4140 { 4141 return bdev->zoned; 4142 } 4143 4144 uint32_t 4145 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4146 { 4147 if (spdk_bdev_is_md_interleaved(bdev)) { 4148 return bdev->blocklen - bdev->md_len; 4149 } else { 4150 return bdev->blocklen; 4151 } 4152 } 4153 4154 uint32_t 4155 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4156 { 4157 return bdev->phys_blocklen; 4158 } 4159 4160 static uint32_t 4161 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4162 { 4163 if (!spdk_bdev_is_md_interleaved(bdev)) { 4164 return bdev->blocklen + bdev->md_len; 4165 } else { 4166 return bdev->blocklen; 4167 } 4168 } 4169 4170 /* We have to use the typedef in the function declaration to appease astyle. */ 4171 typedef enum spdk_dif_type spdk_dif_type_t; 4172 4173 spdk_dif_type_t 4174 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4175 { 4176 if (bdev->md_len != 0) { 4177 return bdev->dif_type; 4178 } else { 4179 return SPDK_DIF_DISABLE; 4180 } 4181 } 4182 4183 bool 4184 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4185 { 4186 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4187 return bdev->dif_is_head_of_md; 4188 } else { 4189 return false; 4190 } 4191 } 4192 4193 bool 4194 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4195 enum spdk_dif_check_type check_type) 4196 { 4197 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4198 return false; 4199 } 4200 4201 switch (check_type) { 4202 case SPDK_DIF_CHECK_TYPE_REFTAG: 4203 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4204 case SPDK_DIF_CHECK_TYPE_APPTAG: 4205 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4206 case SPDK_DIF_CHECK_TYPE_GUARD: 4207 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4208 default: 4209 return false; 4210 } 4211 } 4212 4213 uint32_t 4214 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4215 { 4216 return bdev->max_copy; 4217 } 4218 4219 uint64_t 4220 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4221 { 4222 return bdev->internal.measured_queue_depth; 4223 } 4224 4225 uint64_t 4226 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 4227 { 4228 return bdev->internal.period; 4229 } 4230 4231 uint64_t 4232 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 4233 { 4234 return bdev->internal.weighted_io_time; 4235 } 4236 4237 uint64_t 4238 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 4239 { 4240 return bdev->internal.io_time; 4241 } 4242 4243 static void bdev_update_qd_sampling_period(void *ctx); 4244 4245 static void 4246 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 4247 { 4248 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 4249 4250 if (bdev->internal.measured_queue_depth) { 4251 bdev->internal.io_time += bdev->internal.period; 4252 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 4253 } 4254 4255 bdev->internal.qd_poll_in_progress = false; 4256 4257 bdev_update_qd_sampling_period(bdev); 4258 } 4259 4260 static void 4261 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4262 struct spdk_io_channel *io_ch, void *_ctx) 4263 { 4264 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 4265 4266 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4267 spdk_bdev_for_each_channel_continue(i, 0); 4268 } 4269 4270 static int 4271 bdev_calculate_measured_queue_depth(void *ctx) 4272 { 4273 struct spdk_bdev *bdev = ctx; 4274 4275 bdev->internal.qd_poll_in_progress = true; 4276 bdev->internal.temporary_queue_depth = 0; 4277 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 4278 return SPDK_POLLER_BUSY; 4279 } 4280 4281 static void 4282 bdev_update_qd_sampling_period(void *ctx) 4283 { 4284 struct spdk_bdev *bdev = ctx; 4285 4286 if (bdev->internal.period == bdev->internal.new_period) { 4287 return; 4288 } 4289 4290 if (bdev->internal.qd_poll_in_progress) { 4291 return; 4292 } 4293 4294 bdev->internal.period = bdev->internal.new_period; 4295 4296 spdk_poller_unregister(&bdev->internal.qd_poller); 4297 if (bdev->internal.period != 0) { 4298 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4299 bdev, bdev->internal.period); 4300 } else { 4301 spdk_bdev_close(bdev->internal.qd_desc); 4302 bdev->internal.qd_desc = NULL; 4303 } 4304 } 4305 4306 static void 4307 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4308 { 4309 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4310 } 4311 4312 void 4313 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4314 { 4315 int rc; 4316 4317 if (bdev->internal.new_period == period) { 4318 return; 4319 } 4320 4321 bdev->internal.new_period = period; 4322 4323 if (bdev->internal.qd_desc != NULL) { 4324 assert(bdev->internal.period != 0); 4325 4326 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4327 bdev_update_qd_sampling_period, bdev); 4328 return; 4329 } 4330 4331 assert(bdev->internal.period == 0); 4332 4333 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4334 NULL, &bdev->internal.qd_desc); 4335 if (rc != 0) { 4336 return; 4337 } 4338 4339 bdev->internal.period = period; 4340 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4341 bdev, period); 4342 } 4343 4344 struct bdev_get_current_qd_ctx { 4345 uint64_t current_qd; 4346 spdk_bdev_get_current_qd_cb cb_fn; 4347 void *cb_arg; 4348 }; 4349 4350 static void 4351 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 4352 { 4353 struct bdev_get_current_qd_ctx *ctx = _ctx; 4354 4355 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 4356 4357 free(ctx); 4358 } 4359 4360 static void 4361 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4362 struct spdk_io_channel *io_ch, void *_ctx) 4363 { 4364 struct bdev_get_current_qd_ctx *ctx = _ctx; 4365 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4366 4367 ctx->current_qd += bdev_ch->io_outstanding; 4368 4369 spdk_bdev_for_each_channel_continue(i, 0); 4370 } 4371 4372 void 4373 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 4374 void *cb_arg) 4375 { 4376 struct bdev_get_current_qd_ctx *ctx; 4377 4378 assert(cb_fn != NULL); 4379 4380 ctx = calloc(1, sizeof(*ctx)); 4381 if (ctx == NULL) { 4382 cb_fn(bdev, 0, cb_arg, -ENOMEM); 4383 return; 4384 } 4385 4386 ctx->cb_fn = cb_fn; 4387 ctx->cb_arg = cb_arg; 4388 4389 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 4390 } 4391 4392 static void 4393 _resize_notify(void *arg) 4394 { 4395 struct spdk_bdev_desc *desc = arg; 4396 4397 spdk_spin_lock(&desc->spinlock); 4398 desc->refs--; 4399 if (!desc->closed) { 4400 spdk_spin_unlock(&desc->spinlock); 4401 desc->callback.event_fn(SPDK_BDEV_EVENT_RESIZE, 4402 desc->bdev, 4403 desc->callback.ctx); 4404 return; 4405 } else if (0 == desc->refs) { 4406 /* This descriptor was closed after this resize_notify message was sent. 4407 * spdk_bdev_close() could not free the descriptor since this message was 4408 * in flight, so we free it now using bdev_desc_free(). 4409 */ 4410 spdk_spin_unlock(&desc->spinlock); 4411 bdev_desc_free(desc); 4412 return; 4413 } 4414 spdk_spin_unlock(&desc->spinlock); 4415 } 4416 4417 int 4418 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 4419 { 4420 struct spdk_bdev_desc *desc; 4421 int ret; 4422 4423 if (size == bdev->blockcnt) { 4424 return 0; 4425 } 4426 4427 spdk_spin_lock(&bdev->internal.spinlock); 4428 4429 /* bdev has open descriptors */ 4430 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 4431 bdev->blockcnt > size) { 4432 ret = -EBUSY; 4433 } else { 4434 bdev->blockcnt = size; 4435 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 4436 spdk_spin_lock(&desc->spinlock); 4437 if (!desc->closed) { 4438 desc->refs++; 4439 spdk_thread_send_msg(desc->thread, _resize_notify, desc); 4440 } 4441 spdk_spin_unlock(&desc->spinlock); 4442 } 4443 ret = 0; 4444 } 4445 4446 spdk_spin_unlock(&bdev->internal.spinlock); 4447 4448 return ret; 4449 } 4450 4451 /* 4452 * Convert I/O offset and length from bytes to blocks. 4453 * 4454 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 4455 */ 4456 static uint64_t 4457 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 4458 uint64_t num_bytes, uint64_t *num_blocks) 4459 { 4460 uint32_t block_size = bdev->blocklen; 4461 uint8_t shift_cnt; 4462 4463 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 4464 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 4465 shift_cnt = spdk_u32log2(block_size); 4466 *offset_blocks = offset_bytes >> shift_cnt; 4467 *num_blocks = num_bytes >> shift_cnt; 4468 return (offset_bytes - (*offset_blocks << shift_cnt)) | 4469 (num_bytes - (*num_blocks << shift_cnt)); 4470 } else { 4471 *offset_blocks = offset_bytes / block_size; 4472 *num_blocks = num_bytes / block_size; 4473 return (offset_bytes % block_size) | (num_bytes % block_size); 4474 } 4475 } 4476 4477 static bool 4478 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 4479 { 4480 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 4481 * has been an overflow and hence the offset has been wrapped around */ 4482 if (offset_blocks + num_blocks < offset_blocks) { 4483 return false; 4484 } 4485 4486 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 4487 if (offset_blocks + num_blocks > bdev->blockcnt) { 4488 return false; 4489 } 4490 4491 return true; 4492 } 4493 4494 static void 4495 bdev_seek_complete_cb(void *ctx) 4496 { 4497 struct spdk_bdev_io *bdev_io = ctx; 4498 4499 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4500 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 4501 } 4502 4503 static int 4504 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4505 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 4506 spdk_bdev_io_completion_cb cb, void *cb_arg) 4507 { 4508 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4509 struct spdk_bdev_io *bdev_io; 4510 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4511 4512 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 4513 4514 /* Check if offset_blocks is valid looking at the validity of one block */ 4515 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 4516 return -EINVAL; 4517 } 4518 4519 bdev_io = bdev_channel_get_io(channel); 4520 if (!bdev_io) { 4521 return -ENOMEM; 4522 } 4523 4524 bdev_io->internal.ch = channel; 4525 bdev_io->internal.desc = desc; 4526 bdev_io->type = io_type; 4527 bdev_io->u.bdev.offset_blocks = offset_blocks; 4528 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4529 4530 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 4531 /* In case bdev doesn't support seek to next data/hole offset, 4532 * it is assumed that only data and no holes are present */ 4533 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 4534 bdev_io->u.bdev.seek.offset = offset_blocks; 4535 } else { 4536 bdev_io->u.bdev.seek.offset = UINT64_MAX; 4537 } 4538 4539 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 4540 return 0; 4541 } 4542 4543 bdev_io_submit(bdev_io); 4544 return 0; 4545 } 4546 4547 int 4548 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4549 uint64_t offset_blocks, 4550 spdk_bdev_io_completion_cb cb, void *cb_arg) 4551 { 4552 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 4553 } 4554 4555 int 4556 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4557 uint64_t offset_blocks, 4558 spdk_bdev_io_completion_cb cb, void *cb_arg) 4559 { 4560 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 4561 } 4562 4563 uint64_t 4564 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io) 4565 { 4566 return bdev_io->u.bdev.seek.offset; 4567 } 4568 4569 static int 4570 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 4571 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4572 spdk_bdev_io_completion_cb cb, void *cb_arg) 4573 { 4574 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4575 struct spdk_bdev_io *bdev_io; 4576 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4577 4578 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4579 return -EINVAL; 4580 } 4581 4582 bdev_io = bdev_channel_get_io(channel); 4583 if (!bdev_io) { 4584 return -ENOMEM; 4585 } 4586 4587 bdev_io->internal.ch = channel; 4588 bdev_io->internal.desc = desc; 4589 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4590 bdev_io->u.bdev.iovs = &bdev_io->iov; 4591 bdev_io->u.bdev.iovs[0].iov_base = buf; 4592 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4593 bdev_io->u.bdev.iovcnt = 1; 4594 bdev_io->u.bdev.md_buf = md_buf; 4595 bdev_io->u.bdev.num_blocks = num_blocks; 4596 bdev_io->u.bdev.offset_blocks = offset_blocks; 4597 bdev_io->u.bdev.ext_opts = NULL; 4598 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4599 4600 bdev_io_submit(bdev_io); 4601 return 0; 4602 } 4603 4604 int 4605 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4606 void *buf, uint64_t offset, uint64_t nbytes, 4607 spdk_bdev_io_completion_cb cb, void *cb_arg) 4608 { 4609 uint64_t offset_blocks, num_blocks; 4610 4611 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4612 nbytes, &num_blocks) != 0) { 4613 return -EINVAL; 4614 } 4615 4616 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4617 } 4618 4619 int 4620 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4621 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4622 spdk_bdev_io_completion_cb cb, void *cb_arg) 4623 { 4624 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 4625 } 4626 4627 int 4628 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4629 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4630 spdk_bdev_io_completion_cb cb, void *cb_arg) 4631 { 4632 struct iovec iov = { 4633 .iov_base = buf, 4634 }; 4635 4636 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4637 return -EINVAL; 4638 } 4639 4640 if (md_buf && !_is_buf_allocated(&iov)) { 4641 return -EINVAL; 4642 } 4643 4644 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4645 cb, cb_arg); 4646 } 4647 4648 int 4649 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4650 struct iovec *iov, int iovcnt, 4651 uint64_t offset, uint64_t nbytes, 4652 spdk_bdev_io_completion_cb cb, void *cb_arg) 4653 { 4654 uint64_t offset_blocks, num_blocks; 4655 4656 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4657 nbytes, &num_blocks) != 0) { 4658 return -EINVAL; 4659 } 4660 4661 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4662 } 4663 4664 static int 4665 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4666 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 4667 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 4668 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 4669 { 4670 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4671 struct spdk_bdev_io *bdev_io; 4672 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4673 4674 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4675 return -EINVAL; 4676 } 4677 4678 bdev_io = bdev_channel_get_io(channel); 4679 if (!bdev_io) { 4680 return -ENOMEM; 4681 } 4682 4683 bdev_io->internal.ch = channel; 4684 bdev_io->internal.desc = desc; 4685 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4686 bdev_io->u.bdev.iovs = iov; 4687 bdev_io->u.bdev.iovcnt = iovcnt; 4688 bdev_io->u.bdev.md_buf = md_buf; 4689 bdev_io->u.bdev.num_blocks = num_blocks; 4690 bdev_io->u.bdev.offset_blocks = offset_blocks; 4691 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4692 bdev_io->internal.ext_opts = opts; 4693 bdev_io->u.bdev.ext_opts = opts; 4694 4695 _bdev_io_submit_ext(desc, bdev_io, opts, copy_opts); 4696 4697 return 0; 4698 } 4699 4700 int 4701 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4702 struct iovec *iov, int iovcnt, 4703 uint64_t offset_blocks, uint64_t num_blocks, 4704 spdk_bdev_io_completion_cb cb, void *cb_arg) 4705 { 4706 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4707 num_blocks, cb, cb_arg, NULL, false); 4708 } 4709 4710 int 4711 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4712 struct iovec *iov, int iovcnt, void *md_buf, 4713 uint64_t offset_blocks, uint64_t num_blocks, 4714 spdk_bdev_io_completion_cb cb, void *cb_arg) 4715 { 4716 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4717 return -EINVAL; 4718 } 4719 4720 if (md_buf && !_is_buf_allocated(iov)) { 4721 return -EINVAL; 4722 } 4723 4724 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4725 num_blocks, cb, cb_arg, NULL, false); 4726 } 4727 4728 static inline bool 4729 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 4730 { 4731 /* 4732 * We check if opts size is at least of size when we first introduced 4733 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 4734 * are not checked internal. 4735 */ 4736 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 4737 sizeof(opts->metadata) && 4738 opts->size <= sizeof(*opts) && 4739 /* When memory domain is used, the user must provide data buffers */ 4740 (!opts->memory_domain || (iov && iov[0].iov_base)); 4741 } 4742 4743 int 4744 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4745 struct iovec *iov, int iovcnt, 4746 uint64_t offset_blocks, uint64_t num_blocks, 4747 spdk_bdev_io_completion_cb cb, void *cb_arg, 4748 struct spdk_bdev_ext_io_opts *opts) 4749 { 4750 void *md = NULL; 4751 4752 if (opts) { 4753 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 4754 return -EINVAL; 4755 } 4756 md = opts->metadata; 4757 } 4758 4759 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4760 return -EINVAL; 4761 } 4762 4763 if (md && !_is_buf_allocated(iov)) { 4764 return -EINVAL; 4765 } 4766 4767 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4768 num_blocks, cb, cb_arg, opts, false); 4769 } 4770 4771 static int 4772 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4773 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4774 spdk_bdev_io_completion_cb cb, void *cb_arg) 4775 { 4776 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4777 struct spdk_bdev_io *bdev_io; 4778 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4779 4780 if (!desc->write) { 4781 return -EBADF; 4782 } 4783 4784 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4785 return -EINVAL; 4786 } 4787 4788 bdev_io = bdev_channel_get_io(channel); 4789 if (!bdev_io) { 4790 return -ENOMEM; 4791 } 4792 4793 bdev_io->internal.ch = channel; 4794 bdev_io->internal.desc = desc; 4795 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4796 bdev_io->u.bdev.iovs = &bdev_io->iov; 4797 bdev_io->u.bdev.iovs[0].iov_base = buf; 4798 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4799 bdev_io->u.bdev.iovcnt = 1; 4800 bdev_io->u.bdev.md_buf = md_buf; 4801 bdev_io->u.bdev.num_blocks = num_blocks; 4802 bdev_io->u.bdev.offset_blocks = offset_blocks; 4803 bdev_io->u.bdev.ext_opts = NULL; 4804 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4805 4806 bdev_io_submit(bdev_io); 4807 return 0; 4808 } 4809 4810 int 4811 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4812 void *buf, uint64_t offset, uint64_t nbytes, 4813 spdk_bdev_io_completion_cb cb, void *cb_arg) 4814 { 4815 uint64_t offset_blocks, num_blocks; 4816 4817 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4818 nbytes, &num_blocks) != 0) { 4819 return -EINVAL; 4820 } 4821 4822 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4823 } 4824 4825 int 4826 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4827 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4828 spdk_bdev_io_completion_cb cb, void *cb_arg) 4829 { 4830 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4831 cb, cb_arg); 4832 } 4833 4834 int 4835 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4836 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4837 spdk_bdev_io_completion_cb cb, void *cb_arg) 4838 { 4839 struct iovec iov = { 4840 .iov_base = buf, 4841 }; 4842 4843 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4844 return -EINVAL; 4845 } 4846 4847 if (md_buf && !_is_buf_allocated(&iov)) { 4848 return -EINVAL; 4849 } 4850 4851 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4852 cb, cb_arg); 4853 } 4854 4855 static int 4856 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4857 struct iovec *iov, int iovcnt, void *md_buf, 4858 uint64_t offset_blocks, uint64_t num_blocks, 4859 spdk_bdev_io_completion_cb cb, void *cb_arg, 4860 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 4861 { 4862 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4863 struct spdk_bdev_io *bdev_io; 4864 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4865 4866 if (!desc->write) { 4867 return -EBADF; 4868 } 4869 4870 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4871 return -EINVAL; 4872 } 4873 4874 bdev_io = bdev_channel_get_io(channel); 4875 if (!bdev_io) { 4876 return -ENOMEM; 4877 } 4878 4879 bdev_io->internal.ch = channel; 4880 bdev_io->internal.desc = desc; 4881 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4882 bdev_io->u.bdev.iovs = iov; 4883 bdev_io->u.bdev.iovcnt = iovcnt; 4884 bdev_io->u.bdev.md_buf = md_buf; 4885 bdev_io->u.bdev.num_blocks = num_blocks; 4886 bdev_io->u.bdev.offset_blocks = offset_blocks; 4887 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4888 bdev_io->internal.ext_opts = opts; 4889 bdev_io->u.bdev.ext_opts = opts; 4890 4891 _bdev_io_submit_ext(desc, bdev_io, opts, copy_opts); 4892 4893 return 0; 4894 } 4895 4896 int 4897 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4898 struct iovec *iov, int iovcnt, 4899 uint64_t offset, uint64_t len, 4900 spdk_bdev_io_completion_cb cb, void *cb_arg) 4901 { 4902 uint64_t offset_blocks, num_blocks; 4903 4904 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4905 len, &num_blocks) != 0) { 4906 return -EINVAL; 4907 } 4908 4909 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4910 } 4911 4912 int 4913 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4914 struct iovec *iov, int iovcnt, 4915 uint64_t offset_blocks, uint64_t num_blocks, 4916 spdk_bdev_io_completion_cb cb, void *cb_arg) 4917 { 4918 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4919 num_blocks, cb, cb_arg, NULL, false); 4920 } 4921 4922 int 4923 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4924 struct iovec *iov, int iovcnt, void *md_buf, 4925 uint64_t offset_blocks, uint64_t num_blocks, 4926 spdk_bdev_io_completion_cb cb, void *cb_arg) 4927 { 4928 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4929 return -EINVAL; 4930 } 4931 4932 if (md_buf && !_is_buf_allocated(iov)) { 4933 return -EINVAL; 4934 } 4935 4936 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4937 num_blocks, cb, cb_arg, NULL, false); 4938 } 4939 4940 int 4941 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4942 struct iovec *iov, int iovcnt, 4943 uint64_t offset_blocks, uint64_t num_blocks, 4944 spdk_bdev_io_completion_cb cb, void *cb_arg, 4945 struct spdk_bdev_ext_io_opts *opts) 4946 { 4947 void *md = NULL; 4948 4949 if (opts) { 4950 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 4951 return -EINVAL; 4952 } 4953 md = opts->metadata; 4954 } 4955 4956 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4957 return -EINVAL; 4958 } 4959 4960 if (md && !_is_buf_allocated(iov)) { 4961 return -EINVAL; 4962 } 4963 4964 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4965 num_blocks, cb, cb_arg, opts, false); 4966 } 4967 4968 static void 4969 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4970 { 4971 struct spdk_bdev_io *parent_io = cb_arg; 4972 struct spdk_bdev *bdev = parent_io->bdev; 4973 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 4974 int i, rc = 0; 4975 4976 if (!success) { 4977 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4978 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 4979 spdk_bdev_free_io(bdev_io); 4980 return; 4981 } 4982 4983 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 4984 rc = memcmp(read_buf, 4985 parent_io->u.bdev.iovs[i].iov_base, 4986 parent_io->u.bdev.iovs[i].iov_len); 4987 if (rc) { 4988 break; 4989 } 4990 read_buf += parent_io->u.bdev.iovs[i].iov_len; 4991 } 4992 4993 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 4994 rc = memcmp(bdev_io->u.bdev.md_buf, 4995 parent_io->u.bdev.md_buf, 4996 spdk_bdev_get_md_size(bdev)); 4997 } 4998 4999 spdk_bdev_free_io(bdev_io); 5000 5001 if (rc == 0) { 5002 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5003 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5004 } else { 5005 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5006 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5007 } 5008 } 5009 5010 static void 5011 bdev_compare_do_read(void *_bdev_io) 5012 { 5013 struct spdk_bdev_io *bdev_io = _bdev_io; 5014 int rc; 5015 5016 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5017 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5018 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5019 bdev_compare_do_read_done, bdev_io); 5020 5021 if (rc == -ENOMEM) { 5022 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5023 } else if (rc != 0) { 5024 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5025 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5026 } 5027 } 5028 5029 static int 5030 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5031 struct iovec *iov, int iovcnt, void *md_buf, 5032 uint64_t offset_blocks, uint64_t num_blocks, 5033 spdk_bdev_io_completion_cb cb, void *cb_arg) 5034 { 5035 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5036 struct spdk_bdev_io *bdev_io; 5037 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5038 5039 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5040 return -EINVAL; 5041 } 5042 5043 bdev_io = bdev_channel_get_io(channel); 5044 if (!bdev_io) { 5045 return -ENOMEM; 5046 } 5047 5048 bdev_io->internal.ch = channel; 5049 bdev_io->internal.desc = desc; 5050 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5051 bdev_io->u.bdev.iovs = iov; 5052 bdev_io->u.bdev.iovcnt = iovcnt; 5053 bdev_io->u.bdev.md_buf = md_buf; 5054 bdev_io->u.bdev.num_blocks = num_blocks; 5055 bdev_io->u.bdev.offset_blocks = offset_blocks; 5056 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5057 bdev_io->u.bdev.ext_opts = NULL; 5058 5059 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5060 bdev_io_submit(bdev_io); 5061 return 0; 5062 } 5063 5064 bdev_compare_do_read(bdev_io); 5065 5066 return 0; 5067 } 5068 5069 int 5070 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5071 struct iovec *iov, int iovcnt, 5072 uint64_t offset_blocks, uint64_t num_blocks, 5073 spdk_bdev_io_completion_cb cb, void *cb_arg) 5074 { 5075 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5076 num_blocks, cb, cb_arg); 5077 } 5078 5079 int 5080 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5081 struct iovec *iov, int iovcnt, void *md_buf, 5082 uint64_t offset_blocks, uint64_t num_blocks, 5083 spdk_bdev_io_completion_cb cb, void *cb_arg) 5084 { 5085 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5086 return -EINVAL; 5087 } 5088 5089 if (md_buf && !_is_buf_allocated(iov)) { 5090 return -EINVAL; 5091 } 5092 5093 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5094 num_blocks, cb, cb_arg); 5095 } 5096 5097 static int 5098 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5099 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5100 spdk_bdev_io_completion_cb cb, void *cb_arg) 5101 { 5102 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5103 struct spdk_bdev_io *bdev_io; 5104 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5105 5106 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5107 return -EINVAL; 5108 } 5109 5110 bdev_io = bdev_channel_get_io(channel); 5111 if (!bdev_io) { 5112 return -ENOMEM; 5113 } 5114 5115 bdev_io->internal.ch = channel; 5116 bdev_io->internal.desc = desc; 5117 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5118 bdev_io->u.bdev.iovs = &bdev_io->iov; 5119 bdev_io->u.bdev.iovs[0].iov_base = buf; 5120 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5121 bdev_io->u.bdev.iovcnt = 1; 5122 bdev_io->u.bdev.md_buf = md_buf; 5123 bdev_io->u.bdev.num_blocks = num_blocks; 5124 bdev_io->u.bdev.offset_blocks = offset_blocks; 5125 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5126 bdev_io->u.bdev.ext_opts = NULL; 5127 5128 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5129 bdev_io_submit(bdev_io); 5130 return 0; 5131 } 5132 5133 bdev_compare_do_read(bdev_io); 5134 5135 return 0; 5136 } 5137 5138 int 5139 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5140 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5141 spdk_bdev_io_completion_cb cb, void *cb_arg) 5142 { 5143 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5144 cb, cb_arg); 5145 } 5146 5147 int 5148 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5149 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5150 spdk_bdev_io_completion_cb cb, void *cb_arg) 5151 { 5152 struct iovec iov = { 5153 .iov_base = buf, 5154 }; 5155 5156 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5157 return -EINVAL; 5158 } 5159 5160 if (md_buf && !_is_buf_allocated(&iov)) { 5161 return -EINVAL; 5162 } 5163 5164 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5165 cb, cb_arg); 5166 } 5167 5168 static void 5169 bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) 5170 { 5171 struct spdk_bdev_io *bdev_io = ctx; 5172 5173 if (unlock_status) { 5174 SPDK_ERRLOG("LBA range unlock failed\n"); 5175 } 5176 5177 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 5178 false, bdev_io->internal.caller_ctx); 5179 } 5180 5181 static void 5182 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 5183 { 5184 bdev_io->internal.status = status; 5185 5186 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 5187 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5188 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 5189 } 5190 5191 static void 5192 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5193 { 5194 struct spdk_bdev_io *parent_io = cb_arg; 5195 5196 if (!success) { 5197 SPDK_ERRLOG("Compare and write operation failed\n"); 5198 } 5199 5200 spdk_bdev_free_io(bdev_io); 5201 5202 bdev_comparev_and_writev_blocks_unlock(parent_io, 5203 success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 5204 } 5205 5206 static void 5207 bdev_compare_and_write_do_write(void *_bdev_io) 5208 { 5209 struct spdk_bdev_io *bdev_io = _bdev_io; 5210 int rc; 5211 5212 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 5213 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5214 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 5215 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5216 bdev_compare_and_write_do_write_done, bdev_io); 5217 5218 5219 if (rc == -ENOMEM) { 5220 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 5221 } else if (rc != 0) { 5222 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5223 } 5224 } 5225 5226 static void 5227 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5228 { 5229 struct spdk_bdev_io *parent_io = cb_arg; 5230 5231 spdk_bdev_free_io(bdev_io); 5232 5233 if (!success) { 5234 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 5235 return; 5236 } 5237 5238 bdev_compare_and_write_do_write(parent_io); 5239 } 5240 5241 static void 5242 bdev_compare_and_write_do_compare(void *_bdev_io) 5243 { 5244 struct spdk_bdev_io *bdev_io = _bdev_io; 5245 int rc; 5246 5247 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 5248 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 5249 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5250 bdev_compare_and_write_do_compare_done, bdev_io); 5251 5252 if (rc == -ENOMEM) { 5253 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 5254 } else if (rc != 0) { 5255 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 5256 } 5257 } 5258 5259 static void 5260 bdev_comparev_and_writev_blocks_locked(void *ctx, int status) 5261 { 5262 struct spdk_bdev_io *bdev_io = ctx; 5263 5264 if (status) { 5265 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 5266 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5267 return; 5268 } 5269 5270 bdev_compare_and_write_do_compare(bdev_io); 5271 } 5272 5273 int 5274 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5275 struct iovec *compare_iov, int compare_iovcnt, 5276 struct iovec *write_iov, int write_iovcnt, 5277 uint64_t offset_blocks, uint64_t num_blocks, 5278 spdk_bdev_io_completion_cb cb, void *cb_arg) 5279 { 5280 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5281 struct spdk_bdev_io *bdev_io; 5282 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5283 5284 if (!desc->write) { 5285 return -EBADF; 5286 } 5287 5288 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5289 return -EINVAL; 5290 } 5291 5292 if (num_blocks > bdev->acwu) { 5293 return -EINVAL; 5294 } 5295 5296 bdev_io = bdev_channel_get_io(channel); 5297 if (!bdev_io) { 5298 return -ENOMEM; 5299 } 5300 5301 bdev_io->internal.ch = channel; 5302 bdev_io->internal.desc = desc; 5303 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 5304 bdev_io->u.bdev.iovs = compare_iov; 5305 bdev_io->u.bdev.iovcnt = compare_iovcnt; 5306 bdev_io->u.bdev.fused_iovs = write_iov; 5307 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 5308 bdev_io->u.bdev.md_buf = NULL; 5309 bdev_io->u.bdev.num_blocks = num_blocks; 5310 bdev_io->u.bdev.offset_blocks = offset_blocks; 5311 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5312 bdev_io->u.bdev.ext_opts = NULL; 5313 5314 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 5315 bdev_io_submit(bdev_io); 5316 return 0; 5317 } 5318 5319 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 5320 bdev_comparev_and_writev_blocks_locked, bdev_io); 5321 } 5322 5323 int 5324 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5325 struct iovec *iov, int iovcnt, 5326 uint64_t offset_blocks, uint64_t num_blocks, 5327 bool populate, 5328 spdk_bdev_io_completion_cb cb, void *cb_arg) 5329 { 5330 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5331 struct spdk_bdev_io *bdev_io; 5332 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5333 5334 if (!desc->write) { 5335 return -EBADF; 5336 } 5337 5338 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5339 return -EINVAL; 5340 } 5341 5342 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 5343 return -ENOTSUP; 5344 } 5345 5346 bdev_io = bdev_channel_get_io(channel); 5347 if (!bdev_io) { 5348 return -ENOMEM; 5349 } 5350 5351 bdev_io->internal.ch = channel; 5352 bdev_io->internal.desc = desc; 5353 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 5354 bdev_io->u.bdev.num_blocks = num_blocks; 5355 bdev_io->u.bdev.offset_blocks = offset_blocks; 5356 bdev_io->u.bdev.iovs = iov; 5357 bdev_io->u.bdev.iovcnt = iovcnt; 5358 bdev_io->u.bdev.md_buf = NULL; 5359 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 5360 bdev_io->u.bdev.zcopy.commit = 0; 5361 bdev_io->u.bdev.zcopy.start = 1; 5362 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5363 bdev_io->u.bdev.ext_opts = NULL; 5364 5365 bdev_io_submit(bdev_io); 5366 5367 return 0; 5368 } 5369 5370 int 5371 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 5372 spdk_bdev_io_completion_cb cb, void *cb_arg) 5373 { 5374 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 5375 return -EINVAL; 5376 } 5377 5378 bdev_io->u.bdev.zcopy.commit = commit ? 1 : 0; 5379 bdev_io->u.bdev.zcopy.start = 0; 5380 bdev_io->internal.caller_ctx = cb_arg; 5381 bdev_io->internal.cb = cb; 5382 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 5383 5384 bdev_io_submit(bdev_io); 5385 5386 return 0; 5387 } 5388 5389 int 5390 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5391 uint64_t offset, uint64_t len, 5392 spdk_bdev_io_completion_cb cb, void *cb_arg) 5393 { 5394 uint64_t offset_blocks, num_blocks; 5395 5396 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5397 len, &num_blocks) != 0) { 5398 return -EINVAL; 5399 } 5400 5401 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5402 } 5403 5404 int 5405 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5406 uint64_t offset_blocks, uint64_t num_blocks, 5407 spdk_bdev_io_completion_cb cb, void *cb_arg) 5408 { 5409 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5410 struct spdk_bdev_io *bdev_io; 5411 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5412 5413 if (!desc->write) { 5414 return -EBADF; 5415 } 5416 5417 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5418 return -EINVAL; 5419 } 5420 5421 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 5422 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 5423 return -ENOTSUP; 5424 } 5425 5426 bdev_io = bdev_channel_get_io(channel); 5427 5428 if (!bdev_io) { 5429 return -ENOMEM; 5430 } 5431 5432 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 5433 bdev_io->internal.ch = channel; 5434 bdev_io->internal.desc = desc; 5435 bdev_io->u.bdev.offset_blocks = offset_blocks; 5436 bdev_io->u.bdev.num_blocks = num_blocks; 5437 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5438 bdev_io->u.bdev.ext_opts = NULL; 5439 5440 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 5441 bdev_io_submit(bdev_io); 5442 return 0; 5443 } 5444 5445 assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); 5446 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 5447 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 5448 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 5449 bdev_write_zero_buffer_next(bdev_io); 5450 5451 return 0; 5452 } 5453 5454 int 5455 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5456 uint64_t offset, uint64_t nbytes, 5457 spdk_bdev_io_completion_cb cb, void *cb_arg) 5458 { 5459 uint64_t offset_blocks, num_blocks; 5460 5461 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5462 nbytes, &num_blocks) != 0) { 5463 return -EINVAL; 5464 } 5465 5466 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5467 } 5468 5469 int 5470 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5471 uint64_t offset_blocks, uint64_t num_blocks, 5472 spdk_bdev_io_completion_cb cb, void *cb_arg) 5473 { 5474 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5475 struct spdk_bdev_io *bdev_io; 5476 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5477 5478 if (!desc->write) { 5479 return -EBADF; 5480 } 5481 5482 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5483 return -EINVAL; 5484 } 5485 5486 if (num_blocks == 0) { 5487 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 5488 return -EINVAL; 5489 } 5490 5491 bdev_io = bdev_channel_get_io(channel); 5492 if (!bdev_io) { 5493 return -ENOMEM; 5494 } 5495 5496 bdev_io->internal.ch = channel; 5497 bdev_io->internal.desc = desc; 5498 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 5499 5500 bdev_io->u.bdev.iovs = &bdev_io->iov; 5501 bdev_io->u.bdev.iovs[0].iov_base = NULL; 5502 bdev_io->u.bdev.iovs[0].iov_len = 0; 5503 bdev_io->u.bdev.iovcnt = 1; 5504 5505 bdev_io->u.bdev.offset_blocks = offset_blocks; 5506 bdev_io->u.bdev.num_blocks = num_blocks; 5507 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5508 bdev_io->u.bdev.ext_opts = NULL; 5509 5510 bdev_io_submit(bdev_io); 5511 return 0; 5512 } 5513 5514 int 5515 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5516 uint64_t offset, uint64_t length, 5517 spdk_bdev_io_completion_cb cb, void *cb_arg) 5518 { 5519 uint64_t offset_blocks, num_blocks; 5520 5521 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5522 length, &num_blocks) != 0) { 5523 return -EINVAL; 5524 } 5525 5526 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5527 } 5528 5529 int 5530 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5531 uint64_t offset_blocks, uint64_t num_blocks, 5532 spdk_bdev_io_completion_cb cb, void *cb_arg) 5533 { 5534 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5535 struct spdk_bdev_io *bdev_io; 5536 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5537 5538 if (!desc->write) { 5539 return -EBADF; 5540 } 5541 5542 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5543 return -EINVAL; 5544 } 5545 5546 bdev_io = bdev_channel_get_io(channel); 5547 if (!bdev_io) { 5548 return -ENOMEM; 5549 } 5550 5551 bdev_io->internal.ch = channel; 5552 bdev_io->internal.desc = desc; 5553 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 5554 bdev_io->u.bdev.iovs = NULL; 5555 bdev_io->u.bdev.iovcnt = 0; 5556 bdev_io->u.bdev.offset_blocks = offset_blocks; 5557 bdev_io->u.bdev.num_blocks = num_blocks; 5558 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5559 5560 bdev_io_submit(bdev_io); 5561 return 0; 5562 } 5563 5564 static int bdev_reset_poll_for_outstanding_io(void *ctx); 5565 5566 static void 5567 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 5568 { 5569 struct spdk_bdev_channel *ch = _ctx; 5570 struct spdk_bdev_io *bdev_io; 5571 5572 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5573 5574 if (status == -EBUSY) { 5575 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 5576 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 5577 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 5578 } else { 5579 /* If outstanding IOs are still present and reset_io_drain_timeout seconds passed, 5580 * start the reset. */ 5581 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5582 bdev_io_submit_reset(bdev_io); 5583 } 5584 } else { 5585 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5586 SPDK_DEBUGLOG(bdev, 5587 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 5588 ch->bdev->name); 5589 /* Mark the completion status as a SUCCESS and complete the reset. */ 5590 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 5591 } 5592 } 5593 5594 static void 5595 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5596 struct spdk_io_channel *io_ch, void *_ctx) 5597 { 5598 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 5599 int status = 0; 5600 5601 if (cur_ch->io_outstanding > 0) { 5602 /* If a channel has outstanding IO, set status to -EBUSY code. This will stop 5603 * further iteration over the rest of the channels and pass non-zero status 5604 * to the callback function. */ 5605 status = -EBUSY; 5606 } 5607 spdk_bdev_for_each_channel_continue(i, status); 5608 } 5609 5610 static int 5611 bdev_reset_poll_for_outstanding_io(void *ctx) 5612 { 5613 struct spdk_bdev_channel *ch = ctx; 5614 struct spdk_bdev_io *bdev_io; 5615 5616 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5617 5618 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 5619 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 5620 bdev_reset_check_outstanding_io_done); 5621 5622 return SPDK_POLLER_BUSY; 5623 } 5624 5625 static void 5626 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 5627 { 5628 struct spdk_bdev_channel *ch = _ctx; 5629 struct spdk_bdev_io *bdev_io; 5630 5631 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5632 5633 if (bdev->reset_io_drain_timeout == 0) { 5634 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5635 5636 bdev_io_submit_reset(bdev_io); 5637 return; 5638 } 5639 5640 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 5641 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 5642 5643 /* In case bdev->reset_io_drain_timeout is not equal to zero, 5644 * submit the reset to the underlying module only if outstanding I/O 5645 * remain after reset_io_drain_timeout seconds have passed. */ 5646 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 5647 bdev_reset_check_outstanding_io_done); 5648 } 5649 5650 static void 5651 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5652 struct spdk_io_channel *ch, void *_ctx) 5653 { 5654 struct spdk_bdev_channel *channel; 5655 struct spdk_bdev_mgmt_channel *mgmt_channel; 5656 struct spdk_bdev_shared_resource *shared_resource; 5657 bdev_io_tailq_t tmp_queued; 5658 5659 TAILQ_INIT(&tmp_queued); 5660 5661 channel = __io_ch_to_bdev_ch(ch); 5662 shared_resource = channel->shared_resource; 5663 mgmt_channel = shared_resource->mgmt_ch; 5664 5665 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 5666 5667 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 5668 /* The QoS object is always valid and readable while 5669 * the channel flag is set, so the lock here should not 5670 * be necessary. We're not in the fast path though, so 5671 * just take it anyway. */ 5672 spdk_spin_lock(&channel->bdev->internal.spinlock); 5673 if (channel->bdev->internal.qos->ch == channel) { 5674 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 5675 } 5676 spdk_spin_unlock(&channel->bdev->internal.spinlock); 5677 } 5678 5679 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 5680 bdev_abort_all_buf_io(mgmt_channel, channel); 5681 bdev_abort_all_buf_io(mgmt_channel, channel); 5682 bdev_abort_all_queued_io(&tmp_queued, channel); 5683 5684 spdk_bdev_for_each_channel_continue(i, 0); 5685 } 5686 5687 static void 5688 bdev_start_reset(void *ctx) 5689 { 5690 struct spdk_bdev_channel *ch = ctx; 5691 5692 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, 5693 bdev_reset_freeze_channel_done); 5694 } 5695 5696 static void 5697 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 5698 { 5699 struct spdk_bdev *bdev = ch->bdev; 5700 5701 assert(!TAILQ_EMPTY(&ch->queued_resets)); 5702 5703 spdk_spin_lock(&bdev->internal.spinlock); 5704 if (bdev->internal.reset_in_progress == NULL) { 5705 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 5706 /* 5707 * Take a channel reference for the target bdev for the life of this 5708 * reset. This guards against the channel getting destroyed while 5709 * spdk_bdev_for_each_channel() calls related to this reset IO are in 5710 * progress. We will release the reference when this reset is 5711 * completed. 5712 */ 5713 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 5714 bdev_start_reset(ch); 5715 } 5716 spdk_spin_unlock(&bdev->internal.spinlock); 5717 } 5718 5719 int 5720 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5721 spdk_bdev_io_completion_cb cb, void *cb_arg) 5722 { 5723 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5724 struct spdk_bdev_io *bdev_io; 5725 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5726 5727 bdev_io = bdev_channel_get_io(channel); 5728 if (!bdev_io) { 5729 return -ENOMEM; 5730 } 5731 5732 bdev_io->internal.ch = channel; 5733 bdev_io->internal.desc = desc; 5734 bdev_io->internal.submit_tsc = spdk_get_ticks(); 5735 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 5736 bdev_io->u.reset.ch_ref = NULL; 5737 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5738 5739 spdk_spin_lock(&bdev->internal.spinlock); 5740 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 5741 spdk_spin_unlock(&bdev->internal.spinlock); 5742 5743 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 5744 internal.ch_link); 5745 5746 bdev_channel_start_reset(channel); 5747 5748 return 0; 5749 } 5750 5751 void 5752 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 5753 struct spdk_bdev_io_stat *stat) 5754 { 5755 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5756 5757 bdev_get_io_stat(stat, channel->stat); 5758 } 5759 5760 static void 5761 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 5762 { 5763 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 5764 5765 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 5766 bdev_iostat_ctx->cb_arg, 0); 5767 free(bdev_iostat_ctx); 5768 } 5769 5770 static void 5771 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5772 struct spdk_io_channel *ch, void *_ctx) 5773 { 5774 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 5775 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5776 5777 bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 5778 spdk_bdev_for_each_channel_continue(i, 0); 5779 } 5780 5781 void 5782 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 5783 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 5784 { 5785 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 5786 5787 assert(bdev != NULL); 5788 assert(stat != NULL); 5789 assert(cb != NULL); 5790 5791 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 5792 if (bdev_iostat_ctx == NULL) { 5793 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 5794 cb(bdev, stat, cb_arg, -ENOMEM); 5795 return; 5796 } 5797 5798 bdev_iostat_ctx->stat = stat; 5799 bdev_iostat_ctx->cb = cb; 5800 bdev_iostat_ctx->cb_arg = cb_arg; 5801 5802 /* Start with the statistics from previously deleted channels. */ 5803 spdk_spin_lock(&bdev->internal.spinlock); 5804 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 5805 spdk_spin_unlock(&bdev->internal.spinlock); 5806 5807 /* Then iterate and add the statistics from each existing channel. */ 5808 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 5809 bdev_get_device_stat_done); 5810 } 5811 5812 struct bdev_iostat_reset_ctx { 5813 enum bdev_reset_stat_mode mode; 5814 bdev_reset_device_stat_cb cb; 5815 void *cb_arg; 5816 }; 5817 5818 static void 5819 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 5820 { 5821 struct bdev_iostat_reset_ctx *ctx = _ctx; 5822 5823 ctx->cb(bdev, ctx->cb_arg, 0); 5824 5825 free(ctx); 5826 } 5827 5828 static void 5829 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5830 struct spdk_io_channel *ch, void *_ctx) 5831 { 5832 struct bdev_iostat_reset_ctx *ctx = _ctx; 5833 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5834 5835 bdev_reset_io_stat(channel->stat, ctx->mode); 5836 5837 spdk_bdev_for_each_channel_continue(i, 0); 5838 } 5839 5840 void 5841 bdev_reset_device_stat(struct spdk_bdev *bdev, enum bdev_reset_stat_mode mode, 5842 bdev_reset_device_stat_cb cb, void *cb_arg) 5843 { 5844 struct bdev_iostat_reset_ctx *ctx; 5845 5846 assert(bdev != NULL); 5847 assert(cb != NULL); 5848 5849 ctx = calloc(1, sizeof(*ctx)); 5850 if (ctx == NULL) { 5851 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 5852 cb(bdev, cb_arg, -ENOMEM); 5853 return; 5854 } 5855 5856 ctx->mode = mode; 5857 ctx->cb = cb; 5858 ctx->cb_arg = cb_arg; 5859 5860 spdk_spin_lock(&bdev->internal.spinlock); 5861 bdev_reset_io_stat(bdev->internal.stat, mode); 5862 spdk_spin_unlock(&bdev->internal.spinlock); 5863 5864 spdk_bdev_for_each_channel(bdev, 5865 bdev_reset_each_channel_stat, 5866 ctx, 5867 bdev_reset_device_stat_done); 5868 } 5869 5870 int 5871 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5872 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 5873 spdk_bdev_io_completion_cb cb, void *cb_arg) 5874 { 5875 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5876 struct spdk_bdev_io *bdev_io; 5877 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5878 5879 if (!desc->write) { 5880 return -EBADF; 5881 } 5882 5883 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 5884 return -ENOTSUP; 5885 } 5886 5887 bdev_io = bdev_channel_get_io(channel); 5888 if (!bdev_io) { 5889 return -ENOMEM; 5890 } 5891 5892 bdev_io->internal.ch = channel; 5893 bdev_io->internal.desc = desc; 5894 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 5895 bdev_io->u.nvme_passthru.cmd = *cmd; 5896 bdev_io->u.nvme_passthru.buf = buf; 5897 bdev_io->u.nvme_passthru.nbytes = nbytes; 5898 bdev_io->u.nvme_passthru.md_buf = NULL; 5899 bdev_io->u.nvme_passthru.md_len = 0; 5900 5901 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5902 5903 bdev_io_submit(bdev_io); 5904 return 0; 5905 } 5906 5907 int 5908 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5909 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 5910 spdk_bdev_io_completion_cb cb, void *cb_arg) 5911 { 5912 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5913 struct spdk_bdev_io *bdev_io; 5914 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5915 5916 if (!desc->write) { 5917 /* 5918 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 5919 * to easily determine if the command is a read or write, but for now just 5920 * do not allow io_passthru with a read-only descriptor. 5921 */ 5922 return -EBADF; 5923 } 5924 5925 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 5926 return -ENOTSUP; 5927 } 5928 5929 bdev_io = bdev_channel_get_io(channel); 5930 if (!bdev_io) { 5931 return -ENOMEM; 5932 } 5933 5934 bdev_io->internal.ch = channel; 5935 bdev_io->internal.desc = desc; 5936 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 5937 bdev_io->u.nvme_passthru.cmd = *cmd; 5938 bdev_io->u.nvme_passthru.buf = buf; 5939 bdev_io->u.nvme_passthru.nbytes = nbytes; 5940 bdev_io->u.nvme_passthru.md_buf = NULL; 5941 bdev_io->u.nvme_passthru.md_len = 0; 5942 5943 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5944 5945 bdev_io_submit(bdev_io); 5946 return 0; 5947 } 5948 5949 int 5950 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5951 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 5952 spdk_bdev_io_completion_cb cb, void *cb_arg) 5953 { 5954 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5955 struct spdk_bdev_io *bdev_io; 5956 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5957 5958 if (!desc->write) { 5959 /* 5960 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 5961 * to easily determine if the command is a read or write, but for now just 5962 * do not allow io_passthru with a read-only descriptor. 5963 */ 5964 return -EBADF; 5965 } 5966 5967 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 5968 return -ENOTSUP; 5969 } 5970 5971 bdev_io = bdev_channel_get_io(channel); 5972 if (!bdev_io) { 5973 return -ENOMEM; 5974 } 5975 5976 bdev_io->internal.ch = channel; 5977 bdev_io->internal.desc = desc; 5978 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 5979 bdev_io->u.nvme_passthru.cmd = *cmd; 5980 bdev_io->u.nvme_passthru.buf = buf; 5981 bdev_io->u.nvme_passthru.nbytes = nbytes; 5982 bdev_io->u.nvme_passthru.md_buf = md_buf; 5983 bdev_io->u.nvme_passthru.md_len = md_len; 5984 5985 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5986 5987 bdev_io_submit(bdev_io); 5988 return 0; 5989 } 5990 5991 static void bdev_abort_retry(void *ctx); 5992 static void bdev_abort(struct spdk_bdev_io *parent_io); 5993 5994 static void 5995 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5996 { 5997 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 5998 struct spdk_bdev_io *parent_io = cb_arg; 5999 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6000 6001 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6002 6003 spdk_bdev_free_io(bdev_io); 6004 6005 if (!success) { 6006 /* Check if the target I/O completed in the meantime. */ 6007 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6008 if (tmp_io == bio_to_abort) { 6009 break; 6010 } 6011 } 6012 6013 /* If the target I/O still exists, set the parent to failed. */ 6014 if (tmp_io != NULL) { 6015 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6016 } 6017 } 6018 6019 parent_io->u.bdev.split_outstanding--; 6020 if (parent_io->u.bdev.split_outstanding == 0) { 6021 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6022 bdev_abort_retry(parent_io); 6023 } else { 6024 bdev_io_complete(parent_io); 6025 } 6026 } 6027 } 6028 6029 static int 6030 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6031 struct spdk_bdev_io *bio_to_abort, 6032 spdk_bdev_io_completion_cb cb, void *cb_arg) 6033 { 6034 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6035 struct spdk_bdev_io *bdev_io; 6036 6037 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 6038 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 6039 /* TODO: Abort reset or abort request. */ 6040 return -ENOTSUP; 6041 } 6042 6043 bdev_io = bdev_channel_get_io(channel); 6044 if (bdev_io == NULL) { 6045 return -ENOMEM; 6046 } 6047 6048 bdev_io->internal.ch = channel; 6049 bdev_io->internal.desc = desc; 6050 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6051 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6052 6053 if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bio_to_abort)) { 6054 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 6055 6056 /* Parent abort request is not submitted directly, but to manage its 6057 * execution add it to the submitted list here. 6058 */ 6059 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6060 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6061 6062 bdev_abort(bdev_io); 6063 6064 return 0; 6065 } 6066 6067 bdev_io->u.abort.bio_to_abort = bio_to_abort; 6068 6069 /* Submit the abort request to the underlying bdev module. */ 6070 bdev_io_submit(bdev_io); 6071 6072 return 0; 6073 } 6074 6075 static uint32_t 6076 _bdev_abort(struct spdk_bdev_io *parent_io) 6077 { 6078 struct spdk_bdev_desc *desc = parent_io->internal.desc; 6079 struct spdk_bdev_channel *channel = parent_io->internal.ch; 6080 void *bio_cb_arg; 6081 struct spdk_bdev_io *bio_to_abort; 6082 uint32_t matched_ios; 6083 int rc; 6084 6085 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 6086 6087 /* matched_ios is returned and will be kept by the caller. 6088 * 6089 * This function will be used for two cases, 1) the same cb_arg is used for 6090 * multiple I/Os, 2) a single large I/O is split into smaller ones. 6091 * Incrementing split_outstanding directly here may confuse readers especially 6092 * for the 1st case. 6093 * 6094 * Completion of I/O abort is processed after stack unwinding. Hence this trick 6095 * works as expected. 6096 */ 6097 matched_ios = 0; 6098 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6099 6100 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 6101 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 6102 continue; 6103 } 6104 6105 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 6106 /* Any I/O which was submitted after this abort command should be excluded. */ 6107 continue; 6108 } 6109 6110 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 6111 if (rc != 0) { 6112 if (rc == -ENOMEM) { 6113 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 6114 } else { 6115 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6116 } 6117 break; 6118 } 6119 matched_ios++; 6120 } 6121 6122 return matched_ios; 6123 } 6124 6125 static void 6126 bdev_abort_retry(void *ctx) 6127 { 6128 struct spdk_bdev_io *parent_io = ctx; 6129 uint32_t matched_ios; 6130 6131 matched_ios = _bdev_abort(parent_io); 6132 6133 if (matched_ios == 0) { 6134 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6135 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6136 } else { 6137 /* For retry, the case that no target I/O was found is success 6138 * because it means target I/Os completed in the meantime. 6139 */ 6140 bdev_io_complete(parent_io); 6141 } 6142 return; 6143 } 6144 6145 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6146 parent_io->u.bdev.split_outstanding = matched_ios; 6147 } 6148 6149 static void 6150 bdev_abort(struct spdk_bdev_io *parent_io) 6151 { 6152 uint32_t matched_ios; 6153 6154 matched_ios = _bdev_abort(parent_io); 6155 6156 if (matched_ios == 0) { 6157 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6158 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6159 } else { 6160 /* The case the no target I/O was found is failure. */ 6161 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6162 bdev_io_complete(parent_io); 6163 } 6164 return; 6165 } 6166 6167 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6168 parent_io->u.bdev.split_outstanding = matched_ios; 6169 } 6170 6171 int 6172 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6173 void *bio_cb_arg, 6174 spdk_bdev_io_completion_cb cb, void *cb_arg) 6175 { 6176 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6177 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6178 struct spdk_bdev_io *bdev_io; 6179 6180 if (bio_cb_arg == NULL) { 6181 return -EINVAL; 6182 } 6183 6184 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 6185 return -ENOTSUP; 6186 } 6187 6188 bdev_io = bdev_channel_get_io(channel); 6189 if (bdev_io == NULL) { 6190 return -ENOMEM; 6191 } 6192 6193 bdev_io->internal.ch = channel; 6194 bdev_io->internal.desc = desc; 6195 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6196 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6197 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6198 6199 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 6200 6201 /* Parent abort request is not submitted directly, but to manage its execution, 6202 * add it to the submitted list here. 6203 */ 6204 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6205 6206 bdev_abort(bdev_io); 6207 6208 return 0; 6209 } 6210 6211 int 6212 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6213 struct spdk_bdev_io_wait_entry *entry) 6214 { 6215 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6216 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 6217 6218 if (bdev != entry->bdev) { 6219 SPDK_ERRLOG("bdevs do not match\n"); 6220 return -EINVAL; 6221 } 6222 6223 if (mgmt_ch->per_thread_cache_count > 0) { 6224 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 6225 return -EINVAL; 6226 } 6227 6228 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 6229 return 0; 6230 } 6231 6232 static inline void 6233 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 6234 { 6235 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 6236 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 6237 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 6238 uint32_t blocklen = bdev_io->bdev->blocklen; 6239 6240 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 6241 switch (bdev_io->type) { 6242 case SPDK_BDEV_IO_TYPE_READ: 6243 io_stat->bytes_read += num_blocks * blocklen; 6244 io_stat->num_read_ops++; 6245 io_stat->read_latency_ticks += tsc_diff; 6246 if (io_stat->max_read_latency_ticks < tsc_diff) { 6247 io_stat->max_read_latency_ticks = tsc_diff; 6248 } 6249 if (io_stat->min_read_latency_ticks > tsc_diff) { 6250 io_stat->min_read_latency_ticks = tsc_diff; 6251 } 6252 break; 6253 case SPDK_BDEV_IO_TYPE_WRITE: 6254 io_stat->bytes_written += num_blocks * blocklen; 6255 io_stat->num_write_ops++; 6256 io_stat->write_latency_ticks += tsc_diff; 6257 if (io_stat->max_write_latency_ticks < tsc_diff) { 6258 io_stat->max_write_latency_ticks = tsc_diff; 6259 } 6260 if (io_stat->min_write_latency_ticks > tsc_diff) { 6261 io_stat->min_write_latency_ticks = tsc_diff; 6262 } 6263 break; 6264 case SPDK_BDEV_IO_TYPE_UNMAP: 6265 io_stat->bytes_unmapped += num_blocks * blocklen; 6266 io_stat->num_unmap_ops++; 6267 io_stat->unmap_latency_ticks += tsc_diff; 6268 if (io_stat->max_unmap_latency_ticks < tsc_diff) { 6269 io_stat->max_unmap_latency_ticks = tsc_diff; 6270 } 6271 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 6272 io_stat->min_unmap_latency_ticks = tsc_diff; 6273 } 6274 break; 6275 case SPDK_BDEV_IO_TYPE_ZCOPY: 6276 /* Track the data in the start phase only */ 6277 if (bdev_io->u.bdev.zcopy.start) { 6278 if (bdev_io->u.bdev.zcopy.populate) { 6279 io_stat->bytes_read += num_blocks * blocklen; 6280 io_stat->num_read_ops++; 6281 io_stat->read_latency_ticks += tsc_diff; 6282 if (io_stat->max_read_latency_ticks < tsc_diff) { 6283 io_stat->max_read_latency_ticks = tsc_diff; 6284 } 6285 if (io_stat->min_read_latency_ticks > tsc_diff) { 6286 io_stat->min_read_latency_ticks = tsc_diff; 6287 } 6288 } else { 6289 io_stat->bytes_written += num_blocks * blocklen; 6290 io_stat->num_write_ops++; 6291 io_stat->write_latency_ticks += tsc_diff; 6292 if (io_stat->max_write_latency_ticks < tsc_diff) { 6293 io_stat->max_write_latency_ticks = tsc_diff; 6294 } 6295 if (io_stat->min_write_latency_ticks > tsc_diff) { 6296 io_stat->min_write_latency_ticks = tsc_diff; 6297 } 6298 } 6299 } 6300 break; 6301 case SPDK_BDEV_IO_TYPE_COPY: 6302 io_stat->bytes_copied += num_blocks * blocklen; 6303 io_stat->num_copy_ops++; 6304 bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; 6305 if (io_stat->max_copy_latency_ticks < tsc_diff) { 6306 io_stat->max_copy_latency_ticks = tsc_diff; 6307 } 6308 if (io_stat->min_copy_latency_ticks > tsc_diff) { 6309 io_stat->min_copy_latency_ticks = tsc_diff; 6310 } 6311 break; 6312 default: 6313 break; 6314 } 6315 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 6316 io_stat = bdev_io->bdev->internal.stat; 6317 assert(io_stat->io_error != NULL); 6318 6319 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 6320 io_stat->io_error->error_status[-io_status - 1]++; 6321 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 6322 } 6323 6324 #ifdef SPDK_CONFIG_VTUNE 6325 uint64_t now_tsc = spdk_get_ticks(); 6326 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 6327 uint64_t data[5]; 6328 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 6329 6330 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 6331 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 6332 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 6333 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 6334 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 6335 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 6336 6337 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 6338 __itt_metadata_u64, 5, data); 6339 6340 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 6341 bdev_io->internal.ch->start_tsc = now_tsc; 6342 } 6343 #endif 6344 } 6345 6346 static inline void 6347 bdev_io_complete(void *ctx) 6348 { 6349 struct spdk_bdev_io *bdev_io = ctx; 6350 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6351 uint64_t tsc, tsc_diff; 6352 6353 if (spdk_unlikely(bdev_io->internal.in_submit_request)) { 6354 /* 6355 * Defer completion to avoid potential infinite recursion if the 6356 * user's completion callback issues a new I/O. 6357 */ 6358 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6359 bdev_io_complete, bdev_io); 6360 return; 6361 } 6362 6363 tsc = spdk_get_ticks(); 6364 tsc_diff = tsc - bdev_io->internal.submit_tsc; 6365 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 6366 bdev_io->internal.caller_ctx); 6367 6368 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 6369 6370 if (bdev_io->internal.ch->histogram) { 6371 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 6372 } 6373 6374 bdev_io_update_io_stat(bdev_io, tsc_diff); 6375 6376 assert(bdev_io->internal.cb != NULL); 6377 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 6378 6379 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 6380 bdev_io->internal.caller_ctx); 6381 } 6382 6383 static void bdev_destroy_cb(void *io_device); 6384 6385 static void 6386 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 6387 { 6388 struct spdk_bdev_io *bdev_io = _ctx; 6389 6390 if (bdev_io->u.reset.ch_ref != NULL) { 6391 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 6392 bdev_io->u.reset.ch_ref = NULL; 6393 } 6394 6395 bdev_io_complete(bdev_io); 6396 6397 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 6398 TAILQ_EMPTY(&bdev->internal.open_descs)) { 6399 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6400 } 6401 } 6402 6403 static void 6404 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6405 struct spdk_io_channel *_ch, void *_ctx) 6406 { 6407 struct spdk_bdev_io *bdev_io = _ctx; 6408 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 6409 struct spdk_bdev_io *queued_reset; 6410 6411 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 6412 while (!TAILQ_EMPTY(&ch->queued_resets)) { 6413 queued_reset = TAILQ_FIRST(&ch->queued_resets); 6414 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 6415 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 6416 } 6417 6418 spdk_bdev_for_each_channel_continue(i, 0); 6419 } 6420 6421 void 6422 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 6423 { 6424 struct spdk_bdev *bdev = bdev_io->bdev; 6425 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6426 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 6427 6428 bdev_io->internal.status = status; 6429 6430 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 6431 bool unlock_channels = false; 6432 6433 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 6434 SPDK_ERRLOG("NOMEM returned for reset\n"); 6435 } 6436 spdk_spin_lock(&bdev->internal.spinlock); 6437 if (bdev_io == bdev->internal.reset_in_progress) { 6438 bdev->internal.reset_in_progress = NULL; 6439 unlock_channels = true; 6440 } 6441 spdk_spin_unlock(&bdev->internal.spinlock); 6442 6443 if (unlock_channels) { 6444 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 6445 bdev_reset_complete); 6446 return; 6447 } 6448 } else { 6449 if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0)) { 6450 _bdev_io_push_bounce_data_buffer(bdev_io, _bdev_io_complete_push_bounce_done); 6451 /* bdev IO will be completed in the callback */ 6452 return; 6453 } 6454 6455 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 6456 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 6457 return; 6458 } 6459 } 6460 6461 bdev_io_complete(bdev_io); 6462 } 6463 6464 void 6465 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 6466 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 6467 { 6468 if (sc == SPDK_SCSI_STATUS_GOOD) { 6469 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6470 } else { 6471 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 6472 bdev_io->internal.error.scsi.sc = sc; 6473 bdev_io->internal.error.scsi.sk = sk; 6474 bdev_io->internal.error.scsi.asc = asc; 6475 bdev_io->internal.error.scsi.ascq = ascq; 6476 } 6477 6478 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6479 } 6480 6481 void 6482 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 6483 int *sc, int *sk, int *asc, int *ascq) 6484 { 6485 assert(sc != NULL); 6486 assert(sk != NULL); 6487 assert(asc != NULL); 6488 assert(ascq != NULL); 6489 6490 switch (bdev_io->internal.status) { 6491 case SPDK_BDEV_IO_STATUS_SUCCESS: 6492 *sc = SPDK_SCSI_STATUS_GOOD; 6493 *sk = SPDK_SCSI_SENSE_NO_SENSE; 6494 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 6495 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 6496 break; 6497 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 6498 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 6499 break; 6500 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 6501 *sc = bdev_io->internal.error.scsi.sc; 6502 *sk = bdev_io->internal.error.scsi.sk; 6503 *asc = bdev_io->internal.error.scsi.asc; 6504 *ascq = bdev_io->internal.error.scsi.ascq; 6505 break; 6506 default: 6507 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 6508 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 6509 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 6510 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 6511 break; 6512 } 6513 } 6514 6515 void 6516 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 6517 { 6518 if (aio_result == 0) { 6519 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6520 } else { 6521 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 6522 } 6523 6524 bdev_io->internal.error.aio_result = aio_result; 6525 6526 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6527 } 6528 6529 void 6530 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 6531 { 6532 assert(aio_result != NULL); 6533 6534 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 6535 *aio_result = bdev_io->internal.error.aio_result; 6536 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6537 *aio_result = 0; 6538 } else { 6539 *aio_result = -EIO; 6540 } 6541 } 6542 6543 void 6544 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 6545 { 6546 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 6547 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6548 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 6549 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 6550 } else { 6551 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 6552 } 6553 6554 bdev_io->internal.error.nvme.cdw0 = cdw0; 6555 bdev_io->internal.error.nvme.sct = sct; 6556 bdev_io->internal.error.nvme.sc = sc; 6557 6558 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6559 } 6560 6561 void 6562 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 6563 { 6564 assert(sct != NULL); 6565 assert(sc != NULL); 6566 assert(cdw0 != NULL); 6567 6568 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 6569 *sct = SPDK_NVME_SCT_GENERIC; 6570 *sc = SPDK_NVME_SC_SUCCESS; 6571 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6572 *cdw0 = 0; 6573 } else { 6574 *cdw0 = 1U; 6575 } 6576 return; 6577 } 6578 6579 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 6580 *sct = bdev_io->internal.error.nvme.sct; 6581 *sc = bdev_io->internal.error.nvme.sc; 6582 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6583 *sct = SPDK_NVME_SCT_GENERIC; 6584 *sc = SPDK_NVME_SC_SUCCESS; 6585 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 6586 *sct = SPDK_NVME_SCT_GENERIC; 6587 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6588 } else { 6589 *sct = SPDK_NVME_SCT_GENERIC; 6590 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6591 } 6592 6593 *cdw0 = bdev_io->internal.error.nvme.cdw0; 6594 } 6595 6596 void 6597 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 6598 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 6599 { 6600 assert(first_sct != NULL); 6601 assert(first_sc != NULL); 6602 assert(second_sct != NULL); 6603 assert(second_sc != NULL); 6604 assert(cdw0 != NULL); 6605 6606 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 6607 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 6608 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 6609 *first_sct = bdev_io->internal.error.nvme.sct; 6610 *first_sc = bdev_io->internal.error.nvme.sc; 6611 *second_sct = SPDK_NVME_SCT_GENERIC; 6612 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6613 } else { 6614 *first_sct = SPDK_NVME_SCT_GENERIC; 6615 *first_sc = SPDK_NVME_SC_SUCCESS; 6616 *second_sct = bdev_io->internal.error.nvme.sct; 6617 *second_sc = bdev_io->internal.error.nvme.sc; 6618 } 6619 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 6620 *first_sct = SPDK_NVME_SCT_GENERIC; 6621 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6622 *second_sct = SPDK_NVME_SCT_GENERIC; 6623 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6624 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6625 *first_sct = SPDK_NVME_SCT_GENERIC; 6626 *first_sc = SPDK_NVME_SC_SUCCESS; 6627 *second_sct = SPDK_NVME_SCT_GENERIC; 6628 *second_sc = SPDK_NVME_SC_SUCCESS; 6629 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 6630 *first_sct = SPDK_NVME_SCT_GENERIC; 6631 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6632 *second_sct = SPDK_NVME_SCT_GENERIC; 6633 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6634 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 6635 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 6636 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 6637 *second_sct = SPDK_NVME_SCT_GENERIC; 6638 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6639 } else { 6640 *first_sct = SPDK_NVME_SCT_GENERIC; 6641 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6642 *second_sct = SPDK_NVME_SCT_GENERIC; 6643 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6644 } 6645 6646 *cdw0 = bdev_io->internal.error.nvme.cdw0; 6647 } 6648 6649 struct spdk_thread * 6650 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 6651 { 6652 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 6653 } 6654 6655 struct spdk_io_channel * 6656 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 6657 { 6658 return bdev_io->internal.ch->channel; 6659 } 6660 6661 static int 6662 bdev_register(struct spdk_bdev *bdev) 6663 { 6664 char *bdev_name; 6665 char uuid[SPDK_UUID_STRING_LEN]; 6666 int ret; 6667 6668 assert(bdev->module != NULL); 6669 6670 if (!bdev->name) { 6671 SPDK_ERRLOG("Bdev name is NULL\n"); 6672 return -EINVAL; 6673 } 6674 6675 if (!strlen(bdev->name)) { 6676 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 6677 return -EINVAL; 6678 } 6679 6680 /* Users often register their own I/O devices using the bdev name. In 6681 * order to avoid conflicts, prepend bdev_. */ 6682 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 6683 if (!bdev_name) { 6684 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 6685 return -ENOMEM; 6686 } 6687 6688 bdev->internal.stat = bdev_alloc_io_stat(true); 6689 if (!bdev->internal.stat) { 6690 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 6691 free(bdev_name); 6692 return -ENOMEM; 6693 } 6694 6695 bdev->internal.status = SPDK_BDEV_STATUS_READY; 6696 bdev->internal.measured_queue_depth = UINT64_MAX; 6697 bdev->internal.claim_module = NULL; 6698 bdev->internal.qd_poller = NULL; 6699 bdev->internal.qos = NULL; 6700 6701 TAILQ_INIT(&bdev->internal.open_descs); 6702 TAILQ_INIT(&bdev->internal.locked_ranges); 6703 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 6704 TAILQ_INIT(&bdev->aliases); 6705 6706 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 6707 if (ret != 0) { 6708 bdev_free_io_stat(bdev->internal.stat); 6709 free(bdev_name); 6710 return ret; 6711 } 6712 6713 /* UUID has to be specified by the user or defined by bdev itself. 6714 * Otherwise this field must remain empty, to indicate that this 6715 * value cannot be depended upon. */ 6716 if (!spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { 6717 /* Add the UUID alias only if it's different than the name */ 6718 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 6719 if (strcmp(bdev->name, uuid) != 0) { 6720 ret = spdk_bdev_alias_add(bdev, uuid); 6721 if (ret != 0) { 6722 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 6723 bdev_name_del(&bdev->internal.bdev_name); 6724 bdev_free_io_stat(bdev->internal.stat); 6725 free(bdev_name); 6726 return ret; 6727 } 6728 } 6729 } 6730 6731 if (spdk_bdev_get_buf_align(bdev) > 1) { 6732 if (bdev->split_on_optimal_io_boundary) { 6733 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 6734 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 6735 } else { 6736 bdev->split_on_optimal_io_boundary = true; 6737 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 6738 } 6739 } 6740 6741 /* If the user didn't specify a write unit size, set it to one. */ 6742 if (bdev->write_unit_size == 0) { 6743 bdev->write_unit_size = 1; 6744 } 6745 6746 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 6747 if (bdev->acwu == 0) { 6748 bdev->acwu = bdev->write_unit_size; 6749 } 6750 6751 if (bdev->phys_blocklen == 0) { 6752 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 6753 } 6754 6755 bdev->internal.reset_in_progress = NULL; 6756 bdev->internal.qd_poll_in_progress = false; 6757 bdev->internal.period = 0; 6758 bdev->internal.new_period = 0; 6759 6760 spdk_io_device_register(__bdev_to_io_dev(bdev), 6761 bdev_channel_create, bdev_channel_destroy, 6762 sizeof(struct spdk_bdev_channel), 6763 bdev_name); 6764 6765 free(bdev_name); 6766 6767 spdk_spin_init(&bdev->internal.spinlock); 6768 6769 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 6770 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 6771 6772 return 0; 6773 } 6774 6775 static void 6776 bdev_destroy_cb(void *io_device) 6777 { 6778 int rc; 6779 struct spdk_bdev *bdev; 6780 spdk_bdev_unregister_cb cb_fn; 6781 void *cb_arg; 6782 6783 bdev = __bdev_from_io_dev(io_device); 6784 cb_fn = bdev->internal.unregister_cb; 6785 cb_arg = bdev->internal.unregister_ctx; 6786 6787 spdk_spin_destroy(&bdev->internal.spinlock); 6788 free(bdev->internal.qos); 6789 bdev_free_io_stat(bdev->internal.stat); 6790 6791 rc = bdev->fn_table->destruct(bdev->ctxt); 6792 if (rc < 0) { 6793 SPDK_ERRLOG("destruct failed\n"); 6794 } 6795 if (rc <= 0 && cb_fn != NULL) { 6796 cb_fn(cb_arg, rc); 6797 } 6798 } 6799 6800 void 6801 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 6802 { 6803 if (bdev->internal.unregister_cb != NULL) { 6804 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 6805 } 6806 } 6807 6808 static void 6809 _remove_notify(void *arg) 6810 { 6811 struct spdk_bdev_desc *desc = arg; 6812 6813 spdk_spin_lock(&desc->spinlock); 6814 desc->refs--; 6815 6816 if (!desc->closed) { 6817 spdk_spin_unlock(&desc->spinlock); 6818 desc->callback.event_fn(SPDK_BDEV_EVENT_REMOVE, desc->bdev, desc->callback.ctx); 6819 return; 6820 } else if (0 == desc->refs) { 6821 /* This descriptor was closed after this remove_notify message was sent. 6822 * spdk_bdev_close() could not free the descriptor since this message was 6823 * in flight, so we free it now using bdev_desc_free(). 6824 */ 6825 spdk_spin_unlock(&desc->spinlock); 6826 bdev_desc_free(desc); 6827 return; 6828 } 6829 spdk_spin_unlock(&desc->spinlock); 6830 } 6831 6832 /* returns: 0 - bdev removed and ready to be destructed. 6833 * -EBUSY - bdev can't be destructed yet. */ 6834 static int 6835 bdev_unregister_unsafe(struct spdk_bdev *bdev) 6836 { 6837 struct spdk_bdev_desc *desc, *tmp; 6838 int rc = 0; 6839 char uuid[SPDK_UUID_STRING_LEN]; 6840 6841 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 6842 assert(spdk_spin_held(&bdev->internal.spinlock)); 6843 6844 /* Notify each descriptor about hotremoval */ 6845 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 6846 rc = -EBUSY; 6847 spdk_spin_lock(&desc->spinlock); 6848 /* 6849 * Defer invocation of the event_cb to a separate message that will 6850 * run later on its thread. This ensures this context unwinds and 6851 * we don't recursively unregister this bdev again if the event_cb 6852 * immediately closes its descriptor. 6853 */ 6854 desc->refs++; 6855 spdk_thread_send_msg(desc->thread, _remove_notify, desc); 6856 spdk_spin_unlock(&desc->spinlock); 6857 } 6858 6859 /* If there are no descriptors, proceed removing the bdev */ 6860 if (rc == 0) { 6861 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 6862 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 6863 6864 /* Delete the name and the UUID alias */ 6865 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 6866 bdev_name_del_unsafe(&bdev->internal.bdev_name); 6867 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 6868 6869 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 6870 6871 if (bdev->internal.reset_in_progress != NULL) { 6872 /* If reset is in progress, let the completion callback for reset 6873 * unregister the bdev. 6874 */ 6875 rc = -EBUSY; 6876 } 6877 } 6878 6879 return rc; 6880 } 6881 6882 static void 6883 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6884 struct spdk_io_channel *io_ch, void *_ctx) 6885 { 6886 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 6887 6888 bdev_channel_abort_queued_ios(bdev_ch); 6889 spdk_bdev_for_each_channel_continue(i, 0); 6890 } 6891 6892 static void 6893 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 6894 { 6895 int rc; 6896 6897 spdk_spin_lock(&g_bdev_mgr.spinlock); 6898 spdk_spin_lock(&bdev->internal.spinlock); 6899 /* 6900 * Set the status to REMOVING after completing to abort channels. Otherwise, 6901 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 6902 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 6903 * may fail. 6904 */ 6905 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 6906 rc = bdev_unregister_unsafe(bdev); 6907 spdk_spin_unlock(&bdev->internal.spinlock); 6908 spdk_spin_unlock(&g_bdev_mgr.spinlock); 6909 6910 if (rc == 0) { 6911 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6912 } 6913 } 6914 6915 void 6916 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 6917 { 6918 struct spdk_thread *thread; 6919 6920 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 6921 6922 thread = spdk_get_thread(); 6923 if (!thread) { 6924 /* The user called this from a non-SPDK thread. */ 6925 if (cb_fn != NULL) { 6926 cb_fn(cb_arg, -ENOTSUP); 6927 } 6928 return; 6929 } 6930 6931 spdk_spin_lock(&g_bdev_mgr.spinlock); 6932 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 6933 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 6934 spdk_spin_unlock(&g_bdev_mgr.spinlock); 6935 if (cb_fn) { 6936 cb_fn(cb_arg, -EBUSY); 6937 } 6938 return; 6939 } 6940 6941 spdk_spin_lock(&bdev->internal.spinlock); 6942 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 6943 bdev->internal.unregister_cb = cb_fn; 6944 bdev->internal.unregister_ctx = cb_arg; 6945 spdk_spin_unlock(&bdev->internal.spinlock); 6946 spdk_spin_unlock(&g_bdev_mgr.spinlock); 6947 6948 spdk_bdev_set_qd_sampling_period(bdev, 0); 6949 6950 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 6951 bdev_unregister); 6952 } 6953 6954 int 6955 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 6956 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 6957 { 6958 struct spdk_bdev_desc *desc; 6959 struct spdk_bdev *bdev; 6960 int rc; 6961 6962 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 6963 if (rc != 0) { 6964 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 6965 return rc; 6966 } 6967 6968 bdev = spdk_bdev_desc_get_bdev(desc); 6969 6970 if (bdev->module != module) { 6971 spdk_bdev_close(desc); 6972 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 6973 bdev_name); 6974 return -ENODEV; 6975 } 6976 6977 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 6978 6979 spdk_bdev_close(desc); 6980 6981 return 0; 6982 } 6983 6984 static int 6985 bdev_start_qos(struct spdk_bdev *bdev) 6986 { 6987 struct set_qos_limit_ctx *ctx; 6988 6989 /* Enable QoS */ 6990 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 6991 ctx = calloc(1, sizeof(*ctx)); 6992 if (ctx == NULL) { 6993 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 6994 return -ENOMEM; 6995 } 6996 ctx->bdev = bdev; 6997 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 6998 } 6999 7000 return 0; 7001 } 7002 7003 static int 7004 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 7005 { 7006 struct spdk_thread *thread; 7007 int rc = 0; 7008 7009 thread = spdk_get_thread(); 7010 if (!thread) { 7011 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 7012 return -ENOTSUP; 7013 } 7014 7015 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7016 spdk_get_thread()); 7017 7018 desc->bdev = bdev; 7019 desc->thread = thread; 7020 desc->write = write; 7021 7022 spdk_spin_lock(&bdev->internal.spinlock); 7023 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7024 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7025 spdk_spin_unlock(&bdev->internal.spinlock); 7026 return -ENODEV; 7027 } 7028 7029 if (write && bdev->internal.claim_module) { 7030 SPDK_ERRLOG("Could not open %s - %s module already claimed it\n", 7031 bdev->name, bdev->internal.claim_module->name); 7032 spdk_spin_unlock(&bdev->internal.spinlock); 7033 return -EPERM; 7034 } 7035 7036 rc = bdev_start_qos(bdev); 7037 if (rc != 0) { 7038 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 7039 spdk_spin_unlock(&bdev->internal.spinlock); 7040 return rc; 7041 } 7042 7043 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 7044 7045 spdk_spin_unlock(&bdev->internal.spinlock); 7046 7047 return 0; 7048 } 7049 7050 static int 7051 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 7052 struct spdk_bdev_desc **_desc) 7053 { 7054 struct spdk_bdev_desc *desc; 7055 unsigned int event_id; 7056 7057 desc = calloc(1, sizeof(*desc)); 7058 if (desc == NULL) { 7059 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 7060 return -ENOMEM; 7061 } 7062 7063 TAILQ_INIT(&desc->pending_media_events); 7064 TAILQ_INIT(&desc->free_media_events); 7065 7066 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 7067 desc->callback.event_fn = event_cb; 7068 desc->callback.ctx = event_ctx; 7069 spdk_spin_init(&desc->spinlock); 7070 7071 if (bdev->media_events) { 7072 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 7073 sizeof(*desc->media_events_buffer)); 7074 if (desc->media_events_buffer == NULL) { 7075 SPDK_ERRLOG("Failed to initialize media event pool\n"); 7076 bdev_desc_free(desc); 7077 return -ENOMEM; 7078 } 7079 7080 for (event_id = 0; event_id < MEDIA_EVENT_POOL_SIZE; ++event_id) { 7081 TAILQ_INSERT_TAIL(&desc->free_media_events, 7082 &desc->media_events_buffer[event_id], tailq); 7083 } 7084 } 7085 7086 *_desc = desc; 7087 7088 return 0; 7089 } 7090 7091 int 7092 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7093 void *event_ctx, struct spdk_bdev_desc **_desc) 7094 { 7095 struct spdk_bdev_desc *desc; 7096 struct spdk_bdev *bdev; 7097 int rc; 7098 7099 if (event_cb == NULL) { 7100 SPDK_ERRLOG("Missing event callback function\n"); 7101 return -EINVAL; 7102 } 7103 7104 spdk_spin_lock(&g_bdev_mgr.spinlock); 7105 7106 bdev = bdev_get_by_name(bdev_name); 7107 7108 if (bdev == NULL) { 7109 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 7110 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7111 return -ENODEV; 7112 } 7113 7114 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 7115 if (rc != 0) { 7116 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7117 return rc; 7118 } 7119 7120 rc = bdev_open(bdev, write, desc); 7121 if (rc != 0) { 7122 bdev_desc_free(desc); 7123 desc = NULL; 7124 } 7125 7126 *_desc = desc; 7127 7128 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7129 7130 return rc; 7131 } 7132 7133 static void 7134 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 7135 { 7136 int rc; 7137 7138 spdk_spin_lock(&bdev->internal.spinlock); 7139 spdk_spin_lock(&desc->spinlock); 7140 7141 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 7142 7143 desc->closed = true; 7144 7145 if (0 == desc->refs) { 7146 spdk_spin_unlock(&desc->spinlock); 7147 bdev_desc_free(desc); 7148 } else { 7149 spdk_spin_unlock(&desc->spinlock); 7150 } 7151 7152 /* If no more descriptors, kill QoS channel */ 7153 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7154 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 7155 bdev->name, spdk_get_thread()); 7156 7157 if (bdev_qos_destroy(bdev)) { 7158 /* There isn't anything we can do to recover here. Just let the 7159 * old QoS poller keep running. The QoS handling won't change 7160 * cores when the user allocates a new channel, but it won't break. */ 7161 SPDK_ERRLOG("Unable to shut down QoS poller. It will continue running on the current thread.\n"); 7162 } 7163 } 7164 7165 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7166 rc = bdev_unregister_unsafe(bdev); 7167 spdk_spin_unlock(&bdev->internal.spinlock); 7168 7169 if (rc == 0) { 7170 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7171 } 7172 } else { 7173 spdk_spin_unlock(&bdev->internal.spinlock); 7174 } 7175 } 7176 7177 void 7178 spdk_bdev_close(struct spdk_bdev_desc *desc) 7179 { 7180 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7181 7182 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7183 spdk_get_thread()); 7184 7185 assert(desc->thread == spdk_get_thread()); 7186 7187 spdk_poller_unregister(&desc->io_timeout_poller); 7188 7189 spdk_spin_lock(&g_bdev_mgr.spinlock); 7190 7191 bdev_close(bdev, desc); 7192 7193 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7194 } 7195 7196 static void 7197 bdev_register_finished(void *arg) 7198 { 7199 struct spdk_bdev_desc *desc = arg; 7200 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7201 7202 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 7203 7204 spdk_spin_lock(&g_bdev_mgr.spinlock); 7205 7206 bdev_close(bdev, desc); 7207 7208 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7209 } 7210 7211 int 7212 spdk_bdev_register(struct spdk_bdev *bdev) 7213 { 7214 struct spdk_bdev_desc *desc; 7215 int rc; 7216 7217 rc = bdev_register(bdev); 7218 if (rc != 0) { 7219 return rc; 7220 } 7221 7222 /* A descriptor is opened to prevent bdev deletion during examination */ 7223 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7224 if (rc != 0) { 7225 spdk_bdev_unregister(bdev, NULL, NULL); 7226 return rc; 7227 } 7228 7229 rc = bdev_open(bdev, false, desc); 7230 if (rc != 0) { 7231 bdev_desc_free(desc); 7232 spdk_bdev_unregister(bdev, NULL, NULL); 7233 return rc; 7234 } 7235 7236 /* Examine configuration before initializing I/O */ 7237 bdev_examine(bdev); 7238 7239 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 7240 if (rc != 0) { 7241 bdev_close(bdev, desc); 7242 spdk_bdev_unregister(bdev, NULL, NULL); 7243 } 7244 7245 return rc; 7246 } 7247 7248 int 7249 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 7250 struct spdk_bdev_module *module) 7251 { 7252 if (bdev->internal.claim_module != NULL) { 7253 SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, 7254 bdev->internal.claim_module->name); 7255 return -EPERM; 7256 } 7257 7258 if (desc && !desc->write) { 7259 desc->write = true; 7260 } 7261 7262 bdev->internal.claim_module = module; 7263 return 0; 7264 } 7265 7266 void 7267 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 7268 { 7269 assert(bdev->internal.claim_module != NULL); 7270 bdev->internal.claim_module = NULL; 7271 } 7272 7273 struct spdk_bdev * 7274 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 7275 { 7276 assert(desc != NULL); 7277 return desc->bdev; 7278 } 7279 7280 int 7281 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 7282 { 7283 struct spdk_bdev *bdev, *tmp; 7284 struct spdk_bdev_desc *desc; 7285 int rc = 0; 7286 7287 assert(fn != NULL); 7288 7289 spdk_spin_lock(&g_bdev_mgr.spinlock); 7290 bdev = spdk_bdev_first(); 7291 while (bdev != NULL) { 7292 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7293 if (rc != 0) { 7294 break; 7295 } 7296 rc = bdev_open(bdev, false, desc); 7297 if (rc != 0) { 7298 bdev_desc_free(desc); 7299 if (rc == -ENODEV) { 7300 /* Ignore the error and move to the next bdev. */ 7301 rc = 0; 7302 bdev = spdk_bdev_next(bdev); 7303 continue; 7304 } 7305 break; 7306 } 7307 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7308 7309 rc = fn(ctx, bdev); 7310 7311 spdk_spin_lock(&g_bdev_mgr.spinlock); 7312 tmp = spdk_bdev_next(bdev); 7313 bdev_close(bdev, desc); 7314 if (rc != 0) { 7315 break; 7316 } 7317 bdev = tmp; 7318 } 7319 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7320 7321 return rc; 7322 } 7323 7324 int 7325 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 7326 { 7327 struct spdk_bdev *bdev, *tmp; 7328 struct spdk_bdev_desc *desc; 7329 int rc = 0; 7330 7331 assert(fn != NULL); 7332 7333 spdk_spin_lock(&g_bdev_mgr.spinlock); 7334 bdev = spdk_bdev_first_leaf(); 7335 while (bdev != NULL) { 7336 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7337 if (rc != 0) { 7338 break; 7339 } 7340 rc = bdev_open(bdev, false, desc); 7341 if (rc != 0) { 7342 bdev_desc_free(desc); 7343 if (rc == -ENODEV) { 7344 /* Ignore the error and move to the next bdev. */ 7345 rc = 0; 7346 bdev = spdk_bdev_next_leaf(bdev); 7347 continue; 7348 } 7349 break; 7350 } 7351 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7352 7353 rc = fn(ctx, bdev); 7354 7355 spdk_spin_lock(&g_bdev_mgr.spinlock); 7356 tmp = spdk_bdev_next_leaf(bdev); 7357 bdev_close(bdev, desc); 7358 if (rc != 0) { 7359 break; 7360 } 7361 bdev = tmp; 7362 } 7363 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7364 7365 return rc; 7366 } 7367 7368 void 7369 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 7370 { 7371 struct iovec *iovs; 7372 int iovcnt; 7373 7374 if (bdev_io == NULL) { 7375 return; 7376 } 7377 7378 switch (bdev_io->type) { 7379 case SPDK_BDEV_IO_TYPE_READ: 7380 case SPDK_BDEV_IO_TYPE_WRITE: 7381 case SPDK_BDEV_IO_TYPE_ZCOPY: 7382 iovs = bdev_io->u.bdev.iovs; 7383 iovcnt = bdev_io->u.bdev.iovcnt; 7384 break; 7385 default: 7386 iovs = NULL; 7387 iovcnt = 0; 7388 break; 7389 } 7390 7391 if (iovp) { 7392 *iovp = iovs; 7393 } 7394 if (iovcntp) { 7395 *iovcntp = iovcnt; 7396 } 7397 } 7398 7399 void * 7400 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 7401 { 7402 if (bdev_io == NULL) { 7403 return NULL; 7404 } 7405 7406 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 7407 return NULL; 7408 } 7409 7410 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 7411 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 7412 return bdev_io->u.bdev.md_buf; 7413 } 7414 7415 return NULL; 7416 } 7417 7418 void * 7419 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 7420 { 7421 if (bdev_io == NULL) { 7422 assert(false); 7423 return NULL; 7424 } 7425 7426 return bdev_io->internal.caller_ctx; 7427 } 7428 7429 void 7430 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 7431 { 7432 7433 if (spdk_bdev_module_list_find(bdev_module->name)) { 7434 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 7435 assert(false); 7436 } 7437 7438 /* 7439 * Modules with examine callbacks must be initialized first, so they are 7440 * ready to handle examine callbacks from later modules that will 7441 * register physical bdevs. 7442 */ 7443 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 7444 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 7445 } else { 7446 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 7447 } 7448 } 7449 7450 struct spdk_bdev_module * 7451 spdk_bdev_module_list_find(const char *name) 7452 { 7453 struct spdk_bdev_module *bdev_module; 7454 7455 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 7456 if (strcmp(name, bdev_module->name) == 0) { 7457 break; 7458 } 7459 } 7460 7461 return bdev_module; 7462 } 7463 7464 static void 7465 bdev_write_zero_buffer_next(void *_bdev_io) 7466 { 7467 struct spdk_bdev_io *bdev_io = _bdev_io; 7468 uint64_t num_bytes, num_blocks; 7469 void *md_buf = NULL; 7470 int rc; 7471 7472 num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * 7473 bdev_io->u.bdev.split_remaining_num_blocks, 7474 ZERO_BUFFER_SIZE); 7475 num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); 7476 num_blocks -= num_blocks % bdev_io->bdev->write_unit_size; 7477 7478 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 7479 md_buf = (char *)g_bdev_mgr.zero_buffer + 7480 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 7481 } 7482 7483 rc = bdev_write_blocks_with_md(bdev_io->internal.desc, 7484 spdk_io_channel_from_ctx(bdev_io->internal.ch), 7485 g_bdev_mgr.zero_buffer, md_buf, 7486 bdev_io->u.bdev.split_current_offset_blocks, num_blocks, 7487 bdev_write_zero_buffer_done, bdev_io); 7488 if (rc == 0) { 7489 bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; 7490 bdev_io->u.bdev.split_current_offset_blocks += num_blocks; 7491 } else if (rc == -ENOMEM) { 7492 bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); 7493 } else { 7494 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7495 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 7496 } 7497 } 7498 7499 static void 7500 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 7501 { 7502 struct spdk_bdev_io *parent_io = cb_arg; 7503 7504 spdk_bdev_free_io(bdev_io); 7505 7506 if (!success) { 7507 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7508 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 7509 return; 7510 } 7511 7512 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 7513 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 7514 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 7515 return; 7516 } 7517 7518 bdev_write_zero_buffer_next(parent_io); 7519 } 7520 7521 static void 7522 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 7523 { 7524 spdk_spin_lock(&ctx->bdev->internal.spinlock); 7525 ctx->bdev->internal.qos_mod_in_progress = false; 7526 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 7527 7528 if (ctx->cb_fn) { 7529 ctx->cb_fn(ctx->cb_arg, status); 7530 } 7531 free(ctx); 7532 } 7533 7534 static void 7535 bdev_disable_qos_done(void *cb_arg) 7536 { 7537 struct set_qos_limit_ctx *ctx = cb_arg; 7538 struct spdk_bdev *bdev = ctx->bdev; 7539 struct spdk_bdev_io *bdev_io; 7540 struct spdk_bdev_qos *qos; 7541 7542 spdk_spin_lock(&bdev->internal.spinlock); 7543 qos = bdev->internal.qos; 7544 bdev->internal.qos = NULL; 7545 spdk_spin_unlock(&bdev->internal.spinlock); 7546 7547 while (!TAILQ_EMPTY(&qos->queued)) { 7548 /* Send queued I/O back to their original thread for resubmission. */ 7549 bdev_io = TAILQ_FIRST(&qos->queued); 7550 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 7551 7552 if (bdev_io->internal.io_submit_ch) { 7553 /* 7554 * Channel was changed when sending it to the QoS thread - change it back 7555 * before sending it back to the original thread. 7556 */ 7557 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 7558 bdev_io->internal.io_submit_ch = NULL; 7559 } 7560 7561 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7562 _bdev_io_submit, bdev_io); 7563 } 7564 7565 if (qos->thread != NULL) { 7566 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 7567 spdk_poller_unregister(&qos->poller); 7568 } 7569 7570 free(qos); 7571 7572 bdev_set_qos_limit_done(ctx, 0); 7573 } 7574 7575 static void 7576 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 7577 { 7578 struct set_qos_limit_ctx *ctx = _ctx; 7579 struct spdk_thread *thread; 7580 7581 spdk_spin_lock(&bdev->internal.spinlock); 7582 thread = bdev->internal.qos->thread; 7583 spdk_spin_unlock(&bdev->internal.spinlock); 7584 7585 if (thread != NULL) { 7586 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 7587 } else { 7588 bdev_disable_qos_done(ctx); 7589 } 7590 } 7591 7592 static void 7593 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7594 struct spdk_io_channel *ch, void *_ctx) 7595 { 7596 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 7597 7598 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 7599 7600 spdk_bdev_for_each_channel_continue(i, 0); 7601 } 7602 7603 static void 7604 bdev_update_qos_rate_limit_msg(void *cb_arg) 7605 { 7606 struct set_qos_limit_ctx *ctx = cb_arg; 7607 struct spdk_bdev *bdev = ctx->bdev; 7608 7609 spdk_spin_lock(&bdev->internal.spinlock); 7610 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 7611 spdk_spin_unlock(&bdev->internal.spinlock); 7612 7613 bdev_set_qos_limit_done(ctx, 0); 7614 } 7615 7616 static void 7617 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7618 struct spdk_io_channel *ch, void *_ctx) 7619 { 7620 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 7621 7622 spdk_spin_lock(&bdev->internal.spinlock); 7623 bdev_enable_qos(bdev, bdev_ch); 7624 spdk_spin_unlock(&bdev->internal.spinlock); 7625 spdk_bdev_for_each_channel_continue(i, 0); 7626 } 7627 7628 static void 7629 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 7630 { 7631 struct set_qos_limit_ctx *ctx = _ctx; 7632 7633 bdev_set_qos_limit_done(ctx, status); 7634 } 7635 7636 static void 7637 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 7638 { 7639 int i; 7640 7641 assert(bdev->internal.qos != NULL); 7642 7643 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 7644 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 7645 bdev->internal.qos->rate_limits[i].limit = limits[i]; 7646 7647 if (limits[i] == 0) { 7648 bdev->internal.qos->rate_limits[i].limit = 7649 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 7650 } 7651 } 7652 } 7653 } 7654 7655 void 7656 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 7657 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 7658 { 7659 struct set_qos_limit_ctx *ctx; 7660 uint32_t limit_set_complement; 7661 uint64_t min_limit_per_sec; 7662 int i; 7663 bool disable_rate_limit = true; 7664 7665 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 7666 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 7667 continue; 7668 } 7669 7670 if (limits[i] > 0) { 7671 disable_rate_limit = false; 7672 } 7673 7674 if (bdev_qos_is_iops_rate_limit(i) == true) { 7675 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 7676 } else { 7677 /* Change from megabyte to byte rate limit */ 7678 limits[i] = limits[i] * 1024 * 1024; 7679 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 7680 } 7681 7682 limit_set_complement = limits[i] % min_limit_per_sec; 7683 if (limit_set_complement) { 7684 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 7685 limits[i], min_limit_per_sec); 7686 limits[i] += min_limit_per_sec - limit_set_complement; 7687 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 7688 } 7689 } 7690 7691 ctx = calloc(1, sizeof(*ctx)); 7692 if (ctx == NULL) { 7693 cb_fn(cb_arg, -ENOMEM); 7694 return; 7695 } 7696 7697 ctx->cb_fn = cb_fn; 7698 ctx->cb_arg = cb_arg; 7699 ctx->bdev = bdev; 7700 7701 spdk_spin_lock(&bdev->internal.spinlock); 7702 if (bdev->internal.qos_mod_in_progress) { 7703 spdk_spin_unlock(&bdev->internal.spinlock); 7704 free(ctx); 7705 cb_fn(cb_arg, -EAGAIN); 7706 return; 7707 } 7708 bdev->internal.qos_mod_in_progress = true; 7709 7710 if (disable_rate_limit == true && bdev->internal.qos) { 7711 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 7712 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 7713 (bdev->internal.qos->rate_limits[i].limit > 0 && 7714 bdev->internal.qos->rate_limits[i].limit != 7715 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 7716 disable_rate_limit = false; 7717 break; 7718 } 7719 } 7720 } 7721 7722 if (disable_rate_limit == false) { 7723 if (bdev->internal.qos == NULL) { 7724 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 7725 if (!bdev->internal.qos) { 7726 spdk_spin_unlock(&bdev->internal.spinlock); 7727 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 7728 bdev_set_qos_limit_done(ctx, -ENOMEM); 7729 return; 7730 } 7731 } 7732 7733 if (bdev->internal.qos->thread == NULL) { 7734 /* Enabling */ 7735 bdev_set_qos_rate_limits(bdev, limits); 7736 7737 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 7738 bdev_enable_qos_done); 7739 } else { 7740 /* Updating */ 7741 bdev_set_qos_rate_limits(bdev, limits); 7742 7743 spdk_thread_send_msg(bdev->internal.qos->thread, 7744 bdev_update_qos_rate_limit_msg, ctx); 7745 } 7746 } else { 7747 if (bdev->internal.qos != NULL) { 7748 bdev_set_qos_rate_limits(bdev, limits); 7749 7750 /* Disabling */ 7751 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 7752 bdev_disable_qos_msg_done); 7753 } else { 7754 spdk_spin_unlock(&bdev->internal.spinlock); 7755 bdev_set_qos_limit_done(ctx, 0); 7756 return; 7757 } 7758 } 7759 7760 spdk_spin_unlock(&bdev->internal.spinlock); 7761 } 7762 7763 struct spdk_bdev_histogram_ctx { 7764 spdk_bdev_histogram_status_cb cb_fn; 7765 void *cb_arg; 7766 struct spdk_bdev *bdev; 7767 int status; 7768 }; 7769 7770 static void 7771 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 7772 { 7773 struct spdk_bdev_histogram_ctx *ctx = _ctx; 7774 7775 spdk_spin_lock(&ctx->bdev->internal.spinlock); 7776 ctx->bdev->internal.histogram_in_progress = false; 7777 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 7778 ctx->cb_fn(ctx->cb_arg, ctx->status); 7779 free(ctx); 7780 } 7781 7782 static void 7783 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7784 struct spdk_io_channel *_ch, void *_ctx) 7785 { 7786 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7787 7788 if (ch->histogram != NULL) { 7789 spdk_histogram_data_free(ch->histogram); 7790 ch->histogram = NULL; 7791 } 7792 spdk_bdev_for_each_channel_continue(i, 0); 7793 } 7794 7795 static void 7796 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 7797 { 7798 struct spdk_bdev_histogram_ctx *ctx = _ctx; 7799 7800 if (status != 0) { 7801 ctx->status = status; 7802 ctx->bdev->internal.histogram_enabled = false; 7803 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 7804 bdev_histogram_disable_channel_cb); 7805 } else { 7806 spdk_spin_lock(&ctx->bdev->internal.spinlock); 7807 ctx->bdev->internal.histogram_in_progress = false; 7808 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 7809 ctx->cb_fn(ctx->cb_arg, ctx->status); 7810 free(ctx); 7811 } 7812 } 7813 7814 static void 7815 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7816 struct spdk_io_channel *_ch, void *_ctx) 7817 { 7818 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7819 int status = 0; 7820 7821 if (ch->histogram == NULL) { 7822 ch->histogram = spdk_histogram_data_alloc(); 7823 if (ch->histogram == NULL) { 7824 status = -ENOMEM; 7825 } 7826 } 7827 7828 spdk_bdev_for_each_channel_continue(i, status); 7829 } 7830 7831 void 7832 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 7833 void *cb_arg, bool enable) 7834 { 7835 struct spdk_bdev_histogram_ctx *ctx; 7836 7837 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 7838 if (ctx == NULL) { 7839 cb_fn(cb_arg, -ENOMEM); 7840 return; 7841 } 7842 7843 ctx->bdev = bdev; 7844 ctx->status = 0; 7845 ctx->cb_fn = cb_fn; 7846 ctx->cb_arg = cb_arg; 7847 7848 spdk_spin_lock(&bdev->internal.spinlock); 7849 if (bdev->internal.histogram_in_progress) { 7850 spdk_spin_unlock(&bdev->internal.spinlock); 7851 free(ctx); 7852 cb_fn(cb_arg, -EAGAIN); 7853 return; 7854 } 7855 7856 bdev->internal.histogram_in_progress = true; 7857 spdk_spin_unlock(&bdev->internal.spinlock); 7858 7859 bdev->internal.histogram_enabled = enable; 7860 7861 if (enable) { 7862 /* Allocate histogram for each channel */ 7863 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 7864 bdev_histogram_enable_channel_cb); 7865 } else { 7866 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 7867 bdev_histogram_disable_channel_cb); 7868 } 7869 } 7870 7871 struct spdk_bdev_histogram_data_ctx { 7872 spdk_bdev_histogram_data_cb cb_fn; 7873 void *cb_arg; 7874 struct spdk_bdev *bdev; 7875 /** merged histogram data from all channels */ 7876 struct spdk_histogram_data *histogram; 7877 }; 7878 7879 static void 7880 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 7881 { 7882 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 7883 7884 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 7885 free(ctx); 7886 } 7887 7888 static void 7889 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7890 struct spdk_io_channel *_ch, void *_ctx) 7891 { 7892 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7893 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 7894 int status = 0; 7895 7896 if (ch->histogram == NULL) { 7897 status = -EFAULT; 7898 } else { 7899 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 7900 } 7901 7902 spdk_bdev_for_each_channel_continue(i, status); 7903 } 7904 7905 void 7906 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 7907 spdk_bdev_histogram_data_cb cb_fn, 7908 void *cb_arg) 7909 { 7910 struct spdk_bdev_histogram_data_ctx *ctx; 7911 7912 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 7913 if (ctx == NULL) { 7914 cb_fn(cb_arg, -ENOMEM, NULL); 7915 return; 7916 } 7917 7918 ctx->bdev = bdev; 7919 ctx->cb_fn = cb_fn; 7920 ctx->cb_arg = cb_arg; 7921 7922 ctx->histogram = histogram; 7923 7924 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 7925 bdev_histogram_get_channel_cb); 7926 } 7927 7928 void 7929 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 7930 void *cb_arg) 7931 { 7932 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 7933 int status = 0; 7934 7935 assert(cb_fn != NULL); 7936 7937 if (bdev_ch->histogram == NULL) { 7938 status = -EFAULT; 7939 } 7940 cb_fn(cb_arg, status, bdev_ch->histogram); 7941 } 7942 7943 size_t 7944 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 7945 size_t max_events) 7946 { 7947 struct media_event_entry *entry; 7948 size_t num_events = 0; 7949 7950 for (; num_events < max_events; ++num_events) { 7951 entry = TAILQ_FIRST(&desc->pending_media_events); 7952 if (entry == NULL) { 7953 break; 7954 } 7955 7956 events[num_events] = entry->event; 7957 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 7958 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 7959 } 7960 7961 return num_events; 7962 } 7963 7964 int 7965 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 7966 size_t num_events) 7967 { 7968 struct spdk_bdev_desc *desc; 7969 struct media_event_entry *entry; 7970 size_t event_id; 7971 int rc = 0; 7972 7973 assert(bdev->media_events); 7974 7975 spdk_spin_lock(&bdev->internal.spinlock); 7976 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 7977 if (desc->write) { 7978 break; 7979 } 7980 } 7981 7982 if (desc == NULL || desc->media_events_buffer == NULL) { 7983 rc = -ENODEV; 7984 goto out; 7985 } 7986 7987 for (event_id = 0; event_id < num_events; ++event_id) { 7988 entry = TAILQ_FIRST(&desc->free_media_events); 7989 if (entry == NULL) { 7990 break; 7991 } 7992 7993 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 7994 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 7995 entry->event = events[event_id]; 7996 } 7997 7998 rc = event_id; 7999 out: 8000 spdk_spin_unlock(&bdev->internal.spinlock); 8001 return rc; 8002 } 8003 8004 void 8005 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 8006 { 8007 struct spdk_bdev_desc *desc; 8008 8009 spdk_spin_lock(&bdev->internal.spinlock); 8010 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 8011 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 8012 desc->callback.event_fn(SPDK_BDEV_EVENT_MEDIA_MANAGEMENT, bdev, 8013 desc->callback.ctx); 8014 } 8015 } 8016 spdk_spin_unlock(&bdev->internal.spinlock); 8017 } 8018 8019 struct locked_lba_range_ctx { 8020 struct lba_range range; 8021 struct spdk_bdev *bdev; 8022 struct lba_range *current_range; 8023 struct lba_range *owner_range; 8024 struct spdk_poller *poller; 8025 lock_range_cb cb_fn; 8026 void *cb_arg; 8027 }; 8028 8029 static void 8030 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8031 { 8032 struct locked_lba_range_ctx *ctx = _ctx; 8033 8034 ctx->cb_fn(ctx->cb_arg, -ENOMEM); 8035 free(ctx); 8036 } 8037 8038 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 8039 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 8040 8041 static void 8042 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8043 { 8044 struct locked_lba_range_ctx *ctx = _ctx; 8045 8046 if (status == -ENOMEM) { 8047 /* One of the channels could not allocate a range object. 8048 * So we have to go back and clean up any ranges that were 8049 * allocated successfully before we return error status to 8050 * the caller. We can reuse the unlock function to do that 8051 * clean up. 8052 */ 8053 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 8054 bdev_lock_error_cleanup_cb); 8055 return; 8056 } 8057 8058 /* All channels have locked this range and no I/O overlapping the range 8059 * are outstanding! Set the owner_ch for the range object for the 8060 * locking channel, so that this channel will know that it is allowed 8061 * to write to this range. 8062 */ 8063 ctx->owner_range->owner_ch = ctx->range.owner_ch; 8064 ctx->cb_fn(ctx->cb_arg, status); 8065 8066 /* Don't free the ctx here. Its range is in the bdev's global list of 8067 * locked ranges still, and will be removed and freed when this range 8068 * is later unlocked. 8069 */ 8070 } 8071 8072 static int 8073 bdev_lock_lba_range_check_io(void *_i) 8074 { 8075 struct spdk_bdev_channel_iter *i = _i; 8076 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 8077 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8078 struct locked_lba_range_ctx *ctx = i->ctx; 8079 struct lba_range *range = ctx->current_range; 8080 struct spdk_bdev_io *bdev_io; 8081 8082 spdk_poller_unregister(&ctx->poller); 8083 8084 /* The range is now in the locked_ranges, so no new IO can be submitted to this 8085 * range. But we need to wait until any outstanding IO overlapping with this range 8086 * are completed. 8087 */ 8088 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 8089 if (bdev_io_range_is_locked(bdev_io, range)) { 8090 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 8091 return SPDK_POLLER_BUSY; 8092 } 8093 } 8094 8095 spdk_bdev_for_each_channel_continue(i, 0); 8096 return SPDK_POLLER_BUSY; 8097 } 8098 8099 static void 8100 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8101 struct spdk_io_channel *_ch, void *_ctx) 8102 { 8103 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8104 struct locked_lba_range_ctx *ctx = _ctx; 8105 struct lba_range *range; 8106 8107 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 8108 if (range->length == ctx->range.length && 8109 range->offset == ctx->range.offset && 8110 range->locked_ctx == ctx->range.locked_ctx) { 8111 /* This range already exists on this channel, so don't add 8112 * it again. This can happen when a new channel is created 8113 * while the for_each_channel operation is in progress. 8114 * Do not check for outstanding I/O in that case, since the 8115 * range was locked before any I/O could be submitted to the 8116 * new channel. 8117 */ 8118 spdk_bdev_for_each_channel_continue(i, 0); 8119 return; 8120 } 8121 } 8122 8123 range = calloc(1, sizeof(*range)); 8124 if (range == NULL) { 8125 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 8126 return; 8127 } 8128 8129 range->length = ctx->range.length; 8130 range->offset = ctx->range.offset; 8131 range->locked_ctx = ctx->range.locked_ctx; 8132 ctx->current_range = range; 8133 if (ctx->range.owner_ch == ch) { 8134 /* This is the range object for the channel that will hold 8135 * the lock. Store it in the ctx object so that we can easily 8136 * set its owner_ch after the lock is finally acquired. 8137 */ 8138 ctx->owner_range = range; 8139 } 8140 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 8141 bdev_lock_lba_range_check_io(i); 8142 } 8143 8144 static void 8145 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 8146 { 8147 assert(spdk_get_thread() == spdk_io_channel_get_thread(ctx->range.owner_ch->channel)); 8148 8149 /* We will add a copy of this range to each channel now. */ 8150 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 8151 bdev_lock_lba_range_cb); 8152 } 8153 8154 static bool 8155 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 8156 { 8157 struct lba_range *r; 8158 8159 TAILQ_FOREACH(r, tailq, tailq) { 8160 if (bdev_lba_range_overlapped(range, r)) { 8161 return true; 8162 } 8163 } 8164 return false; 8165 } 8166 8167 static int 8168 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 8169 uint64_t offset, uint64_t length, 8170 lock_range_cb cb_fn, void *cb_arg) 8171 { 8172 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8173 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8174 struct locked_lba_range_ctx *ctx; 8175 8176 if (cb_arg == NULL) { 8177 SPDK_ERRLOG("cb_arg must not be NULL\n"); 8178 return -EINVAL; 8179 } 8180 8181 ctx = calloc(1, sizeof(*ctx)); 8182 if (ctx == NULL) { 8183 return -ENOMEM; 8184 } 8185 8186 ctx->range.offset = offset; 8187 ctx->range.length = length; 8188 ctx->range.owner_ch = ch; 8189 ctx->range.locked_ctx = cb_arg; 8190 ctx->bdev = bdev; 8191 ctx->cb_fn = cb_fn; 8192 ctx->cb_arg = cb_arg; 8193 8194 spdk_spin_lock(&bdev->internal.spinlock); 8195 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 8196 /* There is an active lock overlapping with this range. 8197 * Put it on the pending list until this range no 8198 * longer overlaps with another. 8199 */ 8200 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 8201 } else { 8202 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 8203 bdev_lock_lba_range_ctx(bdev, ctx); 8204 } 8205 spdk_spin_unlock(&bdev->internal.spinlock); 8206 return 0; 8207 } 8208 8209 static void 8210 bdev_lock_lba_range_ctx_msg(void *_ctx) 8211 { 8212 struct locked_lba_range_ctx *ctx = _ctx; 8213 8214 bdev_lock_lba_range_ctx(ctx->bdev, ctx); 8215 } 8216 8217 static void 8218 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8219 { 8220 struct locked_lba_range_ctx *ctx = _ctx; 8221 struct locked_lba_range_ctx *pending_ctx; 8222 struct lba_range *range, *tmp; 8223 8224 spdk_spin_lock(&bdev->internal.spinlock); 8225 /* Check if there are any pending locked ranges that overlap with this range 8226 * that was just unlocked. If there are, check that it doesn't overlap with any 8227 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 8228 * the lock process. 8229 */ 8230 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 8231 if (bdev_lba_range_overlapped(range, &ctx->range) && 8232 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 8233 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 8234 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 8235 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 8236 spdk_thread_send_msg(spdk_io_channel_get_thread(pending_ctx->range.owner_ch->channel), 8237 bdev_lock_lba_range_ctx_msg, pending_ctx); 8238 } 8239 } 8240 spdk_spin_unlock(&bdev->internal.spinlock); 8241 8242 ctx->cb_fn(ctx->cb_arg, status); 8243 free(ctx); 8244 } 8245 8246 static void 8247 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8248 struct spdk_io_channel *_ch, void *_ctx) 8249 { 8250 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8251 struct locked_lba_range_ctx *ctx = _ctx; 8252 TAILQ_HEAD(, spdk_bdev_io) io_locked; 8253 struct spdk_bdev_io *bdev_io; 8254 struct lba_range *range; 8255 8256 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 8257 if (ctx->range.offset == range->offset && 8258 ctx->range.length == range->length && 8259 ctx->range.locked_ctx == range->locked_ctx) { 8260 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 8261 free(range); 8262 break; 8263 } 8264 } 8265 8266 /* Note: we should almost always be able to assert that the range specified 8267 * was found. But there are some very rare corner cases where a new channel 8268 * gets created simultaneously with a range unlock, where this function 8269 * would execute on that new channel and wouldn't have the range. 8270 * We also use this to clean up range allocations when a later allocation 8271 * fails in the locking path. 8272 * So we can't actually assert() here. 8273 */ 8274 8275 /* Swap the locked IO into a temporary list, and then try to submit them again. 8276 * We could hyper-optimize this to only resubmit locked I/O that overlap 8277 * with the range that was just unlocked, but this isn't a performance path so 8278 * we go for simplicity here. 8279 */ 8280 TAILQ_INIT(&io_locked); 8281 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 8282 while (!TAILQ_EMPTY(&io_locked)) { 8283 bdev_io = TAILQ_FIRST(&io_locked); 8284 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 8285 bdev_io_submit(bdev_io); 8286 } 8287 8288 spdk_bdev_for_each_channel_continue(i, 0); 8289 } 8290 8291 static int 8292 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 8293 uint64_t offset, uint64_t length, 8294 lock_range_cb cb_fn, void *cb_arg) 8295 { 8296 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8297 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8298 struct locked_lba_range_ctx *ctx; 8299 struct lba_range *range; 8300 bool range_found = false; 8301 8302 /* Let's make sure the specified channel actually has a lock on 8303 * the specified range. Note that the range must match exactly. 8304 */ 8305 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 8306 if (range->offset == offset && range->length == length && 8307 range->owner_ch == ch && range->locked_ctx == cb_arg) { 8308 range_found = true; 8309 break; 8310 } 8311 } 8312 8313 if (!range_found) { 8314 return -EINVAL; 8315 } 8316 8317 spdk_spin_lock(&bdev->internal.spinlock); 8318 /* We confirmed that this channel has locked the specified range. To 8319 * start the unlock the process, we find the range in the bdev's locked_ranges 8320 * and remove it. This ensures new channels don't inherit the locked range. 8321 * Then we will send a message to each channel (including the one specified 8322 * here) to remove the range from its per-channel list. 8323 */ 8324 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 8325 if (range->offset == offset && range->length == length && 8326 range->locked_ctx == cb_arg) { 8327 break; 8328 } 8329 } 8330 if (range == NULL) { 8331 assert(false); 8332 spdk_spin_unlock(&bdev->internal.spinlock); 8333 return -EINVAL; 8334 } 8335 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 8336 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 8337 spdk_spin_unlock(&bdev->internal.spinlock); 8338 8339 ctx->cb_fn = cb_fn; 8340 ctx->cb_arg = cb_arg; 8341 8342 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 8343 bdev_unlock_lba_range_cb); 8344 return 0; 8345 } 8346 8347 int 8348 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 8349 int array_size) 8350 { 8351 if (!bdev) { 8352 return -EINVAL; 8353 } 8354 8355 if (bdev->fn_table->get_memory_domains) { 8356 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 8357 } 8358 8359 return 0; 8360 } 8361 8362 struct spdk_bdev_for_each_io_ctx { 8363 void *ctx; 8364 spdk_bdev_io_fn fn; 8365 spdk_bdev_for_each_io_cb cb; 8366 }; 8367 8368 static void 8369 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8370 struct spdk_io_channel *io_ch, void *_ctx) 8371 { 8372 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 8373 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 8374 struct spdk_bdev_io *bdev_io; 8375 int rc = 0; 8376 8377 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 8378 rc = ctx->fn(ctx->ctx, bdev_io); 8379 if (rc != 0) { 8380 break; 8381 } 8382 } 8383 8384 spdk_bdev_for_each_channel_continue(i, rc); 8385 } 8386 8387 static void 8388 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 8389 { 8390 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 8391 8392 ctx->cb(ctx->ctx, status); 8393 8394 free(ctx); 8395 } 8396 8397 void 8398 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 8399 spdk_bdev_for_each_io_cb cb) 8400 { 8401 struct spdk_bdev_for_each_io_ctx *ctx; 8402 8403 assert(fn != NULL && cb != NULL); 8404 8405 ctx = calloc(1, sizeof(*ctx)); 8406 if (ctx == NULL) { 8407 SPDK_ERRLOG("Failed to allocate context.\n"); 8408 cb(_ctx, -ENOMEM); 8409 return; 8410 } 8411 8412 ctx->ctx = _ctx; 8413 ctx->fn = fn; 8414 ctx->cb = cb; 8415 8416 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 8417 bdev_for_each_io_done); 8418 } 8419 8420 void 8421 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 8422 { 8423 spdk_for_each_channel_continue(iter->i, status); 8424 } 8425 8426 static struct spdk_bdev * 8427 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 8428 { 8429 void *io_device = spdk_io_channel_iter_get_io_device(i); 8430 8431 return __bdev_from_io_dev(io_device); 8432 } 8433 8434 static void 8435 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 8436 { 8437 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 8438 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 8439 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 8440 8441 iter->i = i; 8442 iter->fn(iter, bdev, ch, iter->ctx); 8443 } 8444 8445 static void 8446 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 8447 { 8448 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 8449 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 8450 8451 iter->i = i; 8452 iter->cpl(bdev, iter->ctx, status); 8453 8454 free(iter); 8455 } 8456 8457 void 8458 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 8459 void *ctx, spdk_bdev_for_each_channel_done cpl) 8460 { 8461 struct spdk_bdev_channel_iter *iter; 8462 8463 assert(bdev != NULL && fn != NULL && ctx != NULL); 8464 8465 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 8466 if (iter == NULL) { 8467 SPDK_ERRLOG("Unable to allocate iterator\n"); 8468 assert(false); 8469 return; 8470 } 8471 8472 iter->fn = fn; 8473 iter->cpl = cpl; 8474 iter->ctx = ctx; 8475 8476 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 8477 iter, bdev_each_channel_cpl); 8478 } 8479 8480 int 8481 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 8482 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 8483 spdk_bdev_io_completion_cb cb, void *cb_arg) 8484 { 8485 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8486 struct spdk_bdev_io *bdev_io; 8487 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 8488 8489 if (!desc->write) { 8490 return -EBADF; 8491 } 8492 8493 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY))) { 8494 SPDK_DEBUGLOG(bdev, "Copy IO type is not supported\n"); 8495 return -ENOTSUP; 8496 } 8497 8498 if (num_blocks == 0) { 8499 SPDK_ERRLOG("Can't copy 0 blocks\n"); 8500 return -EINVAL; 8501 } 8502 8503 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 8504 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 8505 SPDK_DEBUGLOG(bdev, 8506 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 8507 dst_offset_blocks, src_offset_blocks, num_blocks); 8508 return -EINVAL; 8509 } 8510 8511 bdev_io = bdev_channel_get_io(channel); 8512 if (!bdev_io) { 8513 return -ENOMEM; 8514 } 8515 8516 bdev_io->internal.ch = channel; 8517 bdev_io->internal.desc = desc; 8518 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 8519 8520 bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 8521 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 8522 bdev_io->u.bdev.num_blocks = num_blocks; 8523 bdev_io->u.bdev.ext_opts = NULL; 8524 bdev_io_init(bdev_io, bdev, cb_arg, cb); 8525 8526 bdev_io_submit(bdev_io); 8527 return 0; 8528 } 8529 8530 SPDK_LOG_REGISTER_COMPONENT(bdev) 8531 8532 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 8533 { 8534 struct spdk_trace_tpoint_opts opts[] = { 8535 { 8536 "BDEV_IO_START", TRACE_BDEV_IO_START, 8537 OWNER_BDEV, OBJECT_BDEV_IO, 1, 8538 { 8539 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 8540 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 8541 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 8542 { "len", SPDK_TRACE_ARG_TYPE_INT, 8 }, 8543 { "name", SPDK_TRACE_ARG_TYPE_STR, 40} 8544 } 8545 }, 8546 { 8547 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 8548 OWNER_BDEV, OBJECT_BDEV_IO, 0, 8549 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 8550 }, 8551 { 8552 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 8553 OWNER_BDEV, OBJECT_NONE, 1, 8554 { 8555 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 8556 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 8557 } 8558 }, 8559 { 8560 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 8561 OWNER_BDEV, OBJECT_NONE, 0, 8562 { 8563 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 8564 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 8565 } 8566 }, 8567 }; 8568 8569 8570 spdk_trace_register_owner(OWNER_BDEV, 'b'); 8571 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 8572 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 8573 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 8574 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 8575 } 8576