/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"
#include "spdk/dma.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define SPDK_BDEV_AUTO_EXAMINE			true
#define BUF_SMALL_POOL_SIZE			8191
#define BUF_LARGE_POOL_SIZE			1023
#define BUF_SMALL_CACHE_SIZE			128
#define BUF_LARGE_CACHE_SIZE			16
#define NOMEM_THRESHOLD_COUNT			8

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000
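/*
 * The pool and cache defaults above are only starting values; an application can
 * override them at runtime through the spdk_bdev_get_opts()/spdk_bdev_set_opts()
 * pair defined later in this file. A minimal usage sketch (illustrative only, the
 * chosen sizes are arbitrary and error handling is elided):
 *
 *	struct spdk_bdev_opts opts = {};
 *
 *	spdk_bdev_get_opts(&opts, sizeof(opts));
 *	opts.bdev_io_pool_size = 128 * 1024 - 1;
 *	opts.bdev_io_cache_size = 512;
 *	if (spdk_bdev_set_opts(&opts) != 0) {
 *		SPDK_ERRLOG("bdev_set_opts failed\n");
 *	}
 *
 * The opts_size argument lets older callers keep working when new fields are added;
 * see the SET_FIELD pattern in spdk_bdev_get_opts()/spdk_bdev_set_opts() below.
 */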
/* The maximum number of children requests for a UNMAP or WRITE ZEROES command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000

/* The maximum number of children requests for a COPY command
 * when splitting into children requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8)

SPDK_LOG_DEPRECATION_REGISTER(bdev_register_examine_thread,
			      "bdev register and examine on non-app thread", "SPDK 23.05", 0);

SPDK_LOG_DEPRECATION_REGISTER(vtune_support, "Intel(R) VTune integration", "SPDK 23.05", 0);

static const char *qos_rpc_type[] = {"rw_ios_per_sec",
	"rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
};

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

RB_HEAD(bdev_name_tree, spdk_bdev_name);

static int
bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
{
	return strcmp(name1->name, name2->name);
}

RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	void *zero_buffer;

	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

	struct spdk_bdev_list bdevs;
	struct bdev_name_tree bdev_names;

	bool init_complete;
	bool module_init_complete;

	struct spdk_spinlock spinlock;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain *domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
	.init_complete = false,
	.module_init_complete = false,
};

static void
__attribute__((constructor))
_bdev_init(void)
{
	spdk_spin_init(&g_bdev_mgr.spinlock);
}

typedef void (*lock_range_cb)(void *ctx, int status);

typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);

struct lba_range {
	uint64_t			offset;
	uint64_t			length;
	void				*locked_ctx;
	struct spdk_bdev_channel	*owner_ch;
	TAILQ_ENTRY(lba_range)		tailq;
};

static struct spdk_bdev_opts g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
	.small_buf_pool_size = BUF_SMALL_POOL_SIZE,
	.large_buf_pool_size = BUF_LARGE_POOL_SIZE,
};

static spdk_bdev_init_cb	g_init_cb_fn = NULL;
static void			*g_init_cb_arg = NULL;

static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
static void			*g_fini_cb_arg = NULL;
static struct spdk_thread	*g_fini_thread = NULL;

struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
	 * For remaining bytes, allowed to run negative if an I/O is submitted when
	 * some bytes are remaining, but the I/O is bigger than that amount. The
	 * excess will be deducted from the next timeslice.
	 */
	int64_t remaining_this_timeslice;

	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO. */
	bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to update for the submitted IO. */
	void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};
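/*
 * How these fields fit together (illustrative sketch; the authoritative logic lives
 * in the QoS poller code later in this file): a per-second limit is scaled down to a
 * per-timeslice budget and clamped to the minimum, roughly
 *
 *	max_per_timeslice = spdk_max(limit * SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC,
 *				     min_per_timeslice);
 *
 * Each submitted I/O decrements remaining_this_timeslice via update_quota(); once it
 * reaches zero (or goes negative for byte limits), queue_io() reports that further
 * I/O must be queued until the next timeslice refills the budget.
 */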

struct spdk_bdev_qos {
	/** Rate limits, one per limit type. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Queue of I/O waiting to be issued. */
	bdev_io_tailq_t queued;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache. Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t	per_thread_cache_count;
	uint32_t	bdev_io_cache_size;

	struct spdk_iobuf_channel iobuf;

	TAILQ_HEAD(, spdk_bdev_shared_resource)	shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry)	io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * queue their I/O awaiting retry here, which makes it possible to retry sending
 * I/O to one bdev after I/O from another bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t		io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t		nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
	 */
	uint64_t		nomem_threshold;

	/* I/O channel allocated by a bdev module */
	struct spdk_io_channel	*shared_ch;

	/* Refcount of bdev channels using this resource */
	uint32_t		ref;

	TAILQ_ENTRY(spdk_bdev_shared_resource) link;
};

#define BDEV_CH_RESET_IN_PROGRESS	(1 << 0)
#define BDEV_CH_QOS_ENABLED		(1 << 1)

struct spdk_bdev_channel {
	struct spdk_bdev	*bdev;

	/* The channel for the underlying device */
	struct spdk_io_channel	*channel;

	/* Per io_device per thread data */
	struct spdk_bdev_shared_resource *shared_resource;

	struct spdk_bdev_io_stat *stat;

	/*
	 * Count of I/O submitted to the underlying dev module through this channel
	 * and waiting for completion.
	 */
	uint64_t		io_outstanding;

	/*
	 * List of all submitted I/Os including I/O that are generated via splitting.
	 */
	bdev_io_tailq_t		io_submitted;

	/*
	 * List of spdk_bdev_io that are currently queued because they write to a locked
	 * LBA range.
275 */ 276 bdev_io_tailq_t io_locked; 277 278 uint32_t flags; 279 280 struct spdk_histogram_data *histogram; 281 282 #ifdef SPDK_CONFIG_VTUNE 283 uint64_t start_tsc; 284 uint64_t interval_tsc; 285 __itt_string_handle *handle; 286 struct spdk_bdev_io_stat *prev_stat; 287 #endif 288 289 bdev_io_tailq_t queued_resets; 290 291 lba_range_tailq_t locked_ranges; 292 }; 293 294 struct media_event_entry { 295 struct spdk_bdev_media_event event; 296 TAILQ_ENTRY(media_event_entry) tailq; 297 }; 298 299 #define MEDIA_EVENT_POOL_SIZE 64 300 301 struct spdk_bdev_desc { 302 struct spdk_bdev *bdev; 303 struct spdk_thread *thread; 304 struct { 305 spdk_bdev_event_cb_t event_fn; 306 void *ctx; 307 } callback; 308 bool closed; 309 bool write; 310 bool memory_domains_supported; 311 struct spdk_spinlock spinlock; 312 uint32_t refs; 313 TAILQ_HEAD(, media_event_entry) pending_media_events; 314 TAILQ_HEAD(, media_event_entry) free_media_events; 315 struct media_event_entry *media_events_buffer; 316 TAILQ_ENTRY(spdk_bdev_desc) link; 317 318 uint64_t timeout_in_sec; 319 spdk_bdev_io_timeout_cb cb_fn; 320 void *cb_arg; 321 struct spdk_poller *io_timeout_poller; 322 }; 323 324 struct spdk_bdev_iostat_ctx { 325 struct spdk_bdev_io_stat *stat; 326 spdk_bdev_get_device_stat_cb cb; 327 void *cb_arg; 328 }; 329 330 struct set_qos_limit_ctx { 331 void (*cb_fn)(void *cb_arg, int status); 332 void *cb_arg; 333 struct spdk_bdev *bdev; 334 }; 335 336 struct spdk_bdev_channel_iter { 337 spdk_bdev_for_each_channel_msg fn; 338 spdk_bdev_for_each_channel_done cpl; 339 struct spdk_io_channel_iter *i; 340 void *ctx; 341 }; 342 343 struct spdk_bdev_io_error_stat { 344 uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS]; 345 }; 346 347 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 348 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 349 #define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch)) 350 #define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch)) 351 352 static inline void bdev_io_complete(void *ctx); 353 354 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 355 static void bdev_write_zero_buffer_next(void *_bdev_io); 356 357 static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 358 struct spdk_io_channel *ch, void *_ctx); 359 static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status); 360 361 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 362 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 363 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 364 struct spdk_bdev_ext_io_opts *opts, bool copy_opts); 365 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 366 struct iovec *iov, int iovcnt, void *md_buf, 367 uint64_t offset_blocks, uint64_t num_blocks, 368 spdk_bdev_io_completion_cb cb, void *cb_arg, 369 struct spdk_bdev_ext_io_opts *opts, bool copy_opts); 370 371 static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 372 uint64_t offset, uint64_t length, 373 lock_range_cb cb_fn, void *cb_arg); 374 375 static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 376 uint64_t offset, uint64_t length, 377 lock_range_cb cb_fn, void *cb_arg); 378 379 static inline void bdev_io_complete(void *ctx); 380 381 static bool 
bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort); 382 static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort); 383 384 void 385 spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size) 386 { 387 if (!opts) { 388 SPDK_ERRLOG("opts should not be NULL\n"); 389 return; 390 } 391 392 if (!opts_size) { 393 SPDK_ERRLOG("opts_size should not be zero value\n"); 394 return; 395 } 396 397 opts->opts_size = opts_size; 398 399 #define SET_FIELD(field) \ 400 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \ 401 opts->field = g_bdev_opts.field; \ 402 } \ 403 404 SET_FIELD(bdev_io_pool_size); 405 SET_FIELD(bdev_io_cache_size); 406 SET_FIELD(bdev_auto_examine); 407 SET_FIELD(small_buf_pool_size); 408 SET_FIELD(large_buf_pool_size); 409 410 /* Do not remove this statement, you should always update this statement when you adding a new field, 411 * and do not forget to add the SET_FIELD statement for your added field. */ 412 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size"); 413 414 #undef SET_FIELD 415 } 416 417 SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_small_buf_pool_size, "spdk_bdev_opts.small_buf_pool_size", 418 "v23.05", 0); 419 SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_large_buf_pool_size, "spdk_bdev_opts.large_buf_pool_size", 420 "v23.05", 0); 421 int 422 spdk_bdev_set_opts(struct spdk_bdev_opts *opts) 423 { 424 struct spdk_iobuf_opts iobuf_opts; 425 uint32_t min_pool_size; 426 int rc; 427 428 if (!opts) { 429 SPDK_ERRLOG("opts cannot be NULL\n"); 430 return -1; 431 } 432 433 if (!opts->opts_size) { 434 SPDK_ERRLOG("opts_size inside opts cannot be zero value\n"); 435 return -1; 436 } 437 438 /* 439 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem 440 * initialization. A second mgmt_ch will be created on the same thread when the application starts 441 * but before the deferred put_io_channel event is executed for the first mgmt_ch. 
442 */ 443 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 444 if (opts->bdev_io_pool_size < min_pool_size) { 445 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 446 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 447 spdk_thread_get_count()); 448 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 449 return -1; 450 } 451 452 if (opts->small_buf_pool_size != BUF_SMALL_POOL_SIZE) { 453 SPDK_LOG_DEPRECATED(bdev_opts_small_buf_pool_size); 454 } 455 if (opts->large_buf_pool_size != BUF_LARGE_POOL_SIZE) { 456 SPDK_LOG_DEPRECATED(bdev_opts_large_buf_pool_size); 457 } 458 459 #define SET_FIELD(field) \ 460 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 461 g_bdev_opts.field = opts->field; \ 462 } \ 463 464 SET_FIELD(bdev_io_pool_size); 465 SET_FIELD(bdev_io_cache_size); 466 SET_FIELD(bdev_auto_examine); 467 SET_FIELD(small_buf_pool_size); 468 SET_FIELD(large_buf_pool_size); 469 470 spdk_iobuf_get_opts(&iobuf_opts); 471 iobuf_opts.small_pool_count = opts->small_buf_pool_size; 472 iobuf_opts.large_pool_count = opts->large_buf_pool_size; 473 474 rc = spdk_iobuf_set_opts(&iobuf_opts); 475 if (rc != 0) { 476 SPDK_ERRLOG("Failed to set iobuf opts\n"); 477 return -1; 478 } 479 480 g_bdev_opts.opts_size = opts->opts_size; 481 482 #undef SET_FIELD 483 484 return 0; 485 } 486 487 static struct spdk_bdev * 488 bdev_get_by_name(const char *bdev_name) 489 { 490 struct spdk_bdev_name find; 491 struct spdk_bdev_name *res; 492 493 find.name = (char *)bdev_name; 494 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 495 if (res != NULL) { 496 return res->bdev; 497 } 498 499 return NULL; 500 } 501 502 struct spdk_bdev * 503 spdk_bdev_get_by_name(const char *bdev_name) 504 { 505 struct spdk_bdev *bdev; 506 507 spdk_spin_lock(&g_bdev_mgr.spinlock); 508 bdev = bdev_get_by_name(bdev_name); 509 spdk_spin_unlock(&g_bdev_mgr.spinlock); 510 511 return bdev; 512 } 513 514 struct bdev_io_status_string { 515 enum spdk_bdev_io_status status; 516 const char *str; 517 }; 518 519 static const struct bdev_io_status_string bdev_io_status_strings[] = { 520 { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" }, 521 { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" }, 522 { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" }, 523 { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" }, 524 { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" }, 525 { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" }, 526 { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" }, 527 { SPDK_BDEV_IO_STATUS_FAILED, "failed" }, 528 { SPDK_BDEV_IO_STATUS_PENDING, "pending" }, 529 { SPDK_BDEV_IO_STATUS_SUCCESS, "success" }, 530 }; 531 532 static const char * 533 bdev_io_status_get_string(enum spdk_bdev_io_status status) 534 { 535 uint32_t i; 536 537 for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) { 538 if (bdev_io_status_strings[i].status == status) { 539 return bdev_io_status_strings[i].str; 540 } 541 } 542 543 return "reserved"; 544 } 545 546 struct spdk_bdev_wait_for_examine_ctx { 547 struct spdk_poller *poller; 548 spdk_bdev_wait_for_examine_cb cb_fn; 549 void *cb_arg; 550 }; 551 552 static bool bdev_module_all_actions_completed(void); 553 554 static int 555 bdev_wait_for_examine_cb(void *arg) 556 { 557 struct spdk_bdev_wait_for_examine_ctx *ctx = arg; 558 559 if (!bdev_module_all_actions_completed()) { 560 return SPDK_POLLER_IDLE; 561 } 562 563 spdk_poller_unregister(&ctx->poller); 564 
ctx->cb_fn(ctx->cb_arg); 565 free(ctx); 566 567 return SPDK_POLLER_BUSY; 568 } 569 570 int 571 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 572 { 573 struct spdk_bdev_wait_for_examine_ctx *ctx; 574 575 ctx = calloc(1, sizeof(*ctx)); 576 if (ctx == NULL) { 577 return -ENOMEM; 578 } 579 ctx->cb_fn = cb_fn; 580 ctx->cb_arg = cb_arg; 581 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 582 583 return 0; 584 } 585 586 struct spdk_bdev_examine_item { 587 char *name; 588 TAILQ_ENTRY(spdk_bdev_examine_item) link; 589 }; 590 591 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); 592 593 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 594 g_bdev_examine_allowlist); 595 596 static inline bool 597 bdev_examine_allowlist_check(const char *name) 598 { 599 struct spdk_bdev_examine_item *item; 600 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 601 if (strcmp(name, item->name) == 0) { 602 return true; 603 } 604 } 605 return false; 606 } 607 608 static inline void 609 bdev_examine_allowlist_free(void) 610 { 611 struct spdk_bdev_examine_item *item; 612 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 613 item = TAILQ_FIRST(&g_bdev_examine_allowlist); 614 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 615 free(item->name); 616 free(item); 617 } 618 } 619 620 static inline bool 621 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 622 { 623 struct spdk_bdev_alias *tmp; 624 if (bdev_examine_allowlist_check(bdev->name)) { 625 return true; 626 } 627 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 628 if (bdev_examine_allowlist_check(tmp->alias.name)) { 629 return true; 630 } 631 } 632 return false; 633 } 634 635 static inline bool 636 bdev_ok_to_examine(struct spdk_bdev *bdev) 637 { 638 if (g_bdev_opts.bdev_auto_examine) { 639 return true; 640 } else { 641 return bdev_in_examine_allowlist(bdev); 642 } 643 } 644 645 static void 646 bdev_examine(struct spdk_bdev *bdev) 647 { 648 struct spdk_bdev_module *module; 649 uint32_t action; 650 651 if (!bdev_ok_to_examine(bdev)) { 652 return; 653 } 654 655 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 656 if (module->examine_config) { 657 spdk_spin_lock(&module->internal.spinlock); 658 action = module->internal.action_in_progress; 659 module->internal.action_in_progress++; 660 spdk_spin_unlock(&module->internal.spinlock); 661 module->examine_config(bdev); 662 if (action != module->internal.action_in_progress) { 663 SPDK_ERRLOG("examine_config for module %s did not call " 664 "spdk_bdev_module_examine_done()\n", module->name); 665 } 666 } 667 } 668 669 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 670 module = bdev->internal.claim.v1.module; 671 if (module->examine_disk) { 672 spdk_spin_lock(&module->internal.spinlock); 673 module->internal.action_in_progress++; 674 spdk_spin_unlock(&module->internal.spinlock); 675 module->examine_disk(bdev); 676 } 677 return; 678 } 679 680 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 681 if (module->examine_disk) { 682 spdk_spin_lock(&module->internal.spinlock); 683 module->internal.action_in_progress++; 684 spdk_spin_unlock(&module->internal.spinlock); 685 module->examine_disk(bdev); 686 } 687 } 688 } 689 690 int 691 spdk_bdev_examine(const char *name) 692 { 693 struct spdk_bdev *bdev; 694 struct spdk_bdev_examine_item *item; 695 696 if (spdk_unlikely(spdk_thread_get_app_thread() != spdk_get_thread())) { 697 SPDK_LOG_DEPRECATED(bdev_register_examine_thread); 698 } 699 700 
if (g_bdev_opts.bdev_auto_examine) { 701 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); 702 return -EINVAL; 703 } 704 705 if (bdev_examine_allowlist_check(name)) { 706 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 707 return -EEXIST; 708 } 709 710 item = calloc(1, sizeof(*item)); 711 if (!item) { 712 return -ENOMEM; 713 } 714 item->name = strdup(name); 715 if (!item->name) { 716 free(item); 717 return -ENOMEM; 718 } 719 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 720 721 bdev = spdk_bdev_get_by_name(name); 722 if (bdev) { 723 bdev_examine(bdev); 724 } 725 return 0; 726 } 727 728 static inline void 729 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 730 { 731 struct spdk_bdev_examine_item *item; 732 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 733 spdk_json_write_object_begin(w); 734 spdk_json_write_named_string(w, "method", "bdev_examine"); 735 spdk_json_write_named_object_begin(w, "params"); 736 spdk_json_write_named_string(w, "name", item->name); 737 spdk_json_write_object_end(w); 738 spdk_json_write_object_end(w); 739 } 740 } 741 742 struct spdk_bdev * 743 spdk_bdev_first(void) 744 { 745 struct spdk_bdev *bdev; 746 747 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 748 if (bdev) { 749 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 750 } 751 752 return bdev; 753 } 754 755 struct spdk_bdev * 756 spdk_bdev_next(struct spdk_bdev *prev) 757 { 758 struct spdk_bdev *bdev; 759 760 bdev = TAILQ_NEXT(prev, internal.link); 761 if (bdev) { 762 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 763 } 764 765 return bdev; 766 } 767 768 static struct spdk_bdev * 769 _bdev_next_leaf(struct spdk_bdev *bdev) 770 { 771 while (bdev != NULL) { 772 if (bdev->internal.claim_type == SPDK_BDEV_CLAIM_NONE) { 773 return bdev; 774 } else { 775 bdev = TAILQ_NEXT(bdev, internal.link); 776 } 777 } 778 779 return bdev; 780 } 781 782 struct spdk_bdev * 783 spdk_bdev_first_leaf(void) 784 { 785 struct spdk_bdev *bdev; 786 787 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 788 789 if (bdev) { 790 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 791 } 792 793 return bdev; 794 } 795 796 struct spdk_bdev * 797 spdk_bdev_next_leaf(struct spdk_bdev *prev) 798 { 799 struct spdk_bdev *bdev; 800 801 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 802 803 if (bdev) { 804 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 805 } 806 807 return bdev; 808 } 809 810 static inline bool 811 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 812 { 813 return bdev_io->internal.ext_opts && bdev_io->internal.ext_opts->memory_domain; 814 } 815 816 void 817 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 818 { 819 struct iovec *iovs; 820 821 if (bdev_io->u.bdev.iovs == NULL) { 822 bdev_io->u.bdev.iovs = &bdev_io->iov; 823 bdev_io->u.bdev.iovcnt = 1; 824 } 825 826 iovs = bdev_io->u.bdev.iovs; 827 828 assert(iovs != NULL); 829 assert(bdev_io->u.bdev.iovcnt >= 1); 830 831 iovs[0].iov_base = buf; 832 iovs[0].iov_len = len; 833 } 834 835 void 836 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 837 { 838 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 839 bdev_io->u.bdev.md_buf = md_buf; 840 } 841 842 static bool 843 _is_buf_allocated(const struct iovec *iovs) 844 { 845 if (iovs == NULL) { 846 return false; 847 } 848 849 return iovs[0].iov_base != NULL; 850 } 851 852 static bool 
853 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 854 { 855 int i; 856 uintptr_t iov_base; 857 858 if (spdk_likely(alignment == 1)) { 859 return true; 860 } 861 862 for (i = 0; i < iovcnt; i++) { 863 iov_base = (uintptr_t)iovs[i].iov_base; 864 if ((iov_base & (alignment - 1)) != 0) { 865 return false; 866 } 867 } 868 869 return true; 870 } 871 872 static void 873 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 874 { 875 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 876 void *buf; 877 878 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 879 buf = bdev_io->internal.buf; 880 bdev_io->internal.buf = NULL; 881 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 882 bdev_io->internal.get_aux_buf_cb = NULL; 883 } else { 884 assert(bdev_io->internal.get_buf_cb != NULL); 885 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 886 bdev_io->internal.get_buf_cb = NULL; 887 } 888 } 889 890 static void 891 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 892 { 893 struct spdk_bdev_io *bdev_io = ctx; 894 895 if (rc) { 896 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 897 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 898 } 899 bdev_io_get_buf_complete(bdev_io, !rc); 900 } 901 902 static void 903 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 904 { 905 int rc = 0; 906 907 /* save original md_buf */ 908 bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 909 bdev_io->internal.orig_md_iov.iov_len = len; 910 bdev_io->internal.bounce_md_iov.iov_base = md_buf; 911 bdev_io->internal.bounce_md_iov.iov_len = len; 912 /* set bounce md_buf */ 913 bdev_io->u.bdev.md_buf = md_buf; 914 915 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 916 if (bdev_io_use_memory_domain(bdev_io)) { 917 rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain, 918 bdev_io->internal.ext_opts->memory_domain_ctx, 919 &bdev_io->internal.orig_md_iov, 1, 920 &bdev_io->internal.bounce_md_iov, 1, 921 bdev_io->internal.data_transfer_cpl, 922 bdev_io); 923 if (rc == 0) { 924 /* Continue to submit IO in completion callback */ 925 return; 926 } 927 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 928 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain), rc); 929 } else { 930 memcpy(md_buf, bdev_io->internal.orig_md_iov.iov_base, bdev_io->internal.orig_md_iov.iov_len); 931 } 932 } 933 934 assert(bdev_io->internal.data_transfer_cpl); 935 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 936 } 937 938 static void 939 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 940 { 941 struct spdk_bdev *bdev = bdev_io->bdev; 942 uint64_t md_len; 943 void *buf; 944 945 if (spdk_bdev_is_md_separate(bdev)) { 946 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 947 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 948 949 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 950 951 if (bdev_io->u.bdev.md_buf != NULL) { 952 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 953 return; 954 } else { 955 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 956 } 957 } 958 959 bdev_io_get_buf_complete(bdev_io, true); 960 } 961 962 static void 963 _bdev_io_pull_bounce_data_buf_done(void *ctx, int rc) 964 { 965 struct spdk_bdev_io *bdev_io = ctx; 966 967 if (rc) { 968 SPDK_ERRLOG("Failed to get data buffer\n"); 969 assert(bdev_io->internal.data_transfer_cpl); 970 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 971 return; 
972 } 973 974 _bdev_io_set_md_buf(bdev_io); 975 } 976 977 static void 978 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 979 bdev_copy_bounce_buffer_cpl cpl_cb) 980 { 981 int rc = 0; 982 983 bdev_io->internal.data_transfer_cpl = cpl_cb; 984 /* save original iovec */ 985 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 986 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 987 /* set bounce iov */ 988 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 989 bdev_io->u.bdev.iovcnt = 1; 990 /* set bounce buffer for this operation */ 991 bdev_io->u.bdev.iovs[0].iov_base = buf; 992 bdev_io->u.bdev.iovs[0].iov_len = len; 993 /* if this is write path, copy data from original buffer to bounce buffer */ 994 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 995 if (bdev_io_use_memory_domain(bdev_io)) { 996 rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain, 997 bdev_io->internal.ext_opts->memory_domain_ctx, 998 bdev_io->internal.orig_iovs, 999 (uint32_t) bdev_io->internal.orig_iovcnt, 1000 bdev_io->u.bdev.iovs, 1, 1001 _bdev_io_pull_bounce_data_buf_done, 1002 bdev_io); 1003 if (rc == 0) { 1004 /* Continue to submit IO in completion callback */ 1005 return; 1006 } 1007 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 1008 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 1009 } else { 1010 spdk_copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt); 1011 } 1012 } 1013 1014 _bdev_io_pull_bounce_data_buf_done(bdev_io, rc); 1015 } 1016 1017 static void 1018 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) 1019 { 1020 struct spdk_bdev *bdev = bdev_io->bdev; 1021 bool buf_allocated; 1022 uint64_t alignment; 1023 void *aligned_buf; 1024 1025 bdev_io->internal.buf = buf; 1026 1027 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1028 bdev_io_get_buf_complete(bdev_io, true); 1029 return; 1030 } 1031 1032 alignment = spdk_bdev_get_buf_align(bdev); 1033 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 1034 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 1035 1036 if (buf_allocated) { 1037 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 1038 /* Continue in completion callback */ 1039 return; 1040 } else { 1041 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 1042 } 1043 1044 _bdev_io_set_md_buf(bdev_io); 1045 } 1046 1047 static inline uint64_t 1048 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len) 1049 { 1050 struct spdk_bdev *bdev = bdev_io->bdev; 1051 uint64_t md_len, alignment; 1052 1053 md_len = spdk_bdev_is_md_separate(bdev) ? 
bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1054 alignment = spdk_bdev_get_buf_align(bdev); 1055 1056 return len + alignment + md_len; 1057 } 1058 1059 static void 1060 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 1061 { 1062 struct spdk_bdev_mgmt_channel *ch; 1063 1064 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1065 spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len)); 1066 } 1067 1068 static void 1069 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 1070 { 1071 assert(bdev_io->internal.buf != NULL); 1072 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len); 1073 bdev_io->internal.buf = NULL; 1074 } 1075 1076 void 1077 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1078 { 1079 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1080 1081 assert(buf != NULL); 1082 _bdev_io_put_buf(bdev_io, buf, len); 1083 } 1084 1085 static void 1086 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 1087 { 1088 struct spdk_bdev *bdev = bdev_ch->bdev; 1089 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1090 struct spdk_bdev_io *bdev_io; 1091 1092 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 1093 /* 1094 * Allow some more I/O to complete before retrying the nomem_io queue. 1095 * Some drivers (such as nvme) cannot immediately take a new I/O in 1096 * the context of a completion, because the resources for the I/O are 1097 * not released until control returns to the bdev poller. Also, we 1098 * may require several small I/O to complete before a larger I/O 1099 * (that requires splitting) can be submitted. 1100 */ 1101 return; 1102 } 1103 1104 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1105 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 1106 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 1107 bdev_io->internal.ch->io_outstanding++; 1108 shared_resource->io_outstanding++; 1109 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1110 bdev_io->internal.error.nvme.cdw0 = 0; 1111 bdev_io->num_retries++; 1112 bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 1113 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 1114 break; 1115 } 1116 } 1117 } 1118 1119 static inline void 1120 _bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1121 struct spdk_bdev_shared_resource *shared_resource) 1122 { 1123 assert(bdev_ch->io_outstanding > 0); 1124 assert(shared_resource->io_outstanding > 0); 1125 bdev_ch->io_outstanding--; 1126 shared_resource->io_outstanding--; 1127 } 1128 1129 static inline bool 1130 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io) 1131 { 1132 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1133 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1134 1135 if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) { 1136 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 1137 /* 1138 * Wait for some of the outstanding I/O to complete before we 1139 * retry any of the nomem_io. Normally we will wait for 1140 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 1141 * depth channels we will instead wait for half to complete. 
1142 */ 1143 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 1144 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 1145 return true; 1146 } 1147 1148 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1149 bdev_ch_retry_io(bdev_ch); 1150 } 1151 1152 return false; 1153 } 1154 1155 static void 1156 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1157 { 1158 struct spdk_bdev_io *bdev_io = ctx; 1159 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1160 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1161 1162 if (rc) { 1163 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1164 } 1165 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1166 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). 1167 */ 1168 bdev_io_put_buf(bdev_io); 1169 1170 /* Continue with IO completion flow */ 1171 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 1172 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 1173 return; 1174 } 1175 1176 bdev_io_complete(bdev_io); 1177 } 1178 1179 static inline void 1180 _bdev_io_push_bounce_md_buffer(struct spdk_bdev_io *bdev_io) 1181 { 1182 int rc = 0; 1183 1184 /* do the same for metadata buffer */ 1185 if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) { 1186 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1187 1188 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1189 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 1190 if (bdev_io_use_memory_domain(bdev_io)) { 1191 /* If memory domain is used then we need to call async push function */ 1192 rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain, 1193 bdev_io->internal.ext_opts->memory_domain_ctx, 1194 &bdev_io->internal.orig_md_iov, 1195 (uint32_t)bdev_io->internal.orig_iovcnt, 1196 &bdev_io->internal.bounce_md_iov, 1, 1197 bdev_io->internal.data_transfer_cpl, 1198 bdev_io); 1199 if (rc == 0) { 1200 /* Continue IO completion in async callback */ 1201 return; 1202 } 1203 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1204 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 1205 } else { 1206 memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1207 bdev_io->internal.orig_md_iov.iov_len); 1208 } 1209 } 1210 } 1211 1212 assert(bdev_io->internal.data_transfer_cpl); 1213 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1214 } 1215 1216 static void 1217 _bdev_io_push_bounce_data_buffer_done(void *ctx, int rc) 1218 { 1219 struct spdk_bdev_io *bdev_io = ctx; 1220 1221 assert(bdev_io->internal.data_transfer_cpl); 1222 1223 if (rc) { 1224 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1225 return; 1226 } 1227 1228 /* set original buffer for this io */ 1229 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 1230 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 1231 /* disable bouncing buffer for this io */ 1232 bdev_io->internal.orig_iovcnt = 0; 1233 bdev_io->internal.orig_iovs = NULL; 1234 1235 _bdev_io_push_bounce_md_buffer(bdev_io); 1236 } 1237 1238 static inline void 1239 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1240 { 1241 int rc = 0; 1242 1243 bdev_io->internal.data_transfer_cpl = cpl_cb; 1244 1245 /* if this is read path, copy data from bounce buffer to original buffer */ 1246 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1247 bdev_io->internal.status == 
SPDK_BDEV_IO_STATUS_SUCCESS) { 1248 if (bdev_io_use_memory_domain(bdev_io)) { 1249 /* If memory domain is used then we need to call async push function */ 1250 rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain, 1251 bdev_io->internal.ext_opts->memory_domain_ctx, 1252 bdev_io->internal.orig_iovs, 1253 (uint32_t)bdev_io->internal.orig_iovcnt, 1254 &bdev_io->internal.bounce_iov, 1, 1255 _bdev_io_push_bounce_data_buffer_done, 1256 bdev_io); 1257 if (rc == 0) { 1258 /* Continue IO completion in async callback */ 1259 return; 1260 } 1261 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1262 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 1263 } else { 1264 spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs, 1265 bdev_io->internal.orig_iovcnt, 1266 bdev_io->internal.bounce_iov.iov_base, 1267 bdev_io->internal.bounce_iov.iov_len); 1268 } 1269 } 1270 1271 _bdev_io_push_bounce_data_buffer_done(bdev_io, rc); 1272 } 1273 1274 static void 1275 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1276 { 1277 struct spdk_bdev_io *bdev_io; 1278 1279 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1280 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len); 1281 } 1282 1283 static void 1284 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1285 { 1286 struct spdk_bdev_mgmt_channel *mgmt_ch; 1287 uint64_t max_len; 1288 void *buf; 1289 1290 assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1291 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1292 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1293 1294 if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) { 1295 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1296 bdev_io_get_buf_complete(bdev_io, false); 1297 return; 1298 } 1299 1300 bdev_io->internal.buf_len = len; 1301 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1302 bdev_io_get_iobuf_cb); 1303 if (buf != NULL) { 1304 _bdev_io_set_buf(bdev_io, buf, len); 1305 } 1306 } 1307 1308 void 1309 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1310 { 1311 struct spdk_bdev *bdev = bdev_io->bdev; 1312 uint64_t alignment; 1313 1314 assert(cb != NULL); 1315 bdev_io->internal.get_buf_cb = cb; 1316 1317 alignment = spdk_bdev_get_buf_align(bdev); 1318 1319 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1320 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1321 /* Buffer already present and aligned */ 1322 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1323 return; 1324 } 1325 1326 bdev_io_get_buf(bdev_io, len); 1327 } 1328 1329 static void 1330 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1331 bool success) 1332 { 1333 if (!success) { 1334 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1335 bdev_io_complete(bdev_io); 1336 } else { 1337 bdev_io_submit(bdev_io); 1338 } 1339 } 1340 1341 static void 1342 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1343 uint64_t len) 1344 { 1345 assert(cb != NULL); 1346 bdev_io->internal.get_buf_cb = cb; 1347 1348 bdev_io_get_buf(bdev_io, len); 1349 } 1350 1351 void 1352 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1353 { 1354 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1355 1356 assert(cb != NULL); 1357 assert(bdev_io->internal.get_aux_buf_cb == NULL); 
1358 bdev_io->internal.get_aux_buf_cb = cb; 1359 bdev_io_get_buf(bdev_io, len); 1360 } 1361 1362 static int 1363 bdev_module_get_max_ctx_size(void) 1364 { 1365 struct spdk_bdev_module *bdev_module; 1366 int max_bdev_module_size = 0; 1367 1368 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1369 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1370 max_bdev_module_size = bdev_module->get_ctx_size(); 1371 } 1372 } 1373 1374 return max_bdev_module_size; 1375 } 1376 1377 static void 1378 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1379 { 1380 int i; 1381 struct spdk_bdev_qos *qos = bdev->internal.qos; 1382 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1383 1384 if (!qos) { 1385 return; 1386 } 1387 1388 spdk_bdev_get_qos_rate_limits(bdev, limits); 1389 1390 spdk_json_write_object_begin(w); 1391 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1392 1393 spdk_json_write_named_object_begin(w, "params"); 1394 spdk_json_write_named_string(w, "name", bdev->name); 1395 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1396 if (limits[i] > 0) { 1397 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1398 } 1399 } 1400 spdk_json_write_object_end(w); 1401 1402 spdk_json_write_object_end(w); 1403 } 1404 1405 void 1406 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1407 { 1408 struct spdk_bdev_module *bdev_module; 1409 struct spdk_bdev *bdev; 1410 1411 assert(w != NULL); 1412 1413 spdk_json_write_array_begin(w); 1414 1415 spdk_json_write_object_begin(w); 1416 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1417 spdk_json_write_named_object_begin(w, "params"); 1418 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1419 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1420 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1421 spdk_json_write_object_end(w); 1422 spdk_json_write_object_end(w); 1423 1424 bdev_examine_allowlist_config_json(w); 1425 1426 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1427 if (bdev_module->config_json) { 1428 bdev_module->config_json(w); 1429 } 1430 } 1431 1432 spdk_spin_lock(&g_bdev_mgr.spinlock); 1433 1434 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1435 if (bdev->fn_table->write_config_json) { 1436 bdev->fn_table->write_config_json(bdev, w); 1437 } 1438 1439 bdev_qos_config_json(bdev, w); 1440 } 1441 1442 spdk_spin_unlock(&g_bdev_mgr.spinlock); 1443 1444 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1445 spdk_json_write_object_begin(w); 1446 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1447 spdk_json_write_object_end(w); 1448 1449 spdk_json_write_array_end(w); 1450 } 1451 1452 static void 1453 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1454 { 1455 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1456 struct spdk_bdev_io *bdev_io; 1457 1458 spdk_iobuf_channel_fini(&ch->iobuf); 1459 1460 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1461 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1462 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1463 ch->per_thread_cache_count--; 1464 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1465 } 1466 1467 assert(ch->per_thread_cache_count == 0); 1468 } 1469 1470 static int 1471 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1472 { 1473 
struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1474 struct spdk_bdev_io *bdev_io; 1475 uint32_t i; 1476 int rc; 1477 1478 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", BUF_SMALL_CACHE_SIZE, BUF_LARGE_CACHE_SIZE); 1479 if (rc != 0) { 1480 SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 1481 return -1; 1482 } 1483 1484 STAILQ_INIT(&ch->per_thread_cache); 1485 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 1486 1487 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. */ 1488 ch->per_thread_cache_count = 0; 1489 for (i = 0; i < ch->bdev_io_cache_size; i++) { 1490 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1491 if (bdev_io == NULL) { 1492 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 1493 assert(false); 1494 bdev_mgmt_channel_destroy(io_device, ctx_buf); 1495 return -1; 1496 } 1497 ch->per_thread_cache_count++; 1498 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1499 } 1500 1501 TAILQ_INIT(&ch->shared_resources); 1502 TAILQ_INIT(&ch->io_wait_queue); 1503 1504 return 0; 1505 } 1506 1507 static void 1508 bdev_init_complete(int rc) 1509 { 1510 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 1511 void *cb_arg = g_init_cb_arg; 1512 struct spdk_bdev_module *m; 1513 1514 g_bdev_mgr.init_complete = true; 1515 g_init_cb_fn = NULL; 1516 g_init_cb_arg = NULL; 1517 1518 /* 1519 * For modules that need to know when subsystem init is complete, 1520 * inform them now. 1521 */ 1522 if (rc == 0) { 1523 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1524 if (m->init_complete) { 1525 m->init_complete(); 1526 } 1527 } 1528 } 1529 1530 cb_fn(cb_arg, rc); 1531 } 1532 1533 static bool 1534 bdev_module_all_actions_completed(void) 1535 { 1536 struct spdk_bdev_module *m; 1537 1538 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1539 if (m->internal.action_in_progress > 0) { 1540 return false; 1541 } 1542 } 1543 return true; 1544 } 1545 1546 static void 1547 bdev_module_action_complete(void) 1548 { 1549 /* 1550 * Don't finish bdev subsystem initialization if 1551 * module pre-initialization is still in progress, or 1552 * the subsystem been already initialized. 1553 */ 1554 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 1555 return; 1556 } 1557 1558 /* 1559 * Check all bdev modules for inits/examinations in progress. If any 1560 * exist, return immediately since we cannot finish bdev subsystem 1561 * initialization until all are completed. 1562 */ 1563 if (!bdev_module_all_actions_completed()) { 1564 return; 1565 } 1566 1567 /* 1568 * Modules already finished initialization - now that all 1569 * the bdev modules have finished their asynchronous I/O 1570 * processing, the entire bdev layer can be marked as complete. 
1571 */ 1572 bdev_init_complete(0); 1573 } 1574 1575 static void 1576 bdev_module_action_done(struct spdk_bdev_module *module) 1577 { 1578 spdk_spin_lock(&module->internal.spinlock); 1579 assert(module->internal.action_in_progress > 0); 1580 module->internal.action_in_progress--; 1581 spdk_spin_unlock(&module->internal.spinlock); 1582 bdev_module_action_complete(); 1583 } 1584 1585 void 1586 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 1587 { 1588 assert(module->async_init); 1589 bdev_module_action_done(module); 1590 } 1591 1592 void 1593 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 1594 { 1595 bdev_module_action_done(module); 1596 } 1597 1598 /** The last initialized bdev module */ 1599 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 1600 1601 static void 1602 bdev_init_failed(void *cb_arg) 1603 { 1604 struct spdk_bdev_module *module = cb_arg; 1605 1606 spdk_spin_lock(&module->internal.spinlock); 1607 assert(module->internal.action_in_progress > 0); 1608 module->internal.action_in_progress--; 1609 spdk_spin_unlock(&module->internal.spinlock); 1610 bdev_init_complete(-1); 1611 } 1612 1613 static int 1614 bdev_modules_init(void) 1615 { 1616 struct spdk_bdev_module *module; 1617 int rc = 0; 1618 1619 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1620 g_resume_bdev_module = module; 1621 if (module->async_init) { 1622 spdk_spin_lock(&module->internal.spinlock); 1623 module->internal.action_in_progress = 1; 1624 spdk_spin_unlock(&module->internal.spinlock); 1625 } 1626 rc = module->module_init(); 1627 if (rc != 0) { 1628 /* Bump action_in_progress to prevent other modules from completion of modules_init 1629 * Send message to defer application shutdown until resources are cleaned up */ 1630 spdk_spin_lock(&module->internal.spinlock); 1631 module->internal.action_in_progress = 1; 1632 spdk_spin_unlock(&module->internal.spinlock); 1633 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 1634 return rc; 1635 } 1636 } 1637 1638 g_resume_bdev_module = NULL; 1639 return 0; 1640 } 1641 1642 void 1643 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 1644 { 1645 int rc = 0; 1646 char mempool_name[32]; 1647 1648 assert(cb_fn != NULL); 1649 1650 g_init_cb_fn = cb_fn; 1651 g_init_cb_arg = cb_arg; 1652 1653 spdk_notify_type_register("bdev_register"); 1654 spdk_notify_type_register("bdev_unregister"); 1655 1656 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 1657 1658 rc = spdk_iobuf_register_module("bdev"); 1659 if (rc != 0) { 1660 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 1661 bdev_init_complete(-1); 1662 return; 1663 } 1664 1665 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 1666 g_bdev_opts.bdev_io_pool_size, 1667 sizeof(struct spdk_bdev_io) + 1668 bdev_module_get_max_ctx_size(), 1669 0, 1670 SPDK_ENV_SOCKET_ID_ANY); 1671 1672 if (g_bdev_mgr.bdev_io_pool == NULL) { 1673 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 1674 bdev_init_complete(-1); 1675 return; 1676 } 1677 1678 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 1679 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 1680 if (!g_bdev_mgr.zero_buffer) { 1681 SPDK_ERRLOG("create bdev zero buffer failed\n"); 1682 bdev_init_complete(-1); 1683 return; 1684 } 1685 1686 #ifdef SPDK_CONFIG_VTUNE 1687 SPDK_LOG_DEPRECATED(vtune_support); 1688 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 1689 #endif 1690 1691 spdk_io_device_register(&g_bdev_mgr, 
bdev_mgmt_channel_create, 1692 bdev_mgmt_channel_destroy, 1693 sizeof(struct spdk_bdev_mgmt_channel), 1694 "bdev_mgr"); 1695 1696 rc = bdev_modules_init(); 1697 g_bdev_mgr.module_init_complete = true; 1698 if (rc != 0) { 1699 SPDK_ERRLOG("bdev modules init failed\n"); 1700 return; 1701 } 1702 1703 bdev_module_action_complete(); 1704 } 1705 1706 static void 1707 bdev_mgr_unregister_cb(void *io_device) 1708 { 1709 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 1710 1711 if (g_bdev_mgr.bdev_io_pool) { 1712 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 1713 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 1714 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 1715 g_bdev_opts.bdev_io_pool_size); 1716 } 1717 1718 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 1719 } 1720 1721 spdk_free(g_bdev_mgr.zero_buffer); 1722 1723 bdev_examine_allowlist_free(); 1724 1725 cb_fn(g_fini_cb_arg); 1726 g_fini_cb_fn = NULL; 1727 g_fini_cb_arg = NULL; 1728 g_bdev_mgr.init_complete = false; 1729 g_bdev_mgr.module_init_complete = false; 1730 } 1731 1732 static void 1733 bdev_module_fini_iter(void *arg) 1734 { 1735 struct spdk_bdev_module *bdev_module; 1736 1737 /* FIXME: Handling initialization failures is broken now, 1738 * so we won't even try cleaning up after successfully 1739 * initialized modules. if module_init_complete is false, 1740 * just call spdk_bdev_mgr_unregister_cb 1741 */ 1742 if (!g_bdev_mgr.module_init_complete) { 1743 bdev_mgr_unregister_cb(NULL); 1744 return; 1745 } 1746 1747 /* Start iterating from the last touched module */ 1748 if (!g_resume_bdev_module) { 1749 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1750 } else { 1751 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 1752 internal.tailq); 1753 } 1754 1755 while (bdev_module) { 1756 if (bdev_module->async_fini) { 1757 /* Save our place so we can resume later. We must 1758 * save the variable here, before calling module_fini() 1759 * below, because in some cases the module may immediately 1760 * call spdk_bdev_module_fini_done() and re-enter 1761 * this function to continue iterating. */ 1762 g_resume_bdev_module = bdev_module; 1763 } 1764 1765 if (bdev_module->module_fini) { 1766 bdev_module->module_fini(); 1767 } 1768 1769 if (bdev_module->async_fini) { 1770 return; 1771 } 1772 1773 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 1774 internal.tailq); 1775 } 1776 1777 g_resume_bdev_module = NULL; 1778 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 1779 } 1780 1781 void 1782 spdk_bdev_module_fini_done(void) 1783 { 1784 if (spdk_get_thread() != g_fini_thread) { 1785 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 1786 } else { 1787 bdev_module_fini_iter(NULL); 1788 } 1789 } 1790 1791 static void 1792 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 1793 { 1794 struct spdk_bdev *bdev = cb_arg; 1795 1796 if (bdeverrno && bdev) { 1797 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 1798 bdev->name); 1799 1800 /* 1801 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 1802 * bdev; try to continue by manually removing this bdev from the list and continue 1803 * with the next bdev in the list. 
1804 */ 1805 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 1806 } 1807 1808 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 1809 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 1810 /* 1811 * Bdev module finish need to be deferred as we might be in the middle of some context 1812 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 1813 * after returning. 1814 */ 1815 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 1816 return; 1817 } 1818 1819 /* 1820 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 1821 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 1822 * to detect clean shutdown as opposed to run-time hot removal of the underlying 1823 * base bdevs. 1824 * 1825 * Also, walk the list in the reverse order. 1826 */ 1827 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1828 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1829 spdk_spin_lock(&bdev->internal.spinlock); 1830 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 1831 SPDK_DEBUGLOG(bdev, "Skipping claimed bdev '%s'(<-'%s').\n", 1832 bdev->name, bdev->internal.claim.v1.module->name); 1833 spdk_spin_unlock(&bdev->internal.spinlock); 1834 continue; 1835 } 1836 spdk_spin_unlock(&bdev->internal.spinlock); 1837 1838 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 1839 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 1840 return; 1841 } 1842 1843 /* 1844 * If any bdev fails to unclaim underlying bdev properly, we may face the 1845 * case of bdev list consisting of claimed bdevs only (if claims are managed 1846 * correctly, this would mean there's a loop in the claims graph which is 1847 * clearly impossible). Warn and unregister last bdev on the list then. 1848 */ 1849 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1850 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1851 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 1852 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 1853 return; 1854 } 1855 } 1856 1857 static void 1858 bdev_module_fini_start_iter(void *arg) 1859 { 1860 struct spdk_bdev_module *bdev_module; 1861 1862 if (!g_resume_bdev_module) { 1863 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1864 } else { 1865 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 1866 } 1867 1868 while (bdev_module) { 1869 if (bdev_module->async_fini_start) { 1870 /* Save our place so we can resume later. We must 1871 * save the variable here, before calling fini_start() 1872 * below, because in some cases the module may immediately 1873 * call spdk_bdev_module_fini_start_done() and re-enter 1874 * this function to continue iterating. 
*/ 1875 g_resume_bdev_module = bdev_module; 1876 } 1877 1878 if (bdev_module->fini_start) { 1879 bdev_module->fini_start(); 1880 } 1881 1882 if (bdev_module->async_fini_start) { 1883 return; 1884 } 1885 1886 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 1887 } 1888 1889 g_resume_bdev_module = NULL; 1890 1891 bdev_finish_unregister_bdevs_iter(NULL, 0); 1892 } 1893 1894 void 1895 spdk_bdev_module_fini_start_done(void) 1896 { 1897 if (spdk_get_thread() != g_fini_thread) { 1898 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 1899 } else { 1900 bdev_module_fini_start_iter(NULL); 1901 } 1902 } 1903 1904 static void 1905 bdev_finish_wait_for_examine_done(void *cb_arg) 1906 { 1907 bdev_module_fini_start_iter(NULL); 1908 } 1909 1910 void 1911 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 1912 { 1913 int rc; 1914 1915 assert(cb_fn != NULL); 1916 1917 g_fini_thread = spdk_get_thread(); 1918 1919 g_fini_cb_fn = cb_fn; 1920 g_fini_cb_arg = cb_arg; 1921 1922 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 1923 if (rc != 0) { 1924 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 1925 bdev_finish_wait_for_examine_done(NULL); 1926 } 1927 } 1928 1929 struct spdk_bdev_io * 1930 bdev_channel_get_io(struct spdk_bdev_channel *channel) 1931 { 1932 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 1933 struct spdk_bdev_io *bdev_io; 1934 1935 if (ch->per_thread_cache_count > 0) { 1936 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1937 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1938 ch->per_thread_cache_count--; 1939 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 1940 /* 1941 * Don't try to look for bdev_ios in the global pool if there are 1942 * waiters on bdev_ios - we don't want this caller to jump the line. 1943 */ 1944 bdev_io = NULL; 1945 } else { 1946 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1947 } 1948 1949 return bdev_io; 1950 } 1951 1952 void 1953 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 1954 { 1955 struct spdk_bdev_mgmt_channel *ch; 1956 1957 assert(bdev_io != NULL); 1958 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 1959 1960 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1961 1962 if (bdev_io->internal.buf != NULL) { 1963 bdev_io_put_buf(bdev_io); 1964 } 1965 1966 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 1967 ch->per_thread_cache_count++; 1968 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1969 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 1970 struct spdk_bdev_io_wait_entry *entry; 1971 1972 entry = TAILQ_FIRST(&ch->io_wait_queue); 1973 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 1974 entry->cb_fn(entry->cb_arg); 1975 } 1976 } else { 1977 /* We should never have a full cache with entries on the io wait queue. 
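 *
 * The waiters drained above registered themselves with spdk_bdev_queue_io_wait()
 * after seeing -ENOMEM. A minimal caller-side sketch of that retry pattern
 * (my_io_ctx, retry_read() and read_done() are illustrative names, not SPDK
 * symbols):
 *
 *   struct my_io_ctx {
 *       struct spdk_bdev_desc *desc;
 *       struct spdk_io_channel *ch;
 *       void *buf;
 *       struct spdk_bdev_io_wait_entry wait;
 *   };
 *
 *   static void
 *   retry_read(void *arg)
 *   {
 *       struct my_io_ctx *ctx = arg;
 *       int rc;
 *
 *       rc = spdk_bdev_read_blocks(ctx->desc, ctx->ch, ctx->buf, 0, 1,
 *                                  read_done, ctx);
 *       if (rc == -ENOMEM) {
 *           ctx->wait.bdev = spdk_bdev_desc_get_bdev(ctx->desc);
 *           ctx->wait.cb_fn = retry_read;
 *           ctx->wait.cb_arg = ctx;
 *           spdk_bdev_queue_io_wait(ctx->wait.bdev, ctx->ch, &ctx->wait);
 *       }
 *   }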
*/ 1978 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 1979 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1980 } 1981 } 1982 1983 static bool 1984 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 1985 { 1986 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 1987 1988 switch (limit) { 1989 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 1990 return true; 1991 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 1992 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 1993 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 1994 return false; 1995 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 1996 default: 1997 return false; 1998 } 1999 } 2000 2001 static bool 2002 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 2003 { 2004 switch (bdev_io->type) { 2005 case SPDK_BDEV_IO_TYPE_NVME_IO: 2006 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2007 case SPDK_BDEV_IO_TYPE_READ: 2008 case SPDK_BDEV_IO_TYPE_WRITE: 2009 return true; 2010 case SPDK_BDEV_IO_TYPE_ZCOPY: 2011 if (bdev_io->u.bdev.zcopy.start) { 2012 return true; 2013 } else { 2014 return false; 2015 } 2016 default: 2017 return false; 2018 } 2019 } 2020 2021 static bool 2022 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2023 { 2024 switch (bdev_io->type) { 2025 case SPDK_BDEV_IO_TYPE_NVME_IO: 2026 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2027 /* Bit 1 (0x2) set for read operation */ 2028 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2029 return true; 2030 } else { 2031 return false; 2032 } 2033 case SPDK_BDEV_IO_TYPE_READ: 2034 return true; 2035 case SPDK_BDEV_IO_TYPE_ZCOPY: 2036 /* Populate to read from disk */ 2037 if (bdev_io->u.bdev.zcopy.populate) { 2038 return true; 2039 } else { 2040 return false; 2041 } 2042 default: 2043 return false; 2044 } 2045 } 2046 2047 static uint64_t 2048 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2049 { 2050 struct spdk_bdev *bdev = bdev_io->bdev; 2051 2052 switch (bdev_io->type) { 2053 case SPDK_BDEV_IO_TYPE_NVME_IO: 2054 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2055 return bdev_io->u.nvme_passthru.nbytes; 2056 case SPDK_BDEV_IO_TYPE_READ: 2057 case SPDK_BDEV_IO_TYPE_WRITE: 2058 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2059 case SPDK_BDEV_IO_TYPE_ZCOPY: 2060 /* Track the data in the start phase only */ 2061 if (bdev_io->u.bdev.zcopy.start) { 2062 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2063 } else { 2064 return 0; 2065 } 2066 default: 2067 return 0; 2068 } 2069 } 2070 2071 static bool 2072 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2073 { 2074 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 2075 return true; 2076 } else { 2077 return false; 2078 } 2079 } 2080 2081 static bool 2082 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2083 { 2084 if (bdev_is_read_io(io) == false) { 2085 return false; 2086 } 2087 2088 return bdev_qos_rw_queue_io(limit, io); 2089 } 2090 2091 static bool 2092 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2093 { 2094 if (bdev_is_read_io(io) == true) { 2095 return false; 2096 } 2097 2098 return bdev_qos_rw_queue_io(limit, io); 2099 } 2100 2101 static void 2102 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2103 { 2104 limit->remaining_this_timeslice--; 2105 } 2106 2107 static void 2108 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2109 { 2110 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 2111 } 2112 2113 static void 2114 
bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2115 { 2116 if (bdev_is_read_io(io) == false) { 2117 return; 2118 } 2119 2120 return bdev_qos_rw_bps_update_quota(limit, io); 2121 } 2122 2123 static void 2124 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2125 { 2126 if (bdev_is_read_io(io) == true) { 2127 return; 2128 } 2129 2130 return bdev_qos_rw_bps_update_quota(limit, io); 2131 } 2132 2133 static void 2134 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2135 { 2136 int i; 2137 2138 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2139 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2140 qos->rate_limits[i].queue_io = NULL; 2141 qos->rate_limits[i].update_quota = NULL; 2142 continue; 2143 } 2144 2145 switch (i) { 2146 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2147 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2148 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2149 break; 2150 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2151 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2152 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2153 break; 2154 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2155 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2156 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2157 break; 2158 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2159 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2160 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2161 break; 2162 default: 2163 break; 2164 } 2165 } 2166 } 2167 2168 static void 2169 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2170 struct spdk_bdev_io *bdev_io, 2171 enum spdk_bdev_io_status status) 2172 { 2173 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2174 2175 bdev_io->internal.in_submit_request = true; 2176 bdev_ch->io_outstanding++; 2177 shared_resource->io_outstanding++; 2178 spdk_bdev_io_complete(bdev_io, status); 2179 bdev_io->internal.in_submit_request = false; 2180 } 2181 2182 static inline void 2183 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2184 { 2185 struct spdk_bdev *bdev = bdev_io->bdev; 2186 struct spdk_io_channel *ch = bdev_ch->channel; 2187 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2188 2189 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2190 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2191 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2192 2193 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2194 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2195 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2196 SPDK_BDEV_IO_STATUS_SUCCESS); 2197 return; 2198 } 2199 } 2200 2201 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2202 bdev_io->bdev->split_on_write_unit && 2203 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2204 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2205 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2206 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2207 return; 2208 } 2209 2210 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2211 bdev_ch->io_outstanding++; 2212 shared_resource->io_outstanding++; 2213 bdev_io->internal.in_submit_request = true; 2214 bdev->fn_table->submit_request(ch, bdev_io); 2215 
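/* The in_submit_request flag set just above (and cleared right below) brackets
 * the call into the module: it records that this I/O is currently inside
 * fn_table->submit_request(), where the module may legally complete it
 * synchronously with spdk_bdev_io_complete(), including with
 * SPDK_BDEV_IO_STATUS_NOMEM to ask for a retry once resources free up. A
 * hypothetical module-side sketch of such a synchronous completion
 * (my_submit_request() and my_channel_busy() are illustrative names only):
 *
 *   static void
 *   my_submit_request(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io)
 *   {
 *       if (my_channel_busy(ch)) {
 *           spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_NOMEM);
 *           return;
 *       }
 *       ... start the I/O and complete it later from its own callback ...
 *   }
 */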
bdev_io->internal.in_submit_request = false; 2216 } else { 2217 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 2218 } 2219 } 2220 2221 static bool 2222 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2223 { 2224 int i; 2225 2226 if (bdev_qos_io_to_limit(bdev_io) == true) { 2227 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2228 if (!qos->rate_limits[i].queue_io) { 2229 continue; 2230 } 2231 2232 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2233 bdev_io) == true) { 2234 return true; 2235 } 2236 } 2237 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2238 if (!qos->rate_limits[i].update_quota) { 2239 continue; 2240 } 2241 2242 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2243 } 2244 } 2245 2246 return false; 2247 } 2248 2249 static inline void 2250 _bdev_io_do_submit(void *ctx) 2251 { 2252 struct spdk_bdev_io *bdev_io = ctx; 2253 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2254 2255 bdev_io_do_submit(ch, bdev_io); 2256 } 2257 2258 static int 2259 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2260 { 2261 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2262 int submitted_ios = 0; 2263 2264 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2265 if (!bdev_qos_queue_io(qos, bdev_io)) { 2266 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2267 2268 if (bdev_io->internal.io_submit_ch) { 2269 /* Send back the IO to the original thread for the actual processing. */ 2270 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 2271 bdev_io->internal.io_submit_ch = NULL; 2272 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 2273 _bdev_io_do_submit, bdev_io); 2274 } else { 2275 bdev_io_do_submit(ch, bdev_io); 2276 } 2277 2278 submitted_ios++; 2279 } 2280 } 2281 2282 return submitted_ios; 2283 } 2284 2285 static void 2286 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2287 { 2288 int rc; 2289 2290 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2291 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2292 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2293 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2294 &bdev_io->internal.waitq_entry); 2295 if (rc != 0) { 2296 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2297 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2298 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2299 } 2300 } 2301 2302 static bool 2303 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2304 { 2305 uint32_t io_boundary; 2306 struct spdk_bdev *bdev = bdev_io->bdev; 2307 uint32_t max_size = bdev->max_segment_size; 2308 int max_segs = bdev->max_num_segments; 2309 2310 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2311 io_boundary = bdev->write_unit_size; 2312 } else if (bdev->split_on_optimal_io_boundary) { 2313 io_boundary = bdev->optimal_io_boundary; 2314 } else { 2315 io_boundary = 0; 2316 } 2317 2318 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2319 return false; 2320 } 2321 2322 if (io_boundary) { 2323 uint64_t start_stripe, end_stripe; 2324 2325 start_stripe = bdev_io->u.bdev.offset_blocks; 2326 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2327 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
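 *
 * Worked example of the boundary check below: with io_boundary = 128 blocks,
 * an I/O at offset_blocks = 120 with num_blocks = 16 gives
 * start_stripe = 120 >> 7 = 0 and end_stripe = (120 + 16 - 1) >> 7 = 1, so the
 * request crosses a boundary and must be split; the same 16-block I/O starting
 * at offset 128 gives start_stripe = end_stripe = 1 and is passed through
 * whole.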
*/ 2328 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2329 start_stripe >>= spdk_u32log2(io_boundary); 2330 end_stripe >>= spdk_u32log2(io_boundary); 2331 } else { 2332 start_stripe /= io_boundary; 2333 end_stripe /= io_boundary; 2334 } 2335 2336 if (start_stripe != end_stripe) { 2337 return true; 2338 } 2339 } 2340 2341 if (max_segs) { 2342 if (bdev_io->u.bdev.iovcnt > max_segs) { 2343 return true; 2344 } 2345 } 2346 2347 if (max_size) { 2348 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2349 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2350 return true; 2351 } 2352 } 2353 } 2354 2355 return false; 2356 } 2357 2358 static bool 2359 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2360 { 2361 uint32_t num_unmap_segments; 2362 2363 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2364 return false; 2365 } 2366 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2367 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2368 return true; 2369 } 2370 2371 return false; 2372 } 2373 2374 static bool 2375 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2376 { 2377 if (!bdev_io->bdev->max_write_zeroes) { 2378 return false; 2379 } 2380 2381 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2382 return true; 2383 } 2384 2385 return false; 2386 } 2387 2388 static bool 2389 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2390 { 2391 if (bdev_io->bdev->max_copy != 0 && 2392 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2393 return true; 2394 } 2395 2396 return false; 2397 } 2398 2399 static bool 2400 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2401 { 2402 switch (bdev_io->type) { 2403 case SPDK_BDEV_IO_TYPE_READ: 2404 case SPDK_BDEV_IO_TYPE_WRITE: 2405 return bdev_rw_should_split(bdev_io); 2406 case SPDK_BDEV_IO_TYPE_UNMAP: 2407 return bdev_unmap_should_split(bdev_io); 2408 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2409 return bdev_write_zeroes_should_split(bdev_io); 2410 case SPDK_BDEV_IO_TYPE_COPY: 2411 return bdev_copy_should_split(bdev_io); 2412 default: 2413 return false; 2414 } 2415 } 2416 2417 static uint32_t 2418 _to_next_boundary(uint64_t offset, uint32_t boundary) 2419 { 2420 return (boundary - (offset % boundary)); 2421 } 2422 2423 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2424 2425 static void _bdev_rw_split(void *_bdev_io); 2426 2427 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2428 2429 static void 2430 _bdev_unmap_split(void *_bdev_io) 2431 { 2432 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2433 } 2434 2435 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2436 2437 static void 2438 _bdev_write_zeroes_split(void *_bdev_io) 2439 { 2440 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2441 } 2442 2443 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 2444 2445 static void 2446 _bdev_copy_split(void *_bdev_io) 2447 { 2448 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 2449 } 2450 2451 static int 2452 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2453 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2454 { 2455 int rc; 2456 uint64_t current_offset, current_remaining, current_src_offset; 2457 spdk_bdev_io_wait_cb io_wait_fn; 2458 2459 current_offset = *offset; 2460 current_remaining = *remaining; 2461 2462 bdev_io->u.bdev.split_outstanding++; 2463 2464 io_wait_fn = 
_bdev_rw_split; 2465 switch (bdev_io->type) { 2466 case SPDK_BDEV_IO_TYPE_READ: 2467 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2468 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2469 iov, iovcnt, md_buf, current_offset, 2470 num_blocks, 2471 bdev_io_split_done, bdev_io, 2472 bdev_io->internal.ext_opts, true); 2473 break; 2474 case SPDK_BDEV_IO_TYPE_WRITE: 2475 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2476 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2477 iov, iovcnt, md_buf, current_offset, 2478 num_blocks, 2479 bdev_io_split_done, bdev_io, 2480 bdev_io->internal.ext_opts, true); 2481 break; 2482 case SPDK_BDEV_IO_TYPE_UNMAP: 2483 io_wait_fn = _bdev_unmap_split; 2484 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2485 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2486 current_offset, num_blocks, 2487 bdev_io_split_done, bdev_io); 2488 break; 2489 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2490 io_wait_fn = _bdev_write_zeroes_split; 2491 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2492 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2493 current_offset, num_blocks, 2494 bdev_io_split_done, bdev_io); 2495 break; 2496 case SPDK_BDEV_IO_TYPE_COPY: 2497 io_wait_fn = _bdev_copy_split; 2498 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 2499 (current_offset - bdev_io->u.bdev.offset_blocks); 2500 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 2501 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2502 current_offset, current_src_offset, num_blocks, 2503 bdev_io_split_done, bdev_io); 2504 break; 2505 default: 2506 assert(false); 2507 rc = -EINVAL; 2508 break; 2509 } 2510 2511 if (rc == 0) { 2512 current_offset += num_blocks; 2513 current_remaining -= num_blocks; 2514 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2515 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2516 *offset = current_offset; 2517 *remaining = current_remaining; 2518 } else { 2519 bdev_io->u.bdev.split_outstanding--; 2520 if (rc == -ENOMEM) { 2521 if (bdev_io->u.bdev.split_outstanding == 0) { 2522 /* No I/O is outstanding. Hence we should wait here. */ 2523 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2524 } 2525 } else { 2526 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2527 if (bdev_io->u.bdev.split_outstanding == 0) { 2528 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2529 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2530 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2531 } 2532 } 2533 } 2534 2535 return rc; 2536 } 2537 2538 static void 2539 _bdev_rw_split(void *_bdev_io) 2540 { 2541 struct iovec *parent_iov, *iov; 2542 struct spdk_bdev_io *bdev_io = _bdev_io; 2543 struct spdk_bdev *bdev = bdev_io->bdev; 2544 uint64_t parent_offset, current_offset, remaining; 2545 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2546 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2547 uint32_t iovcnt, iov_len, child_iovsize; 2548 uint32_t blocklen = bdev->blocklen; 2549 uint32_t io_boundary; 2550 uint32_t max_segment_size = bdev->max_segment_size; 2551 uint32_t max_child_iovcnt = bdev->max_num_segments; 2552 void *md_buf = NULL; 2553 int rc; 2554 2555 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 2556 max_child_iovcnt = max_child_iovcnt ? 
spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 2557 SPDK_BDEV_IO_NUM_CHILD_IOV; 2558 2559 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2560 io_boundary = bdev->write_unit_size; 2561 } else if (bdev->split_on_optimal_io_boundary) { 2562 io_boundary = bdev->optimal_io_boundary; 2563 } else { 2564 io_boundary = UINT32_MAX; 2565 } 2566 2567 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2568 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 2569 parent_offset = bdev_io->u.bdev.offset_blocks; 2570 parent_iov_offset = (current_offset - parent_offset) * blocklen; 2571 parent_iovcnt = bdev_io->u.bdev.iovcnt; 2572 2573 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 2574 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2575 if (parent_iov_offset < parent_iov->iov_len) { 2576 break; 2577 } 2578 parent_iov_offset -= parent_iov->iov_len; 2579 } 2580 2581 child_iovcnt = 0; 2582 while (remaining > 0 && parent_iovpos < parent_iovcnt && 2583 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 2584 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 2585 to_next_boundary = spdk_min(remaining, to_next_boundary); 2586 to_next_boundary_bytes = to_next_boundary * blocklen; 2587 2588 iov = &bdev_io->child_iov[child_iovcnt]; 2589 iovcnt = 0; 2590 2591 if (bdev_io->u.bdev.md_buf) { 2592 md_buf = (char *)bdev_io->u.bdev.md_buf + 2593 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 2594 } 2595 2596 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 2597 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 2598 iovcnt < child_iovsize) { 2599 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2600 iov_len = parent_iov->iov_len - parent_iov_offset; 2601 2602 iov_len = spdk_min(iov_len, max_segment_size); 2603 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 2604 to_next_boundary_bytes -= iov_len; 2605 2606 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 2607 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 2608 2609 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 2610 parent_iov_offset += iov_len; 2611 } else { 2612 parent_iovpos++; 2613 parent_iov_offset = 0; 2614 } 2615 child_iovcnt++; 2616 iovcnt++; 2617 } 2618 2619 if (to_next_boundary_bytes > 0) { 2620 /* We had to stop this child I/O early because we ran out of 2621 * child_iov space or were limited by max_num_segments. 2622 * Ensure the iovs to be aligned with block size and 2623 * then adjust to_next_boundary before starting the 2624 * child I/O. 2625 */ 2626 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || 2627 iovcnt == child_iovsize); 2628 to_last_block_bytes = to_next_boundary_bytes % blocklen; 2629 if (to_last_block_bytes != 0) { 2630 uint32_t child_iovpos = child_iovcnt - 1; 2631 /* don't decrease child_iovcnt when it equals to SPDK_BDEV_IO_NUM_CHILD_IOV 2632 * so the loop will naturally end 2633 */ 2634 2635 to_last_block_bytes = blocklen - to_last_block_bytes; 2636 to_next_boundary_bytes += to_last_block_bytes; 2637 while (to_last_block_bytes > 0 && iovcnt > 0) { 2638 iov_len = spdk_min(to_last_block_bytes, 2639 bdev_io->child_iov[child_iovpos].iov_len); 2640 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 2641 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 2642 child_iovpos--; 2643 if (--iovcnt == 0) { 2644 /* If the child IO is less than a block size just return. 
2645 * If the first child IO of any split round is less than 2646 * a block size, an error exit. 2647 */ 2648 if (bdev_io->u.bdev.split_outstanding == 0) { 2649 SPDK_ERRLOG("The first child io was less than a block size\n"); 2650 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2651 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2652 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2653 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2654 } 2655 2656 return; 2657 } 2658 } 2659 2660 to_last_block_bytes -= iov_len; 2661 2662 if (parent_iov_offset == 0) { 2663 parent_iovpos--; 2664 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 2665 } 2666 parent_iov_offset -= iov_len; 2667 } 2668 2669 assert(to_last_block_bytes == 0); 2670 } 2671 to_next_boundary -= to_next_boundary_bytes / blocklen; 2672 } 2673 2674 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 2675 &current_offset, &remaining); 2676 if (spdk_unlikely(rc)) { 2677 return; 2678 } 2679 } 2680 } 2681 2682 static void 2683 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 2684 { 2685 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 2686 uint32_t num_children_reqs = 0; 2687 int rc; 2688 2689 offset = bdev_io->u.bdev.split_current_offset_blocks; 2690 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2691 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 2692 2693 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2694 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 2695 2696 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 2697 &offset, &remaining); 2698 if (spdk_likely(rc == 0)) { 2699 num_children_reqs++; 2700 } else { 2701 return; 2702 } 2703 } 2704 } 2705 2706 static void 2707 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 2708 { 2709 uint64_t offset, write_zeroes_blocks, remaining; 2710 uint32_t num_children_reqs = 0; 2711 int rc; 2712 2713 offset = bdev_io->u.bdev.split_current_offset_blocks; 2714 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2715 2716 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2717 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 2718 2719 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 2720 &offset, &remaining); 2721 if (spdk_likely(rc == 0)) { 2722 num_children_reqs++; 2723 } else { 2724 return; 2725 } 2726 } 2727 } 2728 2729 static void 2730 bdev_copy_split(struct spdk_bdev_io *bdev_io) 2731 { 2732 uint64_t offset, copy_blocks, remaining; 2733 uint32_t num_children_reqs = 0; 2734 int rc; 2735 2736 offset = bdev_io->u.bdev.split_current_offset_blocks; 2737 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2738 2739 assert(bdev_io->bdev->max_copy != 0); 2740 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 2741 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 2742 2743 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 2744 &offset, &remaining); 2745 if (spdk_likely(rc == 0)) { 2746 num_children_reqs++; 2747 } else { 2748 return; 2749 } 2750 } 2751 } 2752 2753 static void 2754 parent_bdev_io_complete(void *ctx, int rc) 2755 { 2756 struct spdk_bdev_io *parent_io = ctx; 2757 2758 if (rc) { 2759 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2760 } 2761 2762 parent_io->internal.cb(parent_io,
parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 2763 parent_io->internal.caller_ctx); 2764 } 2765 2766 static void 2767 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 2768 { 2769 struct spdk_bdev_io *parent_io = cb_arg; 2770 2771 spdk_bdev_free_io(bdev_io); 2772 2773 if (!success) { 2774 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2775 /* If any child I/O failed, stop further splitting process. */ 2776 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 2777 parent_io->u.bdev.split_remaining_num_blocks = 0; 2778 } 2779 parent_io->u.bdev.split_outstanding--; 2780 if (parent_io->u.bdev.split_outstanding != 0) { 2781 return; 2782 } 2783 2784 /* 2785 * Parent I/O finishes when all blocks are consumed. 2786 */ 2787 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 2788 assert(parent_io->internal.cb != bdev_io_split_done); 2789 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 2790 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 2791 2792 if (parent_io->internal.orig_iovcnt != 0) { 2793 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 2794 /* bdev IO will be completed in the callback */ 2795 } else { 2796 parent_bdev_io_complete(parent_io, 0); 2797 } 2798 return; 2799 } 2800 2801 /* 2802 * Continue with the splitting process. This function will complete the parent I/O if the 2803 * splitting is done. 2804 */ 2805 switch (parent_io->type) { 2806 case SPDK_BDEV_IO_TYPE_READ: 2807 case SPDK_BDEV_IO_TYPE_WRITE: 2808 _bdev_rw_split(parent_io); 2809 break; 2810 case SPDK_BDEV_IO_TYPE_UNMAP: 2811 bdev_unmap_split(parent_io); 2812 break; 2813 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2814 bdev_write_zeroes_split(parent_io); 2815 break; 2816 case SPDK_BDEV_IO_TYPE_COPY: 2817 bdev_copy_split(parent_io); 2818 break; 2819 default: 2820 assert(false); 2821 break; 2822 } 2823 } 2824 2825 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2826 bool success); 2827 2828 static void 2829 bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 2830 { 2831 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 2832 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 2833 bdev_io->u.bdev.split_outstanding = 0; 2834 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 2835 2836 switch (bdev_io->type) { 2837 case SPDK_BDEV_IO_TYPE_READ: 2838 case SPDK_BDEV_IO_TYPE_WRITE: 2839 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 2840 _bdev_rw_split(bdev_io); 2841 } else { 2842 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 2843 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 2844 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 2845 } 2846 break; 2847 case SPDK_BDEV_IO_TYPE_UNMAP: 2848 bdev_unmap_split(bdev_io); 2849 break; 2850 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2851 bdev_write_zeroes_split(bdev_io); 2852 break; 2853 case SPDK_BDEV_IO_TYPE_COPY: 2854 bdev_copy_split(bdev_io); 2855 break; 2856 default: 2857 assert(false); 2858 break; 2859 } 2860 } 2861 2862 static void 2863 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 2864 { 2865 if (!success) { 2866 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2867 return; 2868 } 2869 2870 _bdev_rw_split(bdev_io); 2871 } 2872 2873 /* Explicitly mark this inline, since it's used as a function pointer and 
otherwise won't 2874 * be inlined, at least on some compilers. 2875 */ 2876 static inline void 2877 _bdev_io_submit(void *ctx) 2878 { 2879 struct spdk_bdev_io *bdev_io = ctx; 2880 struct spdk_bdev *bdev = bdev_io->bdev; 2881 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2882 2883 if (spdk_likely(bdev_ch->flags == 0)) { 2884 bdev_io_do_submit(bdev_ch, bdev_io); 2885 return; 2886 } 2887 2888 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 2889 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 2890 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 2891 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 2892 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 2893 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2894 } else { 2895 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 2896 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 2897 } 2898 } else { 2899 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 2900 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2901 } 2902 } 2903 2904 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 2905 2906 bool 2907 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 2908 { 2909 if (range1->length == 0 || range2->length == 0) { 2910 return false; 2911 } 2912 2913 if (range1->offset + range1->length <= range2->offset) { 2914 return false; 2915 } 2916 2917 if (range2->offset + range2->length <= range1->offset) { 2918 return false; 2919 } 2920 2921 return true; 2922 } 2923 2924 static bool 2925 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 2926 { 2927 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2928 struct lba_range r; 2929 2930 switch (bdev_io->type) { 2931 case SPDK_BDEV_IO_TYPE_NVME_IO: 2932 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2933 /* Don't try to decode the NVMe command - just assume worst-case and that 2934 * it overlaps a locked range. 2935 */ 2936 return true; 2937 case SPDK_BDEV_IO_TYPE_WRITE: 2938 case SPDK_BDEV_IO_TYPE_UNMAP: 2939 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2940 case SPDK_BDEV_IO_TYPE_ZCOPY: 2941 case SPDK_BDEV_IO_TYPE_COPY: 2942 r.offset = bdev_io->u.bdev.offset_blocks; 2943 r.length = bdev_io->u.bdev.num_blocks; 2944 if (!bdev_lba_range_overlapped(range, &r)) { 2945 /* This I/O doesn't overlap the specified LBA range. */ 2946 return false; 2947 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 2948 /* This I/O overlaps, but the I/O is on the same channel that locked this 2949 * range, and the caller_ctx is the same as the locked_ctx. This means 2950 * that this I/O is associated with the lock, and is allowed to execute. 
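 *
 * For example, with a locked range of offset = 100, length = 50 (blocks
 * 100-149): a write to blocks 140-159 overlaps and, unless it was submitted on
 * the owning channel with the matching locked_ctx, is held on the channel's
 * io_locked list by bdev_io_submit() below; a write to blocks 150-159 does not
 * overlap (150 >= 100 + 50) and is submitted normally.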
2951 */ 2952 return false; 2953 } else { 2954 return true; 2955 } 2956 default: 2957 return false; 2958 } 2959 } 2960 2961 void 2962 bdev_io_submit(struct spdk_bdev_io *bdev_io) 2963 { 2964 struct spdk_bdev *bdev = bdev_io->bdev; 2965 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 2966 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2967 2968 assert(thread != NULL); 2969 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2970 2971 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 2972 struct lba_range *range; 2973 2974 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 2975 if (bdev_io_range_is_locked(bdev_io, range)) { 2976 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 2977 return; 2978 } 2979 } 2980 } 2981 2982 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 2983 2984 bdev_io->internal.submit_tsc = spdk_get_ticks(); 2985 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 2986 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 2987 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 2988 spdk_bdev_get_name(bdev)); 2989 2990 if (bdev_io_should_split(bdev_io)) { 2991 bdev_io_split(NULL, bdev_io); 2992 return; 2993 } 2994 2995 if (ch->flags & BDEV_CH_QOS_ENABLED) { 2996 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 2997 _bdev_io_submit(bdev_io); 2998 } else { 2999 bdev_io->internal.io_submit_ch = ch; 3000 bdev_io->internal.ch = bdev->internal.qos->ch; 3001 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 3002 } 3003 } else { 3004 _bdev_io_submit(bdev_io); 3005 } 3006 } 3007 3008 static inline void 3009 _bdev_io_copy_ext_opts(struct spdk_bdev_io *bdev_io, struct spdk_bdev_ext_io_opts *opts) 3010 { 3011 struct spdk_bdev_ext_io_opts *opts_copy = &bdev_io->internal.ext_opts_copy; 3012 3013 /* Zero part we don't copy */ 3014 memset(((char *)opts_copy) + opts->size, 0, sizeof(*opts) - opts->size); 3015 memcpy(opts_copy, opts, opts->size); 3016 opts_copy->size = sizeof(*opts_copy); 3017 opts_copy->metadata = bdev_io->u.bdev.md_buf; 3018 /* Save pointer to the copied ext_opts which will be used by bdev modules */ 3019 bdev_io->u.bdev.ext_opts = opts_copy; 3020 } 3021 3022 static inline void 3023 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 3024 { 3025 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 3026 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3027 * For write operation we need to pull buffers from memory domain before submitting IO. 
3028 * Once read operation completes, we need to use memory_domain push functionality to 3029 * update data in original memory domain IO buffer 3030 * This IO request will go through a regular IO flow, so clear memory domains pointers in 3031 * the copied ext_opts */ 3032 bdev_io->internal.ext_opts_copy.memory_domain = NULL; 3033 bdev_io->internal.ext_opts_copy.memory_domain_ctx = NULL; 3034 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3035 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3036 } 3037 3038 static inline void 3039 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io, 3040 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 3041 { 3042 if (opts) { 3043 bool use_pull_push = opts->memory_domain && !desc->memory_domains_supported; 3044 assert(opts->size <= sizeof(*opts)); 3045 /* 3046 * copy if size is smaller than opts struct to avoid having to check size 3047 * on every access to bdev_io->u.bdev.ext_opts 3048 */ 3049 if (copy_opts || use_pull_push || opts->size < sizeof(*opts)) { 3050 _bdev_io_copy_ext_opts(bdev_io, opts); 3051 if (use_pull_push) { 3052 _bdev_io_ext_use_bounce_buffer(bdev_io); 3053 return; 3054 } 3055 } 3056 } 3057 bdev_io_submit(bdev_io); 3058 } 3059 3060 static void 3061 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3062 { 3063 struct spdk_bdev *bdev = bdev_io->bdev; 3064 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3065 struct spdk_io_channel *ch = bdev_ch->channel; 3066 3067 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3068 3069 bdev_io->internal.in_submit_request = true; 3070 bdev->fn_table->submit_request(ch, bdev_io); 3071 bdev_io->internal.in_submit_request = false; 3072 } 3073 3074 void 3075 bdev_io_init(struct spdk_bdev_io *bdev_io, 3076 struct spdk_bdev *bdev, void *cb_arg, 3077 spdk_bdev_io_completion_cb cb) 3078 { 3079 bdev_io->bdev = bdev; 3080 bdev_io->internal.caller_ctx = cb_arg; 3081 bdev_io->internal.cb = cb; 3082 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3083 bdev_io->internal.in_submit_request = false; 3084 bdev_io->internal.buf = NULL; 3085 bdev_io->internal.io_submit_ch = NULL; 3086 bdev_io->internal.orig_iovs = NULL; 3087 bdev_io->internal.orig_iovcnt = 0; 3088 bdev_io->internal.orig_md_iov.iov_base = NULL; 3089 bdev_io->internal.error.nvme.cdw0 = 0; 3090 bdev_io->num_retries = 0; 3091 bdev_io->internal.get_buf_cb = NULL; 3092 bdev_io->internal.get_aux_buf_cb = NULL; 3093 bdev_io->internal.ext_opts = NULL; 3094 bdev_io->internal.data_transfer_cpl = NULL; 3095 } 3096 3097 static bool 3098 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3099 { 3100 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3101 } 3102 3103 bool 3104 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3105 { 3106 bool supported; 3107 3108 supported = bdev_io_type_supported(bdev, io_type); 3109 3110 if (!supported) { 3111 switch (io_type) { 3112 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3113 /* The bdev layer will emulate write zeroes as long as write is supported. 
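 *
 * Callers typically probe support before picking a code path. For example, a
 * caller that wants to deallocate a block range can test for UNMAP and fall
 * back to write-zeroes, which this emulation keeps available whenever plain
 * writes are (bdev, desc, ch, offset_blocks, num_blocks, cb and cb_arg are
 * assumed to exist in the caller's scope):
 *
 *   if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
 *       rc = spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks,
 *                                   cb, cb_arg);
 *   } else {
 *       rc = spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks,
 *                                          cb, cb_arg);
 *   }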
*/ 3114 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3115 break; 3116 default: 3117 break; 3118 } 3119 } 3120 3121 return supported; 3122 } 3123 3124 uint64_t 3125 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3126 { 3127 return bdev_io->internal.submit_tsc; 3128 } 3129 3130 int 3131 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3132 { 3133 if (bdev->fn_table->dump_info_json) { 3134 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3135 } 3136 3137 return 0; 3138 } 3139 3140 static void 3141 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3142 { 3143 uint32_t max_per_timeslice = 0; 3144 int i; 3145 3146 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3147 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3148 qos->rate_limits[i].max_per_timeslice = 0; 3149 continue; 3150 } 3151 3152 max_per_timeslice = qos->rate_limits[i].limit * 3153 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3154 3155 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3156 qos->rate_limits[i].min_per_timeslice); 3157 3158 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 3159 } 3160 3161 bdev_qos_set_ops(qos); 3162 } 3163 3164 static int 3165 bdev_channel_poll_qos(void *arg) 3166 { 3167 struct spdk_bdev_qos *qos = arg; 3168 uint64_t now = spdk_get_ticks(); 3169 int i; 3170 3171 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3172 /* We received our callback earlier than expected - return 3173 * immediately and wait to do accounting until at least one 3174 * timeslice has actually expired. This should never happen 3175 * with a well-behaved timer implementation. 3176 */ 3177 return SPDK_POLLER_IDLE; 3178 } 3179 3180 /* Reset for next round of rate limiting */ 3181 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3182 /* We may have allowed the IOs or bytes to slightly overrun in the last 3183 * timeslice. remaining_this_timeslice is signed, so if it's negative 3184 * here, we'll account for the overrun so that the next timeslice will 3185 * be appropriately reduced. 
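 *
 * Worked example: with a byte limit of 10 * 1024 * 1024 bytes/sec and the
 * 1000 usec timeslice used here, max_per_timeslice = 10485760 * 1000 / 1000000
 * = 10485 bytes. If a 131072-byte write is admitted while 10485 bytes remain,
 * remaining_this_timeslice drops to 10485 - 131072 = -120587; each elapsed
 * timeslice below adds 10485 back, so the deficit is repaid after roughly
 * twelve timeslices before new I/O is admitted again.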
3186 */ 3187 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 3188 qos->rate_limits[i].remaining_this_timeslice = 0; 3189 } 3190 } 3191 3192 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3193 qos->last_timeslice += qos->timeslice_size; 3194 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3195 qos->rate_limits[i].remaining_this_timeslice += 3196 qos->rate_limits[i].max_per_timeslice; 3197 } 3198 } 3199 3200 return bdev_qos_io_submit(qos->ch, qos); 3201 } 3202 3203 static void 3204 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3205 { 3206 struct spdk_bdev_shared_resource *shared_resource; 3207 struct lba_range *range; 3208 3209 bdev_free_io_stat(ch->stat); 3210 #ifdef SPDK_CONFIG_VTUNE 3211 bdev_free_io_stat(ch->prev_stat); 3212 #endif 3213 3214 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3215 range = TAILQ_FIRST(&ch->locked_ranges); 3216 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3217 free(range); 3218 } 3219 3220 spdk_put_io_channel(ch->channel); 3221 3222 shared_resource = ch->shared_resource; 3223 3224 assert(TAILQ_EMPTY(&ch->io_locked)); 3225 assert(TAILQ_EMPTY(&ch->io_submitted)); 3226 assert(ch->io_outstanding == 0); 3227 assert(shared_resource->ref > 0); 3228 shared_resource->ref--; 3229 if (shared_resource->ref == 0) { 3230 assert(shared_resource->io_outstanding == 0); 3231 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3232 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3233 free(shared_resource); 3234 } 3235 } 3236 3237 static void 3238 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3239 { 3240 struct spdk_bdev_qos *qos = bdev->internal.qos; 3241 int i; 3242 3243 assert(spdk_spin_held(&bdev->internal.spinlock)); 3244 3245 /* Rate limiting on this bdev enabled */ 3246 if (qos) { 3247 if (qos->ch == NULL) { 3248 struct spdk_io_channel *io_ch; 3249 3250 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3251 bdev->name, spdk_get_thread()); 3252 3253 /* No qos channel has been selected, so set one up */ 3254 3255 /* Take another reference to ch */ 3256 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3257 assert(io_ch != NULL); 3258 qos->ch = ch; 3259 3260 qos->thread = spdk_io_channel_get_thread(io_ch); 3261 3262 TAILQ_INIT(&qos->queued); 3263 3264 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3265 if (bdev_qos_is_iops_rate_limit(i) == true) { 3266 qos->rate_limits[i].min_per_timeslice = 3267 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3268 } else { 3269 qos->rate_limits[i].min_per_timeslice = 3270 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3271 } 3272 3273 if (qos->rate_limits[i].limit == 0) { 3274 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3275 } 3276 } 3277 bdev_qos_update_max_quota_per_timeslice(qos); 3278 qos->timeslice_size = 3279 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3280 qos->last_timeslice = spdk_get_ticks(); 3281 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3282 qos, 3283 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3284 } 3285 3286 ch->flags |= BDEV_CH_QOS_ENABLED; 3287 } 3288 } 3289 3290 struct poll_timeout_ctx { 3291 struct spdk_bdev_desc *desc; 3292 uint64_t timeout_in_sec; 3293 spdk_bdev_io_timeout_cb cb_fn; 3294 void *cb_arg; 3295 }; 3296 3297 static void 3298 bdev_desc_free(struct spdk_bdev_desc *desc) 3299 { 3300 spdk_spin_destroy(&desc->spinlock); 3301 free(desc->media_events_buffer); 3302 free(desc); 3303 } 3304 3305 static void 3306 
bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 3307 { 3308 struct poll_timeout_ctx *ctx = _ctx; 3309 struct spdk_bdev_desc *desc = ctx->desc; 3310 3311 free(ctx); 3312 3313 spdk_spin_lock(&desc->spinlock); 3314 desc->refs--; 3315 if (desc->closed == true && desc->refs == 0) { 3316 spdk_spin_unlock(&desc->spinlock); 3317 bdev_desc_free(desc); 3318 return; 3319 } 3320 spdk_spin_unlock(&desc->spinlock); 3321 } 3322 3323 static void 3324 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3325 struct spdk_io_channel *io_ch, void *_ctx) 3326 { 3327 struct poll_timeout_ctx *ctx = _ctx; 3328 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3329 struct spdk_bdev_desc *desc = ctx->desc; 3330 struct spdk_bdev_io *bdev_io; 3331 uint64_t now; 3332 3333 spdk_spin_lock(&desc->spinlock); 3334 if (desc->closed == true) { 3335 spdk_spin_unlock(&desc->spinlock); 3336 spdk_bdev_for_each_channel_continue(i, -1); 3337 return; 3338 } 3339 spdk_spin_unlock(&desc->spinlock); 3340 3341 now = spdk_get_ticks(); 3342 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3343 /* Exclude any I/O that are generated via splitting. */ 3344 if (bdev_io->internal.cb == bdev_io_split_done) { 3345 continue; 3346 } 3347 3348 /* Once we find an I/O that has not timed out, we can immediately 3349 * exit the loop. 3350 */ 3351 if (now < (bdev_io->internal.submit_tsc + 3352 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3353 goto end; 3354 } 3355 3356 if (bdev_io->internal.desc == desc) { 3357 ctx->cb_fn(ctx->cb_arg, bdev_io); 3358 } 3359 } 3360 3361 end: 3362 spdk_bdev_for_each_channel_continue(i, 0); 3363 } 3364 3365 static int 3366 bdev_poll_timeout_io(void *arg) 3367 { 3368 struct spdk_bdev_desc *desc = arg; 3369 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3370 struct poll_timeout_ctx *ctx; 3371 3372 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3373 if (!ctx) { 3374 SPDK_ERRLOG("failed to allocate memory\n"); 3375 return SPDK_POLLER_BUSY; 3376 } 3377 ctx->desc = desc; 3378 ctx->cb_arg = desc->cb_arg; 3379 ctx->cb_fn = desc->cb_fn; 3380 ctx->timeout_in_sec = desc->timeout_in_sec; 3381 3382 /* Take a ref on the descriptor in case it gets closed while we are checking 3383 * all of the channels. 
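 *
 * The desc->cb_fn invoked above for each timed-out I/O is installed with
 * spdk_bdev_set_timeout(). A hypothetical registration, run on the thread that
 * opened the descriptor (my_timeout_cb() is an illustrative name; a real
 * callback might record statistics or abort the offending I/O):
 *
 *   static void
 *   my_timeout_cb(void *cb_arg, struct spdk_bdev_io *bdev_io)
 *   {
 *       SPDK_ERRLOG("I/O type %d timed out on bdev %s\n", bdev_io->type,
 *                   spdk_bdev_get_name(bdev_io->bdev));
 *   }
 *
 *   spdk_bdev_set_timeout(desc, 30, my_timeout_cb, NULL);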
3384 */ 3385 spdk_spin_lock(&desc->spinlock); 3386 desc->refs++; 3387 spdk_spin_unlock(&desc->spinlock); 3388 3389 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 3390 bdev_channel_poll_timeout_io_done); 3391 3392 return SPDK_POLLER_BUSY; 3393 } 3394 3395 int 3396 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3397 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3398 { 3399 assert(desc->thread == spdk_get_thread()); 3400 3401 spdk_poller_unregister(&desc->io_timeout_poller); 3402 3403 if (timeout_in_sec) { 3404 assert(cb_fn != NULL); 3405 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3406 desc, 3407 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3408 1000); 3409 if (desc->io_timeout_poller == NULL) { 3410 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3411 return -1; 3412 } 3413 } 3414 3415 desc->cb_fn = cb_fn; 3416 desc->cb_arg = cb_arg; 3417 desc->timeout_in_sec = timeout_in_sec; 3418 3419 return 0; 3420 } 3421 3422 static int 3423 bdev_channel_create(void *io_device, void *ctx_buf) 3424 { 3425 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3426 struct spdk_bdev_channel *ch = ctx_buf; 3427 struct spdk_io_channel *mgmt_io_ch; 3428 struct spdk_bdev_mgmt_channel *mgmt_ch; 3429 struct spdk_bdev_shared_resource *shared_resource; 3430 struct lba_range *range; 3431 3432 ch->bdev = bdev; 3433 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3434 if (!ch->channel) { 3435 return -1; 3436 } 3437 3438 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3439 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3440 3441 assert(ch->histogram == NULL); 3442 if (bdev->internal.histogram_enabled) { 3443 ch->histogram = spdk_histogram_data_alloc(); 3444 if (ch->histogram == NULL) { 3445 SPDK_ERRLOG("Could not allocate histogram\n"); 3446 } 3447 } 3448 3449 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3450 if (!mgmt_io_ch) { 3451 spdk_put_io_channel(ch->channel); 3452 return -1; 3453 } 3454 3455 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 3456 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3457 if (shared_resource->shared_ch == ch->channel) { 3458 spdk_put_io_channel(mgmt_io_ch); 3459 shared_resource->ref++; 3460 break; 3461 } 3462 } 3463 3464 if (shared_resource == NULL) { 3465 shared_resource = calloc(1, sizeof(*shared_resource)); 3466 if (shared_resource == NULL) { 3467 spdk_put_io_channel(ch->channel); 3468 spdk_put_io_channel(mgmt_io_ch); 3469 return -1; 3470 } 3471 3472 shared_resource->mgmt_ch = mgmt_ch; 3473 shared_resource->io_outstanding = 0; 3474 TAILQ_INIT(&shared_resource->nomem_io); 3475 shared_resource->nomem_threshold = 0; 3476 shared_resource->shared_ch = ch->channel; 3477 shared_resource->ref = 1; 3478 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3479 } 3480 3481 ch->io_outstanding = 0; 3482 TAILQ_INIT(&ch->queued_resets); 3483 TAILQ_INIT(&ch->locked_ranges); 3484 ch->flags = 0; 3485 ch->shared_resource = shared_resource; 3486 3487 TAILQ_INIT(&ch->io_submitted); 3488 TAILQ_INIT(&ch->io_locked); 3489 3490 ch->stat = bdev_alloc_io_stat(false); 3491 if (ch->stat == NULL) { 3492 bdev_channel_destroy_resource(ch); 3493 return -1; 3494 } 3495 3496 ch->stat->ticks_rate = spdk_get_ticks_hz(); 3497 3498 #ifdef SPDK_CONFIG_VTUNE 3499 { 3500 char *name; 3501 __itt_init_ittlib(NULL, 0); 3502 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3503 if (!name) { 3504 bdev_channel_destroy_resource(ch); 3505 
return -1; 3506 } 3507 ch->handle = __itt_string_handle_create(name); 3508 free(name); 3509 ch->start_tsc = spdk_get_ticks(); 3510 ch->interval_tsc = spdk_get_ticks_hz() / 100; 3511 ch->prev_stat = bdev_alloc_io_stat(false); 3512 if (ch->prev_stat == NULL) { 3513 bdev_channel_destroy_resource(ch); 3514 return -1; 3515 } 3516 } 3517 #endif 3518 3519 spdk_spin_lock(&bdev->internal.spinlock); 3520 bdev_enable_qos(bdev, ch); 3521 3522 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 3523 struct lba_range *new_range; 3524 3525 new_range = calloc(1, sizeof(*new_range)); 3526 if (new_range == NULL) { 3527 spdk_spin_unlock(&bdev->internal.spinlock); 3528 bdev_channel_destroy_resource(ch); 3529 return -1; 3530 } 3531 new_range->length = range->length; 3532 new_range->offset = range->offset; 3533 new_range->locked_ctx = range->locked_ctx; 3534 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 3535 } 3536 3537 spdk_spin_unlock(&bdev->internal.spinlock); 3538 3539 return 0; 3540 } 3541 3542 static int 3543 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 3544 void *cb_ctx) 3545 { 3546 struct spdk_bdev_channel *bdev_ch = cb_ctx; 3547 struct spdk_bdev_io *bdev_io; 3548 uint64_t buf_len; 3549 3550 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 3551 if (bdev_io->internal.ch == bdev_ch) { 3552 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 3553 spdk_iobuf_entry_abort(ch, entry, buf_len); 3554 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3555 } 3556 3557 return 0; 3558 } 3559 3560 /* 3561 * Abort I/O that are waiting on a data buffer. 3562 */ 3563 static void 3564 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 3565 { 3566 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 3567 bdev_abort_all_buf_io_cb, ch); 3568 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 3569 bdev_abort_all_buf_io_cb, ch); 3570 } 3571 3572 /* 3573 * Abort I/O that are queued waiting for submission. These types of I/O are 3574 * linked using the spdk_bdev_io link TAILQ_ENTRY. 3575 */ 3576 static void 3577 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 3578 { 3579 struct spdk_bdev_io *bdev_io, *tmp; 3580 3581 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 3582 if (bdev_io->internal.ch == ch) { 3583 TAILQ_REMOVE(queue, bdev_io, internal.link); 3584 /* 3585 * spdk_bdev_io_complete() assumes that the completed I/O had 3586 * been submitted to the bdev module. Since in this case it 3587 * hadn't, bump io_outstanding to account for the decrement 3588 * that spdk_bdev_io_complete() will do. 
3589 */ 3590 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 3591 ch->io_outstanding++; 3592 ch->shared_resource->io_outstanding++; 3593 } 3594 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3595 } 3596 } 3597 } 3598 3599 static bool 3600 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3601 { 3602 struct spdk_bdev_io *bdev_io; 3603 3604 TAILQ_FOREACH(bdev_io, queue, internal.link) { 3605 if (bdev_io == bio_to_abort) { 3606 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 3607 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3608 return true; 3609 } 3610 } 3611 3612 return false; 3613 } 3614 3615 static int 3616 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 3617 { 3618 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 3619 uint64_t buf_len; 3620 3621 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 3622 if (bdev_io == bio_to_abort) { 3623 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 3624 spdk_iobuf_entry_abort(ch, entry, buf_len); 3625 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3626 return 1; 3627 } 3628 3629 return 0; 3630 } 3631 3632 static bool 3633 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 3634 { 3635 int rc; 3636 3637 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 3638 bdev_abort_buf_io_cb, bio_to_abort); 3639 if (rc == 1) { 3640 return true; 3641 } 3642 3643 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 3644 bdev_abort_buf_io_cb, bio_to_abort); 3645 return rc == 1; 3646 } 3647 3648 static void 3649 bdev_qos_channel_destroy(void *cb_arg) 3650 { 3651 struct spdk_bdev_qos *qos = cb_arg; 3652 3653 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 3654 spdk_poller_unregister(&qos->poller); 3655 3656 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 3657 3658 free(qos); 3659 } 3660 3661 static int 3662 bdev_qos_destroy(struct spdk_bdev *bdev) 3663 { 3664 int i; 3665 3666 /* 3667 * Cleanly shutting down the QoS poller is tricky, because 3668 * during the asynchronous operation the user could open 3669 * a new descriptor and create a new channel, spawning 3670 * a new QoS poller. 3671 * 3672 * The strategy is to create a new QoS structure here and swap it 3673 * in. The shutdown path then continues to refer to the old one 3674 * until it completes and then releases it. 3675 */ 3676 struct spdk_bdev_qos *new_qos, *old_qos; 3677 3678 old_qos = bdev->internal.qos; 3679 3680 new_qos = calloc(1, sizeof(*new_qos)); 3681 if (!new_qos) { 3682 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 3683 return -ENOMEM; 3684 } 3685 3686 /* Copy the old QoS data into the newly allocated structure */ 3687 memcpy(new_qos, old_qos, sizeof(*new_qos)); 3688 3689 /* Zero out the key parts of the QoS structure */ 3690 new_qos->ch = NULL; 3691 new_qos->thread = NULL; 3692 new_qos->poller = NULL; 3693 TAILQ_INIT(&new_qos->queued); 3694 /* 3695 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 3696 * It will be used later for the new QoS structure. 
3697 */ 3698 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3699 new_qos->rate_limits[i].remaining_this_timeslice = 0; 3700 new_qos->rate_limits[i].min_per_timeslice = 0; 3701 new_qos->rate_limits[i].max_per_timeslice = 0; 3702 } 3703 3704 bdev->internal.qos = new_qos; 3705 3706 if (old_qos->thread == NULL) { 3707 free(old_qos); 3708 } else { 3709 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 3710 } 3711 3712 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 3713 * been destroyed yet. The destruction path will end up waiting for the final 3714 * channel to be put before it releases resources. */ 3715 3716 return 0; 3717 } 3718 3719 void 3720 spdk_bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 3721 { 3722 total->bytes_read += add->bytes_read; 3723 total->num_read_ops += add->num_read_ops; 3724 total->bytes_written += add->bytes_written; 3725 total->num_write_ops += add->num_write_ops; 3726 total->bytes_unmapped += add->bytes_unmapped; 3727 total->num_unmap_ops += add->num_unmap_ops; 3728 total->bytes_copied += add->bytes_copied; 3729 total->num_copy_ops += add->num_copy_ops; 3730 total->read_latency_ticks += add->read_latency_ticks; 3731 total->write_latency_ticks += add->write_latency_ticks; 3732 total->unmap_latency_ticks += add->unmap_latency_ticks; 3733 total->copy_latency_ticks += add->copy_latency_ticks; 3734 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 3735 total->max_read_latency_ticks = add->max_read_latency_ticks; 3736 } 3737 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 3738 total->min_read_latency_ticks = add->min_read_latency_ticks; 3739 } 3740 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 3741 total->max_write_latency_ticks = add->max_write_latency_ticks; 3742 } 3743 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 3744 total->min_write_latency_ticks = add->min_write_latency_ticks; 3745 } 3746 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 3747 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 3748 } 3749 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 3750 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 3751 } 3752 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 3753 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 3754 } 3755 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 3756 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 3757 } 3758 } 3759 3760 static void 3761 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 3762 { 3763 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 3764 3765 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 3766 memcpy(to_stat->io_error, from_stat->io_error, 3767 sizeof(struct spdk_bdev_io_error_stat)); 3768 } 3769 } 3770 3771 void 3772 spdk_bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum spdk_bdev_reset_stat_mode mode) 3773 { 3774 stat->max_read_latency_ticks = 0; 3775 stat->min_read_latency_ticks = UINT64_MAX; 3776 stat->max_write_latency_ticks = 0; 3777 stat->min_write_latency_ticks = UINT64_MAX; 3778 stat->max_unmap_latency_ticks = 0; 3779 stat->min_unmap_latency_ticks = UINT64_MAX; 3780 stat->max_copy_latency_ticks = 0; 3781 stat->min_copy_latency_ticks = UINT64_MAX; 3782 3783 if (mode != SPDK_BDEV_RESET_STAT_ALL) { 3784 return; 3785 } 
3786 3787 stat->bytes_read = 0; 3788 stat->num_read_ops = 0; 3789 stat->bytes_written = 0; 3790 stat->num_write_ops = 0; 3791 stat->bytes_unmapped = 0; 3792 stat->num_unmap_ops = 0; 3793 stat->bytes_copied = 0; 3794 stat->num_copy_ops = 0; 3795 stat->read_latency_ticks = 0; 3796 stat->write_latency_ticks = 0; 3797 stat->unmap_latency_ticks = 0; 3798 stat->copy_latency_ticks = 0; 3799 3800 if (stat->io_error != NULL) { 3801 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 3802 } 3803 } 3804 3805 struct spdk_bdev_io_stat * 3806 bdev_alloc_io_stat(bool io_error_stat) 3807 { 3808 struct spdk_bdev_io_stat *stat; 3809 3810 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 3811 if (stat == NULL) { 3812 return NULL; 3813 } 3814 3815 if (io_error_stat) { 3816 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 3817 if (stat->io_error == NULL) { 3818 free(stat); 3819 return NULL; 3820 } 3821 } else { 3822 stat->io_error = NULL; 3823 } 3824 3825 spdk_bdev_reset_io_stat(stat, SPDK_BDEV_RESET_STAT_ALL); 3826 3827 return stat; 3828 } 3829 3830 void 3831 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 3832 { 3833 if (stat != NULL) { 3834 free(stat->io_error); 3835 free(stat); 3836 } 3837 } 3838 3839 void 3840 spdk_bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 3841 { 3842 int i; 3843 3844 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 3845 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 3846 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 3847 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 3848 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 3849 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 3850 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 3851 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 3852 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 3853 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 3854 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 3855 stat->min_read_latency_ticks != UINT64_MAX ? 3856 stat->min_read_latency_ticks : 0); 3857 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 3858 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 3859 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 3860 stat->min_write_latency_ticks != UINT64_MAX ? 3861 stat->min_write_latency_ticks : 0); 3862 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 3863 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 3864 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 3865 stat->min_unmap_latency_ticks != UINT64_MAX ? 3866 stat->min_unmap_latency_ticks : 0); 3867 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 3868 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 3869 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 3870 stat->min_copy_latency_ticks != UINT64_MAX ? 
				     stat->min_copy_latency_ticks : 0);

	if (stat->io_error != NULL) {
		spdk_json_write_named_object_begin(w, "io_error");
		for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) {
			if (stat->io_error->error_status[i] != 0) {
				spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)),
							     stat->io_error->error_status[i]);
			}
		}
		spdk_json_write_object_end(w);
	}
}

static void
bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch)
{
	struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource;
	struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch;

	bdev_abort_all_queued_io(&shared_resource->nomem_io, ch);
	/* bdev_abort_all_buf_io() walks both the small and large iobuf queues,
	 * so a single call aborts every buffer waiter owned by this channel.
	 */
	bdev_abort_all_buf_io(mgmt_ch, ch);
}

static void
bdev_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_channel *ch = ctx_buf;

	SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name,
		      spdk_get_thread());

	spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name,
			  spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel)));

	/* This channel is going away, so add its statistics into the bdev so that they don't get lost. */
	spdk_spin_lock(&ch->bdev->internal.spinlock);
	spdk_bdev_add_io_stat(ch->bdev->internal.stat, ch->stat);
	spdk_spin_unlock(&ch->bdev->internal.spinlock);

	bdev_abort_all_queued_io(&ch->queued_resets, ch);

	bdev_channel_abort_queued_ios(ch);

	if (ch->histogram) {
		spdk_histogram_data_free(ch->histogram);
	}

	bdev_channel_destroy_resource(ch);
}

/*
 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer
 * to it. Hence we do not have to call bdev_get_by_name() when using this function.
3926 */ 3927 static int 3928 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 3929 { 3930 struct spdk_bdev_name *tmp; 3931 3932 bdev_name->name = strdup(name); 3933 if (bdev_name->name == NULL) { 3934 SPDK_ERRLOG("Unable to allocate bdev name\n"); 3935 return -ENOMEM; 3936 } 3937 3938 bdev_name->bdev = bdev; 3939 3940 spdk_spin_lock(&g_bdev_mgr.spinlock); 3941 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3942 spdk_spin_unlock(&g_bdev_mgr.spinlock); 3943 3944 if (tmp != NULL) { 3945 SPDK_ERRLOG("Bdev name %s already exists\n", name); 3946 free(bdev_name->name); 3947 return -EEXIST; 3948 } 3949 3950 return 0; 3951 } 3952 3953 static void 3954 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 3955 { 3956 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3957 free(bdev_name->name); 3958 } 3959 3960 static void 3961 bdev_name_del(struct spdk_bdev_name *bdev_name) 3962 { 3963 spdk_spin_lock(&g_bdev_mgr.spinlock); 3964 bdev_name_del_unsafe(bdev_name); 3965 spdk_spin_unlock(&g_bdev_mgr.spinlock); 3966 } 3967 3968 int 3969 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 3970 { 3971 struct spdk_bdev_alias *tmp; 3972 int ret; 3973 3974 if (alias == NULL) { 3975 SPDK_ERRLOG("Empty alias passed\n"); 3976 return -EINVAL; 3977 } 3978 3979 tmp = calloc(1, sizeof(*tmp)); 3980 if (tmp == NULL) { 3981 SPDK_ERRLOG("Unable to allocate alias\n"); 3982 return -ENOMEM; 3983 } 3984 3985 ret = bdev_name_add(&tmp->alias, bdev, alias); 3986 if (ret != 0) { 3987 free(tmp); 3988 return ret; 3989 } 3990 3991 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 3992 3993 return 0; 3994 } 3995 3996 static int 3997 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 3998 void (*alias_del_fn)(struct spdk_bdev_name *n)) 3999 { 4000 struct spdk_bdev_alias *tmp; 4001 4002 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 4003 if (strcmp(alias, tmp->alias.name) == 0) { 4004 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 4005 alias_del_fn(&tmp->alias); 4006 free(tmp); 4007 return 0; 4008 } 4009 } 4010 4011 return -ENOENT; 4012 } 4013 4014 int 4015 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 4016 { 4017 int rc; 4018 4019 rc = bdev_alias_del(bdev, alias, bdev_name_del); 4020 if (rc == -ENOENT) { 4021 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 4022 } 4023 4024 return rc; 4025 } 4026 4027 void 4028 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 4029 { 4030 struct spdk_bdev_alias *p, *tmp; 4031 4032 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4033 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4034 bdev_name_del(&p->alias); 4035 free(p); 4036 } 4037 } 4038 4039 struct spdk_io_channel * 4040 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4041 { 4042 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4043 } 4044 4045 void * 4046 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4047 { 4048 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4049 void *ctx = NULL; 4050 4051 if (bdev->fn_table->get_module_ctx) { 4052 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4053 } 4054 4055 return ctx; 4056 } 4057 4058 const char * 4059 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4060 { 4061 return bdev->module->name; 4062 } 4063 4064 const char * 4065 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4066 { 4067 return bdev->name; 4068 } 4069 4070 const char * 4071 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4072 { 4073 return bdev->product_name; 4074 } 4075 4076 
const struct spdk_bdev_aliases_list * 4077 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4078 { 4079 return &bdev->aliases; 4080 } 4081 4082 uint32_t 4083 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4084 { 4085 return bdev->blocklen; 4086 } 4087 4088 uint32_t 4089 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4090 { 4091 return bdev->write_unit_size; 4092 } 4093 4094 uint64_t 4095 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4096 { 4097 return bdev->blockcnt; 4098 } 4099 4100 const char * 4101 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4102 { 4103 return qos_rpc_type[type]; 4104 } 4105 4106 void 4107 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4108 { 4109 int i; 4110 4111 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4112 4113 spdk_spin_lock(&bdev->internal.spinlock); 4114 if (bdev->internal.qos) { 4115 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4116 if (bdev->internal.qos->rate_limits[i].limit != 4117 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4118 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4119 if (bdev_qos_is_iops_rate_limit(i) == false) { 4120 /* Change from Byte to Megabyte which is user visible. */ 4121 limits[i] = limits[i] / 1024 / 1024; 4122 } 4123 } 4124 } 4125 } 4126 spdk_spin_unlock(&bdev->internal.spinlock); 4127 } 4128 4129 size_t 4130 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4131 { 4132 return 1 << bdev->required_alignment; 4133 } 4134 4135 uint32_t 4136 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4137 { 4138 return bdev->optimal_io_boundary; 4139 } 4140 4141 bool 4142 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4143 { 4144 return bdev->write_cache; 4145 } 4146 4147 const struct spdk_uuid * 4148 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4149 { 4150 return &bdev->uuid; 4151 } 4152 4153 uint16_t 4154 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4155 { 4156 return bdev->acwu; 4157 } 4158 4159 uint32_t 4160 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4161 { 4162 return bdev->md_len; 4163 } 4164 4165 bool 4166 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4167 { 4168 return (bdev->md_len != 0) && bdev->md_interleave; 4169 } 4170 4171 bool 4172 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4173 { 4174 return (bdev->md_len != 0) && !bdev->md_interleave; 4175 } 4176 4177 bool 4178 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4179 { 4180 return bdev->zoned; 4181 } 4182 4183 uint32_t 4184 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4185 { 4186 if (spdk_bdev_is_md_interleaved(bdev)) { 4187 return bdev->blocklen - bdev->md_len; 4188 } else { 4189 return bdev->blocklen; 4190 } 4191 } 4192 4193 uint32_t 4194 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4195 { 4196 return bdev->phys_blocklen; 4197 } 4198 4199 static uint32_t 4200 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4201 { 4202 if (!spdk_bdev_is_md_interleaved(bdev)) { 4203 return bdev->blocklen + bdev->md_len; 4204 } else { 4205 return bdev->blocklen; 4206 } 4207 } 4208 4209 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4210 typedef enum spdk_dif_type spdk_dif_type_t; 4211 4212 spdk_dif_type_t 4213 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4214 { 4215 if (bdev->md_len != 0) { 4216 return bdev->dif_type; 4217 } else { 4218 return SPDK_DIF_DISABLE; 4219 } 4220 } 4221 4222 bool 4223 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4224 { 4225 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4226 return bdev->dif_is_head_of_md; 4227 } else { 4228 return false; 4229 } 4230 } 4231 4232 bool 4233 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4234 enum spdk_dif_check_type check_type) 4235 { 4236 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4237 return false; 4238 } 4239 4240 switch (check_type) { 4241 case SPDK_DIF_CHECK_TYPE_REFTAG: 4242 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4243 case SPDK_DIF_CHECK_TYPE_APPTAG: 4244 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4245 case SPDK_DIF_CHECK_TYPE_GUARD: 4246 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4247 default: 4248 return false; 4249 } 4250 } 4251 4252 uint32_t 4253 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4254 { 4255 return bdev->max_copy; 4256 } 4257 4258 uint64_t 4259 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4260 { 4261 return bdev->internal.measured_queue_depth; 4262 } 4263 4264 uint64_t 4265 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 4266 { 4267 return bdev->internal.period; 4268 } 4269 4270 uint64_t 4271 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 4272 { 4273 return bdev->internal.weighted_io_time; 4274 } 4275 4276 uint64_t 4277 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 4278 { 4279 return bdev->internal.io_time; 4280 } 4281 4282 static void bdev_update_qd_sampling_period(void *ctx); 4283 4284 static void 4285 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 4286 { 4287 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 4288 4289 if (bdev->internal.measured_queue_depth) { 4290 bdev->internal.io_time += bdev->internal.period; 4291 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 4292 } 4293 4294 bdev->internal.qd_poll_in_progress = false; 4295 4296 bdev_update_qd_sampling_period(bdev); 4297 } 4298 4299 static void 4300 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4301 struct spdk_io_channel *io_ch, void *_ctx) 4302 { 4303 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 4304 4305 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4306 spdk_bdev_for_each_channel_continue(i, 0); 4307 } 4308 4309 static int 4310 bdev_calculate_measured_queue_depth(void *ctx) 4311 { 4312 struct spdk_bdev *bdev = ctx; 4313 4314 bdev->internal.qd_poll_in_progress = true; 4315 bdev->internal.temporary_queue_depth = 0; 4316 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 4317 return SPDK_POLLER_BUSY; 4318 } 4319 4320 static void 4321 bdev_update_qd_sampling_period(void *ctx) 4322 { 4323 struct spdk_bdev *bdev = ctx; 4324 4325 if (bdev->internal.period == bdev->internal.new_period) { 4326 return; 4327 } 4328 4329 if (bdev->internal.qd_poll_in_progress) { 4330 return; 4331 } 4332 4333 bdev->internal.period = bdev->internal.new_period; 4334 4335 spdk_poller_unregister(&bdev->internal.qd_poller); 4336 if (bdev->internal.period != 0) { 4337 bdev->internal.qd_poller = 
SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4338 bdev, bdev->internal.period); 4339 } else { 4340 spdk_bdev_close(bdev->internal.qd_desc); 4341 bdev->internal.qd_desc = NULL; 4342 } 4343 } 4344 4345 static void 4346 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4347 { 4348 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4349 } 4350 4351 void 4352 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4353 { 4354 int rc; 4355 4356 if (bdev->internal.new_period == period) { 4357 return; 4358 } 4359 4360 bdev->internal.new_period = period; 4361 4362 if (bdev->internal.qd_desc != NULL) { 4363 assert(bdev->internal.period != 0); 4364 4365 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4366 bdev_update_qd_sampling_period, bdev); 4367 return; 4368 } 4369 4370 assert(bdev->internal.period == 0); 4371 4372 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4373 NULL, &bdev->internal.qd_desc); 4374 if (rc != 0) { 4375 return; 4376 } 4377 4378 bdev->internal.period = period; 4379 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4380 bdev, period); 4381 } 4382 4383 struct bdev_get_current_qd_ctx { 4384 uint64_t current_qd; 4385 spdk_bdev_get_current_qd_cb cb_fn; 4386 void *cb_arg; 4387 }; 4388 4389 static void 4390 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 4391 { 4392 struct bdev_get_current_qd_ctx *ctx = _ctx; 4393 4394 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 4395 4396 free(ctx); 4397 } 4398 4399 static void 4400 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4401 struct spdk_io_channel *io_ch, void *_ctx) 4402 { 4403 struct bdev_get_current_qd_ctx *ctx = _ctx; 4404 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4405 4406 ctx->current_qd += bdev_ch->io_outstanding; 4407 4408 spdk_bdev_for_each_channel_continue(i, 0); 4409 } 4410 4411 void 4412 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 4413 void *cb_arg) 4414 { 4415 struct bdev_get_current_qd_ctx *ctx; 4416 4417 assert(cb_fn != NULL); 4418 4419 ctx = calloc(1, sizeof(*ctx)); 4420 if (ctx == NULL) { 4421 cb_fn(bdev, 0, cb_arg, -ENOMEM); 4422 return; 4423 } 4424 4425 ctx->cb_fn = cb_fn; 4426 ctx->cb_arg = cb_arg; 4427 4428 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 4429 } 4430 4431 static void 4432 _resize_notify(void *arg) 4433 { 4434 struct spdk_bdev_desc *desc = arg; 4435 4436 spdk_spin_lock(&desc->spinlock); 4437 desc->refs--; 4438 if (!desc->closed) { 4439 spdk_spin_unlock(&desc->spinlock); 4440 desc->callback.event_fn(SPDK_BDEV_EVENT_RESIZE, 4441 desc->bdev, 4442 desc->callback.ctx); 4443 return; 4444 } else if (0 == desc->refs) { 4445 /* This descriptor was closed after this resize_notify message was sent. 4446 * spdk_bdev_close() could not free the descriptor since this message was 4447 * in flight, so we free it now using bdev_desc_free(). 
4448 */ 4449 spdk_spin_unlock(&desc->spinlock); 4450 bdev_desc_free(desc); 4451 return; 4452 } 4453 spdk_spin_unlock(&desc->spinlock); 4454 } 4455 4456 int 4457 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 4458 { 4459 struct spdk_bdev_desc *desc; 4460 int ret; 4461 4462 if (size == bdev->blockcnt) { 4463 return 0; 4464 } 4465 4466 spdk_spin_lock(&bdev->internal.spinlock); 4467 4468 /* bdev has open descriptors */ 4469 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 4470 bdev->blockcnt > size) { 4471 ret = -EBUSY; 4472 } else { 4473 bdev->blockcnt = size; 4474 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 4475 spdk_spin_lock(&desc->spinlock); 4476 if (!desc->closed) { 4477 desc->refs++; 4478 spdk_thread_send_msg(desc->thread, _resize_notify, desc); 4479 } 4480 spdk_spin_unlock(&desc->spinlock); 4481 } 4482 ret = 0; 4483 } 4484 4485 spdk_spin_unlock(&bdev->internal.spinlock); 4486 4487 return ret; 4488 } 4489 4490 /* 4491 * Convert I/O offset and length from bytes to blocks. 4492 * 4493 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 4494 */ 4495 static uint64_t 4496 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 4497 uint64_t num_bytes, uint64_t *num_blocks) 4498 { 4499 uint32_t block_size = bdev->blocklen; 4500 uint8_t shift_cnt; 4501 4502 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 4503 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 4504 shift_cnt = spdk_u32log2(block_size); 4505 *offset_blocks = offset_bytes >> shift_cnt; 4506 *num_blocks = num_bytes >> shift_cnt; 4507 return (offset_bytes - (*offset_blocks << shift_cnt)) | 4508 (num_bytes - (*num_blocks << shift_cnt)); 4509 } else { 4510 *offset_blocks = offset_bytes / block_size; 4511 *num_blocks = num_bytes / block_size; 4512 return (offset_bytes % block_size) | (num_bytes % block_size); 4513 } 4514 } 4515 4516 static bool 4517 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 4518 { 4519 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 4520 * has been an overflow and hence the offset has been wrapped around */ 4521 if (offset_blocks + num_blocks < offset_blocks) { 4522 return false; 4523 } 4524 4525 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 4526 if (offset_blocks + num_blocks > bdev->blockcnt) { 4527 return false; 4528 } 4529 4530 return true; 4531 } 4532 4533 static void 4534 bdev_seek_complete_cb(void *ctx) 4535 { 4536 struct spdk_bdev_io *bdev_io = ctx; 4537 4538 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4539 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 4540 } 4541 4542 static int 4543 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4544 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 4545 spdk_bdev_io_completion_cb cb, void *cb_arg) 4546 { 4547 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4548 struct spdk_bdev_io *bdev_io; 4549 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4550 4551 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 4552 4553 /* Check if offset_blocks is valid looking at the validity of one block */ 4554 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 4555 return -EINVAL; 4556 } 4557 4558 bdev_io = bdev_channel_get_io(channel); 4559 if (!bdev_io) { 4560 return -ENOMEM; 4561 } 4562 4563 
bdev_io->internal.ch = channel; 4564 bdev_io->internal.desc = desc; 4565 bdev_io->type = io_type; 4566 bdev_io->u.bdev.offset_blocks = offset_blocks; 4567 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4568 4569 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 4570 /* In case bdev doesn't support seek to next data/hole offset, 4571 * it is assumed that only data and no holes are present */ 4572 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 4573 bdev_io->u.bdev.seek.offset = offset_blocks; 4574 } else { 4575 bdev_io->u.bdev.seek.offset = UINT64_MAX; 4576 } 4577 4578 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 4579 return 0; 4580 } 4581 4582 bdev_io_submit(bdev_io); 4583 return 0; 4584 } 4585 4586 int 4587 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4588 uint64_t offset_blocks, 4589 spdk_bdev_io_completion_cb cb, void *cb_arg) 4590 { 4591 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 4592 } 4593 4594 int 4595 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4596 uint64_t offset_blocks, 4597 spdk_bdev_io_completion_cb cb, void *cb_arg) 4598 { 4599 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 4600 } 4601 4602 uint64_t 4603 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io) 4604 { 4605 return bdev_io->u.bdev.seek.offset; 4606 } 4607 4608 static int 4609 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 4610 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4611 spdk_bdev_io_completion_cb cb, void *cb_arg) 4612 { 4613 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4614 struct spdk_bdev_io *bdev_io; 4615 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4616 4617 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4618 return -EINVAL; 4619 } 4620 4621 bdev_io = bdev_channel_get_io(channel); 4622 if (!bdev_io) { 4623 return -ENOMEM; 4624 } 4625 4626 bdev_io->internal.ch = channel; 4627 bdev_io->internal.desc = desc; 4628 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4629 bdev_io->u.bdev.iovs = &bdev_io->iov; 4630 bdev_io->u.bdev.iovs[0].iov_base = buf; 4631 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4632 bdev_io->u.bdev.iovcnt = 1; 4633 bdev_io->u.bdev.md_buf = md_buf; 4634 bdev_io->u.bdev.num_blocks = num_blocks; 4635 bdev_io->u.bdev.offset_blocks = offset_blocks; 4636 bdev_io->u.bdev.ext_opts = NULL; 4637 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4638 4639 bdev_io_submit(bdev_io); 4640 return 0; 4641 } 4642 4643 int 4644 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4645 void *buf, uint64_t offset, uint64_t nbytes, 4646 spdk_bdev_io_completion_cb cb, void *cb_arg) 4647 { 4648 uint64_t offset_blocks, num_blocks; 4649 4650 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4651 nbytes, &num_blocks) != 0) { 4652 return -EINVAL; 4653 } 4654 4655 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4656 } 4657 4658 int 4659 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4660 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4661 spdk_bdev_io_completion_cb cb, void *cb_arg) 4662 { 4663 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 4664 } 4665 4666 int 4667 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4668 void *buf, void 
*md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4669 spdk_bdev_io_completion_cb cb, void *cb_arg) 4670 { 4671 struct iovec iov = { 4672 .iov_base = buf, 4673 }; 4674 4675 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4676 return -EINVAL; 4677 } 4678 4679 if (md_buf && !_is_buf_allocated(&iov)) { 4680 return -EINVAL; 4681 } 4682 4683 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4684 cb, cb_arg); 4685 } 4686 4687 int 4688 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4689 struct iovec *iov, int iovcnt, 4690 uint64_t offset, uint64_t nbytes, 4691 spdk_bdev_io_completion_cb cb, void *cb_arg) 4692 { 4693 uint64_t offset_blocks, num_blocks; 4694 4695 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4696 nbytes, &num_blocks) != 0) { 4697 return -EINVAL; 4698 } 4699 4700 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4701 } 4702 4703 static int 4704 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4705 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 4706 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 4707 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 4708 { 4709 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4710 struct spdk_bdev_io *bdev_io; 4711 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4712 4713 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4714 return -EINVAL; 4715 } 4716 4717 bdev_io = bdev_channel_get_io(channel); 4718 if (!bdev_io) { 4719 return -ENOMEM; 4720 } 4721 4722 bdev_io->internal.ch = channel; 4723 bdev_io->internal.desc = desc; 4724 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4725 bdev_io->u.bdev.iovs = iov; 4726 bdev_io->u.bdev.iovcnt = iovcnt; 4727 bdev_io->u.bdev.md_buf = md_buf; 4728 bdev_io->u.bdev.num_blocks = num_blocks; 4729 bdev_io->u.bdev.offset_blocks = offset_blocks; 4730 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4731 bdev_io->internal.ext_opts = opts; 4732 bdev_io->u.bdev.ext_opts = opts; 4733 4734 _bdev_io_submit_ext(desc, bdev_io, opts, copy_opts); 4735 4736 return 0; 4737 } 4738 4739 int 4740 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4741 struct iovec *iov, int iovcnt, 4742 uint64_t offset_blocks, uint64_t num_blocks, 4743 spdk_bdev_io_completion_cb cb, void *cb_arg) 4744 { 4745 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4746 num_blocks, cb, cb_arg, NULL, false); 4747 } 4748 4749 int 4750 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4751 struct iovec *iov, int iovcnt, void *md_buf, 4752 uint64_t offset_blocks, uint64_t num_blocks, 4753 spdk_bdev_io_completion_cb cb, void *cb_arg) 4754 { 4755 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4756 return -EINVAL; 4757 } 4758 4759 if (md_buf && !_is_buf_allocated(iov)) { 4760 return -EINVAL; 4761 } 4762 4763 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4764 num_blocks, cb, cb_arg, NULL, false); 4765 } 4766 4767 static inline bool 4768 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 4769 { 4770 /* 4771 * We check if opts size is at least of size when we first introduced 4772 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 4773 * are not checked internal. 
4774 */ 4775 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 4776 sizeof(opts->metadata) && 4777 opts->size <= sizeof(*opts) && 4778 /* When memory domain is used, the user must provide data buffers */ 4779 (!opts->memory_domain || (iov && iov[0].iov_base)); 4780 } 4781 4782 int 4783 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4784 struct iovec *iov, int iovcnt, 4785 uint64_t offset_blocks, uint64_t num_blocks, 4786 spdk_bdev_io_completion_cb cb, void *cb_arg, 4787 struct spdk_bdev_ext_io_opts *opts) 4788 { 4789 void *md = NULL; 4790 4791 if (opts) { 4792 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 4793 return -EINVAL; 4794 } 4795 md = opts->metadata; 4796 } 4797 4798 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4799 return -EINVAL; 4800 } 4801 4802 if (md && !_is_buf_allocated(iov)) { 4803 return -EINVAL; 4804 } 4805 4806 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4807 num_blocks, cb, cb_arg, opts, false); 4808 } 4809 4810 static int 4811 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4812 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4813 spdk_bdev_io_completion_cb cb, void *cb_arg) 4814 { 4815 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4816 struct spdk_bdev_io *bdev_io; 4817 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4818 4819 if (!desc->write) { 4820 return -EBADF; 4821 } 4822 4823 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4824 return -EINVAL; 4825 } 4826 4827 bdev_io = bdev_channel_get_io(channel); 4828 if (!bdev_io) { 4829 return -ENOMEM; 4830 } 4831 4832 bdev_io->internal.ch = channel; 4833 bdev_io->internal.desc = desc; 4834 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4835 bdev_io->u.bdev.iovs = &bdev_io->iov; 4836 bdev_io->u.bdev.iovs[0].iov_base = buf; 4837 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4838 bdev_io->u.bdev.iovcnt = 1; 4839 bdev_io->u.bdev.md_buf = md_buf; 4840 bdev_io->u.bdev.num_blocks = num_blocks; 4841 bdev_io->u.bdev.offset_blocks = offset_blocks; 4842 bdev_io->u.bdev.ext_opts = NULL; 4843 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4844 4845 bdev_io_submit(bdev_io); 4846 return 0; 4847 } 4848 4849 int 4850 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4851 void *buf, uint64_t offset, uint64_t nbytes, 4852 spdk_bdev_io_completion_cb cb, void *cb_arg) 4853 { 4854 uint64_t offset_blocks, num_blocks; 4855 4856 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4857 nbytes, &num_blocks) != 0) { 4858 return -EINVAL; 4859 } 4860 4861 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4862 } 4863 4864 int 4865 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4866 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4867 spdk_bdev_io_completion_cb cb, void *cb_arg) 4868 { 4869 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4870 cb, cb_arg); 4871 } 4872 4873 int 4874 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4875 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4876 spdk_bdev_io_completion_cb cb, void *cb_arg) 4877 { 4878 struct iovec iov = { 4879 .iov_base = buf, 4880 }; 4881 4882 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4883 return -EINVAL; 4884 } 4885 4886 if 
(md_buf && !_is_buf_allocated(&iov)) { 4887 return -EINVAL; 4888 } 4889 4890 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4891 cb, cb_arg); 4892 } 4893 4894 static int 4895 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4896 struct iovec *iov, int iovcnt, void *md_buf, 4897 uint64_t offset_blocks, uint64_t num_blocks, 4898 spdk_bdev_io_completion_cb cb, void *cb_arg, 4899 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 4900 { 4901 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4902 struct spdk_bdev_io *bdev_io; 4903 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4904 4905 if (!desc->write) { 4906 return -EBADF; 4907 } 4908 4909 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4910 return -EINVAL; 4911 } 4912 4913 bdev_io = bdev_channel_get_io(channel); 4914 if (!bdev_io) { 4915 return -ENOMEM; 4916 } 4917 4918 bdev_io->internal.ch = channel; 4919 bdev_io->internal.desc = desc; 4920 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4921 bdev_io->u.bdev.iovs = iov; 4922 bdev_io->u.bdev.iovcnt = iovcnt; 4923 bdev_io->u.bdev.md_buf = md_buf; 4924 bdev_io->u.bdev.num_blocks = num_blocks; 4925 bdev_io->u.bdev.offset_blocks = offset_blocks; 4926 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4927 bdev_io->internal.ext_opts = opts; 4928 bdev_io->u.bdev.ext_opts = opts; 4929 4930 _bdev_io_submit_ext(desc, bdev_io, opts, copy_opts); 4931 4932 return 0; 4933 } 4934 4935 int 4936 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4937 struct iovec *iov, int iovcnt, 4938 uint64_t offset, uint64_t len, 4939 spdk_bdev_io_completion_cb cb, void *cb_arg) 4940 { 4941 uint64_t offset_blocks, num_blocks; 4942 4943 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4944 len, &num_blocks) != 0) { 4945 return -EINVAL; 4946 } 4947 4948 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4949 } 4950 4951 int 4952 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4953 struct iovec *iov, int iovcnt, 4954 uint64_t offset_blocks, uint64_t num_blocks, 4955 spdk_bdev_io_completion_cb cb, void *cb_arg) 4956 { 4957 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4958 num_blocks, cb, cb_arg, NULL, false); 4959 } 4960 4961 int 4962 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4963 struct iovec *iov, int iovcnt, void *md_buf, 4964 uint64_t offset_blocks, uint64_t num_blocks, 4965 spdk_bdev_io_completion_cb cb, void *cb_arg) 4966 { 4967 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4968 return -EINVAL; 4969 } 4970 4971 if (md_buf && !_is_buf_allocated(iov)) { 4972 return -EINVAL; 4973 } 4974 4975 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4976 num_blocks, cb, cb_arg, NULL, false); 4977 } 4978 4979 int 4980 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4981 struct iovec *iov, int iovcnt, 4982 uint64_t offset_blocks, uint64_t num_blocks, 4983 spdk_bdev_io_completion_cb cb, void *cb_arg, 4984 struct spdk_bdev_ext_io_opts *opts) 4985 { 4986 void *md = NULL; 4987 4988 if (opts) { 4989 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 4990 return -EINVAL; 4991 } 4992 md = opts->metadata; 4993 } 4994 4995 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4996 return -EINVAL; 4997 } 4998 4999 if (md && 
!_is_buf_allocated(iov)) { 5000 return -EINVAL; 5001 } 5002 5003 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 5004 num_blocks, cb, cb_arg, opts, false); 5005 } 5006 5007 static void 5008 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5009 { 5010 struct spdk_bdev_io *parent_io = cb_arg; 5011 struct spdk_bdev *bdev = parent_io->bdev; 5012 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 5013 int i, rc = 0; 5014 5015 if (!success) { 5016 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5017 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5018 spdk_bdev_free_io(bdev_io); 5019 return; 5020 } 5021 5022 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 5023 rc = memcmp(read_buf, 5024 parent_io->u.bdev.iovs[i].iov_base, 5025 parent_io->u.bdev.iovs[i].iov_len); 5026 if (rc) { 5027 break; 5028 } 5029 read_buf += parent_io->u.bdev.iovs[i].iov_len; 5030 } 5031 5032 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 5033 rc = memcmp(bdev_io->u.bdev.md_buf, 5034 parent_io->u.bdev.md_buf, 5035 spdk_bdev_get_md_size(bdev)); 5036 } 5037 5038 spdk_bdev_free_io(bdev_io); 5039 5040 if (rc == 0) { 5041 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5042 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5043 } else { 5044 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5045 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5046 } 5047 } 5048 5049 static void 5050 bdev_compare_do_read(void *_bdev_io) 5051 { 5052 struct spdk_bdev_io *bdev_io = _bdev_io; 5053 int rc; 5054 5055 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5056 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5057 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5058 bdev_compare_do_read_done, bdev_io); 5059 5060 if (rc == -ENOMEM) { 5061 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5062 } else if (rc != 0) { 5063 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5064 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5065 } 5066 } 5067 5068 static int 5069 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5070 struct iovec *iov, int iovcnt, void *md_buf, 5071 uint64_t offset_blocks, uint64_t num_blocks, 5072 spdk_bdev_io_completion_cb cb, void *cb_arg) 5073 { 5074 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5075 struct spdk_bdev_io *bdev_io; 5076 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5077 5078 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5079 return -EINVAL; 5080 } 5081 5082 bdev_io = bdev_channel_get_io(channel); 5083 if (!bdev_io) { 5084 return -ENOMEM; 5085 } 5086 5087 bdev_io->internal.ch = channel; 5088 bdev_io->internal.desc = desc; 5089 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5090 bdev_io->u.bdev.iovs = iov; 5091 bdev_io->u.bdev.iovcnt = iovcnt; 5092 bdev_io->u.bdev.md_buf = md_buf; 5093 bdev_io->u.bdev.num_blocks = num_blocks; 5094 bdev_io->u.bdev.offset_blocks = offset_blocks; 5095 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5096 bdev_io->u.bdev.ext_opts = NULL; 5097 5098 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5099 bdev_io_submit(bdev_io); 5100 return 0; 5101 } 5102 5103 bdev_compare_do_read(bdev_io); 5104 5105 return 0; 5106 } 5107 5108 int 5109 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5110 struct iovec *iov, int iovcnt, 5111 
uint64_t offset_blocks, uint64_t num_blocks, 5112 spdk_bdev_io_completion_cb cb, void *cb_arg) 5113 { 5114 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5115 num_blocks, cb, cb_arg); 5116 } 5117 5118 int 5119 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5120 struct iovec *iov, int iovcnt, void *md_buf, 5121 uint64_t offset_blocks, uint64_t num_blocks, 5122 spdk_bdev_io_completion_cb cb, void *cb_arg) 5123 { 5124 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5125 return -EINVAL; 5126 } 5127 5128 if (md_buf && !_is_buf_allocated(iov)) { 5129 return -EINVAL; 5130 } 5131 5132 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5133 num_blocks, cb, cb_arg); 5134 } 5135 5136 static int 5137 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5138 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5139 spdk_bdev_io_completion_cb cb, void *cb_arg) 5140 { 5141 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5142 struct spdk_bdev_io *bdev_io; 5143 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5144 5145 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5146 return -EINVAL; 5147 } 5148 5149 bdev_io = bdev_channel_get_io(channel); 5150 if (!bdev_io) { 5151 return -ENOMEM; 5152 } 5153 5154 bdev_io->internal.ch = channel; 5155 bdev_io->internal.desc = desc; 5156 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5157 bdev_io->u.bdev.iovs = &bdev_io->iov; 5158 bdev_io->u.bdev.iovs[0].iov_base = buf; 5159 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5160 bdev_io->u.bdev.iovcnt = 1; 5161 bdev_io->u.bdev.md_buf = md_buf; 5162 bdev_io->u.bdev.num_blocks = num_blocks; 5163 bdev_io->u.bdev.offset_blocks = offset_blocks; 5164 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5165 bdev_io->u.bdev.ext_opts = NULL; 5166 5167 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5168 bdev_io_submit(bdev_io); 5169 return 0; 5170 } 5171 5172 bdev_compare_do_read(bdev_io); 5173 5174 return 0; 5175 } 5176 5177 int 5178 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5179 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5180 spdk_bdev_io_completion_cb cb, void *cb_arg) 5181 { 5182 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5183 cb, cb_arg); 5184 } 5185 5186 int 5187 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5188 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5189 spdk_bdev_io_completion_cb cb, void *cb_arg) 5190 { 5191 struct iovec iov = { 5192 .iov_base = buf, 5193 }; 5194 5195 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5196 return -EINVAL; 5197 } 5198 5199 if (md_buf && !_is_buf_allocated(&iov)) { 5200 return -EINVAL; 5201 } 5202 5203 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5204 cb, cb_arg); 5205 } 5206 5207 static void 5208 bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) 5209 { 5210 struct spdk_bdev_io *bdev_io = ctx; 5211 5212 if (unlock_status) { 5213 SPDK_ERRLOG("LBA range unlock failed\n"); 5214 } 5215 5216 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? 
true : 5217 false, bdev_io->internal.caller_ctx); 5218 } 5219 5220 static void 5221 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 5222 { 5223 bdev_io->internal.status = status; 5224 5225 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 5226 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5227 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 5228 } 5229 5230 static void 5231 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5232 { 5233 struct spdk_bdev_io *parent_io = cb_arg; 5234 5235 if (!success) { 5236 SPDK_ERRLOG("Compare and write operation failed\n"); 5237 } 5238 5239 spdk_bdev_free_io(bdev_io); 5240 5241 bdev_comparev_and_writev_blocks_unlock(parent_io, 5242 success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 5243 } 5244 5245 static void 5246 bdev_compare_and_write_do_write(void *_bdev_io) 5247 { 5248 struct spdk_bdev_io *bdev_io = _bdev_io; 5249 int rc; 5250 5251 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 5252 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5253 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 5254 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5255 bdev_compare_and_write_do_write_done, bdev_io); 5256 5257 5258 if (rc == -ENOMEM) { 5259 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 5260 } else if (rc != 0) { 5261 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5262 } 5263 } 5264 5265 static void 5266 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5267 { 5268 struct spdk_bdev_io *parent_io = cb_arg; 5269 5270 spdk_bdev_free_io(bdev_io); 5271 5272 if (!success) { 5273 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 5274 return; 5275 } 5276 5277 bdev_compare_and_write_do_write(parent_io); 5278 } 5279 5280 static void 5281 bdev_compare_and_write_do_compare(void *_bdev_io) 5282 { 5283 struct spdk_bdev_io *bdev_io = _bdev_io; 5284 int rc; 5285 5286 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 5287 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 5288 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5289 bdev_compare_and_write_do_compare_done, bdev_io); 5290 5291 if (rc == -ENOMEM) { 5292 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 5293 } else if (rc != 0) { 5294 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 5295 } 5296 } 5297 5298 static void 5299 bdev_comparev_and_writev_blocks_locked(void *ctx, int status) 5300 { 5301 struct spdk_bdev_io *bdev_io = ctx; 5302 5303 if (status) { 5304 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 5305 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5306 return; 5307 } 5308 5309 bdev_compare_and_write_do_compare(bdev_io); 5310 } 5311 5312 int 5313 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5314 struct iovec *compare_iov, int compare_iovcnt, 5315 struct iovec *write_iov, int write_iovcnt, 5316 uint64_t offset_blocks, uint64_t num_blocks, 5317 spdk_bdev_io_completion_cb cb, void *cb_arg) 5318 { 5319 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5320 struct spdk_bdev_io *bdev_io; 5321 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5322 5323 if (!desc->write) { 5324 return 
-EBADF; 5325 } 5326 5327 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5328 return -EINVAL; 5329 } 5330 5331 if (num_blocks > bdev->acwu) { 5332 return -EINVAL; 5333 } 5334 5335 bdev_io = bdev_channel_get_io(channel); 5336 if (!bdev_io) { 5337 return -ENOMEM; 5338 } 5339 5340 bdev_io->internal.ch = channel; 5341 bdev_io->internal.desc = desc; 5342 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 5343 bdev_io->u.bdev.iovs = compare_iov; 5344 bdev_io->u.bdev.iovcnt = compare_iovcnt; 5345 bdev_io->u.bdev.fused_iovs = write_iov; 5346 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 5347 bdev_io->u.bdev.md_buf = NULL; 5348 bdev_io->u.bdev.num_blocks = num_blocks; 5349 bdev_io->u.bdev.offset_blocks = offset_blocks; 5350 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5351 bdev_io->u.bdev.ext_opts = NULL; 5352 5353 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 5354 bdev_io_submit(bdev_io); 5355 return 0; 5356 } 5357 5358 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 5359 bdev_comparev_and_writev_blocks_locked, bdev_io); 5360 } 5361 5362 int 5363 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5364 struct iovec *iov, int iovcnt, 5365 uint64_t offset_blocks, uint64_t num_blocks, 5366 bool populate, 5367 spdk_bdev_io_completion_cb cb, void *cb_arg) 5368 { 5369 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5370 struct spdk_bdev_io *bdev_io; 5371 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5372 5373 if (!desc->write) { 5374 return -EBADF; 5375 } 5376 5377 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5378 return -EINVAL; 5379 } 5380 5381 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 5382 return -ENOTSUP; 5383 } 5384 5385 bdev_io = bdev_channel_get_io(channel); 5386 if (!bdev_io) { 5387 return -ENOMEM; 5388 } 5389 5390 bdev_io->internal.ch = channel; 5391 bdev_io->internal.desc = desc; 5392 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 5393 bdev_io->u.bdev.num_blocks = num_blocks; 5394 bdev_io->u.bdev.offset_blocks = offset_blocks; 5395 bdev_io->u.bdev.iovs = iov; 5396 bdev_io->u.bdev.iovcnt = iovcnt; 5397 bdev_io->u.bdev.md_buf = NULL; 5398 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 5399 bdev_io->u.bdev.zcopy.commit = 0; 5400 bdev_io->u.bdev.zcopy.start = 1; 5401 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5402 bdev_io->u.bdev.ext_opts = NULL; 5403 5404 bdev_io_submit(bdev_io); 5405 5406 return 0; 5407 } 5408 5409 int 5410 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 5411 spdk_bdev_io_completion_cb cb, void *cb_arg) 5412 { 5413 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 5414 return -EINVAL; 5415 } 5416 5417 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 5418 bdev_io->u.bdev.zcopy.start = 0; 5419 bdev_io->internal.caller_ctx = cb_arg; 5420 bdev_io->internal.cb = cb; 5421 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 5422 5423 bdev_io_submit(bdev_io); 5424 5425 return 0; 5426 } 5427 5428 int 5429 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5430 uint64_t offset, uint64_t len, 5431 spdk_bdev_io_completion_cb cb, void *cb_arg) 5432 { 5433 uint64_t offset_blocks, num_blocks; 5434 5435 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5436 len, &num_blocks) != 0) { 5437 return -EINVAL; 5438 } 5439 5440 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5441 } 5442 5443 int 5444 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5445 uint64_t offset_blocks, uint64_t num_blocks, 5446 spdk_bdev_io_completion_cb cb, void *cb_arg) 5447 { 5448 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5449 struct spdk_bdev_io *bdev_io; 5450 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5451 5452 if (!desc->write) { 5453 return -EBADF; 5454 } 5455 5456 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5457 return -EINVAL; 5458 } 5459 5460 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 5461 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 5462 return -ENOTSUP; 5463 } 5464 5465 bdev_io = bdev_channel_get_io(channel); 5466 5467 if (!bdev_io) { 5468 return -ENOMEM; 5469 } 5470 5471 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 5472 bdev_io->internal.ch = channel; 5473 bdev_io->internal.desc = desc; 5474 bdev_io->u.bdev.offset_blocks = offset_blocks; 5475 bdev_io->u.bdev.num_blocks = num_blocks; 5476 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5477 bdev_io->u.bdev.ext_opts = NULL; 5478 5479 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 5480 bdev_io_submit(bdev_io); 5481 return 0; 5482 } 5483 5484 assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); 5485 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 5486 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 5487 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 5488 bdev_write_zero_buffer_next(bdev_io); 5489 5490 return 0; 5491 } 5492 5493 int 5494 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5495 uint64_t offset, uint64_t nbytes, 5496 spdk_bdev_io_completion_cb cb, void *cb_arg) 5497 { 5498 uint64_t offset_blocks, num_blocks; 5499 5500 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5501 nbytes, &num_blocks) != 0) { 5502 return -EINVAL; 5503 } 5504 5505 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5506 } 5507 5508 int 5509 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5510 uint64_t offset_blocks, uint64_t num_blocks, 5511 spdk_bdev_io_completion_cb cb, void *cb_arg) 5512 { 5513 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5514 struct spdk_bdev_io *bdev_io; 5515 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5516 5517 if (!desc->write) { 5518 return -EBADF; 5519 } 5520 5521 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5522 return -EINVAL; 5523 } 5524 5525 if (num_blocks == 0) { 5526 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 5527 return -EINVAL; 5528 } 5529 5530 bdev_io = bdev_channel_get_io(channel); 5531 if (!bdev_io) { 5532 return -ENOMEM; 5533 } 5534 5535 bdev_io->internal.ch 
= channel; 5536 bdev_io->internal.desc = desc; 5537 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 5538 5539 bdev_io->u.bdev.iovs = &bdev_io->iov; 5540 bdev_io->u.bdev.iovs[0].iov_base = NULL; 5541 bdev_io->u.bdev.iovs[0].iov_len = 0; 5542 bdev_io->u.bdev.iovcnt = 1; 5543 5544 bdev_io->u.bdev.offset_blocks = offset_blocks; 5545 bdev_io->u.bdev.num_blocks = num_blocks; 5546 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5547 bdev_io->u.bdev.ext_opts = NULL; 5548 5549 bdev_io_submit(bdev_io); 5550 return 0; 5551 } 5552 5553 int 5554 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5555 uint64_t offset, uint64_t length, 5556 spdk_bdev_io_completion_cb cb, void *cb_arg) 5557 { 5558 uint64_t offset_blocks, num_blocks; 5559 5560 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5561 length, &num_blocks) != 0) { 5562 return -EINVAL; 5563 } 5564 5565 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5566 } 5567 5568 int 5569 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5570 uint64_t offset_blocks, uint64_t num_blocks, 5571 spdk_bdev_io_completion_cb cb, void *cb_arg) 5572 { 5573 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5574 struct spdk_bdev_io *bdev_io; 5575 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5576 5577 if (!desc->write) { 5578 return -EBADF; 5579 } 5580 5581 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5582 return -EINVAL; 5583 } 5584 5585 bdev_io = bdev_channel_get_io(channel); 5586 if (!bdev_io) { 5587 return -ENOMEM; 5588 } 5589 5590 bdev_io->internal.ch = channel; 5591 bdev_io->internal.desc = desc; 5592 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 5593 bdev_io->u.bdev.iovs = NULL; 5594 bdev_io->u.bdev.iovcnt = 0; 5595 bdev_io->u.bdev.offset_blocks = offset_blocks; 5596 bdev_io->u.bdev.num_blocks = num_blocks; 5597 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5598 5599 bdev_io_submit(bdev_io); 5600 return 0; 5601 } 5602 5603 static int bdev_reset_poll_for_outstanding_io(void *ctx); 5604 5605 static void 5606 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 5607 { 5608 struct spdk_bdev_channel *ch = _ctx; 5609 struct spdk_bdev_io *bdev_io; 5610 5611 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5612 5613 if (status == -EBUSY) { 5614 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 5615 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 5616 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 5617 } else { 5618 /* If outstanding IOs are still present and reset_io_drain_timeout seconds passed, 5619 * start the reset. */ 5620 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5621 bdev_io_submit_reset(bdev_io); 5622 } 5623 } else { 5624 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5625 SPDK_DEBUGLOG(bdev, 5626 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 5627 ch->bdev->name); 5628 /* Mark the completion status as a SUCCESS and complete the reset. */ 5629 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 5630 } 5631 } 5632 5633 static void 5634 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5635 struct spdk_io_channel *io_ch, void *_ctx) 5636 { 5637 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 5638 int status = 0; 5639 5640 if (cur_ch->io_outstanding > 0) { 5641 /* If a channel has outstanding IO, set status to -EBUSY code. 
This will stop 5642 * further iteration over the rest of the channels and pass non-zero status 5643 * to the callback function. */ 5644 status = -EBUSY; 5645 } 5646 spdk_bdev_for_each_channel_continue(i, status); 5647 } 5648 5649 static int 5650 bdev_reset_poll_for_outstanding_io(void *ctx) 5651 { 5652 struct spdk_bdev_channel *ch = ctx; 5653 struct spdk_bdev_io *bdev_io; 5654 5655 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5656 5657 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 5658 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 5659 bdev_reset_check_outstanding_io_done); 5660 5661 return SPDK_POLLER_BUSY; 5662 } 5663 5664 static void 5665 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 5666 { 5667 struct spdk_bdev_channel *ch = _ctx; 5668 struct spdk_bdev_io *bdev_io; 5669 5670 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5671 5672 if (bdev->reset_io_drain_timeout == 0) { 5673 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5674 5675 bdev_io_submit_reset(bdev_io); 5676 return; 5677 } 5678 5679 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 5680 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 5681 5682 /* In case bdev->reset_io_drain_timeout is not equal to zero, 5683 * submit the reset to the underlying module only if outstanding I/O 5684 * remain after reset_io_drain_timeout seconds have passed. */ 5685 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 5686 bdev_reset_check_outstanding_io_done); 5687 } 5688 5689 static void 5690 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5691 struct spdk_io_channel *ch, void *_ctx) 5692 { 5693 struct spdk_bdev_channel *channel; 5694 struct spdk_bdev_mgmt_channel *mgmt_channel; 5695 struct spdk_bdev_shared_resource *shared_resource; 5696 bdev_io_tailq_t tmp_queued; 5697 5698 TAILQ_INIT(&tmp_queued); 5699 5700 channel = __io_ch_to_bdev_ch(ch); 5701 shared_resource = channel->shared_resource; 5702 mgmt_channel = shared_resource->mgmt_ch; 5703 5704 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 5705 5706 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 5707 /* The QoS object is always valid and readable while 5708 * the channel flag is set, so the lock here should not 5709 * be necessary. We're not in the fast path though, so 5710 * just take it anyway. 
*/ 5711 spdk_spin_lock(&channel->bdev->internal.spinlock); 5712 if (channel->bdev->internal.qos->ch == channel) { 5713 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 5714 } 5715 spdk_spin_unlock(&channel->bdev->internal.spinlock); 5716 } 5717 5718 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 5719 bdev_abort_all_buf_io(mgmt_channel, channel); 5721 bdev_abort_all_queued_io(&tmp_queued, channel); 5722 5723 spdk_bdev_for_each_channel_continue(i, 0); 5724 } 5725 5726 static void 5727 bdev_start_reset(void *ctx) 5728 { 5729 struct spdk_bdev_channel *ch = ctx; 5730 5731 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch, 5732 bdev_reset_freeze_channel_done); 5733 } 5734 5735 static void 5736 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 5737 { 5738 struct spdk_bdev *bdev = ch->bdev; 5739 5740 assert(!TAILQ_EMPTY(&ch->queued_resets)); 5741 5742 spdk_spin_lock(&bdev->internal.spinlock); 5743 if (bdev->internal.reset_in_progress == NULL) { 5744 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 5745 /* 5746 * Take a channel reference for the target bdev for the life of this 5747 * reset. This guards against the channel getting destroyed while 5748 * spdk_bdev_for_each_channel() calls related to this reset IO are in 5749 * progress. We will release the reference when this reset is 5750 * completed. 5751 */ 5752 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 5753 bdev_start_reset(ch); 5754 } 5755 spdk_spin_unlock(&bdev->internal.spinlock); 5756 } 5757 5758 int 5759 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5760 spdk_bdev_io_completion_cb cb, void *cb_arg) 5761 { 5762 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5763 struct spdk_bdev_io *bdev_io; 5764 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5765 5766 bdev_io = bdev_channel_get_io(channel); 5767 if (!bdev_io) { 5768 return -ENOMEM; 5769 } 5770 5771 bdev_io->internal.ch = channel; 5772 bdev_io->internal.desc = desc; 5773 bdev_io->internal.submit_tsc = spdk_get_ticks(); 5774 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 5775 bdev_io->u.reset.ch_ref = NULL; 5776 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5777 5778 spdk_spin_lock(&bdev->internal.spinlock); 5779 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 5780 spdk_spin_unlock(&bdev->internal.spinlock); 5781 5782 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 5783 internal.ch_link); 5784 5785 bdev_channel_start_reset(channel); 5786 5787 return 0; 5788 } 5789 5790 void 5791 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 5792 struct spdk_bdev_io_stat *stat) 5793 { 5794 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5795 5796 bdev_get_io_stat(stat, channel->stat); 5797 } 5798 5799 static void 5800 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 5801 { 5802 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 5803 5804 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat, 5805 bdev_iostat_ctx->cb_arg, 0); 5806 free(bdev_iostat_ctx); 5807 } 5808 5809 static void 5810 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5811 struct spdk_io_channel *ch, void *_ctx) 5812 { 5813 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx; 5814 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5815 5816
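	/* Fold this channel's counters into the caller-provided aggregate; the iterator
	 * runs this on each channel's own thread and bdev_get_device_stat_done() then
	 * reports the combined totals. A minimal caller sketch (my_stat_cb is a
	 * hypothetical name matching the spdk_bdev_get_device_stat_cb signature):
	 *
	 *   static void
	 *   my_stat_cb(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, void *cb_arg, int rc)
	 *   {
	 *           if (rc == 0) {
	 *                   printf("reads=%" PRIu64 " writes=%" PRIu64 "\n",
	 *                          stat->num_read_ops, stat->num_write_ops);
	 *           }
	 *   }
	 *
	 *   spdk_bdev_get_device_stat(bdev, stat, my_stat_cb, NULL);
	 */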
spdk_bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 5817 spdk_bdev_for_each_channel_continue(i, 0); 5818 } 5819 5820 void 5821 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 5822 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 5823 { 5824 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 5825 5826 assert(bdev != NULL); 5827 assert(stat != NULL); 5828 assert(cb != NULL); 5829 5830 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 5831 if (bdev_iostat_ctx == NULL) { 5832 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 5833 cb(bdev, stat, cb_arg, -ENOMEM); 5834 return; 5835 } 5836 5837 bdev_iostat_ctx->stat = stat; 5838 bdev_iostat_ctx->cb = cb; 5839 bdev_iostat_ctx->cb_arg = cb_arg; 5840 5841 /* Start with the statistics from previously deleted channels. */ 5842 spdk_spin_lock(&bdev->internal.spinlock); 5843 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 5844 spdk_spin_unlock(&bdev->internal.spinlock); 5845 5846 /* Then iterate and add the statistics from each existing channel. */ 5847 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 5848 bdev_get_device_stat_done); 5849 } 5850 5851 struct bdev_iostat_reset_ctx { 5852 enum spdk_bdev_reset_stat_mode mode; 5853 bdev_reset_device_stat_cb cb; 5854 void *cb_arg; 5855 }; 5856 5857 static void 5858 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 5859 { 5860 struct bdev_iostat_reset_ctx *ctx = _ctx; 5861 5862 ctx->cb(bdev, ctx->cb_arg, 0); 5863 5864 free(ctx); 5865 } 5866 5867 static void 5868 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5869 struct spdk_io_channel *ch, void *_ctx) 5870 { 5871 struct bdev_iostat_reset_ctx *ctx = _ctx; 5872 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5873 5874 spdk_bdev_reset_io_stat(channel->stat, ctx->mode); 5875 5876 spdk_bdev_for_each_channel_continue(i, 0); 5877 } 5878 5879 void 5880 bdev_reset_device_stat(struct spdk_bdev *bdev, enum spdk_bdev_reset_stat_mode mode, 5881 bdev_reset_device_stat_cb cb, void *cb_arg) 5882 { 5883 struct bdev_iostat_reset_ctx *ctx; 5884 5885 assert(bdev != NULL); 5886 assert(cb != NULL); 5887 5888 ctx = calloc(1, sizeof(*ctx)); 5889 if (ctx == NULL) { 5890 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 5891 cb(bdev, cb_arg, -ENOMEM); 5892 return; 5893 } 5894 5895 ctx->mode = mode; 5896 ctx->cb = cb; 5897 ctx->cb_arg = cb_arg; 5898 5899 spdk_spin_lock(&bdev->internal.spinlock); 5900 spdk_bdev_reset_io_stat(bdev->internal.stat, mode); 5901 spdk_spin_unlock(&bdev->internal.spinlock); 5902 5903 spdk_bdev_for_each_channel(bdev, 5904 bdev_reset_each_channel_stat, 5905 ctx, 5906 bdev_reset_device_stat_done); 5907 } 5908 5909 int 5910 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5911 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 5912 spdk_bdev_io_completion_cb cb, void *cb_arg) 5913 { 5914 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5915 struct spdk_bdev_io *bdev_io; 5916 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5917 5918 if (!desc->write) { 5919 return -EBADF; 5920 } 5921 5922 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 5923 return -ENOTSUP; 5924 } 5925 5926 bdev_io = bdev_channel_get_io(channel); 5927 if (!bdev_io) { 5928 return -ENOMEM; 5929 } 5930 5931 bdev_io->internal.ch = channel; 5932 bdev_io->internal.desc = desc; 5933 bdev_io->type = 
SPDK_BDEV_IO_TYPE_NVME_ADMIN; 5934 bdev_io->u.nvme_passthru.cmd = *cmd; 5935 bdev_io->u.nvme_passthru.buf = buf; 5936 bdev_io->u.nvme_passthru.nbytes = nbytes; 5937 bdev_io->u.nvme_passthru.md_buf = NULL; 5938 bdev_io->u.nvme_passthru.md_len = 0; 5939 5940 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5941 5942 bdev_io_submit(bdev_io); 5943 return 0; 5944 } 5945 5946 int 5947 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5948 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 5949 spdk_bdev_io_completion_cb cb, void *cb_arg) 5950 { 5951 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5952 struct spdk_bdev_io *bdev_io; 5953 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5954 5955 if (!desc->write) { 5956 /* 5957 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 5958 * to easily determine if the command is a read or write, but for now just 5959 * do not allow io_passthru with a read-only descriptor. 5960 */ 5961 return -EBADF; 5962 } 5963 5964 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 5965 return -ENOTSUP; 5966 } 5967 5968 bdev_io = bdev_channel_get_io(channel); 5969 if (!bdev_io) { 5970 return -ENOMEM; 5971 } 5972 5973 bdev_io->internal.ch = channel; 5974 bdev_io->internal.desc = desc; 5975 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 5976 bdev_io->u.nvme_passthru.cmd = *cmd; 5977 bdev_io->u.nvme_passthru.buf = buf; 5978 bdev_io->u.nvme_passthru.nbytes = nbytes; 5979 bdev_io->u.nvme_passthru.md_buf = NULL; 5980 bdev_io->u.nvme_passthru.md_len = 0; 5981 5982 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5983 5984 bdev_io_submit(bdev_io); 5985 return 0; 5986 } 5987 5988 int 5989 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5990 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 5991 spdk_bdev_io_completion_cb cb, void *cb_arg) 5992 { 5993 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5994 struct spdk_bdev_io *bdev_io; 5995 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5996 5997 if (!desc->write) { 5998 /* 5999 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 6000 * to easily determine if the command is a read or write, but for now just 6001 * do not allow io_passthru with a read-only descriptor. 
6002 */ 6003 return -EBADF; 6004 } 6005 6006 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 6007 return -ENOTSUP; 6008 } 6009 6010 bdev_io = bdev_channel_get_io(channel); 6011 if (!bdev_io) { 6012 return -ENOMEM; 6013 } 6014 6015 bdev_io->internal.ch = channel; 6016 bdev_io->internal.desc = desc; 6017 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 6018 bdev_io->u.nvme_passthru.cmd = *cmd; 6019 bdev_io->u.nvme_passthru.buf = buf; 6020 bdev_io->u.nvme_passthru.nbytes = nbytes; 6021 bdev_io->u.nvme_passthru.md_buf = md_buf; 6022 bdev_io->u.nvme_passthru.md_len = md_len; 6023 6024 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6025 6026 bdev_io_submit(bdev_io); 6027 return 0; 6028 } 6029 6030 static void bdev_abort_retry(void *ctx); 6031 static void bdev_abort(struct spdk_bdev_io *parent_io); 6032 6033 static void 6034 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6035 { 6036 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6037 struct spdk_bdev_io *parent_io = cb_arg; 6038 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6039 6040 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6041 6042 spdk_bdev_free_io(bdev_io); 6043 6044 if (!success) { 6045 /* Check if the target I/O completed in the meantime. */ 6046 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6047 if (tmp_io == bio_to_abort) { 6048 break; 6049 } 6050 } 6051 6052 /* If the target I/O still exists, set the parent to failed. */ 6053 if (tmp_io != NULL) { 6054 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6055 } 6056 } 6057 6058 parent_io->u.bdev.split_outstanding--; 6059 if (parent_io->u.bdev.split_outstanding == 0) { 6060 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6061 bdev_abort_retry(parent_io); 6062 } else { 6063 bdev_io_complete(parent_io); 6064 } 6065 } 6066 } 6067 6068 static int 6069 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6070 struct spdk_bdev_io *bio_to_abort, 6071 spdk_bdev_io_completion_cb cb, void *cb_arg) 6072 { 6073 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6074 struct spdk_bdev_io *bdev_io; 6075 6076 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 6077 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 6078 /* TODO: Abort reset or abort request. */ 6079 return -ENOTSUP; 6080 } 6081 6082 bdev_io = bdev_channel_get_io(channel); 6083 if (bdev_io == NULL) { 6084 return -ENOMEM; 6085 } 6086 6087 bdev_io->internal.ch = channel; 6088 bdev_io->internal.desc = desc; 6089 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6090 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6091 6092 if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bio_to_abort)) { 6093 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 6094 6095 /* Parent abort request is not submitted directly, but to manage its 6096 * execution add it to the submitted list here. 6097 */ 6098 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6099 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6100 6101 bdev_abort(bdev_io); 6102 6103 return 0; 6104 } 6105 6106 bdev_io->u.abort.bio_to_abort = bio_to_abort; 6107 6108 /* Submit the abort request to the underlying bdev module. 
*/ 6109 bdev_io_submit(bdev_io); 6110 6111 return 0; 6112 } 6113 6114 static uint32_t 6115 _bdev_abort(struct spdk_bdev_io *parent_io) 6116 { 6117 struct spdk_bdev_desc *desc = parent_io->internal.desc; 6118 struct spdk_bdev_channel *channel = parent_io->internal.ch; 6119 void *bio_cb_arg; 6120 struct spdk_bdev_io *bio_to_abort; 6121 uint32_t matched_ios; 6122 int rc; 6123 6124 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 6125 6126 /* matched_ios is returned and will be kept by the caller. 6127 * 6128 * This function is used for two cases: 1) the same cb_arg is used for 6129 * multiple I/Os, and 2) a single large I/O is split into smaller ones. 6130 * Incrementing split_outstanding directly here may confuse readers, especially 6131 * for the 1st case. 6132 * 6133 * Completion of I/O abort is processed after stack unwinding. Hence this trick 6134 * works as expected. 6135 */ 6136 matched_ios = 0; 6137 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6138 6139 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 6140 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 6141 continue; 6142 } 6143 6144 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 6145 /* Any I/O which was submitted after this abort command should be excluded. */ 6146 continue; 6147 } 6148 6149 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 6150 if (rc != 0) { 6151 if (rc == -ENOMEM) { 6152 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 6153 } else { 6154 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6155 } 6156 break; 6157 } 6158 matched_ios++; 6159 } 6160 6161 return matched_ios; 6162 } 6163 6164 static void 6165 bdev_abort_retry(void *ctx) 6166 { 6167 struct spdk_bdev_io *parent_io = ctx; 6168 uint32_t matched_ios; 6169 6170 matched_ios = _bdev_abort(parent_io); 6171 6172 if (matched_ios == 0) { 6173 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6174 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6175 } else { 6176 /* For retry, the case that no target I/O was found is success 6177 * because it means the target I/Os completed in the meantime. 6178 */ 6179 bdev_io_complete(parent_io); 6180 } 6181 return; 6182 } 6183 6184 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6185 parent_io->u.bdev.split_outstanding = matched_ios; 6186 } 6187 6188 static void 6189 bdev_abort(struct spdk_bdev_io *parent_io) 6190 { 6191 uint32_t matched_ios; 6192 6193 matched_ios = _bdev_abort(parent_io); 6194 6195 if (matched_ios == 0) { 6196 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6197 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6198 } else { 6199 /* The case where no target I/O was found is a failure. */ 6200 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6201 bdev_io_complete(parent_io); 6202 } 6203 return; 6204 } 6205 6206 /* Use split_outstanding to manage the progress of aborting I/Os.
*/ 6207 parent_io->u.bdev.split_outstanding = matched_ios; 6208 } 6209 6210 int 6211 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6212 void *bio_cb_arg, 6213 spdk_bdev_io_completion_cb cb, void *cb_arg) 6214 { 6215 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6216 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6217 struct spdk_bdev_io *bdev_io; 6218 6219 if (bio_cb_arg == NULL) { 6220 return -EINVAL; 6221 } 6222 6223 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 6224 return -ENOTSUP; 6225 } 6226 6227 bdev_io = bdev_channel_get_io(channel); 6228 if (bdev_io == NULL) { 6229 return -ENOMEM; 6230 } 6231 6232 bdev_io->internal.ch = channel; 6233 bdev_io->internal.desc = desc; 6234 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6235 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6236 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6237 6238 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 6239 6240 /* Parent abort request is not submitted directly, but to manage its execution, 6241 * add it to the submitted list here. 6242 */ 6243 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6244 6245 bdev_abort(bdev_io); 6246 6247 return 0; 6248 } 6249 6250 int 6251 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6252 struct spdk_bdev_io_wait_entry *entry) 6253 { 6254 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6255 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 6256 6257 if (bdev != entry->bdev) { 6258 SPDK_ERRLOG("bdevs do not match\n"); 6259 return -EINVAL; 6260 } 6261 6262 if (mgmt_ch->per_thread_cache_count > 0) { 6263 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 6264 return -EINVAL; 6265 } 6266 6267 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 6268 return 0; 6269 } 6270 6271 static inline void 6272 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 6273 { 6274 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 6275 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 6276 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 6277 uint32_t blocklen = bdev_io->bdev->blocklen; 6278 6279 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 6280 switch (bdev_io->type) { 6281 case SPDK_BDEV_IO_TYPE_READ: 6282 io_stat->bytes_read += num_blocks * blocklen; 6283 io_stat->num_read_ops++; 6284 io_stat->read_latency_ticks += tsc_diff; 6285 if (io_stat->max_read_latency_ticks < tsc_diff) { 6286 io_stat->max_read_latency_ticks = tsc_diff; 6287 } 6288 if (io_stat->min_read_latency_ticks > tsc_diff) { 6289 io_stat->min_read_latency_ticks = tsc_diff; 6290 } 6291 break; 6292 case SPDK_BDEV_IO_TYPE_WRITE: 6293 io_stat->bytes_written += num_blocks * blocklen; 6294 io_stat->num_write_ops++; 6295 io_stat->write_latency_ticks += tsc_diff; 6296 if (io_stat->max_write_latency_ticks < tsc_diff) { 6297 io_stat->max_write_latency_ticks = tsc_diff; 6298 } 6299 if (io_stat->min_write_latency_ticks > tsc_diff) { 6300 io_stat->min_write_latency_ticks = tsc_diff; 6301 } 6302 break; 6303 case SPDK_BDEV_IO_TYPE_UNMAP: 6304 io_stat->bytes_unmapped += num_blocks * blocklen; 6305 io_stat->num_unmap_ops++; 6306 io_stat->unmap_latency_ticks += tsc_diff; 6307 if (io_stat->max_unmap_latency_ticks < tsc_diff) { 6308 io_stat->max_unmap_latency_ticks = tsc_diff; 6309 } 6310 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 6311 io_stat->min_unmap_latency_ticks = tsc_diff; 6312 } 6313 break; 6314 
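	/* Note: zero-copy I/O is accounted once, when its start phase completes. A
	 * populating start is counted with the read counters and a non-populating start
	 * (data committed later) with the write counters; the end phase is deliberately
	 * not counted again. */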
case SPDK_BDEV_IO_TYPE_ZCOPY: 6315 /* Track the data in the start phase only */ 6316 if (bdev_io->u.bdev.zcopy.start) { 6317 if (bdev_io->u.bdev.zcopy.populate) { 6318 io_stat->bytes_read += num_blocks * blocklen; 6319 io_stat->num_read_ops++; 6320 io_stat->read_latency_ticks += tsc_diff; 6321 if (io_stat->max_read_latency_ticks < tsc_diff) { 6322 io_stat->max_read_latency_ticks = tsc_diff; 6323 } 6324 if (io_stat->min_read_latency_ticks > tsc_diff) { 6325 io_stat->min_read_latency_ticks = tsc_diff; 6326 } 6327 } else { 6328 io_stat->bytes_written += num_blocks * blocklen; 6329 io_stat->num_write_ops++; 6330 io_stat->write_latency_ticks += tsc_diff; 6331 if (io_stat->max_write_latency_ticks < tsc_diff) { 6332 io_stat->max_write_latency_ticks = tsc_diff; 6333 } 6334 if (io_stat->min_write_latency_ticks > tsc_diff) { 6335 io_stat->min_write_latency_ticks = tsc_diff; 6336 } 6337 } 6338 } 6339 break; 6340 case SPDK_BDEV_IO_TYPE_COPY: 6341 io_stat->bytes_copied += num_blocks * blocklen; 6342 io_stat->num_copy_ops++; 6343 bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; 6344 if (io_stat->max_copy_latency_ticks < tsc_diff) { 6345 io_stat->max_copy_latency_ticks = tsc_diff; 6346 } 6347 if (io_stat->min_copy_latency_ticks > tsc_diff) { 6348 io_stat->min_copy_latency_ticks = tsc_diff; 6349 } 6350 break; 6351 default: 6352 break; 6353 } 6354 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 6355 io_stat = bdev_io->bdev->internal.stat; 6356 assert(io_stat->io_error != NULL); 6357 6358 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 6359 io_stat->io_error->error_status[-io_status - 1]++; 6360 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 6361 } 6362 6363 #ifdef SPDK_CONFIG_VTUNE 6364 uint64_t now_tsc = spdk_get_ticks(); 6365 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 6366 uint64_t data[5]; 6367 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 6368 6369 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 6370 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 6371 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 6372 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 6373 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 6374 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 6375 6376 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 6377 __itt_metadata_u64, 5, data); 6378 6379 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 6380 bdev_io->internal.ch->start_tsc = now_tsc; 6381 } 6382 #endif 6383 } 6384 6385 static inline void 6386 bdev_io_complete(void *ctx) 6387 { 6388 struct spdk_bdev_io *bdev_io = ctx; 6389 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6390 uint64_t tsc, tsc_diff; 6391 6392 if (spdk_unlikely(bdev_io->internal.in_submit_request)) { 6393 /* 6394 * Defer completion to avoid potential infinite recursion if the 6395 * user's completion callback issues a new I/O. 
6396 */ 6397 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6398 bdev_io_complete, bdev_io); 6399 return; 6400 } 6401 6402 tsc = spdk_get_ticks(); 6403 tsc_diff = tsc - bdev_io->internal.submit_tsc; 6404 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 6405 bdev_io->internal.caller_ctx); 6406 6407 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 6408 6409 if (bdev_io->internal.ch->histogram) { 6410 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 6411 } 6412 6413 bdev_io_update_io_stat(bdev_io, tsc_diff); 6414 6415 assert(bdev_io->internal.cb != NULL); 6416 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 6417 6418 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 6419 bdev_io->internal.caller_ctx); 6420 } 6421 6422 static void bdev_destroy_cb(void *io_device); 6423 6424 static void 6425 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 6426 { 6427 struct spdk_bdev_io *bdev_io = _ctx; 6428 6429 if (bdev_io->u.reset.ch_ref != NULL) { 6430 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 6431 bdev_io->u.reset.ch_ref = NULL; 6432 } 6433 6434 bdev_io_complete(bdev_io); 6435 6436 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 6437 TAILQ_EMPTY(&bdev->internal.open_descs)) { 6438 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6439 } 6440 } 6441 6442 static void 6443 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6444 struct spdk_io_channel *_ch, void *_ctx) 6445 { 6446 struct spdk_bdev_io *bdev_io = _ctx; 6447 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 6448 struct spdk_bdev_io *queued_reset; 6449 6450 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 6451 while (!TAILQ_EMPTY(&ch->queued_resets)) { 6452 queued_reset = TAILQ_FIRST(&ch->queued_resets); 6453 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 6454 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 6455 } 6456 6457 spdk_bdev_for_each_channel_continue(i, 0); 6458 } 6459 6460 void 6461 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 6462 { 6463 struct spdk_bdev *bdev = bdev_io->bdev; 6464 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6465 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 6466 6467 bdev_io->internal.status = status; 6468 6469 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 6470 bool unlock_channels = false; 6471 6472 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 6473 SPDK_ERRLOG("NOMEM returned for reset\n"); 6474 } 6475 spdk_spin_lock(&bdev->internal.spinlock); 6476 if (bdev_io == bdev->internal.reset_in_progress) { 6477 bdev->internal.reset_in_progress = NULL; 6478 unlock_channels = true; 6479 } 6480 spdk_spin_unlock(&bdev->internal.spinlock); 6481 6482 if (unlock_channels) { 6483 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 6484 bdev_reset_complete); 6485 return; 6486 } 6487 } else { 6488 if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0)) { 6489 _bdev_io_push_bounce_data_buffer(bdev_io, _bdev_io_complete_push_bounce_done); 6490 /* bdev IO will be completed in the callback */ 6491 return; 6492 } 6493 6494 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 6495 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 6496 return; 6497 } 6498 } 6499 6500 bdev_io_complete(bdev_io); 6501 } 6502 6503 void 6504 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum 
spdk_scsi_status sc, 6505 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 6506 { 6507 if (sc == SPDK_SCSI_STATUS_GOOD) { 6508 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6509 } else { 6510 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 6511 bdev_io->internal.error.scsi.sc = sc; 6512 bdev_io->internal.error.scsi.sk = sk; 6513 bdev_io->internal.error.scsi.asc = asc; 6514 bdev_io->internal.error.scsi.ascq = ascq; 6515 } 6516 6517 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6518 } 6519 6520 void 6521 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 6522 int *sc, int *sk, int *asc, int *ascq) 6523 { 6524 assert(sc != NULL); 6525 assert(sk != NULL); 6526 assert(asc != NULL); 6527 assert(ascq != NULL); 6528 6529 switch (bdev_io->internal.status) { 6530 case SPDK_BDEV_IO_STATUS_SUCCESS: 6531 *sc = SPDK_SCSI_STATUS_GOOD; 6532 *sk = SPDK_SCSI_SENSE_NO_SENSE; 6533 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 6534 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 6535 break; 6536 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 6537 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 6538 break; 6539 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 6540 *sc = bdev_io->internal.error.scsi.sc; 6541 *sk = bdev_io->internal.error.scsi.sk; 6542 *asc = bdev_io->internal.error.scsi.asc; 6543 *ascq = bdev_io->internal.error.scsi.ascq; 6544 break; 6545 default: 6546 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 6547 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 6548 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 6549 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 6550 break; 6551 } 6552 } 6553 6554 void 6555 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 6556 { 6557 if (aio_result == 0) { 6558 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6559 } else { 6560 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 6561 } 6562 6563 bdev_io->internal.error.aio_result = aio_result; 6564 6565 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6566 } 6567 6568 void 6569 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 6570 { 6571 assert(aio_result != NULL); 6572 6573 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 6574 *aio_result = bdev_io->internal.error.aio_result; 6575 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6576 *aio_result = 0; 6577 } else { 6578 *aio_result = -EIO; 6579 } 6580 } 6581 6582 void 6583 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 6584 { 6585 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 6586 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6587 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 6588 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 6589 } else { 6590 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 6591 } 6592 6593 bdev_io->internal.error.nvme.cdw0 = cdw0; 6594 bdev_io->internal.error.nvme.sct = sct; 6595 bdev_io->internal.error.nvme.sc = sc; 6596 6597 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6598 } 6599 6600 void 6601 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 6602 { 6603 assert(sct != NULL); 6604 assert(sc != NULL); 6605 assert(cdw0 != NULL); 6606 6607 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 6608 *sct = SPDK_NVME_SCT_GENERIC; 6609 *sc = SPDK_NVME_SC_SUCCESS; 6610 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) 
{ 6611 *cdw0 = 0; 6612 } else { 6613 *cdw0 = 1U; 6614 } 6615 return; 6616 } 6617 6618 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 6619 *sct = bdev_io->internal.error.nvme.sct; 6620 *sc = bdev_io->internal.error.nvme.sc; 6621 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6622 *sct = SPDK_NVME_SCT_GENERIC; 6623 *sc = SPDK_NVME_SC_SUCCESS; 6624 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 6625 *sct = SPDK_NVME_SCT_GENERIC; 6626 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6627 } else { 6628 *sct = SPDK_NVME_SCT_GENERIC; 6629 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6630 } 6631 6632 *cdw0 = bdev_io->internal.error.nvme.cdw0; 6633 } 6634 6635 void 6636 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 6637 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 6638 { 6639 assert(first_sct != NULL); 6640 assert(first_sc != NULL); 6641 assert(second_sct != NULL); 6642 assert(second_sc != NULL); 6643 assert(cdw0 != NULL); 6644 6645 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 6646 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 6647 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 6648 *first_sct = bdev_io->internal.error.nvme.sct; 6649 *first_sc = bdev_io->internal.error.nvme.sc; 6650 *second_sct = SPDK_NVME_SCT_GENERIC; 6651 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6652 } else { 6653 *first_sct = SPDK_NVME_SCT_GENERIC; 6654 *first_sc = SPDK_NVME_SC_SUCCESS; 6655 *second_sct = bdev_io->internal.error.nvme.sct; 6656 *second_sc = bdev_io->internal.error.nvme.sc; 6657 } 6658 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 6659 *first_sct = SPDK_NVME_SCT_GENERIC; 6660 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6661 *second_sct = SPDK_NVME_SCT_GENERIC; 6662 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6663 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6664 *first_sct = SPDK_NVME_SCT_GENERIC; 6665 *first_sc = SPDK_NVME_SC_SUCCESS; 6666 *second_sct = SPDK_NVME_SCT_GENERIC; 6667 *second_sc = SPDK_NVME_SC_SUCCESS; 6668 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 6669 *first_sct = SPDK_NVME_SCT_GENERIC; 6670 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6671 *second_sct = SPDK_NVME_SCT_GENERIC; 6672 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6673 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 6674 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 6675 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 6676 *second_sct = SPDK_NVME_SCT_GENERIC; 6677 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6678 } else { 6679 *first_sct = SPDK_NVME_SCT_GENERIC; 6680 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6681 *second_sct = SPDK_NVME_SCT_GENERIC; 6682 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6683 } 6684 6685 *cdw0 = bdev_io->internal.error.nvme.cdw0; 6686 } 6687 6688 struct spdk_thread * 6689 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 6690 { 6691 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 6692 } 6693 6694 struct spdk_io_channel * 6695 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 6696 { 6697 return bdev_io->internal.ch->channel; 6698 } 6699 6700 static int 6701 bdev_register(struct spdk_bdev *bdev) 6702 { 6703 char *bdev_name; 6704 char uuid[SPDK_UUID_STRING_LEN]; 6705 int ret; 6706 6707 assert(bdev->module != NULL); 6708 6709 if (!bdev->name) { 6710 SPDK_ERRLOG("Bdev name is 
NULL\n"); 6711 return -EINVAL; 6712 } 6713 6714 if (!strlen(bdev->name)) { 6715 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 6716 return -EINVAL; 6717 } 6718 6719 /* Users often register their own I/O devices using the bdev name. In 6720 * order to avoid conflicts, prepend bdev_. */ 6721 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 6722 if (!bdev_name) { 6723 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 6724 return -ENOMEM; 6725 } 6726 6727 bdev->internal.stat = bdev_alloc_io_stat(true); 6728 if (!bdev->internal.stat) { 6729 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 6730 free(bdev_name); 6731 return -ENOMEM; 6732 } 6733 6734 bdev->internal.status = SPDK_BDEV_STATUS_READY; 6735 bdev->internal.measured_queue_depth = UINT64_MAX; 6736 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 6737 memset(&bdev->internal.claim, 0, sizeof(bdev->internal.claim)); 6738 bdev->internal.qd_poller = NULL; 6739 bdev->internal.qos = NULL; 6740 6741 TAILQ_INIT(&bdev->internal.open_descs); 6742 TAILQ_INIT(&bdev->internal.locked_ranges); 6743 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 6744 TAILQ_INIT(&bdev->aliases); 6745 6746 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 6747 if (ret != 0) { 6748 bdev_free_io_stat(bdev->internal.stat); 6749 free(bdev_name); 6750 return ret; 6751 } 6752 6753 /* UUID has to be specified by the user or defined by bdev itself. 6754 * Otherwise this field must remain empty, to indicate that this 6755 * value cannot be depended upon. */ 6756 if (!spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { 6757 /* Add the UUID alias only if it's different than the name */ 6758 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 6759 if (strcmp(bdev->name, uuid) != 0) { 6760 ret = spdk_bdev_alias_add(bdev, uuid); 6761 if (ret != 0) { 6762 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 6763 bdev_name_del(&bdev->internal.bdev_name); 6764 bdev_free_io_stat(bdev->internal.stat); 6765 free(bdev_name); 6766 return ret; 6767 } 6768 } 6769 } 6770 6771 if (spdk_bdev_get_buf_align(bdev) > 1) { 6772 if (bdev->split_on_optimal_io_boundary) { 6773 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 6774 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 6775 } else { 6776 bdev->split_on_optimal_io_boundary = true; 6777 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 6778 } 6779 } 6780 6781 /* If the user didn't specify a write unit size, set it to one. 
*/ 6782 if (bdev->write_unit_size == 0) { 6783 bdev->write_unit_size = 1; 6784 } 6785 6786 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 6787 if (bdev->acwu == 0) { 6788 bdev->acwu = bdev->write_unit_size; 6789 } 6790 6791 if (bdev->phys_blocklen == 0) { 6792 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 6793 } 6794 6795 bdev->internal.reset_in_progress = NULL; 6796 bdev->internal.qd_poll_in_progress = false; 6797 bdev->internal.period = 0; 6798 bdev->internal.new_period = 0; 6799 6800 spdk_io_device_register(__bdev_to_io_dev(bdev), 6801 bdev_channel_create, bdev_channel_destroy, 6802 sizeof(struct spdk_bdev_channel), 6803 bdev_name); 6804 6805 free(bdev_name); 6806 6807 spdk_spin_init(&bdev->internal.spinlock); 6808 6809 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 6810 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 6811 6812 return 0; 6813 } 6814 6815 static void 6816 bdev_destroy_cb(void *io_device) 6817 { 6818 int rc; 6819 struct spdk_bdev *bdev; 6820 spdk_bdev_unregister_cb cb_fn; 6821 void *cb_arg; 6822 6823 bdev = __bdev_from_io_dev(io_device); 6824 cb_fn = bdev->internal.unregister_cb; 6825 cb_arg = bdev->internal.unregister_ctx; 6826 6827 spdk_spin_destroy(&bdev->internal.spinlock); 6828 free(bdev->internal.qos); 6829 bdev_free_io_stat(bdev->internal.stat); 6830 6831 rc = bdev->fn_table->destruct(bdev->ctxt); 6832 if (rc < 0) { 6833 SPDK_ERRLOG("destruct failed\n"); 6834 } 6835 if (rc <= 0 && cb_fn != NULL) { 6836 cb_fn(cb_arg, rc); 6837 } 6838 } 6839 6840 void 6841 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 6842 { 6843 if (bdev->internal.unregister_cb != NULL) { 6844 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 6845 } 6846 } 6847 6848 static void 6849 _remove_notify(void *arg) 6850 { 6851 struct spdk_bdev_desc *desc = arg; 6852 6853 spdk_spin_lock(&desc->spinlock); 6854 desc->refs--; 6855 6856 if (!desc->closed) { 6857 spdk_spin_unlock(&desc->spinlock); 6858 desc->callback.event_fn(SPDK_BDEV_EVENT_REMOVE, desc->bdev, desc->callback.ctx); 6859 return; 6860 } else if (0 == desc->refs) { 6861 /* This descriptor was closed after this remove_notify message was sent. 6862 * spdk_bdev_close() could not free the descriptor since this message was 6863 * in flight, so we free it now using bdev_desc_free(). 6864 */ 6865 spdk_spin_unlock(&desc->spinlock); 6866 bdev_desc_free(desc); 6867 return; 6868 } 6869 spdk_spin_unlock(&desc->spinlock); 6870 } 6871 6872 /* returns: 0 - bdev removed and ready to be destructed. 6873 * -EBUSY - bdev can't be destructed yet. */ 6874 static int 6875 bdev_unregister_unsafe(struct spdk_bdev *bdev) 6876 { 6877 struct spdk_bdev_desc *desc, *tmp; 6878 int rc = 0; 6879 char uuid[SPDK_UUID_STRING_LEN]; 6880 6881 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 6882 assert(spdk_spin_held(&bdev->internal.spinlock)); 6883 6884 /* Notify each descriptor about hotremoval */ 6885 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 6886 rc = -EBUSY; 6887 spdk_spin_lock(&desc->spinlock); 6888 /* 6889 * Defer invocation of the event_cb to a separate message that will 6890 * run later on its thread. This ensures this context unwinds and 6891 * we don't recursively unregister this bdev again if the event_cb 6892 * immediately closes its descriptor. 
6893 */ 6894 desc->refs++; 6895 spdk_thread_send_msg(desc->thread, _remove_notify, desc); 6896 spdk_spin_unlock(&desc->spinlock); 6897 } 6898 6899 /* If there are no descriptors, proceed removing the bdev */ 6900 if (rc == 0) { 6901 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 6902 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 6903 6904 /* Delete the name and the UUID alias */ 6905 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 6906 bdev_name_del_unsafe(&bdev->internal.bdev_name); 6907 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 6908 6909 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 6910 6911 if (bdev->internal.reset_in_progress != NULL) { 6912 /* If reset is in progress, let the completion callback for reset 6913 * unregister the bdev. 6914 */ 6915 rc = -EBUSY; 6916 } 6917 } 6918 6919 return rc; 6920 } 6921 6922 static void 6923 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6924 struct spdk_io_channel *io_ch, void *_ctx) 6925 { 6926 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 6927 6928 bdev_channel_abort_queued_ios(bdev_ch); 6929 spdk_bdev_for_each_channel_continue(i, 0); 6930 } 6931 6932 static void 6933 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 6934 { 6935 int rc; 6936 6937 spdk_spin_lock(&g_bdev_mgr.spinlock); 6938 spdk_spin_lock(&bdev->internal.spinlock); 6939 /* 6940 * Set the status to REMOVING after completing to abort channels. Otherwise, 6941 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 6942 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 6943 * may fail. 6944 */ 6945 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 6946 rc = bdev_unregister_unsafe(bdev); 6947 spdk_spin_unlock(&bdev->internal.spinlock); 6948 spdk_spin_unlock(&g_bdev_mgr.spinlock); 6949 6950 if (rc == 0) { 6951 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6952 } 6953 } 6954 6955 void 6956 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 6957 { 6958 struct spdk_thread *thread; 6959 6960 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 6961 6962 thread = spdk_get_thread(); 6963 if (!thread) { 6964 /* The user called this from a non-SPDK thread. 
*/ 6965 if (cb_fn != NULL) { 6966 cb_fn(cb_arg, -ENOTSUP); 6967 } 6968 return; 6969 } 6970 6971 spdk_spin_lock(&g_bdev_mgr.spinlock); 6972 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 6973 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 6974 spdk_spin_unlock(&g_bdev_mgr.spinlock); 6975 if (cb_fn) { 6976 cb_fn(cb_arg, -EBUSY); 6977 } 6978 return; 6979 } 6980 6981 spdk_spin_lock(&bdev->internal.spinlock); 6982 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 6983 bdev->internal.unregister_cb = cb_fn; 6984 bdev->internal.unregister_ctx = cb_arg; 6985 spdk_spin_unlock(&bdev->internal.spinlock); 6986 spdk_spin_unlock(&g_bdev_mgr.spinlock); 6987 6988 spdk_bdev_set_qd_sampling_period(bdev, 0); 6989 6990 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 6991 bdev_unregister); 6992 } 6993 6994 int 6995 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 6996 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 6997 { 6998 struct spdk_bdev_desc *desc; 6999 struct spdk_bdev *bdev; 7000 int rc; 7001 7002 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 7003 if (rc != 0) { 7004 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 7005 return rc; 7006 } 7007 7008 bdev = spdk_bdev_desc_get_bdev(desc); 7009 7010 if (bdev->module != module) { 7011 spdk_bdev_close(desc); 7012 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 7013 bdev_name); 7014 return -ENODEV; 7015 } 7016 7017 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 7018 7019 spdk_bdev_close(desc); 7020 7021 return 0; 7022 } 7023 7024 static int 7025 bdev_start_qos(struct spdk_bdev *bdev) 7026 { 7027 struct set_qos_limit_ctx *ctx; 7028 7029 /* Enable QoS */ 7030 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 7031 ctx = calloc(1, sizeof(*ctx)); 7032 if (ctx == NULL) { 7033 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 7034 return -ENOMEM; 7035 } 7036 ctx->bdev = bdev; 7037 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 7038 } 7039 7040 return 0; 7041 } 7042 7043 static int 7044 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 7045 { 7046 struct spdk_thread *thread; 7047 int rc = 0; 7048 7049 thread = spdk_get_thread(); 7050 if (!thread) { 7051 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 7052 return -ENOTSUP; 7053 } 7054 7055 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7056 spdk_get_thread()); 7057 7058 desc->bdev = bdev; 7059 desc->thread = thread; 7060 desc->write = write; 7061 7062 spdk_spin_lock(&bdev->internal.spinlock); 7063 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7064 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7065 spdk_spin_unlock(&bdev->internal.spinlock); 7066 return -ENODEV; 7067 } 7068 7069 if (write && bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7070 SPDK_ERRLOG("Could not open %s - %s module already claimed it\n", 7071 bdev->name, bdev->internal.claim.v1.module->name); 7072 spdk_spin_unlock(&bdev->internal.spinlock); 7073 return -EPERM; 7074 } 7075 7076 rc = bdev_start_qos(bdev); 7077 if (rc != 0) { 7078 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 7079 spdk_spin_unlock(&bdev->internal.spinlock); 7080 return rc; 7081 } 7082 7083 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 7084 7085 spdk_spin_unlock(&bdev->internal.spinlock); 7086 7087 return 0; 7088 } 7089 7090 static int 7091 
bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 7092 struct spdk_bdev_desc **_desc) 7093 { 7094 struct spdk_bdev_desc *desc; 7095 unsigned int event_id; 7096 7097 desc = calloc(1, sizeof(*desc)); 7098 if (desc == NULL) { 7099 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 7100 return -ENOMEM; 7101 } 7102 7103 TAILQ_INIT(&desc->pending_media_events); 7104 TAILQ_INIT(&desc->free_media_events); 7105 7106 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 7107 desc->callback.event_fn = event_cb; 7108 desc->callback.ctx = event_ctx; 7109 spdk_spin_init(&desc->spinlock); 7110 7111 if (bdev->media_events) { 7112 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 7113 sizeof(*desc->media_events_buffer)); 7114 if (desc->media_events_buffer == NULL) { 7115 SPDK_ERRLOG("Failed to initialize media event pool\n"); 7116 bdev_desc_free(desc); 7117 return -ENOMEM; 7118 } 7119 7120 for (event_id = 0; event_id < MEDIA_EVENT_POOL_SIZE; ++event_id) { 7121 TAILQ_INSERT_TAIL(&desc->free_media_events, 7122 &desc->media_events_buffer[event_id], tailq); 7123 } 7124 } 7125 7126 *_desc = desc; 7127 7128 return 0; 7129 } 7130 7131 int 7132 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7133 void *event_ctx, struct spdk_bdev_desc **_desc) 7134 { 7135 struct spdk_bdev_desc *desc; 7136 struct spdk_bdev *bdev; 7137 int rc; 7138 7139 if (event_cb == NULL) { 7140 SPDK_ERRLOG("Missing event callback function\n"); 7141 return -EINVAL; 7142 } 7143 7144 spdk_spin_lock(&g_bdev_mgr.spinlock); 7145 7146 bdev = bdev_get_by_name(bdev_name); 7147 7148 if (bdev == NULL) { 7149 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 7150 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7151 return -ENODEV; 7152 } 7153 7154 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 7155 if (rc != 0) { 7156 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7157 return rc; 7158 } 7159 7160 rc = bdev_open(bdev, write, desc); 7161 if (rc != 0) { 7162 bdev_desc_free(desc); 7163 desc = NULL; 7164 } 7165 7166 *_desc = desc; 7167 7168 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7169 7170 return rc; 7171 } 7172 7173 static void 7174 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 7175 { 7176 int rc; 7177 7178 spdk_spin_lock(&bdev->internal.spinlock); 7179 spdk_spin_lock(&desc->spinlock); 7180 7181 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 7182 7183 desc->closed = true; 7184 7185 if (0 == desc->refs) { 7186 spdk_spin_unlock(&desc->spinlock); 7187 bdev_desc_free(desc); 7188 } else { 7189 spdk_spin_unlock(&desc->spinlock); 7190 } 7191 7192 /* If no more descriptors, kill QoS channel */ 7193 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7194 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 7195 bdev->name, spdk_get_thread()); 7196 7197 if (bdev_qos_destroy(bdev)) { 7198 /* There isn't anything we can do to recover here. Just let the 7199 * old QoS poller keep running. The QoS handling won't change 7200 * cores when the user allocates a new channel, but it won't break. */ 7201 SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); 7202 } 7203 } 7204 7205 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7206 rc = bdev_unregister_unsafe(bdev); 7207 spdk_spin_unlock(&bdev->internal.spinlock); 7208 7209 if (rc == 0) { 7210 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7211 } 7212 } else { 7213 spdk_spin_unlock(&bdev->internal.spinlock); 7214 } 7215 } 7216 7217 void 7218 spdk_bdev_close(struct spdk_bdev_desc *desc) 7219 { 7220 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7221 7222 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7223 spdk_get_thread()); 7224 7225 assert(desc->thread == spdk_get_thread()); 7226 7227 spdk_poller_unregister(&desc->io_timeout_poller); 7228 7229 spdk_spin_lock(&g_bdev_mgr.spinlock); 7230 7231 bdev_close(bdev, desc); 7232 7233 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7234 } 7235 7236 static void 7237 bdev_register_finished(void *arg) 7238 { 7239 struct spdk_bdev_desc *desc = arg; 7240 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7241 7242 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 7243 7244 spdk_spin_lock(&g_bdev_mgr.spinlock); 7245 7246 bdev_close(bdev, desc); 7247 7248 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7249 } 7250 7251 int 7252 spdk_bdev_register(struct spdk_bdev *bdev) 7253 { 7254 struct spdk_bdev_desc *desc; 7255 int rc; 7256 7257 if (spdk_unlikely(spdk_thread_get_app_thread() != spdk_get_thread())) { 7258 SPDK_LOG_DEPRECATED(bdev_register_examine_thread); 7259 } 7260 7261 rc = bdev_register(bdev); 7262 if (rc != 0) { 7263 return rc; 7264 } 7265 7266 /* A descriptor is opened to prevent bdev deletion during examination */ 7267 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7268 if (rc != 0) { 7269 spdk_bdev_unregister(bdev, NULL, NULL); 7270 return rc; 7271 } 7272 7273 rc = bdev_open(bdev, false, desc); 7274 if (rc != 0) { 7275 bdev_desc_free(desc); 7276 spdk_bdev_unregister(bdev, NULL, NULL); 7277 return rc; 7278 } 7279 7280 /* Examine configuration before initializing I/O */ 7281 bdev_examine(bdev); 7282 7283 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 7284 if (rc != 0) { 7285 bdev_close(bdev, desc); 7286 spdk_bdev_unregister(bdev, NULL, NULL); 7287 } 7288 7289 return rc; 7290 } 7291 7292 int 7293 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 7294 struct spdk_bdev_module *module) 7295 { 7296 spdk_spin_lock(&bdev->internal.spinlock); 7297 7298 if (bdev->internal.claim_type != SPDK_BDEV_CLAIM_NONE) { 7299 SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, 7300 bdev->internal.claim.v1.module->name); 7301 spdk_spin_unlock(&bdev->internal.spinlock); 7302 return -EPERM; 7303 } 7304 7305 if (desc && !desc->write) { 7306 desc->write = true; 7307 } 7308 7309 bdev->internal.claim_type = SPDK_BDEV_CLAIM_EXCL_WRITE; 7310 bdev->internal.claim.v1.module = module; 7311 7312 spdk_spin_unlock(&bdev->internal.spinlock); 7313 return 0; 7314 } 7315 7316 void 7317 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 7318 { 7319 spdk_spin_lock(&bdev->internal.spinlock); 7320 7321 assert(bdev->internal.claim.v1.module != NULL); 7322 assert(bdev->internal.claim_type == SPDK_BDEV_CLAIM_EXCL_WRITE); 7323 bdev->internal.claim_type = SPDK_BDEV_CLAIM_NONE; 7324 bdev->internal.claim.v1.module = NULL; 7325 7326 spdk_spin_unlock(&bdev->internal.spinlock); 7327 } 7328 7329 struct spdk_bdev * 7330 spdk_bdev_desc_get_bdev(struct 
spdk_bdev_desc *desc) 7331 { 7332 assert(desc != NULL); 7333 return desc->bdev; 7334 } 7335 7336 int 7337 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 7338 { 7339 struct spdk_bdev *bdev, *tmp; 7340 struct spdk_bdev_desc *desc; 7341 int rc = 0; 7342 7343 assert(fn != NULL); 7344 7345 spdk_spin_lock(&g_bdev_mgr.spinlock); 7346 bdev = spdk_bdev_first(); 7347 while (bdev != NULL) { 7348 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7349 if (rc != 0) { 7350 break; 7351 } 7352 rc = bdev_open(bdev, false, desc); 7353 if (rc != 0) { 7354 bdev_desc_free(desc); 7355 if (rc == -ENODEV) { 7356 /* Ignore the error and move to the next bdev. */ 7357 rc = 0; 7358 bdev = spdk_bdev_next(bdev); 7359 continue; 7360 } 7361 break; 7362 } 7363 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7364 7365 rc = fn(ctx, bdev); 7366 7367 spdk_spin_lock(&g_bdev_mgr.spinlock); 7368 tmp = spdk_bdev_next(bdev); 7369 bdev_close(bdev, desc); 7370 if (rc != 0) { 7371 break; 7372 } 7373 bdev = tmp; 7374 } 7375 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7376 7377 return rc; 7378 } 7379 7380 int 7381 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 7382 { 7383 struct spdk_bdev *bdev, *tmp; 7384 struct spdk_bdev_desc *desc; 7385 int rc = 0; 7386 7387 assert(fn != NULL); 7388 7389 spdk_spin_lock(&g_bdev_mgr.spinlock); 7390 bdev = spdk_bdev_first_leaf(); 7391 while (bdev != NULL) { 7392 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7393 if (rc != 0) { 7394 break; 7395 } 7396 rc = bdev_open(bdev, false, desc); 7397 if (rc != 0) { 7398 bdev_desc_free(desc); 7399 if (rc == -ENODEV) { 7400 /* Ignore the error and move to the next bdev. */ 7401 rc = 0; 7402 bdev = spdk_bdev_next_leaf(bdev); 7403 continue; 7404 } 7405 break; 7406 } 7407 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7408 7409 rc = fn(ctx, bdev); 7410 7411 spdk_spin_lock(&g_bdev_mgr.spinlock); 7412 tmp = spdk_bdev_next_leaf(bdev); 7413 bdev_close(bdev, desc); 7414 if (rc != 0) { 7415 break; 7416 } 7417 bdev = tmp; 7418 } 7419 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7420 7421 return rc; 7422 } 7423 7424 void 7425 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 7426 { 7427 struct iovec *iovs; 7428 int iovcnt; 7429 7430 if (bdev_io == NULL) { 7431 return; 7432 } 7433 7434 switch (bdev_io->type) { 7435 case SPDK_BDEV_IO_TYPE_READ: 7436 case SPDK_BDEV_IO_TYPE_WRITE: 7437 case SPDK_BDEV_IO_TYPE_ZCOPY: 7438 iovs = bdev_io->u.bdev.iovs; 7439 iovcnt = bdev_io->u.bdev.iovcnt; 7440 break; 7441 default: 7442 iovs = NULL; 7443 iovcnt = 0; 7444 break; 7445 } 7446 7447 if (iovp) { 7448 *iovp = iovs; 7449 } 7450 if (iovcntp) { 7451 *iovcntp = iovcnt; 7452 } 7453 } 7454 7455 void * 7456 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 7457 { 7458 if (bdev_io == NULL) { 7459 return NULL; 7460 } 7461 7462 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 7463 return NULL; 7464 } 7465 7466 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 7467 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 7468 return bdev_io->u.bdev.md_buf; 7469 } 7470 7471 return NULL; 7472 } 7473 7474 void * 7475 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 7476 { 7477 if (bdev_io == NULL) { 7478 assert(false); 7479 return NULL; 7480 } 7481 7482 return bdev_io->internal.caller_ctx; 7483 } 7484 7485 void 7486 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 7487 { 7488 7489 if (spdk_bdev_module_list_find(bdev_module->name)) { 7490 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 7491 assert(false); 
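		/* There is no early return here, so with asserts compiled out (NDEBUG) a
		 * duplicate name would still be linked into the module list below. Modules
		 * are expected to register themselves exactly once, normally via the
		 * SPDK_BDEV_MODULE_REGISTER() constructor macro. */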
7492 } 7493 7494 spdk_spin_init(&bdev_module->internal.spinlock); 7495 7496 /* 7497 * Modules with examine callbacks must be initialized first, so they are 7498 * ready to handle examine callbacks from later modules that will 7499 * register physical bdevs. 7500 */ 7501 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 7502 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 7503 } else { 7504 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 7505 } 7506 } 7507 7508 struct spdk_bdev_module * 7509 spdk_bdev_module_list_find(const char *name) 7510 { 7511 struct spdk_bdev_module *bdev_module; 7512 7513 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 7514 if (strcmp(name, bdev_module->name) == 0) { 7515 break; 7516 } 7517 } 7518 7519 return bdev_module; 7520 } 7521 7522 static void 7523 bdev_write_zero_buffer_next(void *_bdev_io) 7524 { 7525 struct spdk_bdev_io *bdev_io = _bdev_io; 7526 uint64_t num_bytes, num_blocks; 7527 void *md_buf = NULL; 7528 int rc; 7529 7530 num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * 7531 bdev_io->u.bdev.split_remaining_num_blocks, 7532 ZERO_BUFFER_SIZE); 7533 num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); 7534 num_blocks -= num_blocks % bdev_io->bdev->write_unit_size; 7535 7536 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 7537 md_buf = (char *)g_bdev_mgr.zero_buffer + 7538 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 7539 } 7540 7541 rc = bdev_write_blocks_with_md(bdev_io->internal.desc, 7542 spdk_io_channel_from_ctx(bdev_io->internal.ch), 7543 g_bdev_mgr.zero_buffer, md_buf, 7544 bdev_io->u.bdev.split_current_offset_blocks, num_blocks, 7545 bdev_write_zero_buffer_done, bdev_io); 7546 if (rc == 0) { 7547 bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; 7548 bdev_io->u.bdev.split_current_offset_blocks += num_blocks; 7549 } else if (rc == -ENOMEM) { 7550 bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); 7551 } else { 7552 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7553 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 7554 } 7555 } 7556 7557 static void 7558 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 7559 { 7560 struct spdk_bdev_io *parent_io = cb_arg; 7561 7562 spdk_bdev_free_io(bdev_io); 7563 7564 if (!success) { 7565 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7566 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 7567 return; 7568 } 7569 7570 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 7571 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 7572 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 7573 return; 7574 } 7575 7576 bdev_write_zero_buffer_next(parent_io); 7577 } 7578 7579 static void 7580 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 7581 { 7582 spdk_spin_lock(&ctx->bdev->internal.spinlock); 7583 ctx->bdev->internal.qos_mod_in_progress = false; 7584 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 7585 7586 if (ctx->cb_fn) { 7587 ctx->cb_fn(ctx->cb_arg, status); 7588 } 7589 free(ctx); 7590 } 7591 7592 static void 7593 bdev_disable_qos_done(void *cb_arg) 7594 { 7595 struct set_qos_limit_ctx *ctx = cb_arg; 7596 struct spdk_bdev *bdev = ctx->bdev; 7597 struct spdk_bdev_io *bdev_io; 7598 struct spdk_bdev_qos *qos; 7599 7600 spdk_spin_lock(&bdev->internal.spinlock); 7601 qos = bdev->internal.qos; 7602 
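	/* Detach the QoS object from the bdev while holding the spinlock so that no
	 * new I/O is routed through it; the I/O already queued on the QoS object is
	 * drained below and sent back to the threads that originally submitted it. */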
bdev->internal.qos = NULL; 7603 spdk_spin_unlock(&bdev->internal.spinlock); 7604 7605 while (!TAILQ_EMPTY(&qos->queued)) { 7606 /* Send queued I/O back to their original thread for resubmission. */ 7607 bdev_io = TAILQ_FIRST(&qos->queued); 7608 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 7609 7610 if (bdev_io->internal.io_submit_ch) { 7611 /* 7612 * Channel was changed when sending it to the QoS thread - change it back 7613 * before sending it back to the original thread. 7614 */ 7615 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 7616 bdev_io->internal.io_submit_ch = NULL; 7617 } 7618 7619 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7620 _bdev_io_submit, bdev_io); 7621 } 7622 7623 if (qos->thread != NULL) { 7624 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 7625 spdk_poller_unregister(&qos->poller); 7626 } 7627 7628 free(qos); 7629 7630 bdev_set_qos_limit_done(ctx, 0); 7631 } 7632 7633 static void 7634 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 7635 { 7636 struct set_qos_limit_ctx *ctx = _ctx; 7637 struct spdk_thread *thread; 7638 7639 spdk_spin_lock(&bdev->internal.spinlock); 7640 thread = bdev->internal.qos->thread; 7641 spdk_spin_unlock(&bdev->internal.spinlock); 7642 7643 if (thread != NULL) { 7644 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 7645 } else { 7646 bdev_disable_qos_done(ctx); 7647 } 7648 } 7649 7650 static void 7651 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7652 struct spdk_io_channel *ch, void *_ctx) 7653 { 7654 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 7655 7656 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 7657 7658 spdk_bdev_for_each_channel_continue(i, 0); 7659 } 7660 7661 static void 7662 bdev_update_qos_rate_limit_msg(void *cb_arg) 7663 { 7664 struct set_qos_limit_ctx *ctx = cb_arg; 7665 struct spdk_bdev *bdev = ctx->bdev; 7666 7667 spdk_spin_lock(&bdev->internal.spinlock); 7668 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 7669 spdk_spin_unlock(&bdev->internal.spinlock); 7670 7671 bdev_set_qos_limit_done(ctx, 0); 7672 } 7673 7674 static void 7675 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7676 struct spdk_io_channel *ch, void *_ctx) 7677 { 7678 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 7679 7680 spdk_spin_lock(&bdev->internal.spinlock); 7681 bdev_enable_qos(bdev, bdev_ch); 7682 spdk_spin_unlock(&bdev->internal.spinlock); 7683 spdk_bdev_for_each_channel_continue(i, 0); 7684 } 7685 7686 static void 7687 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 7688 { 7689 struct set_qos_limit_ctx *ctx = _ctx; 7690 7691 bdev_set_qos_limit_done(ctx, status); 7692 } 7693 7694 static void 7695 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 7696 { 7697 int i; 7698 7699 assert(bdev->internal.qos != NULL); 7700 7701 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 7702 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 7703 bdev->internal.qos->rate_limits[i].limit = limits[i]; 7704 7705 if (limits[i] == 0) { 7706 bdev->internal.qos->rate_limits[i].limit = 7707 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 7708 } 7709 } 7710 } 7711 } 7712 7713 void 7714 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 7715 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 7716 { 7717 struct set_qos_limit_ctx *ctx; 7718 uint32_t limit_set_complement; 7719 uint64_t min_limit_per_sec; 7720 int i; 7721 bool disable_rate_limit = true; 7722 7723 
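	/* Normalize the requested limits before applying them: bandwidth limits are
	 * supplied in MB/s and converted to bytes/s here, and any limit that is not a
	 * multiple of the per-type minimum rate is rounded up to the next multiple
	 * (an error message is logged when rounding occurs). */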
for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 7724 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 7725 continue; 7726 } 7727 7728 if (limits[i] > 0) { 7729 disable_rate_limit = false; 7730 } 7731 7732 if (bdev_qos_is_iops_rate_limit(i) == true) { 7733 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 7734 } else { 7735 /* Change from megabyte to byte rate limit */ 7736 limits[i] = limits[i] * 1024 * 1024; 7737 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 7738 } 7739 7740 limit_set_complement = limits[i] % min_limit_per_sec; 7741 if (limit_set_complement) { 7742 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 7743 limits[i], min_limit_per_sec); 7744 limits[i] += min_limit_per_sec - limit_set_complement; 7745 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 7746 } 7747 } 7748 7749 ctx = calloc(1, sizeof(*ctx)); 7750 if (ctx == NULL) { 7751 cb_fn(cb_arg, -ENOMEM); 7752 return; 7753 } 7754 7755 ctx->cb_fn = cb_fn; 7756 ctx->cb_arg = cb_arg; 7757 ctx->bdev = bdev; 7758 7759 spdk_spin_lock(&bdev->internal.spinlock); 7760 if (bdev->internal.qos_mod_in_progress) { 7761 spdk_spin_unlock(&bdev->internal.spinlock); 7762 free(ctx); 7763 cb_fn(cb_arg, -EAGAIN); 7764 return; 7765 } 7766 bdev->internal.qos_mod_in_progress = true; 7767 7768 if (disable_rate_limit == true && bdev->internal.qos) { 7769 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 7770 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 7771 (bdev->internal.qos->rate_limits[i].limit > 0 && 7772 bdev->internal.qos->rate_limits[i].limit != 7773 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 7774 disable_rate_limit = false; 7775 break; 7776 } 7777 } 7778 } 7779 7780 if (disable_rate_limit == false) { 7781 if (bdev->internal.qos == NULL) { 7782 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 7783 if (!bdev->internal.qos) { 7784 spdk_spin_unlock(&bdev->internal.spinlock); 7785 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 7786 bdev_set_qos_limit_done(ctx, -ENOMEM); 7787 return; 7788 } 7789 } 7790 7791 if (bdev->internal.qos->thread == NULL) { 7792 /* Enabling */ 7793 bdev_set_qos_rate_limits(bdev, limits); 7794 7795 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 7796 bdev_enable_qos_done); 7797 } else { 7798 /* Updating */ 7799 bdev_set_qos_rate_limits(bdev, limits); 7800 7801 spdk_thread_send_msg(bdev->internal.qos->thread, 7802 bdev_update_qos_rate_limit_msg, ctx); 7803 } 7804 } else { 7805 if (bdev->internal.qos != NULL) { 7806 bdev_set_qos_rate_limits(bdev, limits); 7807 7808 /* Disabling */ 7809 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 7810 bdev_disable_qos_msg_done); 7811 } else { 7812 spdk_spin_unlock(&bdev->internal.spinlock); 7813 bdev_set_qos_limit_done(ctx, 0); 7814 return; 7815 } 7816 } 7817 7818 spdk_spin_unlock(&bdev->internal.spinlock); 7819 } 7820 7821 struct spdk_bdev_histogram_ctx { 7822 spdk_bdev_histogram_status_cb cb_fn; 7823 void *cb_arg; 7824 struct spdk_bdev *bdev; 7825 int status; 7826 }; 7827 7828 static void 7829 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 7830 { 7831 struct spdk_bdev_histogram_ctx *ctx = _ctx; 7832 7833 spdk_spin_lock(&ctx->bdev->internal.spinlock); 7834 ctx->bdev->internal.histogram_in_progress = false; 7835 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 7836 ctx->cb_fn(ctx->cb_arg, ctx->status); 7837 free(ctx); 7838 } 7839 7840 static void 7841 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct 
spdk_bdev *bdev, 7842 struct spdk_io_channel *_ch, void *_ctx) 7843 { 7844 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7845 7846 if (ch->histogram != NULL) { 7847 spdk_histogram_data_free(ch->histogram); 7848 ch->histogram = NULL; 7849 } 7850 spdk_bdev_for_each_channel_continue(i, 0); 7851 } 7852 7853 static void 7854 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 7855 { 7856 struct spdk_bdev_histogram_ctx *ctx = _ctx; 7857 7858 if (status != 0) { 7859 ctx->status = status; 7860 ctx->bdev->internal.histogram_enabled = false; 7861 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 7862 bdev_histogram_disable_channel_cb); 7863 } else { 7864 spdk_spin_lock(&ctx->bdev->internal.spinlock); 7865 ctx->bdev->internal.histogram_in_progress = false; 7866 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 7867 ctx->cb_fn(ctx->cb_arg, ctx->status); 7868 free(ctx); 7869 } 7870 } 7871 7872 static void 7873 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7874 struct spdk_io_channel *_ch, void *_ctx) 7875 { 7876 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7877 int status = 0; 7878 7879 if (ch->histogram == NULL) { 7880 ch->histogram = spdk_histogram_data_alloc(); 7881 if (ch->histogram == NULL) { 7882 status = -ENOMEM; 7883 } 7884 } 7885 7886 spdk_bdev_for_each_channel_continue(i, status); 7887 } 7888 7889 void 7890 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 7891 void *cb_arg, bool enable) 7892 { 7893 struct spdk_bdev_histogram_ctx *ctx; 7894 7895 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 7896 if (ctx == NULL) { 7897 cb_fn(cb_arg, -ENOMEM); 7898 return; 7899 } 7900 7901 ctx->bdev = bdev; 7902 ctx->status = 0; 7903 ctx->cb_fn = cb_fn; 7904 ctx->cb_arg = cb_arg; 7905 7906 spdk_spin_lock(&bdev->internal.spinlock); 7907 if (bdev->internal.histogram_in_progress) { 7908 spdk_spin_unlock(&bdev->internal.spinlock); 7909 free(ctx); 7910 cb_fn(cb_arg, -EAGAIN); 7911 return; 7912 } 7913 7914 bdev->internal.histogram_in_progress = true; 7915 spdk_spin_unlock(&bdev->internal.spinlock); 7916 7917 bdev->internal.histogram_enabled = enable; 7918 7919 if (enable) { 7920 /* Allocate histogram for each channel */ 7921 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 7922 bdev_histogram_enable_channel_cb); 7923 } else { 7924 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 7925 bdev_histogram_disable_channel_cb); 7926 } 7927 } 7928 7929 struct spdk_bdev_histogram_data_ctx { 7930 spdk_bdev_histogram_data_cb cb_fn; 7931 void *cb_arg; 7932 struct spdk_bdev *bdev; 7933 /** merged histogram data from all channels */ 7934 struct spdk_histogram_data *histogram; 7935 }; 7936 7937 static void 7938 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 7939 { 7940 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 7941 7942 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 7943 free(ctx); 7944 } 7945 7946 static void 7947 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7948 struct spdk_io_channel *_ch, void *_ctx) 7949 { 7950 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7951 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 7952 int status = 0; 7953 7954 if (ch->histogram == NULL) { 7955 status = -EFAULT; 7956 } else { 7957 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 7958 } 7959 7960 spdk_bdev_for_each_channel_continue(i, status); 7961 } 
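/*
 * Merge the per-channel histograms of a bdev into the caller-provided
 * 'histogram' and invoke cb_fn with the merged result once every channel has
 * been visited.  A channel without an allocated histogram (e.g. histograms were
 * never enabled) reports -EFAULT.
 *
 * Illustrative call sequence (a sketch only; the histogram allocation and the
 * completion callback are the caller's responsibility, not part of this file):
 *
 *	struct spdk_histogram_data *h = spdk_histogram_data_alloc();
 *
 *	spdk_bdev_histogram_get(bdev, h, histogram_done_cb, cb_arg);
 */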
7962 7963 void 7964 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 7965 spdk_bdev_histogram_data_cb cb_fn, 7966 void *cb_arg) 7967 { 7968 struct spdk_bdev_histogram_data_ctx *ctx; 7969 7970 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 7971 if (ctx == NULL) { 7972 cb_fn(cb_arg, -ENOMEM, NULL); 7973 return; 7974 } 7975 7976 ctx->bdev = bdev; 7977 ctx->cb_fn = cb_fn; 7978 ctx->cb_arg = cb_arg; 7979 7980 ctx->histogram = histogram; 7981 7982 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 7983 bdev_histogram_get_channel_cb); 7984 } 7985 7986 void 7987 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 7988 void *cb_arg) 7989 { 7990 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 7991 int status = 0; 7992 7993 assert(cb_fn != NULL); 7994 7995 if (bdev_ch->histogram == NULL) { 7996 status = -EFAULT; 7997 } 7998 cb_fn(cb_arg, status, bdev_ch->histogram); 7999 } 8000 8001 size_t 8002 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 8003 size_t max_events) 8004 { 8005 struct media_event_entry *entry; 8006 size_t num_events = 0; 8007 8008 for (; num_events < max_events; ++num_events) { 8009 entry = TAILQ_FIRST(&desc->pending_media_events); 8010 if (entry == NULL) { 8011 break; 8012 } 8013 8014 events[num_events] = entry->event; 8015 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 8016 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 8017 } 8018 8019 return num_events; 8020 } 8021 8022 int 8023 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 8024 size_t num_events) 8025 { 8026 struct spdk_bdev_desc *desc; 8027 struct media_event_entry *entry; 8028 size_t event_id; 8029 int rc = 0; 8030 8031 assert(bdev->media_events); 8032 8033 spdk_spin_lock(&bdev->internal.spinlock); 8034 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 8035 if (desc->write) { 8036 break; 8037 } 8038 } 8039 8040 if (desc == NULL || desc->media_events_buffer == NULL) { 8041 rc = -ENODEV; 8042 goto out; 8043 } 8044 8045 for (event_id = 0; event_id < num_events; ++event_id) { 8046 entry = TAILQ_FIRST(&desc->free_media_events); 8047 if (entry == NULL) { 8048 break; 8049 } 8050 8051 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 8052 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 8053 entry->event = events[event_id]; 8054 } 8055 8056 rc = event_id; 8057 out: 8058 spdk_spin_unlock(&bdev->internal.spinlock); 8059 return rc; 8060 } 8061 8062 void 8063 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 8064 { 8065 struct spdk_bdev_desc *desc; 8066 8067 spdk_spin_lock(&bdev->internal.spinlock); 8068 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 8069 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 8070 desc->callback.event_fn(SPDK_BDEV_EVENT_MEDIA_MANAGEMENT, bdev, 8071 desc->callback.ctx); 8072 } 8073 } 8074 spdk_spin_unlock(&bdev->internal.spinlock); 8075 } 8076 8077 struct locked_lba_range_ctx { 8078 struct lba_range range; 8079 struct spdk_bdev *bdev; 8080 struct lba_range *current_range; 8081 struct lba_range *owner_range; 8082 struct spdk_poller *poller; 8083 lock_range_cb cb_fn; 8084 void *cb_arg; 8085 }; 8086 8087 static void 8088 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8089 { 8090 struct locked_lba_range_ctx *ctx = _ctx; 8091 8092 ctx->cb_fn(ctx->cb_arg, -ENOMEM); 8093 free(ctx); 8094 } 8095 8096 static void 
bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 8097 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 8098 8099 static void 8100 bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8101 { 8102 struct locked_lba_range_ctx *ctx = _ctx; 8103 8104 if (status == -ENOMEM) { 8105 /* One of the channels could not allocate a range object. 8106 * So we have to go back and clean up any ranges that were 8107 * allocated successfully before we return error status to 8108 * the caller. We can reuse the unlock function to do that 8109 * clean up. 8110 */ 8111 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 8112 bdev_lock_error_cleanup_cb); 8113 return; 8114 } 8115 8116 /* All channels have locked this range and no I/O overlapping the range 8117 * are outstanding! Set the owner_ch for the range object for the 8118 * locking channel, so that this channel will know that it is allowed 8119 * to write to this range. 8120 */ 8121 ctx->owner_range->owner_ch = ctx->range.owner_ch; 8122 ctx->cb_fn(ctx->cb_arg, status); 8123 8124 /* Don't free the ctx here. Its range is in the bdev's global list of 8125 * locked ranges still, and will be removed and freed when this range 8126 * is later unlocked. 8127 */ 8128 } 8129 8130 static int 8131 bdev_lock_lba_range_check_io(void *_i) 8132 { 8133 struct spdk_bdev_channel_iter *i = _i; 8134 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 8135 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8136 struct locked_lba_range_ctx *ctx = i->ctx; 8137 struct lba_range *range = ctx->current_range; 8138 struct spdk_bdev_io *bdev_io; 8139 8140 spdk_poller_unregister(&ctx->poller); 8141 8142 /* The range is now in the locked_ranges, so no new IO can be submitted to this 8143 * range. But we need to wait until any outstanding IO overlapping with this range 8144 * are completed. 8145 */ 8146 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 8147 if (bdev_io_range_is_locked(bdev_io, range)) { 8148 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 8149 return SPDK_POLLER_BUSY; 8150 } 8151 } 8152 8153 spdk_bdev_for_each_channel_continue(i, 0); 8154 return SPDK_POLLER_BUSY; 8155 } 8156 8157 static void 8158 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8159 struct spdk_io_channel *_ch, void *_ctx) 8160 { 8161 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8162 struct locked_lba_range_ctx *ctx = _ctx; 8163 struct lba_range *range; 8164 8165 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 8166 if (range->length == ctx->range.length && 8167 range->offset == ctx->range.offset && 8168 range->locked_ctx == ctx->range.locked_ctx) { 8169 /* This range already exists on this channel, so don't add 8170 * it again. This can happen when a new channel is created 8171 * while the for_each_channel operation is in progress. 8172 * Do not check for outstanding I/O in that case, since the 8173 * range was locked before any I/O could be submitted to the 8174 * new channel. 
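 * (A range is treated as the same lock only when its offset, length and
 * locked_ctx all match the range being taken.)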
8175 */ 8176 spdk_bdev_for_each_channel_continue(i, 0); 8177 return; 8178 } 8179 } 8180 8181 range = calloc(1, sizeof(*range)); 8182 if (range == NULL) { 8183 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 8184 return; 8185 } 8186 8187 range->length = ctx->range.length; 8188 range->offset = ctx->range.offset; 8189 range->locked_ctx = ctx->range.locked_ctx; 8190 ctx->current_range = range; 8191 if (ctx->range.owner_ch == ch) { 8192 /* This is the range object for the channel that will hold 8193 * the lock. Store it in the ctx object so that we can easily 8194 * set its owner_ch after the lock is finally acquired. 8195 */ 8196 ctx->owner_range = range; 8197 } 8198 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 8199 bdev_lock_lba_range_check_io(i); 8200 } 8201 8202 static void 8203 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 8204 { 8205 assert(spdk_get_thread() == spdk_io_channel_get_thread(ctx->range.owner_ch->channel)); 8206 8207 /* We will add a copy of this range to each channel now. */ 8208 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 8209 bdev_lock_lba_range_cb); 8210 } 8211 8212 static bool 8213 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 8214 { 8215 struct lba_range *r; 8216 8217 TAILQ_FOREACH(r, tailq, tailq) { 8218 if (bdev_lba_range_overlapped(range, r)) { 8219 return true; 8220 } 8221 } 8222 return false; 8223 } 8224 8225 static int 8226 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 8227 uint64_t offset, uint64_t length, 8228 lock_range_cb cb_fn, void *cb_arg) 8229 { 8230 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8231 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8232 struct locked_lba_range_ctx *ctx; 8233 8234 if (cb_arg == NULL) { 8235 SPDK_ERRLOG("cb_arg must not be NULL\n"); 8236 return -EINVAL; 8237 } 8238 8239 ctx = calloc(1, sizeof(*ctx)); 8240 if (ctx == NULL) { 8241 return -ENOMEM; 8242 } 8243 8244 ctx->range.offset = offset; 8245 ctx->range.length = length; 8246 ctx->range.owner_ch = ch; 8247 ctx->range.locked_ctx = cb_arg; 8248 ctx->bdev = bdev; 8249 ctx->cb_fn = cb_fn; 8250 ctx->cb_arg = cb_arg; 8251 8252 spdk_spin_lock(&bdev->internal.spinlock); 8253 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 8254 /* There is an active lock overlapping with this range. 8255 * Put it on the pending list until this range no 8256 * longer overlaps with another. 8257 */ 8258 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 8259 } else { 8260 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 8261 bdev_lock_lba_range_ctx(bdev, ctx); 8262 } 8263 spdk_spin_unlock(&bdev->internal.spinlock); 8264 return 0; 8265 } 8266 8267 static void 8268 bdev_lock_lba_range_ctx_msg(void *_ctx) 8269 { 8270 struct locked_lba_range_ctx *ctx = _ctx; 8271 8272 bdev_lock_lba_range_ctx(ctx->bdev, ctx); 8273 } 8274 8275 static void 8276 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8277 { 8278 struct locked_lba_range_ctx *ctx = _ctx; 8279 struct locked_lba_range_ctx *pending_ctx; 8280 struct lba_range *range, *tmp; 8281 8282 spdk_spin_lock(&bdev->internal.spinlock); 8283 /* Check if there are any pending locked ranges that overlap with this range 8284 * that was just unlocked. If there are, check that it doesn't overlap with any 8285 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 8286 * the lock process. 
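 * A pending range that no longer conflicts is moved onto locked_ranges here and
 * its lock sequence is started on the thread that owns that range's channel.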
8287 */ 8288 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 8289 if (bdev_lba_range_overlapped(range, &ctx->range) && 8290 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 8291 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 8292 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 8293 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 8294 spdk_thread_send_msg(spdk_io_channel_get_thread(pending_ctx->range.owner_ch->channel), 8295 bdev_lock_lba_range_ctx_msg, pending_ctx); 8296 } 8297 } 8298 spdk_spin_unlock(&bdev->internal.spinlock); 8299 8300 ctx->cb_fn(ctx->cb_arg, status); 8301 free(ctx); 8302 } 8303 8304 static void 8305 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8306 struct spdk_io_channel *_ch, void *_ctx) 8307 { 8308 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8309 struct locked_lba_range_ctx *ctx = _ctx; 8310 TAILQ_HEAD(, spdk_bdev_io) io_locked; 8311 struct spdk_bdev_io *bdev_io; 8312 struct lba_range *range; 8313 8314 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 8315 if (ctx->range.offset == range->offset && 8316 ctx->range.length == range->length && 8317 ctx->range.locked_ctx == range->locked_ctx) { 8318 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 8319 free(range); 8320 break; 8321 } 8322 } 8323 8324 /* Note: we should almost always be able to assert that the range specified 8325 * was found. But there are some very rare corner cases where a new channel 8326 * gets created simultaneously with a range unlock, where this function 8327 * would execute on that new channel and wouldn't have the range. 8328 * We also use this to clean up range allocations when a later allocation 8329 * fails in the locking path. 8330 * So we can't actually assert() here. 8331 */ 8332 8333 /* Swap the locked IO into a temporary list, and then try to submit them again. 8334 * We could hyper-optimize this to only resubmit locked I/O that overlap 8335 * with the range that was just unlocked, but this isn't a performance path so 8336 * we go for simplicity here. 8337 */ 8338 TAILQ_INIT(&io_locked); 8339 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 8340 while (!TAILQ_EMPTY(&io_locked)) { 8341 bdev_io = TAILQ_FIRST(&io_locked); 8342 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 8343 bdev_io_submit(bdev_io); 8344 } 8345 8346 spdk_bdev_for_each_channel_continue(i, 0); 8347 } 8348 8349 static int 8350 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 8351 uint64_t offset, uint64_t length, 8352 lock_range_cb cb_fn, void *cb_arg) 8353 { 8354 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8355 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8356 struct locked_lba_range_ctx *ctx; 8357 struct lba_range *range; 8358 bool range_found = false; 8359 8360 /* Let's make sure the specified channel actually has a lock on 8361 * the specified range. Note that the range must match exactly. 8362 */ 8363 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 8364 if (range->offset == offset && range->length == length && 8365 range->owner_ch == ch && range->locked_ctx == cb_arg) { 8366 range_found = true; 8367 break; 8368 } 8369 } 8370 8371 if (!range_found) { 8372 return -EINVAL; 8373 } 8374 8375 spdk_spin_lock(&bdev->internal.spinlock); 8376 /* We confirmed that this channel has locked the specified range. 
To 8377 * start the unlock process, we find the range in the bdev's locked_ranges 8378 * and remove it. This ensures new channels don't inherit the locked range. 8379 * Then we will send a message to each channel (including the one specified 8380 * here) to remove the range from its per-channel list. 8381 */ 8382 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 8383 if (range->offset == offset && range->length == length && 8384 range->locked_ctx == cb_arg) { 8385 break; 8386 } 8387 } 8388 if (range == NULL) { 8389 assert(false); 8390 spdk_spin_unlock(&bdev->internal.spinlock); 8391 return -EINVAL; 8392 } 8393 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 8394 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 8395 spdk_spin_unlock(&bdev->internal.spinlock); 8396 8397 ctx->cb_fn = cb_fn; 8398 ctx->cb_arg = cb_arg; 8399 8400 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 8401 bdev_unlock_lba_range_cb); 8402 return 0; 8403 } 8404 8405 int 8406 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 8407 int array_size) 8408 { 8409 if (!bdev) { 8410 return -EINVAL; 8411 } 8412 8413 if (bdev->fn_table->get_memory_domains) { 8414 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 8415 } 8416 8417 return 0; 8418 } 8419 8420 struct spdk_bdev_for_each_io_ctx { 8421 void *ctx; 8422 spdk_bdev_io_fn fn; 8423 spdk_bdev_for_each_io_cb cb; 8424 }; 8425 8426 static void 8427 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8428 struct spdk_io_channel *io_ch, void *_ctx) 8429 { 8430 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 8431 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 8432 struct spdk_bdev_io *bdev_io; 8433 int rc = 0; 8434 8435 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 8436 rc = ctx->fn(ctx->ctx, bdev_io); 8437 if (rc != 0) { 8438 break; 8439 } 8440 } 8441 8442 spdk_bdev_for_each_channel_continue(i, rc); 8443 } 8444 8445 static void 8446 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 8447 { 8448 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 8449 8450 ctx->cb(ctx->ctx, status); 8451 8452 free(ctx); 8453 } 8454 8455 void 8456 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 8457 spdk_bdev_for_each_io_cb cb) 8458 { 8459 struct spdk_bdev_for_each_io_ctx *ctx; 8460 8461 assert(fn != NULL && cb != NULL); 8462 8463 ctx = calloc(1, sizeof(*ctx)); 8464 if (ctx == NULL) { 8465 SPDK_ERRLOG("Failed to allocate context.\n"); 8466 cb(_ctx, -ENOMEM); 8467 return; 8468 } 8469 8470 ctx->ctx = _ctx; 8471 ctx->fn = fn; 8472 ctx->cb = cb; 8473 8474 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 8475 bdev_for_each_io_done); 8476 } 8477 8478 void 8479 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 8480 { 8481 spdk_for_each_channel_continue(iter->i, status); 8482 } 8483 8484 static struct spdk_bdev * 8485 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 8486 { 8487 void *io_device = spdk_io_channel_iter_get_io_device(i); 8488 8489 return __bdev_from_io_dev(io_device); 8490 } 8491 8492 static void 8493 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 8494 { 8495 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 8496 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 8497 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 8498 8499 iter->i = i; 8500
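	/* Record the underlying io_channel iterator so the per-channel callback can
	 * resume or abort the walk via spdk_bdev_for_each_channel_continue(). */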
iter->fn(iter, bdev, ch, iter->ctx); 8501 } 8502 8503 static void 8504 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 8505 { 8506 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 8507 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 8508 8509 iter->i = i; 8510 iter->cpl(bdev, iter->ctx, status); 8511 8512 free(iter); 8513 } 8514 8515 void 8516 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 8517 void *ctx, spdk_bdev_for_each_channel_done cpl) 8518 { 8519 struct spdk_bdev_channel_iter *iter; 8520 8521 assert(bdev != NULL && fn != NULL && ctx != NULL); 8522 8523 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 8524 if (iter == NULL) { 8525 SPDK_ERRLOG("Unable to allocate iterator\n"); 8526 assert(false); 8527 return; 8528 } 8529 8530 iter->fn = fn; 8531 iter->cpl = cpl; 8532 iter->ctx = ctx; 8533 8534 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 8535 iter, bdev_each_channel_cpl); 8536 } 8537 8538 int 8539 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 8540 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 8541 spdk_bdev_io_completion_cb cb, void *cb_arg) 8542 { 8543 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8544 struct spdk_bdev_io *bdev_io; 8545 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 8546 8547 if (!desc->write) { 8548 return -EBADF; 8549 } 8550 8551 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY))) { 8552 SPDK_DEBUGLOG(bdev, "Copy IO type is not supported\n"); 8553 return -ENOTSUP; 8554 } 8555 8556 if (num_blocks == 0) { 8557 SPDK_ERRLOG("Can't copy 0 blocks\n"); 8558 return -EINVAL; 8559 } 8560 8561 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 8562 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 8563 SPDK_DEBUGLOG(bdev, 8564 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 8565 dst_offset_blocks, src_offset_blocks, num_blocks); 8566 return -EINVAL; 8567 } 8568 8569 bdev_io = bdev_channel_get_io(channel); 8570 if (!bdev_io) { 8571 return -ENOMEM; 8572 } 8573 8574 bdev_io->internal.ch = channel; 8575 bdev_io->internal.desc = desc; 8576 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 8577 8578 bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 8579 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 8580 bdev_io->u.bdev.num_blocks = num_blocks; 8581 bdev_io->u.bdev.ext_opts = NULL; 8582 bdev_io_init(bdev_io, bdev, cb_arg, cb); 8583 8584 bdev_io_submit(bdev_io); 8585 return 0; 8586 } 8587 8588 SPDK_LOG_REGISTER_COMPONENT(bdev) 8589 8590 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 8591 { 8592 struct spdk_trace_tpoint_opts opts[] = { 8593 { 8594 "BDEV_IO_START", TRACE_BDEV_IO_START, 8595 OWNER_BDEV, OBJECT_BDEV_IO, 1, 8596 { 8597 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 8598 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 8599 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 8600 { "len", SPDK_TRACE_ARG_TYPE_INT, 8 }, 8601 { "name", SPDK_TRACE_ARG_TYPE_STR, 40} 8602 } 8603 }, 8604 { 8605 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 8606 OWNER_BDEV, OBJECT_BDEV_IO, 0, 8607 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 8608 }, 8609 { 8610 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 8611 OWNER_BDEV, OBJECT_NONE, 1, 8612 { 8613 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 8614 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 8615 } 8616 }, 8617 { 8618 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 8619 OWNER_BDEV, OBJECT_NONE, 0, 8620 { 
8621 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 8622 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 8623 } 8624 }, 8625 }; 8626 8627 8628 spdk_trace_register_owner(OWNER_BDEV, 'b'); 8629 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 8630 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 8631 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 8632 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 8633 } 8634
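/*
 * Illustrative use of spdk_bdev_copy_blocks() from a thread that already holds
 * an open descriptor and its I/O channel.  This is a sketch only: 'desc', 'ch',
 * 'copy_done_cb' and the offsets are assumed to be supplied by the caller and
 * are not defined in this file.
 *
 *	rc = spdk_bdev_copy_blocks(desc, ch, dst_offset_blocks, src_offset_blocks,
 *				   num_blocks, copy_done_cb, cb_arg);
 *
 * A return value of -ENOMEM indicates that no spdk_bdev_io was available; the
 * caller can retry later, for example after queueing a wait entry with
 * spdk_bdev_queue_io_wait().
 */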