/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (C) 2016 Intel Corporation. All rights reserved.
 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 * Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"
#include "spdk/dma.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define SPDK_BDEV_AUTO_EXAMINE			true
#define BUF_SMALL_POOL_SIZE			8191
#define BUF_LARGE_POOL_SIZE			1023
#define BUF_SMALL_CACHE_SIZE			128
#define BUF_LARGE_CACHE_SIZE			16
#define NOMEM_THRESHOLD_COUNT			8

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000

/* The maximum number of child requests for an UNMAP or WRITE ZEROES command
 * when splitting into child requests at a time.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000

/* The maximum number of child requests for a COPY command
 * when splitting into child requests at a time.
61 */ 62 #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8) 63 64 SPDK_LOG_DEPRECATION_REGISTER(bdev_register_examine_thread, 65 "bdev register and examine on non-app thread", "SPDK 23.05", 0); 66 67 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 68 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 69 }; 70 71 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 72 73 RB_HEAD(bdev_name_tree, spdk_bdev_name); 74 75 static int 76 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2) 77 { 78 return strcmp(name1->name, name2->name); 79 } 80 81 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp); 82 83 struct spdk_bdev_mgr { 84 struct spdk_mempool *bdev_io_pool; 85 86 void *zero_buffer; 87 88 TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 89 90 struct spdk_bdev_list bdevs; 91 struct bdev_name_tree bdev_names; 92 93 bool init_complete; 94 bool module_init_complete; 95 96 struct spdk_spinlock spinlock; 97 98 #ifdef SPDK_CONFIG_VTUNE 99 __itt_domain *domain; 100 #endif 101 }; 102 103 static struct spdk_bdev_mgr g_bdev_mgr = { 104 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 105 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 106 .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names), 107 .init_complete = false, 108 .module_init_complete = false, 109 }; 110 111 static void 112 __attribute__((constructor)) 113 _bdev_init(void) 114 { 115 spdk_spin_init(&g_bdev_mgr.spinlock); 116 } 117 118 typedef void (*lock_range_cb)(void *ctx, int status); 119 120 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc); 121 122 struct lba_range { 123 uint64_t offset; 124 uint64_t length; 125 void *locked_ctx; 126 struct spdk_bdev_channel *owner_ch; 127 TAILQ_ENTRY(lba_range) tailq; 128 }; 129 130 static struct spdk_bdev_opts g_bdev_opts = { 131 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 132 .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, 133 .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, 134 .small_buf_pool_size = BUF_SMALL_POOL_SIZE, 135 .large_buf_pool_size = BUF_LARGE_POOL_SIZE, 136 }; 137 138 static spdk_bdev_init_cb g_init_cb_fn = NULL; 139 static void *g_init_cb_arg = NULL; 140 141 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 142 static void *g_fini_cb_arg = NULL; 143 static struct spdk_thread *g_fini_thread = NULL; 144 145 struct spdk_bdev_qos_limit { 146 /** IOs or bytes allowed per second (i.e., 1s). */ 147 uint64_t limit; 148 149 /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). 150 * For remaining bytes, allowed to run negative if an I/O is submitted when 151 * some bytes are remaining, but the I/O is bigger than that amount. The 152 * excess will be deducted from the next timeslice. 153 */ 154 int64_t remaining_this_timeslice; 155 156 /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 157 uint32_t min_per_timeslice; 158 159 /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 160 uint32_t max_per_timeslice; 161 162 /** Function to check whether to queue the IO. */ 163 bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 164 165 /** Function to update for the submitted IO. */ 166 void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 167 }; 168 169 struct spdk_bdev_qos { 170 /** Types of structure of rate limits. */ 171 struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 172 173 /** The channel that all I/O are funneled through. 
*/ 174 struct spdk_bdev_channel *ch; 175 176 /** The thread on which the poller is running. */ 177 struct spdk_thread *thread; 178 179 /** Queue of I/O waiting to be issued. */ 180 bdev_io_tailq_t queued; 181 182 /** Size of a timeslice in tsc ticks. */ 183 uint64_t timeslice_size; 184 185 /** Timestamp of start of last timeslice. */ 186 uint64_t last_timeslice; 187 188 /** Poller that processes queued I/O commands each time slice. */ 189 struct spdk_poller *poller; 190 }; 191 192 struct spdk_bdev_mgmt_channel { 193 /* 194 * Each thread keeps a cache of bdev_io - this allows 195 * bdev threads which are *not* DPDK threads to still 196 * benefit from a per-thread bdev_io cache. Without 197 * this, non-DPDK threads fetching from the mempool 198 * incur a cmpxchg on get and put. 199 */ 200 bdev_io_stailq_t per_thread_cache; 201 uint32_t per_thread_cache_count; 202 uint32_t bdev_io_cache_size; 203 204 struct spdk_iobuf_channel iobuf; 205 206 TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; 207 TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; 208 }; 209 210 /* 211 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device 212 * will queue here their IO that awaits retry. It makes it possible to retry sending 213 * IO to one bdev after IO from other bdev completes. 214 */ 215 struct spdk_bdev_shared_resource { 216 /* The bdev management channel */ 217 struct spdk_bdev_mgmt_channel *mgmt_ch; 218 219 /* 220 * Count of I/O submitted to bdev module and waiting for completion. 221 * Incremented before submit_request() is called on an spdk_bdev_io. 222 */ 223 uint64_t io_outstanding; 224 225 /* 226 * Queue of IO awaiting retry because of a previous NOMEM status returned 227 * on this channel. 228 */ 229 bdev_io_tailq_t nomem_io; 230 231 /* 232 * Threshold which io_outstanding must drop to before retrying nomem_io. 233 */ 234 uint64_t nomem_threshold; 235 236 /* I/O channel allocated by a bdev module */ 237 struct spdk_io_channel *shared_ch; 238 239 /* Refcount of bdev channels using this resource */ 240 uint32_t ref; 241 242 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 243 }; 244 245 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 246 #define BDEV_CH_QOS_ENABLED (1 << 1) 247 248 struct spdk_bdev_channel { 249 struct spdk_bdev *bdev; 250 251 /* The channel for the underlying device */ 252 struct spdk_io_channel *channel; 253 254 /* Per io_device per thread data */ 255 struct spdk_bdev_shared_resource *shared_resource; 256 257 struct spdk_bdev_io_stat *stat; 258 259 /* 260 * Count of I/O submitted to the underlying dev module through this channel 261 * and waiting for completion. 262 */ 263 uint64_t io_outstanding; 264 265 /* 266 * List of all submitted I/Os including I/O that are generated via splitting. 267 */ 268 bdev_io_tailq_t io_submitted; 269 270 /* 271 * List of spdk_bdev_io that are currently queued because they write to a locked 272 * LBA range. 
273 */ 274 bdev_io_tailq_t io_locked; 275 276 uint32_t flags; 277 278 struct spdk_histogram_data *histogram; 279 280 #ifdef SPDK_CONFIG_VTUNE 281 uint64_t start_tsc; 282 uint64_t interval_tsc; 283 __itt_string_handle *handle; 284 struct spdk_bdev_io_stat *prev_stat; 285 #endif 286 287 bdev_io_tailq_t queued_resets; 288 289 lba_range_tailq_t locked_ranges; 290 }; 291 292 struct media_event_entry { 293 struct spdk_bdev_media_event event; 294 TAILQ_ENTRY(media_event_entry) tailq; 295 }; 296 297 #define MEDIA_EVENT_POOL_SIZE 64 298 299 struct spdk_bdev_desc { 300 struct spdk_bdev *bdev; 301 struct spdk_thread *thread; 302 struct { 303 spdk_bdev_event_cb_t event_fn; 304 void *ctx; 305 } callback; 306 bool closed; 307 bool write; 308 bool memory_domains_supported; 309 struct spdk_spinlock spinlock; 310 uint32_t refs; 311 TAILQ_HEAD(, media_event_entry) pending_media_events; 312 TAILQ_HEAD(, media_event_entry) free_media_events; 313 struct media_event_entry *media_events_buffer; 314 TAILQ_ENTRY(spdk_bdev_desc) link; 315 316 uint64_t timeout_in_sec; 317 spdk_bdev_io_timeout_cb cb_fn; 318 void *cb_arg; 319 struct spdk_poller *io_timeout_poller; 320 }; 321 322 struct spdk_bdev_iostat_ctx { 323 struct spdk_bdev_io_stat *stat; 324 spdk_bdev_get_device_stat_cb cb; 325 void *cb_arg; 326 }; 327 328 struct set_qos_limit_ctx { 329 void (*cb_fn)(void *cb_arg, int status); 330 void *cb_arg; 331 struct spdk_bdev *bdev; 332 }; 333 334 struct spdk_bdev_channel_iter { 335 spdk_bdev_for_each_channel_msg fn; 336 spdk_bdev_for_each_channel_done cpl; 337 struct spdk_io_channel_iter *i; 338 void *ctx; 339 }; 340 341 struct spdk_bdev_io_error_stat { 342 uint32_t error_status[-SPDK_MIN_BDEV_IO_STATUS]; 343 }; 344 345 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 346 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 347 #define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch)) 348 #define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch)) 349 350 static inline void bdev_io_complete(void *ctx); 351 352 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 353 static void bdev_write_zero_buffer_next(void *_bdev_io); 354 355 static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 356 struct spdk_io_channel *ch, void *_ctx); 357 static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status); 358 359 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 360 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 361 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 362 struct spdk_bdev_ext_io_opts *opts, bool copy_opts); 363 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 364 struct iovec *iov, int iovcnt, void *md_buf, 365 uint64_t offset_blocks, uint64_t num_blocks, 366 spdk_bdev_io_completion_cb cb, void *cb_arg, 367 struct spdk_bdev_ext_io_opts *opts, bool copy_opts); 368 369 static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 370 uint64_t offset, uint64_t length, 371 lock_range_cb cb_fn, void *cb_arg); 372 373 static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 374 uint64_t offset, uint64_t length, 375 lock_range_cb cb_fn, void *cb_arg); 376 377 static inline void bdev_io_complete(void *ctx); 378 379 static bool 
bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort);

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(small_buf_pool_size);
	SET_FIELD(large_buf_pool_size);

	/* Do not remove this statement: always update it when you add a new field,
	 * and do not forget to add the SET_FIELD statement for your added field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}

SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_small_buf_pool_size, "spdk_bdev_opts.small_buf_pool_size",
			      "v23.05", 0);
SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_large_buf_pool_size, "spdk_bdev_opts.large_buf_pool_size",
			      "v23.05", 0);
int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	struct spdk_iobuf_opts iobuf_opts;
	uint32_t min_pool_size;
	int rc;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
440 */ 441 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 442 if (opts->bdev_io_pool_size < min_pool_size) { 443 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 444 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 445 spdk_thread_get_count()); 446 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 447 return -1; 448 } 449 450 if (opts->small_buf_pool_size != BUF_SMALL_POOL_SIZE) { 451 SPDK_LOG_DEPRECATED(bdev_opts_small_buf_pool_size); 452 } 453 if (opts->large_buf_pool_size != BUF_LARGE_POOL_SIZE) { 454 SPDK_LOG_DEPRECATED(bdev_opts_large_buf_pool_size); 455 } 456 457 #define SET_FIELD(field) \ 458 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 459 g_bdev_opts.field = opts->field; \ 460 } \ 461 462 SET_FIELD(bdev_io_pool_size); 463 SET_FIELD(bdev_io_cache_size); 464 SET_FIELD(bdev_auto_examine); 465 SET_FIELD(small_buf_pool_size); 466 SET_FIELD(large_buf_pool_size); 467 468 spdk_iobuf_get_opts(&iobuf_opts); 469 iobuf_opts.small_pool_count = opts->small_buf_pool_size; 470 iobuf_opts.large_pool_count = opts->large_buf_pool_size; 471 472 rc = spdk_iobuf_set_opts(&iobuf_opts); 473 if (rc != 0) { 474 SPDK_ERRLOG("Failed to set iobuf opts\n"); 475 return -1; 476 } 477 478 g_bdev_opts.opts_size = opts->opts_size; 479 480 #undef SET_FIELD 481 482 return 0; 483 } 484 485 static struct spdk_bdev * 486 bdev_get_by_name(const char *bdev_name) 487 { 488 struct spdk_bdev_name find; 489 struct spdk_bdev_name *res; 490 491 find.name = (char *)bdev_name; 492 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 493 if (res != NULL) { 494 return res->bdev; 495 } 496 497 return NULL; 498 } 499 500 struct spdk_bdev * 501 spdk_bdev_get_by_name(const char *bdev_name) 502 { 503 struct spdk_bdev *bdev; 504 505 spdk_spin_lock(&g_bdev_mgr.spinlock); 506 bdev = bdev_get_by_name(bdev_name); 507 spdk_spin_unlock(&g_bdev_mgr.spinlock); 508 509 return bdev; 510 } 511 512 struct bdev_io_status_string { 513 enum spdk_bdev_io_status status; 514 const char *str; 515 }; 516 517 static const struct bdev_io_status_string bdev_io_status_strings[] = { 518 { SPDK_BDEV_IO_STATUS_AIO_ERROR, "aio_error" }, 519 { SPDK_BDEV_IO_STATUS_ABORTED, "aborted" }, 520 { SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED, "first_fused_failed" }, 521 { SPDK_BDEV_IO_STATUS_MISCOMPARE, "miscompare" }, 522 { SPDK_BDEV_IO_STATUS_NOMEM, "nomem" }, 523 { SPDK_BDEV_IO_STATUS_SCSI_ERROR, "scsi_error" }, 524 { SPDK_BDEV_IO_STATUS_NVME_ERROR, "nvme_error" }, 525 { SPDK_BDEV_IO_STATUS_FAILED, "failed" }, 526 { SPDK_BDEV_IO_STATUS_PENDING, "pending" }, 527 { SPDK_BDEV_IO_STATUS_SUCCESS, "success" }, 528 }; 529 530 static const char * 531 bdev_io_status_get_string(enum spdk_bdev_io_status status) 532 { 533 uint32_t i; 534 535 for (i = 0; i < SPDK_COUNTOF(bdev_io_status_strings); i++) { 536 if (bdev_io_status_strings[i].status == status) { 537 return bdev_io_status_strings[i].str; 538 } 539 } 540 541 return "reserved"; 542 } 543 544 struct spdk_bdev_wait_for_examine_ctx { 545 struct spdk_poller *poller; 546 spdk_bdev_wait_for_examine_cb cb_fn; 547 void *cb_arg; 548 }; 549 550 static bool bdev_module_all_actions_completed(void); 551 552 static int 553 bdev_wait_for_examine_cb(void *arg) 554 { 555 struct spdk_bdev_wait_for_examine_ctx *ctx = arg; 556 557 if (!bdev_module_all_actions_completed()) { 558 return SPDK_POLLER_IDLE; 559 } 560 561 spdk_poller_unregister(&ctx->poller); 562 
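	/* The poller is unregistered before the user callback is invoked, so the
	 * callback may safely call spdk_bdev_wait_for_examine() again to arm a new
	 * wait. Minimal caller-side sketch (start_app_logic and g_app_ctx are
	 * hypothetical names; -ENOMEM is the only error this implementation returns):
	 *
	 *	static void
	 *	start_app_logic(void *ctx)
	 *	{
	 *		// runs once no module init/examine actions are outstanding
	 *	}
	 *
	 *	rc = spdk_bdev_wait_for_examine(start_app_logic, g_app_ctx);
	 *	if (rc != 0) {
	 *		// handle -ENOMEM
	 *	}
	 */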
ctx->cb_fn(ctx->cb_arg); 563 free(ctx); 564 565 return SPDK_POLLER_BUSY; 566 } 567 568 int 569 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 570 { 571 struct spdk_bdev_wait_for_examine_ctx *ctx; 572 573 ctx = calloc(1, sizeof(*ctx)); 574 if (ctx == NULL) { 575 return -ENOMEM; 576 } 577 ctx->cb_fn = cb_fn; 578 ctx->cb_arg = cb_arg; 579 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 580 581 return 0; 582 } 583 584 struct spdk_bdev_examine_item { 585 char *name; 586 TAILQ_ENTRY(spdk_bdev_examine_item) link; 587 }; 588 589 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); 590 591 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 592 g_bdev_examine_allowlist); 593 594 static inline bool 595 bdev_examine_allowlist_check(const char *name) 596 { 597 struct spdk_bdev_examine_item *item; 598 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 599 if (strcmp(name, item->name) == 0) { 600 return true; 601 } 602 } 603 return false; 604 } 605 606 static inline void 607 bdev_examine_allowlist_free(void) 608 { 609 struct spdk_bdev_examine_item *item; 610 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 611 item = TAILQ_FIRST(&g_bdev_examine_allowlist); 612 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 613 free(item->name); 614 free(item); 615 } 616 } 617 618 static inline bool 619 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 620 { 621 struct spdk_bdev_alias *tmp; 622 if (bdev_examine_allowlist_check(bdev->name)) { 623 return true; 624 } 625 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 626 if (bdev_examine_allowlist_check(tmp->alias.name)) { 627 return true; 628 } 629 } 630 return false; 631 } 632 633 static inline bool 634 bdev_ok_to_examine(struct spdk_bdev *bdev) 635 { 636 if (g_bdev_opts.bdev_auto_examine) { 637 return true; 638 } else { 639 return bdev_in_examine_allowlist(bdev); 640 } 641 } 642 643 static void 644 bdev_examine(struct spdk_bdev *bdev) 645 { 646 struct spdk_bdev_module *module; 647 uint32_t action; 648 649 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 650 if (module->examine_config && bdev_ok_to_examine(bdev)) { 651 action = module->internal.action_in_progress; 652 module->internal.action_in_progress++; 653 module->examine_config(bdev); 654 if (action != module->internal.action_in_progress) { 655 SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n", 656 module->name); 657 } 658 } 659 } 660 661 if (bdev->internal.claim_module && bdev_ok_to_examine(bdev)) { 662 if (bdev->internal.claim_module->examine_disk) { 663 bdev->internal.claim_module->internal.action_in_progress++; 664 bdev->internal.claim_module->examine_disk(bdev); 665 } 666 return; 667 } 668 669 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 670 if (module->examine_disk && bdev_ok_to_examine(bdev)) { 671 module->internal.action_in_progress++; 672 module->examine_disk(bdev); 673 } 674 } 675 } 676 677 int 678 spdk_bdev_examine(const char *name) 679 { 680 struct spdk_bdev *bdev; 681 struct spdk_bdev_examine_item *item; 682 683 if (spdk_unlikely(spdk_thread_get_app_thread() != spdk_get_thread())) { 684 SPDK_LOG_DEPRECATED(bdev_register_examine_thread); 685 } 686 687 if (g_bdev_opts.bdev_auto_examine) { 688 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); 689 return -EINVAL; 690 } 691 692 if (bdev_examine_allowlist_check(name)) { 693 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 694 return 
-EEXIST; 695 } 696 697 item = calloc(1, sizeof(*item)); 698 if (!item) { 699 return -ENOMEM; 700 } 701 item->name = strdup(name); 702 if (!item->name) { 703 free(item); 704 return -ENOMEM; 705 } 706 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 707 708 bdev = spdk_bdev_get_by_name(name); 709 if (bdev) { 710 bdev_examine(bdev); 711 } 712 return 0; 713 } 714 715 static inline void 716 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 717 { 718 struct spdk_bdev_examine_item *item; 719 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 720 spdk_json_write_object_begin(w); 721 spdk_json_write_named_string(w, "method", "bdev_examine"); 722 spdk_json_write_named_object_begin(w, "params"); 723 spdk_json_write_named_string(w, "name", item->name); 724 spdk_json_write_object_end(w); 725 spdk_json_write_object_end(w); 726 } 727 } 728 729 struct spdk_bdev * 730 spdk_bdev_first(void) 731 { 732 struct spdk_bdev *bdev; 733 734 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 735 if (bdev) { 736 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 737 } 738 739 return bdev; 740 } 741 742 struct spdk_bdev * 743 spdk_bdev_next(struct spdk_bdev *prev) 744 { 745 struct spdk_bdev *bdev; 746 747 bdev = TAILQ_NEXT(prev, internal.link); 748 if (bdev) { 749 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 750 } 751 752 return bdev; 753 } 754 755 static struct spdk_bdev * 756 _bdev_next_leaf(struct spdk_bdev *bdev) 757 { 758 while (bdev != NULL) { 759 if (bdev->internal.claim_module == NULL) { 760 return bdev; 761 } else { 762 bdev = TAILQ_NEXT(bdev, internal.link); 763 } 764 } 765 766 return bdev; 767 } 768 769 struct spdk_bdev * 770 spdk_bdev_first_leaf(void) 771 { 772 struct spdk_bdev *bdev; 773 774 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 775 776 if (bdev) { 777 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 778 } 779 780 return bdev; 781 } 782 783 struct spdk_bdev * 784 spdk_bdev_next_leaf(struct spdk_bdev *prev) 785 { 786 struct spdk_bdev *bdev; 787 788 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 789 790 if (bdev) { 791 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 792 } 793 794 return bdev; 795 } 796 797 static inline bool 798 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 799 { 800 return bdev_io->internal.ext_opts && bdev_io->internal.ext_opts->memory_domain; 801 } 802 803 void 804 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 805 { 806 struct iovec *iovs; 807 808 if (bdev_io->u.bdev.iovs == NULL) { 809 bdev_io->u.bdev.iovs = &bdev_io->iov; 810 bdev_io->u.bdev.iovcnt = 1; 811 } 812 813 iovs = bdev_io->u.bdev.iovs; 814 815 assert(iovs != NULL); 816 assert(bdev_io->u.bdev.iovcnt >= 1); 817 818 iovs[0].iov_base = buf; 819 iovs[0].iov_len = len; 820 } 821 822 void 823 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 824 { 825 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 826 bdev_io->u.bdev.md_buf = md_buf; 827 } 828 829 static bool 830 _is_buf_allocated(const struct iovec *iovs) 831 { 832 if (iovs == NULL) { 833 return false; 834 } 835 836 return iovs[0].iov_base != NULL; 837 } 838 839 static bool 840 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 841 { 842 int i; 843 uintptr_t iov_base; 844 845 if (spdk_likely(alignment == 1)) { 846 return true; 847 } 848 849 for (i = 0; i < iovcnt; i++) { 850 iov_base = (uintptr_t)iovs[i].iov_base; 851 if ((iov_base & 
(alignment - 1)) != 0) { 852 return false; 853 } 854 } 855 856 return true; 857 } 858 859 static void 860 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 861 { 862 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 863 void *buf; 864 865 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 866 buf = bdev_io->internal.buf; 867 bdev_io->internal.buf = NULL; 868 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 869 bdev_io->internal.get_aux_buf_cb = NULL; 870 } else { 871 assert(bdev_io->internal.get_buf_cb != NULL); 872 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 873 bdev_io->internal.get_buf_cb = NULL; 874 } 875 } 876 877 static void 878 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 879 { 880 struct spdk_bdev_io *bdev_io = ctx; 881 882 if (rc) { 883 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 884 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 885 } 886 bdev_io_get_buf_complete(bdev_io, !rc); 887 } 888 889 static void 890 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 891 { 892 int rc = 0; 893 894 /* save original md_buf */ 895 bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 896 bdev_io->internal.orig_md_iov.iov_len = len; 897 bdev_io->internal.bounce_md_iov.iov_base = md_buf; 898 bdev_io->internal.bounce_md_iov.iov_len = len; 899 /* set bounce md_buf */ 900 bdev_io->u.bdev.md_buf = md_buf; 901 902 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 903 if (bdev_io_use_memory_domain(bdev_io)) { 904 rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain, 905 bdev_io->internal.ext_opts->memory_domain_ctx, 906 &bdev_io->internal.orig_md_iov, 1, 907 &bdev_io->internal.bounce_md_iov, 1, 908 bdev_io->internal.data_transfer_cpl, 909 bdev_io); 910 if (rc == 0) { 911 /* Continue to submit IO in completion callback */ 912 return; 913 } 914 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 915 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain), rc); 916 } else { 917 memcpy(md_buf, bdev_io->internal.orig_md_iov.iov_base, bdev_io->internal.orig_md_iov.iov_len); 918 } 919 } 920 921 assert(bdev_io->internal.data_transfer_cpl); 922 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 923 } 924 925 static void 926 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 927 { 928 struct spdk_bdev *bdev = bdev_io->bdev; 929 uint64_t md_len; 930 void *buf; 931 932 if (spdk_bdev_is_md_separate(bdev)) { 933 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 934 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 935 936 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 937 938 if (bdev_io->u.bdev.md_buf != NULL) { 939 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 940 return; 941 } else { 942 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 943 } 944 } 945 946 bdev_io_get_buf_complete(bdev_io, true); 947 } 948 949 static void 950 _bdev_io_pull_bounce_data_buf_done(void *ctx, int rc) 951 { 952 struct spdk_bdev_io *bdev_io = ctx; 953 954 if (rc) { 955 SPDK_ERRLOG("Failed to get data buffer\n"); 956 assert(bdev_io->internal.data_transfer_cpl); 957 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 958 return; 959 } 960 961 _bdev_io_set_md_buf(bdev_io); 962 } 963 964 static void 965 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 966 bdev_copy_bounce_buffer_cpl cpl_cb) 967 { 968 int rc = 0; 969 970 bdev_io->internal.data_transfer_cpl = cpl_cb; 971 /* 
save original iovec */ 972 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 973 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 974 /* set bounce iov */ 975 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 976 bdev_io->u.bdev.iovcnt = 1; 977 /* set bounce buffer for this operation */ 978 bdev_io->u.bdev.iovs[0].iov_base = buf; 979 bdev_io->u.bdev.iovs[0].iov_len = len; 980 /* if this is write path, copy data from original buffer to bounce buffer */ 981 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 982 if (bdev_io_use_memory_domain(bdev_io)) { 983 rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain, 984 bdev_io->internal.ext_opts->memory_domain_ctx, 985 bdev_io->internal.orig_iovs, 986 (uint32_t) bdev_io->internal.orig_iovcnt, 987 bdev_io->u.bdev.iovs, 1, 988 _bdev_io_pull_bounce_data_buf_done, 989 bdev_io); 990 if (rc == 0) { 991 /* Continue to submit IO in completion callback */ 992 return; 993 } 994 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 995 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 996 } else { 997 spdk_copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt); 998 } 999 } 1000 1001 _bdev_io_pull_bounce_data_buf_done(bdev_io, rc); 1002 } 1003 1004 static void 1005 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) 1006 { 1007 struct spdk_bdev *bdev = bdev_io->bdev; 1008 bool buf_allocated; 1009 uint64_t alignment; 1010 void *aligned_buf; 1011 1012 bdev_io->internal.buf = buf; 1013 1014 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 1015 bdev_io_get_buf_complete(bdev_io, true); 1016 return; 1017 } 1018 1019 alignment = spdk_bdev_get_buf_align(bdev); 1020 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 1021 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 1022 1023 if (buf_allocated) { 1024 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 1025 /* Continue in completion callback */ 1026 return; 1027 } else { 1028 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 1029 } 1030 1031 _bdev_io_set_md_buf(bdev_io); 1032 } 1033 1034 static inline uint64_t 1035 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len) 1036 { 1037 struct spdk_bdev *bdev = bdev_io->bdev; 1038 uint64_t md_len, alignment; 1039 1040 md_len = spdk_bdev_is_md_separate(bdev) ? 
bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1041 alignment = spdk_bdev_get_buf_align(bdev); 1042 1043 return len + alignment + md_len; 1044 } 1045 1046 static void 1047 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 1048 { 1049 struct spdk_bdev_mgmt_channel *ch; 1050 1051 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1052 spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len)); 1053 } 1054 1055 static void 1056 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 1057 { 1058 assert(bdev_io->internal.buf != NULL); 1059 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len); 1060 bdev_io->internal.buf = NULL; 1061 } 1062 1063 void 1064 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1065 { 1066 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1067 1068 assert(buf != NULL); 1069 _bdev_io_put_buf(bdev_io, buf, len); 1070 } 1071 1072 static void 1073 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 1074 { 1075 struct spdk_bdev *bdev = bdev_ch->bdev; 1076 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1077 struct spdk_bdev_io *bdev_io; 1078 1079 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 1080 /* 1081 * Allow some more I/O to complete before retrying the nomem_io queue. 1082 * Some drivers (such as nvme) cannot immediately take a new I/O in 1083 * the context of a completion, because the resources for the I/O are 1084 * not released until control returns to the bdev poller. Also, we 1085 * may require several small I/O to complete before a larger I/O 1086 * (that requires splitting) can be submitted. 1087 */ 1088 return; 1089 } 1090 1091 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1092 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 1093 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 1094 bdev_io->internal.ch->io_outstanding++; 1095 shared_resource->io_outstanding++; 1096 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1097 bdev_io->internal.error.nvme.cdw0 = 0; 1098 bdev_io->num_retries++; 1099 bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 1100 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 1101 break; 1102 } 1103 } 1104 } 1105 1106 static inline void 1107 _bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1108 struct spdk_bdev_shared_resource *shared_resource) 1109 { 1110 assert(bdev_ch->io_outstanding > 0); 1111 assert(shared_resource->io_outstanding > 0); 1112 bdev_ch->io_outstanding--; 1113 shared_resource->io_outstanding--; 1114 } 1115 1116 static inline bool 1117 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io) 1118 { 1119 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1120 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1121 1122 if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) { 1123 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 1124 /* 1125 * Wait for some of the outstanding I/O to complete before we 1126 * retry any of the nomem_io. Normally we will wait for 1127 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 1128 * depth channels we will instead wait for half to complete. 
1129 */ 1130 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 1131 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 1132 return true; 1133 } 1134 1135 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1136 bdev_ch_retry_io(bdev_ch); 1137 } 1138 1139 return false; 1140 } 1141 1142 static void 1143 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1144 { 1145 struct spdk_bdev_io *bdev_io = ctx; 1146 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1147 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1148 1149 if (rc) { 1150 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1151 } 1152 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1153 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). 1154 */ 1155 bdev_io_put_buf(bdev_io); 1156 1157 /* Continue with IO completion flow */ 1158 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 1159 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 1160 return; 1161 } 1162 1163 bdev_io_complete(bdev_io); 1164 } 1165 1166 static inline void 1167 _bdev_io_push_bounce_md_buffer(struct spdk_bdev_io *bdev_io) 1168 { 1169 int rc = 0; 1170 1171 /* do the same for metadata buffer */ 1172 if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) { 1173 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1174 1175 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1176 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 1177 if (bdev_io_use_memory_domain(bdev_io)) { 1178 /* If memory domain is used then we need to call async push function */ 1179 rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain, 1180 bdev_io->internal.ext_opts->memory_domain_ctx, 1181 &bdev_io->internal.orig_md_iov, 1182 (uint32_t)bdev_io->internal.orig_iovcnt, 1183 &bdev_io->internal.bounce_md_iov, 1, 1184 bdev_io->internal.data_transfer_cpl, 1185 bdev_io); 1186 if (rc == 0) { 1187 /* Continue IO completion in async callback */ 1188 return; 1189 } 1190 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1191 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 1192 } else { 1193 memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1194 bdev_io->internal.orig_md_iov.iov_len); 1195 } 1196 } 1197 } 1198 1199 assert(bdev_io->internal.data_transfer_cpl); 1200 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1201 } 1202 1203 static void 1204 _bdev_io_push_bounce_data_buffer_done(void *ctx, int rc) 1205 { 1206 struct spdk_bdev_io *bdev_io = ctx; 1207 1208 assert(bdev_io->internal.data_transfer_cpl); 1209 1210 if (rc) { 1211 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1212 return; 1213 } 1214 1215 /* set original buffer for this io */ 1216 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 1217 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 1218 /* disable bouncing buffer for this io */ 1219 bdev_io->internal.orig_iovcnt = 0; 1220 bdev_io->internal.orig_iovs = NULL; 1221 1222 _bdev_io_push_bounce_md_buffer(bdev_io); 1223 } 1224 1225 static inline void 1226 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1227 { 1228 int rc = 0; 1229 1230 bdev_io->internal.data_transfer_cpl = cpl_cb; 1231 1232 /* if this is read path, copy data from bounce buffer to original buffer */ 1233 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1234 bdev_io->internal.status == 
SPDK_BDEV_IO_STATUS_SUCCESS) { 1235 if (bdev_io_use_memory_domain(bdev_io)) { 1236 /* If memory domain is used then we need to call async push function */ 1237 rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain, 1238 bdev_io->internal.ext_opts->memory_domain_ctx, 1239 bdev_io->internal.orig_iovs, 1240 (uint32_t)bdev_io->internal.orig_iovcnt, 1241 &bdev_io->internal.bounce_iov, 1, 1242 _bdev_io_push_bounce_data_buffer_done, 1243 bdev_io); 1244 if (rc == 0) { 1245 /* Continue IO completion in async callback */ 1246 return; 1247 } 1248 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1249 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 1250 } else { 1251 spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs, 1252 bdev_io->internal.orig_iovcnt, 1253 bdev_io->internal.bounce_iov.iov_base, 1254 bdev_io->internal.bounce_iov.iov_len); 1255 } 1256 } 1257 1258 _bdev_io_push_bounce_data_buffer_done(bdev_io, rc); 1259 } 1260 1261 static void 1262 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1263 { 1264 struct spdk_bdev_io *bdev_io; 1265 1266 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1267 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len); 1268 } 1269 1270 static void 1271 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1272 { 1273 struct spdk_bdev_mgmt_channel *mgmt_ch; 1274 uint64_t max_len; 1275 void *buf; 1276 1277 assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1278 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1279 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1280 1281 if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) { 1282 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1283 bdev_io_get_buf_complete(bdev_io, false); 1284 return; 1285 } 1286 1287 bdev_io->internal.buf_len = len; 1288 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1289 bdev_io_get_iobuf_cb); 1290 if (buf != NULL) { 1291 _bdev_io_set_buf(bdev_io, buf, len); 1292 } 1293 } 1294 1295 void 1296 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1297 { 1298 struct spdk_bdev *bdev = bdev_io->bdev; 1299 uint64_t alignment; 1300 1301 assert(cb != NULL); 1302 bdev_io->internal.get_buf_cb = cb; 1303 1304 alignment = spdk_bdev_get_buf_align(bdev); 1305 1306 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1307 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1308 /* Buffer already present and aligned */ 1309 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1310 return; 1311 } 1312 1313 bdev_io_get_buf(bdev_io, len); 1314 } 1315 1316 static void 1317 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1318 bool success) 1319 { 1320 if (!success) { 1321 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1322 bdev_io_complete(bdev_io); 1323 } else { 1324 bdev_io_submit(bdev_io); 1325 } 1326 } 1327 1328 static void 1329 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1330 uint64_t len) 1331 { 1332 assert(cb != NULL); 1333 bdev_io->internal.get_buf_cb = cb; 1334 1335 bdev_io_get_buf(bdev_io, len); 1336 } 1337 1338 void 1339 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1340 { 1341 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1342 1343 assert(cb != NULL); 1344 assert(bdev_io->internal.get_aux_buf_cb == NULL); 
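	/* get_aux_buf_cb both stores the completion callback and marks this get_buf
	 * request as an aux-buffer request (see bdev_io_get_buf_complete()), which is
	 * why only one aux buffer may be outstanding per bdev_io. A minimal usage
	 * sketch for a bdev module (my_aux_buf_cb is a hypothetical name); the buffer
	 * is sized num_blocks * blocklen and must be returned with
	 * spdk_bdev_io_put_aux_buf():
	 *
	 *	static void
	 *	my_aux_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io,
	 *		      void *aux_buf)
	 *	{
	 *		// use aux_buf as scratch/staging space for this I/O ...
	 *		spdk_bdev_io_put_aux_buf(bdev_io, aux_buf);
	 *	}
	 *
	 *	spdk_bdev_io_get_aux_buf(bdev_io, my_aux_buf_cb);
	 */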
1345 bdev_io->internal.get_aux_buf_cb = cb; 1346 bdev_io_get_buf(bdev_io, len); 1347 } 1348 1349 static int 1350 bdev_module_get_max_ctx_size(void) 1351 { 1352 struct spdk_bdev_module *bdev_module; 1353 int max_bdev_module_size = 0; 1354 1355 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1356 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1357 max_bdev_module_size = bdev_module->get_ctx_size(); 1358 } 1359 } 1360 1361 return max_bdev_module_size; 1362 } 1363 1364 static void 1365 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1366 { 1367 int i; 1368 struct spdk_bdev_qos *qos = bdev->internal.qos; 1369 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1370 1371 if (!qos) { 1372 return; 1373 } 1374 1375 spdk_bdev_get_qos_rate_limits(bdev, limits); 1376 1377 spdk_json_write_object_begin(w); 1378 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1379 1380 spdk_json_write_named_object_begin(w, "params"); 1381 spdk_json_write_named_string(w, "name", bdev->name); 1382 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1383 if (limits[i] > 0) { 1384 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1385 } 1386 } 1387 spdk_json_write_object_end(w); 1388 1389 spdk_json_write_object_end(w); 1390 } 1391 1392 void 1393 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1394 { 1395 struct spdk_bdev_module *bdev_module; 1396 struct spdk_bdev *bdev; 1397 1398 assert(w != NULL); 1399 1400 spdk_json_write_array_begin(w); 1401 1402 spdk_json_write_object_begin(w); 1403 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1404 spdk_json_write_named_object_begin(w, "params"); 1405 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1406 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1407 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1408 spdk_json_write_object_end(w); 1409 spdk_json_write_object_end(w); 1410 1411 bdev_examine_allowlist_config_json(w); 1412 1413 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1414 if (bdev_module->config_json) { 1415 bdev_module->config_json(w); 1416 } 1417 } 1418 1419 spdk_spin_lock(&g_bdev_mgr.spinlock); 1420 1421 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1422 if (bdev->fn_table->write_config_json) { 1423 bdev->fn_table->write_config_json(bdev, w); 1424 } 1425 1426 bdev_qos_config_json(bdev, w); 1427 } 1428 1429 spdk_spin_unlock(&g_bdev_mgr.spinlock); 1430 1431 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1432 spdk_json_write_object_begin(w); 1433 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1434 spdk_json_write_object_end(w); 1435 1436 spdk_json_write_array_end(w); 1437 } 1438 1439 static void 1440 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1441 { 1442 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1443 struct spdk_bdev_io *bdev_io; 1444 1445 spdk_iobuf_channel_fini(&ch->iobuf); 1446 1447 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1448 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1449 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1450 ch->per_thread_cache_count--; 1451 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1452 } 1453 1454 assert(ch->per_thread_cache_count == 0); 1455 } 1456 1457 static int 1458 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1459 { 1460 
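	/* Per-thread management channel setup: an iobuf channel is initialized with
	 * the small/large cache sizes defined above, and the bdev_io cache is
	 * pre-populated with g_bdev_opts.bdev_io_cache_size entries taken from the
	 * global g_bdev_mgr.bdev_io_pool, so this thread can get and put bdev_io
	 * structures without touching the shared mempool on the hot path. */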
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;
	uint32_t i;
	int rc;

	rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", BUF_SMALL_CACHE_SIZE, BUF_LARGE_CACHE_SIZE);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc));
		return -1;
	}

	STAILQ_INIT(&ch->per_thread_cache);
	ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size;

	/* Pre-populate bdev_io cache to ensure this thread cannot be starved. */
	ch->per_thread_cache_count = 0;
	for (i = 0; i < ch->bdev_io_cache_size; i++) {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
		if (bdev_io == NULL) {
			SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n");
			assert(false);
			bdev_mgmt_channel_destroy(io_device, ctx_buf);
			return -1;
		}
		ch->per_thread_cache_count++;
		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
	}

	TAILQ_INIT(&ch->shared_resources);
	TAILQ_INIT(&ch->io_wait_queue);

	return 0;
}

static void
bdev_init_complete(int rc)
{
	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
	void *cb_arg = g_init_cb_arg;
	struct spdk_bdev_module *m;

	g_bdev_mgr.init_complete = true;
	g_init_cb_fn = NULL;
	g_init_cb_arg = NULL;

	/*
	 * For modules that need to know when subsystem init is complete,
	 * inform them now.
	 */
	if (rc == 0) {
		TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (m->init_complete) {
				m->init_complete();
			}
		}
	}

	cb_fn(cb_arg, rc);
}

static bool
bdev_module_all_actions_completed(void)
{
	struct spdk_bdev_module *m;

	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (m->internal.action_in_progress > 0) {
			return false;
		}
	}
	return true;
}

static void
bdev_module_action_complete(void)
{
	/*
	 * Don't finish bdev subsystem initialization if
	 * module pre-initialization is still in progress, or
	 * the subsystem has already been initialized.
	 */
	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
		return;
	}

	/*
	 * Check all bdev modules for inits/examinations in progress. If any
	 * exist, return immediately since we cannot finish bdev subsystem
	 * initialization until all are completed.
	 */
	if (!bdev_module_all_actions_completed()) {
		return;
	}

	/*
	 * Modules already finished initialization - now that all
	 * the bdev modules have finished their asynchronous I/O
	 * processing, the entire bdev layer can be marked as complete.
1558 */ 1559 bdev_init_complete(0); 1560 } 1561 1562 static void 1563 bdev_module_action_done(struct spdk_bdev_module *module) 1564 { 1565 assert(module->internal.action_in_progress > 0); 1566 module->internal.action_in_progress--; 1567 bdev_module_action_complete(); 1568 } 1569 1570 void 1571 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 1572 { 1573 bdev_module_action_done(module); 1574 } 1575 1576 void 1577 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 1578 { 1579 bdev_module_action_done(module); 1580 } 1581 1582 /** The last initialized bdev module */ 1583 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 1584 1585 static void 1586 bdev_init_failed(void *cb_arg) 1587 { 1588 struct spdk_bdev_module *module = cb_arg; 1589 1590 module->internal.action_in_progress--; 1591 bdev_init_complete(-1); 1592 } 1593 1594 static int 1595 bdev_modules_init(void) 1596 { 1597 struct spdk_bdev_module *module; 1598 int rc = 0; 1599 1600 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1601 g_resume_bdev_module = module; 1602 if (module->async_init) { 1603 module->internal.action_in_progress = 1; 1604 } 1605 rc = module->module_init(); 1606 if (rc != 0) { 1607 /* Bump action_in_progress to prevent other modules from completion of modules_init 1608 * Send message to defer application shutdown until resources are cleaned up */ 1609 module->internal.action_in_progress = 1; 1610 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 1611 return rc; 1612 } 1613 } 1614 1615 g_resume_bdev_module = NULL; 1616 return 0; 1617 } 1618 1619 void 1620 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 1621 { 1622 int rc = 0; 1623 char mempool_name[32]; 1624 1625 assert(cb_fn != NULL); 1626 1627 g_init_cb_fn = cb_fn; 1628 g_init_cb_arg = cb_arg; 1629 1630 spdk_notify_type_register("bdev_register"); 1631 spdk_notify_type_register("bdev_unregister"); 1632 1633 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 1634 1635 rc = spdk_iobuf_register_module("bdev"); 1636 if (rc != 0) { 1637 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 1638 bdev_init_complete(-1); 1639 return; 1640 } 1641 1642 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 1643 g_bdev_opts.bdev_io_pool_size, 1644 sizeof(struct spdk_bdev_io) + 1645 bdev_module_get_max_ctx_size(), 1646 0, 1647 SPDK_ENV_SOCKET_ID_ANY); 1648 1649 if (g_bdev_mgr.bdev_io_pool == NULL) { 1650 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 1651 bdev_init_complete(-1); 1652 return; 1653 } 1654 1655 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 1656 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 1657 if (!g_bdev_mgr.zero_buffer) { 1658 SPDK_ERRLOG("create bdev zero buffer failed\n"); 1659 bdev_init_complete(-1); 1660 return; 1661 } 1662 1663 #ifdef SPDK_CONFIG_VTUNE 1664 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 1665 #endif 1666 1667 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 1668 bdev_mgmt_channel_destroy, 1669 sizeof(struct spdk_bdev_mgmt_channel), 1670 "bdev_mgr"); 1671 1672 rc = bdev_modules_init(); 1673 g_bdev_mgr.module_init_complete = true; 1674 if (rc != 0) { 1675 SPDK_ERRLOG("bdev modules init failed\n"); 1676 return; 1677 } 1678 1679 bdev_module_action_complete(); 1680 } 1681 1682 static void 1683 bdev_mgr_unregister_cb(void *io_device) 1684 { 1685 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 1686 1687 if (g_bdev_mgr.bdev_io_pool) { 1688 if 
(spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 1689 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 1690 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 1691 g_bdev_opts.bdev_io_pool_size); 1692 } 1693 1694 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 1695 } 1696 1697 spdk_free(g_bdev_mgr.zero_buffer); 1698 1699 bdev_examine_allowlist_free(); 1700 1701 cb_fn(g_fini_cb_arg); 1702 g_fini_cb_fn = NULL; 1703 g_fini_cb_arg = NULL; 1704 g_bdev_mgr.init_complete = false; 1705 g_bdev_mgr.module_init_complete = false; 1706 } 1707 1708 static void 1709 bdev_module_fini_iter(void *arg) 1710 { 1711 struct spdk_bdev_module *bdev_module; 1712 1713 /* FIXME: Handling initialization failures is broken now, 1714 * so we won't even try cleaning up after successfully 1715 * initialized modules. if module_init_complete is false, 1716 * just call spdk_bdev_mgr_unregister_cb 1717 */ 1718 if (!g_bdev_mgr.module_init_complete) { 1719 bdev_mgr_unregister_cb(NULL); 1720 return; 1721 } 1722 1723 /* Start iterating from the last touched module */ 1724 if (!g_resume_bdev_module) { 1725 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1726 } else { 1727 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 1728 internal.tailq); 1729 } 1730 1731 while (bdev_module) { 1732 if (bdev_module->async_fini) { 1733 /* Save our place so we can resume later. We must 1734 * save the variable here, before calling module_fini() 1735 * below, because in some cases the module may immediately 1736 * call spdk_bdev_module_fini_done() and re-enter 1737 * this function to continue iterating. */ 1738 g_resume_bdev_module = bdev_module; 1739 } 1740 1741 if (bdev_module->module_fini) { 1742 bdev_module->module_fini(); 1743 } 1744 1745 if (bdev_module->async_fini) { 1746 return; 1747 } 1748 1749 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 1750 internal.tailq); 1751 } 1752 1753 g_resume_bdev_module = NULL; 1754 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 1755 } 1756 1757 void 1758 spdk_bdev_module_fini_done(void) 1759 { 1760 if (spdk_get_thread() != g_fini_thread) { 1761 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 1762 } else { 1763 bdev_module_fini_iter(NULL); 1764 } 1765 } 1766 1767 static void 1768 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 1769 { 1770 struct spdk_bdev *bdev = cb_arg; 1771 1772 if (bdeverrno && bdev) { 1773 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 1774 bdev->name); 1775 1776 /* 1777 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 1778 * bdev; try to continue by manually removing this bdev from the list and continue 1779 * with the next bdev in the list. 1780 */ 1781 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 1782 } 1783 1784 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 1785 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 1786 /* 1787 * Bdev module finish need to be deferred as we might be in the middle of some context 1788 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 1789 * after returning. 1790 */ 1791 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 1792 return; 1793 } 1794 1795 /* 1796 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 1797 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 1798 * to detect clean shutdown as opposed to run-time hot removal of the underlying 1799 * base bdevs. 
1800 * 1801 * Also, walk the list in the reverse order. 1802 */ 1803 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1804 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1805 if (bdev->internal.claim_module != NULL) { 1806 SPDK_DEBUGLOG(bdev, "Skipping claimed bdev '%s'(<-'%s').\n", 1807 bdev->name, bdev->internal.claim_module->name); 1808 continue; 1809 } 1810 1811 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 1812 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 1813 return; 1814 } 1815 1816 /* 1817 * If any bdev fails to unclaim underlying bdev properly, we may face the 1818 * case of bdev list consisting of claimed bdevs only (if claims are managed 1819 * correctly, this would mean there's a loop in the claims graph which is 1820 * clearly impossible). Warn and unregister last bdev on the list then. 1821 */ 1822 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1823 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1824 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 1825 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 1826 return; 1827 } 1828 } 1829 1830 static void 1831 bdev_module_fini_start_iter(void *arg) 1832 { 1833 struct spdk_bdev_module *bdev_module; 1834 1835 if (!g_resume_bdev_module) { 1836 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1837 } else { 1838 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 1839 } 1840 1841 while (bdev_module) { 1842 if (bdev_module->async_fini_start) { 1843 /* Save our place so we can resume later. We must 1844 * save the variable here, before calling fini_start() 1845 * below, because in some cases the module may immediately 1846 * call spdk_bdev_module_fini_start_done() and re-enter 1847 * this function to continue iterating. 
*/ 1848 g_resume_bdev_module = bdev_module; 1849 } 1850 1851 if (bdev_module->fini_start) { 1852 bdev_module->fini_start(); 1853 } 1854 1855 if (bdev_module->async_fini_start) { 1856 return; 1857 } 1858 1859 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 1860 } 1861 1862 g_resume_bdev_module = NULL; 1863 1864 bdev_finish_unregister_bdevs_iter(NULL, 0); 1865 } 1866 1867 void 1868 spdk_bdev_module_fini_start_done(void) 1869 { 1870 if (spdk_get_thread() != g_fini_thread) { 1871 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 1872 } else { 1873 bdev_module_fini_start_iter(NULL); 1874 } 1875 } 1876 1877 static void 1878 bdev_finish_wait_for_examine_done(void *cb_arg) 1879 { 1880 bdev_module_fini_start_iter(NULL); 1881 } 1882 1883 void 1884 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 1885 { 1886 int rc; 1887 1888 assert(cb_fn != NULL); 1889 1890 g_fini_thread = spdk_get_thread(); 1891 1892 g_fini_cb_fn = cb_fn; 1893 g_fini_cb_arg = cb_arg; 1894 1895 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 1896 if (rc != 0) { 1897 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 1898 bdev_finish_wait_for_examine_done(NULL); 1899 } 1900 } 1901 1902 struct spdk_bdev_io * 1903 bdev_channel_get_io(struct spdk_bdev_channel *channel) 1904 { 1905 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 1906 struct spdk_bdev_io *bdev_io; 1907 1908 if (ch->per_thread_cache_count > 0) { 1909 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1910 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1911 ch->per_thread_cache_count--; 1912 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 1913 /* 1914 * Don't try to look for bdev_ios in the global pool if there are 1915 * waiters on bdev_ios - we don't want this caller to jump the line. 1916 */ 1917 bdev_io = NULL; 1918 } else { 1919 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1920 } 1921 1922 return bdev_io; 1923 } 1924 1925 void 1926 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 1927 { 1928 struct spdk_bdev_mgmt_channel *ch; 1929 1930 assert(bdev_io != NULL); 1931 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 1932 1933 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1934 1935 if (bdev_io->internal.buf != NULL) { 1936 bdev_io_put_buf(bdev_io); 1937 } 1938 1939 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 1940 ch->per_thread_cache_count++; 1941 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1942 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 1943 struct spdk_bdev_io_wait_entry *entry; 1944 1945 entry = TAILQ_FIRST(&ch->io_wait_queue); 1946 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 1947 entry->cb_fn(entry->cb_arg); 1948 } 1949 } else { 1950 /* We should never have a full cache with entries on the io wait queue. 
*/ 1951 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 1952 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1953 } 1954 } 1955 1956 static bool 1957 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 1958 { 1959 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 1960 1961 switch (limit) { 1962 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 1963 return true; 1964 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 1965 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 1966 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 1967 return false; 1968 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 1969 default: 1970 return false; 1971 } 1972 } 1973 1974 static bool 1975 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 1976 { 1977 switch (bdev_io->type) { 1978 case SPDK_BDEV_IO_TYPE_NVME_IO: 1979 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1980 case SPDK_BDEV_IO_TYPE_READ: 1981 case SPDK_BDEV_IO_TYPE_WRITE: 1982 return true; 1983 case SPDK_BDEV_IO_TYPE_ZCOPY: 1984 if (bdev_io->u.bdev.zcopy.start) { 1985 return true; 1986 } else { 1987 return false; 1988 } 1989 default: 1990 return false; 1991 } 1992 } 1993 1994 static bool 1995 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 1996 { 1997 switch (bdev_io->type) { 1998 case SPDK_BDEV_IO_TYPE_NVME_IO: 1999 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2000 /* Bit 1 (0x2) set for read operation */ 2001 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2002 return true; 2003 } else { 2004 return false; 2005 } 2006 case SPDK_BDEV_IO_TYPE_READ: 2007 return true; 2008 case SPDK_BDEV_IO_TYPE_ZCOPY: 2009 /* Populate to read from disk */ 2010 if (bdev_io->u.bdev.zcopy.populate) { 2011 return true; 2012 } else { 2013 return false; 2014 } 2015 default: 2016 return false; 2017 } 2018 } 2019 2020 static uint64_t 2021 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2022 { 2023 struct spdk_bdev *bdev = bdev_io->bdev; 2024 2025 switch (bdev_io->type) { 2026 case SPDK_BDEV_IO_TYPE_NVME_IO: 2027 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2028 return bdev_io->u.nvme_passthru.nbytes; 2029 case SPDK_BDEV_IO_TYPE_READ: 2030 case SPDK_BDEV_IO_TYPE_WRITE: 2031 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2032 case SPDK_BDEV_IO_TYPE_ZCOPY: 2033 /* Track the data in the start phase only */ 2034 if (bdev_io->u.bdev.zcopy.start) { 2035 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2036 } else { 2037 return 0; 2038 } 2039 default: 2040 return 0; 2041 } 2042 } 2043 2044 static bool 2045 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2046 { 2047 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 2048 return true; 2049 } else { 2050 return false; 2051 } 2052 } 2053 2054 static bool 2055 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2056 { 2057 if (bdev_is_read_io(io) == false) { 2058 return false; 2059 } 2060 2061 return bdev_qos_rw_queue_io(limit, io); 2062 } 2063 2064 static bool 2065 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2066 { 2067 if (bdev_is_read_io(io) == true) { 2068 return false; 2069 } 2070 2071 return bdev_qos_rw_queue_io(limit, io); 2072 } 2073 2074 static void 2075 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2076 { 2077 limit->remaining_this_timeslice--; 2078 } 2079 2080 static void 2081 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2082 { 2083 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 2084 } 2085 2086 static void 2087 
bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2088 { 2089 if (bdev_is_read_io(io) == false) { 2090 return; 2091 } 2092 2093 return bdev_qos_rw_bps_update_quota(limit, io); 2094 } 2095 2096 static void 2097 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2098 { 2099 if (bdev_is_read_io(io) == true) { 2100 return; 2101 } 2102 2103 return bdev_qos_rw_bps_update_quota(limit, io); 2104 } 2105 2106 static void 2107 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2108 { 2109 int i; 2110 2111 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2112 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2113 qos->rate_limits[i].queue_io = NULL; 2114 qos->rate_limits[i].update_quota = NULL; 2115 continue; 2116 } 2117 2118 switch (i) { 2119 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2120 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2121 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2122 break; 2123 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2124 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2125 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2126 break; 2127 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2128 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2129 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2130 break; 2131 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2132 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2133 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2134 break; 2135 default: 2136 break; 2137 } 2138 } 2139 } 2140 2141 static void 2142 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2143 struct spdk_bdev_io *bdev_io, 2144 enum spdk_bdev_io_status status) 2145 { 2146 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2147 2148 bdev_io->internal.in_submit_request = true; 2149 bdev_ch->io_outstanding++; 2150 shared_resource->io_outstanding++; 2151 spdk_bdev_io_complete(bdev_io, status); 2152 bdev_io->internal.in_submit_request = false; 2153 } 2154 2155 static inline void 2156 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2157 { 2158 struct spdk_bdev *bdev = bdev_io->bdev; 2159 struct spdk_io_channel *ch = bdev_ch->channel; 2160 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2161 2162 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2163 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2164 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2165 2166 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2167 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2168 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2169 SPDK_BDEV_IO_STATUS_SUCCESS); 2170 return; 2171 } 2172 } 2173 2174 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2175 bdev_io->bdev->split_on_write_unit && 2176 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2177 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2178 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2179 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2180 return; 2181 } 2182 2183 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2184 bdev_ch->io_outstanding++; 2185 shared_resource->io_outstanding++; 2186 bdev_io->internal.in_submit_request = true; 2187 bdev->fn_table->submit_request(ch, bdev_io); 2188 
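/* in_submit_request is true only while the module's submit_request()
 * callback is running, so any code reached from inside that window
 * (e.g. a completion issued synchronously by the module) can tell it
 * was entered from the submission path.
 */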
bdev_io->internal.in_submit_request = false; 2189 } else { 2190 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 2191 } 2192 } 2193 2194 static bool 2195 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2196 { 2197 int i; 2198 2199 if (bdev_qos_io_to_limit(bdev_io) == true) { 2200 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2201 if (!qos->rate_limits[i].queue_io) { 2202 continue; 2203 } 2204 2205 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2206 bdev_io) == true) { 2207 return true; 2208 } 2209 } 2210 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2211 if (!qos->rate_limits[i].update_quota) { 2212 continue; 2213 } 2214 2215 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2216 } 2217 } 2218 2219 return false; 2220 } 2221 2222 static inline void 2223 _bdev_io_do_submit(void *ctx) 2224 { 2225 struct spdk_bdev_io *bdev_io = ctx; 2226 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2227 2228 bdev_io_do_submit(ch, bdev_io); 2229 } 2230 2231 static int 2232 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2233 { 2234 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2235 int submitted_ios = 0; 2236 2237 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2238 if (!bdev_qos_queue_io(qos, bdev_io)) { 2239 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2240 2241 if (bdev_io->internal.io_submit_ch) { 2242 /* Send back the IO to the original thread for the actual processing. */ 2243 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 2244 bdev_io->internal.io_submit_ch = NULL; 2245 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 2246 _bdev_io_do_submit, bdev_io); 2247 } else { 2248 bdev_io_do_submit(ch, bdev_io); 2249 } 2250 2251 submitted_ios++; 2252 } 2253 } 2254 2255 return submitted_ios; 2256 } 2257 2258 static void 2259 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2260 { 2261 int rc; 2262 2263 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2264 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2265 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2266 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2267 &bdev_io->internal.waitq_entry); 2268 if (rc != 0) { 2269 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2270 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2271 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2272 } 2273 } 2274 2275 static bool 2276 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2277 { 2278 uint32_t io_boundary; 2279 struct spdk_bdev *bdev = bdev_io->bdev; 2280 uint32_t max_size = bdev->max_segment_size; 2281 int max_segs = bdev->max_num_segments; 2282 2283 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2284 io_boundary = bdev->write_unit_size; 2285 } else if (bdev->split_on_optimal_io_boundary) { 2286 io_boundary = bdev->optimal_io_boundary; 2287 } else { 2288 io_boundary = 0; 2289 } 2290 2291 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2292 return false; 2293 } 2294 2295 if (io_boundary) { 2296 uint64_t start_stripe, end_stripe; 2297 2298 start_stripe = bdev_io->u.bdev.offset_blocks; 2299 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2300 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
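* Example: with optimal_io_boundary = 8 blocks (a power of two), an I/O at
* offset_blocks = 6 with num_blocks = 4 covers blocks 6..9, so start_stripe
* is 6 >> 3 = 0 and end_stripe is 9 >> 3 = 1; the stripes differ, so the
* I/O crosses the boundary and must be split.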
*/ 2301 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2302 start_stripe >>= spdk_u32log2(io_boundary); 2303 end_stripe >>= spdk_u32log2(io_boundary); 2304 } else { 2305 start_stripe /= io_boundary; 2306 end_stripe /= io_boundary; 2307 } 2308 2309 if (start_stripe != end_stripe) { 2310 return true; 2311 } 2312 } 2313 2314 if (max_segs) { 2315 if (bdev_io->u.bdev.iovcnt > max_segs) { 2316 return true; 2317 } 2318 } 2319 2320 if (max_size) { 2321 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2322 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2323 return true; 2324 } 2325 } 2326 } 2327 2328 return false; 2329 } 2330 2331 static bool 2332 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2333 { 2334 uint32_t num_unmap_segments; 2335 2336 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2337 return false; 2338 } 2339 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2340 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2341 return true; 2342 } 2343 2344 return false; 2345 } 2346 2347 static bool 2348 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2349 { 2350 if (!bdev_io->bdev->max_write_zeroes) { 2351 return false; 2352 } 2353 2354 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2355 return true; 2356 } 2357 2358 return false; 2359 } 2360 2361 static bool 2362 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2363 { 2364 if (bdev_io->bdev->max_copy != 0 && 2365 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2366 return true; 2367 } 2368 2369 return false; 2370 } 2371 2372 static bool 2373 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2374 { 2375 switch (bdev_io->type) { 2376 case SPDK_BDEV_IO_TYPE_READ: 2377 case SPDK_BDEV_IO_TYPE_WRITE: 2378 return bdev_rw_should_split(bdev_io); 2379 case SPDK_BDEV_IO_TYPE_UNMAP: 2380 return bdev_unmap_should_split(bdev_io); 2381 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2382 return bdev_write_zeroes_should_split(bdev_io); 2383 case SPDK_BDEV_IO_TYPE_COPY: 2384 return bdev_copy_should_split(bdev_io); 2385 default: 2386 return false; 2387 } 2388 } 2389 2390 static uint32_t 2391 _to_next_boundary(uint64_t offset, uint32_t boundary) 2392 { 2393 return (boundary - (offset % boundary)); 2394 } 2395 2396 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2397 2398 static void _bdev_rw_split(void *_bdev_io); 2399 2400 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2401 2402 static void 2403 _bdev_unmap_split(void *_bdev_io) 2404 { 2405 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2406 } 2407 2408 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2409 2410 static void 2411 _bdev_write_zeroes_split(void *_bdev_io) 2412 { 2413 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2414 } 2415 2416 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 2417 2418 static void 2419 _bdev_copy_split(void *_bdev_io) 2420 { 2421 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 2422 } 2423 2424 static int 2425 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2426 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2427 { 2428 int rc; 2429 uint64_t current_offset, current_remaining, current_src_offset; 2430 spdk_bdev_io_wait_cb io_wait_fn; 2431 2432 current_offset = *offset; 2433 current_remaining = *remaining; 2434 2435 bdev_io->u.bdev.split_outstanding++; 2436 2437 io_wait_fn = 
_bdev_rw_split; 2438 switch (bdev_io->type) { 2439 case SPDK_BDEV_IO_TYPE_READ: 2440 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2441 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2442 iov, iovcnt, md_buf, current_offset, 2443 num_blocks, 2444 bdev_io_split_done, bdev_io, 2445 bdev_io->internal.ext_opts, true); 2446 break; 2447 case SPDK_BDEV_IO_TYPE_WRITE: 2448 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2449 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2450 iov, iovcnt, md_buf, current_offset, 2451 num_blocks, 2452 bdev_io_split_done, bdev_io, 2453 bdev_io->internal.ext_opts, true); 2454 break; 2455 case SPDK_BDEV_IO_TYPE_UNMAP: 2456 io_wait_fn = _bdev_unmap_split; 2457 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2458 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2459 current_offset, num_blocks, 2460 bdev_io_split_done, bdev_io); 2461 break; 2462 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2463 io_wait_fn = _bdev_write_zeroes_split; 2464 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2465 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2466 current_offset, num_blocks, 2467 bdev_io_split_done, bdev_io); 2468 break; 2469 case SPDK_BDEV_IO_TYPE_COPY: 2470 io_wait_fn = _bdev_copy_split; 2471 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 2472 (current_offset - bdev_io->u.bdev.offset_blocks); 2473 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 2474 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2475 current_offset, current_src_offset, num_blocks, 2476 bdev_io_split_done, bdev_io); 2477 break; 2478 default: 2479 assert(false); 2480 rc = -EINVAL; 2481 break; 2482 } 2483 2484 if (rc == 0) { 2485 current_offset += num_blocks; 2486 current_remaining -= num_blocks; 2487 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2488 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2489 *offset = current_offset; 2490 *remaining = current_remaining; 2491 } else { 2492 bdev_io->u.bdev.split_outstanding--; 2493 if (rc == -ENOMEM) { 2494 if (bdev_io->u.bdev.split_outstanding == 0) { 2495 /* No I/O is outstanding. Hence we should wait here. */ 2496 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2497 } 2498 } else { 2499 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2500 if (bdev_io->u.bdev.split_outstanding == 0) { 2501 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2502 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2503 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2504 } 2505 } 2506 } 2507 2508 return rc; 2509 } 2510 2511 static void 2512 _bdev_rw_split(void *_bdev_io) 2513 { 2514 struct iovec *parent_iov, *iov; 2515 struct spdk_bdev_io *bdev_io = _bdev_io; 2516 struct spdk_bdev *bdev = bdev_io->bdev; 2517 uint64_t parent_offset, current_offset, remaining; 2518 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2519 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2520 uint32_t iovcnt, iov_len, child_iovsize; 2521 uint32_t blocklen = bdev->blocklen; 2522 uint32_t io_boundary; 2523 uint32_t max_segment_size = bdev->max_segment_size; 2524 uint32_t max_child_iovcnt = bdev->max_num_segments; 2525 void *md_buf = NULL; 2526 int rc; 2527 2528 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 2529 max_child_iovcnt = max_child_iovcnt ? 
spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 2530 SPDK_BDEV_IO_NUM_CHILD_IOV; 2531 2532 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2533 io_boundary = bdev->write_unit_size; 2534 } else if (bdev->split_on_optimal_io_boundary) { 2535 io_boundary = bdev->optimal_io_boundary; 2536 } else { 2537 io_boundary = UINT32_MAX; 2538 } 2539 2540 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2541 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 2542 parent_offset = bdev_io->u.bdev.offset_blocks; 2543 parent_iov_offset = (current_offset - parent_offset) * blocklen; 2544 parent_iovcnt = bdev_io->u.bdev.iovcnt; 2545 2546 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 2547 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2548 if (parent_iov_offset < parent_iov->iov_len) { 2549 break; 2550 } 2551 parent_iov_offset -= parent_iov->iov_len; 2552 } 2553 2554 child_iovcnt = 0; 2555 while (remaining > 0 && parent_iovpos < parent_iovcnt && 2556 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 2557 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 2558 to_next_boundary = spdk_min(remaining, to_next_boundary); 2559 to_next_boundary_bytes = to_next_boundary * blocklen; 2560 2561 iov = &bdev_io->child_iov[child_iovcnt]; 2562 iovcnt = 0; 2563 2564 if (bdev_io->u.bdev.md_buf) { 2565 md_buf = (char *)bdev_io->u.bdev.md_buf + 2566 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 2567 } 2568 2569 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 2570 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 2571 iovcnt < child_iovsize) { 2572 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2573 iov_len = parent_iov->iov_len - parent_iov_offset; 2574 2575 iov_len = spdk_min(iov_len, max_segment_size); 2576 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 2577 to_next_boundary_bytes -= iov_len; 2578 2579 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 2580 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 2581 2582 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 2583 parent_iov_offset += iov_len; 2584 } else { 2585 parent_iovpos++; 2586 parent_iov_offset = 0; 2587 } 2588 child_iovcnt++; 2589 iovcnt++; 2590 } 2591 2592 if (to_next_boundary_bytes > 0) { 2593 /* We had to stop this child I/O early because we ran out of 2594 * child_iov space or were limited by max_num_segments. 2595 * Ensure the iovs to be aligned with block size and 2596 * then adjust to_next_boundary before starting the 2597 * child I/O. 2598 */ 2599 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || 2600 iovcnt == child_iovsize); 2601 to_last_block_bytes = to_next_boundary_bytes % blocklen; 2602 if (to_last_block_bytes != 0) { 2603 uint32_t child_iovpos = child_iovcnt - 1; 2604 /* don't decrease child_iovcnt when it equals to SPDK_BDEV_IO_NUM_CHILD_IOV 2605 * so the loop will naturally end 2606 */ 2607 2608 to_last_block_bytes = blocklen - to_last_block_bytes; 2609 to_next_boundary_bytes += to_last_block_bytes; 2610 while (to_last_block_bytes > 0 && iovcnt > 0) { 2611 iov_len = spdk_min(to_last_block_bytes, 2612 bdev_io->child_iov[child_iovpos].iov_len); 2613 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 2614 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 2615 child_iovpos--; 2616 if (--iovcnt == 0) { 2617 /* If the child IO is less than a block size just return. 
2618 * If the first child IO of any split round is less than 2619 * a block size, the split fails with an error. 2620 */ 2621 if (bdev_io->u.bdev.split_outstanding == 0) { 2622 SPDK_ERRLOG("The first child io was less than a block size\n"); 2623 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2624 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2625 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2626 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2627 } 2628 2629 return; 2630 } 2631 } 2632 2633 to_last_block_bytes -= iov_len; 2634 2635 if (parent_iov_offset == 0) { 2636 parent_iovpos--; 2637 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 2638 } 2639 parent_iov_offset -= iov_len; 2640 } 2641 2642 assert(to_last_block_bytes == 0); 2643 } 2644 to_next_boundary -= to_next_boundary_bytes / blocklen; 2645 } 2646 2647 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 2648 &current_offset, &remaining); 2649 if (spdk_unlikely(rc)) { 2650 return; 2651 } 2652 } 2653 } 2654 2655 static void 2656 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 2657 { 2658 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 2659 uint32_t num_children_reqs = 0; 2660 int rc; 2661 2662 offset = bdev_io->u.bdev.split_current_offset_blocks; 2663 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2664 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 2665 2666 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2667 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 2668 2669 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 2670 &offset, &remaining); 2671 if (spdk_likely(rc == 0)) { 2672 num_children_reqs++; 2673 } else { 2674 return; 2675 } 2676 } 2677 } 2678 2679 static void 2680 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 2681 { 2682 uint64_t offset, write_zeroes_blocks, remaining; 2683 uint32_t num_children_reqs = 0; 2684 int rc; 2685 2686 offset = bdev_io->u.bdev.split_current_offset_blocks; 2687 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2688 2689 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2690 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 2691 2692 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 2693 &offset, &remaining); 2694 if (spdk_likely(rc == 0)) { 2695 num_children_reqs++; 2696 } else { 2697 return; 2698 } 2699 } 2700 } 2701 2702 static void 2703 bdev_copy_split(struct spdk_bdev_io *bdev_io) 2704 { 2705 uint64_t offset, copy_blocks, remaining; 2706 uint32_t num_children_reqs = 0; 2707 int rc; 2708 2709 offset = bdev_io->u.bdev.split_current_offset_blocks; 2710 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2711 2712 assert(bdev_io->bdev->max_copy != 0); 2713 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) { 2714 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy); 2715 2716 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks, 2717 &offset, &remaining); 2718 if (spdk_likely(rc == 0)) { 2719 num_children_reqs++; 2720 } else { 2721 return; 2722 } 2723 } 2724 } 2725 2726 static void 2727 parent_bdev_io_complete(void *ctx, int rc) 2728 { 2729 struct spdk_bdev_io *parent_io = ctx; 2730 2731 if (rc) { 2732 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2733 } 2734 2735 parent_io->internal.cb(parent_io,
parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 2736 parent_io->internal.caller_ctx); 2737 } 2738 2739 static void 2740 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 2741 { 2742 struct spdk_bdev_io *parent_io = cb_arg; 2743 2744 spdk_bdev_free_io(bdev_io); 2745 2746 if (!success) { 2747 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2748 /* If any child I/O failed, stop further splitting process. */ 2749 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 2750 parent_io->u.bdev.split_remaining_num_blocks = 0; 2751 } 2752 parent_io->u.bdev.split_outstanding--; 2753 if (parent_io->u.bdev.split_outstanding != 0) { 2754 return; 2755 } 2756 2757 /* 2758 * Parent I/O finishes when all blocks are consumed. 2759 */ 2760 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 2761 assert(parent_io->internal.cb != bdev_io_split_done); 2762 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 2763 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 2764 2765 if (parent_io->internal.orig_iovcnt != 0) { 2766 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 2767 /* bdev IO will be completed in the callback */ 2768 } else { 2769 parent_bdev_io_complete(parent_io, 0); 2770 } 2771 return; 2772 } 2773 2774 /* 2775 * Continue with the splitting process. This function will complete the parent I/O if the 2776 * splitting is done. 2777 */ 2778 switch (parent_io->type) { 2779 case SPDK_BDEV_IO_TYPE_READ: 2780 case SPDK_BDEV_IO_TYPE_WRITE: 2781 _bdev_rw_split(parent_io); 2782 break; 2783 case SPDK_BDEV_IO_TYPE_UNMAP: 2784 bdev_unmap_split(parent_io); 2785 break; 2786 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2787 bdev_write_zeroes_split(parent_io); 2788 break; 2789 case SPDK_BDEV_IO_TYPE_COPY: 2790 bdev_copy_split(parent_io); 2791 break; 2792 default: 2793 assert(false); 2794 break; 2795 } 2796 } 2797 2798 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2799 bool success); 2800 2801 static void 2802 bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 2803 { 2804 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 2805 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 2806 bdev_io->u.bdev.split_outstanding = 0; 2807 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 2808 2809 switch (bdev_io->type) { 2810 case SPDK_BDEV_IO_TYPE_READ: 2811 case SPDK_BDEV_IO_TYPE_WRITE: 2812 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 2813 _bdev_rw_split(bdev_io); 2814 } else { 2815 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 2816 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 2817 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 2818 } 2819 break; 2820 case SPDK_BDEV_IO_TYPE_UNMAP: 2821 bdev_unmap_split(bdev_io); 2822 break; 2823 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2824 bdev_write_zeroes_split(bdev_io); 2825 break; 2826 case SPDK_BDEV_IO_TYPE_COPY: 2827 bdev_copy_split(bdev_io); 2828 break; 2829 default: 2830 assert(false); 2831 break; 2832 } 2833 } 2834 2835 static void 2836 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 2837 { 2838 if (!success) { 2839 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2840 return; 2841 } 2842 2843 _bdev_rw_split(bdev_io); 2844 } 2845 2846 /* Explicitly mark this inline, since it's used as a function pointer and 
otherwise won't 2847 * be inlined, at least on some compilers. 2848 */ 2849 static inline void 2850 _bdev_io_submit(void *ctx) 2851 { 2852 struct spdk_bdev_io *bdev_io = ctx; 2853 struct spdk_bdev *bdev = bdev_io->bdev; 2854 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2855 2856 if (spdk_likely(bdev_ch->flags == 0)) { 2857 bdev_io_do_submit(bdev_ch, bdev_io); 2858 return; 2859 } 2860 2861 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 2862 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 2863 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 2864 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 2865 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 2866 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2867 } else { 2868 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 2869 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 2870 } 2871 } else { 2872 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 2873 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2874 } 2875 } 2876 2877 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 2878 2879 bool 2880 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 2881 { 2882 if (range1->length == 0 || range2->length == 0) { 2883 return false; 2884 } 2885 2886 if (range1->offset + range1->length <= range2->offset) { 2887 return false; 2888 } 2889 2890 if (range2->offset + range2->length <= range1->offset) { 2891 return false; 2892 } 2893 2894 return true; 2895 } 2896 2897 static bool 2898 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 2899 { 2900 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2901 struct lba_range r; 2902 2903 switch (bdev_io->type) { 2904 case SPDK_BDEV_IO_TYPE_NVME_IO: 2905 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2906 /* Don't try to decode the NVMe command - just assume worst-case and that 2907 * it overlaps a locked range. 2908 */ 2909 return true; 2910 case SPDK_BDEV_IO_TYPE_WRITE: 2911 case SPDK_BDEV_IO_TYPE_UNMAP: 2912 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2913 case SPDK_BDEV_IO_TYPE_ZCOPY: 2914 case SPDK_BDEV_IO_TYPE_COPY: 2915 r.offset = bdev_io->u.bdev.offset_blocks; 2916 r.length = bdev_io->u.bdev.num_blocks; 2917 if (!bdev_lba_range_overlapped(range, &r)) { 2918 /* This I/O doesn't overlap the specified LBA range. */ 2919 return false; 2920 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 2921 /* This I/O overlaps, but the I/O is on the same channel that locked this 2922 * range, and the caller_ctx is the same as the locked_ctx. This means 2923 * that this I/O is associated with the lock, and is allowed to execute. 
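* Any other overlapping I/O is held back: bdev_io_submit() below queues it
* on ch->io_locked instead of submitting it, and it is resubmitted once the
* LBA range is unlocked.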
2924 */ 2925 return false; 2926 } else { 2927 return true; 2928 } 2929 default: 2930 return false; 2931 } 2932 } 2933 2934 void 2935 bdev_io_submit(struct spdk_bdev_io *bdev_io) 2936 { 2937 struct spdk_bdev *bdev = bdev_io->bdev; 2938 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 2939 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2940 2941 assert(thread != NULL); 2942 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2943 2944 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 2945 struct lba_range *range; 2946 2947 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 2948 if (bdev_io_range_is_locked(bdev_io, range)) { 2949 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 2950 return; 2951 } 2952 } 2953 } 2954 2955 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 2956 2957 bdev_io->internal.submit_tsc = spdk_get_ticks(); 2958 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 2959 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 2960 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 2961 spdk_bdev_get_name(bdev)); 2962 2963 if (bdev_io_should_split(bdev_io)) { 2964 bdev_io_split(NULL, bdev_io); 2965 return; 2966 } 2967 2968 if (ch->flags & BDEV_CH_QOS_ENABLED) { 2969 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 2970 _bdev_io_submit(bdev_io); 2971 } else { 2972 bdev_io->internal.io_submit_ch = ch; 2973 bdev_io->internal.ch = bdev->internal.qos->ch; 2974 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 2975 } 2976 } else { 2977 _bdev_io_submit(bdev_io); 2978 } 2979 } 2980 2981 static inline void 2982 _bdev_io_copy_ext_opts(struct spdk_bdev_io *bdev_io, struct spdk_bdev_ext_io_opts *opts) 2983 { 2984 struct spdk_bdev_ext_io_opts *opts_copy = &bdev_io->internal.ext_opts_copy; 2985 2986 /* Zero part we don't copy */ 2987 memset(((char *)opts_copy) + opts->size, 0, sizeof(*opts) - opts->size); 2988 memcpy(opts_copy, opts, opts->size); 2989 opts_copy->size = sizeof(*opts_copy); 2990 opts_copy->metadata = bdev_io->u.bdev.md_buf; 2991 /* Save pointer to the copied ext_opts which will be used by bdev modules */ 2992 bdev_io->u.bdev.ext_opts = opts_copy; 2993 } 2994 2995 static inline void 2996 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 2997 { 2998 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 2999 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 3000 * For write operation we need to pull buffers from memory domain before submitting IO. 
3001 * Once read operation completes, we need to use memory_domain push functionality to 3002 * update data in original memory domain IO buffer 3003 * This IO request will go through a regular IO flow, so clear memory domains pointers in 3004 * the copied ext_opts */ 3005 bdev_io->internal.ext_opts_copy.memory_domain = NULL; 3006 bdev_io->internal.ext_opts_copy.memory_domain_ctx = NULL; 3007 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 3008 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 3009 } 3010 3011 static inline void 3012 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io, 3013 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 3014 { 3015 if (opts) { 3016 bool use_pull_push = opts->memory_domain && !desc->memory_domains_supported; 3017 assert(opts->size <= sizeof(*opts)); 3018 /* 3019 * copy if size is smaller than opts struct to avoid having to check size 3020 * on every access to bdev_io->u.bdev.ext_opts 3021 */ 3022 if (copy_opts || use_pull_push || opts->size < sizeof(*opts)) { 3023 _bdev_io_copy_ext_opts(bdev_io, opts); 3024 if (use_pull_push) { 3025 _bdev_io_ext_use_bounce_buffer(bdev_io); 3026 return; 3027 } 3028 } 3029 } 3030 bdev_io_submit(bdev_io); 3031 } 3032 3033 static void 3034 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 3035 { 3036 struct spdk_bdev *bdev = bdev_io->bdev; 3037 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 3038 struct spdk_io_channel *ch = bdev_ch->channel; 3039 3040 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 3041 3042 bdev_io->internal.in_submit_request = true; 3043 bdev->fn_table->submit_request(ch, bdev_io); 3044 bdev_io->internal.in_submit_request = false; 3045 } 3046 3047 void 3048 bdev_io_init(struct spdk_bdev_io *bdev_io, 3049 struct spdk_bdev *bdev, void *cb_arg, 3050 spdk_bdev_io_completion_cb cb) 3051 { 3052 bdev_io->bdev = bdev; 3053 bdev_io->internal.caller_ctx = cb_arg; 3054 bdev_io->internal.cb = cb; 3055 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 3056 bdev_io->internal.in_submit_request = false; 3057 bdev_io->internal.buf = NULL; 3058 bdev_io->internal.io_submit_ch = NULL; 3059 bdev_io->internal.orig_iovs = NULL; 3060 bdev_io->internal.orig_iovcnt = 0; 3061 bdev_io->internal.orig_md_iov.iov_base = NULL; 3062 bdev_io->internal.error.nvme.cdw0 = 0; 3063 bdev_io->num_retries = 0; 3064 bdev_io->internal.get_buf_cb = NULL; 3065 bdev_io->internal.get_aux_buf_cb = NULL; 3066 bdev_io->internal.ext_opts = NULL; 3067 bdev_io->internal.data_transfer_cpl = NULL; 3068 } 3069 3070 static bool 3071 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3072 { 3073 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3074 } 3075 3076 bool 3077 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3078 { 3079 bool supported; 3080 3081 supported = bdev_io_type_supported(bdev, io_type); 3082 3083 if (!supported) { 3084 switch (io_type) { 3085 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3086 /* The bdev layer will emulate write zeroes as long as write is supported. 
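* Callers commonly use this check to pick a code path, e.g. (illustrative
* sketch; desc, io_ch, offset, num_blocks, cb and cb_arg are assumed to
* exist in the caller):
*
*   if (spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_UNMAP)) {
*           rc = spdk_bdev_unmap_blocks(desc, io_ch, offset, num_blocks, cb, cb_arg);
*   } else {
*           rc = spdk_bdev_write_zeroes_blocks(desc, io_ch, offset, num_blocks, cb, cb_arg);
*   }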
*/ 3087 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3088 break; 3089 default: 3090 break; 3091 } 3092 } 3093 3094 return supported; 3095 } 3096 3097 uint64_t 3098 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3099 { 3100 return bdev_io->internal.submit_tsc; 3101 } 3102 3103 int 3104 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3105 { 3106 if (bdev->fn_table->dump_info_json) { 3107 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3108 } 3109 3110 return 0; 3111 } 3112 3113 static void 3114 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3115 { 3116 uint32_t max_per_timeslice = 0; 3117 int i; 3118 3119 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3120 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3121 qos->rate_limits[i].max_per_timeslice = 0; 3122 continue; 3123 } 3124 3125 max_per_timeslice = qos->rate_limits[i].limit * 3126 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3127 3128 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3129 qos->rate_limits[i].min_per_timeslice); 3130 3131 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 3132 } 3133 3134 bdev_qos_set_ops(qos); 3135 } 3136 3137 static int 3138 bdev_channel_poll_qos(void *arg) 3139 { 3140 struct spdk_bdev_qos *qos = arg; 3141 uint64_t now = spdk_get_ticks(); 3142 int i; 3143 3144 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3145 /* We received our callback earlier than expected - return 3146 * immediately and wait to do accounting until at least one 3147 * timeslice has actually expired. This should never happen 3148 * with a well-behaved timer implementation. 3149 */ 3150 return SPDK_POLLER_IDLE; 3151 } 3152 3153 /* Reset for next round of rate limiting */ 3154 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3155 /* We may have allowed the IOs or bytes to slightly overrun in the last 3156 * timeslice. remaining_this_timeslice is signed, so if it's negative 3157 * here, we'll account for the overrun so that the next timeslice will 3158 * be appropriately reduced. 
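* For example, with a byte limit of 10 MiB/s (10,485,760 bytes/s) and the
* 1 ms timeslice used here, max_per_timeslice is 10485 bytes. If a 64 KiB
* I/O is admitted while only 4 KiB remain, remaining_this_timeslice drops
* to -61440, and the next several timeslices start in deficit until the
* overrun is paid back.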
3159 */ 3160 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 3161 qos->rate_limits[i].remaining_this_timeslice = 0; 3162 } 3163 } 3164 3165 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3166 qos->last_timeslice += qos->timeslice_size; 3167 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3168 qos->rate_limits[i].remaining_this_timeslice += 3169 qos->rate_limits[i].max_per_timeslice; 3170 } 3171 } 3172 3173 return bdev_qos_io_submit(qos->ch, qos); 3174 } 3175 3176 static void 3177 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3178 { 3179 struct spdk_bdev_shared_resource *shared_resource; 3180 struct lba_range *range; 3181 3182 bdev_free_io_stat(ch->stat); 3183 #ifdef SPDK_CONFIG_VTUNE 3184 bdev_free_io_stat(ch->prev_stat); 3185 #endif 3186 3187 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3188 range = TAILQ_FIRST(&ch->locked_ranges); 3189 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3190 free(range); 3191 } 3192 3193 spdk_put_io_channel(ch->channel); 3194 3195 shared_resource = ch->shared_resource; 3196 3197 assert(TAILQ_EMPTY(&ch->io_locked)); 3198 assert(TAILQ_EMPTY(&ch->io_submitted)); 3199 assert(ch->io_outstanding == 0); 3200 assert(shared_resource->ref > 0); 3201 shared_resource->ref--; 3202 if (shared_resource->ref == 0) { 3203 assert(shared_resource->io_outstanding == 0); 3204 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3205 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3206 free(shared_resource); 3207 } 3208 } 3209 3210 static void 3211 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3212 { 3213 struct spdk_bdev_qos *qos = bdev->internal.qos; 3214 int i; 3215 3216 assert(spdk_spin_held(&bdev->internal.spinlock)); 3217 3218 /* Rate limiting on this bdev enabled */ 3219 if (qos) { 3220 if (qos->ch == NULL) { 3221 struct spdk_io_channel *io_ch; 3222 3223 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3224 bdev->name, spdk_get_thread()); 3225 3226 /* No qos channel has been selected, so set one up */ 3227 3228 /* Take another reference to ch */ 3229 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3230 assert(io_ch != NULL); 3231 qos->ch = ch; 3232 3233 qos->thread = spdk_io_channel_get_thread(io_ch); 3234 3235 TAILQ_INIT(&qos->queued); 3236 3237 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3238 if (bdev_qos_is_iops_rate_limit(i) == true) { 3239 qos->rate_limits[i].min_per_timeslice = 3240 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3241 } else { 3242 qos->rate_limits[i].min_per_timeslice = 3243 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3244 } 3245 3246 if (qos->rate_limits[i].limit == 0) { 3247 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3248 } 3249 } 3250 bdev_qos_update_max_quota_per_timeslice(qos); 3251 qos->timeslice_size = 3252 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3253 qos->last_timeslice = spdk_get_ticks(); 3254 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3255 qos, 3256 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3257 } 3258 3259 ch->flags |= BDEV_CH_QOS_ENABLED; 3260 } 3261 } 3262 3263 struct poll_timeout_ctx { 3264 struct spdk_bdev_desc *desc; 3265 uint64_t timeout_in_sec; 3266 spdk_bdev_io_timeout_cb cb_fn; 3267 void *cb_arg; 3268 }; 3269 3270 static void 3271 bdev_desc_free(struct spdk_bdev_desc *desc) 3272 { 3273 spdk_spin_destroy(&desc->spinlock); 3274 free(desc->media_events_buffer); 3275 free(desc); 3276 } 3277 3278 static void 3279 
bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 3280 { 3281 struct poll_timeout_ctx *ctx = _ctx; 3282 struct spdk_bdev_desc *desc = ctx->desc; 3283 3284 free(ctx); 3285 3286 spdk_spin_lock(&desc->spinlock); 3287 desc->refs--; 3288 if (desc->closed == true && desc->refs == 0) { 3289 spdk_spin_unlock(&desc->spinlock); 3290 bdev_desc_free(desc); 3291 return; 3292 } 3293 spdk_spin_unlock(&desc->spinlock); 3294 } 3295 3296 static void 3297 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3298 struct spdk_io_channel *io_ch, void *_ctx) 3299 { 3300 struct poll_timeout_ctx *ctx = _ctx; 3301 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3302 struct spdk_bdev_desc *desc = ctx->desc; 3303 struct spdk_bdev_io *bdev_io; 3304 uint64_t now; 3305 3306 spdk_spin_lock(&desc->spinlock); 3307 if (desc->closed == true) { 3308 spdk_spin_unlock(&desc->spinlock); 3309 spdk_bdev_for_each_channel_continue(i, -1); 3310 return; 3311 } 3312 spdk_spin_unlock(&desc->spinlock); 3313 3314 now = spdk_get_ticks(); 3315 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3316 /* Exclude any I/O that are generated via splitting. */ 3317 if (bdev_io->internal.cb == bdev_io_split_done) { 3318 continue; 3319 } 3320 3321 /* Once we find an I/O that has not timed out, we can immediately 3322 * exit the loop. 3323 */ 3324 if (now < (bdev_io->internal.submit_tsc + 3325 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3326 goto end; 3327 } 3328 3329 if (bdev_io->internal.desc == desc) { 3330 ctx->cb_fn(ctx->cb_arg, bdev_io); 3331 } 3332 } 3333 3334 end: 3335 spdk_bdev_for_each_channel_continue(i, 0); 3336 } 3337 3338 static int 3339 bdev_poll_timeout_io(void *arg) 3340 { 3341 struct spdk_bdev_desc *desc = arg; 3342 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3343 struct poll_timeout_ctx *ctx; 3344 3345 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3346 if (!ctx) { 3347 SPDK_ERRLOG("failed to allocate memory\n"); 3348 return SPDK_POLLER_BUSY; 3349 } 3350 ctx->desc = desc; 3351 ctx->cb_arg = desc->cb_arg; 3352 ctx->cb_fn = desc->cb_fn; 3353 ctx->timeout_in_sec = desc->timeout_in_sec; 3354 3355 /* Take a ref on the descriptor in case it gets closed while we are checking 3356 * all of the channels. 
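* The reference is dropped in bdev_channel_poll_timeout_io_done(); if the
* descriptor was closed while the iteration was in flight, the last
* reference dropped there frees it.
*
* This poller is registered by spdk_bdev_set_timeout() below. A typical
* user enables it with something like (illustrative; my_timeout_cb and
* my_ctx are hypothetical):
*
*   static void
*   my_timeout_cb(void *cb_arg, struct spdk_bdev_io *bdev_io)
*   {
*           SPDK_NOTICELOG("I/O %p exceeded the configured timeout\n", bdev_io);
*   }
*
*   spdk_bdev_set_timeout(desc, 30, my_timeout_cb, my_ctx);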
3357 */ 3358 spdk_spin_lock(&desc->spinlock); 3359 desc->refs++; 3360 spdk_spin_unlock(&desc->spinlock); 3361 3362 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 3363 bdev_channel_poll_timeout_io_done); 3364 3365 return SPDK_POLLER_BUSY; 3366 } 3367 3368 int 3369 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3370 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3371 { 3372 assert(desc->thread == spdk_get_thread()); 3373 3374 spdk_poller_unregister(&desc->io_timeout_poller); 3375 3376 if (timeout_in_sec) { 3377 assert(cb_fn != NULL); 3378 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3379 desc, 3380 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3381 1000); 3382 if (desc->io_timeout_poller == NULL) { 3383 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3384 return -1; 3385 } 3386 } 3387 3388 desc->cb_fn = cb_fn; 3389 desc->cb_arg = cb_arg; 3390 desc->timeout_in_sec = timeout_in_sec; 3391 3392 return 0; 3393 } 3394 3395 static int 3396 bdev_channel_create(void *io_device, void *ctx_buf) 3397 { 3398 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3399 struct spdk_bdev_channel *ch = ctx_buf; 3400 struct spdk_io_channel *mgmt_io_ch; 3401 struct spdk_bdev_mgmt_channel *mgmt_ch; 3402 struct spdk_bdev_shared_resource *shared_resource; 3403 struct lba_range *range; 3404 3405 ch->bdev = bdev; 3406 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3407 if (!ch->channel) { 3408 return -1; 3409 } 3410 3411 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3412 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3413 3414 assert(ch->histogram == NULL); 3415 if (bdev->internal.histogram_enabled) { 3416 ch->histogram = spdk_histogram_data_alloc(); 3417 if (ch->histogram == NULL) { 3418 SPDK_ERRLOG("Could not allocate histogram\n"); 3419 } 3420 } 3421 3422 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3423 if (!mgmt_io_ch) { 3424 spdk_put_io_channel(ch->channel); 3425 return -1; 3426 } 3427 3428 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 3429 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3430 if (shared_resource->shared_ch == ch->channel) { 3431 spdk_put_io_channel(mgmt_io_ch); 3432 shared_resource->ref++; 3433 break; 3434 } 3435 } 3436 3437 if (shared_resource == NULL) { 3438 shared_resource = calloc(1, sizeof(*shared_resource)); 3439 if (shared_resource == NULL) { 3440 spdk_put_io_channel(ch->channel); 3441 spdk_put_io_channel(mgmt_io_ch); 3442 return -1; 3443 } 3444 3445 shared_resource->mgmt_ch = mgmt_ch; 3446 shared_resource->io_outstanding = 0; 3447 TAILQ_INIT(&shared_resource->nomem_io); 3448 shared_resource->nomem_threshold = 0; 3449 shared_resource->shared_ch = ch->channel; 3450 shared_resource->ref = 1; 3451 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3452 } 3453 3454 ch->io_outstanding = 0; 3455 TAILQ_INIT(&ch->queued_resets); 3456 TAILQ_INIT(&ch->locked_ranges); 3457 ch->flags = 0; 3458 ch->shared_resource = shared_resource; 3459 3460 TAILQ_INIT(&ch->io_submitted); 3461 TAILQ_INIT(&ch->io_locked); 3462 3463 ch->stat = bdev_alloc_io_stat(false); 3464 if (ch->stat == NULL) { 3465 bdev_channel_destroy_resource(ch); 3466 return -1; 3467 } 3468 3469 ch->stat->ticks_rate = spdk_get_ticks_hz(); 3470 3471 #ifdef SPDK_CONFIG_VTUNE 3472 { 3473 char *name; 3474 __itt_init_ittlib(NULL, 0); 3475 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3476 if (!name) { 3477 bdev_channel_destroy_resource(ch); 3478 
return -1; 3479 } 3480 ch->handle = __itt_string_handle_create(name); 3481 free(name); 3482 ch->start_tsc = spdk_get_ticks(); 3483 ch->interval_tsc = spdk_get_ticks_hz() / 100; 3484 ch->prev_stat = bdev_alloc_io_stat(false); 3485 if (ch->prev_stat == NULL) { 3486 bdev_channel_destroy_resource(ch); 3487 return -1; 3488 } 3489 } 3490 #endif 3491 3492 spdk_spin_lock(&bdev->internal.spinlock); 3493 bdev_enable_qos(bdev, ch); 3494 3495 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 3496 struct lba_range *new_range; 3497 3498 new_range = calloc(1, sizeof(*new_range)); 3499 if (new_range == NULL) { 3500 spdk_spin_unlock(&bdev->internal.spinlock); 3501 bdev_channel_destroy_resource(ch); 3502 return -1; 3503 } 3504 new_range->length = range->length; 3505 new_range->offset = range->offset; 3506 new_range->locked_ctx = range->locked_ctx; 3507 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 3508 } 3509 3510 spdk_spin_unlock(&bdev->internal.spinlock); 3511 3512 return 0; 3513 } 3514 3515 static int 3516 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 3517 void *cb_ctx) 3518 { 3519 struct spdk_bdev_channel *bdev_ch = cb_ctx; 3520 struct spdk_bdev_io *bdev_io; 3521 uint64_t buf_len; 3522 3523 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 3524 if (bdev_io->internal.ch == bdev_ch) { 3525 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 3526 spdk_iobuf_entry_abort(ch, entry, buf_len); 3527 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3528 } 3529 3530 return 0; 3531 } 3532 3533 /* 3534 * Abort I/O that are waiting on a data buffer. 3535 */ 3536 static void 3537 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 3538 { 3539 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 3540 bdev_abort_all_buf_io_cb, ch); 3541 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 3542 bdev_abort_all_buf_io_cb, ch); 3543 } 3544 3545 /* 3546 * Abort I/O that are queued waiting for submission. These types of I/O are 3547 * linked using the spdk_bdev_io link TAILQ_ENTRY. 3548 */ 3549 static void 3550 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 3551 { 3552 struct spdk_bdev_io *bdev_io, *tmp; 3553 3554 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 3555 if (bdev_io->internal.ch == ch) { 3556 TAILQ_REMOVE(queue, bdev_io, internal.link); 3557 /* 3558 * spdk_bdev_io_complete() assumes that the completed I/O had 3559 * been submitted to the bdev module. Since in this case it 3560 * hadn't, bump io_outstanding to account for the decrement 3561 * that spdk_bdev_io_complete() will do. 
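* Resets are excluded below because the completion path accounts for
* SPDK_BDEV_IO_TYPE_RESET separately and does not decrement io_outstanding
* for it.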
3562 */ 3563 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 3564 ch->io_outstanding++; 3565 ch->shared_resource->io_outstanding++; 3566 } 3567 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3568 } 3569 } 3570 } 3571 3572 static bool 3573 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3574 { 3575 struct spdk_bdev_io *bdev_io; 3576 3577 TAILQ_FOREACH(bdev_io, queue, internal.link) { 3578 if (bdev_io == bio_to_abort) { 3579 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 3580 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3581 return true; 3582 } 3583 } 3584 3585 return false; 3586 } 3587 3588 static int 3589 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 3590 { 3591 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 3592 uint64_t buf_len; 3593 3594 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 3595 if (bdev_io == bio_to_abort) { 3596 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 3597 spdk_iobuf_entry_abort(ch, entry, buf_len); 3598 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3599 return 1; 3600 } 3601 3602 return 0; 3603 } 3604 3605 static bool 3606 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 3607 { 3608 int rc; 3609 3610 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 3611 bdev_abort_buf_io_cb, bio_to_abort); 3612 if (rc == 1) { 3613 return true; 3614 } 3615 3616 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 3617 bdev_abort_buf_io_cb, bio_to_abort); 3618 return rc == 1; 3619 } 3620 3621 static void 3622 bdev_qos_channel_destroy(void *cb_arg) 3623 { 3624 struct spdk_bdev_qos *qos = cb_arg; 3625 3626 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 3627 spdk_poller_unregister(&qos->poller); 3628 3629 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 3630 3631 free(qos); 3632 } 3633 3634 static int 3635 bdev_qos_destroy(struct spdk_bdev *bdev) 3636 { 3637 int i; 3638 3639 /* 3640 * Cleanly shutting down the QoS poller is tricky, because 3641 * during the asynchronous operation the user could open 3642 * a new descriptor and create a new channel, spawning 3643 * a new QoS poller. 3644 * 3645 * The strategy is to create a new QoS structure here and swap it 3646 * in. The shutdown path then continues to refer to the old one 3647 * until it completes and then releases it. 3648 */ 3649 struct spdk_bdev_qos *new_qos, *old_qos; 3650 3651 old_qos = bdev->internal.qos; 3652 3653 new_qos = calloc(1, sizeof(*new_qos)); 3654 if (!new_qos) { 3655 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 3656 return -ENOMEM; 3657 } 3658 3659 /* Copy the old QoS data into the newly allocated structure */ 3660 memcpy(new_qos, old_qos, sizeof(*new_qos)); 3661 3662 /* Zero out the key parts of the QoS structure */ 3663 new_qos->ch = NULL; 3664 new_qos->thread = NULL; 3665 new_qos->poller = NULL; 3666 TAILQ_INIT(&new_qos->queued); 3667 /* 3668 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 3669 * It will be used later for the new QoS structure. 
3670 */ 3671 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3672 new_qos->rate_limits[i].remaining_this_timeslice = 0; 3673 new_qos->rate_limits[i].min_per_timeslice = 0; 3674 new_qos->rate_limits[i].max_per_timeslice = 0; 3675 } 3676 3677 bdev->internal.qos = new_qos; 3678 3679 if (old_qos->thread == NULL) { 3680 free(old_qos); 3681 } else { 3682 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 3683 } 3684 3685 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 3686 * been destroyed yet. The destruction path will end up waiting for the final 3687 * channel to be put before it releases resources. */ 3688 3689 return 0; 3690 } 3691 3692 static void 3693 bdev_add_io_stat(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 3694 { 3695 total->bytes_read += add->bytes_read; 3696 total->num_read_ops += add->num_read_ops; 3697 total->bytes_written += add->bytes_written; 3698 total->num_write_ops += add->num_write_ops; 3699 total->bytes_unmapped += add->bytes_unmapped; 3700 total->num_unmap_ops += add->num_unmap_ops; 3701 total->bytes_copied += add->bytes_copied; 3702 total->num_copy_ops += add->num_copy_ops; 3703 total->read_latency_ticks += add->read_latency_ticks; 3704 total->write_latency_ticks += add->write_latency_ticks; 3705 total->unmap_latency_ticks += add->unmap_latency_ticks; 3706 total->copy_latency_ticks += add->copy_latency_ticks; 3707 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 3708 total->max_read_latency_ticks = add->max_read_latency_ticks; 3709 } 3710 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 3711 total->min_read_latency_ticks = add->min_read_latency_ticks; 3712 } 3713 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 3714 total->max_write_latency_ticks = add->max_write_latency_ticks; 3715 } 3716 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 3717 total->min_write_latency_ticks = add->min_write_latency_ticks; 3718 } 3719 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 3720 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 3721 } 3722 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 3723 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 3724 } 3725 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 3726 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 3727 } 3728 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 3729 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 3730 } 3731 } 3732 3733 static void 3734 bdev_get_io_stat(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 3735 { 3736 memcpy(to_stat, from_stat, offsetof(struct spdk_bdev_io_stat, io_error)); 3737 3738 if (to_stat->io_error != NULL && from_stat->io_error != NULL) { 3739 memcpy(to_stat->io_error, from_stat->io_error, 3740 sizeof(struct spdk_bdev_io_error_stat)); 3741 } 3742 } 3743 3744 static void 3745 bdev_reset_io_stat(struct spdk_bdev_io_stat *stat, enum bdev_reset_stat_mode mode) 3746 { 3747 stat->max_read_latency_ticks = 0; 3748 stat->min_read_latency_ticks = UINT64_MAX; 3749 stat->max_write_latency_ticks = 0; 3750 stat->min_write_latency_ticks = UINT64_MAX; 3751 stat->max_unmap_latency_ticks = 0; 3752 stat->min_unmap_latency_ticks = UINT64_MAX; 3753 stat->max_copy_latency_ticks = 0; 3754 stat->min_copy_latency_ticks = UINT64_MAX; 3755 3756 if (mode != BDEV_RESET_STAT_ALL) { 3757 return; 3758 } 3759 
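/* Modes other than BDEV_RESET_STAT_ALL return above, having cleared only the
 * max/min latency fields; the cumulative counters below are preserved.
 */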
3760 stat->bytes_read = 0; 3761 stat->num_read_ops = 0; 3762 stat->bytes_written = 0; 3763 stat->num_write_ops = 0; 3764 stat->bytes_unmapped = 0; 3765 stat->num_unmap_ops = 0; stat->bytes_copied = 0; stat->num_copy_ops = 0; 3766 stat->read_latency_ticks = 0; 3767 stat->write_latency_ticks = 0; 3768 stat->unmap_latency_ticks = 0; stat->copy_latency_ticks = 0; 3769 3770 if (stat->io_error != NULL) { 3771 memset(stat->io_error, 0, sizeof(struct spdk_bdev_io_error_stat)); 3772 } 3773 } 3774 3775 struct spdk_bdev_io_stat * 3776 bdev_alloc_io_stat(bool io_error_stat) 3777 { 3778 struct spdk_bdev_io_stat *stat; 3779 3780 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 3781 if (stat == NULL) { 3782 return NULL; 3783 } 3784 3785 if (io_error_stat) { 3786 stat->io_error = malloc(sizeof(struct spdk_bdev_io_error_stat)); 3787 if (stat->io_error == NULL) { 3788 free(stat); 3789 return NULL; 3790 } 3791 } else { 3792 stat->io_error = NULL; 3793 } 3794 3795 bdev_reset_io_stat(stat, BDEV_RESET_STAT_ALL); 3796 3797 return stat; 3798 } 3799 3800 void 3801 bdev_free_io_stat(struct spdk_bdev_io_stat *stat) 3802 { 3803 free(stat->io_error); 3804 free(stat); 3805 } 3806 3807 void 3808 bdev_dump_io_stat_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 3809 { 3810 int i; 3811 3812 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 3813 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 3814 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 3815 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 3816 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 3817 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 3818 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 3819 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 3820 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 3821 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 3822 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 3823 stat->min_read_latency_ticks != UINT64_MAX ? 3824 stat->min_read_latency_ticks : 0); 3825 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 3826 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 3827 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 3828 stat->min_write_latency_ticks != UINT64_MAX ? 3829 stat->min_write_latency_ticks : 0); 3830 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 3831 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 3832 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 3833 stat->min_unmap_latency_ticks != UINT64_MAX ? 3834 stat->min_unmap_latency_ticks : 0); 3835 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 3836 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 3837 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 3838 stat->min_copy_latency_ticks != UINT64_MAX ?
stat->min_copy_latency_ticks : 0); 3840 3841 if (stat->io_error != NULL) { 3842 spdk_json_write_named_object_begin(w, "io_error"); 3843 for (i = 0; i < -SPDK_MIN_BDEV_IO_STATUS; i++) { 3844 if (stat->io_error->error_status[i] != 0) { 3845 spdk_json_write_named_uint32(w, bdev_io_status_get_string(-(i + 1)), 3846 stat->io_error->error_status[i]); 3847 } 3848 } 3849 spdk_json_write_object_end(w); 3850 } 3851 } 3852 3853 static void 3854 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 3855 { 3856 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 3857 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 3858 3859 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 3860 bdev_abort_all_buf_io(mgmt_ch, ch); 3862 } 3863 3864 static void 3865 bdev_channel_destroy(void *io_device, void *ctx_buf) 3866 { 3867 struct spdk_bdev_channel *ch = ctx_buf; 3868 3869 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 3870 spdk_get_thread()); 3871 3872 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 3873 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3874 3875 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 3876 spdk_spin_lock(&ch->bdev->internal.spinlock); 3877 bdev_add_io_stat(ch->bdev->internal.stat, ch->stat); 3878 spdk_spin_unlock(&ch->bdev->internal.spinlock); 3879 3880 bdev_abort_all_queued_io(&ch->queued_resets, ch); 3881 3882 bdev_channel_abort_queued_ios(ch); 3883 3884 if (ch->histogram) { 3885 spdk_histogram_data_free(ch->histogram); 3886 } 3887 3888 bdev_channel_destroy_resource(ch); 3889 } 3890 3891 /* 3892 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 3893 * to it. Hence we do not have to call bdev_get_by_name() when using this function.
3894 */ 3895 static int 3896 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 3897 { 3898 struct spdk_bdev_name *tmp; 3899 3900 bdev_name->name = strdup(name); 3901 if (bdev_name->name == NULL) { 3902 SPDK_ERRLOG("Unable to allocate bdev name\n"); 3903 return -ENOMEM; 3904 } 3905 3906 bdev_name->bdev = bdev; 3907 3908 spdk_spin_lock(&g_bdev_mgr.spinlock); 3909 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3910 spdk_spin_unlock(&g_bdev_mgr.spinlock); 3911 3912 if (tmp != NULL) { 3913 SPDK_ERRLOG("Bdev name %s already exists\n", name); 3914 free(bdev_name->name); 3915 return -EEXIST; 3916 } 3917 3918 return 0; 3919 } 3920 3921 static void 3922 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 3923 { 3924 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3925 free(bdev_name->name); 3926 } 3927 3928 static void 3929 bdev_name_del(struct spdk_bdev_name *bdev_name) 3930 { 3931 spdk_spin_lock(&g_bdev_mgr.spinlock); 3932 bdev_name_del_unsafe(bdev_name); 3933 spdk_spin_unlock(&g_bdev_mgr.spinlock); 3934 } 3935 3936 int 3937 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 3938 { 3939 struct spdk_bdev_alias *tmp; 3940 int ret; 3941 3942 if (alias == NULL) { 3943 SPDK_ERRLOG("Empty alias passed\n"); 3944 return -EINVAL; 3945 } 3946 3947 tmp = calloc(1, sizeof(*tmp)); 3948 if (tmp == NULL) { 3949 SPDK_ERRLOG("Unable to allocate alias\n"); 3950 return -ENOMEM; 3951 } 3952 3953 ret = bdev_name_add(&tmp->alias, bdev, alias); 3954 if (ret != 0) { 3955 free(tmp); 3956 return ret; 3957 } 3958 3959 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 3960 3961 return 0; 3962 } 3963 3964 static int 3965 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 3966 void (*alias_del_fn)(struct spdk_bdev_name *n)) 3967 { 3968 struct spdk_bdev_alias *tmp; 3969 3970 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 3971 if (strcmp(alias, tmp->alias.name) == 0) { 3972 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 3973 alias_del_fn(&tmp->alias); 3974 free(tmp); 3975 return 0; 3976 } 3977 } 3978 3979 return -ENOENT; 3980 } 3981 3982 int 3983 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 3984 { 3985 int rc; 3986 3987 rc = bdev_alias_del(bdev, alias, bdev_name_del); 3988 if (rc == -ENOENT) { 3989 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 3990 } 3991 3992 return rc; 3993 } 3994 3995 void 3996 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 3997 { 3998 struct spdk_bdev_alias *p, *tmp; 3999 4000 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 4001 TAILQ_REMOVE(&bdev->aliases, p, tailq); 4002 bdev_name_del(&p->alias); 4003 free(p); 4004 } 4005 } 4006 4007 struct spdk_io_channel * 4008 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 4009 { 4010 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 4011 } 4012 4013 void * 4014 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 4015 { 4016 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4017 void *ctx = NULL; 4018 4019 if (bdev->fn_table->get_module_ctx) { 4020 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 4021 } 4022 4023 return ctx; 4024 } 4025 4026 const char * 4027 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 4028 { 4029 return bdev->module->name; 4030 } 4031 4032 const char * 4033 spdk_bdev_get_name(const struct spdk_bdev *bdev) 4034 { 4035 return bdev->name; 4036 } 4037 4038 const char * 4039 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 4040 { 4041 return bdev->product_name; 4042 } 4043 4044 
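/*
 * Illustrative usage sketch (not part of the library logic): a typical consumer
 * opens a descriptor by name, takes a per-thread I/O channel via
 * spdk_bdev_get_io_channel() above, and then queries the device geometry through
 * the getters that follow. The bdev name "Malloc0", the function names and the
 * error handling below are hypothetical placeholders.
 *
 *   static void
 *   example_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx)
 *   {
 *           // React to SPDK_BDEV_EVENT_REMOVE / SPDK_BDEV_EVENT_RESIZE here.
 *   }
 *
 *   static int
 *   example_open(struct spdk_bdev_desc **desc, struct spdk_io_channel **ch)
 *   {
 *           int rc = spdk_bdev_open_ext("Malloc0", true, example_event_cb, NULL, desc);
 *
 *           if (rc != 0) {
 *                   return rc;
 *           }
 *           *ch = spdk_bdev_get_io_channel(*desc);
 *           if (*ch == NULL) {
 *                   spdk_bdev_close(*desc);
 *                   return -ENOMEM;
 *           }
 *           return 0;
 *   }
 */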
const struct spdk_bdev_aliases_list * 4045 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 4046 { 4047 return &bdev->aliases; 4048 } 4049 4050 uint32_t 4051 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 4052 { 4053 return bdev->blocklen; 4054 } 4055 4056 uint32_t 4057 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 4058 { 4059 return bdev->write_unit_size; 4060 } 4061 4062 uint64_t 4063 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 4064 { 4065 return bdev->blockcnt; 4066 } 4067 4068 const char * 4069 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 4070 { 4071 return qos_rpc_type[type]; 4072 } 4073 4074 void 4075 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 4076 { 4077 int i; 4078 4079 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 4080 4081 spdk_spin_lock(&bdev->internal.spinlock); 4082 if (bdev->internal.qos) { 4083 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 4084 if (bdev->internal.qos->rate_limits[i].limit != 4085 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 4086 limits[i] = bdev->internal.qos->rate_limits[i].limit; 4087 if (bdev_qos_is_iops_rate_limit(i) == false) { 4088 /* Change from Byte to Megabyte which is user visible. */ 4089 limits[i] = limits[i] / 1024 / 1024; 4090 } 4091 } 4092 } 4093 } 4094 spdk_spin_unlock(&bdev->internal.spinlock); 4095 } 4096 4097 size_t 4098 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4099 { 4100 return 1 << bdev->required_alignment; 4101 } 4102 4103 uint32_t 4104 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4105 { 4106 return bdev->optimal_io_boundary; 4107 } 4108 4109 bool 4110 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4111 { 4112 return bdev->write_cache; 4113 } 4114 4115 const struct spdk_uuid * 4116 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4117 { 4118 return &bdev->uuid; 4119 } 4120 4121 uint16_t 4122 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4123 { 4124 return bdev->acwu; 4125 } 4126 4127 uint32_t 4128 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4129 { 4130 return bdev->md_len; 4131 } 4132 4133 bool 4134 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4135 { 4136 return (bdev->md_len != 0) && bdev->md_interleave; 4137 } 4138 4139 bool 4140 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4141 { 4142 return (bdev->md_len != 0) && !bdev->md_interleave; 4143 } 4144 4145 bool 4146 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4147 { 4148 return bdev->zoned; 4149 } 4150 4151 uint32_t 4152 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4153 { 4154 if (spdk_bdev_is_md_interleaved(bdev)) { 4155 return bdev->blocklen - bdev->md_len; 4156 } else { 4157 return bdev->blocklen; 4158 } 4159 } 4160 4161 uint32_t 4162 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4163 { 4164 return bdev->phys_blocklen; 4165 } 4166 4167 static uint32_t 4168 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4169 { 4170 if (!spdk_bdev_is_md_interleaved(bdev)) { 4171 return bdev->blocklen + bdev->md_len; 4172 } else { 4173 return bdev->blocklen; 4174 } 4175 } 4176 4177 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4178 typedef enum spdk_dif_type spdk_dif_type_t; 4179 4180 spdk_dif_type_t 4181 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4182 { 4183 if (bdev->md_len != 0) { 4184 return bdev->dif_type; 4185 } else { 4186 return SPDK_DIF_DISABLE; 4187 } 4188 } 4189 4190 bool 4191 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4192 { 4193 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4194 return bdev->dif_is_head_of_md; 4195 } else { 4196 return false; 4197 } 4198 } 4199 4200 bool 4201 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4202 enum spdk_dif_check_type check_type) 4203 { 4204 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4205 return false; 4206 } 4207 4208 switch (check_type) { 4209 case SPDK_DIF_CHECK_TYPE_REFTAG: 4210 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4211 case SPDK_DIF_CHECK_TYPE_APPTAG: 4212 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4213 case SPDK_DIF_CHECK_TYPE_GUARD: 4214 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4215 default: 4216 return false; 4217 } 4218 } 4219 4220 uint32_t 4221 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4222 { 4223 return bdev->max_copy; 4224 } 4225 4226 uint64_t 4227 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4228 { 4229 return bdev->internal.measured_queue_depth; 4230 } 4231 4232 uint64_t 4233 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 4234 { 4235 return bdev->internal.period; 4236 } 4237 4238 uint64_t 4239 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 4240 { 4241 return bdev->internal.weighted_io_time; 4242 } 4243 4244 uint64_t 4245 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 4246 { 4247 return bdev->internal.io_time; 4248 } 4249 4250 static void bdev_update_qd_sampling_period(void *ctx); 4251 4252 static void 4253 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 4254 { 4255 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 4256 4257 if (bdev->internal.measured_queue_depth) { 4258 bdev->internal.io_time += bdev->internal.period; 4259 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 4260 } 4261 4262 bdev->internal.qd_poll_in_progress = false; 4263 4264 bdev_update_qd_sampling_period(bdev); 4265 } 4266 4267 static void 4268 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4269 struct spdk_io_channel *io_ch, void *_ctx) 4270 { 4271 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 4272 4273 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4274 spdk_bdev_for_each_channel_continue(i, 0); 4275 } 4276 4277 static int 4278 bdev_calculate_measured_queue_depth(void *ctx) 4279 { 4280 struct spdk_bdev *bdev = ctx; 4281 4282 bdev->internal.qd_poll_in_progress = true; 4283 bdev->internal.temporary_queue_depth = 0; 4284 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 4285 return SPDK_POLLER_BUSY; 4286 } 4287 4288 static void 4289 bdev_update_qd_sampling_period(void *ctx) 4290 { 4291 struct spdk_bdev *bdev = ctx; 4292 4293 if (bdev->internal.period == bdev->internal.new_period) { 4294 return; 4295 } 4296 4297 if (bdev->internal.qd_poll_in_progress) { 4298 return; 4299 } 4300 4301 bdev->internal.period = bdev->internal.new_period; 4302 4303 spdk_poller_unregister(&bdev->internal.qd_poller); 4304 if (bdev->internal.period != 0) { 4305 bdev->internal.qd_poller = 
SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4306 bdev, bdev->internal.period); 4307 } else { 4308 spdk_bdev_close(bdev->internal.qd_desc); 4309 bdev->internal.qd_desc = NULL; 4310 } 4311 } 4312 4313 static void 4314 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4315 { 4316 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4317 } 4318 4319 void 4320 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4321 { 4322 int rc; 4323 4324 if (bdev->internal.new_period == period) { 4325 return; 4326 } 4327 4328 bdev->internal.new_period = period; 4329 4330 if (bdev->internal.qd_desc != NULL) { 4331 assert(bdev->internal.period != 0); 4332 4333 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4334 bdev_update_qd_sampling_period, bdev); 4335 return; 4336 } 4337 4338 assert(bdev->internal.period == 0); 4339 4340 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4341 NULL, &bdev->internal.qd_desc); 4342 if (rc != 0) { 4343 return; 4344 } 4345 4346 bdev->internal.period = period; 4347 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4348 bdev, period); 4349 } 4350 4351 struct bdev_get_current_qd_ctx { 4352 uint64_t current_qd; 4353 spdk_bdev_get_current_qd_cb cb_fn; 4354 void *cb_arg; 4355 }; 4356 4357 static void 4358 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 4359 { 4360 struct bdev_get_current_qd_ctx *ctx = _ctx; 4361 4362 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 4363 4364 free(ctx); 4365 } 4366 4367 static void 4368 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4369 struct spdk_io_channel *io_ch, void *_ctx) 4370 { 4371 struct bdev_get_current_qd_ctx *ctx = _ctx; 4372 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4373 4374 ctx->current_qd += bdev_ch->io_outstanding; 4375 4376 spdk_bdev_for_each_channel_continue(i, 0); 4377 } 4378 4379 void 4380 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 4381 void *cb_arg) 4382 { 4383 struct bdev_get_current_qd_ctx *ctx; 4384 4385 assert(cb_fn != NULL); 4386 4387 ctx = calloc(1, sizeof(*ctx)); 4388 if (ctx == NULL) { 4389 cb_fn(bdev, 0, cb_arg, -ENOMEM); 4390 return; 4391 } 4392 4393 ctx->cb_fn = cb_fn; 4394 ctx->cb_arg = cb_arg; 4395 4396 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 4397 } 4398 4399 static void 4400 _resize_notify(void *arg) 4401 { 4402 struct spdk_bdev_desc *desc = arg; 4403 4404 spdk_spin_lock(&desc->spinlock); 4405 desc->refs--; 4406 if (!desc->closed) { 4407 spdk_spin_unlock(&desc->spinlock); 4408 desc->callback.event_fn(SPDK_BDEV_EVENT_RESIZE, 4409 desc->bdev, 4410 desc->callback.ctx); 4411 return; 4412 } else if (0 == desc->refs) { 4413 /* This descriptor was closed after this resize_notify message was sent. 4414 * spdk_bdev_close() could not free the descriptor since this message was 4415 * in flight, so we free it now using bdev_desc_free(). 
4416 */ 4417 spdk_spin_unlock(&desc->spinlock); 4418 bdev_desc_free(desc); 4419 return; 4420 } 4421 spdk_spin_unlock(&desc->spinlock); 4422 } 4423 4424 int 4425 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 4426 { 4427 struct spdk_bdev_desc *desc; 4428 int ret; 4429 4430 if (size == bdev->blockcnt) { 4431 return 0; 4432 } 4433 4434 spdk_spin_lock(&bdev->internal.spinlock); 4435 4436 /* bdev has open descriptors */ 4437 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 4438 bdev->blockcnt > size) { 4439 ret = -EBUSY; 4440 } else { 4441 bdev->blockcnt = size; 4442 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 4443 spdk_spin_lock(&desc->spinlock); 4444 if (!desc->closed) { 4445 desc->refs++; 4446 spdk_thread_send_msg(desc->thread, _resize_notify, desc); 4447 } 4448 spdk_spin_unlock(&desc->spinlock); 4449 } 4450 ret = 0; 4451 } 4452 4453 spdk_spin_unlock(&bdev->internal.spinlock); 4454 4455 return ret; 4456 } 4457 4458 /* 4459 * Convert I/O offset and length from bytes to blocks. 4460 * 4461 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 4462 */ 4463 static uint64_t 4464 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 4465 uint64_t num_bytes, uint64_t *num_blocks) 4466 { 4467 uint32_t block_size = bdev->blocklen; 4468 uint8_t shift_cnt; 4469 4470 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 4471 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 4472 shift_cnt = spdk_u32log2(block_size); 4473 *offset_blocks = offset_bytes >> shift_cnt; 4474 *num_blocks = num_bytes >> shift_cnt; 4475 return (offset_bytes - (*offset_blocks << shift_cnt)) | 4476 (num_bytes - (*num_blocks << shift_cnt)); 4477 } else { 4478 *offset_blocks = offset_bytes / block_size; 4479 *num_blocks = num_bytes / block_size; 4480 return (offset_bytes % block_size) | (num_bytes % block_size); 4481 } 4482 } 4483 4484 static bool 4485 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 4486 { 4487 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 4488 * has been an overflow and hence the offset has been wrapped around */ 4489 if (offset_blocks + num_blocks < offset_blocks) { 4490 return false; 4491 } 4492 4493 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 4494 if (offset_blocks + num_blocks > bdev->blockcnt) { 4495 return false; 4496 } 4497 4498 return true; 4499 } 4500 4501 static void 4502 bdev_seek_complete_cb(void *ctx) 4503 { 4504 struct spdk_bdev_io *bdev_io = ctx; 4505 4506 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4507 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 4508 } 4509 4510 static int 4511 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4512 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 4513 spdk_bdev_io_completion_cb cb, void *cb_arg) 4514 { 4515 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4516 struct spdk_bdev_io *bdev_io; 4517 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4518 4519 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 4520 4521 /* Check if offset_blocks is valid looking at the validity of one block */ 4522 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 4523 return -EINVAL; 4524 } 4525 4526 bdev_io = bdev_channel_get_io(channel); 4527 if (!bdev_io) { 4528 return -ENOMEM; 4529 } 4530 4531 
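	/* A NULL bdev_io here (and the resulting -ENOMEM) means the channel's
	 * spdk_bdev_io pool is exhausted; the same convention applies to the other
	 * submission paths in this file. Callers typically retry once outstanding I/O
	 * completes, e.g. (illustrative sketch; "example_retry" and "example_ctx" are
	 * hypothetical caller-provided names, and the wait entry must stay valid until
	 * the callback fires):
	 *
	 *   static struct spdk_bdev_io_wait_entry example_entry;
	 *
	 *   example_entry.bdev = spdk_bdev_desc_get_bdev(desc);
	 *   example_entry.cb_fn = example_retry;
	 *   example_entry.cb_arg = example_ctx;
	 *   spdk_bdev_queue_io_wait(example_entry.bdev, ch, &example_entry);
	 */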
bdev_io->internal.ch = channel; 4532 bdev_io->internal.desc = desc; 4533 bdev_io->type = io_type; 4534 bdev_io->u.bdev.offset_blocks = offset_blocks; 4535 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4536 4537 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 4538 /* In case bdev doesn't support seek to next data/hole offset, 4539 * it is assumed that only data and no holes are present */ 4540 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 4541 bdev_io->u.bdev.seek.offset = offset_blocks; 4542 } else { 4543 bdev_io->u.bdev.seek.offset = UINT64_MAX; 4544 } 4545 4546 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 4547 return 0; 4548 } 4549 4550 bdev_io_submit(bdev_io); 4551 return 0; 4552 } 4553 4554 int 4555 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4556 uint64_t offset_blocks, 4557 spdk_bdev_io_completion_cb cb, void *cb_arg) 4558 { 4559 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 4560 } 4561 4562 int 4563 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4564 uint64_t offset_blocks, 4565 spdk_bdev_io_completion_cb cb, void *cb_arg) 4566 { 4567 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 4568 } 4569 4570 uint64_t 4571 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io) 4572 { 4573 return bdev_io->u.bdev.seek.offset; 4574 } 4575 4576 static int 4577 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 4578 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4579 spdk_bdev_io_completion_cb cb, void *cb_arg) 4580 { 4581 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4582 struct spdk_bdev_io *bdev_io; 4583 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4584 4585 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4586 return -EINVAL; 4587 } 4588 4589 bdev_io = bdev_channel_get_io(channel); 4590 if (!bdev_io) { 4591 return -ENOMEM; 4592 } 4593 4594 bdev_io->internal.ch = channel; 4595 bdev_io->internal.desc = desc; 4596 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4597 bdev_io->u.bdev.iovs = &bdev_io->iov; 4598 bdev_io->u.bdev.iovs[0].iov_base = buf; 4599 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4600 bdev_io->u.bdev.iovcnt = 1; 4601 bdev_io->u.bdev.md_buf = md_buf; 4602 bdev_io->u.bdev.num_blocks = num_blocks; 4603 bdev_io->u.bdev.offset_blocks = offset_blocks; 4604 bdev_io->u.bdev.ext_opts = NULL; 4605 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4606 4607 bdev_io_submit(bdev_io); 4608 return 0; 4609 } 4610 4611 int 4612 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4613 void *buf, uint64_t offset, uint64_t nbytes, 4614 spdk_bdev_io_completion_cb cb, void *cb_arg) 4615 { 4616 uint64_t offset_blocks, num_blocks; 4617 4618 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4619 nbytes, &num_blocks) != 0) { 4620 return -EINVAL; 4621 } 4622 4623 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4624 } 4625 4626 int 4627 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4628 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4629 spdk_bdev_io_completion_cb cb, void *cb_arg) 4630 { 4631 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 4632 } 4633 4634 int 4635 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4636 void *buf, void 
*md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4637 spdk_bdev_io_completion_cb cb, void *cb_arg) 4638 { 4639 struct iovec iov = { 4640 .iov_base = buf, 4641 }; 4642 4643 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4644 return -EINVAL; 4645 } 4646 4647 if (md_buf && !_is_buf_allocated(&iov)) { 4648 return -EINVAL; 4649 } 4650 4651 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4652 cb, cb_arg); 4653 } 4654 4655 int 4656 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4657 struct iovec *iov, int iovcnt, 4658 uint64_t offset, uint64_t nbytes, 4659 spdk_bdev_io_completion_cb cb, void *cb_arg) 4660 { 4661 uint64_t offset_blocks, num_blocks; 4662 4663 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4664 nbytes, &num_blocks) != 0) { 4665 return -EINVAL; 4666 } 4667 4668 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4669 } 4670 4671 static int 4672 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4673 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 4674 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 4675 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 4676 { 4677 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4678 struct spdk_bdev_io *bdev_io; 4679 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4680 4681 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4682 return -EINVAL; 4683 } 4684 4685 bdev_io = bdev_channel_get_io(channel); 4686 if (!bdev_io) { 4687 return -ENOMEM; 4688 } 4689 4690 bdev_io->internal.ch = channel; 4691 bdev_io->internal.desc = desc; 4692 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4693 bdev_io->u.bdev.iovs = iov; 4694 bdev_io->u.bdev.iovcnt = iovcnt; 4695 bdev_io->u.bdev.md_buf = md_buf; 4696 bdev_io->u.bdev.num_blocks = num_blocks; 4697 bdev_io->u.bdev.offset_blocks = offset_blocks; 4698 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4699 bdev_io->internal.ext_opts = opts; 4700 bdev_io->u.bdev.ext_opts = opts; 4701 4702 _bdev_io_submit_ext(desc, bdev_io, opts, copy_opts); 4703 4704 return 0; 4705 } 4706 4707 int 4708 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4709 struct iovec *iov, int iovcnt, 4710 uint64_t offset_blocks, uint64_t num_blocks, 4711 spdk_bdev_io_completion_cb cb, void *cb_arg) 4712 { 4713 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4714 num_blocks, cb, cb_arg, NULL, false); 4715 } 4716 4717 int 4718 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4719 struct iovec *iov, int iovcnt, void *md_buf, 4720 uint64_t offset_blocks, uint64_t num_blocks, 4721 spdk_bdev_io_completion_cb cb, void *cb_arg) 4722 { 4723 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4724 return -EINVAL; 4725 } 4726 4727 if (md_buf && !_is_buf_allocated(iov)) { 4728 return -EINVAL; 4729 } 4730 4731 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4732 num_blocks, cb, cb_arg, NULL, false); 4733 } 4734 4735 static inline bool 4736 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 4737 { 4738 /* 4739 * We check if opts size is at least of size when we first introduced 4740 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 4741 * are not checked internal. 
4742 */ 4743 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 4744 sizeof(opts->metadata) && 4745 opts->size <= sizeof(*opts) && 4746 /* When memory domain is used, the user must provide data buffers */ 4747 (!opts->memory_domain || (iov && iov[0].iov_base)); 4748 } 4749 4750 int 4751 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4752 struct iovec *iov, int iovcnt, 4753 uint64_t offset_blocks, uint64_t num_blocks, 4754 spdk_bdev_io_completion_cb cb, void *cb_arg, 4755 struct spdk_bdev_ext_io_opts *opts) 4756 { 4757 void *md = NULL; 4758 4759 if (opts) { 4760 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 4761 return -EINVAL; 4762 } 4763 md = opts->metadata; 4764 } 4765 4766 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4767 return -EINVAL; 4768 } 4769 4770 if (md && !_is_buf_allocated(iov)) { 4771 return -EINVAL; 4772 } 4773 4774 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4775 num_blocks, cb, cb_arg, opts, false); 4776 } 4777 4778 static int 4779 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4780 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4781 spdk_bdev_io_completion_cb cb, void *cb_arg) 4782 { 4783 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4784 struct spdk_bdev_io *bdev_io; 4785 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4786 4787 if (!desc->write) { 4788 return -EBADF; 4789 } 4790 4791 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4792 return -EINVAL; 4793 } 4794 4795 bdev_io = bdev_channel_get_io(channel); 4796 if (!bdev_io) { 4797 return -ENOMEM; 4798 } 4799 4800 bdev_io->internal.ch = channel; 4801 bdev_io->internal.desc = desc; 4802 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4803 bdev_io->u.bdev.iovs = &bdev_io->iov; 4804 bdev_io->u.bdev.iovs[0].iov_base = buf; 4805 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4806 bdev_io->u.bdev.iovcnt = 1; 4807 bdev_io->u.bdev.md_buf = md_buf; 4808 bdev_io->u.bdev.num_blocks = num_blocks; 4809 bdev_io->u.bdev.offset_blocks = offset_blocks; 4810 bdev_io->u.bdev.ext_opts = NULL; 4811 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4812 4813 bdev_io_submit(bdev_io); 4814 return 0; 4815 } 4816 4817 int 4818 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4819 void *buf, uint64_t offset, uint64_t nbytes, 4820 spdk_bdev_io_completion_cb cb, void *cb_arg) 4821 { 4822 uint64_t offset_blocks, num_blocks; 4823 4824 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4825 nbytes, &num_blocks) != 0) { 4826 return -EINVAL; 4827 } 4828 4829 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4830 } 4831 4832 int 4833 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4834 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4835 spdk_bdev_io_completion_cb cb, void *cb_arg) 4836 { 4837 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4838 cb, cb_arg); 4839 } 4840 4841 int 4842 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4843 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4844 spdk_bdev_io_completion_cb cb, void *cb_arg) 4845 { 4846 struct iovec iov = { 4847 .iov_base = buf, 4848 }; 4849 4850 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4851 return -EINVAL; 4852 } 4853 4854 if 
(md_buf && !_is_buf_allocated(&iov)) { 4855 return -EINVAL; 4856 } 4857 4858 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4859 cb, cb_arg); 4860 } 4861 4862 static int 4863 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4864 struct iovec *iov, int iovcnt, void *md_buf, 4865 uint64_t offset_blocks, uint64_t num_blocks, 4866 spdk_bdev_io_completion_cb cb, void *cb_arg, 4867 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 4868 { 4869 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4870 struct spdk_bdev_io *bdev_io; 4871 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4872 4873 if (!desc->write) { 4874 return -EBADF; 4875 } 4876 4877 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4878 return -EINVAL; 4879 } 4880 4881 bdev_io = bdev_channel_get_io(channel); 4882 if (!bdev_io) { 4883 return -ENOMEM; 4884 } 4885 4886 bdev_io->internal.ch = channel; 4887 bdev_io->internal.desc = desc; 4888 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4889 bdev_io->u.bdev.iovs = iov; 4890 bdev_io->u.bdev.iovcnt = iovcnt; 4891 bdev_io->u.bdev.md_buf = md_buf; 4892 bdev_io->u.bdev.num_blocks = num_blocks; 4893 bdev_io->u.bdev.offset_blocks = offset_blocks; 4894 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4895 bdev_io->internal.ext_opts = opts; 4896 bdev_io->u.bdev.ext_opts = opts; 4897 4898 _bdev_io_submit_ext(desc, bdev_io, opts, copy_opts); 4899 4900 return 0; 4901 } 4902 4903 int 4904 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4905 struct iovec *iov, int iovcnt, 4906 uint64_t offset, uint64_t len, 4907 spdk_bdev_io_completion_cb cb, void *cb_arg) 4908 { 4909 uint64_t offset_blocks, num_blocks; 4910 4911 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4912 len, &num_blocks) != 0) { 4913 return -EINVAL; 4914 } 4915 4916 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4917 } 4918 4919 int 4920 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4921 struct iovec *iov, int iovcnt, 4922 uint64_t offset_blocks, uint64_t num_blocks, 4923 spdk_bdev_io_completion_cb cb, void *cb_arg) 4924 { 4925 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4926 num_blocks, cb, cb_arg, NULL, false); 4927 } 4928 4929 int 4930 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4931 struct iovec *iov, int iovcnt, void *md_buf, 4932 uint64_t offset_blocks, uint64_t num_blocks, 4933 spdk_bdev_io_completion_cb cb, void *cb_arg) 4934 { 4935 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4936 return -EINVAL; 4937 } 4938 4939 if (md_buf && !_is_buf_allocated(iov)) { 4940 return -EINVAL; 4941 } 4942 4943 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4944 num_blocks, cb, cb_arg, NULL, false); 4945 } 4946 4947 int 4948 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4949 struct iovec *iov, int iovcnt, 4950 uint64_t offset_blocks, uint64_t num_blocks, 4951 spdk_bdev_io_completion_cb cb, void *cb_arg, 4952 struct spdk_bdev_ext_io_opts *opts) 4953 { 4954 void *md = NULL; 4955 4956 if (opts) { 4957 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 4958 return -EINVAL; 4959 } 4960 md = opts->metadata; 4961 } 4962 4963 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4964 return -EINVAL; 4965 } 4966 4967 if (md && 
!_is_buf_allocated(iov)) { 4968 return -EINVAL; 4969 } 4970 4971 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4972 num_blocks, cb, cb_arg, opts, false); 4973 } 4974 4975 static void 4976 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4977 { 4978 struct spdk_bdev_io *parent_io = cb_arg; 4979 struct spdk_bdev *bdev = parent_io->bdev; 4980 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 4981 int i, rc = 0; 4982 4983 if (!success) { 4984 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4985 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 4986 spdk_bdev_free_io(bdev_io); 4987 return; 4988 } 4989 4990 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 4991 rc = memcmp(read_buf, 4992 parent_io->u.bdev.iovs[i].iov_base, 4993 parent_io->u.bdev.iovs[i].iov_len); 4994 if (rc) { 4995 break; 4996 } 4997 read_buf += parent_io->u.bdev.iovs[i].iov_len; 4998 } 4999 5000 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 5001 rc = memcmp(bdev_io->u.bdev.md_buf, 5002 parent_io->u.bdev.md_buf, 5003 spdk_bdev_get_md_size(bdev)); 5004 } 5005 5006 spdk_bdev_free_io(bdev_io); 5007 5008 if (rc == 0) { 5009 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5010 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 5011 } else { 5012 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 5013 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 5014 } 5015 } 5016 5017 static void 5018 bdev_compare_do_read(void *_bdev_io) 5019 { 5020 struct spdk_bdev_io *bdev_io = _bdev_io; 5021 int rc; 5022 5023 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 5024 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 5025 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5026 bdev_compare_do_read_done, bdev_io); 5027 5028 if (rc == -ENOMEM) { 5029 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 5030 } else if (rc != 0) { 5031 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5032 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5033 } 5034 } 5035 5036 static int 5037 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5038 struct iovec *iov, int iovcnt, void *md_buf, 5039 uint64_t offset_blocks, uint64_t num_blocks, 5040 spdk_bdev_io_completion_cb cb, void *cb_arg) 5041 { 5042 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5043 struct spdk_bdev_io *bdev_io; 5044 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5045 5046 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5047 return -EINVAL; 5048 } 5049 5050 bdev_io = bdev_channel_get_io(channel); 5051 if (!bdev_io) { 5052 return -ENOMEM; 5053 } 5054 5055 bdev_io->internal.ch = channel; 5056 bdev_io->internal.desc = desc; 5057 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5058 bdev_io->u.bdev.iovs = iov; 5059 bdev_io->u.bdev.iovcnt = iovcnt; 5060 bdev_io->u.bdev.md_buf = md_buf; 5061 bdev_io->u.bdev.num_blocks = num_blocks; 5062 bdev_io->u.bdev.offset_blocks = offset_blocks; 5063 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5064 bdev_io->u.bdev.ext_opts = NULL; 5065 5066 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5067 bdev_io_submit(bdev_io); 5068 return 0; 5069 } 5070 5071 bdev_compare_do_read(bdev_io); 5072 5073 return 0; 5074 } 5075 5076 int 5077 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5078 struct iovec *iov, int iovcnt, 5079 
uint64_t offset_blocks, uint64_t num_blocks, 5080 spdk_bdev_io_completion_cb cb, void *cb_arg) 5081 { 5082 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 5083 num_blocks, cb, cb_arg); 5084 } 5085 5086 int 5087 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5088 struct iovec *iov, int iovcnt, void *md_buf, 5089 uint64_t offset_blocks, uint64_t num_blocks, 5090 spdk_bdev_io_completion_cb cb, void *cb_arg) 5091 { 5092 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5093 return -EINVAL; 5094 } 5095 5096 if (md_buf && !_is_buf_allocated(iov)) { 5097 return -EINVAL; 5098 } 5099 5100 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5101 num_blocks, cb, cb_arg); 5102 } 5103 5104 static int 5105 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5106 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5107 spdk_bdev_io_completion_cb cb, void *cb_arg) 5108 { 5109 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5110 struct spdk_bdev_io *bdev_io; 5111 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5112 5113 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5114 return -EINVAL; 5115 } 5116 5117 bdev_io = bdev_channel_get_io(channel); 5118 if (!bdev_io) { 5119 return -ENOMEM; 5120 } 5121 5122 bdev_io->internal.ch = channel; 5123 bdev_io->internal.desc = desc; 5124 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5125 bdev_io->u.bdev.iovs = &bdev_io->iov; 5126 bdev_io->u.bdev.iovs[0].iov_base = buf; 5127 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5128 bdev_io->u.bdev.iovcnt = 1; 5129 bdev_io->u.bdev.md_buf = md_buf; 5130 bdev_io->u.bdev.num_blocks = num_blocks; 5131 bdev_io->u.bdev.offset_blocks = offset_blocks; 5132 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5133 bdev_io->u.bdev.ext_opts = NULL; 5134 5135 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5136 bdev_io_submit(bdev_io); 5137 return 0; 5138 } 5139 5140 bdev_compare_do_read(bdev_io); 5141 5142 return 0; 5143 } 5144 5145 int 5146 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5147 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5148 spdk_bdev_io_completion_cb cb, void *cb_arg) 5149 { 5150 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5151 cb, cb_arg); 5152 } 5153 5154 int 5155 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5156 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5157 spdk_bdev_io_completion_cb cb, void *cb_arg) 5158 { 5159 struct iovec iov = { 5160 .iov_base = buf, 5161 }; 5162 5163 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5164 return -EINVAL; 5165 } 5166 5167 if (md_buf && !_is_buf_allocated(&iov)) { 5168 return -EINVAL; 5169 } 5170 5171 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5172 cb, cb_arg); 5173 } 5174 5175 static void 5176 bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) 5177 { 5178 struct spdk_bdev_io *bdev_io = ctx; 5179 5180 if (unlock_status) { 5181 SPDK_ERRLOG("LBA range unlock failed\n"); 5182 } 5183 5184 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? 
true : 5185 false, bdev_io->internal.caller_ctx); 5186 } 5187 5188 static void 5189 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 5190 { 5191 bdev_io->internal.status = status; 5192 5193 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 5194 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5195 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 5196 } 5197 5198 static void 5199 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5200 { 5201 struct spdk_bdev_io *parent_io = cb_arg; 5202 5203 if (!success) { 5204 SPDK_ERRLOG("Compare and write operation failed\n"); 5205 } 5206 5207 spdk_bdev_free_io(bdev_io); 5208 5209 bdev_comparev_and_writev_blocks_unlock(parent_io, 5210 success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 5211 } 5212 5213 static void 5214 bdev_compare_and_write_do_write(void *_bdev_io) 5215 { 5216 struct spdk_bdev_io *bdev_io = _bdev_io; 5217 int rc; 5218 5219 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 5220 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5221 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 5222 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5223 bdev_compare_and_write_do_write_done, bdev_io); 5224 5225 5226 if (rc == -ENOMEM) { 5227 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 5228 } else if (rc != 0) { 5229 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5230 } 5231 } 5232 5233 static void 5234 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5235 { 5236 struct spdk_bdev_io *parent_io = cb_arg; 5237 5238 spdk_bdev_free_io(bdev_io); 5239 5240 if (!success) { 5241 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 5242 return; 5243 } 5244 5245 bdev_compare_and_write_do_write(parent_io); 5246 } 5247 5248 static void 5249 bdev_compare_and_write_do_compare(void *_bdev_io) 5250 { 5251 struct spdk_bdev_io *bdev_io = _bdev_io; 5252 int rc; 5253 5254 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 5255 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 5256 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5257 bdev_compare_and_write_do_compare_done, bdev_io); 5258 5259 if (rc == -ENOMEM) { 5260 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 5261 } else if (rc != 0) { 5262 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 5263 } 5264 } 5265 5266 static void 5267 bdev_comparev_and_writev_blocks_locked(void *ctx, int status) 5268 { 5269 struct spdk_bdev_io *bdev_io = ctx; 5270 5271 if (status) { 5272 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 5273 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5274 return; 5275 } 5276 5277 bdev_compare_and_write_do_compare(bdev_io); 5278 } 5279 5280 int 5281 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5282 struct iovec *compare_iov, int compare_iovcnt, 5283 struct iovec *write_iov, int write_iovcnt, 5284 uint64_t offset_blocks, uint64_t num_blocks, 5285 spdk_bdev_io_completion_cb cb, void *cb_arg) 5286 { 5287 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5288 struct spdk_bdev_io *bdev_io; 5289 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5290 5291 if (!desc->write) { 5292 return 
-EBADF; 5293 } 5294 5295 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5296 return -EINVAL; 5297 } 5298 5299 if (num_blocks > bdev->acwu) { 5300 return -EINVAL; 5301 } 5302 5303 bdev_io = bdev_channel_get_io(channel); 5304 if (!bdev_io) { 5305 return -ENOMEM; 5306 } 5307 5308 bdev_io->internal.ch = channel; 5309 bdev_io->internal.desc = desc; 5310 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 5311 bdev_io->u.bdev.iovs = compare_iov; 5312 bdev_io->u.bdev.iovcnt = compare_iovcnt; 5313 bdev_io->u.bdev.fused_iovs = write_iov; 5314 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 5315 bdev_io->u.bdev.md_buf = NULL; 5316 bdev_io->u.bdev.num_blocks = num_blocks; 5317 bdev_io->u.bdev.offset_blocks = offset_blocks; 5318 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5319 bdev_io->u.bdev.ext_opts = NULL; 5320 5321 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 5322 bdev_io_submit(bdev_io); 5323 return 0; 5324 } 5325 5326 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 5327 bdev_comparev_and_writev_blocks_locked, bdev_io); 5328 } 5329 5330 int 5331 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5332 struct iovec *iov, int iovcnt, 5333 uint64_t offset_blocks, uint64_t num_blocks, 5334 bool populate, 5335 spdk_bdev_io_completion_cb cb, void *cb_arg) 5336 { 5337 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5338 struct spdk_bdev_io *bdev_io; 5339 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5340 5341 if (!desc->write) { 5342 return -EBADF; 5343 } 5344 5345 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5346 return -EINVAL; 5347 } 5348 5349 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 5350 return -ENOTSUP; 5351 } 5352 5353 bdev_io = bdev_channel_get_io(channel); 5354 if (!bdev_io) { 5355 return -ENOMEM; 5356 } 5357 5358 bdev_io->internal.ch = channel; 5359 bdev_io->internal.desc = desc; 5360 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 5361 bdev_io->u.bdev.num_blocks = num_blocks; 5362 bdev_io->u.bdev.offset_blocks = offset_blocks; 5363 bdev_io->u.bdev.iovs = iov; 5364 bdev_io->u.bdev.iovcnt = iovcnt; 5365 bdev_io->u.bdev.md_buf = NULL; 5366 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 5367 bdev_io->u.bdev.zcopy.commit = 0; 5368 bdev_io->u.bdev.zcopy.start = 1; 5369 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5370 bdev_io->u.bdev.ext_opts = NULL; 5371 5372 bdev_io_submit(bdev_io); 5373 5374 return 0; 5375 } 5376 5377 int 5378 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 5379 spdk_bdev_io_completion_cb cb, void *cb_arg) 5380 { 5381 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 5382 return -EINVAL; 5383 } 5384 5385 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 5386 bdev_io->u.bdev.zcopy.start = 0; 5387 bdev_io->internal.caller_ctx = cb_arg; 5388 bdev_io->internal.cb = cb; 5389 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 5390 5391 bdev_io_submit(bdev_io); 5392 5393 return 0; 5394 } 5395 5396 int 5397 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5398 uint64_t offset, uint64_t len, 5399 spdk_bdev_io_completion_cb cb, void *cb_arg) 5400 { 5401 uint64_t offset_blocks, num_blocks; 5402 5403 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5404 len, &num_blocks) != 0) { 5405 return -EINVAL; 5406 } 5407 5408 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5409 } 5410 5411 int 5412 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5413 uint64_t offset_blocks, uint64_t num_blocks, 5414 spdk_bdev_io_completion_cb cb, void *cb_arg) 5415 { 5416 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5417 struct spdk_bdev_io *bdev_io; 5418 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5419 5420 if (!desc->write) { 5421 return -EBADF; 5422 } 5423 5424 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5425 return -EINVAL; 5426 } 5427 5428 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 5429 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 5430 return -ENOTSUP; 5431 } 5432 5433 bdev_io = bdev_channel_get_io(channel); 5434 5435 if (!bdev_io) { 5436 return -ENOMEM; 5437 } 5438 5439 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 5440 bdev_io->internal.ch = channel; 5441 bdev_io->internal.desc = desc; 5442 bdev_io->u.bdev.offset_blocks = offset_blocks; 5443 bdev_io->u.bdev.num_blocks = num_blocks; 5444 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5445 bdev_io->u.bdev.ext_opts = NULL; 5446 5447 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 5448 bdev_io_submit(bdev_io); 5449 return 0; 5450 } 5451 5452 assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); 5453 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 5454 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 5455 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 5456 bdev_write_zero_buffer_next(bdev_io); 5457 5458 return 0; 5459 } 5460 5461 int 5462 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5463 uint64_t offset, uint64_t nbytes, 5464 spdk_bdev_io_completion_cb cb, void *cb_arg) 5465 { 5466 uint64_t offset_blocks, num_blocks; 5467 5468 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5469 nbytes, &num_blocks) != 0) { 5470 return -EINVAL; 5471 } 5472 5473 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5474 } 5475 5476 int 5477 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5478 uint64_t offset_blocks, uint64_t num_blocks, 5479 spdk_bdev_io_completion_cb cb, void *cb_arg) 5480 { 5481 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5482 struct spdk_bdev_io *bdev_io; 5483 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5484 5485 if (!desc->write) { 5486 return -EBADF; 5487 } 5488 5489 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5490 return -EINVAL; 5491 } 5492 5493 if (num_blocks == 0) { 5494 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 5495 return -EINVAL; 5496 } 5497 5498 bdev_io = bdev_channel_get_io(channel); 5499 if (!bdev_io) { 5500 return -ENOMEM; 5501 } 5502 5503 bdev_io->internal.ch 
= channel; 5504 bdev_io->internal.desc = desc; 5505 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 5506 5507 bdev_io->u.bdev.iovs = &bdev_io->iov; 5508 bdev_io->u.bdev.iovs[0].iov_base = NULL; 5509 bdev_io->u.bdev.iovs[0].iov_len = 0; 5510 bdev_io->u.bdev.iovcnt = 1; 5511 5512 bdev_io->u.bdev.offset_blocks = offset_blocks; 5513 bdev_io->u.bdev.num_blocks = num_blocks; 5514 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5515 bdev_io->u.bdev.ext_opts = NULL; 5516 5517 bdev_io_submit(bdev_io); 5518 return 0; 5519 } 5520 5521 int 5522 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5523 uint64_t offset, uint64_t length, 5524 spdk_bdev_io_completion_cb cb, void *cb_arg) 5525 { 5526 uint64_t offset_blocks, num_blocks; 5527 5528 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5529 length, &num_blocks) != 0) { 5530 return -EINVAL; 5531 } 5532 5533 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5534 } 5535 5536 int 5537 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5538 uint64_t offset_blocks, uint64_t num_blocks, 5539 spdk_bdev_io_completion_cb cb, void *cb_arg) 5540 { 5541 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5542 struct spdk_bdev_io *bdev_io; 5543 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5544 5545 if (!desc->write) { 5546 return -EBADF; 5547 } 5548 5549 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5550 return -EINVAL; 5551 } 5552 5553 bdev_io = bdev_channel_get_io(channel); 5554 if (!bdev_io) { 5555 return -ENOMEM; 5556 } 5557 5558 bdev_io->internal.ch = channel; 5559 bdev_io->internal.desc = desc; 5560 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 5561 bdev_io->u.bdev.iovs = NULL; 5562 bdev_io->u.bdev.iovcnt = 0; 5563 bdev_io->u.bdev.offset_blocks = offset_blocks; 5564 bdev_io->u.bdev.num_blocks = num_blocks; 5565 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5566 5567 bdev_io_submit(bdev_io); 5568 return 0; 5569 } 5570 5571 static int bdev_reset_poll_for_outstanding_io(void *ctx); 5572 5573 static void 5574 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 5575 { 5576 struct spdk_bdev_channel *ch = _ctx; 5577 struct spdk_bdev_io *bdev_io; 5578 5579 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5580 5581 if (status == -EBUSY) { 5582 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 5583 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 5584 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 5585 } else { 5586 /* If outstanding IOs are still present and reset_io_drain_timeout seconds passed, 5587 * start the reset. */ 5588 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5589 bdev_io_submit_reset(bdev_io); 5590 } 5591 } else { 5592 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5593 SPDK_DEBUGLOG(bdev, 5594 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 5595 ch->bdev->name); 5596 /* Mark the completion status as a SUCCESS and complete the reset. */ 5597 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 5598 } 5599 } 5600 5601 static void 5602 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5603 struct spdk_io_channel *io_ch, void *_ctx) 5604 { 5605 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 5606 int status = 0; 5607 5608 if (cur_ch->io_outstanding > 0) { 5609 /* If a channel has outstanding IO, set status to -EBUSY code. 
This will stop 5610 * further iteration over the rest of the channels and pass non-zero status 5611 * to the callback function. */ 5612 status = -EBUSY; 5613 } 5614 spdk_bdev_for_each_channel_continue(i, status); 5615 } 5616 5617 static int 5618 bdev_reset_poll_for_outstanding_io(void *ctx) 5619 { 5620 struct spdk_bdev_channel *ch = ctx; 5621 struct spdk_bdev_io *bdev_io; 5622 5623 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5624 5625 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 5626 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 5627 bdev_reset_check_outstanding_io_done); 5628 5629 return SPDK_POLLER_BUSY; 5630 } 5631 5632 static void 5633 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 5634 { 5635 struct spdk_bdev_channel *ch = _ctx; 5636 struct spdk_bdev_io *bdev_io; 5637 5638 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5639 5640 if (bdev->reset_io_drain_timeout == 0) { 5641 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5642 5643 bdev_io_submit_reset(bdev_io); 5644 return; 5645 } 5646 5647 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 5648 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 5649 5650 /* In case bdev->reset_io_drain_timeout is not equal to zero, 5651 * submit the reset to the underlying module only if outstanding I/O 5652 * remain after reset_io_drain_timeout seconds have passed. */ 5653 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 5654 bdev_reset_check_outstanding_io_done); 5655 } 5656 5657 static void 5658 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5659 struct spdk_io_channel *ch, void *_ctx) 5660 { 5661 struct spdk_bdev_channel *channel; 5662 struct spdk_bdev_mgmt_channel *mgmt_channel; 5663 struct spdk_bdev_shared_resource *shared_resource; 5664 bdev_io_tailq_t tmp_queued; 5665 5666 TAILQ_INIT(&tmp_queued); 5667 5668 channel = __io_ch_to_bdev_ch(ch); 5669 shared_resource = channel->shared_resource; 5670 mgmt_channel = shared_resource->mgmt_ch; 5671 5672 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 5673 5674 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 5675 /* The QoS object is always valid and readable while 5676 * the channel flag is set, so the lock here should not 5677 * be necessary. We're not in the fast path though, so 5678 * just take it anyway. 
*/
5679 spdk_spin_lock(&channel->bdev->internal.spinlock);
5680 if (channel->bdev->internal.qos->ch == channel) {
5681 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link);
5682 }
5683 spdk_spin_unlock(&channel->bdev->internal.spinlock);
5684 }
5685
5686 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel);
5687 bdev_abort_all_buf_io(mgmt_channel, channel);
5689 bdev_abort_all_queued_io(&tmp_queued, channel);
5690
5691 spdk_bdev_for_each_channel_continue(i, 0);
5692 }
5693
5694 static void
5695 bdev_start_reset(void *ctx)
5696 {
5697 struct spdk_bdev_channel *ch = ctx;
5698
5699 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch,
5700 bdev_reset_freeze_channel_done);
5701 }
5702
5703 static void
5704 bdev_channel_start_reset(struct spdk_bdev_channel *ch)
5705 {
5706 struct spdk_bdev *bdev = ch->bdev;
5707
5708 assert(!TAILQ_EMPTY(&ch->queued_resets));
5709
5710 spdk_spin_lock(&bdev->internal.spinlock);
5711 if (bdev->internal.reset_in_progress == NULL) {
5712 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets);
5713 /*
5714 * Take a channel reference for the target bdev for the life of this
5715 * reset. This guards against the channel getting destroyed while
5716 * spdk_bdev_for_each_channel() calls related to this reset IO are in
5717 * progress. We will release the reference when this reset is
5718 * completed.
5719 */
5720 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev));
5721 bdev_start_reset(ch);
5722 }
5723 spdk_spin_unlock(&bdev->internal.spinlock);
5724 }
5725
5726 int
5727 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5728 spdk_bdev_io_completion_cb cb, void *cb_arg)
5729 {
5730 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5731 struct spdk_bdev_io *bdev_io;
5732 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5733
5734 bdev_io = bdev_channel_get_io(channel);
5735 if (!bdev_io) {
5736 return -ENOMEM;
5737 }
5738
5739 bdev_io->internal.ch = channel;
5740 bdev_io->internal.desc = desc;
5741 bdev_io->internal.submit_tsc = spdk_get_ticks();
5742 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET;
5743 bdev_io->u.reset.ch_ref = NULL;
5744 bdev_io_init(bdev_io, bdev, cb_arg, cb);
5745
5746 spdk_spin_lock(&bdev->internal.spinlock);
5747 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link);
5748 spdk_spin_unlock(&bdev->internal.spinlock);
5749
5750 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io,
5751 internal.ch_link);
5752
5753 bdev_channel_start_reset(channel);
5754
5755 return 0;
5756 }
5757
5758 void
5759 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
5760 struct spdk_bdev_io_stat *stat)
5761 {
5762 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5763
5764 bdev_get_io_stat(stat, channel->stat);
5765 }
5766
5767 static void
5768 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status)
5769 {
5770 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx;
5771
5772 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat,
5773 bdev_iostat_ctx->cb_arg, 0);
5774 free(bdev_iostat_ctx);
5775 }
5776
5777 static void
5778 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
5779 struct spdk_io_channel *ch, void *_ctx)
5780 {
5781 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx;
5782 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5783
5784
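/*
 * Per-channel counters are folded into the caller-supplied spdk_bdev_io_stat
 * below; spdk_bdev_get_device_stat() (defined later in this file) first seeds
 * that structure with bdev->internal.stat, i.e. the counters inherited from
 * channels that have already been deleted. A minimal caller sketch is shown
 * for illustration only; the callback and variable names are hypothetical and
 * not part of the SPDK API:
 *
 *	static void
 *	example_device_stat_cb(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat,
 *			       void *cb_arg, int rc)
 *	{
 *		if (rc == 0) {
 *			printf("%s: %" PRIu64 " reads, %" PRIu64 " writes\n",
 *			       spdk_bdev_get_name(bdev), stat->num_read_ops,
 *			       stat->num_write_ops);
 *		}
 *		free(stat);
 *	}
 *
 *	struct spdk_bdev_io_stat *stat = calloc(1, sizeof(*stat));
 *	if (stat != NULL) {
 *		spdk_bdev_get_device_stat(bdev, stat, example_device_stat_cb, NULL);
 *	}
 */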
bdev_add_io_stat(bdev_iostat_ctx->stat, channel->stat); 5785 spdk_bdev_for_each_channel_continue(i, 0); 5786 } 5787 5788 void 5789 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 5790 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 5791 { 5792 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 5793 5794 assert(bdev != NULL); 5795 assert(stat != NULL); 5796 assert(cb != NULL); 5797 5798 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 5799 if (bdev_iostat_ctx == NULL) { 5800 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 5801 cb(bdev, stat, cb_arg, -ENOMEM); 5802 return; 5803 } 5804 5805 bdev_iostat_ctx->stat = stat; 5806 bdev_iostat_ctx->cb = cb; 5807 bdev_iostat_ctx->cb_arg = cb_arg; 5808 5809 /* Start with the statistics from previously deleted channels. */ 5810 spdk_spin_lock(&bdev->internal.spinlock); 5811 bdev_get_io_stat(bdev_iostat_ctx->stat, bdev->internal.stat); 5812 spdk_spin_unlock(&bdev->internal.spinlock); 5813 5814 /* Then iterate and add the statistics from each existing channel. */ 5815 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 5816 bdev_get_device_stat_done); 5817 } 5818 5819 struct bdev_iostat_reset_ctx { 5820 enum bdev_reset_stat_mode mode; 5821 bdev_reset_device_stat_cb cb; 5822 void *cb_arg; 5823 }; 5824 5825 static void 5826 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 5827 { 5828 struct bdev_iostat_reset_ctx *ctx = _ctx; 5829 5830 ctx->cb(bdev, ctx->cb_arg, 0); 5831 5832 free(ctx); 5833 } 5834 5835 static void 5836 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5837 struct spdk_io_channel *ch, void *_ctx) 5838 { 5839 struct bdev_iostat_reset_ctx *ctx = _ctx; 5840 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5841 5842 bdev_reset_io_stat(channel->stat, ctx->mode); 5843 5844 spdk_bdev_for_each_channel_continue(i, 0); 5845 } 5846 5847 void 5848 bdev_reset_device_stat(struct spdk_bdev *bdev, enum bdev_reset_stat_mode mode, 5849 bdev_reset_device_stat_cb cb, void *cb_arg) 5850 { 5851 struct bdev_iostat_reset_ctx *ctx; 5852 5853 assert(bdev != NULL); 5854 assert(cb != NULL); 5855 5856 ctx = calloc(1, sizeof(*ctx)); 5857 if (ctx == NULL) { 5858 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 5859 cb(bdev, cb_arg, -ENOMEM); 5860 return; 5861 } 5862 5863 ctx->mode = mode; 5864 ctx->cb = cb; 5865 ctx->cb_arg = cb_arg; 5866 5867 spdk_spin_lock(&bdev->internal.spinlock); 5868 bdev_reset_io_stat(bdev->internal.stat, mode); 5869 spdk_spin_unlock(&bdev->internal.spinlock); 5870 5871 spdk_bdev_for_each_channel(bdev, 5872 bdev_reset_each_channel_stat, 5873 ctx, 5874 bdev_reset_device_stat_done); 5875 } 5876 5877 int 5878 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5879 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 5880 spdk_bdev_io_completion_cb cb, void *cb_arg) 5881 { 5882 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5883 struct spdk_bdev_io *bdev_io; 5884 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5885 5886 if (!desc->write) { 5887 return -EBADF; 5888 } 5889 5890 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 5891 return -ENOTSUP; 5892 } 5893 5894 bdev_io = bdev_channel_get_io(channel); 5895 if (!bdev_io) { 5896 return -ENOMEM; 5897 } 5898 5899 bdev_io->internal.ch = channel; 5900 bdev_io->internal.desc = desc; 5901 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 5902 
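/*
 * The caller-built admin command is copied verbatim below and passed through
 * to the underlying NVMe bdev module without validation or translation at
 * this layer. Illustrative caller sketch (hypothetical names, shown only as
 * documentation) issuing an Identify Controller command into a DMA-able
 * 4 KiB buffer:
 *
 *	struct spdk_nvme_cmd cmd = {};
 *	void *buf = spdk_dma_zmalloc(4096, 0x1000, NULL);
 *
 *	cmd.opc = SPDK_NVME_OPC_IDENTIFY;
 *	cmd.cdw10 = SPDK_NVME_IDENTIFY_CTRLR;
 *	if (buf != NULL) {
 *		spdk_bdev_nvme_admin_passthru(desc, io_ch, &cmd, buf, 4096,
 *					      example_admin_cb, NULL);
 *	}
 */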
bdev_io->u.nvme_passthru.cmd = *cmd; 5903 bdev_io->u.nvme_passthru.buf = buf; 5904 bdev_io->u.nvme_passthru.nbytes = nbytes; 5905 bdev_io->u.nvme_passthru.md_buf = NULL; 5906 bdev_io->u.nvme_passthru.md_len = 0; 5907 5908 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5909 5910 bdev_io_submit(bdev_io); 5911 return 0; 5912 } 5913 5914 int 5915 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5916 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 5917 spdk_bdev_io_completion_cb cb, void *cb_arg) 5918 { 5919 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5920 struct spdk_bdev_io *bdev_io; 5921 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5922 5923 if (!desc->write) { 5924 /* 5925 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 5926 * to easily determine if the command is a read or write, but for now just 5927 * do not allow io_passthru with a read-only descriptor. 5928 */ 5929 return -EBADF; 5930 } 5931 5932 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 5933 return -ENOTSUP; 5934 } 5935 5936 bdev_io = bdev_channel_get_io(channel); 5937 if (!bdev_io) { 5938 return -ENOMEM; 5939 } 5940 5941 bdev_io->internal.ch = channel; 5942 bdev_io->internal.desc = desc; 5943 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 5944 bdev_io->u.nvme_passthru.cmd = *cmd; 5945 bdev_io->u.nvme_passthru.buf = buf; 5946 bdev_io->u.nvme_passthru.nbytes = nbytes; 5947 bdev_io->u.nvme_passthru.md_buf = NULL; 5948 bdev_io->u.nvme_passthru.md_len = 0; 5949 5950 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5951 5952 bdev_io_submit(bdev_io); 5953 return 0; 5954 } 5955 5956 int 5957 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5958 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 5959 spdk_bdev_io_completion_cb cb, void *cb_arg) 5960 { 5961 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5962 struct spdk_bdev_io *bdev_io; 5963 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5964 5965 if (!desc->write) { 5966 /* 5967 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 5968 * to easily determine if the command is a read or write, but for now just 5969 * do not allow io_passthru with a read-only descriptor. 
5970 */ 5971 return -EBADF; 5972 } 5973 5974 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 5975 return -ENOTSUP; 5976 } 5977 5978 bdev_io = bdev_channel_get_io(channel); 5979 if (!bdev_io) { 5980 return -ENOMEM; 5981 } 5982 5983 bdev_io->internal.ch = channel; 5984 bdev_io->internal.desc = desc; 5985 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 5986 bdev_io->u.nvme_passthru.cmd = *cmd; 5987 bdev_io->u.nvme_passthru.buf = buf; 5988 bdev_io->u.nvme_passthru.nbytes = nbytes; 5989 bdev_io->u.nvme_passthru.md_buf = md_buf; 5990 bdev_io->u.nvme_passthru.md_len = md_len; 5991 5992 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5993 5994 bdev_io_submit(bdev_io); 5995 return 0; 5996 } 5997 5998 static void bdev_abort_retry(void *ctx); 5999 static void bdev_abort(struct spdk_bdev_io *parent_io); 6000 6001 static void 6002 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6003 { 6004 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 6005 struct spdk_bdev_io *parent_io = cb_arg; 6006 struct spdk_bdev_io *bio_to_abort, *tmp_io; 6007 6008 bio_to_abort = bdev_io->u.abort.bio_to_abort; 6009 6010 spdk_bdev_free_io(bdev_io); 6011 6012 if (!success) { 6013 /* Check if the target I/O completed in the meantime. */ 6014 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 6015 if (tmp_io == bio_to_abort) { 6016 break; 6017 } 6018 } 6019 6020 /* If the target I/O still exists, set the parent to failed. */ 6021 if (tmp_io != NULL) { 6022 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6023 } 6024 } 6025 6026 parent_io->u.bdev.split_outstanding--; 6027 if (parent_io->u.bdev.split_outstanding == 0) { 6028 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6029 bdev_abort_retry(parent_io); 6030 } else { 6031 bdev_io_complete(parent_io); 6032 } 6033 } 6034 } 6035 6036 static int 6037 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 6038 struct spdk_bdev_io *bio_to_abort, 6039 spdk_bdev_io_completion_cb cb, void *cb_arg) 6040 { 6041 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6042 struct spdk_bdev_io *bdev_io; 6043 6044 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 6045 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 6046 /* TODO: Abort reset or abort request. */ 6047 return -ENOTSUP; 6048 } 6049 6050 bdev_io = bdev_channel_get_io(channel); 6051 if (bdev_io == NULL) { 6052 return -ENOMEM; 6053 } 6054 6055 bdev_io->internal.ch = channel; 6056 bdev_io->internal.desc = desc; 6057 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6058 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6059 6060 if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bio_to_abort)) { 6061 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 6062 6063 /* Parent abort request is not submitted directly, but to manage its 6064 * execution add it to the submitted list here. 6065 */ 6066 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6067 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6068 6069 bdev_abort(bdev_io); 6070 6071 return 0; 6072 } 6073 6074 bdev_io->u.abort.bio_to_abort = bio_to_abort; 6075 6076 /* Submit the abort request to the underlying bdev module. 
*/ 6077 bdev_io_submit(bdev_io); 6078 6079 return 0; 6080 } 6081 6082 static uint32_t 6083 _bdev_abort(struct spdk_bdev_io *parent_io) 6084 { 6085 struct spdk_bdev_desc *desc = parent_io->internal.desc; 6086 struct spdk_bdev_channel *channel = parent_io->internal.ch; 6087 void *bio_cb_arg; 6088 struct spdk_bdev_io *bio_to_abort; 6089 uint32_t matched_ios; 6090 int rc; 6091 6092 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 6093 6094 /* matched_ios is returned and will be kept by the caller. 6095 * 6096 * This function will be used for two cases, 1) the same cb_arg is used for 6097 * multiple I/Os, 2) a single large I/O is split into smaller ones. 6098 * Incrementing split_outstanding directly here may confuse readers especially 6099 * for the 1st case. 6100 * 6101 * Completion of I/O abort is processed after stack unwinding. Hence this trick 6102 * works as expected. 6103 */ 6104 matched_ios = 0; 6105 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6106 6107 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 6108 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 6109 continue; 6110 } 6111 6112 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 6113 /* Any I/O which was submitted after this abort command should be excluded. */ 6114 continue; 6115 } 6116 6117 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 6118 if (rc != 0) { 6119 if (rc == -ENOMEM) { 6120 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 6121 } else { 6122 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6123 } 6124 break; 6125 } 6126 matched_ios++; 6127 } 6128 6129 return matched_ios; 6130 } 6131 6132 static void 6133 bdev_abort_retry(void *ctx) 6134 { 6135 struct spdk_bdev_io *parent_io = ctx; 6136 uint32_t matched_ios; 6137 6138 matched_ios = _bdev_abort(parent_io); 6139 6140 if (matched_ios == 0) { 6141 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6142 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6143 } else { 6144 /* For retry, the case that no target I/O was found is success 6145 * because it means target I/Os completed in the meantime. 6146 */ 6147 bdev_io_complete(parent_io); 6148 } 6149 return; 6150 } 6151 6152 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6153 parent_io->u.bdev.split_outstanding = matched_ios; 6154 } 6155 6156 static void 6157 bdev_abort(struct spdk_bdev_io *parent_io) 6158 { 6159 uint32_t matched_ios; 6160 6161 matched_ios = _bdev_abort(parent_io); 6162 6163 if (matched_ios == 0) { 6164 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6165 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6166 } else { 6167 /* The case the no target I/O was found is failure. */ 6168 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6169 bdev_io_complete(parent_io); 6170 } 6171 return; 6172 } 6173 6174 /* Use split_outstanding to manage the progress of aborting I/Os. 
*/ 6175 parent_io->u.bdev.split_outstanding = matched_ios; 6176 } 6177 6178 int 6179 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6180 void *bio_cb_arg, 6181 spdk_bdev_io_completion_cb cb, void *cb_arg) 6182 { 6183 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6184 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6185 struct spdk_bdev_io *bdev_io; 6186 6187 if (bio_cb_arg == NULL) { 6188 return -EINVAL; 6189 } 6190 6191 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 6192 return -ENOTSUP; 6193 } 6194 6195 bdev_io = bdev_channel_get_io(channel); 6196 if (bdev_io == NULL) { 6197 return -ENOMEM; 6198 } 6199 6200 bdev_io->internal.ch = channel; 6201 bdev_io->internal.desc = desc; 6202 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6203 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6204 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6205 6206 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 6207 6208 /* Parent abort request is not submitted directly, but to manage its execution, 6209 * add it to the submitted list here. 6210 */ 6211 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6212 6213 bdev_abort(bdev_io); 6214 6215 return 0; 6216 } 6217 6218 int 6219 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6220 struct spdk_bdev_io_wait_entry *entry) 6221 { 6222 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6223 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 6224 6225 if (bdev != entry->bdev) { 6226 SPDK_ERRLOG("bdevs do not match\n"); 6227 return -EINVAL; 6228 } 6229 6230 if (mgmt_ch->per_thread_cache_count > 0) { 6231 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 6232 return -EINVAL; 6233 } 6234 6235 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 6236 return 0; 6237 } 6238 6239 static inline void 6240 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 6241 { 6242 enum spdk_bdev_io_status io_status = bdev_io->internal.status; 6243 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 6244 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 6245 uint32_t blocklen = bdev_io->bdev->blocklen; 6246 6247 if (spdk_likely(io_status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 6248 switch (bdev_io->type) { 6249 case SPDK_BDEV_IO_TYPE_READ: 6250 io_stat->bytes_read += num_blocks * blocklen; 6251 io_stat->num_read_ops++; 6252 io_stat->read_latency_ticks += tsc_diff; 6253 if (io_stat->max_read_latency_ticks < tsc_diff) { 6254 io_stat->max_read_latency_ticks = tsc_diff; 6255 } 6256 if (io_stat->min_read_latency_ticks > tsc_diff) { 6257 io_stat->min_read_latency_ticks = tsc_diff; 6258 } 6259 break; 6260 case SPDK_BDEV_IO_TYPE_WRITE: 6261 io_stat->bytes_written += num_blocks * blocklen; 6262 io_stat->num_write_ops++; 6263 io_stat->write_latency_ticks += tsc_diff; 6264 if (io_stat->max_write_latency_ticks < tsc_diff) { 6265 io_stat->max_write_latency_ticks = tsc_diff; 6266 } 6267 if (io_stat->min_write_latency_ticks > tsc_diff) { 6268 io_stat->min_write_latency_ticks = tsc_diff; 6269 } 6270 break; 6271 case SPDK_BDEV_IO_TYPE_UNMAP: 6272 io_stat->bytes_unmapped += num_blocks * blocklen; 6273 io_stat->num_unmap_ops++; 6274 io_stat->unmap_latency_ticks += tsc_diff; 6275 if (io_stat->max_unmap_latency_ticks < tsc_diff) { 6276 io_stat->max_unmap_latency_ticks = tsc_diff; 6277 } 6278 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 6279 io_stat->min_unmap_latency_ticks = tsc_diff; 6280 } 6281 break; 6282 
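/*
 * All latency counters in this switch are accumulated in TSC ticks, not in
 * wall-clock units. An illustrative conversion (hypothetical helper, not part
 * of this file) from the read-side counters to an average latency in
 * microseconds:
 *
 *	static double
 *	example_avg_read_latency_us(const struct spdk_bdev_io_stat *stat)
 *	{
 *		if (stat->num_read_ops == 0) {
 *			return 0.0;
 *		}
 *		return (double)stat->read_latency_ticks * 1000000.0 /
 *		       ((double)stat->num_read_ops * spdk_get_ticks_hz());
 *	}
 */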
case SPDK_BDEV_IO_TYPE_ZCOPY: 6283 /* Track the data in the start phase only */ 6284 if (bdev_io->u.bdev.zcopy.start) { 6285 if (bdev_io->u.bdev.zcopy.populate) { 6286 io_stat->bytes_read += num_blocks * blocklen; 6287 io_stat->num_read_ops++; 6288 io_stat->read_latency_ticks += tsc_diff; 6289 if (io_stat->max_read_latency_ticks < tsc_diff) { 6290 io_stat->max_read_latency_ticks = tsc_diff; 6291 } 6292 if (io_stat->min_read_latency_ticks > tsc_diff) { 6293 io_stat->min_read_latency_ticks = tsc_diff; 6294 } 6295 } else { 6296 io_stat->bytes_written += num_blocks * blocklen; 6297 io_stat->num_write_ops++; 6298 io_stat->write_latency_ticks += tsc_diff; 6299 if (io_stat->max_write_latency_ticks < tsc_diff) { 6300 io_stat->max_write_latency_ticks = tsc_diff; 6301 } 6302 if (io_stat->min_write_latency_ticks > tsc_diff) { 6303 io_stat->min_write_latency_ticks = tsc_diff; 6304 } 6305 } 6306 } 6307 break; 6308 case SPDK_BDEV_IO_TYPE_COPY: 6309 io_stat->bytes_copied += num_blocks * blocklen; 6310 io_stat->num_copy_ops++; 6311 bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; 6312 if (io_stat->max_copy_latency_ticks < tsc_diff) { 6313 io_stat->max_copy_latency_ticks = tsc_diff; 6314 } 6315 if (io_stat->min_copy_latency_ticks > tsc_diff) { 6316 io_stat->min_copy_latency_ticks = tsc_diff; 6317 } 6318 break; 6319 default: 6320 break; 6321 } 6322 } else if (io_status <= SPDK_BDEV_IO_STATUS_FAILED && io_status >= SPDK_MIN_BDEV_IO_STATUS) { 6323 io_stat = bdev_io->bdev->internal.stat; 6324 assert(io_stat->io_error != NULL); 6325 6326 spdk_spin_lock(&bdev_io->bdev->internal.spinlock); 6327 io_stat->io_error->error_status[-io_status - 1]++; 6328 spdk_spin_unlock(&bdev_io->bdev->internal.spinlock); 6329 } 6330 6331 #ifdef SPDK_CONFIG_VTUNE 6332 uint64_t now_tsc = spdk_get_ticks(); 6333 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 6334 uint64_t data[5]; 6335 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 6336 6337 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 6338 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 6339 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 6340 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 6341 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 6342 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 6343 6344 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 6345 __itt_metadata_u64, 5, data); 6346 6347 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 6348 bdev_io->internal.ch->start_tsc = now_tsc; 6349 } 6350 #endif 6351 } 6352 6353 static inline void 6354 bdev_io_complete(void *ctx) 6355 { 6356 struct spdk_bdev_io *bdev_io = ctx; 6357 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6358 uint64_t tsc, tsc_diff; 6359 6360 if (spdk_unlikely(bdev_io->internal.in_submit_request)) { 6361 /* 6362 * Defer completion to avoid potential infinite recursion if the 6363 * user's completion callback issues a new I/O. 
6364 */ 6365 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6366 bdev_io_complete, bdev_io); 6367 return; 6368 } 6369 6370 tsc = spdk_get_ticks(); 6371 tsc_diff = tsc - bdev_io->internal.submit_tsc; 6372 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 6373 bdev_io->internal.caller_ctx); 6374 6375 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 6376 6377 if (bdev_io->internal.ch->histogram) { 6378 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 6379 } 6380 6381 bdev_io_update_io_stat(bdev_io, tsc_diff); 6382 6383 assert(bdev_io->internal.cb != NULL); 6384 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 6385 6386 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 6387 bdev_io->internal.caller_ctx); 6388 } 6389 6390 static void bdev_destroy_cb(void *io_device); 6391 6392 static void 6393 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 6394 { 6395 struct spdk_bdev_io *bdev_io = _ctx; 6396 6397 if (bdev_io->u.reset.ch_ref != NULL) { 6398 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 6399 bdev_io->u.reset.ch_ref = NULL; 6400 } 6401 6402 bdev_io_complete(bdev_io); 6403 6404 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 6405 TAILQ_EMPTY(&bdev->internal.open_descs)) { 6406 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6407 } 6408 } 6409 6410 static void 6411 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6412 struct spdk_io_channel *_ch, void *_ctx) 6413 { 6414 struct spdk_bdev_io *bdev_io = _ctx; 6415 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 6416 struct spdk_bdev_io *queued_reset; 6417 6418 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 6419 while (!TAILQ_EMPTY(&ch->queued_resets)) { 6420 queued_reset = TAILQ_FIRST(&ch->queued_resets); 6421 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 6422 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 6423 } 6424 6425 spdk_bdev_for_each_channel_continue(i, 0); 6426 } 6427 6428 void 6429 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 6430 { 6431 struct spdk_bdev *bdev = bdev_io->bdev; 6432 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6433 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 6434 6435 bdev_io->internal.status = status; 6436 6437 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 6438 bool unlock_channels = false; 6439 6440 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 6441 SPDK_ERRLOG("NOMEM returned for reset\n"); 6442 } 6443 spdk_spin_lock(&bdev->internal.spinlock); 6444 if (bdev_io == bdev->internal.reset_in_progress) { 6445 bdev->internal.reset_in_progress = NULL; 6446 unlock_channels = true; 6447 } 6448 spdk_spin_unlock(&bdev->internal.spinlock); 6449 6450 if (unlock_channels) { 6451 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 6452 bdev_reset_complete); 6453 return; 6454 } 6455 } else { 6456 if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0)) { 6457 _bdev_io_push_bounce_data_buffer(bdev_io, _bdev_io_complete_push_bounce_done); 6458 /* bdev IO will be completed in the callback */ 6459 return; 6460 } 6461 6462 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 6463 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 6464 return; 6465 } 6466 } 6467 6468 bdev_io_complete(bdev_io); 6469 } 6470 6471 void 6472 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum 
spdk_scsi_status sc, 6473 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 6474 { 6475 if (sc == SPDK_SCSI_STATUS_GOOD) { 6476 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6477 } else { 6478 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 6479 bdev_io->internal.error.scsi.sc = sc; 6480 bdev_io->internal.error.scsi.sk = sk; 6481 bdev_io->internal.error.scsi.asc = asc; 6482 bdev_io->internal.error.scsi.ascq = ascq; 6483 } 6484 6485 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6486 } 6487 6488 void 6489 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 6490 int *sc, int *sk, int *asc, int *ascq) 6491 { 6492 assert(sc != NULL); 6493 assert(sk != NULL); 6494 assert(asc != NULL); 6495 assert(ascq != NULL); 6496 6497 switch (bdev_io->internal.status) { 6498 case SPDK_BDEV_IO_STATUS_SUCCESS: 6499 *sc = SPDK_SCSI_STATUS_GOOD; 6500 *sk = SPDK_SCSI_SENSE_NO_SENSE; 6501 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 6502 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 6503 break; 6504 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 6505 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 6506 break; 6507 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 6508 *sc = bdev_io->internal.error.scsi.sc; 6509 *sk = bdev_io->internal.error.scsi.sk; 6510 *asc = bdev_io->internal.error.scsi.asc; 6511 *ascq = bdev_io->internal.error.scsi.ascq; 6512 break; 6513 default: 6514 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 6515 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 6516 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 6517 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 6518 break; 6519 } 6520 } 6521 6522 void 6523 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 6524 { 6525 if (aio_result == 0) { 6526 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6527 } else { 6528 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 6529 } 6530 6531 bdev_io->internal.error.aio_result = aio_result; 6532 6533 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6534 } 6535 6536 void 6537 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 6538 { 6539 assert(aio_result != NULL); 6540 6541 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 6542 *aio_result = bdev_io->internal.error.aio_result; 6543 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6544 *aio_result = 0; 6545 } else { 6546 *aio_result = -EIO; 6547 } 6548 } 6549 6550 void 6551 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 6552 { 6553 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 6554 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6555 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 6556 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 6557 } else { 6558 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 6559 } 6560 6561 bdev_io->internal.error.nvme.cdw0 = cdw0; 6562 bdev_io->internal.error.nvme.sct = sct; 6563 bdev_io->internal.error.nvme.sc = sc; 6564 6565 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6566 } 6567 6568 void 6569 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 6570 { 6571 assert(sct != NULL); 6572 assert(sc != NULL); 6573 assert(cdw0 != NULL); 6574 6575 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 6576 *sct = SPDK_NVME_SCT_GENERIC; 6577 *sc = SPDK_NVME_SC_SUCCESS; 6578 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) 
{ 6579 *cdw0 = 0; 6580 } else { 6581 *cdw0 = 1U; 6582 } 6583 return; 6584 } 6585 6586 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 6587 *sct = bdev_io->internal.error.nvme.sct; 6588 *sc = bdev_io->internal.error.nvme.sc; 6589 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6590 *sct = SPDK_NVME_SCT_GENERIC; 6591 *sc = SPDK_NVME_SC_SUCCESS; 6592 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 6593 *sct = SPDK_NVME_SCT_GENERIC; 6594 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6595 } else { 6596 *sct = SPDK_NVME_SCT_GENERIC; 6597 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6598 } 6599 6600 *cdw0 = bdev_io->internal.error.nvme.cdw0; 6601 } 6602 6603 void 6604 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 6605 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 6606 { 6607 assert(first_sct != NULL); 6608 assert(first_sc != NULL); 6609 assert(second_sct != NULL); 6610 assert(second_sc != NULL); 6611 assert(cdw0 != NULL); 6612 6613 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 6614 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 6615 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 6616 *first_sct = bdev_io->internal.error.nvme.sct; 6617 *first_sc = bdev_io->internal.error.nvme.sc; 6618 *second_sct = SPDK_NVME_SCT_GENERIC; 6619 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6620 } else { 6621 *first_sct = SPDK_NVME_SCT_GENERIC; 6622 *first_sc = SPDK_NVME_SC_SUCCESS; 6623 *second_sct = bdev_io->internal.error.nvme.sct; 6624 *second_sc = bdev_io->internal.error.nvme.sc; 6625 } 6626 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 6627 *first_sct = SPDK_NVME_SCT_GENERIC; 6628 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6629 *second_sct = SPDK_NVME_SCT_GENERIC; 6630 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6631 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6632 *first_sct = SPDK_NVME_SCT_GENERIC; 6633 *first_sc = SPDK_NVME_SC_SUCCESS; 6634 *second_sct = SPDK_NVME_SCT_GENERIC; 6635 *second_sc = SPDK_NVME_SC_SUCCESS; 6636 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 6637 *first_sct = SPDK_NVME_SCT_GENERIC; 6638 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6639 *second_sct = SPDK_NVME_SCT_GENERIC; 6640 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6641 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 6642 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 6643 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 6644 *second_sct = SPDK_NVME_SCT_GENERIC; 6645 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6646 } else { 6647 *first_sct = SPDK_NVME_SCT_GENERIC; 6648 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6649 *second_sct = SPDK_NVME_SCT_GENERIC; 6650 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6651 } 6652 6653 *cdw0 = bdev_io->internal.error.nvme.cdw0; 6654 } 6655 6656 struct spdk_thread * 6657 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 6658 { 6659 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 6660 } 6661 6662 struct spdk_io_channel * 6663 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 6664 { 6665 return bdev_io->internal.ch->channel; 6666 } 6667 6668 static int 6669 bdev_register(struct spdk_bdev *bdev) 6670 { 6671 char *bdev_name; 6672 char uuid[SPDK_UUID_STRING_LEN]; 6673 int ret; 6674 6675 assert(bdev->module != NULL); 6676 6677 if (!bdev->name) { 6678 SPDK_ERRLOG("Bdev name is 
NULL\n"); 6679 return -EINVAL; 6680 } 6681 6682 if (!strlen(bdev->name)) { 6683 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 6684 return -EINVAL; 6685 } 6686 6687 /* Users often register their own I/O devices using the bdev name. In 6688 * order to avoid conflicts, prepend bdev_. */ 6689 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 6690 if (!bdev_name) { 6691 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 6692 return -ENOMEM; 6693 } 6694 6695 bdev->internal.stat = bdev_alloc_io_stat(true); 6696 if (!bdev->internal.stat) { 6697 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 6698 free(bdev_name); 6699 return -ENOMEM; 6700 } 6701 6702 bdev->internal.status = SPDK_BDEV_STATUS_READY; 6703 bdev->internal.measured_queue_depth = UINT64_MAX; 6704 bdev->internal.claim_module = NULL; 6705 bdev->internal.qd_poller = NULL; 6706 bdev->internal.qos = NULL; 6707 6708 TAILQ_INIT(&bdev->internal.open_descs); 6709 TAILQ_INIT(&bdev->internal.locked_ranges); 6710 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 6711 TAILQ_INIT(&bdev->aliases); 6712 6713 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 6714 if (ret != 0) { 6715 bdev_free_io_stat(bdev->internal.stat); 6716 free(bdev_name); 6717 return ret; 6718 } 6719 6720 /* UUID has to be specified by the user or defined by bdev itself. 6721 * Otherwise this field must remain empty, to indicate that this 6722 * value cannot be depended upon. */ 6723 if (!spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { 6724 /* Add the UUID alias only if it's different than the name */ 6725 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 6726 if (strcmp(bdev->name, uuid) != 0) { 6727 ret = spdk_bdev_alias_add(bdev, uuid); 6728 if (ret != 0) { 6729 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 6730 bdev_name_del(&bdev->internal.bdev_name); 6731 bdev_free_io_stat(bdev->internal.stat); 6732 free(bdev_name); 6733 return ret; 6734 } 6735 } 6736 } 6737 6738 if (spdk_bdev_get_buf_align(bdev) > 1) { 6739 if (bdev->split_on_optimal_io_boundary) { 6740 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 6741 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 6742 } else { 6743 bdev->split_on_optimal_io_boundary = true; 6744 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 6745 } 6746 } 6747 6748 /* If the user didn't specify a write unit size, set it to one. 
*/ 6749 if (bdev->write_unit_size == 0) { 6750 bdev->write_unit_size = 1; 6751 } 6752 6753 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 6754 if (bdev->acwu == 0) { 6755 bdev->acwu = bdev->write_unit_size; 6756 } 6757 6758 if (bdev->phys_blocklen == 0) { 6759 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 6760 } 6761 6762 bdev->internal.reset_in_progress = NULL; 6763 bdev->internal.qd_poll_in_progress = false; 6764 bdev->internal.period = 0; 6765 bdev->internal.new_period = 0; 6766 6767 spdk_io_device_register(__bdev_to_io_dev(bdev), 6768 bdev_channel_create, bdev_channel_destroy, 6769 sizeof(struct spdk_bdev_channel), 6770 bdev_name); 6771 6772 free(bdev_name); 6773 6774 spdk_spin_init(&bdev->internal.spinlock); 6775 6776 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 6777 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 6778 6779 return 0; 6780 } 6781 6782 static void 6783 bdev_destroy_cb(void *io_device) 6784 { 6785 int rc; 6786 struct spdk_bdev *bdev; 6787 spdk_bdev_unregister_cb cb_fn; 6788 void *cb_arg; 6789 6790 bdev = __bdev_from_io_dev(io_device); 6791 cb_fn = bdev->internal.unregister_cb; 6792 cb_arg = bdev->internal.unregister_ctx; 6793 6794 spdk_spin_destroy(&bdev->internal.spinlock); 6795 free(bdev->internal.qos); 6796 bdev_free_io_stat(bdev->internal.stat); 6797 6798 rc = bdev->fn_table->destruct(bdev->ctxt); 6799 if (rc < 0) { 6800 SPDK_ERRLOG("destruct failed\n"); 6801 } 6802 if (rc <= 0 && cb_fn != NULL) { 6803 cb_fn(cb_arg, rc); 6804 } 6805 } 6806 6807 void 6808 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 6809 { 6810 if (bdev->internal.unregister_cb != NULL) { 6811 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 6812 } 6813 } 6814 6815 static void 6816 _remove_notify(void *arg) 6817 { 6818 struct spdk_bdev_desc *desc = arg; 6819 6820 spdk_spin_lock(&desc->spinlock); 6821 desc->refs--; 6822 6823 if (!desc->closed) { 6824 spdk_spin_unlock(&desc->spinlock); 6825 desc->callback.event_fn(SPDK_BDEV_EVENT_REMOVE, desc->bdev, desc->callback.ctx); 6826 return; 6827 } else if (0 == desc->refs) { 6828 /* This descriptor was closed after this remove_notify message was sent. 6829 * spdk_bdev_close() could not free the descriptor since this message was 6830 * in flight, so we free it now using bdev_desc_free(). 6831 */ 6832 spdk_spin_unlock(&desc->spinlock); 6833 bdev_desc_free(desc); 6834 return; 6835 } 6836 spdk_spin_unlock(&desc->spinlock); 6837 } 6838 6839 /* returns: 0 - bdev removed and ready to be destructed. 6840 * -EBUSY - bdev can't be destructed yet. */ 6841 static int 6842 bdev_unregister_unsafe(struct spdk_bdev *bdev) 6843 { 6844 struct spdk_bdev_desc *desc, *tmp; 6845 int rc = 0; 6846 char uuid[SPDK_UUID_STRING_LEN]; 6847 6848 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 6849 assert(spdk_spin_held(&bdev->internal.spinlock)); 6850 6851 /* Notify each descriptor about hotremoval */ 6852 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 6853 rc = -EBUSY; 6854 spdk_spin_lock(&desc->spinlock); 6855 /* 6856 * Defer invocation of the event_cb to a separate message that will 6857 * run later on its thread. This ensures this context unwinds and 6858 * we don't recursively unregister this bdev again if the event_cb 6859 * immediately closes its descriptor. 
6860 */ 6861 desc->refs++; 6862 spdk_thread_send_msg(desc->thread, _remove_notify, desc); 6863 spdk_spin_unlock(&desc->spinlock); 6864 } 6865 6866 /* If there are no descriptors, proceed removing the bdev */ 6867 if (rc == 0) { 6868 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 6869 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 6870 6871 /* Delete the name and the UUID alias */ 6872 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 6873 bdev_name_del_unsafe(&bdev->internal.bdev_name); 6874 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 6875 6876 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 6877 6878 if (bdev->internal.reset_in_progress != NULL) { 6879 /* If reset is in progress, let the completion callback for reset 6880 * unregister the bdev. 6881 */ 6882 rc = -EBUSY; 6883 } 6884 } 6885 6886 return rc; 6887 } 6888 6889 static void 6890 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6891 struct spdk_io_channel *io_ch, void *_ctx) 6892 { 6893 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 6894 6895 bdev_channel_abort_queued_ios(bdev_ch); 6896 spdk_bdev_for_each_channel_continue(i, 0); 6897 } 6898 6899 static void 6900 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 6901 { 6902 int rc; 6903 6904 spdk_spin_lock(&g_bdev_mgr.spinlock); 6905 spdk_spin_lock(&bdev->internal.spinlock); 6906 /* 6907 * Set the status to REMOVING after completing to abort channels. Otherwise, 6908 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 6909 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 6910 * may fail. 6911 */ 6912 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 6913 rc = bdev_unregister_unsafe(bdev); 6914 spdk_spin_unlock(&bdev->internal.spinlock); 6915 spdk_spin_unlock(&g_bdev_mgr.spinlock); 6916 6917 if (rc == 0) { 6918 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6919 } 6920 } 6921 6922 void 6923 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 6924 { 6925 struct spdk_thread *thread; 6926 6927 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 6928 6929 thread = spdk_get_thread(); 6930 if (!thread) { 6931 /* The user called this from a non-SPDK thread. 
*/ 6932 if (cb_fn != NULL) { 6933 cb_fn(cb_arg, -ENOTSUP); 6934 } 6935 return; 6936 } 6937 6938 spdk_spin_lock(&g_bdev_mgr.spinlock); 6939 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 6940 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 6941 spdk_spin_unlock(&g_bdev_mgr.spinlock); 6942 if (cb_fn) { 6943 cb_fn(cb_arg, -EBUSY); 6944 } 6945 return; 6946 } 6947 6948 spdk_spin_lock(&bdev->internal.spinlock); 6949 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 6950 bdev->internal.unregister_cb = cb_fn; 6951 bdev->internal.unregister_ctx = cb_arg; 6952 spdk_spin_unlock(&bdev->internal.spinlock); 6953 spdk_spin_unlock(&g_bdev_mgr.spinlock); 6954 6955 spdk_bdev_set_qd_sampling_period(bdev, 0); 6956 6957 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 6958 bdev_unregister); 6959 } 6960 6961 int 6962 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 6963 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 6964 { 6965 struct spdk_bdev_desc *desc; 6966 struct spdk_bdev *bdev; 6967 int rc; 6968 6969 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 6970 if (rc != 0) { 6971 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 6972 return rc; 6973 } 6974 6975 bdev = spdk_bdev_desc_get_bdev(desc); 6976 6977 if (bdev->module != module) { 6978 spdk_bdev_close(desc); 6979 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 6980 bdev_name); 6981 return -ENODEV; 6982 } 6983 6984 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 6985 6986 spdk_bdev_close(desc); 6987 6988 return 0; 6989 } 6990 6991 static int 6992 bdev_start_qos(struct spdk_bdev *bdev) 6993 { 6994 struct set_qos_limit_ctx *ctx; 6995 6996 /* Enable QoS */ 6997 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 6998 ctx = calloc(1, sizeof(*ctx)); 6999 if (ctx == NULL) { 7000 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 7001 return -ENOMEM; 7002 } 7003 ctx->bdev = bdev; 7004 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 7005 } 7006 7007 return 0; 7008 } 7009 7010 static int 7011 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 7012 { 7013 struct spdk_thread *thread; 7014 int rc = 0; 7015 7016 thread = spdk_get_thread(); 7017 if (!thread) { 7018 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 7019 return -ENOTSUP; 7020 } 7021 7022 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7023 spdk_get_thread()); 7024 7025 desc->bdev = bdev; 7026 desc->thread = thread; 7027 desc->write = write; 7028 7029 spdk_spin_lock(&bdev->internal.spinlock); 7030 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 7031 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 7032 spdk_spin_unlock(&bdev->internal.spinlock); 7033 return -ENODEV; 7034 } 7035 7036 if (write && bdev->internal.claim_module) { 7037 SPDK_ERRLOG("Could not open %s - %s module already claimed it\n", 7038 bdev->name, bdev->internal.claim_module->name); 7039 spdk_spin_unlock(&bdev->internal.spinlock); 7040 return -EPERM; 7041 } 7042 7043 rc = bdev_start_qos(bdev); 7044 if (rc != 0) { 7045 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 7046 spdk_spin_unlock(&bdev->internal.spinlock); 7047 return rc; 7048 } 7049 7050 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 7051 7052 spdk_spin_unlock(&bdev->internal.spinlock); 7053 7054 return 0; 7055 } 7056 7057 static int 7058 bdev_desc_alloc(struct spdk_bdev 
*bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 7059 struct spdk_bdev_desc **_desc) 7060 { 7061 struct spdk_bdev_desc *desc; 7062 unsigned int event_id; 7063 7064 desc = calloc(1, sizeof(*desc)); 7065 if (desc == NULL) { 7066 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 7067 return -ENOMEM; 7068 } 7069 7070 TAILQ_INIT(&desc->pending_media_events); 7071 TAILQ_INIT(&desc->free_media_events); 7072 7073 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 7074 desc->callback.event_fn = event_cb; 7075 desc->callback.ctx = event_ctx; 7076 spdk_spin_init(&desc->spinlock); 7077 7078 if (bdev->media_events) { 7079 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 7080 sizeof(*desc->media_events_buffer)); 7081 if (desc->media_events_buffer == NULL) { 7082 SPDK_ERRLOG("Failed to initialize media event pool\n"); 7083 bdev_desc_free(desc); 7084 return -ENOMEM; 7085 } 7086 7087 for (event_id = 0; event_id < MEDIA_EVENT_POOL_SIZE; ++event_id) { 7088 TAILQ_INSERT_TAIL(&desc->free_media_events, 7089 &desc->media_events_buffer[event_id], tailq); 7090 } 7091 } 7092 7093 *_desc = desc; 7094 7095 return 0; 7096 } 7097 7098 int 7099 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7100 void *event_ctx, struct spdk_bdev_desc **_desc) 7101 { 7102 struct spdk_bdev_desc *desc; 7103 struct spdk_bdev *bdev; 7104 int rc; 7105 7106 if (event_cb == NULL) { 7107 SPDK_ERRLOG("Missing event callback function\n"); 7108 return -EINVAL; 7109 } 7110 7111 spdk_spin_lock(&g_bdev_mgr.spinlock); 7112 7113 bdev = bdev_get_by_name(bdev_name); 7114 7115 if (bdev == NULL) { 7116 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 7117 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7118 return -ENODEV; 7119 } 7120 7121 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 7122 if (rc != 0) { 7123 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7124 return rc; 7125 } 7126 7127 rc = bdev_open(bdev, write, desc); 7128 if (rc != 0) { 7129 bdev_desc_free(desc); 7130 desc = NULL; 7131 } 7132 7133 *_desc = desc; 7134 7135 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7136 7137 return rc; 7138 } 7139 7140 static void 7141 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 7142 { 7143 int rc; 7144 7145 spdk_spin_lock(&bdev->internal.spinlock); 7146 spdk_spin_lock(&desc->spinlock); 7147 7148 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 7149 7150 desc->closed = true; 7151 7152 if (0 == desc->refs) { 7153 spdk_spin_unlock(&desc->spinlock); 7154 bdev_desc_free(desc); 7155 } else { 7156 spdk_spin_unlock(&desc->spinlock); 7157 } 7158 7159 /* If no more descriptors, kill QoS channel */ 7160 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7161 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 7162 bdev->name, spdk_get_thread()); 7163 7164 if (bdev_qos_destroy(bdev)) { 7165 /* There isn't anything we can do to recover here. Just let the 7166 * old QoS poller keep running. The QoS handling won't change 7167 * cores when the user allocates a new channel, but it won't break. */ 7168 SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); 7169 } 7170 } 7171 7172 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7173 rc = bdev_unregister_unsafe(bdev); 7174 spdk_spin_unlock(&bdev->internal.spinlock); 7175 7176 if (rc == 0) { 7177 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7178 } 7179 } else { 7180 spdk_spin_unlock(&bdev->internal.spinlock); 7181 } 7182 } 7183 7184 void 7185 spdk_bdev_close(struct spdk_bdev_desc *desc) 7186 { 7187 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7188 7189 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7190 spdk_get_thread()); 7191 7192 assert(desc->thread == spdk_get_thread()); 7193 7194 spdk_poller_unregister(&desc->io_timeout_poller); 7195 7196 spdk_spin_lock(&g_bdev_mgr.spinlock); 7197 7198 bdev_close(bdev, desc); 7199 7200 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7201 } 7202 7203 static void 7204 bdev_register_finished(void *arg) 7205 { 7206 struct spdk_bdev_desc *desc = arg; 7207 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7208 7209 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 7210 7211 spdk_spin_lock(&g_bdev_mgr.spinlock); 7212 7213 bdev_close(bdev, desc); 7214 7215 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7216 } 7217 7218 int 7219 spdk_bdev_register(struct spdk_bdev *bdev) 7220 { 7221 struct spdk_bdev_desc *desc; 7222 int rc; 7223 7224 if (spdk_unlikely(spdk_thread_get_app_thread() != spdk_get_thread())) { 7225 SPDK_LOG_DEPRECATED(bdev_register_examine_thread); 7226 } 7227 7228 rc = bdev_register(bdev); 7229 if (rc != 0) { 7230 return rc; 7231 } 7232 7233 /* A descriptor is opened to prevent bdev deletion during examination */ 7234 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7235 if (rc != 0) { 7236 spdk_bdev_unregister(bdev, NULL, NULL); 7237 return rc; 7238 } 7239 7240 rc = bdev_open(bdev, false, desc); 7241 if (rc != 0) { 7242 bdev_desc_free(desc); 7243 spdk_bdev_unregister(bdev, NULL, NULL); 7244 return rc; 7245 } 7246 7247 /* Examine configuration before initializing I/O */ 7248 bdev_examine(bdev); 7249 7250 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 7251 if (rc != 0) { 7252 bdev_close(bdev, desc); 7253 spdk_bdev_unregister(bdev, NULL, NULL); 7254 } 7255 7256 return rc; 7257 } 7258 7259 int 7260 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 7261 struct spdk_bdev_module *module) 7262 { 7263 if (bdev->internal.claim_module != NULL) { 7264 SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, 7265 bdev->internal.claim_module->name); 7266 return -EPERM; 7267 } 7268 7269 if (desc && !desc->write) { 7270 desc->write = true; 7271 } 7272 7273 bdev->internal.claim_module = module; 7274 return 0; 7275 } 7276 7277 void 7278 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 7279 { 7280 assert(bdev->internal.claim_module != NULL); 7281 bdev->internal.claim_module = NULL; 7282 } 7283 7284 struct spdk_bdev * 7285 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 7286 { 7287 assert(desc != NULL); 7288 return desc->bdev; 7289 } 7290 7291 int 7292 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 7293 { 7294 struct spdk_bdev *bdev, *tmp; 7295 struct spdk_bdev_desc *desc; 7296 int rc = 0; 7297 7298 assert(fn != NULL); 7299 7300 spdk_spin_lock(&g_bdev_mgr.spinlock); 7301 bdev = spdk_bdev_first(); 7302 while (bdev != NULL) { 7303 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7304 if (rc != 
0) { 7305 break; 7306 } 7307 rc = bdev_open(bdev, false, desc); 7308 if (rc != 0) { 7309 bdev_desc_free(desc); 7310 if (rc == -ENODEV) { 7311 /* Ignore the error and move to the next bdev. */ 7312 rc = 0; 7313 bdev = spdk_bdev_next(bdev); 7314 continue; 7315 } 7316 break; 7317 } 7318 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7319 7320 rc = fn(ctx, bdev); 7321 7322 spdk_spin_lock(&g_bdev_mgr.spinlock); 7323 tmp = spdk_bdev_next(bdev); 7324 bdev_close(bdev, desc); 7325 if (rc != 0) { 7326 break; 7327 } 7328 bdev = tmp; 7329 } 7330 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7331 7332 return rc; 7333 } 7334 7335 int 7336 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 7337 { 7338 struct spdk_bdev *bdev, *tmp; 7339 struct spdk_bdev_desc *desc; 7340 int rc = 0; 7341 7342 assert(fn != NULL); 7343 7344 spdk_spin_lock(&g_bdev_mgr.spinlock); 7345 bdev = spdk_bdev_first_leaf(); 7346 while (bdev != NULL) { 7347 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7348 if (rc != 0) { 7349 break; 7350 } 7351 rc = bdev_open(bdev, false, desc); 7352 if (rc != 0) { 7353 bdev_desc_free(desc); 7354 if (rc == -ENODEV) { 7355 /* Ignore the error and move to the next bdev. */ 7356 rc = 0; 7357 bdev = spdk_bdev_next_leaf(bdev); 7358 continue; 7359 } 7360 break; 7361 } 7362 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7363 7364 rc = fn(ctx, bdev); 7365 7366 spdk_spin_lock(&g_bdev_mgr.spinlock); 7367 tmp = spdk_bdev_next_leaf(bdev); 7368 bdev_close(bdev, desc); 7369 if (rc != 0) { 7370 break; 7371 } 7372 bdev = tmp; 7373 } 7374 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7375 7376 return rc; 7377 } 7378 7379 void 7380 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 7381 { 7382 struct iovec *iovs; 7383 int iovcnt; 7384 7385 if (bdev_io == NULL) { 7386 return; 7387 } 7388 7389 switch (bdev_io->type) { 7390 case SPDK_BDEV_IO_TYPE_READ: 7391 case SPDK_BDEV_IO_TYPE_WRITE: 7392 case SPDK_BDEV_IO_TYPE_ZCOPY: 7393 iovs = bdev_io->u.bdev.iovs; 7394 iovcnt = bdev_io->u.bdev.iovcnt; 7395 break; 7396 default: 7397 iovs = NULL; 7398 iovcnt = 0; 7399 break; 7400 } 7401 7402 if (iovp) { 7403 *iovp = iovs; 7404 } 7405 if (iovcntp) { 7406 *iovcntp = iovcnt; 7407 } 7408 } 7409 7410 void * 7411 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 7412 { 7413 if (bdev_io == NULL) { 7414 return NULL; 7415 } 7416 7417 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 7418 return NULL; 7419 } 7420 7421 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 7422 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 7423 return bdev_io->u.bdev.md_buf; 7424 } 7425 7426 return NULL; 7427 } 7428 7429 void * 7430 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 7431 { 7432 if (bdev_io == NULL) { 7433 assert(false); 7434 return NULL; 7435 } 7436 7437 return bdev_io->internal.caller_ctx; 7438 } 7439 7440 void 7441 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 7442 { 7443 7444 if (spdk_bdev_module_list_find(bdev_module->name)) { 7445 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 7446 assert(false); 7447 } 7448 7449 /* 7450 * Modules with examine callbacks must be initialized first, so they are 7451 * ready to handle examine callbacks from later modules that will 7452 * register physical bdevs. 
7453 */ 7454 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 7455 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 7456 } else { 7457 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 7458 } 7459 } 7460 7461 struct spdk_bdev_module * 7462 spdk_bdev_module_list_find(const char *name) 7463 { 7464 struct spdk_bdev_module *bdev_module; 7465 7466 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 7467 if (strcmp(name, bdev_module->name) == 0) { 7468 break; 7469 } 7470 } 7471 7472 return bdev_module; 7473 } 7474 7475 static void 7476 bdev_write_zero_buffer_next(void *_bdev_io) 7477 { 7478 struct spdk_bdev_io *bdev_io = _bdev_io; 7479 uint64_t num_bytes, num_blocks; 7480 void *md_buf = NULL; 7481 int rc; 7482 7483 num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * 7484 bdev_io->u.bdev.split_remaining_num_blocks, 7485 ZERO_BUFFER_SIZE); 7486 num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); 7487 num_blocks -= num_blocks % bdev_io->bdev->write_unit_size; 7488 7489 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 7490 md_buf = (char *)g_bdev_mgr.zero_buffer + 7491 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 7492 } 7493 7494 rc = bdev_write_blocks_with_md(bdev_io->internal.desc, 7495 spdk_io_channel_from_ctx(bdev_io->internal.ch), 7496 g_bdev_mgr.zero_buffer, md_buf, 7497 bdev_io->u.bdev.split_current_offset_blocks, num_blocks, 7498 bdev_write_zero_buffer_done, bdev_io); 7499 if (rc == 0) { 7500 bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; 7501 bdev_io->u.bdev.split_current_offset_blocks += num_blocks; 7502 } else if (rc == -ENOMEM) { 7503 bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); 7504 } else { 7505 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7506 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 7507 } 7508 } 7509 7510 static void 7511 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 7512 { 7513 struct spdk_bdev_io *parent_io = cb_arg; 7514 7515 spdk_bdev_free_io(bdev_io); 7516 7517 if (!success) { 7518 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7519 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 7520 return; 7521 } 7522 7523 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 7524 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 7525 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 7526 return; 7527 } 7528 7529 bdev_write_zero_buffer_next(parent_io); 7530 } 7531 7532 static void 7533 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 7534 { 7535 spdk_spin_lock(&ctx->bdev->internal.spinlock); 7536 ctx->bdev->internal.qos_mod_in_progress = false; 7537 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 7538 7539 if (ctx->cb_fn) { 7540 ctx->cb_fn(ctx->cb_arg, status); 7541 } 7542 free(ctx); 7543 } 7544 7545 static void 7546 bdev_disable_qos_done(void *cb_arg) 7547 { 7548 struct set_qos_limit_ctx *ctx = cb_arg; 7549 struct spdk_bdev *bdev = ctx->bdev; 7550 struct spdk_bdev_io *bdev_io; 7551 struct spdk_bdev_qos *qos; 7552 7553 spdk_spin_lock(&bdev->internal.spinlock); 7554 qos = bdev->internal.qos; 7555 bdev->internal.qos = NULL; 7556 spdk_spin_unlock(&bdev->internal.spinlock); 7557 7558 while (!TAILQ_EMPTY(&qos->queued)) { 7559 /* Send queued I/O back to their original thread for resubmission. 
*/ 7560 bdev_io = TAILQ_FIRST(&qos->queued); 7561 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 7562 7563 if (bdev_io->internal.io_submit_ch) { 7564 /* 7565 * Channel was changed when sending it to the QoS thread - change it back 7566 * before sending it back to the original thread. 7567 */ 7568 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 7569 bdev_io->internal.io_submit_ch = NULL; 7570 } 7571 7572 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7573 _bdev_io_submit, bdev_io); 7574 } 7575 7576 if (qos->thread != NULL) { 7577 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 7578 spdk_poller_unregister(&qos->poller); 7579 } 7580 7581 free(qos); 7582 7583 bdev_set_qos_limit_done(ctx, 0); 7584 } 7585 7586 static void 7587 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 7588 { 7589 struct set_qos_limit_ctx *ctx = _ctx; 7590 struct spdk_thread *thread; 7591 7592 spdk_spin_lock(&bdev->internal.spinlock); 7593 thread = bdev->internal.qos->thread; 7594 spdk_spin_unlock(&bdev->internal.spinlock); 7595 7596 if (thread != NULL) { 7597 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 7598 } else { 7599 bdev_disable_qos_done(ctx); 7600 } 7601 } 7602 7603 static void 7604 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7605 struct spdk_io_channel *ch, void *_ctx) 7606 { 7607 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 7608 7609 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 7610 7611 spdk_bdev_for_each_channel_continue(i, 0); 7612 } 7613 7614 static void 7615 bdev_update_qos_rate_limit_msg(void *cb_arg) 7616 { 7617 struct set_qos_limit_ctx *ctx = cb_arg; 7618 struct spdk_bdev *bdev = ctx->bdev; 7619 7620 spdk_spin_lock(&bdev->internal.spinlock); 7621 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 7622 spdk_spin_unlock(&bdev->internal.spinlock); 7623 7624 bdev_set_qos_limit_done(ctx, 0); 7625 } 7626 7627 static void 7628 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7629 struct spdk_io_channel *ch, void *_ctx) 7630 { 7631 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 7632 7633 spdk_spin_lock(&bdev->internal.spinlock); 7634 bdev_enable_qos(bdev, bdev_ch); 7635 spdk_spin_unlock(&bdev->internal.spinlock); 7636 spdk_bdev_for_each_channel_continue(i, 0); 7637 } 7638 7639 static void 7640 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 7641 { 7642 struct set_qos_limit_ctx *ctx = _ctx; 7643 7644 bdev_set_qos_limit_done(ctx, status); 7645 } 7646 7647 static void 7648 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 7649 { 7650 int i; 7651 7652 assert(bdev->internal.qos != NULL); 7653 7654 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 7655 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 7656 bdev->internal.qos->rate_limits[i].limit = limits[i]; 7657 7658 if (limits[i] == 0) { 7659 bdev->internal.qos->rate_limits[i].limit = 7660 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 7661 } 7662 } 7663 } 7664 } 7665 7666 void 7667 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 7668 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 7669 { 7670 struct set_qos_limit_ctx *ctx; 7671 uint32_t limit_set_complement; 7672 uint64_t min_limit_per_sec; 7673 int i; 7674 bool disable_rate_limit = true; 7675 7676 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 7677 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 7678 continue; 7679 } 7680 7681 if (limits[i] > 0) { 7682 disable_rate_limit = 
false; 7683 } 7684 7685 if (bdev_qos_is_iops_rate_limit(i) == true) { 7686 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 7687 } else { 7688 /* Change from megabyte to byte rate limit */ 7689 limits[i] = limits[i] * 1024 * 1024; 7690 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 7691 } 7692 7693 limit_set_complement = limits[i] % min_limit_per_sec; 7694 if (limit_set_complement) { 7695 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 7696 limits[i], min_limit_per_sec); 7697 limits[i] += min_limit_per_sec - limit_set_complement; 7698 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 7699 } 7700 } 7701 7702 ctx = calloc(1, sizeof(*ctx)); 7703 if (ctx == NULL) { 7704 cb_fn(cb_arg, -ENOMEM); 7705 return; 7706 } 7707 7708 ctx->cb_fn = cb_fn; 7709 ctx->cb_arg = cb_arg; 7710 ctx->bdev = bdev; 7711 7712 spdk_spin_lock(&bdev->internal.spinlock); 7713 if (bdev->internal.qos_mod_in_progress) { 7714 spdk_spin_unlock(&bdev->internal.spinlock); 7715 free(ctx); 7716 cb_fn(cb_arg, -EAGAIN); 7717 return; 7718 } 7719 bdev->internal.qos_mod_in_progress = true; 7720 7721 if (disable_rate_limit == true && bdev->internal.qos) { 7722 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 7723 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 7724 (bdev->internal.qos->rate_limits[i].limit > 0 && 7725 bdev->internal.qos->rate_limits[i].limit != 7726 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 7727 disable_rate_limit = false; 7728 break; 7729 } 7730 } 7731 } 7732 7733 if (disable_rate_limit == false) { 7734 if (bdev->internal.qos == NULL) { 7735 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 7736 if (!bdev->internal.qos) { 7737 spdk_spin_unlock(&bdev->internal.spinlock); 7738 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 7739 bdev_set_qos_limit_done(ctx, -ENOMEM); 7740 return; 7741 } 7742 } 7743 7744 if (bdev->internal.qos->thread == NULL) { 7745 /* Enabling */ 7746 bdev_set_qos_rate_limits(bdev, limits); 7747 7748 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 7749 bdev_enable_qos_done); 7750 } else { 7751 /* Updating */ 7752 bdev_set_qos_rate_limits(bdev, limits); 7753 7754 spdk_thread_send_msg(bdev->internal.qos->thread, 7755 bdev_update_qos_rate_limit_msg, ctx); 7756 } 7757 } else { 7758 if (bdev->internal.qos != NULL) { 7759 bdev_set_qos_rate_limits(bdev, limits); 7760 7761 /* Disabling */ 7762 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 7763 bdev_disable_qos_msg_done); 7764 } else { 7765 spdk_spin_unlock(&bdev->internal.spinlock); 7766 bdev_set_qos_limit_done(ctx, 0); 7767 return; 7768 } 7769 } 7770 7771 spdk_spin_unlock(&bdev->internal.spinlock); 7772 } 7773 7774 struct spdk_bdev_histogram_ctx { 7775 spdk_bdev_histogram_status_cb cb_fn; 7776 void *cb_arg; 7777 struct spdk_bdev *bdev; 7778 int status; 7779 }; 7780 7781 static void 7782 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 7783 { 7784 struct spdk_bdev_histogram_ctx *ctx = _ctx; 7785 7786 spdk_spin_lock(&ctx->bdev->internal.spinlock); 7787 ctx->bdev->internal.histogram_in_progress = false; 7788 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 7789 ctx->cb_fn(ctx->cb_arg, ctx->status); 7790 free(ctx); 7791 } 7792 7793 static void 7794 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7795 struct spdk_io_channel *_ch, void *_ctx) 7796 { 7797 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7798 7799 if (ch->histogram != NULL) { 7800 
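		/* Free this channel's histogram; per-channel data that was never
		 * merged via spdk_bdev_histogram_get() is discarded when histograms
		 * are disabled.
		 */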
spdk_histogram_data_free(ch->histogram); 7801 ch->histogram = NULL; 7802 } 7803 spdk_bdev_for_each_channel_continue(i, 0); 7804 } 7805 7806 static void 7807 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 7808 { 7809 struct spdk_bdev_histogram_ctx *ctx = _ctx; 7810 7811 if (status != 0) { 7812 ctx->status = status; 7813 ctx->bdev->internal.histogram_enabled = false; 7814 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 7815 bdev_histogram_disable_channel_cb); 7816 } else { 7817 spdk_spin_lock(&ctx->bdev->internal.spinlock); 7818 ctx->bdev->internal.histogram_in_progress = false; 7819 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 7820 ctx->cb_fn(ctx->cb_arg, ctx->status); 7821 free(ctx); 7822 } 7823 } 7824 7825 static void 7826 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7827 struct spdk_io_channel *_ch, void *_ctx) 7828 { 7829 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7830 int status = 0; 7831 7832 if (ch->histogram == NULL) { 7833 ch->histogram = spdk_histogram_data_alloc(); 7834 if (ch->histogram == NULL) { 7835 status = -ENOMEM; 7836 } 7837 } 7838 7839 spdk_bdev_for_each_channel_continue(i, status); 7840 } 7841 7842 void 7843 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 7844 void *cb_arg, bool enable) 7845 { 7846 struct spdk_bdev_histogram_ctx *ctx; 7847 7848 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 7849 if (ctx == NULL) { 7850 cb_fn(cb_arg, -ENOMEM); 7851 return; 7852 } 7853 7854 ctx->bdev = bdev; 7855 ctx->status = 0; 7856 ctx->cb_fn = cb_fn; 7857 ctx->cb_arg = cb_arg; 7858 7859 spdk_spin_lock(&bdev->internal.spinlock); 7860 if (bdev->internal.histogram_in_progress) { 7861 spdk_spin_unlock(&bdev->internal.spinlock); 7862 free(ctx); 7863 cb_fn(cb_arg, -EAGAIN); 7864 return; 7865 } 7866 7867 bdev->internal.histogram_in_progress = true; 7868 spdk_spin_unlock(&bdev->internal.spinlock); 7869 7870 bdev->internal.histogram_enabled = enable; 7871 7872 if (enable) { 7873 /* Allocate histogram for each channel */ 7874 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 7875 bdev_histogram_enable_channel_cb); 7876 } else { 7877 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 7878 bdev_histogram_disable_channel_cb); 7879 } 7880 } 7881 7882 struct spdk_bdev_histogram_data_ctx { 7883 spdk_bdev_histogram_data_cb cb_fn; 7884 void *cb_arg; 7885 struct spdk_bdev *bdev; 7886 /** merged histogram data from all channels */ 7887 struct spdk_histogram_data *histogram; 7888 }; 7889 7890 static void 7891 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 7892 { 7893 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 7894 7895 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 7896 free(ctx); 7897 } 7898 7899 static void 7900 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7901 struct spdk_io_channel *_ch, void *_ctx) 7902 { 7903 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7904 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 7905 int status = 0; 7906 7907 if (ch->histogram == NULL) { 7908 status = -EFAULT; 7909 } else { 7910 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 7911 } 7912 7913 spdk_bdev_for_each_channel_continue(i, status); 7914 } 7915 7916 void 7917 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 7918 spdk_bdev_histogram_data_cb cb_fn, 7919 void *cb_arg) 7920 { 
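	/* The caller-supplied histogram accumulates the data merged from every
	 * channel and is handed back to cb_fn once the per-channel iteration
	 * completes.
	 */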
7921 struct spdk_bdev_histogram_data_ctx *ctx; 7922 7923 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 7924 if (ctx == NULL) { 7925 cb_fn(cb_arg, -ENOMEM, NULL); 7926 return; 7927 } 7928 7929 ctx->bdev = bdev; 7930 ctx->cb_fn = cb_fn; 7931 ctx->cb_arg = cb_arg; 7932 7933 ctx->histogram = histogram; 7934 7935 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 7936 bdev_histogram_get_channel_cb); 7937 } 7938 7939 void 7940 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 7941 void *cb_arg) 7942 { 7943 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 7944 int status = 0; 7945 7946 assert(cb_fn != NULL); 7947 7948 if (bdev_ch->histogram == NULL) { 7949 status = -EFAULT; 7950 } 7951 cb_fn(cb_arg, status, bdev_ch->histogram); 7952 } 7953 7954 size_t 7955 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 7956 size_t max_events) 7957 { 7958 struct media_event_entry *entry; 7959 size_t num_events = 0; 7960 7961 for (; num_events < max_events; ++num_events) { 7962 entry = TAILQ_FIRST(&desc->pending_media_events); 7963 if (entry == NULL) { 7964 break; 7965 } 7966 7967 events[num_events] = entry->event; 7968 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 7969 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 7970 } 7971 7972 return num_events; 7973 } 7974 7975 int 7976 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 7977 size_t num_events) 7978 { 7979 struct spdk_bdev_desc *desc; 7980 struct media_event_entry *entry; 7981 size_t event_id; 7982 int rc = 0; 7983 7984 assert(bdev->media_events); 7985 7986 spdk_spin_lock(&bdev->internal.spinlock); 7987 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 7988 if (desc->write) { 7989 break; 7990 } 7991 } 7992 7993 if (desc == NULL || desc->media_events_buffer == NULL) { 7994 rc = -ENODEV; 7995 goto out; 7996 } 7997 7998 for (event_id = 0; event_id < num_events; ++event_id) { 7999 entry = TAILQ_FIRST(&desc->free_media_events); 8000 if (entry == NULL) { 8001 break; 8002 } 8003 8004 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 8005 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 8006 entry->event = events[event_id]; 8007 } 8008 8009 rc = event_id; 8010 out: 8011 spdk_spin_unlock(&bdev->internal.spinlock); 8012 return rc; 8013 } 8014 8015 void 8016 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 8017 { 8018 struct spdk_bdev_desc *desc; 8019 8020 spdk_spin_lock(&bdev->internal.spinlock); 8021 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 8022 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 8023 desc->callback.event_fn(SPDK_BDEV_EVENT_MEDIA_MANAGEMENT, bdev, 8024 desc->callback.ctx); 8025 } 8026 } 8027 spdk_spin_unlock(&bdev->internal.spinlock); 8028 } 8029 8030 struct locked_lba_range_ctx { 8031 struct lba_range range; 8032 struct spdk_bdev *bdev; 8033 struct lba_range *current_range; 8034 struct lba_range *owner_range; 8035 struct spdk_poller *poller; 8036 lock_range_cb cb_fn; 8037 void *cb_arg; 8038 }; 8039 8040 static void 8041 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8042 { 8043 struct locked_lba_range_ctx *ctx = _ctx; 8044 8045 ctx->cb_fn(ctx->cb_arg, -ENOMEM); 8046 free(ctx); 8047 } 8048 8049 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 8050 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 8051 8052 static void 8053 
bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8054 { 8055 struct locked_lba_range_ctx *ctx = _ctx; 8056 8057 if (status == -ENOMEM) { 8058 /* One of the channels could not allocate a range object. 8059 * So we have to go back and clean up any ranges that were 8060 * allocated successfully before we return error status to 8061 * the caller. We can reuse the unlock function to do that 8062 * clean up. 8063 */ 8064 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 8065 bdev_lock_error_cleanup_cb); 8066 return; 8067 } 8068 8069 /* All channels have locked this range and no I/O overlapping the range 8070 * are outstanding! Set the owner_ch for the range object for the 8071 * locking channel, so that this channel will know that it is allowed 8072 * to write to this range. 8073 */ 8074 ctx->owner_range->owner_ch = ctx->range.owner_ch; 8075 ctx->cb_fn(ctx->cb_arg, status); 8076 8077 /* Don't free the ctx here. Its range is in the bdev's global list of 8078 * locked ranges still, and will be removed and freed when this range 8079 * is later unlocked. 8080 */ 8081 } 8082 8083 static int 8084 bdev_lock_lba_range_check_io(void *_i) 8085 { 8086 struct spdk_bdev_channel_iter *i = _i; 8087 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 8088 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8089 struct locked_lba_range_ctx *ctx = i->ctx; 8090 struct lba_range *range = ctx->current_range; 8091 struct spdk_bdev_io *bdev_io; 8092 8093 spdk_poller_unregister(&ctx->poller); 8094 8095 /* The range is now in the locked_ranges, so no new IO can be submitted to this 8096 * range. But we need to wait until any outstanding IO overlapping with this range 8097 * are completed. 8098 */ 8099 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 8100 if (bdev_io_range_is_locked(bdev_io, range)) { 8101 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 8102 return SPDK_POLLER_BUSY; 8103 } 8104 } 8105 8106 spdk_bdev_for_each_channel_continue(i, 0); 8107 return SPDK_POLLER_BUSY; 8108 } 8109 8110 static void 8111 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8112 struct spdk_io_channel *_ch, void *_ctx) 8113 { 8114 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8115 struct locked_lba_range_ctx *ctx = _ctx; 8116 struct lba_range *range; 8117 8118 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 8119 if (range->length == ctx->range.length && 8120 range->offset == ctx->range.offset && 8121 range->locked_ctx == ctx->range.locked_ctx) { 8122 /* This range already exists on this channel, so don't add 8123 * it again. This can happen when a new channel is created 8124 * while the for_each_channel operation is in progress. 8125 * Do not check for outstanding I/O in that case, since the 8126 * range was locked before any I/O could be submitted to the 8127 * new channel. 8128 */ 8129 spdk_bdev_for_each_channel_continue(i, 0); 8130 return; 8131 } 8132 } 8133 8134 range = calloc(1, sizeof(*range)); 8135 if (range == NULL) { 8136 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 8137 return; 8138 } 8139 8140 range->length = ctx->range.length; 8141 range->offset = ctx->range.offset; 8142 range->locked_ctx = ctx->range.locked_ctx; 8143 ctx->current_range = range; 8144 if (ctx->range.owner_ch == ch) { 8145 /* This is the range object for the channel that will hold 8146 * the lock. 
Store it in the ctx object so that we can easily 8147 * set its owner_ch after the lock is finally acquired. 8148 */ 8149 ctx->owner_range = range; 8150 } 8151 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 8152 bdev_lock_lba_range_check_io(i); 8153 } 8154 8155 static void 8156 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 8157 { 8158 assert(spdk_get_thread() == spdk_io_channel_get_thread(ctx->range.owner_ch->channel)); 8159 8160 /* We will add a copy of this range to each channel now. */ 8161 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 8162 bdev_lock_lba_range_cb); 8163 } 8164 8165 static bool 8166 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 8167 { 8168 struct lba_range *r; 8169 8170 TAILQ_FOREACH(r, tailq, tailq) { 8171 if (bdev_lba_range_overlapped(range, r)) { 8172 return true; 8173 } 8174 } 8175 return false; 8176 } 8177 8178 static int 8179 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 8180 uint64_t offset, uint64_t length, 8181 lock_range_cb cb_fn, void *cb_arg) 8182 { 8183 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8184 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8185 struct locked_lba_range_ctx *ctx; 8186 8187 if (cb_arg == NULL) { 8188 SPDK_ERRLOG("cb_arg must not be NULL\n"); 8189 return -EINVAL; 8190 } 8191 8192 ctx = calloc(1, sizeof(*ctx)); 8193 if (ctx == NULL) { 8194 return -ENOMEM; 8195 } 8196 8197 ctx->range.offset = offset; 8198 ctx->range.length = length; 8199 ctx->range.owner_ch = ch; 8200 ctx->range.locked_ctx = cb_arg; 8201 ctx->bdev = bdev; 8202 ctx->cb_fn = cb_fn; 8203 ctx->cb_arg = cb_arg; 8204 8205 spdk_spin_lock(&bdev->internal.spinlock); 8206 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 8207 /* There is an active lock overlapping with this range. 8208 * Put it on the pending list until this range no 8209 * longer overlaps with another. 8210 */ 8211 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 8212 } else { 8213 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 8214 bdev_lock_lba_range_ctx(bdev, ctx); 8215 } 8216 spdk_spin_unlock(&bdev->internal.spinlock); 8217 return 0; 8218 } 8219 8220 static void 8221 bdev_lock_lba_range_ctx_msg(void *_ctx) 8222 { 8223 struct locked_lba_range_ctx *ctx = _ctx; 8224 8225 bdev_lock_lba_range_ctx(ctx->bdev, ctx); 8226 } 8227 8228 static void 8229 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8230 { 8231 struct locked_lba_range_ctx *ctx = _ctx; 8232 struct locked_lba_range_ctx *pending_ctx; 8233 struct lba_range *range, *tmp; 8234 8235 spdk_spin_lock(&bdev->internal.spinlock); 8236 /* Check if there are any pending locked ranges that overlap with this range 8237 * that was just unlocked. If there are, check that it doesn't overlap with any 8238 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 8239 * the lock process. 
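 * Pending ranges that still conflict with another locked range stay on the
 * pending list and are re-checked the next time an overlapping range is
 * unlocked.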
8240 */ 8241 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 8242 if (bdev_lba_range_overlapped(range, &ctx->range) && 8243 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 8244 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 8245 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 8246 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 8247 spdk_thread_send_msg(spdk_io_channel_get_thread(pending_ctx->range.owner_ch->channel), 8248 bdev_lock_lba_range_ctx_msg, pending_ctx); 8249 } 8250 } 8251 spdk_spin_unlock(&bdev->internal.spinlock); 8252 8253 ctx->cb_fn(ctx->cb_arg, status); 8254 free(ctx); 8255 } 8256 8257 static void 8258 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8259 struct spdk_io_channel *_ch, void *_ctx) 8260 { 8261 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8262 struct locked_lba_range_ctx *ctx = _ctx; 8263 TAILQ_HEAD(, spdk_bdev_io) io_locked; 8264 struct spdk_bdev_io *bdev_io; 8265 struct lba_range *range; 8266 8267 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 8268 if (ctx->range.offset == range->offset && 8269 ctx->range.length == range->length && 8270 ctx->range.locked_ctx == range->locked_ctx) { 8271 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 8272 free(range); 8273 break; 8274 } 8275 } 8276 8277 /* Note: we should almost always be able to assert that the range specified 8278 * was found. But there are some very rare corner cases where a new channel 8279 * gets created simultaneously with a range unlock, where this function 8280 * would execute on that new channel and wouldn't have the range. 8281 * We also use this to clean up range allocations when a later allocation 8282 * fails in the locking path. 8283 * So we can't actually assert() here. 8284 */ 8285 8286 /* Swap the locked IO into a temporary list, and then try to submit them again. 8287 * We could hyper-optimize this to only resubmit locked I/O that overlap 8288 * with the range that was just unlocked, but this isn't a performance path so 8289 * we go for simplicity here. 8290 */ 8291 TAILQ_INIT(&io_locked); 8292 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 8293 while (!TAILQ_EMPTY(&io_locked)) { 8294 bdev_io = TAILQ_FIRST(&io_locked); 8295 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 8296 bdev_io_submit(bdev_io); 8297 } 8298 8299 spdk_bdev_for_each_channel_continue(i, 0); 8300 } 8301 8302 static int 8303 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 8304 uint64_t offset, uint64_t length, 8305 lock_range_cb cb_fn, void *cb_arg) 8306 { 8307 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8308 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8309 struct locked_lba_range_ctx *ctx; 8310 struct lba_range *range; 8311 bool range_found = false; 8312 8313 /* Let's make sure the specified channel actually has a lock on 8314 * the specified range. Note that the range must match exactly. 8315 */ 8316 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 8317 if (range->offset == offset && range->length == length && 8318 range->owner_ch == ch && range->locked_ctx == cb_arg) { 8319 range_found = true; 8320 break; 8321 } 8322 } 8323 8324 if (!range_found) { 8325 return -EINVAL; 8326 } 8327 8328 spdk_spin_lock(&bdev->internal.spinlock); 8329 /* We confirmed that this channel has locked the specified range. 
To 8330 * start the unlock the process, we find the range in the bdev's locked_ranges 8331 * and remove it. This ensures new channels don't inherit the locked range. 8332 * Then we will send a message to each channel (including the one specified 8333 * here) to remove the range from its per-channel list. 8334 */ 8335 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 8336 if (range->offset == offset && range->length == length && 8337 range->locked_ctx == cb_arg) { 8338 break; 8339 } 8340 } 8341 if (range == NULL) { 8342 assert(false); 8343 spdk_spin_unlock(&bdev->internal.spinlock); 8344 return -EINVAL; 8345 } 8346 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 8347 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 8348 spdk_spin_unlock(&bdev->internal.spinlock); 8349 8350 ctx->cb_fn = cb_fn; 8351 ctx->cb_arg = cb_arg; 8352 8353 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 8354 bdev_unlock_lba_range_cb); 8355 return 0; 8356 } 8357 8358 int 8359 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 8360 int array_size) 8361 { 8362 if (!bdev) { 8363 return -EINVAL; 8364 } 8365 8366 if (bdev->fn_table->get_memory_domains) { 8367 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 8368 } 8369 8370 return 0; 8371 } 8372 8373 struct spdk_bdev_for_each_io_ctx { 8374 void *ctx; 8375 spdk_bdev_io_fn fn; 8376 spdk_bdev_for_each_io_cb cb; 8377 }; 8378 8379 static void 8380 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8381 struct spdk_io_channel *io_ch, void *_ctx) 8382 { 8383 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 8384 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 8385 struct spdk_bdev_io *bdev_io; 8386 int rc = 0; 8387 8388 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 8389 rc = ctx->fn(ctx->ctx, bdev_io); 8390 if (rc != 0) { 8391 break; 8392 } 8393 } 8394 8395 spdk_bdev_for_each_channel_continue(i, rc); 8396 } 8397 8398 static void 8399 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 8400 { 8401 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 8402 8403 ctx->cb(ctx->ctx, status); 8404 8405 free(ctx); 8406 } 8407 8408 void 8409 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 8410 spdk_bdev_for_each_io_cb cb) 8411 { 8412 struct spdk_bdev_for_each_io_ctx *ctx; 8413 8414 assert(fn != NULL && cb != NULL); 8415 8416 ctx = calloc(1, sizeof(*ctx)); 8417 if (ctx == NULL) { 8418 SPDK_ERRLOG("Failed to allocate context.\n"); 8419 cb(_ctx, -ENOMEM); 8420 return; 8421 } 8422 8423 ctx->ctx = _ctx; 8424 ctx->fn = fn; 8425 ctx->cb = cb; 8426 8427 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 8428 bdev_for_each_io_done); 8429 } 8430 8431 void 8432 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 8433 { 8434 spdk_for_each_channel_continue(iter->i, status); 8435 } 8436 8437 static struct spdk_bdev * 8438 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 8439 { 8440 void *io_device = spdk_io_channel_iter_get_io_device(i); 8441 8442 return __bdev_from_io_dev(io_device); 8443 } 8444 8445 static void 8446 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 8447 { 8448 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 8449 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 8450 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 8451 8452 iter->i = i; 8453 
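	/* Run the caller's per-channel callback on this channel's thread; the
	 * callback is expected to call spdk_bdev_for_each_channel_continue() to
	 * advance the iteration.
	 */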
iter->fn(iter, bdev, ch, iter->ctx); 8454 } 8455 8456 static void 8457 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 8458 { 8459 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 8460 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 8461 8462 iter->i = i; 8463 iter->cpl(bdev, iter->ctx, status); 8464 8465 free(iter); 8466 } 8467 8468 void 8469 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 8470 void *ctx, spdk_bdev_for_each_channel_done cpl) 8471 { 8472 struct spdk_bdev_channel_iter *iter; 8473 8474 assert(bdev != NULL && fn != NULL && ctx != NULL); 8475 8476 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 8477 if (iter == NULL) { 8478 SPDK_ERRLOG("Unable to allocate iterator\n"); 8479 assert(false); 8480 return; 8481 } 8482 8483 iter->fn = fn; 8484 iter->cpl = cpl; 8485 iter->ctx = ctx; 8486 8487 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 8488 iter, bdev_each_channel_cpl); 8489 } 8490 8491 int 8492 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 8493 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 8494 spdk_bdev_io_completion_cb cb, void *cb_arg) 8495 { 8496 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8497 struct spdk_bdev_io *bdev_io; 8498 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 8499 8500 if (!desc->write) { 8501 return -EBADF; 8502 } 8503 8504 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY))) { 8505 SPDK_DEBUGLOG(bdev, "Copy IO type is not supported\n"); 8506 return -ENOTSUP; 8507 } 8508 8509 if (num_blocks == 0) { 8510 SPDK_ERRLOG("Can't copy 0 blocks\n"); 8511 return -EINVAL; 8512 } 8513 8514 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 8515 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 8516 SPDK_DEBUGLOG(bdev, 8517 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 8518 dst_offset_blocks, src_offset_blocks, num_blocks); 8519 return -EINVAL; 8520 } 8521 8522 bdev_io = bdev_channel_get_io(channel); 8523 if (!bdev_io) { 8524 return -ENOMEM; 8525 } 8526 8527 bdev_io->internal.ch = channel; 8528 bdev_io->internal.desc = desc; 8529 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 8530 8531 bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 8532 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 8533 bdev_io->u.bdev.num_blocks = num_blocks; 8534 bdev_io->u.bdev.ext_opts = NULL; 8535 bdev_io_init(bdev_io, bdev, cb_arg, cb); 8536 8537 bdev_io_submit(bdev_io); 8538 return 0; 8539 } 8540 8541 SPDK_LOG_REGISTER_COMPONENT(bdev) 8542 8543 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 8544 { 8545 struct spdk_trace_tpoint_opts opts[] = { 8546 { 8547 "BDEV_IO_START", TRACE_BDEV_IO_START, 8548 OWNER_BDEV, OBJECT_BDEV_IO, 1, 8549 { 8550 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 8551 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 8552 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 8553 { "len", SPDK_TRACE_ARG_TYPE_INT, 8 }, 8554 { "name", SPDK_TRACE_ARG_TYPE_STR, 40} 8555 } 8556 }, 8557 { 8558 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 8559 OWNER_BDEV, OBJECT_BDEV_IO, 0, 8560 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 8561 }, 8562 { 8563 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 8564 OWNER_BDEV, OBJECT_NONE, 1, 8565 { 8566 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 8567 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 8568 } 8569 }, 8570 { 8571 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 8572 OWNER_BDEV, OBJECT_NONE, 0, 8573 { 
8574 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 8575 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 8576 } 8577 }, 8578 }; 8579 8580 8581 spdk_trace_register_owner(OWNER_BDEV, 'b'); 8582 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 8583 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 8584 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 8585 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 8586 } 8587
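/*
 * Usage sketch (illustrative only, not part of the upstream code): a consumer that
 * needs to visit every channel of a bdev can pair spdk_bdev_for_each_channel() with
 * spdk_bdev_for_each_channel_continue().  The example_* names below are hypothetical.
 *
 *	static void
 *	example_per_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
 *			    struct spdk_io_channel *ch, void *ctx)
 *	{
 *		// Runs on each channel's thread; do the per-channel work here.
 *		spdk_bdev_for_each_channel_continue(i, 0);
 *	}
 *
 *	static void
 *	example_done(struct spdk_bdev *bdev, void *ctx, int status)
 *	{
 *		// Runs once on the initiating thread when the iteration completes;
 *		// 'status' carries any error reported by the per-channel callbacks.
 *	}
 *
 *	// Kick off the iteration (ctx must be non-NULL):
 *	// spdk_bdev_for_each_channel(bdev, example_per_channel, ctx, example_done);
 */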