1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (C) 2016 Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #include "spdk/bdev.h" 10 11 #include "spdk/config.h" 12 #include "spdk/env.h" 13 #include "spdk/thread.h" 14 #include "spdk/likely.h" 15 #include "spdk/queue.h" 16 #include "spdk/nvme_spec.h" 17 #include "spdk/scsi_spec.h" 18 #include "spdk/notify.h" 19 #include "spdk/util.h" 20 #include "spdk/trace.h" 21 #include "spdk/dma.h" 22 23 #include "spdk/bdev_module.h" 24 #include "spdk/log.h" 25 #include "spdk/string.h" 26 27 #include "bdev_internal.h" 28 #include "spdk_internal/trace_defs.h" 29 30 #ifdef SPDK_CONFIG_VTUNE 31 #include "ittnotify.h" 32 #include "ittnotify_types.h" 33 int __itt_init_ittlib(const char *, __itt_group_id); 34 #endif 35 36 #define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1) 37 #define SPDK_BDEV_IO_CACHE_SIZE 256 38 #define SPDK_BDEV_AUTO_EXAMINE true 39 #define BUF_SMALL_POOL_SIZE 8191 40 #define BUF_LARGE_POOL_SIZE 1023 41 #define BUF_SMALL_CACHE_SIZE 128 42 #define BUF_LARGE_CACHE_SIZE 16 43 #define NOMEM_THRESHOLD_COUNT 8 44 45 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 46 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 47 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512 48 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000 49 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024) 50 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX 51 #define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000 52 53 /* The maximum number of children requests for a UNMAP or WRITE ZEROES command 54 * when splitting into children requests at a time. 55 */ 56 #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8) 57 #define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000 58 59 /* The maximum number of children requests for a COPY command 60 * when splitting into children requests at a time. 
61 */ 62 #define SPDK_BDEV_MAX_CHILDREN_COPY_REQS (8) 63 64 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 65 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 66 }; 67 68 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 69 70 RB_HEAD(bdev_name_tree, spdk_bdev_name); 71 72 static int 73 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2) 74 { 75 return strcmp(name1->name, name2->name); 76 } 77 78 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp); 79 80 struct spdk_bdev_mgr { 81 struct spdk_mempool *bdev_io_pool; 82 83 void *zero_buffer; 84 85 TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 86 87 struct spdk_bdev_list bdevs; 88 struct bdev_name_tree bdev_names; 89 90 bool init_complete; 91 bool module_init_complete; 92 93 struct spdk_spinlock spinlock; 94 95 #ifdef SPDK_CONFIG_VTUNE 96 __itt_domain *domain; 97 #endif 98 }; 99 100 static struct spdk_bdev_mgr g_bdev_mgr = { 101 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 102 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 103 .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names), 104 .init_complete = false, 105 .module_init_complete = false, 106 }; 107 108 static void 109 __attribute__((constructor)) 110 _bdev_init(void) 111 { 112 spdk_spin_init(&g_bdev_mgr.spinlock); 113 } 114 115 typedef void (*lock_range_cb)(void *ctx, int status); 116 117 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc); 118 119 struct lba_range { 120 uint64_t offset; 121 uint64_t length; 122 void *locked_ctx; 123 struct spdk_bdev_channel *owner_ch; 124 TAILQ_ENTRY(lba_range) tailq; 125 }; 126 127 static struct spdk_bdev_opts g_bdev_opts = { 128 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 129 .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, 130 .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, 131 .small_buf_pool_size = BUF_SMALL_POOL_SIZE, 132 .large_buf_pool_size = BUF_LARGE_POOL_SIZE, 133 }; 134 135 static spdk_bdev_init_cb g_init_cb_fn = NULL; 136 static void *g_init_cb_arg = NULL; 137 138 static spdk_bdev_fini_cb g_fini_cb_fn = NULL; 139 static void *g_fini_cb_arg = NULL; 140 static struct spdk_thread *g_fini_thread = NULL; 141 142 struct spdk_bdev_qos_limit { 143 /** IOs or bytes allowed per second (i.e., 1s). */ 144 uint64_t limit; 145 146 /** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms). 147 * For remaining bytes, allowed to run negative if an I/O is submitted when 148 * some bytes are remaining, but the I/O is bigger than that amount. The 149 * excess will be deducted from the next timeslice. 150 */ 151 int64_t remaining_this_timeslice; 152 153 /** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 154 uint32_t min_per_timeslice; 155 156 /** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */ 157 uint32_t max_per_timeslice; 158 159 /** Function to check whether to queue the IO. */ 160 bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 161 162 /** Function to update for the submitted IO. */ 163 void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io); 164 }; 165 166 struct spdk_bdev_qos { 167 /** Types of structure of rate limits. */ 168 struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 169 170 /** The channel that all I/O are funneled through. */ 171 struct spdk_bdev_channel *ch; 172 173 /** The thread on which the poller is running. */ 174 struct spdk_thread *thread; 175 176 /** Queue of I/O waiting to be issued. 
*/ 177 bdev_io_tailq_t queued; 178 179 /** Size of a timeslice in tsc ticks. */ 180 uint64_t timeslice_size; 181 182 /** Timestamp of start of last timeslice. */ 183 uint64_t last_timeslice; 184 185 /** Poller that processes queued I/O commands each time slice. */ 186 struct spdk_poller *poller; 187 }; 188 189 struct spdk_bdev_mgmt_channel { 190 /* 191 * Each thread keeps a cache of bdev_io - this allows 192 * bdev threads which are *not* DPDK threads to still 193 * benefit from a per-thread bdev_io cache. Without 194 * this, non-DPDK threads fetching from the mempool 195 * incur a cmpxchg on get and put. 196 */ 197 bdev_io_stailq_t per_thread_cache; 198 uint32_t per_thread_cache_count; 199 uint32_t bdev_io_cache_size; 200 201 struct spdk_iobuf_channel iobuf; 202 203 TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources; 204 TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue; 205 }; 206 207 /* 208 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device 209 * will queue here their IO that awaits retry. It makes it possible to retry sending 210 * IO to one bdev after IO from other bdev completes. 211 */ 212 struct spdk_bdev_shared_resource { 213 /* The bdev management channel */ 214 struct spdk_bdev_mgmt_channel *mgmt_ch; 215 216 /* 217 * Count of I/O submitted to bdev module and waiting for completion. 218 * Incremented before submit_request() is called on an spdk_bdev_io. 219 */ 220 uint64_t io_outstanding; 221 222 /* 223 * Queue of IO awaiting retry because of a previous NOMEM status returned 224 * on this channel. 225 */ 226 bdev_io_tailq_t nomem_io; 227 228 /* 229 * Threshold which io_outstanding must drop to before retrying nomem_io. 230 */ 231 uint64_t nomem_threshold; 232 233 /* I/O channel allocated by a bdev module */ 234 struct spdk_io_channel *shared_ch; 235 236 /* Refcount of bdev channels using this resource */ 237 uint32_t ref; 238 239 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 240 }; 241 242 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 243 #define BDEV_CH_QOS_ENABLED (1 << 1) 244 245 struct spdk_bdev_channel { 246 struct spdk_bdev *bdev; 247 248 /* The channel for the underlying device */ 249 struct spdk_io_channel *channel; 250 251 /* Per io_device per thread data */ 252 struct spdk_bdev_shared_resource *shared_resource; 253 254 struct spdk_bdev_io_stat *stat; 255 256 /* 257 * Count of I/O submitted to the underlying dev module through this channel 258 * and waiting for completion. 259 */ 260 uint64_t io_outstanding; 261 262 /* 263 * List of all submitted I/Os including I/O that are generated via splitting. 264 */ 265 bdev_io_tailq_t io_submitted; 266 267 /* 268 * List of spdk_bdev_io that are currently queued because they write to a locked 269 * LBA range. 
270 */ 271 bdev_io_tailq_t io_locked; 272 273 uint32_t flags; 274 275 struct spdk_histogram_data *histogram; 276 277 #ifdef SPDK_CONFIG_VTUNE 278 uint64_t start_tsc; 279 uint64_t interval_tsc; 280 __itt_string_handle *handle; 281 struct spdk_bdev_io_stat *prev_stat; 282 #endif 283 284 bdev_io_tailq_t queued_resets; 285 286 lba_range_tailq_t locked_ranges; 287 }; 288 289 struct media_event_entry { 290 struct spdk_bdev_media_event event; 291 TAILQ_ENTRY(media_event_entry) tailq; 292 }; 293 294 #define MEDIA_EVENT_POOL_SIZE 64 295 296 struct spdk_bdev_desc { 297 struct spdk_bdev *bdev; 298 struct spdk_thread *thread; 299 struct { 300 spdk_bdev_event_cb_t event_fn; 301 void *ctx; 302 } callback; 303 bool closed; 304 bool write; 305 bool memory_domains_supported; 306 struct spdk_spinlock spinlock; 307 uint32_t refs; 308 TAILQ_HEAD(, media_event_entry) pending_media_events; 309 TAILQ_HEAD(, media_event_entry) free_media_events; 310 struct media_event_entry *media_events_buffer; 311 TAILQ_ENTRY(spdk_bdev_desc) link; 312 313 uint64_t timeout_in_sec; 314 spdk_bdev_io_timeout_cb cb_fn; 315 void *cb_arg; 316 struct spdk_poller *io_timeout_poller; 317 }; 318 319 struct spdk_bdev_iostat_ctx { 320 struct spdk_bdev_io_stat *stat; 321 spdk_bdev_get_device_stat_cb cb; 322 void *cb_arg; 323 }; 324 325 struct set_qos_limit_ctx { 326 void (*cb_fn)(void *cb_arg, int status); 327 void *cb_arg; 328 struct spdk_bdev *bdev; 329 }; 330 331 struct spdk_bdev_channel_iter { 332 spdk_bdev_for_each_channel_msg fn; 333 spdk_bdev_for_each_channel_done cpl; 334 struct spdk_io_channel_iter *i; 335 void *ctx; 336 }; 337 338 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 339 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 340 #define __io_ch_to_bdev_ch(io_ch) ((struct spdk_bdev_channel *)spdk_io_channel_get_ctx(io_ch)) 341 #define __io_ch_to_bdev_mgmt_ch(io_ch) ((struct spdk_bdev_mgmt_channel *)spdk_io_channel_get_ctx(io_ch)) 342 343 static inline void bdev_io_complete(void *ctx); 344 345 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 346 static void bdev_write_zero_buffer_next(void *_bdev_io); 347 348 static void bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 349 struct spdk_io_channel *ch, void *_ctx); 350 static void bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status); 351 352 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 353 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 354 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 355 struct spdk_bdev_ext_io_opts *opts, bool copy_opts); 356 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 357 struct iovec *iov, int iovcnt, void *md_buf, 358 uint64_t offset_blocks, uint64_t num_blocks, 359 spdk_bdev_io_completion_cb cb, void *cb_arg, 360 struct spdk_bdev_ext_io_opts *opts, bool copy_opts); 361 362 static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 363 uint64_t offset, uint64_t length, 364 lock_range_cb cb_fn, void *cb_arg); 365 366 static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 367 uint64_t offset, uint64_t length, 368 lock_range_cb cb_fn, void *cb_arg); 369 370 static inline void bdev_io_complete(void *ctx); 371 372 static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort); 373 static bool 
bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *ch, struct spdk_bdev_io *bio_to_abort); 374 375 void 376 spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size) 377 { 378 if (!opts) { 379 SPDK_ERRLOG("opts should not be NULL\n"); 380 return; 381 } 382 383 if (!opts_size) { 384 SPDK_ERRLOG("opts_size should not be zero value\n"); 385 return; 386 } 387 388 opts->opts_size = opts_size; 389 390 #define SET_FIELD(field) \ 391 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \ 392 opts->field = g_bdev_opts.field; \ 393 } \ 394 395 SET_FIELD(bdev_io_pool_size); 396 SET_FIELD(bdev_io_cache_size); 397 SET_FIELD(bdev_auto_examine); 398 SET_FIELD(small_buf_pool_size); 399 SET_FIELD(large_buf_pool_size); 400 401 /* Do not remove this statement, you should always update this statement when you adding a new field, 402 * and do not forget to add the SET_FIELD statement for your added field. */ 403 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size"); 404 405 #undef SET_FIELD 406 } 407 408 SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_small_buf_pool_size, "spdk_bdev_opts.small_buf_pool_size", 409 "v23.05", 0); 410 SPDK_LOG_DEPRECATION_REGISTER(bdev_opts_large_buf_pool_size, "spdk_bdev_opts.large_buf_pool_size", 411 "v23.05", 0); 412 int 413 spdk_bdev_set_opts(struct spdk_bdev_opts *opts) 414 { 415 struct spdk_iobuf_opts iobuf_opts; 416 uint32_t min_pool_size; 417 int rc; 418 419 if (!opts) { 420 SPDK_ERRLOG("opts cannot be NULL\n"); 421 return -1; 422 } 423 424 if (!opts->opts_size) { 425 SPDK_ERRLOG("opts_size inside opts cannot be zero value\n"); 426 return -1; 427 } 428 429 /* 430 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem 431 * initialization. A second mgmt_ch will be created on the same thread when the application starts 432 * but before the deferred put_io_channel event is executed for the first mgmt_ch. 
433 */ 434 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 435 if (opts->bdev_io_pool_size < min_pool_size) { 436 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 437 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 438 spdk_thread_get_count()); 439 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 440 return -1; 441 } 442 443 if (opts->small_buf_pool_size != BUF_SMALL_POOL_SIZE) { 444 SPDK_LOG_DEPRECATED(bdev_opts_small_buf_pool_size); 445 } 446 if (opts->large_buf_pool_size != BUF_LARGE_POOL_SIZE) { 447 SPDK_LOG_DEPRECATED(bdev_opts_large_buf_pool_size); 448 } 449 450 #define SET_FIELD(field) \ 451 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 452 g_bdev_opts.field = opts->field; \ 453 } \ 454 455 SET_FIELD(bdev_io_pool_size); 456 SET_FIELD(bdev_io_cache_size); 457 SET_FIELD(bdev_auto_examine); 458 SET_FIELD(small_buf_pool_size); 459 SET_FIELD(large_buf_pool_size); 460 461 spdk_iobuf_get_opts(&iobuf_opts); 462 iobuf_opts.small_pool_count = opts->small_buf_pool_size; 463 iobuf_opts.large_pool_count = opts->large_buf_pool_size; 464 465 rc = spdk_iobuf_set_opts(&iobuf_opts); 466 if (rc != 0) { 467 SPDK_ERRLOG("Failed to set iobuf opts\n"); 468 return -1; 469 } 470 471 g_bdev_opts.opts_size = opts->opts_size; 472 473 #undef SET_FIELD 474 475 return 0; 476 } 477 478 static struct spdk_bdev * 479 bdev_get_by_name(const char *bdev_name) 480 { 481 struct spdk_bdev_name find; 482 struct spdk_bdev_name *res; 483 484 find.name = (char *)bdev_name; 485 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 486 if (res != NULL) { 487 return res->bdev; 488 } 489 490 return NULL; 491 } 492 493 struct spdk_bdev * 494 spdk_bdev_get_by_name(const char *bdev_name) 495 { 496 struct spdk_bdev *bdev; 497 498 spdk_spin_lock(&g_bdev_mgr.spinlock); 499 bdev = bdev_get_by_name(bdev_name); 500 spdk_spin_unlock(&g_bdev_mgr.spinlock); 501 502 return bdev; 503 } 504 505 struct spdk_bdev_wait_for_examine_ctx { 506 struct spdk_poller *poller; 507 spdk_bdev_wait_for_examine_cb cb_fn; 508 void *cb_arg; 509 }; 510 511 static bool bdev_module_all_actions_completed(void); 512 513 static int 514 bdev_wait_for_examine_cb(void *arg) 515 { 516 struct spdk_bdev_wait_for_examine_ctx *ctx = arg; 517 518 if (!bdev_module_all_actions_completed()) { 519 return SPDK_POLLER_IDLE; 520 } 521 522 spdk_poller_unregister(&ctx->poller); 523 ctx->cb_fn(ctx->cb_arg); 524 free(ctx); 525 526 return SPDK_POLLER_BUSY; 527 } 528 529 int 530 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 531 { 532 struct spdk_bdev_wait_for_examine_ctx *ctx; 533 534 ctx = calloc(1, sizeof(*ctx)); 535 if (ctx == NULL) { 536 return -ENOMEM; 537 } 538 ctx->cb_fn = cb_fn; 539 ctx->cb_arg = cb_arg; 540 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 541 542 return 0; 543 } 544 545 struct spdk_bdev_examine_item { 546 char *name; 547 TAILQ_ENTRY(spdk_bdev_examine_item) link; 548 }; 549 550 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); 551 552 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 553 g_bdev_examine_allowlist); 554 555 static inline bool 556 bdev_examine_allowlist_check(const char *name) 557 { 558 struct spdk_bdev_examine_item *item; 559 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 560 if (strcmp(name, item->name) == 0) { 561 return true; 562 } 563 } 564 return 
false; 565 } 566 567 static inline void 568 bdev_examine_allowlist_free(void) 569 { 570 struct spdk_bdev_examine_item *item; 571 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 572 item = TAILQ_FIRST(&g_bdev_examine_allowlist); 573 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 574 free(item->name); 575 free(item); 576 } 577 } 578 579 static inline bool 580 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 581 { 582 struct spdk_bdev_alias *tmp; 583 if (bdev_examine_allowlist_check(bdev->name)) { 584 return true; 585 } 586 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 587 if (bdev_examine_allowlist_check(tmp->alias.name)) { 588 return true; 589 } 590 } 591 return false; 592 } 593 594 static inline bool 595 bdev_ok_to_examine(struct spdk_bdev *bdev) 596 { 597 if (g_bdev_opts.bdev_auto_examine) { 598 return true; 599 } else { 600 return bdev_in_examine_allowlist(bdev); 601 } 602 } 603 604 static void 605 bdev_examine(struct spdk_bdev *bdev) 606 { 607 struct spdk_bdev_module *module; 608 uint32_t action; 609 610 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 611 if (module->examine_config && bdev_ok_to_examine(bdev)) { 612 action = module->internal.action_in_progress; 613 module->internal.action_in_progress++; 614 module->examine_config(bdev); 615 if (action != module->internal.action_in_progress) { 616 SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n", 617 module->name); 618 } 619 } 620 } 621 622 if (bdev->internal.claim_module && bdev_ok_to_examine(bdev)) { 623 if (bdev->internal.claim_module->examine_disk) { 624 bdev->internal.claim_module->internal.action_in_progress++; 625 bdev->internal.claim_module->examine_disk(bdev); 626 } 627 return; 628 } 629 630 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 631 if (module->examine_disk && bdev_ok_to_examine(bdev)) { 632 module->internal.action_in_progress++; 633 module->examine_disk(bdev); 634 } 635 } 636 } 637 638 int 639 spdk_bdev_examine(const char *name) 640 { 641 struct spdk_bdev *bdev; 642 struct spdk_bdev_examine_item *item; 643 644 if (g_bdev_opts.bdev_auto_examine) { 645 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); 646 return -EINVAL; 647 } 648 649 if (bdev_examine_allowlist_check(name)) { 650 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 651 return -EEXIST; 652 } 653 654 item = calloc(1, sizeof(*item)); 655 if (!item) { 656 return -ENOMEM; 657 } 658 item->name = strdup(name); 659 if (!item->name) { 660 free(item); 661 return -ENOMEM; 662 } 663 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 664 665 bdev = spdk_bdev_get_by_name(name); 666 if (bdev) { 667 bdev_examine(bdev); 668 } 669 return 0; 670 } 671 672 static inline void 673 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 674 { 675 struct spdk_bdev_examine_item *item; 676 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 677 spdk_json_write_object_begin(w); 678 spdk_json_write_named_string(w, "method", "bdev_examine"); 679 spdk_json_write_named_object_begin(w, "params"); 680 spdk_json_write_named_string(w, "name", item->name); 681 spdk_json_write_object_end(w); 682 spdk_json_write_object_end(w); 683 } 684 } 685 686 struct spdk_bdev * 687 spdk_bdev_first(void) 688 { 689 struct spdk_bdev *bdev; 690 691 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 692 if (bdev) { 693 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 694 } 695 696 return bdev; 697 } 698 699 struct spdk_bdev * 700 spdk_bdev_next(struct spdk_bdev 
*prev) 701 { 702 struct spdk_bdev *bdev; 703 704 bdev = TAILQ_NEXT(prev, internal.link); 705 if (bdev) { 706 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 707 } 708 709 return bdev; 710 } 711 712 static struct spdk_bdev * 713 _bdev_next_leaf(struct spdk_bdev *bdev) 714 { 715 while (bdev != NULL) { 716 if (bdev->internal.claim_module == NULL) { 717 return bdev; 718 } else { 719 bdev = TAILQ_NEXT(bdev, internal.link); 720 } 721 } 722 723 return bdev; 724 } 725 726 struct spdk_bdev * 727 spdk_bdev_first_leaf(void) 728 { 729 struct spdk_bdev *bdev; 730 731 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 732 733 if (bdev) { 734 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 735 } 736 737 return bdev; 738 } 739 740 struct spdk_bdev * 741 spdk_bdev_next_leaf(struct spdk_bdev *prev) 742 { 743 struct spdk_bdev *bdev; 744 745 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 746 747 if (bdev) { 748 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 749 } 750 751 return bdev; 752 } 753 754 static inline bool 755 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 756 { 757 return bdev_io->internal.ext_opts && bdev_io->internal.ext_opts->memory_domain; 758 } 759 760 void 761 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 762 { 763 struct iovec *iovs; 764 765 if (bdev_io->u.bdev.iovs == NULL) { 766 bdev_io->u.bdev.iovs = &bdev_io->iov; 767 bdev_io->u.bdev.iovcnt = 1; 768 } 769 770 iovs = bdev_io->u.bdev.iovs; 771 772 assert(iovs != NULL); 773 assert(bdev_io->u.bdev.iovcnt >= 1); 774 775 iovs[0].iov_base = buf; 776 iovs[0].iov_len = len; 777 } 778 779 void 780 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 781 { 782 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 783 bdev_io->u.bdev.md_buf = md_buf; 784 } 785 786 static bool 787 _is_buf_allocated(const struct iovec *iovs) 788 { 789 if (iovs == NULL) { 790 return false; 791 } 792 793 return iovs[0].iov_base != NULL; 794 } 795 796 static bool 797 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 798 { 799 int i; 800 uintptr_t iov_base; 801 802 if (spdk_likely(alignment == 1)) { 803 return true; 804 } 805 806 for (i = 0; i < iovcnt; i++) { 807 iov_base = (uintptr_t)iovs[i].iov_base; 808 if ((iov_base & (alignment - 1)) != 0) { 809 return false; 810 } 811 } 812 813 return true; 814 } 815 816 static void 817 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 818 { 819 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 820 void *buf; 821 822 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 823 buf = bdev_io->internal.buf; 824 bdev_io->internal.buf = NULL; 825 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 826 bdev_io->internal.get_aux_buf_cb = NULL; 827 } else { 828 assert(bdev_io->internal.get_buf_cb != NULL); 829 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 830 bdev_io->internal.get_buf_cb = NULL; 831 } 832 } 833 834 static void 835 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 836 { 837 struct spdk_bdev_io *bdev_io = ctx; 838 839 if (rc) { 840 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 841 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 842 } 843 bdev_io_get_buf_complete(bdev_io, !rc); 844 } 845 846 static void 847 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 848 { 849 int rc = 0; 850 851 /* save original md_buf */ 852 
bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 853 bdev_io->internal.orig_md_iov.iov_len = len; 854 bdev_io->internal.bounce_md_iov.iov_base = md_buf; 855 bdev_io->internal.bounce_md_iov.iov_len = len; 856 /* set bounce md_buf */ 857 bdev_io->u.bdev.md_buf = md_buf; 858 859 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 860 if (bdev_io_use_memory_domain(bdev_io)) { 861 rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain, 862 bdev_io->internal.ext_opts->memory_domain_ctx, 863 &bdev_io->internal.orig_md_iov, 1, 864 &bdev_io->internal.bounce_md_iov, 1, 865 bdev_io->internal.data_transfer_cpl, 866 bdev_io); 867 if (rc == 0) { 868 /* Continue to submit IO in completion callback */ 869 return; 870 } 871 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 872 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain), rc); 873 } else { 874 memcpy(md_buf, bdev_io->internal.orig_md_iov.iov_base, bdev_io->internal.orig_md_iov.iov_len); 875 } 876 } 877 878 assert(bdev_io->internal.data_transfer_cpl); 879 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 880 } 881 882 static void 883 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 884 { 885 struct spdk_bdev *bdev = bdev_io->bdev; 886 uint64_t md_len; 887 void *buf; 888 889 if (spdk_bdev_is_md_separate(bdev)) { 890 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 891 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 892 893 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 894 895 if (bdev_io->u.bdev.md_buf != NULL) { 896 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 897 return; 898 } else { 899 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 900 } 901 } 902 903 bdev_io_get_buf_complete(bdev_io, true); 904 } 905 906 static void 907 _bdev_io_pull_bounce_data_buf_done(void *ctx, int rc) 908 { 909 struct spdk_bdev_io *bdev_io = ctx; 910 911 if (rc) { 912 SPDK_ERRLOG("Failed to get data buffer\n"); 913 assert(bdev_io->internal.data_transfer_cpl); 914 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 915 return; 916 } 917 918 _bdev_io_set_md_buf(bdev_io); 919 } 920 921 static void 922 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 923 bdev_copy_bounce_buffer_cpl cpl_cb) 924 { 925 int rc = 0; 926 927 bdev_io->internal.data_transfer_cpl = cpl_cb; 928 /* save original iovec */ 929 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 930 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 931 /* set bounce iov */ 932 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 933 bdev_io->u.bdev.iovcnt = 1; 934 /* set bounce buffer for this operation */ 935 bdev_io->u.bdev.iovs[0].iov_base = buf; 936 bdev_io->u.bdev.iovs[0].iov_len = len; 937 /* if this is write path, copy data from original buffer to bounce buffer */ 938 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 939 if (bdev_io_use_memory_domain(bdev_io)) { 940 rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain, 941 bdev_io->internal.ext_opts->memory_domain_ctx, 942 bdev_io->internal.orig_iovs, 943 (uint32_t) bdev_io->internal.orig_iovcnt, 944 bdev_io->u.bdev.iovs, 1, 945 _bdev_io_pull_bounce_data_buf_done, 946 bdev_io); 947 if (rc == 0) { 948 /* Continue to submit IO in completion callback */ 949 return; 950 } 951 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 952 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 953 } else { 954 spdk_copy_iovs_to_buf(buf, len, 
bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt); 955 } 956 } 957 958 _bdev_io_pull_bounce_data_buf_done(bdev_io, rc); 959 } 960 961 static void 962 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) 963 { 964 struct spdk_bdev *bdev = bdev_io->bdev; 965 bool buf_allocated; 966 uint64_t alignment; 967 void *aligned_buf; 968 969 bdev_io->internal.buf = buf; 970 971 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 972 bdev_io_get_buf_complete(bdev_io, true); 973 return; 974 } 975 976 alignment = spdk_bdev_get_buf_align(bdev); 977 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 978 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 979 980 if (buf_allocated) { 981 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 982 /* Continue in completion callback */ 983 return; 984 } else { 985 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 986 } 987 988 _bdev_io_set_md_buf(bdev_io); 989 } 990 991 static inline uint64_t 992 bdev_io_get_max_buf_len(struct spdk_bdev_io *bdev_io, uint64_t len) 993 { 994 struct spdk_bdev *bdev = bdev_io->bdev; 995 uint64_t md_len, alignment; 996 997 md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 998 alignment = spdk_bdev_get_buf_align(bdev); 999 1000 return len + alignment + md_len; 1001 } 1002 1003 static void 1004 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 1005 { 1006 struct spdk_bdev_mgmt_channel *ch; 1007 1008 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1009 spdk_iobuf_put(&ch->iobuf, buf, bdev_io_get_max_buf_len(bdev_io, buf_len)); 1010 } 1011 1012 static void 1013 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 1014 { 1015 assert(bdev_io->internal.buf != NULL); 1016 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len); 1017 bdev_io->internal.buf = NULL; 1018 } 1019 1020 void 1021 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1022 { 1023 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1024 1025 assert(buf != NULL); 1026 _bdev_io_put_buf(bdev_io, buf, len); 1027 } 1028 1029 static void 1030 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 1031 { 1032 struct spdk_bdev *bdev = bdev_ch->bdev; 1033 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1034 struct spdk_bdev_io *bdev_io; 1035 1036 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 1037 /* 1038 * Allow some more I/O to complete before retrying the nomem_io queue. 1039 * Some drivers (such as nvme) cannot immediately take a new I/O in 1040 * the context of a completion, because the resources for the I/O are 1041 * not released until control returns to the bdev poller. Also, we 1042 * may require several small I/O to complete before a larger I/O 1043 * (that requires splitting) can be submitted. 
1044 */ 1045 return; 1046 } 1047 1048 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1049 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 1050 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 1051 bdev_io->internal.ch->io_outstanding++; 1052 shared_resource->io_outstanding++; 1053 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1054 bdev_io->internal.error.nvme.cdw0 = 0; 1055 bdev_io->num_retries++; 1056 bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 1057 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 1058 break; 1059 } 1060 } 1061 } 1062 1063 static inline void 1064 _bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1065 struct spdk_bdev_shared_resource *shared_resource) 1066 { 1067 assert(bdev_ch->io_outstanding > 0); 1068 assert(shared_resource->io_outstanding > 0); 1069 bdev_ch->io_outstanding--; 1070 shared_resource->io_outstanding--; 1071 } 1072 1073 static inline bool 1074 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io) 1075 { 1076 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1077 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1078 1079 if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) { 1080 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 1081 /* 1082 * Wait for some of the outstanding I/O to complete before we 1083 * retry any of the nomem_io. Normally we will wait for 1084 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 1085 * depth channels we will instead wait for half to complete. 1086 */ 1087 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 1088 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 1089 return true; 1090 } 1091 1092 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1093 bdev_ch_retry_io(bdev_ch); 1094 } 1095 1096 return false; 1097 } 1098 1099 static void 1100 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1101 { 1102 struct spdk_bdev_io *bdev_io = ctx; 1103 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1104 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1105 1106 if (rc) { 1107 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1108 } 1109 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1110 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). 
1111 */ 1112 bdev_io_put_buf(bdev_io); 1113 1114 /* Continue with IO completion flow */ 1115 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 1116 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 1117 return; 1118 } 1119 1120 bdev_io_complete(bdev_io); 1121 } 1122 1123 static inline void 1124 _bdev_io_push_bounce_md_buffer(struct spdk_bdev_io *bdev_io) 1125 { 1126 int rc = 0; 1127 1128 /* do the same for metadata buffer */ 1129 if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) { 1130 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1131 1132 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1133 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 1134 if (bdev_io_use_memory_domain(bdev_io)) { 1135 /* If memory domain is used then we need to call async push function */ 1136 rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain, 1137 bdev_io->internal.ext_opts->memory_domain_ctx, 1138 &bdev_io->internal.orig_md_iov, 1139 (uint32_t)bdev_io->internal.orig_iovcnt, 1140 &bdev_io->internal.bounce_md_iov, 1, 1141 bdev_io->internal.data_transfer_cpl, 1142 bdev_io); 1143 if (rc == 0) { 1144 /* Continue IO completion in async callback */ 1145 return; 1146 } 1147 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1148 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 1149 } else { 1150 memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1151 bdev_io->internal.orig_md_iov.iov_len); 1152 } 1153 } 1154 } 1155 1156 assert(bdev_io->internal.data_transfer_cpl); 1157 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1158 } 1159 1160 static void 1161 _bdev_io_push_bounce_data_buffer_done(void *ctx, int rc) 1162 { 1163 struct spdk_bdev_io *bdev_io = ctx; 1164 1165 assert(bdev_io->internal.data_transfer_cpl); 1166 1167 if (rc) { 1168 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1169 return; 1170 } 1171 1172 /* set original buffer for this io */ 1173 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 1174 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 1175 /* disable bouncing buffer for this io */ 1176 bdev_io->internal.orig_iovcnt = 0; 1177 bdev_io->internal.orig_iovs = NULL; 1178 1179 _bdev_io_push_bounce_md_buffer(bdev_io); 1180 } 1181 1182 static inline void 1183 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1184 { 1185 int rc = 0; 1186 1187 bdev_io->internal.data_transfer_cpl = cpl_cb; 1188 1189 /* if this is read path, copy data from bounce buffer to original buffer */ 1190 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1191 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 1192 if (bdev_io_use_memory_domain(bdev_io)) { 1193 /* If memory domain is used then we need to call async push function */ 1194 rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain, 1195 bdev_io->internal.ext_opts->memory_domain_ctx, 1196 bdev_io->internal.orig_iovs, 1197 (uint32_t)bdev_io->internal.orig_iovcnt, 1198 &bdev_io->internal.bounce_iov, 1, 1199 _bdev_io_push_bounce_data_buffer_done, 1200 bdev_io); 1201 if (rc == 0) { 1202 /* Continue IO completion in async callback */ 1203 return; 1204 } 1205 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1206 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 1207 } else { 1208 spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs, 1209 bdev_io->internal.orig_iovcnt, 1210 bdev_io->internal.bounce_iov.iov_base, 1211 
bdev_io->internal.bounce_iov.iov_len); 1212 } 1213 } 1214 1215 _bdev_io_push_bounce_data_buffer_done(bdev_io, rc); 1216 } 1217 1218 static void 1219 bdev_io_get_iobuf_cb(struct spdk_iobuf_entry *iobuf, void *buf) 1220 { 1221 struct spdk_bdev_io *bdev_io; 1222 1223 bdev_io = SPDK_CONTAINEROF(iobuf, struct spdk_bdev_io, internal.iobuf); 1224 _bdev_io_set_buf(bdev_io, buf, bdev_io->internal.buf_len); 1225 } 1226 1227 static void 1228 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1229 { 1230 struct spdk_bdev_mgmt_channel *mgmt_ch; 1231 uint64_t max_len; 1232 void *buf; 1233 1234 assert(spdk_bdev_io_get_thread(bdev_io) == spdk_get_thread()); 1235 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1236 max_len = bdev_io_get_max_buf_len(bdev_io, len); 1237 1238 if (spdk_unlikely(max_len > mgmt_ch->iobuf.large.bufsize)) { 1239 SPDK_ERRLOG("Length %" PRIu64 " is larger than allowed\n", max_len); 1240 bdev_io_get_buf_complete(bdev_io, false); 1241 return; 1242 } 1243 1244 bdev_io->internal.buf_len = len; 1245 buf = spdk_iobuf_get(&mgmt_ch->iobuf, max_len, &bdev_io->internal.iobuf, 1246 bdev_io_get_iobuf_cb); 1247 if (buf != NULL) { 1248 _bdev_io_set_buf(bdev_io, buf, len); 1249 } 1250 } 1251 1252 void 1253 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1254 { 1255 struct spdk_bdev *bdev = bdev_io->bdev; 1256 uint64_t alignment; 1257 1258 assert(cb != NULL); 1259 bdev_io->internal.get_buf_cb = cb; 1260 1261 alignment = spdk_bdev_get_buf_align(bdev); 1262 1263 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1264 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1265 /* Buffer already present and aligned */ 1266 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1267 return; 1268 } 1269 1270 bdev_io_get_buf(bdev_io, len); 1271 } 1272 1273 static void 1274 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1275 bool success) 1276 { 1277 if (!success) { 1278 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1279 bdev_io_complete(bdev_io); 1280 } else { 1281 bdev_io_submit(bdev_io); 1282 } 1283 } 1284 1285 static void 1286 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1287 uint64_t len) 1288 { 1289 assert(cb != NULL); 1290 bdev_io->internal.get_buf_cb = cb; 1291 1292 bdev_io_get_buf(bdev_io, len); 1293 } 1294 1295 void 1296 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1297 { 1298 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1299 1300 assert(cb != NULL); 1301 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1302 bdev_io->internal.get_aux_buf_cb = cb; 1303 bdev_io_get_buf(bdev_io, len); 1304 } 1305 1306 static int 1307 bdev_module_get_max_ctx_size(void) 1308 { 1309 struct spdk_bdev_module *bdev_module; 1310 int max_bdev_module_size = 0; 1311 1312 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1313 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1314 max_bdev_module_size = bdev_module->get_ctx_size(); 1315 } 1316 } 1317 1318 return max_bdev_module_size; 1319 } 1320 1321 static void 1322 bdev_qos_config_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 1323 { 1324 int i; 1325 struct spdk_bdev_qos *qos = bdev->internal.qos; 1326 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1327 1328 if (!qos) { 1329 return; 1330 } 1331 1332 spdk_bdev_get_qos_rate_limits(bdev, limits); 1333 1334 
spdk_json_write_object_begin(w); 1335 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1336 1337 spdk_json_write_named_object_begin(w, "params"); 1338 spdk_json_write_named_string(w, "name", bdev->name); 1339 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1340 if (limits[i] > 0) { 1341 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1342 } 1343 } 1344 spdk_json_write_object_end(w); 1345 1346 spdk_json_write_object_end(w); 1347 } 1348 1349 void 1350 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1351 { 1352 struct spdk_bdev_module *bdev_module; 1353 struct spdk_bdev *bdev; 1354 1355 assert(w != NULL); 1356 1357 spdk_json_write_array_begin(w); 1358 1359 spdk_json_write_object_begin(w); 1360 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1361 spdk_json_write_named_object_begin(w, "params"); 1362 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1363 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1364 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1365 spdk_json_write_object_end(w); 1366 spdk_json_write_object_end(w); 1367 1368 bdev_examine_allowlist_config_json(w); 1369 1370 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1371 if (bdev_module->config_json) { 1372 bdev_module->config_json(w); 1373 } 1374 } 1375 1376 spdk_spin_lock(&g_bdev_mgr.spinlock); 1377 1378 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1379 if (bdev->fn_table->write_config_json) { 1380 bdev->fn_table->write_config_json(bdev, w); 1381 } 1382 1383 bdev_qos_config_json(bdev, w); 1384 } 1385 1386 spdk_spin_unlock(&g_bdev_mgr.spinlock); 1387 1388 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1389 spdk_json_write_object_begin(w); 1390 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1391 spdk_json_write_object_end(w); 1392 1393 spdk_json_write_array_end(w); 1394 } 1395 1396 static void 1397 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1398 { 1399 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1400 struct spdk_bdev_io *bdev_io; 1401 1402 spdk_iobuf_channel_fini(&ch->iobuf); 1403 1404 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1405 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1406 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1407 ch->per_thread_cache_count--; 1408 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1409 } 1410 1411 assert(ch->per_thread_cache_count == 0); 1412 } 1413 1414 static int 1415 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1416 { 1417 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1418 struct spdk_bdev_io *bdev_io; 1419 uint32_t i; 1420 int rc; 1421 1422 rc = spdk_iobuf_channel_init(&ch->iobuf, "bdev", BUF_SMALL_CACHE_SIZE, BUF_LARGE_CACHE_SIZE); 1423 if (rc != 0) { 1424 SPDK_ERRLOG("Failed to create iobuf channel: %s\n", spdk_strerror(-rc)); 1425 return -1; 1426 } 1427 1428 STAILQ_INIT(&ch->per_thread_cache); 1429 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 1430 1431 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. 
*/ 1432 ch->per_thread_cache_count = 0; 1433 for (i = 0; i < ch->bdev_io_cache_size; i++) { 1434 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1435 if (bdev_io == NULL) { 1436 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 1437 assert(false); 1438 bdev_mgmt_channel_destroy(io_device, ctx_buf); 1439 return -1; 1440 } 1441 ch->per_thread_cache_count++; 1442 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1443 } 1444 1445 TAILQ_INIT(&ch->shared_resources); 1446 TAILQ_INIT(&ch->io_wait_queue); 1447 1448 return 0; 1449 } 1450 1451 static void 1452 bdev_init_complete(int rc) 1453 { 1454 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 1455 void *cb_arg = g_init_cb_arg; 1456 struct spdk_bdev_module *m; 1457 1458 g_bdev_mgr.init_complete = true; 1459 g_init_cb_fn = NULL; 1460 g_init_cb_arg = NULL; 1461 1462 /* 1463 * For modules that need to know when subsystem init is complete, 1464 * inform them now. 1465 */ 1466 if (rc == 0) { 1467 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1468 if (m->init_complete) { 1469 m->init_complete(); 1470 } 1471 } 1472 } 1473 1474 cb_fn(cb_arg, rc); 1475 } 1476 1477 static bool 1478 bdev_module_all_actions_completed(void) 1479 { 1480 struct spdk_bdev_module *m; 1481 1482 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1483 if (m->internal.action_in_progress > 0) { 1484 return false; 1485 } 1486 } 1487 return true; 1488 } 1489 1490 static void 1491 bdev_module_action_complete(void) 1492 { 1493 /* 1494 * Don't finish bdev subsystem initialization if 1495 * module pre-initialization is still in progress, or 1496 * the subsystem been already initialized. 1497 */ 1498 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 1499 return; 1500 } 1501 1502 /* 1503 * Check all bdev modules for inits/examinations in progress. If any 1504 * exist, return immediately since we cannot finish bdev subsystem 1505 * initialization until all are completed. 1506 */ 1507 if (!bdev_module_all_actions_completed()) { 1508 return; 1509 } 1510 1511 /* 1512 * Modules already finished initialization - now that all 1513 * the bdev modules have finished their asynchronous I/O 1514 * processing, the entire bdev layer can be marked as complete. 
1515 */ 1516 bdev_init_complete(0); 1517 } 1518 1519 static void 1520 bdev_module_action_done(struct spdk_bdev_module *module) 1521 { 1522 assert(module->internal.action_in_progress > 0); 1523 module->internal.action_in_progress--; 1524 bdev_module_action_complete(); 1525 } 1526 1527 void 1528 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 1529 { 1530 bdev_module_action_done(module); 1531 } 1532 1533 void 1534 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 1535 { 1536 bdev_module_action_done(module); 1537 } 1538 1539 /** The last initialized bdev module */ 1540 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 1541 1542 static void 1543 bdev_init_failed(void *cb_arg) 1544 { 1545 struct spdk_bdev_module *module = cb_arg; 1546 1547 module->internal.action_in_progress--; 1548 bdev_init_complete(-1); 1549 } 1550 1551 static int 1552 bdev_modules_init(void) 1553 { 1554 struct spdk_bdev_module *module; 1555 int rc = 0; 1556 1557 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1558 g_resume_bdev_module = module; 1559 if (module->async_init) { 1560 module->internal.action_in_progress = 1; 1561 } 1562 rc = module->module_init(); 1563 if (rc != 0) { 1564 /* Bump action_in_progress to prevent other modules from completion of modules_init 1565 * Send message to defer application shutdown until resources are cleaned up */ 1566 module->internal.action_in_progress = 1; 1567 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 1568 return rc; 1569 } 1570 } 1571 1572 g_resume_bdev_module = NULL; 1573 return 0; 1574 } 1575 1576 void 1577 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 1578 { 1579 int rc = 0; 1580 char mempool_name[32]; 1581 1582 assert(cb_fn != NULL); 1583 1584 g_init_cb_fn = cb_fn; 1585 g_init_cb_arg = cb_arg; 1586 1587 spdk_notify_type_register("bdev_register"); 1588 spdk_notify_type_register("bdev_unregister"); 1589 1590 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 1591 1592 rc = spdk_iobuf_register_module("bdev"); 1593 if (rc != 0) { 1594 SPDK_ERRLOG("could not register bdev iobuf module: %s\n", spdk_strerror(-rc)); 1595 bdev_init_complete(-1); 1596 return; 1597 } 1598 1599 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 1600 g_bdev_opts.bdev_io_pool_size, 1601 sizeof(struct spdk_bdev_io) + 1602 bdev_module_get_max_ctx_size(), 1603 0, 1604 SPDK_ENV_SOCKET_ID_ANY); 1605 1606 if (g_bdev_mgr.bdev_io_pool == NULL) { 1607 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 1608 bdev_init_complete(-1); 1609 return; 1610 } 1611 1612 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 1613 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 1614 if (!g_bdev_mgr.zero_buffer) { 1615 SPDK_ERRLOG("create bdev zero buffer failed\n"); 1616 bdev_init_complete(-1); 1617 return; 1618 } 1619 1620 #ifdef SPDK_CONFIG_VTUNE 1621 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 1622 #endif 1623 1624 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 1625 bdev_mgmt_channel_destroy, 1626 sizeof(struct spdk_bdev_mgmt_channel), 1627 "bdev_mgr"); 1628 1629 rc = bdev_modules_init(); 1630 g_bdev_mgr.module_init_complete = true; 1631 if (rc != 0) { 1632 SPDK_ERRLOG("bdev modules init failed\n"); 1633 return; 1634 } 1635 1636 bdev_module_action_complete(); 1637 } 1638 1639 static void 1640 bdev_mgr_unregister_cb(void *io_device) 1641 { 1642 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 1643 1644 if (g_bdev_mgr.bdev_io_pool) { 1645 if 
(spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 1646 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 1647 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 1648 g_bdev_opts.bdev_io_pool_size); 1649 } 1650 1651 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 1652 } 1653 1654 spdk_free(g_bdev_mgr.zero_buffer); 1655 1656 bdev_examine_allowlist_free(); 1657 1658 cb_fn(g_fini_cb_arg); 1659 g_fini_cb_fn = NULL; 1660 g_fini_cb_arg = NULL; 1661 g_bdev_mgr.init_complete = false; 1662 g_bdev_mgr.module_init_complete = false; 1663 } 1664 1665 static void 1666 bdev_module_fini_iter(void *arg) 1667 { 1668 struct spdk_bdev_module *bdev_module; 1669 1670 /* FIXME: Handling initialization failures is broken now, 1671 * so we won't even try cleaning up after successfully 1672 * initialized modules. if module_init_complete is false, 1673 * just call spdk_bdev_mgr_unregister_cb 1674 */ 1675 if (!g_bdev_mgr.module_init_complete) { 1676 bdev_mgr_unregister_cb(NULL); 1677 return; 1678 } 1679 1680 /* Start iterating from the last touched module */ 1681 if (!g_resume_bdev_module) { 1682 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1683 } else { 1684 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 1685 internal.tailq); 1686 } 1687 1688 while (bdev_module) { 1689 if (bdev_module->async_fini) { 1690 /* Save our place so we can resume later. We must 1691 * save the variable here, before calling module_fini() 1692 * below, because in some cases the module may immediately 1693 * call spdk_bdev_module_fini_done() and re-enter 1694 * this function to continue iterating. */ 1695 g_resume_bdev_module = bdev_module; 1696 } 1697 1698 if (bdev_module->module_fini) { 1699 bdev_module->module_fini(); 1700 } 1701 1702 if (bdev_module->async_fini) { 1703 return; 1704 } 1705 1706 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 1707 internal.tailq); 1708 } 1709 1710 g_resume_bdev_module = NULL; 1711 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 1712 } 1713 1714 void 1715 spdk_bdev_module_fini_done(void) 1716 { 1717 if (spdk_get_thread() != g_fini_thread) { 1718 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 1719 } else { 1720 bdev_module_fini_iter(NULL); 1721 } 1722 } 1723 1724 static void 1725 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 1726 { 1727 struct spdk_bdev *bdev = cb_arg; 1728 1729 if (bdeverrno && bdev) { 1730 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 1731 bdev->name); 1732 1733 /* 1734 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 1735 * bdev; try to continue by manually removing this bdev from the list and continue 1736 * with the next bdev in the list. 1737 */ 1738 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 1739 } 1740 1741 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 1742 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 1743 /* 1744 * Bdev module finish need to be deferred as we might be in the middle of some context 1745 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 1746 * after returning. 1747 */ 1748 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 1749 return; 1750 } 1751 1752 /* 1753 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 1754 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 1755 * to detect clean shutdown as opposed to run-time hot removal of the underlying 1756 * base bdevs. 
1757 * 1758 * Also, walk the list in the reverse order. 1759 */ 1760 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1761 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1762 if (bdev->internal.claim_module != NULL) { 1763 SPDK_DEBUGLOG(bdev, "Skipping claimed bdev '%s'(<-'%s').\n", 1764 bdev->name, bdev->internal.claim_module->name); 1765 continue; 1766 } 1767 1768 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 1769 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 1770 return; 1771 } 1772 1773 /* 1774 * If any bdev fails to unclaim underlying bdev properly, we may face the 1775 * case of bdev list consisting of claimed bdevs only (if claims are managed 1776 * correctly, this would mean there's a loop in the claims graph which is 1777 * clearly impossible). Warn and unregister last bdev on the list then. 1778 */ 1779 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1780 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1781 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 1782 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 1783 return; 1784 } 1785 } 1786 1787 static void 1788 bdev_module_fini_start_iter(void *arg) 1789 { 1790 struct spdk_bdev_module *bdev_module; 1791 1792 if (!g_resume_bdev_module) { 1793 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1794 } else { 1795 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 1796 } 1797 1798 while (bdev_module) { 1799 if (bdev_module->async_fini_start) { 1800 /* Save our place so we can resume later. We must 1801 * save the variable here, before calling fini_start() 1802 * below, because in some cases the module may immediately 1803 * call spdk_bdev_module_fini_start_done() and re-enter 1804 * this function to continue iterating. 
*/ 1805 g_resume_bdev_module = bdev_module; 1806 } 1807 1808 if (bdev_module->fini_start) { 1809 bdev_module->fini_start(); 1810 } 1811 1812 if (bdev_module->async_fini_start) { 1813 return; 1814 } 1815 1816 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 1817 } 1818 1819 g_resume_bdev_module = NULL; 1820 1821 bdev_finish_unregister_bdevs_iter(NULL, 0); 1822 } 1823 1824 void 1825 spdk_bdev_module_fini_start_done(void) 1826 { 1827 if (spdk_get_thread() != g_fini_thread) { 1828 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 1829 } else { 1830 bdev_module_fini_start_iter(NULL); 1831 } 1832 } 1833 1834 static void 1835 bdev_finish_wait_for_examine_done(void *cb_arg) 1836 { 1837 bdev_module_fini_start_iter(NULL); 1838 } 1839 1840 void 1841 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 1842 { 1843 int rc; 1844 1845 assert(cb_fn != NULL); 1846 1847 g_fini_thread = spdk_get_thread(); 1848 1849 g_fini_cb_fn = cb_fn; 1850 g_fini_cb_arg = cb_arg; 1851 1852 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 1853 if (rc != 0) { 1854 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 1855 bdev_finish_wait_for_examine_done(NULL); 1856 } 1857 } 1858 1859 struct spdk_bdev_io * 1860 bdev_channel_get_io(struct spdk_bdev_channel *channel) 1861 { 1862 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 1863 struct spdk_bdev_io *bdev_io; 1864 1865 if (ch->per_thread_cache_count > 0) { 1866 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1867 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1868 ch->per_thread_cache_count--; 1869 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 1870 /* 1871 * Don't try to look for bdev_ios in the global pool if there are 1872 * waiters on bdev_ios - we don't want this caller to jump the line. 1873 */ 1874 bdev_io = NULL; 1875 } else { 1876 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1877 } 1878 1879 return bdev_io; 1880 } 1881 1882 void 1883 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 1884 { 1885 struct spdk_bdev_mgmt_channel *ch; 1886 1887 assert(bdev_io != NULL); 1888 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 1889 1890 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1891 1892 if (bdev_io->internal.buf != NULL) { 1893 bdev_io_put_buf(bdev_io); 1894 } 1895 1896 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 1897 ch->per_thread_cache_count++; 1898 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1899 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 1900 struct spdk_bdev_io_wait_entry *entry; 1901 1902 entry = TAILQ_FIRST(&ch->io_wait_queue); 1903 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 1904 entry->cb_fn(entry->cb_arg); 1905 } 1906 } else { 1907 /* We should never have a full cache with entries on the io wait queue. 
*/ 1908 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 1909 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1910 } 1911 } 1912 1913 static bool 1914 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 1915 { 1916 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 1917 1918 switch (limit) { 1919 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 1920 return true; 1921 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 1922 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 1923 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 1924 return false; 1925 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 1926 default: 1927 return false; 1928 } 1929 } 1930 1931 static bool 1932 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 1933 { 1934 switch (bdev_io->type) { 1935 case SPDK_BDEV_IO_TYPE_NVME_IO: 1936 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1937 case SPDK_BDEV_IO_TYPE_READ: 1938 case SPDK_BDEV_IO_TYPE_WRITE: 1939 return true; 1940 case SPDK_BDEV_IO_TYPE_ZCOPY: 1941 if (bdev_io->u.bdev.zcopy.start) { 1942 return true; 1943 } else { 1944 return false; 1945 } 1946 default: 1947 return false; 1948 } 1949 } 1950 1951 static bool 1952 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 1953 { 1954 switch (bdev_io->type) { 1955 case SPDK_BDEV_IO_TYPE_NVME_IO: 1956 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1957 /* Bit 1 (0x2) set for read operation */ 1958 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 1959 return true; 1960 } else { 1961 return false; 1962 } 1963 case SPDK_BDEV_IO_TYPE_READ: 1964 return true; 1965 case SPDK_BDEV_IO_TYPE_ZCOPY: 1966 /* Populate to read from disk */ 1967 if (bdev_io->u.bdev.zcopy.populate) { 1968 return true; 1969 } else { 1970 return false; 1971 } 1972 default: 1973 return false; 1974 } 1975 } 1976 1977 static uint64_t 1978 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 1979 { 1980 struct spdk_bdev *bdev = bdev_io->bdev; 1981 1982 switch (bdev_io->type) { 1983 case SPDK_BDEV_IO_TYPE_NVME_IO: 1984 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1985 return bdev_io->u.nvme_passthru.nbytes; 1986 case SPDK_BDEV_IO_TYPE_READ: 1987 case SPDK_BDEV_IO_TYPE_WRITE: 1988 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 1989 case SPDK_BDEV_IO_TYPE_ZCOPY: 1990 /* Track the data in the start phase only */ 1991 if (bdev_io->u.bdev.zcopy.start) { 1992 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 1993 } else { 1994 return 0; 1995 } 1996 default: 1997 return 0; 1998 } 1999 } 2000 2001 static bool 2002 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2003 { 2004 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 2005 return true; 2006 } else { 2007 return false; 2008 } 2009 } 2010 2011 static bool 2012 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2013 { 2014 if (bdev_is_read_io(io) == false) { 2015 return false; 2016 } 2017 2018 return bdev_qos_rw_queue_io(limit, io); 2019 } 2020 2021 static bool 2022 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2023 { 2024 if (bdev_is_read_io(io) == true) { 2025 return false; 2026 } 2027 2028 return bdev_qos_rw_queue_io(limit, io); 2029 } 2030 2031 static void 2032 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2033 { 2034 limit->remaining_this_timeslice--; 2035 } 2036 2037 static void 2038 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2039 { 2040 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 2041 } 2042 2043 static void 2044 
bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2045 { 2046 if (bdev_is_read_io(io) == false) { 2047 return; 2048 } 2049 2050 return bdev_qos_rw_bps_update_quota(limit, io); 2051 } 2052 2053 static void 2054 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2055 { 2056 if (bdev_is_read_io(io) == true) { 2057 return; 2058 } 2059 2060 return bdev_qos_rw_bps_update_quota(limit, io); 2061 } 2062 2063 static void 2064 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2065 { 2066 int i; 2067 2068 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2069 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2070 qos->rate_limits[i].queue_io = NULL; 2071 qos->rate_limits[i].update_quota = NULL; 2072 continue; 2073 } 2074 2075 switch (i) { 2076 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2077 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2078 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2079 break; 2080 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2081 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2082 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2083 break; 2084 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2085 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2086 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2087 break; 2088 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2089 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2090 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2091 break; 2092 default: 2093 break; 2094 } 2095 } 2096 } 2097 2098 static void 2099 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2100 struct spdk_bdev_io *bdev_io, 2101 enum spdk_bdev_io_status status) 2102 { 2103 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2104 2105 bdev_io->internal.in_submit_request = true; 2106 bdev_ch->io_outstanding++; 2107 shared_resource->io_outstanding++; 2108 spdk_bdev_io_complete(bdev_io, status); 2109 bdev_io->internal.in_submit_request = false; 2110 } 2111 2112 static inline void 2113 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2114 { 2115 struct spdk_bdev *bdev = bdev_io->bdev; 2116 struct spdk_io_channel *ch = bdev_ch->channel; 2117 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2118 2119 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2120 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2121 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2122 2123 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2124 bdev_abort_buf_io(mgmt_channel, bio_to_abort)) { 2125 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2126 SPDK_BDEV_IO_STATUS_SUCCESS); 2127 return; 2128 } 2129 } 2130 2131 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2132 bdev_io->bdev->split_on_write_unit && 2133 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2134 SPDK_ERRLOG("IO num_blocks %lu does not match the write_unit_size %u\n", 2135 bdev_io->u.bdev.num_blocks, bdev_io->bdev->write_unit_size); 2136 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2137 return; 2138 } 2139 2140 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2141 bdev_ch->io_outstanding++; 2142 shared_resource->io_outstanding++; 2143 bdev_io->internal.in_submit_request = true; 2144 bdev->fn_table->submit_request(ch, bdev_io); 2145 
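/* If the module completed the I/O synchronously inside submit_request(),
 * in_submit_request was still set at that point; the completion path is
 * expected to defer the user callback in that case rather than invoke it
 * from the submit context. */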
bdev_io->internal.in_submit_request = false; 2146 } else { 2147 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 2148 } 2149 } 2150 2151 static bool 2152 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2153 { 2154 int i; 2155 2156 if (bdev_qos_io_to_limit(bdev_io) == true) { 2157 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2158 if (!qos->rate_limits[i].queue_io) { 2159 continue; 2160 } 2161 2162 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2163 bdev_io) == true) { 2164 return true; 2165 } 2166 } 2167 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2168 if (!qos->rate_limits[i].update_quota) { 2169 continue; 2170 } 2171 2172 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2173 } 2174 } 2175 2176 return false; 2177 } 2178 2179 static int 2180 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2181 { 2182 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2183 int submitted_ios = 0; 2184 2185 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2186 if (!bdev_qos_queue_io(qos, bdev_io)) { 2187 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2188 bdev_io_do_submit(ch, bdev_io); 2189 submitted_ios++; 2190 } 2191 } 2192 2193 return submitted_ios; 2194 } 2195 2196 static void 2197 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2198 { 2199 int rc; 2200 2201 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2202 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2203 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2204 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2205 &bdev_io->internal.waitq_entry); 2206 if (rc != 0) { 2207 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2208 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2209 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2210 } 2211 } 2212 2213 static bool 2214 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2215 { 2216 uint32_t io_boundary; 2217 struct spdk_bdev *bdev = bdev_io->bdev; 2218 uint32_t max_size = bdev->max_segment_size; 2219 int max_segs = bdev->max_num_segments; 2220 2221 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2222 io_boundary = bdev->write_unit_size; 2223 } else if (bdev->split_on_optimal_io_boundary) { 2224 io_boundary = bdev->optimal_io_boundary; 2225 } else { 2226 io_boundary = 0; 2227 } 2228 2229 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2230 return false; 2231 } 2232 2233 if (io_boundary) { 2234 uint64_t start_stripe, end_stripe; 2235 2236 start_stripe = bdev_io->u.bdev.offset_blocks; 2237 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2238 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
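 * For example, with io_boundary = 8 blocks and an I/O starting at
 * offset_blocks = 6 spanning 4 blocks, start_stripe = 6 >> 3 = 0 and
 * end_stripe = 9 >> 3 = 1, so the request crosses a boundary and must be split.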
*/ 2239 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2240 start_stripe >>= spdk_u32log2(io_boundary); 2241 end_stripe >>= spdk_u32log2(io_boundary); 2242 } else { 2243 start_stripe /= io_boundary; 2244 end_stripe /= io_boundary; 2245 } 2246 2247 if (start_stripe != end_stripe) { 2248 return true; 2249 } 2250 } 2251 2252 if (max_segs) { 2253 if (bdev_io->u.bdev.iovcnt > max_segs) { 2254 return true; 2255 } 2256 } 2257 2258 if (max_size) { 2259 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2260 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2261 return true; 2262 } 2263 } 2264 } 2265 2266 return false; 2267 } 2268 2269 static bool 2270 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2271 { 2272 uint32_t num_unmap_segments; 2273 2274 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2275 return false; 2276 } 2277 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2278 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2279 return true; 2280 } 2281 2282 return false; 2283 } 2284 2285 static bool 2286 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2287 { 2288 if (!bdev_io->bdev->max_write_zeroes) { 2289 return false; 2290 } 2291 2292 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2293 return true; 2294 } 2295 2296 return false; 2297 } 2298 2299 static bool 2300 bdev_copy_should_split(struct spdk_bdev_io *bdev_io) 2301 { 2302 if (bdev_io->bdev->max_copy != 0 && 2303 bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_copy) { 2304 return true; 2305 } 2306 2307 return false; 2308 } 2309 2310 static bool 2311 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2312 { 2313 switch (bdev_io->type) { 2314 case SPDK_BDEV_IO_TYPE_READ: 2315 case SPDK_BDEV_IO_TYPE_WRITE: 2316 return bdev_rw_should_split(bdev_io); 2317 case SPDK_BDEV_IO_TYPE_UNMAP: 2318 return bdev_unmap_should_split(bdev_io); 2319 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2320 return bdev_write_zeroes_should_split(bdev_io); 2321 case SPDK_BDEV_IO_TYPE_COPY: 2322 return bdev_copy_should_split(bdev_io); 2323 default: 2324 return false; 2325 } 2326 } 2327 2328 static uint32_t 2329 _to_next_boundary(uint64_t offset, uint32_t boundary) 2330 { 2331 return (boundary - (offset % boundary)); 2332 } 2333 2334 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2335 2336 static void _bdev_rw_split(void *_bdev_io); 2337 2338 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2339 2340 static void 2341 _bdev_unmap_split(void *_bdev_io) 2342 { 2343 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2344 } 2345 2346 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2347 2348 static void 2349 _bdev_write_zeroes_split(void *_bdev_io) 2350 { 2351 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2352 } 2353 2354 static void bdev_copy_split(struct spdk_bdev_io *bdev_io); 2355 2356 static void 2357 _bdev_copy_split(void *_bdev_io) 2358 { 2359 return bdev_copy_split((struct spdk_bdev_io *)_bdev_io); 2360 } 2361 2362 static int 2363 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2364 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2365 { 2366 int rc; 2367 uint64_t current_offset, current_remaining, current_src_offset; 2368 spdk_bdev_io_wait_cb io_wait_fn; 2369 2370 current_offset = *offset; 2371 current_remaining = *remaining; 2372 2373 bdev_io->u.bdev.split_outstanding++; 2374 2375 io_wait_fn = 
_bdev_rw_split; 2376 switch (bdev_io->type) { 2377 case SPDK_BDEV_IO_TYPE_READ: 2378 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2379 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2380 iov, iovcnt, md_buf, current_offset, 2381 num_blocks, 2382 bdev_io_split_done, bdev_io, 2383 bdev_io->internal.ext_opts, true); 2384 break; 2385 case SPDK_BDEV_IO_TYPE_WRITE: 2386 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2387 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2388 iov, iovcnt, md_buf, current_offset, 2389 num_blocks, 2390 bdev_io_split_done, bdev_io, 2391 bdev_io->internal.ext_opts, true); 2392 break; 2393 case SPDK_BDEV_IO_TYPE_UNMAP: 2394 io_wait_fn = _bdev_unmap_split; 2395 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2396 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2397 current_offset, num_blocks, 2398 bdev_io_split_done, bdev_io); 2399 break; 2400 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2401 io_wait_fn = _bdev_write_zeroes_split; 2402 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2403 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2404 current_offset, num_blocks, 2405 bdev_io_split_done, bdev_io); 2406 break; 2407 case SPDK_BDEV_IO_TYPE_COPY: 2408 io_wait_fn = _bdev_copy_split; 2409 current_src_offset = bdev_io->u.bdev.copy.src_offset_blocks + 2410 (current_offset - bdev_io->u.bdev.offset_blocks); 2411 rc = spdk_bdev_copy_blocks(bdev_io->internal.desc, 2412 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2413 current_offset, current_src_offset, num_blocks, 2414 bdev_io_split_done, bdev_io); 2415 break; 2416 default: 2417 assert(false); 2418 rc = -EINVAL; 2419 break; 2420 } 2421 2422 if (rc == 0) { 2423 current_offset += num_blocks; 2424 current_remaining -= num_blocks; 2425 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2426 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2427 *offset = current_offset; 2428 *remaining = current_remaining; 2429 } else { 2430 bdev_io->u.bdev.split_outstanding--; 2431 if (rc == -ENOMEM) { 2432 if (bdev_io->u.bdev.split_outstanding == 0) { 2433 /* No I/O is outstanding. Hence we should wait here. */ 2434 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2435 } 2436 } else { 2437 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2438 if (bdev_io->u.bdev.split_outstanding == 0) { 2439 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2440 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2441 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2442 } 2443 } 2444 } 2445 2446 return rc; 2447 } 2448 2449 static void 2450 _bdev_rw_split(void *_bdev_io) 2451 { 2452 struct iovec *parent_iov, *iov; 2453 struct spdk_bdev_io *bdev_io = _bdev_io; 2454 struct spdk_bdev *bdev = bdev_io->bdev; 2455 uint64_t parent_offset, current_offset, remaining; 2456 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2457 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2458 uint32_t iovcnt, iov_len, child_iovsize; 2459 uint32_t blocklen = bdev->blocklen; 2460 uint32_t io_boundary; 2461 uint32_t max_segment_size = bdev->max_segment_size; 2462 uint32_t max_child_iovcnt = bdev->max_num_segments; 2463 void *md_buf = NULL; 2464 int rc; 2465 2466 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 2467 max_child_iovcnt = max_child_iovcnt ? 
spdk_min(max_child_iovcnt, SPDK_BDEV_IO_NUM_CHILD_IOV) : 2468 SPDK_BDEV_IO_NUM_CHILD_IOV; 2469 2470 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2471 io_boundary = bdev->write_unit_size; 2472 } else if (bdev->split_on_optimal_io_boundary) { 2473 io_boundary = bdev->optimal_io_boundary; 2474 } else { 2475 io_boundary = UINT32_MAX; 2476 } 2477 2478 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2479 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 2480 parent_offset = bdev_io->u.bdev.offset_blocks; 2481 parent_iov_offset = (current_offset - parent_offset) * blocklen; 2482 parent_iovcnt = bdev_io->u.bdev.iovcnt; 2483 2484 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 2485 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2486 if (parent_iov_offset < parent_iov->iov_len) { 2487 break; 2488 } 2489 parent_iov_offset -= parent_iov->iov_len; 2490 } 2491 2492 child_iovcnt = 0; 2493 while (remaining > 0 && parent_iovpos < parent_iovcnt && 2494 child_iovcnt < SPDK_BDEV_IO_NUM_CHILD_IOV) { 2495 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 2496 to_next_boundary = spdk_min(remaining, to_next_boundary); 2497 to_next_boundary_bytes = to_next_boundary * blocklen; 2498 2499 iov = &bdev_io->child_iov[child_iovcnt]; 2500 iovcnt = 0; 2501 2502 if (bdev_io->u.bdev.md_buf) { 2503 md_buf = (char *)bdev_io->u.bdev.md_buf + 2504 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 2505 } 2506 2507 child_iovsize = spdk_min(SPDK_BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 2508 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 2509 iovcnt < child_iovsize) { 2510 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2511 iov_len = parent_iov->iov_len - parent_iov_offset; 2512 2513 iov_len = spdk_min(iov_len, max_segment_size); 2514 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 2515 to_next_boundary_bytes -= iov_len; 2516 2517 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 2518 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 2519 2520 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 2521 parent_iov_offset += iov_len; 2522 } else { 2523 parent_iovpos++; 2524 parent_iov_offset = 0; 2525 } 2526 child_iovcnt++; 2527 iovcnt++; 2528 } 2529 2530 if (to_next_boundary_bytes > 0) { 2531 /* We had to stop this child I/O early because we ran out of 2532 * child_iov space or were limited by max_num_segments. 2533 * Ensure the iovs to be aligned with block size and 2534 * then adjust to_next_boundary before starting the 2535 * child I/O. 2536 */ 2537 assert(child_iovcnt == SPDK_BDEV_IO_NUM_CHILD_IOV || 2538 iovcnt == child_iovsize); 2539 to_last_block_bytes = to_next_boundary_bytes % blocklen; 2540 if (to_last_block_bytes != 0) { 2541 uint32_t child_iovpos = child_iovcnt - 1; 2542 /* don't decrease child_iovcnt when it equals to SPDK_BDEV_IO_NUM_CHILD_IOV 2543 * so the loop will naturally end 2544 */ 2545 2546 to_last_block_bytes = blocklen - to_last_block_bytes; 2547 to_next_boundary_bytes += to_last_block_bytes; 2548 while (to_last_block_bytes > 0 && iovcnt > 0) { 2549 iov_len = spdk_min(to_last_block_bytes, 2550 bdev_io->child_iov[child_iovpos].iov_len); 2551 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 2552 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 2553 child_iovpos--; 2554 if (--iovcnt == 0) { 2555 /* If the child IO is less than a block size just return. 
2556 * If the first child IO of any split round is less than
2557 * a block size, we exit with an error.
2558 */
2559 if (bdev_io->u.bdev.split_outstanding == 0) {
2560 SPDK_ERRLOG("The first child io was less than a block size\n");
2561 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
2562 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx);
2563 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
2564 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
2565 }
2566
2567 return;
2568 }
2569 }
2570
2571 to_last_block_bytes -= iov_len;
2572
2573 if (parent_iov_offset == 0) {
2574 parent_iovpos--;
2575 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len;
2576 }
2577 parent_iov_offset -= iov_len;
2578 }
2579
2580 assert(to_last_block_bytes == 0);
2581 }
2582 to_next_boundary -= to_next_boundary_bytes / blocklen;
2583 }
2584
2585 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary,
2586 &current_offset, &remaining);
2587 if (spdk_unlikely(rc)) {
2588 return;
2589 }
2590 }
2591 }
2592
2593 static void
2594 bdev_unmap_split(struct spdk_bdev_io *bdev_io)
2595 {
2596 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks;
2597 uint32_t num_children_reqs = 0;
2598 int rc;
2599
2600 offset = bdev_io->u.bdev.split_current_offset_blocks;
2601 remaining = bdev_io->u.bdev.split_remaining_num_blocks;
2602 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments;
2603
2604 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
2605 unmap_blocks = spdk_min(remaining, max_unmap_blocks);
2606
2607 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks,
2608 &offset, &remaining);
2609 if (spdk_likely(rc == 0)) {
2610 num_children_reqs++;
2611 } else {
2612 return;
2613 }
2614 }
2615 }
2616
2617 static void
2618 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io)
2619 {
2620 uint64_t offset, write_zeroes_blocks, remaining;
2621 uint32_t num_children_reqs = 0;
2622 int rc;
2623
2624 offset = bdev_io->u.bdev.split_current_offset_blocks;
2625 remaining = bdev_io->u.bdev.split_remaining_num_blocks;
2626
2627 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
2628 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes);
2629
2630 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks,
2631 &offset, &remaining);
2632 if (spdk_likely(rc == 0)) {
2633 num_children_reqs++;
2634 } else {
2635 return;
2636 }
2637 }
2638 }
2639
2640 static void
2641 bdev_copy_split(struct spdk_bdev_io *bdev_io)
2642 {
2643 uint64_t offset, copy_blocks, remaining;
2644 uint32_t num_children_reqs = 0;
2645 int rc;
2646
2647 offset = bdev_io->u.bdev.split_current_offset_blocks;
2648 remaining = bdev_io->u.bdev.split_remaining_num_blocks;
2649
2650 assert(bdev_io->bdev->max_copy != 0);
2651 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_COPY_REQS)) {
2652 copy_blocks = spdk_min(remaining, bdev_io->bdev->max_copy);
2653
2654 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, copy_blocks,
2655 &offset, &remaining);
2656 if (spdk_likely(rc == 0)) {
2657 num_children_reqs++;
2658 } else {
2659 return;
2660 }
2661 }
2662 }
2663
2664 static void
2665 parent_bdev_io_complete(void *ctx, int rc)
2666 {
2667 struct spdk_bdev_io *parent_io = ctx;
2668
2669 if (rc) {
2670 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
2671 }
2672
2673 parent_io->internal.cb(parent_io,
parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 2674 parent_io->internal.caller_ctx); 2675 } 2676 2677 static void 2678 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 2679 { 2680 struct spdk_bdev_io *parent_io = cb_arg; 2681 2682 spdk_bdev_free_io(bdev_io); 2683 2684 if (!success) { 2685 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2686 /* If any child I/O failed, stop further splitting process. */ 2687 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 2688 parent_io->u.bdev.split_remaining_num_blocks = 0; 2689 } 2690 parent_io->u.bdev.split_outstanding--; 2691 if (parent_io->u.bdev.split_outstanding != 0) { 2692 return; 2693 } 2694 2695 /* 2696 * Parent I/O finishes when all blocks are consumed. 2697 */ 2698 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 2699 assert(parent_io->internal.cb != bdev_io_split_done); 2700 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 2701 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 2702 2703 if (parent_io->internal.orig_iovcnt != 0) { 2704 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 2705 /* bdev IO will be completed in the callback */ 2706 } else { 2707 parent_bdev_io_complete(parent_io, 0); 2708 } 2709 return; 2710 } 2711 2712 /* 2713 * Continue with the splitting process. This function will complete the parent I/O if the 2714 * splitting is done. 2715 */ 2716 switch (parent_io->type) { 2717 case SPDK_BDEV_IO_TYPE_READ: 2718 case SPDK_BDEV_IO_TYPE_WRITE: 2719 _bdev_rw_split(parent_io); 2720 break; 2721 case SPDK_BDEV_IO_TYPE_UNMAP: 2722 bdev_unmap_split(parent_io); 2723 break; 2724 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2725 bdev_write_zeroes_split(parent_io); 2726 break; 2727 case SPDK_BDEV_IO_TYPE_COPY: 2728 bdev_copy_split(parent_io); 2729 break; 2730 default: 2731 assert(false); 2732 break; 2733 } 2734 } 2735 2736 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2737 bool success); 2738 2739 static void 2740 bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 2741 { 2742 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 2743 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 2744 bdev_io->u.bdev.split_outstanding = 0; 2745 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 2746 2747 switch (bdev_io->type) { 2748 case SPDK_BDEV_IO_TYPE_READ: 2749 case SPDK_BDEV_IO_TYPE_WRITE: 2750 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 2751 _bdev_rw_split(bdev_io); 2752 } else { 2753 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 2754 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 2755 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 2756 } 2757 break; 2758 case SPDK_BDEV_IO_TYPE_UNMAP: 2759 bdev_unmap_split(bdev_io); 2760 break; 2761 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2762 bdev_write_zeroes_split(bdev_io); 2763 break; 2764 case SPDK_BDEV_IO_TYPE_COPY: 2765 bdev_copy_split(bdev_io); 2766 break; 2767 default: 2768 assert(false); 2769 break; 2770 } 2771 } 2772 2773 static void 2774 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 2775 { 2776 if (!success) { 2777 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2778 return; 2779 } 2780 2781 _bdev_rw_split(bdev_io); 2782 } 2783 2784 /* Explicitly mark this inline, since it's used as a function pointer and 
otherwise won't 2785 * be inlined, at least on some compilers. 2786 */ 2787 static inline void 2788 _bdev_io_submit(void *ctx) 2789 { 2790 struct spdk_bdev_io *bdev_io = ctx; 2791 struct spdk_bdev *bdev = bdev_io->bdev; 2792 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2793 2794 if (spdk_likely(bdev_ch->flags == 0)) { 2795 bdev_io_do_submit(bdev_ch, bdev_io); 2796 return; 2797 } 2798 2799 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 2800 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 2801 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 2802 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 2803 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 2804 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2805 } else { 2806 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 2807 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 2808 } 2809 } else { 2810 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 2811 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2812 } 2813 } 2814 2815 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 2816 2817 bool 2818 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 2819 { 2820 if (range1->length == 0 || range2->length == 0) { 2821 return false; 2822 } 2823 2824 if (range1->offset + range1->length <= range2->offset) { 2825 return false; 2826 } 2827 2828 if (range2->offset + range2->length <= range1->offset) { 2829 return false; 2830 } 2831 2832 return true; 2833 } 2834 2835 static bool 2836 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 2837 { 2838 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2839 struct lba_range r; 2840 2841 switch (bdev_io->type) { 2842 case SPDK_BDEV_IO_TYPE_NVME_IO: 2843 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2844 /* Don't try to decode the NVMe command - just assume worst-case and that 2845 * it overlaps a locked range. 2846 */ 2847 return true; 2848 case SPDK_BDEV_IO_TYPE_WRITE: 2849 case SPDK_BDEV_IO_TYPE_UNMAP: 2850 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2851 case SPDK_BDEV_IO_TYPE_ZCOPY: 2852 case SPDK_BDEV_IO_TYPE_COPY: 2853 r.offset = bdev_io->u.bdev.offset_blocks; 2854 r.length = bdev_io->u.bdev.num_blocks; 2855 if (!bdev_lba_range_overlapped(range, &r)) { 2856 /* This I/O doesn't overlap the specified LBA range. */ 2857 return false; 2858 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 2859 /* This I/O overlaps, but the I/O is on the same channel that locked this 2860 * range, and the caller_ctx is the same as the locked_ctx. This means 2861 * that this I/O is associated with the lock, and is allowed to execute. 
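 * (For example, a read-modify-write sequence issued by the lock holder itself
 * from this channel must still make progress while the range is locked.)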
2862 */ 2863 return false; 2864 } else { 2865 return true; 2866 } 2867 default: 2868 return false; 2869 } 2870 } 2871 2872 void 2873 bdev_io_submit(struct spdk_bdev_io *bdev_io) 2874 { 2875 struct spdk_bdev *bdev = bdev_io->bdev; 2876 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 2877 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2878 2879 assert(thread != NULL); 2880 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2881 2882 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 2883 struct lba_range *range; 2884 2885 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 2886 if (bdev_io_range_is_locked(bdev_io, range)) { 2887 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 2888 return; 2889 } 2890 } 2891 } 2892 2893 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 2894 2895 bdev_io->internal.submit_tsc = spdk_get_ticks(); 2896 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 2897 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 2898 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 2899 spdk_bdev_get_name(bdev)); 2900 2901 if (bdev_io_should_split(bdev_io)) { 2902 bdev_io_split(NULL, bdev_io); 2903 return; 2904 } 2905 2906 if (ch->flags & BDEV_CH_QOS_ENABLED) { 2907 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 2908 _bdev_io_submit(bdev_io); 2909 } else { 2910 bdev_io->internal.io_submit_ch = ch; 2911 bdev_io->internal.ch = bdev->internal.qos->ch; 2912 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 2913 } 2914 } else { 2915 _bdev_io_submit(bdev_io); 2916 } 2917 } 2918 2919 static inline void 2920 _bdev_io_copy_ext_opts(struct spdk_bdev_io *bdev_io, struct spdk_bdev_ext_io_opts *opts) 2921 { 2922 struct spdk_bdev_ext_io_opts *opts_copy = &bdev_io->internal.ext_opts_copy; 2923 2924 /* Zero part we don't copy */ 2925 memset(((char *)opts_copy) + opts->size, 0, sizeof(*opts) - opts->size); 2926 memcpy(opts_copy, opts, opts->size); 2927 opts_copy->size = sizeof(*opts_copy); 2928 opts_copy->metadata = bdev_io->u.bdev.md_buf; 2929 /* Save pointer to the copied ext_opts which will be used by bdev modules */ 2930 bdev_io->u.bdev.ext_opts = opts_copy; 2931 } 2932 2933 static inline void 2934 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 2935 { 2936 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 2937 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 2938 * For write operation we need to pull buffers from memory domain before submitting IO. 
2939 * Once read operation completes, we need to use memory_domain push functionality to 2940 * update data in original memory domain IO buffer 2941 * This IO request will go through a regular IO flow, so clear memory domains pointers in 2942 * the copied ext_opts */ 2943 bdev_io->internal.ext_opts_copy.memory_domain = NULL; 2944 bdev_io->internal.ext_opts_copy.memory_domain_ctx = NULL; 2945 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 2946 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 2947 } 2948 2949 static inline void 2950 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io, 2951 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 2952 { 2953 if (opts) { 2954 bool use_pull_push = opts->memory_domain && !desc->memory_domains_supported; 2955 assert(opts->size <= sizeof(*opts)); 2956 /* 2957 * copy if size is smaller than opts struct to avoid having to check size 2958 * on every access to bdev_io->u.bdev.ext_opts 2959 */ 2960 if (copy_opts || use_pull_push || opts->size < sizeof(*opts)) { 2961 _bdev_io_copy_ext_opts(bdev_io, opts); 2962 if (use_pull_push) { 2963 _bdev_io_ext_use_bounce_buffer(bdev_io); 2964 return; 2965 } 2966 } 2967 } 2968 bdev_io_submit(bdev_io); 2969 } 2970 2971 static void 2972 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 2973 { 2974 struct spdk_bdev *bdev = bdev_io->bdev; 2975 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2976 struct spdk_io_channel *ch = bdev_ch->channel; 2977 2978 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2979 2980 bdev_io->internal.in_submit_request = true; 2981 bdev->fn_table->submit_request(ch, bdev_io); 2982 bdev_io->internal.in_submit_request = false; 2983 } 2984 2985 void 2986 bdev_io_init(struct spdk_bdev_io *bdev_io, 2987 struct spdk_bdev *bdev, void *cb_arg, 2988 spdk_bdev_io_completion_cb cb) 2989 { 2990 bdev_io->bdev = bdev; 2991 bdev_io->internal.caller_ctx = cb_arg; 2992 bdev_io->internal.cb = cb; 2993 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 2994 bdev_io->internal.in_submit_request = false; 2995 bdev_io->internal.buf = NULL; 2996 bdev_io->internal.io_submit_ch = NULL; 2997 bdev_io->internal.orig_iovs = NULL; 2998 bdev_io->internal.orig_iovcnt = 0; 2999 bdev_io->internal.orig_md_iov.iov_base = NULL; 3000 bdev_io->internal.error.nvme.cdw0 = 0; 3001 bdev_io->num_retries = 0; 3002 bdev_io->internal.get_buf_cb = NULL; 3003 bdev_io->internal.get_aux_buf_cb = NULL; 3004 bdev_io->internal.ext_opts = NULL; 3005 bdev_io->internal.data_transfer_cpl = NULL; 3006 } 3007 3008 static bool 3009 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3010 { 3011 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 3012 } 3013 3014 bool 3015 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3016 { 3017 bool supported; 3018 3019 supported = bdev_io_type_supported(bdev, io_type); 3020 3021 if (!supported) { 3022 switch (io_type) { 3023 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3024 /* The bdev layer will emulate write zeroes as long as write is supported. 
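 * When a module lacks native WRITE ZEROES support, the bdev layer falls back
 * to regular writes of an internal zero buffer, splitting the request as needed.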
*/ 3025 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3026 break; 3027 default: 3028 break; 3029 } 3030 } 3031 3032 return supported; 3033 } 3034 3035 uint64_t 3036 spdk_bdev_io_get_submit_tsc(struct spdk_bdev_io *bdev_io) 3037 { 3038 return bdev_io->internal.submit_tsc; 3039 } 3040 3041 int 3042 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3043 { 3044 if (bdev->fn_table->dump_info_json) { 3045 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3046 } 3047 3048 return 0; 3049 } 3050 3051 static void 3052 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3053 { 3054 uint32_t max_per_timeslice = 0; 3055 int i; 3056 3057 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3058 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3059 qos->rate_limits[i].max_per_timeslice = 0; 3060 continue; 3061 } 3062 3063 max_per_timeslice = qos->rate_limits[i].limit * 3064 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3065 3066 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3067 qos->rate_limits[i].min_per_timeslice); 3068 3069 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 3070 } 3071 3072 bdev_qos_set_ops(qos); 3073 } 3074 3075 static int 3076 bdev_channel_poll_qos(void *arg) 3077 { 3078 struct spdk_bdev_qos *qos = arg; 3079 uint64_t now = spdk_get_ticks(); 3080 int i; 3081 3082 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3083 /* We received our callback earlier than expected - return 3084 * immediately and wait to do accounting until at least one 3085 * timeslice has actually expired. This should never happen 3086 * with a well-behaved timer implementation. 3087 */ 3088 return SPDK_POLLER_IDLE; 3089 } 3090 3091 /* Reset for next round of rate limiting */ 3092 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3093 /* We may have allowed the IOs or bytes to slightly overrun in the last 3094 * timeslice. remaining_this_timeslice is signed, so if it's negative 3095 * here, we'll account for the overrun so that the next timeslice will 3096 * be appropriately reduced. 
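 * For example, if a 4096-byte write was admitted against a 1000-byte
 * per-timeslice budget, remaining_this_timeslice sits near -3096 here; it is
 * left negative and each elapsed timeslice adds back max_per_timeslice, so
 * roughly four timeslices pass before that limit admits new I/O.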
3097 */ 3098 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 3099 qos->rate_limits[i].remaining_this_timeslice = 0; 3100 } 3101 } 3102 3103 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3104 qos->last_timeslice += qos->timeslice_size; 3105 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3106 qos->rate_limits[i].remaining_this_timeslice += 3107 qos->rate_limits[i].max_per_timeslice; 3108 } 3109 } 3110 3111 return bdev_qos_io_submit(qos->ch, qos); 3112 } 3113 3114 static void 3115 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3116 { 3117 struct spdk_bdev_shared_resource *shared_resource; 3118 struct lba_range *range; 3119 3120 bdev_io_stat_free(ch->stat); 3121 #ifdef SPDK_CONFIG_VTUNE 3122 bdev_io_stat_free(ch->prev_stat); 3123 #endif 3124 3125 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3126 range = TAILQ_FIRST(&ch->locked_ranges); 3127 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3128 free(range); 3129 } 3130 3131 spdk_put_io_channel(ch->channel); 3132 3133 shared_resource = ch->shared_resource; 3134 3135 assert(TAILQ_EMPTY(&ch->io_locked)); 3136 assert(TAILQ_EMPTY(&ch->io_submitted)); 3137 assert(ch->io_outstanding == 0); 3138 assert(shared_resource->ref > 0); 3139 shared_resource->ref--; 3140 if (shared_resource->ref == 0) { 3141 assert(shared_resource->io_outstanding == 0); 3142 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3143 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3144 free(shared_resource); 3145 } 3146 } 3147 3148 static void 3149 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3150 { 3151 struct spdk_bdev_qos *qos = bdev->internal.qos; 3152 int i; 3153 3154 assert(spdk_spin_held(&bdev->internal.spinlock)); 3155 3156 /* Rate limiting on this bdev enabled */ 3157 if (qos) { 3158 if (qos->ch == NULL) { 3159 struct spdk_io_channel *io_ch; 3160 3161 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3162 bdev->name, spdk_get_thread()); 3163 3164 /* No qos channel has been selected, so set one up */ 3165 3166 /* Take another reference to ch */ 3167 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3168 assert(io_ch != NULL); 3169 qos->ch = ch; 3170 3171 qos->thread = spdk_io_channel_get_thread(io_ch); 3172 3173 TAILQ_INIT(&qos->queued); 3174 3175 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3176 if (bdev_qos_is_iops_rate_limit(i) == true) { 3177 qos->rate_limits[i].min_per_timeslice = 3178 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3179 } else { 3180 qos->rate_limits[i].min_per_timeslice = 3181 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3182 } 3183 3184 if (qos->rate_limits[i].limit == 0) { 3185 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3186 } 3187 } 3188 bdev_qos_update_max_quota_per_timeslice(qos); 3189 qos->timeslice_size = 3190 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3191 qos->last_timeslice = spdk_get_ticks(); 3192 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3193 qos, 3194 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3195 } 3196 3197 ch->flags |= BDEV_CH_QOS_ENABLED; 3198 } 3199 } 3200 3201 struct poll_timeout_ctx { 3202 struct spdk_bdev_desc *desc; 3203 uint64_t timeout_in_sec; 3204 spdk_bdev_io_timeout_cb cb_fn; 3205 void *cb_arg; 3206 }; 3207 3208 static void 3209 bdev_desc_free(struct spdk_bdev_desc *desc) 3210 { 3211 spdk_spin_destroy(&desc->spinlock); 3212 free(desc->media_events_buffer); 3213 free(desc); 3214 } 3215 3216 static void 3217 
bdev_channel_poll_timeout_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 3218 { 3219 struct poll_timeout_ctx *ctx = _ctx; 3220 struct spdk_bdev_desc *desc = ctx->desc; 3221 3222 free(ctx); 3223 3224 spdk_spin_lock(&desc->spinlock); 3225 desc->refs--; 3226 if (desc->closed == true && desc->refs == 0) { 3227 spdk_spin_unlock(&desc->spinlock); 3228 bdev_desc_free(desc); 3229 return; 3230 } 3231 spdk_spin_unlock(&desc->spinlock); 3232 } 3233 3234 static void 3235 bdev_channel_poll_timeout_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 3236 struct spdk_io_channel *io_ch, void *_ctx) 3237 { 3238 struct poll_timeout_ctx *ctx = _ctx; 3239 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 3240 struct spdk_bdev_desc *desc = ctx->desc; 3241 struct spdk_bdev_io *bdev_io; 3242 uint64_t now; 3243 3244 spdk_spin_lock(&desc->spinlock); 3245 if (desc->closed == true) { 3246 spdk_spin_unlock(&desc->spinlock); 3247 spdk_bdev_for_each_channel_continue(i, -1); 3248 return; 3249 } 3250 spdk_spin_unlock(&desc->spinlock); 3251 3252 now = spdk_get_ticks(); 3253 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3254 /* Exclude any I/O that are generated via splitting. */ 3255 if (bdev_io->internal.cb == bdev_io_split_done) { 3256 continue; 3257 } 3258 3259 /* Once we find an I/O that has not timed out, we can immediately 3260 * exit the loop. 3261 */ 3262 if (now < (bdev_io->internal.submit_tsc + 3263 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3264 goto end; 3265 } 3266 3267 if (bdev_io->internal.desc == desc) { 3268 ctx->cb_fn(ctx->cb_arg, bdev_io); 3269 } 3270 } 3271 3272 end: 3273 spdk_bdev_for_each_channel_continue(i, 0); 3274 } 3275 3276 static int 3277 bdev_poll_timeout_io(void *arg) 3278 { 3279 struct spdk_bdev_desc *desc = arg; 3280 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3281 struct poll_timeout_ctx *ctx; 3282 3283 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3284 if (!ctx) { 3285 SPDK_ERRLOG("failed to allocate memory\n"); 3286 return SPDK_POLLER_BUSY; 3287 } 3288 ctx->desc = desc; 3289 ctx->cb_arg = desc->cb_arg; 3290 ctx->cb_fn = desc->cb_fn; 3291 ctx->timeout_in_sec = desc->timeout_in_sec; 3292 3293 /* Take a ref on the descriptor in case it gets closed while we are checking 3294 * all of the channels. 
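 * The reference is released in bdev_channel_poll_timeout_io_done(); if the
 * descriptor was closed while the iteration was in flight, that callback also
 * performs the final bdev_desc_free().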
3295 */ 3296 spdk_spin_lock(&desc->spinlock); 3297 desc->refs++; 3298 spdk_spin_unlock(&desc->spinlock); 3299 3300 spdk_bdev_for_each_channel(bdev, bdev_channel_poll_timeout_io, ctx, 3301 bdev_channel_poll_timeout_io_done); 3302 3303 return SPDK_POLLER_BUSY; 3304 } 3305 3306 int 3307 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3308 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3309 { 3310 assert(desc->thread == spdk_get_thread()); 3311 3312 spdk_poller_unregister(&desc->io_timeout_poller); 3313 3314 if (timeout_in_sec) { 3315 assert(cb_fn != NULL); 3316 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3317 desc, 3318 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3319 1000); 3320 if (desc->io_timeout_poller == NULL) { 3321 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3322 return -1; 3323 } 3324 } 3325 3326 desc->cb_fn = cb_fn; 3327 desc->cb_arg = cb_arg; 3328 desc->timeout_in_sec = timeout_in_sec; 3329 3330 return 0; 3331 } 3332 3333 static int 3334 bdev_channel_create(void *io_device, void *ctx_buf) 3335 { 3336 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3337 struct spdk_bdev_channel *ch = ctx_buf; 3338 struct spdk_io_channel *mgmt_io_ch; 3339 struct spdk_bdev_mgmt_channel *mgmt_ch; 3340 struct spdk_bdev_shared_resource *shared_resource; 3341 struct lba_range *range; 3342 3343 ch->bdev = bdev; 3344 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3345 if (!ch->channel) { 3346 return -1; 3347 } 3348 3349 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3350 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3351 3352 assert(ch->histogram == NULL); 3353 if (bdev->internal.histogram_enabled) { 3354 ch->histogram = spdk_histogram_data_alloc(); 3355 if (ch->histogram == NULL) { 3356 SPDK_ERRLOG("Could not allocate histogram\n"); 3357 } 3358 } 3359 3360 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3361 if (!mgmt_io_ch) { 3362 spdk_put_io_channel(ch->channel); 3363 return -1; 3364 } 3365 3366 mgmt_ch = __io_ch_to_bdev_mgmt_ch(mgmt_io_ch); 3367 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3368 if (shared_resource->shared_ch == ch->channel) { 3369 spdk_put_io_channel(mgmt_io_ch); 3370 shared_resource->ref++; 3371 break; 3372 } 3373 } 3374 3375 if (shared_resource == NULL) { 3376 shared_resource = calloc(1, sizeof(*shared_resource)); 3377 if (shared_resource == NULL) { 3378 spdk_put_io_channel(ch->channel); 3379 spdk_put_io_channel(mgmt_io_ch); 3380 return -1; 3381 } 3382 3383 shared_resource->mgmt_ch = mgmt_ch; 3384 shared_resource->io_outstanding = 0; 3385 TAILQ_INIT(&shared_resource->nomem_io); 3386 shared_resource->nomem_threshold = 0; 3387 shared_resource->shared_ch = ch->channel; 3388 shared_resource->ref = 1; 3389 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3390 } 3391 3392 ch->io_outstanding = 0; 3393 TAILQ_INIT(&ch->queued_resets); 3394 TAILQ_INIT(&ch->locked_ranges); 3395 ch->flags = 0; 3396 ch->shared_resource = shared_resource; 3397 3398 TAILQ_INIT(&ch->io_submitted); 3399 TAILQ_INIT(&ch->io_locked); 3400 3401 ch->stat = bdev_io_stat_alloc(); 3402 if (ch->stat == NULL) { 3403 bdev_channel_destroy_resource(ch); 3404 return -1; 3405 } 3406 3407 ch->stat->ticks_rate = spdk_get_ticks_hz(); 3408 3409 #ifdef SPDK_CONFIG_VTUNE 3410 { 3411 char *name; 3412 __itt_init_ittlib(NULL, 0); 3413 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3414 if (!name) { 3415 bdev_channel_destroy_resource(ch); 3416 
return -1; 3417 } 3418 ch->handle = __itt_string_handle_create(name); 3419 free(name); 3420 ch->start_tsc = spdk_get_ticks(); 3421 ch->interval_tsc = spdk_get_ticks_hz() / 100; 3422 ch->prev_stat = bdev_io_stat_alloc(); 3423 if (ch->prev_stat == NULL) { 3424 bdev_channel_destroy_resource(ch); 3425 return -1; 3426 } 3427 } 3428 #endif 3429 3430 spdk_spin_lock(&bdev->internal.spinlock); 3431 bdev_enable_qos(bdev, ch); 3432 3433 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 3434 struct lba_range *new_range; 3435 3436 new_range = calloc(1, sizeof(*new_range)); 3437 if (new_range == NULL) { 3438 spdk_spin_unlock(&bdev->internal.spinlock); 3439 bdev_channel_destroy_resource(ch); 3440 return -1; 3441 } 3442 new_range->length = range->length; 3443 new_range->offset = range->offset; 3444 new_range->locked_ctx = range->locked_ctx; 3445 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 3446 } 3447 3448 spdk_spin_unlock(&bdev->internal.spinlock); 3449 3450 return 0; 3451 } 3452 3453 static int 3454 bdev_abort_all_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, 3455 void *cb_ctx) 3456 { 3457 struct spdk_bdev_channel *bdev_ch = cb_ctx; 3458 struct spdk_bdev_io *bdev_io; 3459 uint64_t buf_len; 3460 3461 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 3462 if (bdev_io->internal.ch == bdev_ch) { 3463 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 3464 spdk_iobuf_entry_abort(ch, entry, buf_len); 3465 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3466 } 3467 3468 return 0; 3469 } 3470 3471 /* 3472 * Abort I/O that are waiting on a data buffer. 3473 */ 3474 static void 3475 bdev_abort_all_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_channel *ch) 3476 { 3477 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 3478 bdev_abort_all_buf_io_cb, ch); 3479 spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 3480 bdev_abort_all_buf_io_cb, ch); 3481 } 3482 3483 /* 3484 * Abort I/O that are queued waiting for submission. These types of I/O are 3485 * linked using the spdk_bdev_io link TAILQ_ENTRY. 3486 */ 3487 static void 3488 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 3489 { 3490 struct spdk_bdev_io *bdev_io, *tmp; 3491 3492 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 3493 if (bdev_io->internal.ch == ch) { 3494 TAILQ_REMOVE(queue, bdev_io, internal.link); 3495 /* 3496 * spdk_bdev_io_complete() assumes that the completed I/O had 3497 * been submitted to the bdev module. Since in this case it 3498 * hadn't, bump io_outstanding to account for the decrement 3499 * that spdk_bdev_io_complete() will do. 
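 * (The increment is skipped for RESET I/O below because spdk_bdev_io_complete()
 * does not decrement the outstanding counters for resets.)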
3500 */ 3501 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 3502 ch->io_outstanding++; 3503 ch->shared_resource->io_outstanding++; 3504 } 3505 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3506 } 3507 } 3508 } 3509 3510 static bool 3511 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3512 { 3513 struct spdk_bdev_io *bdev_io; 3514 3515 TAILQ_FOREACH(bdev_io, queue, internal.link) { 3516 if (bdev_io == bio_to_abort) { 3517 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 3518 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3519 return true; 3520 } 3521 } 3522 3523 return false; 3524 } 3525 3526 static int 3527 bdev_abort_buf_io_cb(struct spdk_iobuf_channel *ch, struct spdk_iobuf_entry *entry, void *cb_ctx) 3528 { 3529 struct spdk_bdev_io *bdev_io, *bio_to_abort = cb_ctx; 3530 uint64_t buf_len; 3531 3532 bdev_io = SPDK_CONTAINEROF(entry, struct spdk_bdev_io, internal.iobuf); 3533 if (bdev_io == bio_to_abort) { 3534 buf_len = bdev_io_get_max_buf_len(bdev_io, bdev_io->internal.buf_len); 3535 spdk_iobuf_entry_abort(ch, entry, buf_len); 3536 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3537 return 1; 3538 } 3539 3540 return 0; 3541 } 3542 3543 static bool 3544 bdev_abort_buf_io(struct spdk_bdev_mgmt_channel *mgmt_ch, struct spdk_bdev_io *bio_to_abort) 3545 { 3546 int rc; 3547 3548 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.small, 3549 bdev_abort_buf_io_cb, bio_to_abort); 3550 if (rc == 1) { 3551 return true; 3552 } 3553 3554 rc = spdk_iobuf_for_each_entry(&mgmt_ch->iobuf, &mgmt_ch->iobuf.large, 3555 bdev_abort_buf_io_cb, bio_to_abort); 3556 return rc == 1; 3557 } 3558 3559 static void 3560 bdev_qos_channel_destroy(void *cb_arg) 3561 { 3562 struct spdk_bdev_qos *qos = cb_arg; 3563 3564 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 3565 spdk_poller_unregister(&qos->poller); 3566 3567 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 3568 3569 free(qos); 3570 } 3571 3572 static int 3573 bdev_qos_destroy(struct spdk_bdev *bdev) 3574 { 3575 int i; 3576 3577 /* 3578 * Cleanly shutting down the QoS poller is tricky, because 3579 * during the asynchronous operation the user could open 3580 * a new descriptor and create a new channel, spawning 3581 * a new QoS poller. 3582 * 3583 * The strategy is to create a new QoS structure here and swap it 3584 * in. The shutdown path then continues to refer to the old one 3585 * until it completes and then releases it. 3586 */ 3587 struct spdk_bdev_qos *new_qos, *old_qos; 3588 3589 old_qos = bdev->internal.qos; 3590 3591 new_qos = calloc(1, sizeof(*new_qos)); 3592 if (!new_qos) { 3593 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 3594 return -ENOMEM; 3595 } 3596 3597 /* Copy the old QoS data into the newly allocated structure */ 3598 memcpy(new_qos, old_qos, sizeof(*new_qos)); 3599 3600 /* Zero out the key parts of the QoS structure */ 3601 new_qos->ch = NULL; 3602 new_qos->thread = NULL; 3603 new_qos->poller = NULL; 3604 TAILQ_INIT(&new_qos->queued); 3605 /* 3606 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 3607 * It will be used later for the new QoS structure. 
3608 */ 3609 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3610 new_qos->rate_limits[i].remaining_this_timeslice = 0; 3611 new_qos->rate_limits[i].min_per_timeslice = 0; 3612 new_qos->rate_limits[i].max_per_timeslice = 0; 3613 } 3614 3615 bdev->internal.qos = new_qos; 3616 3617 if (old_qos->thread == NULL) { 3618 free(old_qos); 3619 } else { 3620 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 3621 } 3622 3623 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 3624 * been destroyed yet. The destruction path will end up waiting for the final 3625 * channel to be put before it releases resources. */ 3626 3627 return 0; 3628 } 3629 3630 static void 3631 bdev_io_stat_add(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 3632 { 3633 total->bytes_read += add->bytes_read; 3634 total->num_read_ops += add->num_read_ops; 3635 total->bytes_written += add->bytes_written; 3636 total->num_write_ops += add->num_write_ops; 3637 total->bytes_unmapped += add->bytes_unmapped; 3638 total->num_unmap_ops += add->num_unmap_ops; 3639 total->bytes_copied += add->bytes_copied; 3640 total->num_copy_ops += add->num_copy_ops; 3641 total->read_latency_ticks += add->read_latency_ticks; 3642 total->write_latency_ticks += add->write_latency_ticks; 3643 total->unmap_latency_ticks += add->unmap_latency_ticks; 3644 total->copy_latency_ticks += add->copy_latency_ticks; 3645 if (total->max_read_latency_ticks < add->max_read_latency_ticks) { 3646 total->max_read_latency_ticks = add->max_read_latency_ticks; 3647 } 3648 if (total->min_read_latency_ticks > add->min_read_latency_ticks) { 3649 total->min_read_latency_ticks = add->min_read_latency_ticks; 3650 } 3651 if (total->max_write_latency_ticks < add->max_write_latency_ticks) { 3652 total->max_write_latency_ticks = add->max_write_latency_ticks; 3653 } 3654 if (total->min_write_latency_ticks > add->min_write_latency_ticks) { 3655 total->min_write_latency_ticks = add->min_write_latency_ticks; 3656 } 3657 if (total->max_unmap_latency_ticks < add->max_unmap_latency_ticks) { 3658 total->max_unmap_latency_ticks = add->max_unmap_latency_ticks; 3659 } 3660 if (total->min_unmap_latency_ticks > add->min_unmap_latency_ticks) { 3661 total->min_unmap_latency_ticks = add->min_unmap_latency_ticks; 3662 } 3663 if (total->max_copy_latency_ticks < add->max_copy_latency_ticks) { 3664 total->max_copy_latency_ticks = add->max_copy_latency_ticks; 3665 } 3666 if (total->min_copy_latency_ticks > add->min_copy_latency_ticks) { 3667 total->min_copy_latency_ticks = add->min_copy_latency_ticks; 3668 } 3669 } 3670 3671 static void 3672 bdev_io_stat_get(struct spdk_bdev_io_stat *to_stat, struct spdk_bdev_io_stat *from_stat) 3673 { 3674 memcpy(to_stat, from_stat, sizeof(struct spdk_bdev_io_stat)); 3675 } 3676 3677 static void 3678 bdev_io_stat_reset(struct spdk_bdev_io_stat *stat, enum bdev_reset_stat_mode mode) 3679 { 3680 stat->max_read_latency_ticks = 0; 3681 stat->min_read_latency_ticks = UINT64_MAX; 3682 stat->max_write_latency_ticks = 0; 3683 stat->min_write_latency_ticks = UINT64_MAX; 3684 stat->max_unmap_latency_ticks = 0; 3685 stat->min_unmap_latency_ticks = UINT64_MAX; 3686 stat->max_copy_latency_ticks = 0; 3687 stat->min_copy_latency_ticks = UINT64_MAX; 3688 3689 if (mode != BDEV_RESET_STAT_ALL) { 3690 return; 3691 } 3692 3693 stat->bytes_read = 0; 3694 stat->num_read_ops = 0; 3695 stat->bytes_written = 0; 3696 stat->num_write_ops = 0; 3697 stat->bytes_unmapped = 0; 3698 stat->num_unmap_ops = 0; 3699 
stat->bytes_copied = 0; stat->num_copy_ops = 0; stat->read_latency_ticks = 0; 3700 stat->write_latency_ticks = 0; 3701 stat->unmap_latency_ticks = 0; stat->copy_latency_ticks = 0; 3702 } 3703 3704 struct spdk_bdev_io_stat * 3705 bdev_io_stat_alloc(void) 3706 { 3707 struct spdk_bdev_io_stat *stat; 3708 3709 stat = malloc(sizeof(struct spdk_bdev_io_stat)); 3710 if (stat != NULL) { 3711 bdev_io_stat_reset(stat, BDEV_RESET_STAT_ALL); 3712 } 3713 3714 return stat; 3715 } 3716 3717 void 3718 bdev_io_stat_free(struct spdk_bdev_io_stat *stat) 3719 { 3720 free(stat); 3721 } 3722 3723 void 3724 bdev_io_stat_dump_json(struct spdk_bdev_io_stat *stat, struct spdk_json_write_ctx *w) 3725 { 3726 spdk_json_write_named_uint64(w, "bytes_read", stat->bytes_read); 3727 spdk_json_write_named_uint64(w, "num_read_ops", stat->num_read_ops); 3728 spdk_json_write_named_uint64(w, "bytes_written", stat->bytes_written); 3729 spdk_json_write_named_uint64(w, "num_write_ops", stat->num_write_ops); 3730 spdk_json_write_named_uint64(w, "bytes_unmapped", stat->bytes_unmapped); 3731 spdk_json_write_named_uint64(w, "num_unmap_ops", stat->num_unmap_ops); 3732 spdk_json_write_named_uint64(w, "bytes_copied", stat->bytes_copied); 3733 spdk_json_write_named_uint64(w, "num_copy_ops", stat->num_copy_ops); 3734 spdk_json_write_named_uint64(w, "read_latency_ticks", stat->read_latency_ticks); 3735 spdk_json_write_named_uint64(w, "max_read_latency_ticks", stat->max_read_latency_ticks); 3736 spdk_json_write_named_uint64(w, "min_read_latency_ticks", 3737 stat->min_read_latency_ticks != UINT64_MAX ? 3738 stat->min_read_latency_ticks : 0); 3739 spdk_json_write_named_uint64(w, "write_latency_ticks", stat->write_latency_ticks); 3740 spdk_json_write_named_uint64(w, "max_write_latency_ticks", stat->max_write_latency_ticks); 3741 spdk_json_write_named_uint64(w, "min_write_latency_ticks", 3742 stat->min_write_latency_ticks != UINT64_MAX ? 3743 stat->min_write_latency_ticks : 0); 3744 spdk_json_write_named_uint64(w, "unmap_latency_ticks", stat->unmap_latency_ticks); 3745 spdk_json_write_named_uint64(w, "max_unmap_latency_ticks", stat->max_unmap_latency_ticks); 3746 spdk_json_write_named_uint64(w, "min_unmap_latency_ticks", 3747 stat->min_unmap_latency_ticks != UINT64_MAX ? 3748 stat->min_unmap_latency_ticks : 0); 3749 spdk_json_write_named_uint64(w, "copy_latency_ticks", stat->copy_latency_ticks); 3750 spdk_json_write_named_uint64(w, "max_copy_latency_ticks", stat->max_copy_latency_ticks); 3751 spdk_json_write_named_uint64(w, "min_copy_latency_ticks", 3752 stat->min_copy_latency_ticks != UINT64_MAX ? 3753 stat->min_copy_latency_ticks : 0); 3754 } 3755 3756 static void 3757 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 3758 { 3759 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 3760 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 3761 3762 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 3763 bdev_abort_all_buf_io(mgmt_ch, ch); 3765 } 3766 3767 static void 3768 bdev_channel_destroy(void *io_device, void *ctx_buf) 3769 { 3770 struct spdk_bdev_channel *ch = ctx_buf; 3771 3772 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 3773 spdk_get_thread()); 3774 3775 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 3776 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3777 3778 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. 
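The per-bdev total is what spdk_bdev_get_device_stat() starts from before it adds the statistics of the channels that still exist.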
*/ 3779 spdk_spin_lock(&ch->bdev->internal.spinlock); 3780 bdev_io_stat_add(ch->bdev->internal.stat, ch->stat); 3781 spdk_spin_unlock(&ch->bdev->internal.spinlock); 3782 3783 bdev_abort_all_queued_io(&ch->queued_resets, ch); 3784 3785 bdev_channel_abort_queued_ios(ch); 3786 3787 if (ch->histogram) { 3788 spdk_histogram_data_free(ch->histogram); 3789 } 3790 3791 bdev_channel_destroy_resource(ch); 3792 } 3793 3794 /* 3795 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 3796 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 3797 */ 3798 static int 3799 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 3800 { 3801 struct spdk_bdev_name *tmp; 3802 3803 bdev_name->name = strdup(name); 3804 if (bdev_name->name == NULL) { 3805 SPDK_ERRLOG("Unable to allocate bdev name\n"); 3806 return -ENOMEM; 3807 } 3808 3809 bdev_name->bdev = bdev; 3810 3811 spdk_spin_lock(&g_bdev_mgr.spinlock); 3812 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3813 spdk_spin_unlock(&g_bdev_mgr.spinlock); 3814 3815 if (tmp != NULL) { 3816 SPDK_ERRLOG("Bdev name %s already exists\n", name); 3817 free(bdev_name->name); 3818 return -EEXIST; 3819 } 3820 3821 return 0; 3822 } 3823 3824 static void 3825 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 3826 { 3827 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3828 free(bdev_name->name); 3829 } 3830 3831 static void 3832 bdev_name_del(struct spdk_bdev_name *bdev_name) 3833 { 3834 spdk_spin_lock(&g_bdev_mgr.spinlock); 3835 bdev_name_del_unsafe(bdev_name); 3836 spdk_spin_unlock(&g_bdev_mgr.spinlock); 3837 } 3838 3839 int 3840 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 3841 { 3842 struct spdk_bdev_alias *tmp; 3843 int ret; 3844 3845 if (alias == NULL) { 3846 SPDK_ERRLOG("Empty alias passed\n"); 3847 return -EINVAL; 3848 } 3849 3850 tmp = calloc(1, sizeof(*tmp)); 3851 if (tmp == NULL) { 3852 SPDK_ERRLOG("Unable to allocate alias\n"); 3853 return -ENOMEM; 3854 } 3855 3856 ret = bdev_name_add(&tmp->alias, bdev, alias); 3857 if (ret != 0) { 3858 free(tmp); 3859 return ret; 3860 } 3861 3862 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 3863 3864 return 0; 3865 } 3866 3867 static int 3868 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 3869 void (*alias_del_fn)(struct spdk_bdev_name *n)) 3870 { 3871 struct spdk_bdev_alias *tmp; 3872 3873 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 3874 if (strcmp(alias, tmp->alias.name) == 0) { 3875 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 3876 alias_del_fn(&tmp->alias); 3877 free(tmp); 3878 return 0; 3879 } 3880 } 3881 3882 return -ENOENT; 3883 } 3884 3885 int 3886 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 3887 { 3888 int rc; 3889 3890 rc = bdev_alias_del(bdev, alias, bdev_name_del); 3891 if (rc == -ENOENT) { 3892 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 3893 } 3894 3895 return rc; 3896 } 3897 3898 void 3899 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 3900 { 3901 struct spdk_bdev_alias *p, *tmp; 3902 3903 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 3904 TAILQ_REMOVE(&bdev->aliases, p, tailq); 3905 bdev_name_del(&p->alias); 3906 free(p); 3907 } 3908 } 3909 3910 struct spdk_io_channel * 3911 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 3912 { 3913 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 3914 } 3915 3916 void * 3917 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 3918 { 3919 
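/* get_module_ctx is an optional entry in the module's function table; NULL is returned when the module does not provide it. */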
struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3920 void *ctx = NULL; 3921 3922 if (bdev->fn_table->get_module_ctx) { 3923 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 3924 } 3925 3926 return ctx; 3927 } 3928 3929 const char * 3930 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 3931 { 3932 return bdev->module->name; 3933 } 3934 3935 const char * 3936 spdk_bdev_get_name(const struct spdk_bdev *bdev) 3937 { 3938 return bdev->name; 3939 } 3940 3941 const char * 3942 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 3943 { 3944 return bdev->product_name; 3945 } 3946 3947 const struct spdk_bdev_aliases_list * 3948 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 3949 { 3950 return &bdev->aliases; 3951 } 3952 3953 uint32_t 3954 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 3955 { 3956 return bdev->blocklen; 3957 } 3958 3959 uint32_t 3960 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 3961 { 3962 return bdev->write_unit_size; 3963 } 3964 3965 uint64_t 3966 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 3967 { 3968 return bdev->blockcnt; 3969 } 3970 3971 const char * 3972 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 3973 { 3974 return qos_rpc_type[type]; 3975 } 3976 3977 void 3978 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 3979 { 3980 int i; 3981 3982 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 3983 3984 spdk_spin_lock(&bdev->internal.spinlock); 3985 if (bdev->internal.qos) { 3986 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3987 if (bdev->internal.qos->rate_limits[i].limit != 3988 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3989 limits[i] = bdev->internal.qos->rate_limits[i].limit; 3990 if (bdev_qos_is_iops_rate_limit(i) == false) { 3991 /* Change from Byte to Megabyte which is user visible. 
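Byte limits are stored internally in bytes per second; dividing by 1024 * 1024 gives the value reported to the user.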
*/ 3992 limits[i] = limits[i] / 1024 / 1024; 3993 } 3994 } 3995 } 3996 } 3997 spdk_spin_unlock(&bdev->internal.spinlock); 3998 } 3999 4000 size_t 4001 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 4002 { 4003 return 1 << bdev->required_alignment; 4004 } 4005 4006 uint32_t 4007 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 4008 { 4009 return bdev->optimal_io_boundary; 4010 } 4011 4012 bool 4013 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 4014 { 4015 return bdev->write_cache; 4016 } 4017 4018 const struct spdk_uuid * 4019 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 4020 { 4021 return &bdev->uuid; 4022 } 4023 4024 uint16_t 4025 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 4026 { 4027 return bdev->acwu; 4028 } 4029 4030 uint32_t 4031 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 4032 { 4033 return bdev->md_len; 4034 } 4035 4036 bool 4037 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 4038 { 4039 return (bdev->md_len != 0) && bdev->md_interleave; 4040 } 4041 4042 bool 4043 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 4044 { 4045 return (bdev->md_len != 0) && !bdev->md_interleave; 4046 } 4047 4048 bool 4049 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 4050 { 4051 return bdev->zoned; 4052 } 4053 4054 uint32_t 4055 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 4056 { 4057 if (spdk_bdev_is_md_interleaved(bdev)) { 4058 return bdev->blocklen - bdev->md_len; 4059 } else { 4060 return bdev->blocklen; 4061 } 4062 } 4063 4064 uint32_t 4065 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 4066 { 4067 return bdev->phys_blocklen; 4068 } 4069 4070 static uint32_t 4071 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 4072 { 4073 if (!spdk_bdev_is_md_interleaved(bdev)) { 4074 return bdev->blocklen + bdev->md_len; 4075 } else { 4076 return bdev->blocklen; 4077 } 4078 } 4079 4080 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 4081 typedef enum spdk_dif_type spdk_dif_type_t; 4082 4083 spdk_dif_type_t 4084 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 4085 { 4086 if (bdev->md_len != 0) { 4087 return bdev->dif_type; 4088 } else { 4089 return SPDK_DIF_DISABLE; 4090 } 4091 } 4092 4093 bool 4094 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 4095 { 4096 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 4097 return bdev->dif_is_head_of_md; 4098 } else { 4099 return false; 4100 } 4101 } 4102 4103 bool 4104 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 4105 enum spdk_dif_check_type check_type) 4106 { 4107 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 4108 return false; 4109 } 4110 4111 switch (check_type) { 4112 case SPDK_DIF_CHECK_TYPE_REFTAG: 4113 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 4114 case SPDK_DIF_CHECK_TYPE_APPTAG: 4115 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 4116 case SPDK_DIF_CHECK_TYPE_GUARD: 4117 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 4118 default: 4119 return false; 4120 } 4121 } 4122 4123 uint32_t 4124 spdk_bdev_get_max_copy(const struct spdk_bdev *bdev) 4125 { 4126 return bdev->max_copy; 4127 } 4128 4129 uint64_t 4130 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 4131 { 4132 return bdev->internal.measured_queue_depth; 4133 } 4134 4135 uint64_t 4136 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 4137 { 4138 return bdev->internal.period; 4139 } 4140 4141 uint64_t 4142 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 4143 { 4144 return bdev->internal.weighted_io_time; 4145 } 4146 4147 uint64_t 4148 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 4149 { 4150 return bdev->internal.io_time; 4151 } 4152 4153 static void bdev_update_qd_sampling_period(void *ctx); 4154 4155 static void 4156 _calculate_measured_qd_cpl(struct spdk_bdev *bdev, void *_ctx, int status) 4157 { 4158 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 4159 4160 if (bdev->internal.measured_queue_depth) { 4161 bdev->internal.io_time += bdev->internal.period; 4162 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 4163 } 4164 4165 bdev->internal.qd_poll_in_progress = false; 4166 4167 bdev_update_qd_sampling_period(bdev); 4168 } 4169 4170 static void 4171 _calculate_measured_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4172 struct spdk_io_channel *io_ch, void *_ctx) 4173 { 4174 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(io_ch); 4175 4176 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4177 spdk_bdev_for_each_channel_continue(i, 0); 4178 } 4179 4180 static int 4181 bdev_calculate_measured_queue_depth(void *ctx) 4182 { 4183 struct spdk_bdev *bdev = ctx; 4184 4185 bdev->internal.qd_poll_in_progress = true; 4186 bdev->internal.temporary_queue_depth = 0; 4187 spdk_bdev_for_each_channel(bdev, _calculate_measured_qd, bdev, _calculate_measured_qd_cpl); 4188 return SPDK_POLLER_BUSY; 4189 } 4190 4191 static void 4192 bdev_update_qd_sampling_period(void *ctx) 4193 { 4194 struct spdk_bdev *bdev = ctx; 4195 4196 if (bdev->internal.period == bdev->internal.new_period) { 4197 return; 4198 } 4199 4200 if (bdev->internal.qd_poll_in_progress) { 4201 return; 4202 } 4203 4204 bdev->internal.period = bdev->internal.new_period; 4205 4206 spdk_poller_unregister(&bdev->internal.qd_poller); 4207 if (bdev->internal.period != 0) { 4208 bdev->internal.qd_poller = 
SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4209 bdev, bdev->internal.period); 4210 } else { 4211 spdk_bdev_close(bdev->internal.qd_desc); 4212 bdev->internal.qd_desc = NULL; 4213 } 4214 } 4215 4216 static void 4217 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4218 { 4219 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4220 } 4221 4222 void 4223 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4224 { 4225 int rc; 4226 4227 if (bdev->internal.new_period == period) { 4228 return; 4229 } 4230 4231 bdev->internal.new_period = period; 4232 4233 if (bdev->internal.qd_desc != NULL) { 4234 assert(bdev->internal.period != 0); 4235 4236 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4237 bdev_update_qd_sampling_period, bdev); 4238 return; 4239 } 4240 4241 assert(bdev->internal.period == 0); 4242 4243 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4244 NULL, &bdev->internal.qd_desc); 4245 if (rc != 0) { 4246 return; 4247 } 4248 4249 bdev->internal.period = period; 4250 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4251 bdev, period); 4252 } 4253 4254 struct bdev_get_current_qd_ctx { 4255 uint64_t current_qd; 4256 spdk_bdev_get_current_qd_cb cb_fn; 4257 void *cb_arg; 4258 }; 4259 4260 static void 4261 bdev_get_current_qd_done(struct spdk_bdev *bdev, void *_ctx, int status) 4262 { 4263 struct bdev_get_current_qd_ctx *ctx = _ctx; 4264 4265 ctx->cb_fn(bdev, ctx->current_qd, ctx->cb_arg, 0); 4266 4267 free(ctx); 4268 } 4269 4270 static void 4271 bdev_get_current_qd(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 4272 struct spdk_io_channel *io_ch, void *_ctx) 4273 { 4274 struct bdev_get_current_qd_ctx *ctx = _ctx; 4275 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 4276 4277 ctx->current_qd += bdev_ch->io_outstanding; 4278 4279 spdk_bdev_for_each_channel_continue(i, 0); 4280 } 4281 4282 void 4283 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 4284 void *cb_arg) 4285 { 4286 struct bdev_get_current_qd_ctx *ctx; 4287 4288 assert(cb_fn != NULL); 4289 4290 ctx = calloc(1, sizeof(*ctx)); 4291 if (ctx == NULL) { 4292 cb_fn(bdev, 0, cb_arg, -ENOMEM); 4293 return; 4294 } 4295 4296 ctx->cb_fn = cb_fn; 4297 ctx->cb_arg = cb_arg; 4298 4299 spdk_bdev_for_each_channel(bdev, bdev_get_current_qd, ctx, bdev_get_current_qd_done); 4300 } 4301 4302 static void 4303 _resize_notify(void *arg) 4304 { 4305 struct spdk_bdev_desc *desc = arg; 4306 4307 spdk_spin_lock(&desc->spinlock); 4308 desc->refs--; 4309 if (!desc->closed) { 4310 spdk_spin_unlock(&desc->spinlock); 4311 desc->callback.event_fn(SPDK_BDEV_EVENT_RESIZE, 4312 desc->bdev, 4313 desc->callback.ctx); 4314 return; 4315 } else if (0 == desc->refs) { 4316 /* This descriptor was closed after this resize_notify message was sent. 4317 * spdk_bdev_close() could not free the descriptor since this message was 4318 * in flight, so we free it now using bdev_desc_free(). 
4319 */ 4320 spdk_spin_unlock(&desc->spinlock); 4321 bdev_desc_free(desc); 4322 return; 4323 } 4324 spdk_spin_unlock(&desc->spinlock); 4325 } 4326 4327 int 4328 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 4329 { 4330 struct spdk_bdev_desc *desc; 4331 int ret; 4332 4333 if (size == bdev->blockcnt) { 4334 return 0; 4335 } 4336 4337 spdk_spin_lock(&bdev->internal.spinlock); 4338 4339 /* bdev has open descriptors */ 4340 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 4341 bdev->blockcnt > size) { 4342 ret = -EBUSY; 4343 } else { 4344 bdev->blockcnt = size; 4345 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 4346 spdk_spin_lock(&desc->spinlock); 4347 if (!desc->closed) { 4348 desc->refs++; 4349 spdk_thread_send_msg(desc->thread, _resize_notify, desc); 4350 } 4351 spdk_spin_unlock(&desc->spinlock); 4352 } 4353 ret = 0; 4354 } 4355 4356 spdk_spin_unlock(&bdev->internal.spinlock); 4357 4358 return ret; 4359 } 4360 4361 /* 4362 * Convert I/O offset and length from bytes to blocks. 4363 * 4364 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 4365 */ 4366 static uint64_t 4367 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 4368 uint64_t num_bytes, uint64_t *num_blocks) 4369 { 4370 uint32_t block_size = bdev->blocklen; 4371 uint8_t shift_cnt; 4372 4373 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 4374 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 4375 shift_cnt = spdk_u32log2(block_size); 4376 *offset_blocks = offset_bytes >> shift_cnt; 4377 *num_blocks = num_bytes >> shift_cnt; 4378 return (offset_bytes - (*offset_blocks << shift_cnt)) | 4379 (num_bytes - (*num_blocks << shift_cnt)); 4380 } else { 4381 *offset_blocks = offset_bytes / block_size; 4382 *num_blocks = num_bytes / block_size; 4383 return (offset_bytes % block_size) | (num_bytes % block_size); 4384 } 4385 } 4386 4387 static bool 4388 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 4389 { 4390 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 4391 * has been an overflow and hence the offset has been wrapped around */ 4392 if (offset_blocks + num_blocks < offset_blocks) { 4393 return false; 4394 } 4395 4396 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 4397 if (offset_blocks + num_blocks > bdev->blockcnt) { 4398 return false; 4399 } 4400 4401 return true; 4402 } 4403 4404 static void 4405 bdev_seek_complete_cb(void *ctx) 4406 { 4407 struct spdk_bdev_io *bdev_io = ctx; 4408 4409 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4410 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 4411 } 4412 4413 static int 4414 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4415 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 4416 spdk_bdev_io_completion_cb cb, void *cb_arg) 4417 { 4418 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4419 struct spdk_bdev_io *bdev_io; 4420 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4421 4422 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 4423 4424 /* Check if offset_blocks is valid looking at the validity of one block */ 4425 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 4426 return -EINVAL; 4427 } 4428 4429 bdev_io = bdev_channel_get_io(channel); 4430 if (!bdev_io) { 4431 return -ENOMEM; 4432 } 4433 4434 
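/* As with the other submission paths in this file, a NULL return from bdev_channel_get_io()
 * is surfaced to the caller as -ENOMEM, meaning the channel's spdk_bdev_io pool is exhausted.
 * Callers typically retry once an outstanding I/O completes by registering a wait entry with
 * spdk_bdev_queue_io_wait(). A minimal caller-side sketch (struct my_ctx, retry_seek and
 * seek_done are illustrative names, not part of this file):
 *
 *	static void retry_seek(void *arg)
 *	{
 *		struct my_ctx *ctx = arg;
 *
 *		spdk_bdev_seek_data(ctx->desc, ctx->ch, ctx->offset_blocks, seek_done, ctx);
 *	}
 *
 *	rc = spdk_bdev_seek_data(desc, ch, offset_blocks, seek_done, ctx);
 *	if (rc == -ENOMEM) {
 *		ctx->wait_entry.bdev = spdk_bdev_desc_get_bdev(desc);
 *		ctx->wait_entry.cb_fn = retry_seek;
 *		ctx->wait_entry.cb_arg = ctx;
 *		spdk_bdev_queue_io_wait(ctx->wait_entry.bdev, ch, &ctx->wait_entry);
 *	}
 */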
bdev_io->internal.ch = channel; 4435 bdev_io->internal.desc = desc; 4436 bdev_io->type = io_type; 4437 bdev_io->u.bdev.offset_blocks = offset_blocks; 4438 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4439 4440 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 4441 /* In case bdev doesn't support seek to next data/hole offset, 4442 * it is assumed that only data and no holes are present */ 4443 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 4444 bdev_io->u.bdev.seek.offset = offset_blocks; 4445 } else { 4446 bdev_io->u.bdev.seek.offset = UINT64_MAX; 4447 } 4448 4449 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 4450 return 0; 4451 } 4452 4453 bdev_io_submit(bdev_io); 4454 return 0; 4455 } 4456 4457 int 4458 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4459 uint64_t offset_blocks, 4460 spdk_bdev_io_completion_cb cb, void *cb_arg) 4461 { 4462 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 4463 } 4464 4465 int 4466 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4467 uint64_t offset_blocks, 4468 spdk_bdev_io_completion_cb cb, void *cb_arg) 4469 { 4470 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 4471 } 4472 4473 uint64_t 4474 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io) 4475 { 4476 return bdev_io->u.bdev.seek.offset; 4477 } 4478 4479 static int 4480 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 4481 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4482 spdk_bdev_io_completion_cb cb, void *cb_arg) 4483 { 4484 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4485 struct spdk_bdev_io *bdev_io; 4486 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4487 4488 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4489 return -EINVAL; 4490 } 4491 4492 bdev_io = bdev_channel_get_io(channel); 4493 if (!bdev_io) { 4494 return -ENOMEM; 4495 } 4496 4497 bdev_io->internal.ch = channel; 4498 bdev_io->internal.desc = desc; 4499 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4500 bdev_io->u.bdev.iovs = &bdev_io->iov; 4501 bdev_io->u.bdev.iovs[0].iov_base = buf; 4502 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4503 bdev_io->u.bdev.iovcnt = 1; 4504 bdev_io->u.bdev.md_buf = md_buf; 4505 bdev_io->u.bdev.num_blocks = num_blocks; 4506 bdev_io->u.bdev.offset_blocks = offset_blocks; 4507 bdev_io->u.bdev.ext_opts = NULL; 4508 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4509 4510 bdev_io_submit(bdev_io); 4511 return 0; 4512 } 4513 4514 int 4515 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4516 void *buf, uint64_t offset, uint64_t nbytes, 4517 spdk_bdev_io_completion_cb cb, void *cb_arg) 4518 { 4519 uint64_t offset_blocks, num_blocks; 4520 4521 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4522 nbytes, &num_blocks) != 0) { 4523 return -EINVAL; 4524 } 4525 4526 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4527 } 4528 4529 int 4530 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4531 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4532 spdk_bdev_io_completion_cb cb, void *cb_arg) 4533 { 4534 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 4535 } 4536 4537 int 4538 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4539 void *buf, void 
*md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4540 spdk_bdev_io_completion_cb cb, void *cb_arg) 4541 { 4542 struct iovec iov = { 4543 .iov_base = buf, 4544 }; 4545 4546 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4547 return -EINVAL; 4548 } 4549 4550 if (md_buf && !_is_buf_allocated(&iov)) { 4551 return -EINVAL; 4552 } 4553 4554 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4555 cb, cb_arg); 4556 } 4557 4558 int 4559 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4560 struct iovec *iov, int iovcnt, 4561 uint64_t offset, uint64_t nbytes, 4562 spdk_bdev_io_completion_cb cb, void *cb_arg) 4563 { 4564 uint64_t offset_blocks, num_blocks; 4565 4566 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4567 nbytes, &num_blocks) != 0) { 4568 return -EINVAL; 4569 } 4570 4571 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4572 } 4573 4574 static int 4575 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4576 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 4577 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 4578 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 4579 { 4580 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4581 struct spdk_bdev_io *bdev_io; 4582 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4583 4584 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4585 return -EINVAL; 4586 } 4587 4588 bdev_io = bdev_channel_get_io(channel); 4589 if (!bdev_io) { 4590 return -ENOMEM; 4591 } 4592 4593 bdev_io->internal.ch = channel; 4594 bdev_io->internal.desc = desc; 4595 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4596 bdev_io->u.bdev.iovs = iov; 4597 bdev_io->u.bdev.iovcnt = iovcnt; 4598 bdev_io->u.bdev.md_buf = md_buf; 4599 bdev_io->u.bdev.num_blocks = num_blocks; 4600 bdev_io->u.bdev.offset_blocks = offset_blocks; 4601 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4602 bdev_io->internal.ext_opts = opts; 4603 bdev_io->u.bdev.ext_opts = opts; 4604 4605 _bdev_io_submit_ext(desc, bdev_io, opts, copy_opts); 4606 4607 return 0; 4608 } 4609 4610 int 4611 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4612 struct iovec *iov, int iovcnt, 4613 uint64_t offset_blocks, uint64_t num_blocks, 4614 spdk_bdev_io_completion_cb cb, void *cb_arg) 4615 { 4616 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4617 num_blocks, cb, cb_arg, NULL, false); 4618 } 4619 4620 int 4621 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4622 struct iovec *iov, int iovcnt, void *md_buf, 4623 uint64_t offset_blocks, uint64_t num_blocks, 4624 spdk_bdev_io_completion_cb cb, void *cb_arg) 4625 { 4626 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4627 return -EINVAL; 4628 } 4629 4630 if (md_buf && !_is_buf_allocated(iov)) { 4631 return -EINVAL; 4632 } 4633 4634 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4635 num_blocks, cb, cb_arg, NULL, false); 4636 } 4637 4638 static inline bool 4639 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 4640 { 4641 /* 4642 * We check if opts size is at least of size when we first introduced 4643 * spdk_bdev_ext_io_opts (ac6f2bdd8d) since access to those members 4644 * are not checked internal. 
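* The opts structure is versioned by its size member: we require the fields accessed here to be present and reject sizes larger than the current definition.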
4645 */ 4646 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 4647 sizeof(opts->metadata) && 4648 opts->size <= sizeof(*opts) && 4649 /* When memory domain is used, the user must provide data buffers */ 4650 (!opts->memory_domain || (iov && iov[0].iov_base)); 4651 } 4652 4653 int 4654 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4655 struct iovec *iov, int iovcnt, 4656 uint64_t offset_blocks, uint64_t num_blocks, 4657 spdk_bdev_io_completion_cb cb, void *cb_arg, 4658 struct spdk_bdev_ext_io_opts *opts) 4659 { 4660 void *md = NULL; 4661 4662 if (opts) { 4663 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 4664 return -EINVAL; 4665 } 4666 md = opts->metadata; 4667 } 4668 4669 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4670 return -EINVAL; 4671 } 4672 4673 if (md && !_is_buf_allocated(iov)) { 4674 return -EINVAL; 4675 } 4676 4677 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4678 num_blocks, cb, cb_arg, opts, false); 4679 } 4680 4681 static int 4682 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4683 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4684 spdk_bdev_io_completion_cb cb, void *cb_arg) 4685 { 4686 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4687 struct spdk_bdev_io *bdev_io; 4688 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4689 4690 if (!desc->write) { 4691 return -EBADF; 4692 } 4693 4694 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4695 return -EINVAL; 4696 } 4697 4698 bdev_io = bdev_channel_get_io(channel); 4699 if (!bdev_io) { 4700 return -ENOMEM; 4701 } 4702 4703 bdev_io->internal.ch = channel; 4704 bdev_io->internal.desc = desc; 4705 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4706 bdev_io->u.bdev.iovs = &bdev_io->iov; 4707 bdev_io->u.bdev.iovs[0].iov_base = buf; 4708 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4709 bdev_io->u.bdev.iovcnt = 1; 4710 bdev_io->u.bdev.md_buf = md_buf; 4711 bdev_io->u.bdev.num_blocks = num_blocks; 4712 bdev_io->u.bdev.offset_blocks = offset_blocks; 4713 bdev_io->u.bdev.ext_opts = NULL; 4714 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4715 4716 bdev_io_submit(bdev_io); 4717 return 0; 4718 } 4719 4720 int 4721 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4722 void *buf, uint64_t offset, uint64_t nbytes, 4723 spdk_bdev_io_completion_cb cb, void *cb_arg) 4724 { 4725 uint64_t offset_blocks, num_blocks; 4726 4727 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4728 nbytes, &num_blocks) != 0) { 4729 return -EINVAL; 4730 } 4731 4732 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4733 } 4734 4735 int 4736 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4737 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4738 spdk_bdev_io_completion_cb cb, void *cb_arg) 4739 { 4740 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4741 cb, cb_arg); 4742 } 4743 4744 int 4745 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4746 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4747 spdk_bdev_io_completion_cb cb, void *cb_arg) 4748 { 4749 struct iovec iov = { 4750 .iov_base = buf, 4751 }; 4752 4753 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4754 return -EINVAL; 4755 } 4756 4757 if 
(md_buf && !_is_buf_allocated(&iov)) { 4758 return -EINVAL; 4759 } 4760 4761 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4762 cb, cb_arg); 4763 } 4764 4765 static int 4766 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4767 struct iovec *iov, int iovcnt, void *md_buf, 4768 uint64_t offset_blocks, uint64_t num_blocks, 4769 spdk_bdev_io_completion_cb cb, void *cb_arg, 4770 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 4771 { 4772 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4773 struct spdk_bdev_io *bdev_io; 4774 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4775 4776 if (!desc->write) { 4777 return -EBADF; 4778 } 4779 4780 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4781 return -EINVAL; 4782 } 4783 4784 bdev_io = bdev_channel_get_io(channel); 4785 if (!bdev_io) { 4786 return -ENOMEM; 4787 } 4788 4789 bdev_io->internal.ch = channel; 4790 bdev_io->internal.desc = desc; 4791 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4792 bdev_io->u.bdev.iovs = iov; 4793 bdev_io->u.bdev.iovcnt = iovcnt; 4794 bdev_io->u.bdev.md_buf = md_buf; 4795 bdev_io->u.bdev.num_blocks = num_blocks; 4796 bdev_io->u.bdev.offset_blocks = offset_blocks; 4797 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4798 bdev_io->internal.ext_opts = opts; 4799 bdev_io->u.bdev.ext_opts = opts; 4800 4801 _bdev_io_submit_ext(desc, bdev_io, opts, copy_opts); 4802 4803 return 0; 4804 } 4805 4806 int 4807 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4808 struct iovec *iov, int iovcnt, 4809 uint64_t offset, uint64_t len, 4810 spdk_bdev_io_completion_cb cb, void *cb_arg) 4811 { 4812 uint64_t offset_blocks, num_blocks; 4813 4814 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4815 len, &num_blocks) != 0) { 4816 return -EINVAL; 4817 } 4818 4819 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4820 } 4821 4822 int 4823 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4824 struct iovec *iov, int iovcnt, 4825 uint64_t offset_blocks, uint64_t num_blocks, 4826 spdk_bdev_io_completion_cb cb, void *cb_arg) 4827 { 4828 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4829 num_blocks, cb, cb_arg, NULL, false); 4830 } 4831 4832 int 4833 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4834 struct iovec *iov, int iovcnt, void *md_buf, 4835 uint64_t offset_blocks, uint64_t num_blocks, 4836 spdk_bdev_io_completion_cb cb, void *cb_arg) 4837 { 4838 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4839 return -EINVAL; 4840 } 4841 4842 if (md_buf && !_is_buf_allocated(iov)) { 4843 return -EINVAL; 4844 } 4845 4846 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4847 num_blocks, cb, cb_arg, NULL, false); 4848 } 4849 4850 int 4851 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4852 struct iovec *iov, int iovcnt, 4853 uint64_t offset_blocks, uint64_t num_blocks, 4854 spdk_bdev_io_completion_cb cb, void *cb_arg, 4855 struct spdk_bdev_ext_io_opts *opts) 4856 { 4857 void *md = NULL; 4858 4859 if (opts) { 4860 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 4861 return -EINVAL; 4862 } 4863 md = opts->metadata; 4864 } 4865 4866 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4867 return -EINVAL; 4868 } 4869 4870 if (md && 
!_is_buf_allocated(iov)) { 4871 return -EINVAL; 4872 } 4873 4874 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4875 num_blocks, cb, cb_arg, opts, false); 4876 } 4877 4878 static void 4879 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4880 { 4881 struct spdk_bdev_io *parent_io = cb_arg; 4882 struct spdk_bdev *bdev = parent_io->bdev; 4883 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 4884 int i, rc = 0; 4885 4886 if (!success) { 4887 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4888 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 4889 spdk_bdev_free_io(bdev_io); 4890 return; 4891 } 4892 4893 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 4894 rc = memcmp(read_buf, 4895 parent_io->u.bdev.iovs[i].iov_base, 4896 parent_io->u.bdev.iovs[i].iov_len); 4897 if (rc) { 4898 break; 4899 } 4900 read_buf += parent_io->u.bdev.iovs[i].iov_len; 4901 } 4902 4903 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 4904 rc = memcmp(bdev_io->u.bdev.md_buf, 4905 parent_io->u.bdev.md_buf, 4906 spdk_bdev_get_md_size(bdev)); 4907 } 4908 4909 spdk_bdev_free_io(bdev_io); 4910 4911 if (rc == 0) { 4912 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4913 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 4914 } else { 4915 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 4916 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 4917 } 4918 } 4919 4920 static void 4921 bdev_compare_do_read(void *_bdev_io) 4922 { 4923 struct spdk_bdev_io *bdev_io = _bdev_io; 4924 int rc; 4925 4926 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 4927 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 4928 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4929 bdev_compare_do_read_done, bdev_io); 4930 4931 if (rc == -ENOMEM) { 4932 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 4933 } else if (rc != 0) { 4934 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4935 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 4936 } 4937 } 4938 4939 static int 4940 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4941 struct iovec *iov, int iovcnt, void *md_buf, 4942 uint64_t offset_blocks, uint64_t num_blocks, 4943 spdk_bdev_io_completion_cb cb, void *cb_arg) 4944 { 4945 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4946 struct spdk_bdev_io *bdev_io; 4947 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 4948 4949 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4950 return -EINVAL; 4951 } 4952 4953 bdev_io = bdev_channel_get_io(channel); 4954 if (!bdev_io) { 4955 return -ENOMEM; 4956 } 4957 4958 bdev_io->internal.ch = channel; 4959 bdev_io->internal.desc = desc; 4960 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 4961 bdev_io->u.bdev.iovs = iov; 4962 bdev_io->u.bdev.iovcnt = iovcnt; 4963 bdev_io->u.bdev.md_buf = md_buf; 4964 bdev_io->u.bdev.num_blocks = num_blocks; 4965 bdev_io->u.bdev.offset_blocks = offset_blocks; 4966 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4967 bdev_io->u.bdev.ext_opts = NULL; 4968 4969 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 4970 bdev_io_submit(bdev_io); 4971 return 0; 4972 } 4973 4974 bdev_compare_do_read(bdev_io); 4975 4976 return 0; 4977 } 4978 4979 int 4980 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4981 struct iovec *iov, int iovcnt, 4982 
uint64_t offset_blocks, uint64_t num_blocks, 4983 spdk_bdev_io_completion_cb cb, void *cb_arg) 4984 { 4985 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4986 num_blocks, cb, cb_arg); 4987 } 4988 4989 int 4990 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4991 struct iovec *iov, int iovcnt, void *md_buf, 4992 uint64_t offset_blocks, uint64_t num_blocks, 4993 spdk_bdev_io_completion_cb cb, void *cb_arg) 4994 { 4995 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4996 return -EINVAL; 4997 } 4998 4999 if (md_buf && !_is_buf_allocated(iov)) { 5000 return -EINVAL; 5001 } 5002 5003 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 5004 num_blocks, cb, cb_arg); 5005 } 5006 5007 static int 5008 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5009 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5010 spdk_bdev_io_completion_cb cb, void *cb_arg) 5011 { 5012 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5013 struct spdk_bdev_io *bdev_io; 5014 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5015 5016 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5017 return -EINVAL; 5018 } 5019 5020 bdev_io = bdev_channel_get_io(channel); 5021 if (!bdev_io) { 5022 return -ENOMEM; 5023 } 5024 5025 bdev_io->internal.ch = channel; 5026 bdev_io->internal.desc = desc; 5027 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 5028 bdev_io->u.bdev.iovs = &bdev_io->iov; 5029 bdev_io->u.bdev.iovs[0].iov_base = buf; 5030 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 5031 bdev_io->u.bdev.iovcnt = 1; 5032 bdev_io->u.bdev.md_buf = md_buf; 5033 bdev_io->u.bdev.num_blocks = num_blocks; 5034 bdev_io->u.bdev.offset_blocks = offset_blocks; 5035 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5036 bdev_io->u.bdev.ext_opts = NULL; 5037 5038 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 5039 bdev_io_submit(bdev_io); 5040 return 0; 5041 } 5042 5043 bdev_compare_do_read(bdev_io); 5044 5045 return 0; 5046 } 5047 5048 int 5049 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5050 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 5051 spdk_bdev_io_completion_cb cb, void *cb_arg) 5052 { 5053 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 5054 cb, cb_arg); 5055 } 5056 5057 int 5058 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5059 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 5060 spdk_bdev_io_completion_cb cb, void *cb_arg) 5061 { 5062 struct iovec iov = { 5063 .iov_base = buf, 5064 }; 5065 5066 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 5067 return -EINVAL; 5068 } 5069 5070 if (md_buf && !_is_buf_allocated(&iov)) { 5071 return -EINVAL; 5072 } 5073 5074 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 5075 cb, cb_arg); 5076 } 5077 5078 static void 5079 bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) 5080 { 5081 struct spdk_bdev_io *bdev_io = ctx; 5082 5083 if (unlock_status) { 5084 SPDK_ERRLOG("LBA range unlock failed\n"); 5085 } 5086 5087 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? 
true : 5088 false, bdev_io->internal.caller_ctx); 5089 } 5090 5091 static void 5092 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 5093 { 5094 bdev_io->internal.status = status; 5095 5096 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 5097 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5098 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 5099 } 5100 5101 static void 5102 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5103 { 5104 struct spdk_bdev_io *parent_io = cb_arg; 5105 5106 if (!success) { 5107 SPDK_ERRLOG("Compare and write operation failed\n"); 5108 } 5109 5110 spdk_bdev_free_io(bdev_io); 5111 5112 bdev_comparev_and_writev_blocks_unlock(parent_io, 5113 success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 5114 } 5115 5116 static void 5117 bdev_compare_and_write_do_write(void *_bdev_io) 5118 { 5119 struct spdk_bdev_io *bdev_io = _bdev_io; 5120 int rc; 5121 5122 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 5123 spdk_io_channel_from_ctx(bdev_io->internal.ch), 5124 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 5125 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5126 bdev_compare_and_write_do_write_done, bdev_io); 5127 5128 5129 if (rc == -ENOMEM) { 5130 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 5131 } else if (rc != 0) { 5132 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 5133 } 5134 } 5135 5136 static void 5137 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5138 { 5139 struct spdk_bdev_io *parent_io = cb_arg; 5140 5141 spdk_bdev_free_io(bdev_io); 5142 5143 if (!success) { 5144 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 5145 return; 5146 } 5147 5148 bdev_compare_and_write_do_write(parent_io); 5149 } 5150 5151 static void 5152 bdev_compare_and_write_do_compare(void *_bdev_io) 5153 { 5154 struct spdk_bdev_io *bdev_io = _bdev_io; 5155 int rc; 5156 5157 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 5158 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 5159 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 5160 bdev_compare_and_write_do_compare_done, bdev_io); 5161 5162 if (rc == -ENOMEM) { 5163 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 5164 } else if (rc != 0) { 5165 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 5166 } 5167 } 5168 5169 static void 5170 bdev_comparev_and_writev_blocks_locked(void *ctx, int status) 5171 { 5172 struct spdk_bdev_io *bdev_io = ctx; 5173 5174 if (status) { 5175 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 5176 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5177 return; 5178 } 5179 5180 bdev_compare_and_write_do_compare(bdev_io); 5181 } 5182 5183 int 5184 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5185 struct iovec *compare_iov, int compare_iovcnt, 5186 struct iovec *write_iov, int write_iovcnt, 5187 uint64_t offset_blocks, uint64_t num_blocks, 5188 spdk_bdev_io_completion_cb cb, void *cb_arg) 5189 { 5190 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5191 struct spdk_bdev_io *bdev_io; 5192 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5193 5194 if (!desc->write) { 5195 return 
-EBADF; 5196 } 5197 5198 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5199 return -EINVAL; 5200 } 5201 5202 if (num_blocks > bdev->acwu) { 5203 return -EINVAL; 5204 } 5205 5206 bdev_io = bdev_channel_get_io(channel); 5207 if (!bdev_io) { 5208 return -ENOMEM; 5209 } 5210 5211 bdev_io->internal.ch = channel; 5212 bdev_io->internal.desc = desc; 5213 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 5214 bdev_io->u.bdev.iovs = compare_iov; 5215 bdev_io->u.bdev.iovcnt = compare_iovcnt; 5216 bdev_io->u.bdev.fused_iovs = write_iov; 5217 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 5218 bdev_io->u.bdev.md_buf = NULL; 5219 bdev_io->u.bdev.num_blocks = num_blocks; 5220 bdev_io->u.bdev.offset_blocks = offset_blocks; 5221 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5222 bdev_io->u.bdev.ext_opts = NULL; 5223 5224 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 5225 bdev_io_submit(bdev_io); 5226 return 0; 5227 } 5228 5229 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 5230 bdev_comparev_and_writev_blocks_locked, bdev_io); 5231 } 5232 5233 int 5234 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5235 struct iovec *iov, int iovcnt, 5236 uint64_t offset_blocks, uint64_t num_blocks, 5237 bool populate, 5238 spdk_bdev_io_completion_cb cb, void *cb_arg) 5239 { 5240 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5241 struct spdk_bdev_io *bdev_io; 5242 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5243 5244 if (!desc->write) { 5245 return -EBADF; 5246 } 5247 5248 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5249 return -EINVAL; 5250 } 5251 5252 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 5253 return -ENOTSUP; 5254 } 5255 5256 bdev_io = bdev_channel_get_io(channel); 5257 if (!bdev_io) { 5258 return -ENOMEM; 5259 } 5260 5261 bdev_io->internal.ch = channel; 5262 bdev_io->internal.desc = desc; 5263 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 5264 bdev_io->u.bdev.num_blocks = num_blocks; 5265 bdev_io->u.bdev.offset_blocks = offset_blocks; 5266 bdev_io->u.bdev.iovs = iov; 5267 bdev_io->u.bdev.iovcnt = iovcnt; 5268 bdev_io->u.bdev.md_buf = NULL; 5269 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 5270 bdev_io->u.bdev.zcopy.commit = 0; 5271 bdev_io->u.bdev.zcopy.start = 1; 5272 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5273 bdev_io->u.bdev.ext_opts = NULL; 5274 5275 bdev_io_submit(bdev_io); 5276 5277 return 0; 5278 } 5279 5280 int 5281 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 5282 spdk_bdev_io_completion_cb cb, void *cb_arg) 5283 { 5284 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 5285 return -EINVAL; 5286 } 5287 5288 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 5289 bdev_io->u.bdev.zcopy.start = 0; 5290 bdev_io->internal.caller_ctx = cb_arg; 5291 bdev_io->internal.cb = cb; 5292 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 5293 5294 bdev_io_submit(bdev_io); 5295 5296 return 0; 5297 } 5298 5299 int 5300 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5301 uint64_t offset, uint64_t len, 5302 spdk_bdev_io_completion_cb cb, void *cb_arg) 5303 { 5304 uint64_t offset_blocks, num_blocks; 5305 5306 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5307 len, &num_blocks) != 0) { 5308 return -EINVAL; 5309 } 5310 5311 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5312 } 5313 5314 int 5315 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5316 uint64_t offset_blocks, uint64_t num_blocks, 5317 spdk_bdev_io_completion_cb cb, void *cb_arg) 5318 { 5319 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5320 struct spdk_bdev_io *bdev_io; 5321 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5322 5323 if (!desc->write) { 5324 return -EBADF; 5325 } 5326 5327 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5328 return -EINVAL; 5329 } 5330 5331 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 5332 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 5333 return -ENOTSUP; 5334 } 5335 5336 bdev_io = bdev_channel_get_io(channel); 5337 5338 if (!bdev_io) { 5339 return -ENOMEM; 5340 } 5341 5342 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 5343 bdev_io->internal.ch = channel; 5344 bdev_io->internal.desc = desc; 5345 bdev_io->u.bdev.offset_blocks = offset_blocks; 5346 bdev_io->u.bdev.num_blocks = num_blocks; 5347 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5348 bdev_io->u.bdev.ext_opts = NULL; 5349 5350 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 5351 bdev_io_submit(bdev_io); 5352 return 0; 5353 } 5354 5355 assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); 5356 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 5357 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 5358 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 5359 bdev_write_zero_buffer_next(bdev_io); 5360 5361 return 0; 5362 } 5363 5364 int 5365 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5366 uint64_t offset, uint64_t nbytes, 5367 spdk_bdev_io_completion_cb cb, void *cb_arg) 5368 { 5369 uint64_t offset_blocks, num_blocks; 5370 5371 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5372 nbytes, &num_blocks) != 0) { 5373 return -EINVAL; 5374 } 5375 5376 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5377 } 5378 5379 int 5380 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5381 uint64_t offset_blocks, uint64_t num_blocks, 5382 spdk_bdev_io_completion_cb cb, void *cb_arg) 5383 { 5384 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5385 struct spdk_bdev_io *bdev_io; 5386 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5387 5388 if (!desc->write) { 5389 return -EBADF; 5390 } 5391 5392 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5393 return -EINVAL; 5394 } 5395 5396 if (num_blocks == 0) { 5397 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 5398 return -EINVAL; 5399 } 5400 5401 bdev_io = bdev_channel_get_io(channel); 5402 if (!bdev_io) { 5403 return -ENOMEM; 5404 } 5405 5406 bdev_io->internal.ch 
= channel; 5407 bdev_io->internal.desc = desc; 5408 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 5409 5410 bdev_io->u.bdev.iovs = &bdev_io->iov; 5411 bdev_io->u.bdev.iovs[0].iov_base = NULL; 5412 bdev_io->u.bdev.iovs[0].iov_len = 0; 5413 bdev_io->u.bdev.iovcnt = 1; 5414 5415 bdev_io->u.bdev.offset_blocks = offset_blocks; 5416 bdev_io->u.bdev.num_blocks = num_blocks; 5417 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5418 bdev_io->u.bdev.ext_opts = NULL; 5419 5420 bdev_io_submit(bdev_io); 5421 return 0; 5422 } 5423 5424 int 5425 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5426 uint64_t offset, uint64_t length, 5427 spdk_bdev_io_completion_cb cb, void *cb_arg) 5428 { 5429 uint64_t offset_blocks, num_blocks; 5430 5431 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5432 length, &num_blocks) != 0) { 5433 return -EINVAL; 5434 } 5435 5436 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5437 } 5438 5439 int 5440 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5441 uint64_t offset_blocks, uint64_t num_blocks, 5442 spdk_bdev_io_completion_cb cb, void *cb_arg) 5443 { 5444 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5445 struct spdk_bdev_io *bdev_io; 5446 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5447 5448 if (!desc->write) { 5449 return -EBADF; 5450 } 5451 5452 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5453 return -EINVAL; 5454 } 5455 5456 bdev_io = bdev_channel_get_io(channel); 5457 if (!bdev_io) { 5458 return -ENOMEM; 5459 } 5460 5461 bdev_io->internal.ch = channel; 5462 bdev_io->internal.desc = desc; 5463 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 5464 bdev_io->u.bdev.iovs = NULL; 5465 bdev_io->u.bdev.iovcnt = 0; 5466 bdev_io->u.bdev.offset_blocks = offset_blocks; 5467 bdev_io->u.bdev.num_blocks = num_blocks; 5468 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5469 5470 bdev_io_submit(bdev_io); 5471 return 0; 5472 } 5473 5474 static int bdev_reset_poll_for_outstanding_io(void *ctx); 5475 5476 static void 5477 bdev_reset_check_outstanding_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 5478 { 5479 struct spdk_bdev_channel *ch = _ctx; 5480 struct spdk_bdev_io *bdev_io; 5481 5482 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5483 5484 if (status == -EBUSY) { 5485 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 5486 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 5487 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 5488 } else { 5489 /* If outstanding IOs are still present and reset_io_drain_timeout seconds passed, 5490 * start the reset. */ 5491 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5492 bdev_io_submit_reset(bdev_io); 5493 } 5494 } else { 5495 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5496 SPDK_DEBUGLOG(bdev, 5497 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 5498 ch->bdev->name); 5499 /* Mark the completion status as a SUCCESS and complete the reset. */ 5500 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 5501 } 5502 } 5503 5504 static void 5505 bdev_reset_check_outstanding_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5506 struct spdk_io_channel *io_ch, void *_ctx) 5507 { 5508 struct spdk_bdev_channel *cur_ch = __io_ch_to_bdev_ch(io_ch); 5509 int status = 0; 5510 5511 if (cur_ch->io_outstanding > 0) { 5512 /* If a channel has outstanding IO, set status to -EBUSY code. 
This will stop 5513 * further iteration over the rest of the channels and pass non-zero status 5514 * to the callback function. */ 5515 status = -EBUSY; 5516 } 5517 spdk_bdev_for_each_channel_continue(i, status); 5518 } 5519 5520 static int 5521 bdev_reset_poll_for_outstanding_io(void *ctx) 5522 { 5523 struct spdk_bdev_channel *ch = ctx; 5524 struct spdk_bdev_io *bdev_io; 5525 5526 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5527 5528 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 5529 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 5530 bdev_reset_check_outstanding_io_done); 5531 5532 return SPDK_POLLER_BUSY; 5533 } 5534 5535 static void 5536 bdev_reset_freeze_channel_done(struct spdk_bdev *bdev, void *_ctx, int status) 5537 { 5538 struct spdk_bdev_channel *ch = _ctx; 5539 struct spdk_bdev_io *bdev_io; 5540 5541 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5542 5543 if (bdev->reset_io_drain_timeout == 0) { 5544 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5545 5546 bdev_io_submit_reset(bdev_io); 5547 return; 5548 } 5549 5550 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 5551 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 5552 5553 /* In case bdev->reset_io_drain_timeout is not equal to zero, 5554 * submit the reset to the underlying module only if outstanding I/O 5555 * remain after reset_io_drain_timeout seconds have passed. */ 5556 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_check_outstanding_io, ch, 5557 bdev_reset_check_outstanding_io_done); 5558 } 5559 5560 static void 5561 bdev_reset_freeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5562 struct spdk_io_channel *ch, void *_ctx) 5563 { 5564 struct spdk_bdev_channel *channel; 5565 struct spdk_bdev_mgmt_channel *mgmt_channel; 5566 struct spdk_bdev_shared_resource *shared_resource; 5567 bdev_io_tailq_t tmp_queued; 5568 5569 TAILQ_INIT(&tmp_queued); 5570 5571 channel = __io_ch_to_bdev_ch(ch); 5572 shared_resource = channel->shared_resource; 5573 mgmt_channel = shared_resource->mgmt_ch; 5574 5575 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 5576 5577 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 5578 /* The QoS object is always valid and readable while 5579 * the channel flag is set, so the lock here should not 5580 * be necessary. We're not in the fast path though, so 5581 * just take it anyway. 
*/
5582 spdk_spin_lock(&channel->bdev->internal.spinlock);
5583 if (channel->bdev->internal.qos->ch == channel) {
5584 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link);
5585 }
5586 spdk_spin_unlock(&channel->bdev->internal.spinlock);
5587 }
5588
5589 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel);
5590 bdev_abort_all_buf_io(mgmt_channel, channel);
5592 bdev_abort_all_queued_io(&tmp_queued, channel);
5593
5594 spdk_bdev_for_each_channel_continue(i, 0);
5595 }
5596
5597 static void
5598 bdev_start_reset(void *ctx)
5599 {
5600 struct spdk_bdev_channel *ch = ctx;
5601
5602 spdk_bdev_for_each_channel(ch->bdev, bdev_reset_freeze_channel, ch,
5603 bdev_reset_freeze_channel_done);
5604 }
5605
5606 static void
5607 bdev_channel_start_reset(struct spdk_bdev_channel *ch)
5608 {
5609 struct spdk_bdev *bdev = ch->bdev;
5610
5611 assert(!TAILQ_EMPTY(&ch->queued_resets));
5612
5613 spdk_spin_lock(&bdev->internal.spinlock);
5614 if (bdev->internal.reset_in_progress == NULL) {
5615 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets);
5616 /*
5617 * Take a channel reference for the target bdev for the life of this
5618 * reset. This guards against the channel getting destroyed while
5619 * spdk_bdev_for_each_channel() calls related to this reset IO are in
5620 * progress. We will release the reference when this reset is
5621 * completed.
5622 */
5623 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev));
5624 bdev_start_reset(ch);
5625 }
5626 spdk_spin_unlock(&bdev->internal.spinlock);
5627 }
5628
5629 int
5630 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
5631 spdk_bdev_io_completion_cb cb, void *cb_arg)
5632 {
5633 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
5634 struct spdk_bdev_io *bdev_io;
5635 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5636
5637 bdev_io = bdev_channel_get_io(channel);
5638 if (!bdev_io) {
5639 return -ENOMEM;
5640 }
5641
5642 bdev_io->internal.ch = channel;
5643 bdev_io->internal.desc = desc;
5644 bdev_io->internal.submit_tsc = spdk_get_ticks();
5645 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET;
5646 bdev_io->u.reset.ch_ref = NULL;
5647 bdev_io_init(bdev_io, bdev, cb_arg, cb);
5648
5649 spdk_spin_lock(&bdev->internal.spinlock);
5650 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link);
5651 spdk_spin_unlock(&bdev->internal.spinlock);
5652
5653 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io,
5654 internal.ch_link);
5655
5656 bdev_channel_start_reset(channel);
5657
5658 return 0;
5659 }
5660
5661 void
5662 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch,
5663 struct spdk_bdev_io_stat *stat)
5664 {
5665 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5666
5667 bdev_io_stat_get(stat, channel->stat);
5668 }
5669
5670 static void
5671 bdev_get_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status)
5672 {
5673 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx;
5674
5675 bdev_iostat_ctx->cb(bdev, bdev_iostat_ctx->stat,
5676 bdev_iostat_ctx->cb_arg, 0);
5677 free(bdev_iostat_ctx);
5678 }
5679
5680 static void
5681 bdev_get_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev,
5682 struct spdk_io_channel *ch, void *_ctx)
5683 {
5684 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = _ctx;
5685 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch);
5686
5687
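/* Fold this channel's counters into the caller's aggregate; bdev_get_device_stat_done() runs the user callback once every channel has been visited. */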
bdev_io_stat_add(bdev_iostat_ctx->stat, channel->stat); 5688 spdk_bdev_for_each_channel_continue(i, 0); 5689 } 5690 5691 void 5692 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 5693 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 5694 { 5695 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 5696 5697 assert(bdev != NULL); 5698 assert(stat != NULL); 5699 assert(cb != NULL); 5700 5701 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 5702 if (bdev_iostat_ctx == NULL) { 5703 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 5704 cb(bdev, stat, cb_arg, -ENOMEM); 5705 return; 5706 } 5707 5708 bdev_iostat_ctx->stat = stat; 5709 bdev_iostat_ctx->cb = cb; 5710 bdev_iostat_ctx->cb_arg = cb_arg; 5711 5712 /* Start with the statistics from previously deleted channels. */ 5713 spdk_spin_lock(&bdev->internal.spinlock); 5714 bdev_io_stat_get(bdev_iostat_ctx->stat, bdev->internal.stat); 5715 spdk_spin_unlock(&bdev->internal.spinlock); 5716 5717 /* Then iterate and add the statistics from each existing channel. */ 5718 spdk_bdev_for_each_channel(bdev, bdev_get_each_channel_stat, bdev_iostat_ctx, 5719 bdev_get_device_stat_done); 5720 } 5721 5722 struct bdev_iostat_reset_ctx { 5723 enum bdev_reset_stat_mode mode; 5724 bdev_reset_device_stat_cb cb; 5725 void *cb_arg; 5726 }; 5727 5728 static void 5729 bdev_reset_device_stat_done(struct spdk_bdev *bdev, void *_ctx, int status) 5730 { 5731 struct bdev_iostat_reset_ctx *ctx = _ctx; 5732 5733 ctx->cb(bdev, ctx->cb_arg, 0); 5734 5735 free(ctx); 5736 } 5737 5738 static void 5739 bdev_reset_each_channel_stat(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 5740 struct spdk_io_channel *ch, void *_ctx) 5741 { 5742 struct bdev_iostat_reset_ctx *ctx = _ctx; 5743 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5744 5745 bdev_io_stat_reset(channel->stat, ctx->mode); 5746 5747 spdk_bdev_for_each_channel_continue(i, 0); 5748 } 5749 5750 void 5751 bdev_reset_device_stat(struct spdk_bdev *bdev, enum bdev_reset_stat_mode mode, 5752 bdev_reset_device_stat_cb cb, void *cb_arg) 5753 { 5754 struct bdev_iostat_reset_ctx *ctx; 5755 5756 assert(bdev != NULL); 5757 assert(cb != NULL); 5758 5759 ctx = calloc(1, sizeof(*ctx)); 5760 if (ctx == NULL) { 5761 SPDK_ERRLOG("Unable to allocate bdev_iostat_reset_ctx.\n"); 5762 cb(bdev, cb_arg, -ENOMEM); 5763 return; 5764 } 5765 5766 ctx->mode = mode; 5767 ctx->cb = cb; 5768 ctx->cb_arg = cb_arg; 5769 5770 spdk_spin_lock(&bdev->internal.spinlock); 5771 bdev_io_stat_reset(bdev->internal.stat, mode); 5772 spdk_spin_unlock(&bdev->internal.spinlock); 5773 5774 spdk_bdev_for_each_channel(bdev, 5775 bdev_reset_each_channel_stat, 5776 ctx, 5777 bdev_reset_device_stat_done); 5778 } 5779 5780 int 5781 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5782 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 5783 spdk_bdev_io_completion_cb cb, void *cb_arg) 5784 { 5785 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5786 struct spdk_bdev_io *bdev_io; 5787 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5788 5789 if (!desc->write) { 5790 return -EBADF; 5791 } 5792 5793 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 5794 return -ENOTSUP; 5795 } 5796 5797 bdev_io = bdev_channel_get_io(channel); 5798 if (!bdev_io) { 5799 return -ENOMEM; 5800 } 5801 5802 bdev_io->internal.ch = channel; 5803 bdev_io->internal.desc = desc; 5804 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 5805 
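/* The NVMe admin command is copied by value; buf is passed through to the module unchanged and no metadata buffer is used. */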
bdev_io->u.nvme_passthru.cmd = *cmd; 5806 bdev_io->u.nvme_passthru.buf = buf; 5807 bdev_io->u.nvme_passthru.nbytes = nbytes; 5808 bdev_io->u.nvme_passthru.md_buf = NULL; 5809 bdev_io->u.nvme_passthru.md_len = 0; 5810 5811 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5812 5813 bdev_io_submit(bdev_io); 5814 return 0; 5815 } 5816 5817 int 5818 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5819 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 5820 spdk_bdev_io_completion_cb cb, void *cb_arg) 5821 { 5822 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5823 struct spdk_bdev_io *bdev_io; 5824 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5825 5826 if (!desc->write) { 5827 /* 5828 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 5829 * to easily determine if the command is a read or write, but for now just 5830 * do not allow io_passthru with a read-only descriptor. 5831 */ 5832 return -EBADF; 5833 } 5834 5835 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 5836 return -ENOTSUP; 5837 } 5838 5839 bdev_io = bdev_channel_get_io(channel); 5840 if (!bdev_io) { 5841 return -ENOMEM; 5842 } 5843 5844 bdev_io->internal.ch = channel; 5845 bdev_io->internal.desc = desc; 5846 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 5847 bdev_io->u.nvme_passthru.cmd = *cmd; 5848 bdev_io->u.nvme_passthru.buf = buf; 5849 bdev_io->u.nvme_passthru.nbytes = nbytes; 5850 bdev_io->u.nvme_passthru.md_buf = NULL; 5851 bdev_io->u.nvme_passthru.md_len = 0; 5852 5853 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5854 5855 bdev_io_submit(bdev_io); 5856 return 0; 5857 } 5858 5859 int 5860 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5861 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 5862 spdk_bdev_io_completion_cb cb, void *cb_arg) 5863 { 5864 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5865 struct spdk_bdev_io *bdev_io; 5866 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 5867 5868 if (!desc->write) { 5869 /* 5870 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 5871 * to easily determine if the command is a read or write, but for now just 5872 * do not allow io_passthru with a read-only descriptor. 
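Callers that only issue read-like passthru commands therefore still have to open the bdev with write access, e.g. spdk_bdev_open_ext(bdev_name, true, event_cb, event_ctx, &desc).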
5873 */ 5874 return -EBADF; 5875 } 5876 5877 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 5878 return -ENOTSUP; 5879 } 5880 5881 bdev_io = bdev_channel_get_io(channel); 5882 if (!bdev_io) { 5883 return -ENOMEM; 5884 } 5885 5886 bdev_io->internal.ch = channel; 5887 bdev_io->internal.desc = desc; 5888 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 5889 bdev_io->u.nvme_passthru.cmd = *cmd; 5890 bdev_io->u.nvme_passthru.buf = buf; 5891 bdev_io->u.nvme_passthru.nbytes = nbytes; 5892 bdev_io->u.nvme_passthru.md_buf = md_buf; 5893 bdev_io->u.nvme_passthru.md_len = md_len; 5894 5895 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5896 5897 bdev_io_submit(bdev_io); 5898 return 0; 5899 } 5900 5901 static void bdev_abort_retry(void *ctx); 5902 static void bdev_abort(struct spdk_bdev_io *parent_io); 5903 5904 static void 5905 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5906 { 5907 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 5908 struct spdk_bdev_io *parent_io = cb_arg; 5909 struct spdk_bdev_io *bio_to_abort, *tmp_io; 5910 5911 bio_to_abort = bdev_io->u.abort.bio_to_abort; 5912 5913 spdk_bdev_free_io(bdev_io); 5914 5915 if (!success) { 5916 /* Check if the target I/O completed in the meantime. */ 5917 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 5918 if (tmp_io == bio_to_abort) { 5919 break; 5920 } 5921 } 5922 5923 /* If the target I/O still exists, set the parent to failed. */ 5924 if (tmp_io != NULL) { 5925 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5926 } 5927 } 5928 5929 parent_io->u.bdev.split_outstanding--; 5930 if (parent_io->u.bdev.split_outstanding == 0) { 5931 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 5932 bdev_abort_retry(parent_io); 5933 } else { 5934 bdev_io_complete(parent_io); 5935 } 5936 } 5937 } 5938 5939 static int 5940 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 5941 struct spdk_bdev_io *bio_to_abort, 5942 spdk_bdev_io_completion_cb cb, void *cb_arg) 5943 { 5944 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5945 struct spdk_bdev_io *bdev_io; 5946 5947 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 5948 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 5949 /* TODO: Abort reset or abort request. */ 5950 return -ENOTSUP; 5951 } 5952 5953 bdev_io = bdev_channel_get_io(channel); 5954 if (bdev_io == NULL) { 5955 return -ENOMEM; 5956 } 5957 5958 bdev_io->internal.ch = channel; 5959 bdev_io->internal.desc = desc; 5960 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 5961 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5962 5963 if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bio_to_abort)) { 5964 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 5965 5966 /* Parent abort request is not submitted directly, but to manage its 5967 * execution add it to the submitted list here. 5968 */ 5969 bdev_io->internal.submit_tsc = spdk_get_ticks(); 5970 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 5971 5972 bdev_abort(bdev_io); 5973 5974 return 0; 5975 } 5976 5977 bdev_io->u.abort.bio_to_abort = bio_to_abort; 5978 5979 /* Submit the abort request to the underlying bdev module. 
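Completion is reported through the caller-supplied cb; when invoked from _bdev_abort() that is bdev_abort_io_done(), which updates the parent's split_outstanding count.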
*/ 5980 bdev_io_submit(bdev_io); 5981 5982 return 0; 5983 } 5984 5985 static uint32_t 5986 _bdev_abort(struct spdk_bdev_io *parent_io) 5987 { 5988 struct spdk_bdev_desc *desc = parent_io->internal.desc; 5989 struct spdk_bdev_channel *channel = parent_io->internal.ch; 5990 void *bio_cb_arg; 5991 struct spdk_bdev_io *bio_to_abort; 5992 uint32_t matched_ios; 5993 int rc; 5994 5995 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 5996 5997 /* matched_ios is returned and will be kept by the caller. 5998 * 5999 * This function will be used for two cases, 1) the same cb_arg is used for 6000 * multiple I/Os, 2) a single large I/O is split into smaller ones. 6001 * Incrementing split_outstanding directly here may confuse readers especially 6002 * for the 1st case. 6003 * 6004 * Completion of I/O abort is processed after stack unwinding. Hence this trick 6005 * works as expected. 6006 */ 6007 matched_ios = 0; 6008 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6009 6010 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 6011 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 6012 continue; 6013 } 6014 6015 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 6016 /* Any I/O which was submitted after this abort command should be excluded. */ 6017 continue; 6018 } 6019 6020 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 6021 if (rc != 0) { 6022 if (rc == -ENOMEM) { 6023 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 6024 } else { 6025 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6026 } 6027 break; 6028 } 6029 matched_ios++; 6030 } 6031 6032 return matched_ios; 6033 } 6034 6035 static void 6036 bdev_abort_retry(void *ctx) 6037 { 6038 struct spdk_bdev_io *parent_io = ctx; 6039 uint32_t matched_ios; 6040 6041 matched_ios = _bdev_abort(parent_io); 6042 6043 if (matched_ios == 0) { 6044 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6045 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6046 } else { 6047 /* For retry, the case that no target I/O was found is success 6048 * because it means target I/Os completed in the meantime. 6049 */ 6050 bdev_io_complete(parent_io); 6051 } 6052 return; 6053 } 6054 6055 /* Use split_outstanding to manage the progress of aborting I/Os. */ 6056 parent_io->u.bdev.split_outstanding = matched_ios; 6057 } 6058 6059 static void 6060 bdev_abort(struct spdk_bdev_io *parent_io) 6061 { 6062 uint32_t matched_ios; 6063 6064 matched_ios = _bdev_abort(parent_io); 6065 6066 if (matched_ios == 0) { 6067 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 6068 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 6069 } else { 6070 /* The case the no target I/O was found is failure. */ 6071 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6072 bdev_io_complete(parent_io); 6073 } 6074 return; 6075 } 6076 6077 /* Use split_outstanding to manage the progress of aborting I/Os. 
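Each child abort decrements it in bdev_abort_io_done(); the parent is completed (or retried on NOMEM) once the count reaches zero.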
*/ 6078 parent_io->u.bdev.split_outstanding = matched_ios; 6079 } 6080 6081 int 6082 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 6083 void *bio_cb_arg, 6084 spdk_bdev_io_completion_cb cb, void *cb_arg) 6085 { 6086 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6087 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6088 struct spdk_bdev_io *bdev_io; 6089 6090 if (bio_cb_arg == NULL) { 6091 return -EINVAL; 6092 } 6093 6094 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 6095 return -ENOTSUP; 6096 } 6097 6098 bdev_io = bdev_channel_get_io(channel); 6099 if (bdev_io == NULL) { 6100 return -ENOMEM; 6101 } 6102 6103 bdev_io->internal.ch = channel; 6104 bdev_io->internal.desc = desc; 6105 bdev_io->internal.submit_tsc = spdk_get_ticks(); 6106 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 6107 bdev_io_init(bdev_io, bdev, cb_arg, cb); 6108 6109 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 6110 6111 /* Parent abort request is not submitted directly, but to manage its execution, 6112 * add it to the submitted list here. 6113 */ 6114 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 6115 6116 bdev_abort(bdev_io); 6117 6118 return 0; 6119 } 6120 6121 int 6122 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 6123 struct spdk_bdev_io_wait_entry *entry) 6124 { 6125 struct spdk_bdev_channel *channel = __io_ch_to_bdev_ch(ch); 6126 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 6127 6128 if (bdev != entry->bdev) { 6129 SPDK_ERRLOG("bdevs do not match\n"); 6130 return -EINVAL; 6131 } 6132 6133 if (mgmt_ch->per_thread_cache_count > 0) { 6134 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 6135 return -EINVAL; 6136 } 6137 6138 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 6139 return 0; 6140 } 6141 6142 static inline void 6143 bdev_io_update_io_stat(struct spdk_bdev_io *bdev_io, uint64_t tsc_diff) 6144 { 6145 struct spdk_bdev_io_stat *io_stat = bdev_io->internal.ch->stat; 6146 uint64_t num_blocks = bdev_io->u.bdev.num_blocks; 6147 uint32_t blocklen = bdev_io->bdev->blocklen; 6148 6149 if (spdk_likely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS)) { 6150 switch (bdev_io->type) { 6151 case SPDK_BDEV_IO_TYPE_READ: 6152 io_stat->bytes_read += num_blocks * blocklen; 6153 io_stat->num_read_ops++; 6154 io_stat->read_latency_ticks += tsc_diff; 6155 if (io_stat->max_read_latency_ticks < tsc_diff) { 6156 io_stat->max_read_latency_ticks = tsc_diff; 6157 } 6158 if (io_stat->min_read_latency_ticks > tsc_diff) { 6159 io_stat->min_read_latency_ticks = tsc_diff; 6160 } 6161 break; 6162 case SPDK_BDEV_IO_TYPE_WRITE: 6163 io_stat->bytes_written += num_blocks * blocklen; 6164 io_stat->num_write_ops++; 6165 io_stat->write_latency_ticks += tsc_diff; 6166 if (io_stat->max_write_latency_ticks < tsc_diff) { 6167 io_stat->max_write_latency_ticks = tsc_diff; 6168 } 6169 if (io_stat->min_write_latency_ticks > tsc_diff) { 6170 io_stat->min_write_latency_ticks = tsc_diff; 6171 } 6172 break; 6173 case SPDK_BDEV_IO_TYPE_UNMAP: 6174 io_stat->bytes_unmapped += num_blocks * blocklen; 6175 io_stat->num_unmap_ops++; 6176 io_stat->unmap_latency_ticks += tsc_diff; 6177 if (io_stat->max_unmap_latency_ticks < tsc_diff) { 6178 io_stat->max_unmap_latency_ticks = tsc_diff; 6179 } 6180 if (io_stat->min_unmap_latency_ticks > tsc_diff) { 6181 io_stat->min_unmap_latency_ticks = tsc_diff; 6182 } 6183 break; 6184 case SPDK_BDEV_IO_TYPE_ZCOPY: 6185 /* Track the data 
in the start phase only */ 6186 if (bdev_io->u.bdev.zcopy.start) { 6187 if (bdev_io->u.bdev.zcopy.populate) { 6188 io_stat->bytes_read += num_blocks * blocklen; 6189 io_stat->num_read_ops++; 6190 io_stat->read_latency_ticks += tsc_diff; 6191 if (io_stat->max_read_latency_ticks < tsc_diff) { 6192 io_stat->max_read_latency_ticks = tsc_diff; 6193 } 6194 if (io_stat->min_read_latency_ticks > tsc_diff) { 6195 io_stat->min_read_latency_ticks = tsc_diff; 6196 } 6197 } else { 6198 io_stat->bytes_written += num_blocks * blocklen; 6199 io_stat->num_write_ops++; 6200 io_stat->write_latency_ticks += tsc_diff; 6201 if (io_stat->max_write_latency_ticks < tsc_diff) { 6202 io_stat->max_write_latency_ticks = tsc_diff; 6203 } 6204 if (io_stat->min_write_latency_ticks > tsc_diff) { 6205 io_stat->min_write_latency_ticks = tsc_diff; 6206 } 6207 } 6208 } 6209 break; 6210 case SPDK_BDEV_IO_TYPE_COPY: 6211 io_stat->bytes_copied += num_blocks * blocklen; 6212 io_stat->num_copy_ops++; 6213 bdev_io->internal.ch->stat->copy_latency_ticks += tsc_diff; 6214 if (io_stat->max_copy_latency_ticks < tsc_diff) { 6215 io_stat->max_copy_latency_ticks = tsc_diff; 6216 } 6217 if (io_stat->min_copy_latency_ticks > tsc_diff) { 6218 io_stat->min_copy_latency_ticks = tsc_diff; 6219 } 6220 break; 6221 default: 6222 break; 6223 } 6224 } 6225 6226 #ifdef SPDK_CONFIG_VTUNE 6227 uint64_t now_tsc = spdk_get_ticks(); 6228 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 6229 uint64_t data[5]; 6230 struct spdk_bdev_io_stat *prev_stat = bdev_io->internal.ch->prev_stat; 6231 6232 data[0] = io_stat->num_read_ops - prev_stat->num_read_ops; 6233 data[1] = io_stat->bytes_read - prev_stat->bytes_read; 6234 data[2] = io_stat->num_write_ops - prev_stat->num_write_ops; 6235 data[3] = io_stat->bytes_written - prev_stat->bytes_written; 6236 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 6237 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 6238 6239 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 6240 __itt_metadata_u64, 5, data); 6241 6242 memcpy(prev_stat, io_stat, sizeof(struct spdk_bdev_io_stat)); 6243 bdev_io->internal.ch->start_tsc = now_tsc; 6244 } 6245 #endif 6246 } 6247 6248 static inline void 6249 bdev_io_complete(void *ctx) 6250 { 6251 struct spdk_bdev_io *bdev_io = ctx; 6252 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6253 uint64_t tsc, tsc_diff; 6254 6255 if (spdk_unlikely(bdev_io->internal.in_submit_request || bdev_io->internal.io_submit_ch)) { 6256 /* 6257 * Send the completion to the thread that originally submitted the I/O, 6258 * which may not be the current thread in the case of QoS. 6259 */ 6260 if (bdev_io->internal.io_submit_ch) { 6261 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 6262 bdev_io->internal.io_submit_ch = NULL; 6263 } 6264 6265 /* 6266 * Defer completion to avoid potential infinite recursion if the 6267 * user's completion callback issues a new I/O. 
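The completion is re-queued to the I/O's submitting thread with spdk_thread_send_msg() instead of running inline.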
6268 */ 6269 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6270 bdev_io_complete, bdev_io); 6271 return; 6272 } 6273 6274 tsc = spdk_get_ticks(); 6275 tsc_diff = tsc - bdev_io->internal.submit_tsc; 6276 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 6277 bdev_io->internal.caller_ctx); 6278 6279 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 6280 6281 if (bdev_io->internal.ch->histogram) { 6282 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 6283 } 6284 6285 bdev_io_update_io_stat(bdev_io, tsc_diff); 6286 6287 assert(bdev_io->internal.cb != NULL); 6288 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 6289 6290 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 6291 bdev_io->internal.caller_ctx); 6292 } 6293 6294 static void bdev_destroy_cb(void *io_device); 6295 6296 static void 6297 bdev_reset_complete(struct spdk_bdev *bdev, void *_ctx, int status) 6298 { 6299 struct spdk_bdev_io *bdev_io = _ctx; 6300 6301 if (bdev_io->u.reset.ch_ref != NULL) { 6302 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 6303 bdev_io->u.reset.ch_ref = NULL; 6304 } 6305 6306 bdev_io_complete(bdev_io); 6307 6308 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 6309 TAILQ_EMPTY(&bdev->internal.open_descs)) { 6310 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6311 } 6312 } 6313 6314 static void 6315 bdev_unfreeze_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6316 struct spdk_io_channel *_ch, void *_ctx) 6317 { 6318 struct spdk_bdev_io *bdev_io = _ctx; 6319 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 6320 struct spdk_bdev_io *queued_reset; 6321 6322 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 6323 while (!TAILQ_EMPTY(&ch->queued_resets)) { 6324 queued_reset = TAILQ_FIRST(&ch->queued_resets); 6325 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 6326 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 6327 } 6328 6329 spdk_bdev_for_each_channel_continue(i, 0); 6330 } 6331 6332 void 6333 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 6334 { 6335 struct spdk_bdev *bdev = bdev_io->bdev; 6336 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6337 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 6338 6339 bdev_io->internal.status = status; 6340 6341 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 6342 bool unlock_channels = false; 6343 6344 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 6345 SPDK_ERRLOG("NOMEM returned for reset\n"); 6346 } 6347 spdk_spin_lock(&bdev->internal.spinlock); 6348 if (bdev_io == bdev->internal.reset_in_progress) { 6349 bdev->internal.reset_in_progress = NULL; 6350 unlock_channels = true; 6351 } 6352 spdk_spin_unlock(&bdev->internal.spinlock); 6353 6354 if (unlock_channels) { 6355 spdk_bdev_for_each_channel(bdev, bdev_unfreeze_channel, bdev_io, 6356 bdev_reset_complete); 6357 return; 6358 } 6359 } else { 6360 if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0)) { 6361 _bdev_io_push_bounce_data_buffer(bdev_io, _bdev_io_complete_push_bounce_done); 6362 /* bdev IO will be completed in the callback */ 6363 return; 6364 } 6365 6366 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 6367 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 6368 return; 6369 } 6370 } 6371 6372 bdev_io_complete(bdev_io); 6373 } 6374 6375 void 6376 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum 
spdk_scsi_status sc, 6377 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 6378 { 6379 if (sc == SPDK_SCSI_STATUS_GOOD) { 6380 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6381 } else { 6382 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 6383 bdev_io->internal.error.scsi.sc = sc; 6384 bdev_io->internal.error.scsi.sk = sk; 6385 bdev_io->internal.error.scsi.asc = asc; 6386 bdev_io->internal.error.scsi.ascq = ascq; 6387 } 6388 6389 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6390 } 6391 6392 void 6393 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 6394 int *sc, int *sk, int *asc, int *ascq) 6395 { 6396 assert(sc != NULL); 6397 assert(sk != NULL); 6398 assert(asc != NULL); 6399 assert(ascq != NULL); 6400 6401 switch (bdev_io->internal.status) { 6402 case SPDK_BDEV_IO_STATUS_SUCCESS: 6403 *sc = SPDK_SCSI_STATUS_GOOD; 6404 *sk = SPDK_SCSI_SENSE_NO_SENSE; 6405 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 6406 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 6407 break; 6408 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 6409 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 6410 break; 6411 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 6412 *sc = bdev_io->internal.error.scsi.sc; 6413 *sk = bdev_io->internal.error.scsi.sk; 6414 *asc = bdev_io->internal.error.scsi.asc; 6415 *ascq = bdev_io->internal.error.scsi.ascq; 6416 break; 6417 default: 6418 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 6419 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 6420 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 6421 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 6422 break; 6423 } 6424 } 6425 6426 void 6427 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 6428 { 6429 if (aio_result == 0) { 6430 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6431 } else { 6432 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 6433 } 6434 6435 bdev_io->internal.error.aio_result = aio_result; 6436 6437 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6438 } 6439 6440 void 6441 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 6442 { 6443 assert(aio_result != NULL); 6444 6445 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 6446 *aio_result = bdev_io->internal.error.aio_result; 6447 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6448 *aio_result = 0; 6449 } else { 6450 *aio_result = -EIO; 6451 } 6452 } 6453 6454 void 6455 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 6456 { 6457 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 6458 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6459 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 6460 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 6461 } else { 6462 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 6463 } 6464 6465 bdev_io->internal.error.nvme.cdw0 = cdw0; 6466 bdev_io->internal.error.nvme.sct = sct; 6467 bdev_io->internal.error.nvme.sc = sc; 6468 6469 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6470 } 6471 6472 void 6473 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 6474 { 6475 assert(sct != NULL); 6476 assert(sc != NULL); 6477 assert(cdw0 != NULL); 6478 6479 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 6480 *sct = SPDK_NVME_SCT_GENERIC; 6481 *sc = SPDK_NVME_SC_SUCCESS; 6482 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) 
{ 6483 *cdw0 = 0; 6484 } else { 6485 *cdw0 = 1U; 6486 } 6487 return; 6488 } 6489 6490 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 6491 *sct = bdev_io->internal.error.nvme.sct; 6492 *sc = bdev_io->internal.error.nvme.sc; 6493 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6494 *sct = SPDK_NVME_SCT_GENERIC; 6495 *sc = SPDK_NVME_SC_SUCCESS; 6496 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 6497 *sct = SPDK_NVME_SCT_GENERIC; 6498 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6499 } else { 6500 *sct = SPDK_NVME_SCT_GENERIC; 6501 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6502 } 6503 6504 *cdw0 = bdev_io->internal.error.nvme.cdw0; 6505 } 6506 6507 void 6508 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 6509 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 6510 { 6511 assert(first_sct != NULL); 6512 assert(first_sc != NULL); 6513 assert(second_sct != NULL); 6514 assert(second_sc != NULL); 6515 assert(cdw0 != NULL); 6516 6517 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 6518 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 6519 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 6520 *first_sct = bdev_io->internal.error.nvme.sct; 6521 *first_sc = bdev_io->internal.error.nvme.sc; 6522 *second_sct = SPDK_NVME_SCT_GENERIC; 6523 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6524 } else { 6525 *first_sct = SPDK_NVME_SCT_GENERIC; 6526 *first_sc = SPDK_NVME_SC_SUCCESS; 6527 *second_sct = bdev_io->internal.error.nvme.sct; 6528 *second_sc = bdev_io->internal.error.nvme.sc; 6529 } 6530 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 6531 *first_sct = SPDK_NVME_SCT_GENERIC; 6532 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6533 *second_sct = SPDK_NVME_SCT_GENERIC; 6534 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6535 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6536 *first_sct = SPDK_NVME_SCT_GENERIC; 6537 *first_sc = SPDK_NVME_SC_SUCCESS; 6538 *second_sct = SPDK_NVME_SCT_GENERIC; 6539 *second_sc = SPDK_NVME_SC_SUCCESS; 6540 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 6541 *first_sct = SPDK_NVME_SCT_GENERIC; 6542 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6543 *second_sct = SPDK_NVME_SCT_GENERIC; 6544 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6545 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 6546 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 6547 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 6548 *second_sct = SPDK_NVME_SCT_GENERIC; 6549 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6550 } else { 6551 *first_sct = SPDK_NVME_SCT_GENERIC; 6552 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6553 *second_sct = SPDK_NVME_SCT_GENERIC; 6554 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6555 } 6556 6557 *cdw0 = bdev_io->internal.error.nvme.cdw0; 6558 } 6559 6560 struct spdk_thread * 6561 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 6562 { 6563 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 6564 } 6565 6566 struct spdk_io_channel * 6567 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 6568 { 6569 return bdev_io->internal.ch->channel; 6570 } 6571 6572 static int 6573 bdev_register(struct spdk_bdev *bdev) 6574 { 6575 char *bdev_name; 6576 char uuid[SPDK_UUID_STRING_LEN]; 6577 int ret; 6578 6579 assert(bdev->module != NULL); 6580 6581 if (!bdev->name) { 6582 SPDK_ERRLOG("Bdev name is 
NULL\n"); 6583 return -EINVAL; 6584 } 6585 6586 if (!strlen(bdev->name)) { 6587 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 6588 return -EINVAL; 6589 } 6590 6591 /* Users often register their own I/O devices using the bdev name. In 6592 * order to avoid conflicts, prepend bdev_. */ 6593 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 6594 if (!bdev_name) { 6595 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 6596 return -ENOMEM; 6597 } 6598 6599 bdev->internal.stat = bdev_io_stat_alloc(); 6600 if (!bdev->internal.stat) { 6601 SPDK_ERRLOG("Unable to allocate I/O statistics structure.\n"); 6602 free(bdev_name); 6603 return -ENOMEM; 6604 } 6605 6606 bdev->internal.status = SPDK_BDEV_STATUS_READY; 6607 bdev->internal.measured_queue_depth = UINT64_MAX; 6608 bdev->internal.claim_module = NULL; 6609 bdev->internal.qd_poller = NULL; 6610 bdev->internal.qos = NULL; 6611 6612 TAILQ_INIT(&bdev->internal.open_descs); 6613 TAILQ_INIT(&bdev->internal.locked_ranges); 6614 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 6615 TAILQ_INIT(&bdev->aliases); 6616 6617 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 6618 if (ret != 0) { 6619 bdev_io_stat_free(bdev->internal.stat); 6620 free(bdev_name); 6621 return ret; 6622 } 6623 6624 /* UUID has to be specified by the user or defined by bdev itself. 6625 * Otherwise this field must remain empty, to indicate that this 6626 * value cannot be depended upon. */ 6627 if (!spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { 6628 /* Add the UUID alias only if it's different than the name */ 6629 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 6630 if (strcmp(bdev->name, uuid) != 0) { 6631 ret = spdk_bdev_alias_add(bdev, uuid); 6632 if (ret != 0) { 6633 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 6634 bdev_name_del(&bdev->internal.bdev_name); 6635 bdev_io_stat_free(bdev->internal.stat); 6636 free(bdev_name); 6637 return ret; 6638 } 6639 } 6640 } 6641 6642 if (spdk_bdev_get_buf_align(bdev) > 1) { 6643 if (bdev->split_on_optimal_io_boundary) { 6644 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 6645 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 6646 } else { 6647 bdev->split_on_optimal_io_boundary = true; 6648 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 6649 } 6650 } 6651 6652 /* If the user didn't specify a write unit size, set it to one. 
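A write unit of one block means writes of any whole number of blocks are accepted.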
*/ 6653 if (bdev->write_unit_size == 0) { 6654 bdev->write_unit_size = 1; 6655 } 6656 6657 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 6658 if (bdev->acwu == 0) { 6659 bdev->acwu = bdev->write_unit_size; 6660 } 6661 6662 if (bdev->phys_blocklen == 0) { 6663 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 6664 } 6665 6666 bdev->internal.reset_in_progress = NULL; 6667 bdev->internal.qd_poll_in_progress = false; 6668 bdev->internal.period = 0; 6669 bdev->internal.new_period = 0; 6670 6671 spdk_io_device_register(__bdev_to_io_dev(bdev), 6672 bdev_channel_create, bdev_channel_destroy, 6673 sizeof(struct spdk_bdev_channel), 6674 bdev_name); 6675 6676 free(bdev_name); 6677 6678 spdk_spin_init(&bdev->internal.spinlock); 6679 6680 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 6681 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 6682 6683 return 0; 6684 } 6685 6686 static void 6687 bdev_destroy_cb(void *io_device) 6688 { 6689 int rc; 6690 struct spdk_bdev *bdev; 6691 spdk_bdev_unregister_cb cb_fn; 6692 void *cb_arg; 6693 6694 bdev = __bdev_from_io_dev(io_device); 6695 cb_fn = bdev->internal.unregister_cb; 6696 cb_arg = bdev->internal.unregister_ctx; 6697 6698 spdk_spin_destroy(&bdev->internal.spinlock); 6699 free(bdev->internal.qos); 6700 bdev_io_stat_free(bdev->internal.stat); 6701 6702 rc = bdev->fn_table->destruct(bdev->ctxt); 6703 if (rc < 0) { 6704 SPDK_ERRLOG("destruct failed\n"); 6705 } 6706 if (rc <= 0 && cb_fn != NULL) { 6707 cb_fn(cb_arg, rc); 6708 } 6709 } 6710 6711 void 6712 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 6713 { 6714 if (bdev->internal.unregister_cb != NULL) { 6715 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 6716 } 6717 } 6718 6719 static void 6720 _remove_notify(void *arg) 6721 { 6722 struct spdk_bdev_desc *desc = arg; 6723 6724 spdk_spin_lock(&desc->spinlock); 6725 desc->refs--; 6726 6727 if (!desc->closed) { 6728 spdk_spin_unlock(&desc->spinlock); 6729 desc->callback.event_fn(SPDK_BDEV_EVENT_REMOVE, desc->bdev, desc->callback.ctx); 6730 return; 6731 } else if (0 == desc->refs) { 6732 /* This descriptor was closed after this remove_notify message was sent. 6733 * spdk_bdev_close() could not free the descriptor since this message was 6734 * in flight, so we free it now using bdev_desc_free(). 6735 */ 6736 spdk_spin_unlock(&desc->spinlock); 6737 bdev_desc_free(desc); 6738 return; 6739 } 6740 spdk_spin_unlock(&desc->spinlock); 6741 } 6742 6743 /* returns: 0 - bdev removed and ready to be destructed. 6744 * -EBUSY - bdev can't be destructed yet. */ 6745 static int 6746 bdev_unregister_unsafe(struct spdk_bdev *bdev) 6747 { 6748 struct spdk_bdev_desc *desc, *tmp; 6749 int rc = 0; 6750 char uuid[SPDK_UUID_STRING_LEN]; 6751 6752 assert(spdk_spin_held(&g_bdev_mgr.spinlock)); 6753 assert(spdk_spin_held(&bdev->internal.spinlock)); 6754 6755 /* Notify each descriptor about hotremoval */ 6756 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 6757 rc = -EBUSY; 6758 spdk_spin_lock(&desc->spinlock); 6759 /* 6760 * Defer invocation of the event_cb to a separate message that will 6761 * run later on its thread. This ensures this context unwinds and 6762 * we don't recursively unregister this bdev again if the event_cb 6763 * immediately closes its descriptor. 
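The reference taken on desc just below is dropped in _remove_notify(), so the descriptor cannot be freed while that message is in flight.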
6764 */ 6765 desc->refs++; 6766 spdk_thread_send_msg(desc->thread, _remove_notify, desc); 6767 spdk_spin_unlock(&desc->spinlock); 6768 } 6769 6770 /* If there are no descriptors, proceed removing the bdev */ 6771 if (rc == 0) { 6772 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 6773 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 6774 6775 /* Delete the name and the UUID alias */ 6776 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 6777 bdev_name_del_unsafe(&bdev->internal.bdev_name); 6778 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 6779 6780 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 6781 6782 if (bdev->internal.reset_in_progress != NULL) { 6783 /* If reset is in progress, let the completion callback for reset 6784 * unregister the bdev. 6785 */ 6786 rc = -EBUSY; 6787 } 6788 } 6789 6790 return rc; 6791 } 6792 6793 static void 6794 bdev_unregister_abort_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 6795 struct spdk_io_channel *io_ch, void *_ctx) 6796 { 6797 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 6798 6799 bdev_channel_abort_queued_ios(bdev_ch); 6800 spdk_bdev_for_each_channel_continue(i, 0); 6801 } 6802 6803 static void 6804 bdev_unregister(struct spdk_bdev *bdev, void *_ctx, int status) 6805 { 6806 int rc; 6807 6808 spdk_spin_lock(&g_bdev_mgr.spinlock); 6809 spdk_spin_lock(&bdev->internal.spinlock); 6810 /* 6811 * Set the status to REMOVING after completing to abort channels. Otherwise, 6812 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 6813 * spdk_bdev_for_each_channel() is executed and spdk_io_device_unregister() 6814 * may fail. 6815 */ 6816 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 6817 rc = bdev_unregister_unsafe(bdev); 6818 spdk_spin_unlock(&bdev->internal.spinlock); 6819 spdk_spin_unlock(&g_bdev_mgr.spinlock); 6820 6821 if (rc == 0) { 6822 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6823 } 6824 } 6825 6826 void 6827 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 6828 { 6829 struct spdk_thread *thread; 6830 6831 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 6832 6833 thread = spdk_get_thread(); 6834 if (!thread) { 6835 /* The user called this from a non-SPDK thread. 
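Unregistration has to run on an SPDK thread, so report -ENOTSUP instead.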
*/ 6836 if (cb_fn != NULL) { 6837 cb_fn(cb_arg, -ENOTSUP); 6838 } 6839 return; 6840 } 6841 6842 spdk_spin_lock(&g_bdev_mgr.spinlock); 6843 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 6844 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 6845 spdk_spin_unlock(&g_bdev_mgr.spinlock); 6846 if (cb_fn) { 6847 cb_fn(cb_arg, -EBUSY); 6848 } 6849 return; 6850 } 6851 6852 spdk_spin_lock(&bdev->internal.spinlock); 6853 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 6854 bdev->internal.unregister_cb = cb_fn; 6855 bdev->internal.unregister_ctx = cb_arg; 6856 spdk_spin_unlock(&bdev->internal.spinlock); 6857 spdk_spin_unlock(&g_bdev_mgr.spinlock); 6858 6859 spdk_bdev_set_qd_sampling_period(bdev, 0); 6860 6861 spdk_bdev_for_each_channel(bdev, bdev_unregister_abort_channel, bdev, 6862 bdev_unregister); 6863 } 6864 6865 int 6866 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 6867 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 6868 { 6869 struct spdk_bdev_desc *desc; 6870 struct spdk_bdev *bdev; 6871 int rc; 6872 6873 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 6874 if (rc != 0) { 6875 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 6876 return rc; 6877 } 6878 6879 bdev = spdk_bdev_desc_get_bdev(desc); 6880 6881 if (bdev->module != module) { 6882 spdk_bdev_close(desc); 6883 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 6884 bdev_name); 6885 return -ENODEV; 6886 } 6887 6888 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 6889 6890 spdk_bdev_close(desc); 6891 6892 return 0; 6893 } 6894 6895 static int 6896 bdev_start_qos(struct spdk_bdev *bdev) 6897 { 6898 struct set_qos_limit_ctx *ctx; 6899 6900 /* Enable QoS */ 6901 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 6902 ctx = calloc(1, sizeof(*ctx)); 6903 if (ctx == NULL) { 6904 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 6905 return -ENOMEM; 6906 } 6907 ctx->bdev = bdev; 6908 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, bdev_enable_qos_done); 6909 } 6910 6911 return 0; 6912 } 6913 6914 static int 6915 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 6916 { 6917 struct spdk_thread *thread; 6918 int rc = 0; 6919 6920 thread = spdk_get_thread(); 6921 if (!thread) { 6922 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 6923 return -ENOTSUP; 6924 } 6925 6926 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 6927 spdk_get_thread()); 6928 6929 desc->bdev = bdev; 6930 desc->thread = thread; 6931 desc->write = write; 6932 6933 spdk_spin_lock(&bdev->internal.spinlock); 6934 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 6935 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 6936 spdk_spin_unlock(&bdev->internal.spinlock); 6937 return -ENODEV; 6938 } 6939 6940 if (write && bdev->internal.claim_module) { 6941 SPDK_ERRLOG("Could not open %s - %s module already claimed it\n", 6942 bdev->name, bdev->internal.claim_module->name); 6943 spdk_spin_unlock(&bdev->internal.spinlock); 6944 return -EPERM; 6945 } 6946 6947 rc = bdev_start_qos(bdev); 6948 if (rc != 0) { 6949 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 6950 spdk_spin_unlock(&bdev->internal.spinlock); 6951 return rc; 6952 } 6953 6954 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 6955 6956 spdk_spin_unlock(&bdev->internal.spinlock); 6957 6958 return 0; 6959 } 6960 6961 static int 6962 bdev_desc_alloc(struct spdk_bdev 
*bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 6963 struct spdk_bdev_desc **_desc) 6964 { 6965 struct spdk_bdev_desc *desc; 6966 unsigned int event_id; 6967 6968 desc = calloc(1, sizeof(*desc)); 6969 if (desc == NULL) { 6970 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 6971 return -ENOMEM; 6972 } 6973 6974 TAILQ_INIT(&desc->pending_media_events); 6975 TAILQ_INIT(&desc->free_media_events); 6976 6977 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 6978 desc->callback.event_fn = event_cb; 6979 desc->callback.ctx = event_ctx; 6980 spdk_spin_init(&desc->spinlock); 6981 6982 if (bdev->media_events) { 6983 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 6984 sizeof(*desc->media_events_buffer)); 6985 if (desc->media_events_buffer == NULL) { 6986 SPDK_ERRLOG("Failed to initialize media event pool\n"); 6987 bdev_desc_free(desc); 6988 return -ENOMEM; 6989 } 6990 6991 for (event_id = 0; event_id < MEDIA_EVENT_POOL_SIZE; ++event_id) { 6992 TAILQ_INSERT_TAIL(&desc->free_media_events, 6993 &desc->media_events_buffer[event_id], tailq); 6994 } 6995 } 6996 6997 *_desc = desc; 6998 6999 return 0; 7000 } 7001 7002 int 7003 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 7004 void *event_ctx, struct spdk_bdev_desc **_desc) 7005 { 7006 struct spdk_bdev_desc *desc; 7007 struct spdk_bdev *bdev; 7008 int rc; 7009 7010 if (event_cb == NULL) { 7011 SPDK_ERRLOG("Missing event callback function\n"); 7012 return -EINVAL; 7013 } 7014 7015 spdk_spin_lock(&g_bdev_mgr.spinlock); 7016 7017 bdev = bdev_get_by_name(bdev_name); 7018 7019 if (bdev == NULL) { 7020 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 7021 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7022 return -ENODEV; 7023 } 7024 7025 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 7026 if (rc != 0) { 7027 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7028 return rc; 7029 } 7030 7031 rc = bdev_open(bdev, write, desc); 7032 if (rc != 0) { 7033 bdev_desc_free(desc); 7034 desc = NULL; 7035 } 7036 7037 *_desc = desc; 7038 7039 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7040 7041 return rc; 7042 } 7043 7044 static void 7045 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 7046 { 7047 int rc; 7048 7049 spdk_spin_lock(&bdev->internal.spinlock); 7050 spdk_spin_lock(&desc->spinlock); 7051 7052 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 7053 7054 desc->closed = true; 7055 7056 if (0 == desc->refs) { 7057 spdk_spin_unlock(&desc->spinlock); 7058 bdev_desc_free(desc); 7059 } else { 7060 spdk_spin_unlock(&desc->spinlock); 7061 } 7062 7063 /* If no more descriptors, kill QoS channel */ 7064 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7065 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 7066 bdev->name, spdk_get_thread()); 7067 7068 if (bdev_qos_destroy(bdev)) { 7069 /* There isn't anything we can do to recover here. Just let the 7070 * old QoS poller keep running. The QoS handling won't change 7071 * cores when the user allocates a new channel, but it won't break. */ 7072 SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); 7073 } 7074 } 7075 7076 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 7077 rc = bdev_unregister_unsafe(bdev); 7078 spdk_spin_unlock(&bdev->internal.spinlock); 7079 7080 if (rc == 0) { 7081 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 7082 } 7083 } else { 7084 spdk_spin_unlock(&bdev->internal.spinlock); 7085 } 7086 } 7087 7088 void 7089 spdk_bdev_close(struct spdk_bdev_desc *desc) 7090 { 7091 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7092 7093 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 7094 spdk_get_thread()); 7095 7096 assert(desc->thread == spdk_get_thread()); 7097 7098 spdk_poller_unregister(&desc->io_timeout_poller); 7099 7100 spdk_spin_lock(&g_bdev_mgr.spinlock); 7101 7102 bdev_close(bdev, desc); 7103 7104 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7105 } 7106 7107 static void 7108 bdev_register_finished(void *arg) 7109 { 7110 struct spdk_bdev_desc *desc = arg; 7111 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7112 7113 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 7114 7115 spdk_spin_lock(&g_bdev_mgr.spinlock); 7116 7117 bdev_close(bdev, desc); 7118 7119 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7120 } 7121 7122 int 7123 spdk_bdev_register(struct spdk_bdev *bdev) 7124 { 7125 struct spdk_bdev_desc *desc; 7126 int rc; 7127 7128 rc = bdev_register(bdev); 7129 if (rc != 0) { 7130 return rc; 7131 } 7132 7133 /* A descriptor is opened to prevent bdev deletion during examination */ 7134 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7135 if (rc != 0) { 7136 spdk_bdev_unregister(bdev, NULL, NULL); 7137 return rc; 7138 } 7139 7140 rc = bdev_open(bdev, false, desc); 7141 if (rc != 0) { 7142 bdev_desc_free(desc); 7143 spdk_bdev_unregister(bdev, NULL, NULL); 7144 return rc; 7145 } 7146 7147 /* Examine configuration before initializing I/O */ 7148 bdev_examine(bdev); 7149 7150 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 7151 if (rc != 0) { 7152 bdev_close(bdev, desc); 7153 spdk_bdev_unregister(bdev, NULL, NULL); 7154 } 7155 7156 return rc; 7157 } 7158 7159 int 7160 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 7161 struct spdk_bdev_module *module) 7162 { 7163 if (bdev->internal.claim_module != NULL) { 7164 SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, 7165 bdev->internal.claim_module->name); 7166 return -EPERM; 7167 } 7168 7169 if (desc && !desc->write) { 7170 desc->write = true; 7171 } 7172 7173 bdev->internal.claim_module = module; 7174 return 0; 7175 } 7176 7177 void 7178 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 7179 { 7180 assert(bdev->internal.claim_module != NULL); 7181 bdev->internal.claim_module = NULL; 7182 } 7183 7184 struct spdk_bdev * 7185 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 7186 { 7187 assert(desc != NULL); 7188 return desc->bdev; 7189 } 7190 7191 int 7192 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 7193 { 7194 struct spdk_bdev *bdev, *tmp; 7195 struct spdk_bdev_desc *desc; 7196 int rc = 0; 7197 7198 assert(fn != NULL); 7199 7200 spdk_spin_lock(&g_bdev_mgr.spinlock); 7201 bdev = spdk_bdev_first(); 7202 while (bdev != NULL) { 7203 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7204 if (rc != 0) { 7205 break; 7206 } 7207 rc = bdev_open(bdev, false, desc); 7208 if (rc != 0) { 7209 bdev_desc_free(desc); 7210 if (rc == -ENODEV) { 7211 /* 
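-ENODEV means the bdev is already being unregistered but has not yet been removed from the list.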
Ignore the error and move to the next bdev. */ 7212 rc = 0; 7213 bdev = spdk_bdev_next(bdev); 7214 continue; 7215 } 7216 break; 7217 } 7218 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7219 7220 rc = fn(ctx, bdev); 7221 7222 spdk_spin_lock(&g_bdev_mgr.spinlock); 7223 tmp = spdk_bdev_next(bdev); 7224 bdev_close(bdev, desc); 7225 if (rc != 0) { 7226 break; 7227 } 7228 bdev = tmp; 7229 } 7230 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7231 7232 return rc; 7233 } 7234 7235 int 7236 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 7237 { 7238 struct spdk_bdev *bdev, *tmp; 7239 struct spdk_bdev_desc *desc; 7240 int rc = 0; 7241 7242 assert(fn != NULL); 7243 7244 spdk_spin_lock(&g_bdev_mgr.spinlock); 7245 bdev = spdk_bdev_first_leaf(); 7246 while (bdev != NULL) { 7247 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 7248 if (rc != 0) { 7249 break; 7250 } 7251 rc = bdev_open(bdev, false, desc); 7252 if (rc != 0) { 7253 bdev_desc_free(desc); 7254 if (rc == -ENODEV) { 7255 /* Ignore the error and move to the next bdev. */ 7256 rc = 0; 7257 bdev = spdk_bdev_next_leaf(bdev); 7258 continue; 7259 } 7260 break; 7261 } 7262 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7263 7264 rc = fn(ctx, bdev); 7265 7266 spdk_spin_lock(&g_bdev_mgr.spinlock); 7267 tmp = spdk_bdev_next_leaf(bdev); 7268 bdev_close(bdev, desc); 7269 if (rc != 0) { 7270 break; 7271 } 7272 bdev = tmp; 7273 } 7274 spdk_spin_unlock(&g_bdev_mgr.spinlock); 7275 7276 return rc; 7277 } 7278 7279 void 7280 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 7281 { 7282 struct iovec *iovs; 7283 int iovcnt; 7284 7285 if (bdev_io == NULL) { 7286 return; 7287 } 7288 7289 switch (bdev_io->type) { 7290 case SPDK_BDEV_IO_TYPE_READ: 7291 case SPDK_BDEV_IO_TYPE_WRITE: 7292 case SPDK_BDEV_IO_TYPE_ZCOPY: 7293 iovs = bdev_io->u.bdev.iovs; 7294 iovcnt = bdev_io->u.bdev.iovcnt; 7295 break; 7296 default: 7297 iovs = NULL; 7298 iovcnt = 0; 7299 break; 7300 } 7301 7302 if (iovp) { 7303 *iovp = iovs; 7304 } 7305 if (iovcntp) { 7306 *iovcntp = iovcnt; 7307 } 7308 } 7309 7310 void * 7311 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 7312 { 7313 if (bdev_io == NULL) { 7314 return NULL; 7315 } 7316 7317 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 7318 return NULL; 7319 } 7320 7321 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 7322 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 7323 return bdev_io->u.bdev.md_buf; 7324 } 7325 7326 return NULL; 7327 } 7328 7329 void * 7330 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 7331 { 7332 if (bdev_io == NULL) { 7333 assert(false); 7334 return NULL; 7335 } 7336 7337 return bdev_io->internal.caller_ctx; 7338 } 7339 7340 void 7341 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 7342 { 7343 7344 if (spdk_bdev_module_list_find(bdev_module->name)) { 7345 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 7346 assert(false); 7347 } 7348 7349 /* 7350 * Modules with examine callbacks must be initialized first, so they are 7351 * ready to handle examine callbacks from later modules that will 7352 * register physical bdevs. 
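Such modules are therefore inserted at the head of the module list, while modules without examine callbacks are appended at the tail.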
7353 */ 7354 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 7355 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 7356 } else { 7357 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 7358 } 7359 } 7360 7361 struct spdk_bdev_module * 7362 spdk_bdev_module_list_find(const char *name) 7363 { 7364 struct spdk_bdev_module *bdev_module; 7365 7366 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 7367 if (strcmp(name, bdev_module->name) == 0) { 7368 break; 7369 } 7370 } 7371 7372 return bdev_module; 7373 } 7374 7375 static void 7376 bdev_write_zero_buffer_next(void *_bdev_io) 7377 { 7378 struct spdk_bdev_io *bdev_io = _bdev_io; 7379 uint64_t num_bytes, num_blocks; 7380 void *md_buf = NULL; 7381 int rc; 7382 7383 num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * 7384 bdev_io->u.bdev.split_remaining_num_blocks, 7385 ZERO_BUFFER_SIZE); 7386 num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); 7387 num_blocks -= num_blocks % bdev_io->bdev->write_unit_size; 7388 7389 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 7390 md_buf = (char *)g_bdev_mgr.zero_buffer + 7391 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 7392 } 7393 7394 rc = bdev_write_blocks_with_md(bdev_io->internal.desc, 7395 spdk_io_channel_from_ctx(bdev_io->internal.ch), 7396 g_bdev_mgr.zero_buffer, md_buf, 7397 bdev_io->u.bdev.split_current_offset_blocks, num_blocks, 7398 bdev_write_zero_buffer_done, bdev_io); 7399 if (rc == 0) { 7400 bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; 7401 bdev_io->u.bdev.split_current_offset_blocks += num_blocks; 7402 } else if (rc == -ENOMEM) { 7403 bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); 7404 } else { 7405 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7406 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 7407 } 7408 } 7409 7410 static void 7411 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 7412 { 7413 struct spdk_bdev_io *parent_io = cb_arg; 7414 7415 spdk_bdev_free_io(bdev_io); 7416 7417 if (!success) { 7418 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7419 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 7420 return; 7421 } 7422 7423 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 7424 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 7425 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 7426 return; 7427 } 7428 7429 bdev_write_zero_buffer_next(parent_io); 7430 } 7431 7432 static void 7433 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 7434 { 7435 spdk_spin_lock(&ctx->bdev->internal.spinlock); 7436 ctx->bdev->internal.qos_mod_in_progress = false; 7437 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 7438 7439 if (ctx->cb_fn) { 7440 ctx->cb_fn(ctx->cb_arg, status); 7441 } 7442 free(ctx); 7443 } 7444 7445 static void 7446 bdev_disable_qos_done(void *cb_arg) 7447 { 7448 struct set_qos_limit_ctx *ctx = cb_arg; 7449 struct spdk_bdev *bdev = ctx->bdev; 7450 struct spdk_bdev_io *bdev_io; 7451 struct spdk_bdev_qos *qos; 7452 7453 spdk_spin_lock(&bdev->internal.spinlock); 7454 qos = bdev->internal.qos; 7455 bdev->internal.qos = NULL; 7456 spdk_spin_unlock(&bdev->internal.spinlock); 7457 7458 while (!TAILQ_EMPTY(&qos->queued)) { 7459 /* Send queued I/O back to their original thread for resubmission. 
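Each I/O's channel is restored from io_submit_ch (when set) before _bdev_io_submit() is sent to its original thread.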
*/ 7460 bdev_io = TAILQ_FIRST(&qos->queued); 7461 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 7462 7463 if (bdev_io->internal.io_submit_ch) { 7464 /* 7465 * Channel was changed when sending it to the QoS thread - change it back 7466 * before sending it back to the original thread. 7467 */ 7468 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 7469 bdev_io->internal.io_submit_ch = NULL; 7470 } 7471 7472 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7473 _bdev_io_submit, bdev_io); 7474 } 7475 7476 if (qos->thread != NULL) { 7477 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 7478 spdk_poller_unregister(&qos->poller); 7479 } 7480 7481 free(qos); 7482 7483 bdev_set_qos_limit_done(ctx, 0); 7484 } 7485 7486 static void 7487 bdev_disable_qos_msg_done(struct spdk_bdev *bdev, void *_ctx, int status) 7488 { 7489 struct set_qos_limit_ctx *ctx = _ctx; 7490 struct spdk_thread *thread; 7491 7492 spdk_spin_lock(&bdev->internal.spinlock); 7493 thread = bdev->internal.qos->thread; 7494 spdk_spin_unlock(&bdev->internal.spinlock); 7495 7496 if (thread != NULL) { 7497 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 7498 } else { 7499 bdev_disable_qos_done(ctx); 7500 } 7501 } 7502 7503 static void 7504 bdev_disable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7505 struct spdk_io_channel *ch, void *_ctx) 7506 { 7507 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 7508 7509 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 7510 7511 spdk_bdev_for_each_channel_continue(i, 0); 7512 } 7513 7514 static void 7515 bdev_update_qos_rate_limit_msg(void *cb_arg) 7516 { 7517 struct set_qos_limit_ctx *ctx = cb_arg; 7518 struct spdk_bdev *bdev = ctx->bdev; 7519 7520 spdk_spin_lock(&bdev->internal.spinlock); 7521 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 7522 spdk_spin_unlock(&bdev->internal.spinlock); 7523 7524 bdev_set_qos_limit_done(ctx, 0); 7525 } 7526 7527 static void 7528 bdev_enable_qos_msg(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7529 struct spdk_io_channel *ch, void *_ctx) 7530 { 7531 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 7532 7533 spdk_spin_lock(&bdev->internal.spinlock); 7534 bdev_enable_qos(bdev, bdev_ch); 7535 spdk_spin_unlock(&bdev->internal.spinlock); 7536 spdk_bdev_for_each_channel_continue(i, 0); 7537 } 7538 7539 static void 7540 bdev_enable_qos_done(struct spdk_bdev *bdev, void *_ctx, int status) 7541 { 7542 struct set_qos_limit_ctx *ctx = _ctx; 7543 7544 bdev_set_qos_limit_done(ctx, status); 7545 } 7546 7547 static void 7548 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 7549 { 7550 int i; 7551 7552 assert(bdev->internal.qos != NULL); 7553 7554 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 7555 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 7556 bdev->internal.qos->rate_limits[i].limit = limits[i]; 7557 7558 if (limits[i] == 0) { 7559 bdev->internal.qos->rate_limits[i].limit = 7560 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 7561 } 7562 } 7563 } 7564 } 7565 7566 void 7567 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 7568 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 7569 { 7570 struct set_qos_limit_ctx *ctx; 7571 uint32_t limit_set_complement; 7572 uint64_t min_limit_per_sec; 7573 int i; 7574 bool disable_rate_limit = true; 7575 7576 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 7577 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 7578 continue; 7579 } 7580 7581 if (limits[i] > 0) { 7582 disable_rate_limit = 
false; 7583 } 7584 7585 if (bdev_qos_is_iops_rate_limit(i) == true) { 7586 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 7587 } else { 7588 /* Change from megabyte to byte rate limit */ 7589 limits[i] = limits[i] * 1024 * 1024; 7590 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 7591 } 7592 7593 limit_set_complement = limits[i] % min_limit_per_sec; 7594 if (limit_set_complement) { 7595 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 7596 limits[i], min_limit_per_sec); 7597 limits[i] += min_limit_per_sec - limit_set_complement; 7598 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 7599 } 7600 } 7601 7602 ctx = calloc(1, sizeof(*ctx)); 7603 if (ctx == NULL) { 7604 cb_fn(cb_arg, -ENOMEM); 7605 return; 7606 } 7607 7608 ctx->cb_fn = cb_fn; 7609 ctx->cb_arg = cb_arg; 7610 ctx->bdev = bdev; 7611 7612 spdk_spin_lock(&bdev->internal.spinlock); 7613 if (bdev->internal.qos_mod_in_progress) { 7614 spdk_spin_unlock(&bdev->internal.spinlock); 7615 free(ctx); 7616 cb_fn(cb_arg, -EAGAIN); 7617 return; 7618 } 7619 bdev->internal.qos_mod_in_progress = true; 7620 7621 if (disable_rate_limit == true && bdev->internal.qos) { 7622 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 7623 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 7624 (bdev->internal.qos->rate_limits[i].limit > 0 && 7625 bdev->internal.qos->rate_limits[i].limit != 7626 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 7627 disable_rate_limit = false; 7628 break; 7629 } 7630 } 7631 } 7632 7633 if (disable_rate_limit == false) { 7634 if (bdev->internal.qos == NULL) { 7635 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 7636 if (!bdev->internal.qos) { 7637 spdk_spin_unlock(&bdev->internal.spinlock); 7638 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 7639 bdev_set_qos_limit_done(ctx, -ENOMEM); 7640 return; 7641 } 7642 } 7643 7644 if (bdev->internal.qos->thread == NULL) { 7645 /* Enabling */ 7646 bdev_set_qos_rate_limits(bdev, limits); 7647 7648 spdk_bdev_for_each_channel(bdev, bdev_enable_qos_msg, ctx, 7649 bdev_enable_qos_done); 7650 } else { 7651 /* Updating */ 7652 bdev_set_qos_rate_limits(bdev, limits); 7653 7654 spdk_thread_send_msg(bdev->internal.qos->thread, 7655 bdev_update_qos_rate_limit_msg, ctx); 7656 } 7657 } else { 7658 if (bdev->internal.qos != NULL) { 7659 bdev_set_qos_rate_limits(bdev, limits); 7660 7661 /* Disabling */ 7662 spdk_bdev_for_each_channel(bdev, bdev_disable_qos_msg, ctx, 7663 bdev_disable_qos_msg_done); 7664 } else { 7665 spdk_spin_unlock(&bdev->internal.spinlock); 7666 bdev_set_qos_limit_done(ctx, 0); 7667 return; 7668 } 7669 } 7670 7671 spdk_spin_unlock(&bdev->internal.spinlock); 7672 } 7673 7674 struct spdk_bdev_histogram_ctx { 7675 spdk_bdev_histogram_status_cb cb_fn; 7676 void *cb_arg; 7677 struct spdk_bdev *bdev; 7678 int status; 7679 }; 7680 7681 static void 7682 bdev_histogram_disable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 7683 { 7684 struct spdk_bdev_histogram_ctx *ctx = _ctx; 7685 7686 spdk_spin_lock(&ctx->bdev->internal.spinlock); 7687 ctx->bdev->internal.histogram_in_progress = false; 7688 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 7689 ctx->cb_fn(ctx->cb_arg, ctx->status); 7690 free(ctx); 7691 } 7692 7693 static void 7694 bdev_histogram_disable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7695 struct spdk_io_channel *_ch, void *_ctx) 7696 { 7697 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7698 7699 if (ch->histogram != NULL) { 7700 
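/* Disabling (or unwinding a failed enable) simply frees this channel's histogram data. */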
spdk_histogram_data_free(ch->histogram); 7701 ch->histogram = NULL; 7702 } 7703 spdk_bdev_for_each_channel_continue(i, 0); 7704 } 7705 7706 static void 7707 bdev_histogram_enable_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 7708 { 7709 struct spdk_bdev_histogram_ctx *ctx = _ctx; 7710 7711 if (status != 0) { 7712 ctx->status = status; 7713 ctx->bdev->internal.histogram_enabled = false; 7714 spdk_bdev_for_each_channel(ctx->bdev, bdev_histogram_disable_channel, ctx, 7715 bdev_histogram_disable_channel_cb); 7716 } else { 7717 spdk_spin_lock(&ctx->bdev->internal.spinlock); 7718 ctx->bdev->internal.histogram_in_progress = false; 7719 spdk_spin_unlock(&ctx->bdev->internal.spinlock); 7720 ctx->cb_fn(ctx->cb_arg, ctx->status); 7721 free(ctx); 7722 } 7723 } 7724 7725 static void 7726 bdev_histogram_enable_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7727 struct spdk_io_channel *_ch, void *_ctx) 7728 { 7729 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7730 int status = 0; 7731 7732 if (ch->histogram == NULL) { 7733 ch->histogram = spdk_histogram_data_alloc(); 7734 if (ch->histogram == NULL) { 7735 status = -ENOMEM; 7736 } 7737 } 7738 7739 spdk_bdev_for_each_channel_continue(i, status); 7740 } 7741 7742 void 7743 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 7744 void *cb_arg, bool enable) 7745 { 7746 struct spdk_bdev_histogram_ctx *ctx; 7747 7748 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 7749 if (ctx == NULL) { 7750 cb_fn(cb_arg, -ENOMEM); 7751 return; 7752 } 7753 7754 ctx->bdev = bdev; 7755 ctx->status = 0; 7756 ctx->cb_fn = cb_fn; 7757 ctx->cb_arg = cb_arg; 7758 7759 spdk_spin_lock(&bdev->internal.spinlock); 7760 if (bdev->internal.histogram_in_progress) { 7761 spdk_spin_unlock(&bdev->internal.spinlock); 7762 free(ctx); 7763 cb_fn(cb_arg, -EAGAIN); 7764 return; 7765 } 7766 7767 bdev->internal.histogram_in_progress = true; 7768 spdk_spin_unlock(&bdev->internal.spinlock); 7769 7770 bdev->internal.histogram_enabled = enable; 7771 7772 if (enable) { 7773 /* Allocate histogram for each channel */ 7774 spdk_bdev_for_each_channel(bdev, bdev_histogram_enable_channel, ctx, 7775 bdev_histogram_enable_channel_cb); 7776 } else { 7777 spdk_bdev_for_each_channel(bdev, bdev_histogram_disable_channel, ctx, 7778 bdev_histogram_disable_channel_cb); 7779 } 7780 } 7781 7782 struct spdk_bdev_histogram_data_ctx { 7783 spdk_bdev_histogram_data_cb cb_fn; 7784 void *cb_arg; 7785 struct spdk_bdev *bdev; 7786 /** merged histogram data from all channels */ 7787 struct spdk_histogram_data *histogram; 7788 }; 7789 7790 static void 7791 bdev_histogram_get_channel_cb(struct spdk_bdev *bdev, void *_ctx, int status) 7792 { 7793 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 7794 7795 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 7796 free(ctx); 7797 } 7798 7799 static void 7800 bdev_histogram_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 7801 struct spdk_io_channel *_ch, void *_ctx) 7802 { 7803 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7804 struct spdk_bdev_histogram_data_ctx *ctx = _ctx; 7805 int status = 0; 7806 7807 if (ch->histogram == NULL) { 7808 status = -EFAULT; 7809 } else { 7810 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 7811 } 7812 7813 spdk_bdev_for_each_channel_continue(i, status); 7814 } 7815 7816 void 7817 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 7818 spdk_bdev_histogram_data_cb cb_fn, 7819 void *cb_arg) 7820 { 
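/* The caller-supplied histogram accumulates a merge of every channel's
 * per-channel data; cb_fn is invoked with -EFAULT if a traversed channel
 * has no histogram allocated (see bdev_histogram_get_channel below).
 */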
7821 struct spdk_bdev_histogram_data_ctx *ctx; 7822 7823 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 7824 if (ctx == NULL) { 7825 cb_fn(cb_arg, -ENOMEM, NULL); 7826 return; 7827 } 7828 7829 ctx->bdev = bdev; 7830 ctx->cb_fn = cb_fn; 7831 ctx->cb_arg = cb_arg; 7832 7833 ctx->histogram = histogram; 7834 7835 spdk_bdev_for_each_channel(bdev, bdev_histogram_get_channel, ctx, 7836 bdev_histogram_get_channel_cb); 7837 } 7838 7839 void 7840 spdk_bdev_channel_get_histogram(struct spdk_io_channel *ch, spdk_bdev_histogram_data_cb cb_fn, 7841 void *cb_arg) 7842 { 7843 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(ch); 7844 int status = 0; 7845 7846 assert(cb_fn != NULL); 7847 7848 if (bdev_ch->histogram == NULL) { 7849 status = -EFAULT; 7850 } 7851 cb_fn(cb_arg, status, bdev_ch->histogram); 7852 } 7853 7854 size_t 7855 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 7856 size_t max_events) 7857 { 7858 struct media_event_entry *entry; 7859 size_t num_events = 0; 7860 7861 for (; num_events < max_events; ++num_events) { 7862 entry = TAILQ_FIRST(&desc->pending_media_events); 7863 if (entry == NULL) { 7864 break; 7865 } 7866 7867 events[num_events] = entry->event; 7868 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 7869 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 7870 } 7871 7872 return num_events; 7873 } 7874 7875 int 7876 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 7877 size_t num_events) 7878 { 7879 struct spdk_bdev_desc *desc; 7880 struct media_event_entry *entry; 7881 size_t event_id; 7882 int rc = 0; 7883 7884 assert(bdev->media_events); 7885 7886 spdk_spin_lock(&bdev->internal.spinlock); 7887 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 7888 if (desc->write) { 7889 break; 7890 } 7891 } 7892 7893 if (desc == NULL || desc->media_events_buffer == NULL) { 7894 rc = -ENODEV; 7895 goto out; 7896 } 7897 7898 for (event_id = 0; event_id < num_events; ++event_id) { 7899 entry = TAILQ_FIRST(&desc->free_media_events); 7900 if (entry == NULL) { 7901 break; 7902 } 7903 7904 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 7905 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 7906 entry->event = events[event_id]; 7907 } 7908 7909 rc = event_id; 7910 out: 7911 spdk_spin_unlock(&bdev->internal.spinlock); 7912 return rc; 7913 } 7914 7915 void 7916 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 7917 { 7918 struct spdk_bdev_desc *desc; 7919 7920 spdk_spin_lock(&bdev->internal.spinlock); 7921 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 7922 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 7923 desc->callback.event_fn(SPDK_BDEV_EVENT_MEDIA_MANAGEMENT, bdev, 7924 desc->callback.ctx); 7925 } 7926 } 7927 spdk_spin_unlock(&bdev->internal.spinlock); 7928 } 7929 7930 struct locked_lba_range_ctx { 7931 struct lba_range range; 7932 struct spdk_bdev *bdev; 7933 struct lba_range *current_range; 7934 struct lba_range *owner_range; 7935 struct spdk_poller *poller; 7936 lock_range_cb cb_fn; 7937 void *cb_arg; 7938 }; 7939 7940 static void 7941 bdev_lock_error_cleanup_cb(struct spdk_bdev *bdev, void *_ctx, int status) 7942 { 7943 struct locked_lba_range_ctx *ctx = _ctx; 7944 7945 ctx->cb_fn(ctx->cb_arg, -ENOMEM); 7946 free(ctx); 7947 } 7948 7949 static void bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, 7950 struct spdk_bdev *bdev, struct spdk_io_channel *ch, void *_ctx); 7951 7952 static void 7953 
bdev_lock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 7954 { 7955 struct locked_lba_range_ctx *ctx = _ctx; 7956 7957 if (status == -ENOMEM) { 7958 /* One of the channels could not allocate a range object. 7959 * So we have to go back and clean up any ranges that were 7960 * allocated successfully before we return error status to 7961 * the caller. We can reuse the unlock function to do that 7962 * clean up. 7963 */ 7964 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 7965 bdev_lock_error_cleanup_cb); 7966 return; 7967 } 7968 7969 /* All channels have locked this range and no I/O overlapping the range 7970 * are outstanding! Set the owner_ch for the range object for the 7971 * locking channel, so that this channel will know that it is allowed 7972 * to write to this range. 7973 */ 7974 ctx->owner_range->owner_ch = ctx->range.owner_ch; 7975 ctx->cb_fn(ctx->cb_arg, status); 7976 7977 /* Don't free the ctx here. Its range is in the bdev's global list of 7978 * locked ranges still, and will be removed and freed when this range 7979 * is later unlocked. 7980 */ 7981 } 7982 7983 static int 7984 bdev_lock_lba_range_check_io(void *_i) 7985 { 7986 struct spdk_bdev_channel_iter *i = _i; 7987 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i->i); 7988 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 7989 struct locked_lba_range_ctx *ctx = i->ctx; 7990 struct lba_range *range = ctx->current_range; 7991 struct spdk_bdev_io *bdev_io; 7992 7993 spdk_poller_unregister(&ctx->poller); 7994 7995 /* The range is now in the locked_ranges, so no new IO can be submitted to this 7996 * range. But we need to wait until any outstanding IO overlapping with this range 7997 * are completed. 7998 */ 7999 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 8000 if (bdev_io_range_is_locked(bdev_io, range)) { 8001 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 8002 return SPDK_POLLER_BUSY; 8003 } 8004 } 8005 8006 spdk_bdev_for_each_channel_continue(i, 0); 8007 return SPDK_POLLER_BUSY; 8008 } 8009 8010 static void 8011 bdev_lock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8012 struct spdk_io_channel *_ch, void *_ctx) 8013 { 8014 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8015 struct locked_lba_range_ctx *ctx = _ctx; 8016 struct lba_range *range; 8017 8018 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 8019 if (range->length == ctx->range.length && 8020 range->offset == ctx->range.offset && 8021 range->locked_ctx == ctx->range.locked_ctx) { 8022 /* This range already exists on this channel, so don't add 8023 * it again. This can happen when a new channel is created 8024 * while the for_each_channel operation is in progress. 8025 * Do not check for outstanding I/O in that case, since the 8026 * range was locked before any I/O could be submitted to the 8027 * new channel. 8028 */ 8029 spdk_bdev_for_each_channel_continue(i, 0); 8030 return; 8031 } 8032 } 8033 8034 range = calloc(1, sizeof(*range)); 8035 if (range == NULL) { 8036 spdk_bdev_for_each_channel_continue(i, -ENOMEM); 8037 return; 8038 } 8039 8040 range->length = ctx->range.length; 8041 range->offset = ctx->range.offset; 8042 range->locked_ctx = ctx->range.locked_ctx; 8043 ctx->current_range = range; 8044 if (ctx->range.owner_ch == ch) { 8045 /* This is the range object for the channel that will hold 8046 * the lock. 
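* (Copies installed on non-owner channels leave owner_ch unset.)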
Store it in the ctx object so that we can easily 8047 * set its owner_ch after the lock is finally acquired. 8048 */ 8049 ctx->owner_range = range; 8050 } 8051 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 8052 bdev_lock_lba_range_check_io(i); 8053 } 8054 8055 static void 8056 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 8057 { 8058 assert(spdk_get_thread() == spdk_io_channel_get_thread(ctx->range.owner_ch->channel)); 8059 8060 /* We will add a copy of this range to each channel now. */ 8061 spdk_bdev_for_each_channel(bdev, bdev_lock_lba_range_get_channel, ctx, 8062 bdev_lock_lba_range_cb); 8063 } 8064 8065 static bool 8066 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 8067 { 8068 struct lba_range *r; 8069 8070 TAILQ_FOREACH(r, tailq, tailq) { 8071 if (bdev_lba_range_overlapped(range, r)) { 8072 return true; 8073 } 8074 } 8075 return false; 8076 } 8077 8078 static int 8079 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 8080 uint64_t offset, uint64_t length, 8081 lock_range_cb cb_fn, void *cb_arg) 8082 { 8083 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8084 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8085 struct locked_lba_range_ctx *ctx; 8086 8087 if (cb_arg == NULL) { 8088 SPDK_ERRLOG("cb_arg must not be NULL\n"); 8089 return -EINVAL; 8090 } 8091 8092 ctx = calloc(1, sizeof(*ctx)); 8093 if (ctx == NULL) { 8094 return -ENOMEM; 8095 } 8096 8097 ctx->range.offset = offset; 8098 ctx->range.length = length; 8099 ctx->range.owner_ch = ch; 8100 ctx->range.locked_ctx = cb_arg; 8101 ctx->bdev = bdev; 8102 ctx->cb_fn = cb_fn; 8103 ctx->cb_arg = cb_arg; 8104 8105 spdk_spin_lock(&bdev->internal.spinlock); 8106 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 8107 /* There is an active lock overlapping with this range. 8108 * Put it on the pending list until this range no 8109 * longer overlaps with another. 8110 */ 8111 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 8112 } else { 8113 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 8114 bdev_lock_lba_range_ctx(bdev, ctx); 8115 } 8116 spdk_spin_unlock(&bdev->internal.spinlock); 8117 return 0; 8118 } 8119 8120 static void 8121 bdev_lock_lba_range_ctx_msg(void *_ctx) 8122 { 8123 struct locked_lba_range_ctx *ctx = _ctx; 8124 8125 bdev_lock_lba_range_ctx(ctx->bdev, ctx); 8126 } 8127 8128 static void 8129 bdev_unlock_lba_range_cb(struct spdk_bdev *bdev, void *_ctx, int status) 8130 { 8131 struct locked_lba_range_ctx *ctx = _ctx; 8132 struct locked_lba_range_ctx *pending_ctx; 8133 struct lba_range *range, *tmp; 8134 8135 spdk_spin_lock(&bdev->internal.spinlock); 8136 /* Check if there are any pending locked ranges that overlap with this range 8137 * that was just unlocked. If there are, check that it doesn't overlap with any 8138 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 8139 * the lock process. 
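* A pending range that can now proceed is moved onto locked_ranges here, and its per-channel locking is kicked off on the owning channel's thread via bdev_lock_lba_range_ctx_msg().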
8140 */ 8141 TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) { 8142 if (bdev_lba_range_overlapped(range, &ctx->range) && 8143 !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) { 8144 TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq); 8145 pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 8146 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq); 8147 spdk_thread_send_msg(spdk_io_channel_get_thread(pending_ctx->range.owner_ch->channel), 8148 bdev_lock_lba_range_ctx_msg, pending_ctx); 8149 } 8150 } 8151 spdk_spin_unlock(&bdev->internal.spinlock); 8152 8153 ctx->cb_fn(ctx->cb_arg, status); 8154 free(ctx); 8155 } 8156 8157 static void 8158 bdev_unlock_lba_range_get_channel(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8159 struct spdk_io_channel *_ch, void *_ctx) 8160 { 8161 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8162 struct locked_lba_range_ctx *ctx = _ctx; 8163 TAILQ_HEAD(, spdk_bdev_io) io_locked; 8164 struct spdk_bdev_io *bdev_io; 8165 struct lba_range *range; 8166 8167 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 8168 if (ctx->range.offset == range->offset && 8169 ctx->range.length == range->length && 8170 ctx->range.locked_ctx == range->locked_ctx) { 8171 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 8172 free(range); 8173 break; 8174 } 8175 } 8176 8177 /* Note: we should almost always be able to assert that the range specified 8178 * was found. But there are some very rare corner cases where a new channel 8179 * gets created simultaneously with a range unlock, where this function 8180 * would execute on that new channel and wouldn't have the range. 8181 * We also use this to clean up range allocations when a later allocation 8182 * fails in the locking path. 8183 * So we can't actually assert() here. 8184 */ 8185 8186 /* Swap the locked IO into a temporary list, and then try to submit them again. 8187 * We could hyper-optimize this to only resubmit locked I/O that overlap 8188 * with the range that was just unlocked, but this isn't a performance path so 8189 * we go for simplicity here. 8190 */ 8191 TAILQ_INIT(&io_locked); 8192 TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link); 8193 while (!TAILQ_EMPTY(&io_locked)) { 8194 bdev_io = TAILQ_FIRST(&io_locked); 8195 TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link); 8196 bdev_io_submit(bdev_io); 8197 } 8198 8199 spdk_bdev_for_each_channel_continue(i, 0); 8200 } 8201 8202 static int 8203 bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 8204 uint64_t offset, uint64_t length, 8205 lock_range_cb cb_fn, void *cb_arg) 8206 { 8207 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8208 struct spdk_bdev_channel *ch = __io_ch_to_bdev_ch(_ch); 8209 struct locked_lba_range_ctx *ctx; 8210 struct lba_range *range; 8211 bool range_found = false; 8212 8213 /* Let's make sure the specified channel actually has a lock on 8214 * the specified range. Note that the range must match exactly. 8215 */ 8216 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 8217 if (range->offset == offset && range->length == length && 8218 range->owner_ch == ch && range->locked_ctx == cb_arg) { 8219 range_found = true; 8220 break; 8221 } 8222 } 8223 8224 if (!range_found) { 8225 return -EINVAL; 8226 } 8227 8228 spdk_spin_lock(&bdev->internal.spinlock); 8229 /* We confirmed that this channel has locked the specified range. 
To 8230 * start the unlock the process, we find the range in the bdev's locked_ranges 8231 * and remove it. This ensures new channels don't inherit the locked range. 8232 * Then we will send a message to each channel (including the one specified 8233 * here) to remove the range from its per-channel list. 8234 */ 8235 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 8236 if (range->offset == offset && range->length == length && 8237 range->locked_ctx == cb_arg) { 8238 break; 8239 } 8240 } 8241 if (range == NULL) { 8242 assert(false); 8243 spdk_spin_unlock(&bdev->internal.spinlock); 8244 return -EINVAL; 8245 } 8246 TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq); 8247 ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range); 8248 spdk_spin_unlock(&bdev->internal.spinlock); 8249 8250 ctx->cb_fn = cb_fn; 8251 ctx->cb_arg = cb_arg; 8252 8253 spdk_bdev_for_each_channel(bdev, bdev_unlock_lba_range_get_channel, ctx, 8254 bdev_unlock_lba_range_cb); 8255 return 0; 8256 } 8257 8258 int 8259 spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains, 8260 int array_size) 8261 { 8262 if (!bdev) { 8263 return -EINVAL; 8264 } 8265 8266 if (bdev->fn_table->get_memory_domains) { 8267 return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size); 8268 } 8269 8270 return 0; 8271 } 8272 8273 struct spdk_bdev_for_each_io_ctx { 8274 void *ctx; 8275 spdk_bdev_io_fn fn; 8276 spdk_bdev_for_each_io_cb cb; 8277 }; 8278 8279 static void 8280 bdev_channel_for_each_io(struct spdk_bdev_channel_iter *i, struct spdk_bdev *bdev, 8281 struct spdk_io_channel *io_ch, void *_ctx) 8282 { 8283 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 8284 struct spdk_bdev_channel *bdev_ch = __io_ch_to_bdev_ch(io_ch); 8285 struct spdk_bdev_io *bdev_io; 8286 int rc = 0; 8287 8288 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 8289 rc = ctx->fn(ctx->ctx, bdev_io); 8290 if (rc != 0) { 8291 break; 8292 } 8293 } 8294 8295 spdk_bdev_for_each_channel_continue(i, rc); 8296 } 8297 8298 static void 8299 bdev_for_each_io_done(struct spdk_bdev *bdev, void *_ctx, int status) 8300 { 8301 struct spdk_bdev_for_each_io_ctx *ctx = _ctx; 8302 8303 ctx->cb(ctx->ctx, status); 8304 8305 free(ctx); 8306 } 8307 8308 void 8309 spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn, 8310 spdk_bdev_for_each_io_cb cb) 8311 { 8312 struct spdk_bdev_for_each_io_ctx *ctx; 8313 8314 assert(fn != NULL && cb != NULL); 8315 8316 ctx = calloc(1, sizeof(*ctx)); 8317 if (ctx == NULL) { 8318 SPDK_ERRLOG("Failed to allocate context.\n"); 8319 cb(_ctx, -ENOMEM); 8320 return; 8321 } 8322 8323 ctx->ctx = _ctx; 8324 ctx->fn = fn; 8325 ctx->cb = cb; 8326 8327 spdk_bdev_for_each_channel(bdev, bdev_channel_for_each_io, ctx, 8328 bdev_for_each_io_done); 8329 } 8330 8331 void 8332 spdk_bdev_for_each_channel_continue(struct spdk_bdev_channel_iter *iter, int status) 8333 { 8334 spdk_for_each_channel_continue(iter->i, status); 8335 } 8336 8337 static struct spdk_bdev * 8338 io_channel_iter_get_bdev(struct spdk_io_channel_iter *i) 8339 { 8340 void *io_device = spdk_io_channel_iter_get_io_device(i); 8341 8342 return __bdev_from_io_dev(io_device); 8343 } 8344 8345 static void 8346 bdev_each_channel_msg(struct spdk_io_channel_iter *i) 8347 { 8348 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 8349 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 8350 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 8351 8352 iter->i = i; 8353 
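/* The per-channel callback must call spdk_bdev_for_each_channel_continue(),
 * which resumes the walk through the iterator saved in iter->i above.
 */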
iter->fn(iter, bdev, ch, iter->ctx); 8354 } 8355 8356 static void 8357 bdev_each_channel_cpl(struct spdk_io_channel_iter *i, int status) 8358 { 8359 struct spdk_bdev_channel_iter *iter = spdk_io_channel_iter_get_ctx(i); 8360 struct spdk_bdev *bdev = io_channel_iter_get_bdev(i); 8361 8362 iter->i = i; 8363 iter->cpl(bdev, iter->ctx, status); 8364 8365 free(iter); 8366 } 8367 8368 void 8369 spdk_bdev_for_each_channel(struct spdk_bdev *bdev, spdk_bdev_for_each_channel_msg fn, 8370 void *ctx, spdk_bdev_for_each_channel_done cpl) 8371 { 8372 struct spdk_bdev_channel_iter *iter; 8373 8374 assert(bdev != NULL && fn != NULL && ctx != NULL); 8375 8376 iter = calloc(1, sizeof(struct spdk_bdev_channel_iter)); 8377 if (iter == NULL) { 8378 SPDK_ERRLOG("Unable to allocate iterator\n"); 8379 assert(false); 8380 return; 8381 } 8382 8383 iter->fn = fn; 8384 iter->cpl = cpl; 8385 iter->ctx = ctx; 8386 8387 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_each_channel_msg, 8388 iter, bdev_each_channel_cpl); 8389 } 8390 8391 int 8392 spdk_bdev_copy_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 8393 uint64_t dst_offset_blocks, uint64_t src_offset_blocks, uint64_t num_blocks, 8394 spdk_bdev_io_completion_cb cb, void *cb_arg) 8395 { 8396 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 8397 struct spdk_bdev_io *bdev_io; 8398 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 8399 8400 if (!desc->write) { 8401 return -EBADF; 8402 } 8403 8404 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COPY))) { 8405 SPDK_DEBUGLOG(bdev, "Copy IO type is not supported\n"); 8406 return -ENOTSUP; 8407 } 8408 8409 if (num_blocks == 0) { 8410 SPDK_ERRLOG("Can't copy 0 blocks\n"); 8411 return -EINVAL; 8412 } 8413 8414 if (!bdev_io_valid_blocks(bdev, dst_offset_blocks, num_blocks) || 8415 !bdev_io_valid_blocks(bdev, src_offset_blocks, num_blocks)) { 8416 SPDK_DEBUGLOG(bdev, 8417 "Invalid offset or number of blocks: dst %lu, src %lu, count %lu\n", 8418 dst_offset_blocks, src_offset_blocks, num_blocks); 8419 return -EINVAL; 8420 } 8421 8422 bdev_io = bdev_channel_get_io(channel); 8423 if (!bdev_io) { 8424 return -ENOMEM; 8425 } 8426 8427 bdev_io->internal.ch = channel; 8428 bdev_io->internal.desc = desc; 8429 bdev_io->type = SPDK_BDEV_IO_TYPE_COPY; 8430 8431 bdev_io->u.bdev.offset_blocks = dst_offset_blocks; 8432 bdev_io->u.bdev.copy.src_offset_blocks = src_offset_blocks; 8433 bdev_io->u.bdev.num_blocks = num_blocks; 8434 bdev_io->u.bdev.ext_opts = NULL; 8435 bdev_io_init(bdev_io, bdev, cb_arg, cb); 8436 8437 bdev_io_submit(bdev_io); 8438 return 0; 8439 } 8440 8441 SPDK_LOG_REGISTER_COMPONENT(bdev) 8442 8443 SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV) 8444 { 8445 struct spdk_trace_tpoint_opts opts[] = { 8446 { 8447 "BDEV_IO_START", TRACE_BDEV_IO_START, 8448 OWNER_BDEV, OBJECT_BDEV_IO, 1, 8449 { 8450 { "type", SPDK_TRACE_ARG_TYPE_INT, 8 }, 8451 { "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }, 8452 { "offset", SPDK_TRACE_ARG_TYPE_INT, 8 }, 8453 { "len", SPDK_TRACE_ARG_TYPE_INT, 8 }, 8454 { "name", SPDK_TRACE_ARG_TYPE_STR, 40} 8455 } 8456 }, 8457 { 8458 "BDEV_IO_DONE", TRACE_BDEV_IO_DONE, 8459 OWNER_BDEV, OBJECT_BDEV_IO, 0, 8460 {{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }} 8461 }, 8462 { 8463 "BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE, 8464 OWNER_BDEV, OBJECT_NONE, 1, 8465 { 8466 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 8467 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 8468 } 8469 }, 8470 { 8471 "BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY, 8472 OWNER_BDEV, OBJECT_NONE, 0, 8473 { 
8474 { "name", SPDK_TRACE_ARG_TYPE_STR, 40 }, 8475 { "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8} 8476 } 8477 }, 8478 }; 8479 8480 8481 spdk_trace_register_owner(OWNER_BDEV, 'b'); 8482 spdk_trace_register_object(OBJECT_BDEV_IO, 'i'); 8483 spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts)); 8484 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0); 8485 spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0); 8486 } 8487
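/*
 * Illustrative usage sketch (not part of the upstream sources and excluded
 * from the build with #if 0): shows how an application might drive two of
 * the public APIs implemented above, spdk_for_each_bdev_leaf() and
 * spdk_bdev_set_qos_rate_limits().  The callback names and the example()
 * helper are hypothetical.
 */
#if 0
#include "spdk/stdinc.h"
#include "spdk/bdev.h"
#include "spdk/log.h"

/* Invoked once per leaf bdev; returning non-zero stops the iteration. */
static int
count_leaf_bdev(void *ctx, struct spdk_bdev *bdev)
{
	int *count = ctx;

	(*count)++;
	return 0;
}

/* Completion for the QoS update; status is 0 on success. */
static void
qos_set_done(void *cb_arg, int status)
{
	if (status != 0) {
		SPDK_ERRLOG("Setting QoS rate limits failed: %d\n", status);
	}
}

static void
example(struct spdk_bdev *bdev)
{
	uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];
	int leaf_count = 0;
	int i;

	/* Count the leaf bdevs (those without a virtual bdev on top). */
	spdk_for_each_bdev_leaf(&leaf_count, count_leaf_bdev);
	SPDK_NOTICELOG("%d leaf bdev(s) found\n", leaf_count);

	/* Leave every limit type untouched (UINT64_MAX == not defined) except
	 * a 10000 read/write IOPS cap.  IOPS limits must be a multiple of
	 * SPDK_BDEV_QOS_MIN_IOS_PER_SEC, and byte limits are expressed in MB/s.
	 */
	for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) {
		limits[i] = UINT64_MAX;
	}
	limits[SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT] = 10000;

	spdk_bdev_set_qos_rate_limits(bdev, limits, qos_set_done, NULL);
}
#endif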