/* SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (c) Intel Corporation. All rights reserved.
 *   Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved.
 *   Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 */

#include "spdk/stdinc.h"

#include "spdk/bdev.h"

#include "spdk/config.h"
#include "spdk/env.h"
#include "spdk/thread.h"
#include "spdk/likely.h"
#include "spdk/queue.h"
#include "spdk/nvme_spec.h"
#include "spdk/scsi_spec.h"
#include "spdk/notify.h"
#include "spdk/util.h"
#include "spdk/trace.h"
#include "spdk/dma.h"

#include "spdk/bdev_module.h"
#include "spdk/log.h"
#include "spdk/string.h"

#include "bdev_internal.h"
#include "spdk_internal/trace_defs.h"

#ifdef SPDK_CONFIG_VTUNE
#include "ittnotify.h"
#include "ittnotify_types.h"
int __itt_init_ittlib(const char *, __itt_group_id);
#endif

#define SPDK_BDEV_IO_POOL_SIZE			(64 * 1024 - 1)
#define SPDK_BDEV_IO_CACHE_SIZE			256
#define SPDK_BDEV_AUTO_EXAMINE			true
#define BUF_SMALL_POOL_SIZE			8191
#define BUF_LARGE_POOL_SIZE			1023
#define NOMEM_THRESHOLD_COUNT			8

#define SPDK_BDEV_QOS_TIMESLICE_IN_USEC		1000
#define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE	1
#define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE	512
#define SPDK_BDEV_QOS_MIN_IOS_PER_SEC		1000
#define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC		(1024 * 1024)
#define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED		UINT64_MAX
#define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC	1000

#define SPDK_BDEV_POOL_ALIGNMENT 512

/* The maximum number of child requests generated at a time when splitting
 * an UNMAP or WRITE ZEROES command.
 */
#define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8)
#define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000

static const char *qos_rpc_type[] = {"rw_ios_per_sec",
				     "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec"
				    };

TAILQ_HEAD(spdk_bdev_list, spdk_bdev);

RB_HEAD(bdev_name_tree, spdk_bdev_name);

static int
bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2)
{
	return strcmp(name1->name, name2->name);
}

RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp);

struct spdk_bdev_mgr {
	struct spdk_mempool *bdev_io_pool;

	struct spdk_mempool *buf_small_pool;
	struct spdk_mempool *buf_large_pool;

	void *zero_buffer;

	TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules;

	struct spdk_bdev_list bdevs;
	struct bdev_name_tree bdev_names;

	bool init_complete;
	bool module_init_complete;

	pthread_mutex_t mutex;

#ifdef SPDK_CONFIG_VTUNE
	__itt_domain	*domain;
#endif
};

static struct spdk_bdev_mgr g_bdev_mgr = {
	.bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules),
	.bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs),
	.bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names),
	.init_complete = false,
	.module_init_complete = false,
	.mutex = PTHREAD_MUTEX_INITIALIZER,
};

typedef void (*lock_range_cb)(void *ctx, int status);

typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc);

struct lba_range {
	uint64_t			offset;
	uint64_t			length;
	void				*locked_ctx;
	struct spdk_bdev_channel	*owner_ch;
	TAILQ_ENTRY(lba_range)		tailq;
};
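
/*
 * Illustrative sketch (kept out of the build): how an application might
 * override the defaults above before bringing the bdev layer up. Only the
 * public APIs defined later in this file are used; the callback name, the
 * helper name and the chosen pool size are arbitrary examples.
 */
#if 0
static void
example_bdev_init_done(void *cb_arg, int rc)
{
	/* rc == 0 once every registered module finished init and examine. */
}

static void
example_configure_and_init_bdev_layer(void)
{
	struct spdk_bdev_opts opts = {};

	/* Always pass sizeof(opts) so the layer only copies fields it knows about. */
	spdk_bdev_get_opts(&opts, sizeof(opts));

	/* Grow the shared bdev_io pool beyond the SPDK_BDEV_IO_POOL_SIZE default. */
	opts.bdev_io_pool_size = 128 * 1024 - 1;

	if (spdk_bdev_set_opts(&opts) != 0) {
		SPDK_ERRLOG("invalid bdev options\n");
		return;
	}

	spdk_bdev_initialize(example_bdev_init_done, NULL);
}
#endif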

static struct spdk_bdev_opts	g_bdev_opts = {
	.bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE,
	.bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE,
	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
	.small_buf_pool_size = BUF_SMALL_POOL_SIZE,
	.large_buf_pool_size = BUF_LARGE_POOL_SIZE,
};

static spdk_bdev_init_cb	g_init_cb_fn = NULL;
static void			*g_init_cb_arg = NULL;

static spdk_bdev_fini_cb	g_fini_cb_fn = NULL;
static void			*g_fini_cb_arg = NULL;
static struct spdk_thread	*g_fini_thread = NULL;

struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
	 *  For remaining bytes, allowed to run negative if an I/O is submitted when
	 *  some bytes are remaining, but the I/O is bigger than that amount. The
	 *  excess will be deducted from the next timeslice.
	 */
	int64_t remaining_this_timeslice;

	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO. */
	bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to update the quota for the submitted IO. */
	void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

struct spdk_bdev_qos {
	/** Rate limits, one for each rate limit type. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Queue of I/O waiting to be issued. */
	bdev_io_tailq_t queued;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	bdev_io_stailq_t need_buf_small;
	bdev_io_stailq_t need_buf_large;

	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache.  Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t	per_thread_cache_count;
	uint32_t	bdev_io_cache_size;

	TAILQ_HEAD(, spdk_bdev_shared_resource)	shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry)	io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * will queue their I/O awaiting retry here, which makes it possible to retry sending
 * I/O to one bdev after I/O from another bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t		io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t		nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
223 */ 224 uint64_t nomem_threshold; 225 226 /* I/O channel allocated by a bdev module */ 227 struct spdk_io_channel *shared_ch; 228 229 /* Refcount of bdev channels using this resource */ 230 uint32_t ref; 231 232 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 233 }; 234 235 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 236 #define BDEV_CH_QOS_ENABLED (1 << 1) 237 238 struct spdk_bdev_channel { 239 struct spdk_bdev *bdev; 240 241 /* The channel for the underlying device */ 242 struct spdk_io_channel *channel; 243 244 /* Per io_device per thread data */ 245 struct spdk_bdev_shared_resource *shared_resource; 246 247 struct spdk_bdev_io_stat stat; 248 249 /* 250 * Count of I/O submitted to the underlying dev module through this channel 251 * and waiting for completion. 252 */ 253 uint64_t io_outstanding; 254 255 /* 256 * List of all submitted I/Os including I/O that are generated via splitting. 257 */ 258 bdev_io_tailq_t io_submitted; 259 260 /* 261 * List of spdk_bdev_io that are currently queued because they write to a locked 262 * LBA range. 263 */ 264 bdev_io_tailq_t io_locked; 265 266 uint32_t flags; 267 268 struct spdk_histogram_data *histogram; 269 270 #ifdef SPDK_CONFIG_VTUNE 271 uint64_t start_tsc; 272 uint64_t interval_tsc; 273 __itt_string_handle *handle; 274 struct spdk_bdev_io_stat prev_stat; 275 #endif 276 277 bdev_io_tailq_t queued_resets; 278 279 lba_range_tailq_t locked_ranges; 280 }; 281 282 struct media_event_entry { 283 struct spdk_bdev_media_event event; 284 TAILQ_ENTRY(media_event_entry) tailq; 285 }; 286 287 #define MEDIA_EVENT_POOL_SIZE 64 288 289 struct spdk_bdev_desc { 290 struct spdk_bdev *bdev; 291 struct spdk_thread *thread; 292 struct { 293 spdk_bdev_event_cb_t event_fn; 294 void *ctx; 295 } callback; 296 bool closed; 297 bool write; 298 bool memory_domains_supported; 299 pthread_mutex_t mutex; 300 uint32_t refs; 301 TAILQ_HEAD(, media_event_entry) pending_media_events; 302 TAILQ_HEAD(, media_event_entry) free_media_events; 303 struct media_event_entry *media_events_buffer; 304 TAILQ_ENTRY(spdk_bdev_desc) link; 305 306 uint64_t timeout_in_sec; 307 spdk_bdev_io_timeout_cb cb_fn; 308 void *cb_arg; 309 struct spdk_poller *io_timeout_poller; 310 }; 311 312 struct spdk_bdev_iostat_ctx { 313 struct spdk_bdev_io_stat *stat; 314 spdk_bdev_get_device_stat_cb cb; 315 void *cb_arg; 316 }; 317 318 struct set_qos_limit_ctx { 319 void (*cb_fn)(void *cb_arg, int status); 320 void *cb_arg; 321 struct spdk_bdev *bdev; 322 }; 323 324 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 325 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 326 327 static inline void bdev_io_complete(void *ctx); 328 329 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 330 static void bdev_write_zero_buffer_next(void *_bdev_io); 331 332 static void bdev_enable_qos_msg(struct spdk_io_channel_iter *i); 333 static void bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status); 334 335 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 336 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 337 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 338 struct spdk_bdev_ext_io_opts *opts, bool copy_opts); 339 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 340 struct iovec *iov, int iovcnt, void *md_buf, 341 uint64_t offset_blocks, uint64_t num_blocks, 342 spdk_bdev_io_completion_cb cb, void *cb_arg, 343 
struct spdk_bdev_ext_io_opts *opts, bool copy_opts); 344 345 static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 346 uint64_t offset, uint64_t length, 347 lock_range_cb cb_fn, void *cb_arg); 348 349 static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 350 uint64_t offset, uint64_t length, 351 lock_range_cb cb_fn, void *cb_arg); 352 353 static inline void bdev_io_complete(void *ctx); 354 355 static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort); 356 static bool bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort); 357 358 void 359 spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size) 360 { 361 if (!opts) { 362 SPDK_ERRLOG("opts should not be NULL\n"); 363 return; 364 } 365 366 if (!opts_size) { 367 SPDK_ERRLOG("opts_size should not be zero value\n"); 368 return; 369 } 370 371 opts->opts_size = opts_size; 372 373 #define SET_FIELD(field) \ 374 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \ 375 opts->field = g_bdev_opts.field; \ 376 } \ 377 378 SET_FIELD(bdev_io_pool_size); 379 SET_FIELD(bdev_io_cache_size); 380 SET_FIELD(bdev_auto_examine); 381 SET_FIELD(small_buf_pool_size); 382 SET_FIELD(large_buf_pool_size); 383 384 /* Do not remove this statement, you should always update this statement when you adding a new field, 385 * and do not forget to add the SET_FIELD statement for your added field. */ 386 SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size"); 387 388 #undef SET_FIELD 389 } 390 391 int 392 spdk_bdev_set_opts(struct spdk_bdev_opts *opts) 393 { 394 uint32_t min_pool_size; 395 396 if (!opts) { 397 SPDK_ERRLOG("opts cannot be NULL\n"); 398 return -1; 399 } 400 401 if (!opts->opts_size) { 402 SPDK_ERRLOG("opts_size inside opts cannot be zero value\n"); 403 return -1; 404 } 405 406 /* 407 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem 408 * initialization. A second mgmt_ch will be created on the same thread when the application starts 409 * but before the deferred put_io_channel event is executed for the first mgmt_ch. 
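
 * To make the check below concrete (the thread count is only an example): with
 * the default bdev_io_cache_size of 256 and an application running 3 SPDK
 * threads, min_pool_size = 256 * (3 + 1) = 1024, so the default
 * bdev_io_pool_size of 65535 passes comfortably, while any value below 1024
 * would be rejected for that configuration.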
410 */ 411 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 412 if (opts->bdev_io_pool_size < min_pool_size) { 413 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 414 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 415 spdk_thread_get_count()); 416 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 417 return -1; 418 } 419 420 if (opts->small_buf_pool_size < BUF_SMALL_POOL_SIZE) { 421 SPDK_ERRLOG("small_buf_pool_size must be at least %" PRIu32 "\n", BUF_SMALL_POOL_SIZE); 422 return -1; 423 } 424 425 if (opts->large_buf_pool_size < BUF_LARGE_POOL_SIZE) { 426 SPDK_ERRLOG("large_buf_pool_size must be at least %" PRIu32 "\n", BUF_LARGE_POOL_SIZE); 427 return -1; 428 } 429 430 #define SET_FIELD(field) \ 431 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 432 g_bdev_opts.field = opts->field; \ 433 } \ 434 435 SET_FIELD(bdev_io_pool_size); 436 SET_FIELD(bdev_io_cache_size); 437 SET_FIELD(bdev_auto_examine); 438 SET_FIELD(small_buf_pool_size); 439 SET_FIELD(large_buf_pool_size); 440 441 g_bdev_opts.opts_size = opts->opts_size; 442 443 #undef SET_FIELD 444 445 return 0; 446 } 447 448 static struct spdk_bdev * 449 bdev_get_by_name(const char *bdev_name) 450 { 451 struct spdk_bdev_name find; 452 struct spdk_bdev_name *res; 453 454 find.name = (char *)bdev_name; 455 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 456 if (res != NULL) { 457 return res->bdev; 458 } 459 460 return NULL; 461 } 462 463 struct spdk_bdev * 464 spdk_bdev_get_by_name(const char *bdev_name) 465 { 466 struct spdk_bdev *bdev; 467 468 pthread_mutex_lock(&g_bdev_mgr.mutex); 469 bdev = bdev_get_by_name(bdev_name); 470 pthread_mutex_unlock(&g_bdev_mgr.mutex); 471 472 return bdev; 473 } 474 475 struct spdk_bdev_wait_for_examine_ctx { 476 struct spdk_poller *poller; 477 spdk_bdev_wait_for_examine_cb cb_fn; 478 void *cb_arg; 479 }; 480 481 static bool bdev_module_all_actions_completed(void); 482 483 static int 484 bdev_wait_for_examine_cb(void *arg) 485 { 486 struct spdk_bdev_wait_for_examine_ctx *ctx = arg; 487 488 if (!bdev_module_all_actions_completed()) { 489 return SPDK_POLLER_IDLE; 490 } 491 492 spdk_poller_unregister(&ctx->poller); 493 ctx->cb_fn(ctx->cb_arg); 494 free(ctx); 495 496 return SPDK_POLLER_BUSY; 497 } 498 499 int 500 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 501 { 502 struct spdk_bdev_wait_for_examine_ctx *ctx; 503 504 ctx = calloc(1, sizeof(*ctx)); 505 if (ctx == NULL) { 506 return -ENOMEM; 507 } 508 ctx->cb_fn = cb_fn; 509 ctx->cb_arg = cb_arg; 510 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 511 512 return 0; 513 } 514 515 struct spdk_bdev_examine_item { 516 char *name; 517 TAILQ_ENTRY(spdk_bdev_examine_item) link; 518 }; 519 520 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); 521 522 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 523 g_bdev_examine_allowlist); 524 525 static inline bool 526 bdev_examine_allowlist_check(const char *name) 527 { 528 struct spdk_bdev_examine_item *item; 529 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 530 if (strcmp(name, item->name) == 0) { 531 return true; 532 } 533 } 534 return false; 535 } 536 537 static inline void 538 bdev_examine_allowlist_free(void) 539 { 540 struct spdk_bdev_examine_item *item; 541 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 542 item = 
TAILQ_FIRST(&g_bdev_examine_allowlist); 543 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 544 free(item->name); 545 free(item); 546 } 547 } 548 549 static inline bool 550 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 551 { 552 struct spdk_bdev_alias *tmp; 553 if (bdev_examine_allowlist_check(bdev->name)) { 554 return true; 555 } 556 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 557 if (bdev_examine_allowlist_check(tmp->alias.name)) { 558 return true; 559 } 560 } 561 return false; 562 } 563 564 static inline bool 565 bdev_ok_to_examine(struct spdk_bdev *bdev) 566 { 567 if (g_bdev_opts.bdev_auto_examine) { 568 return true; 569 } else { 570 return bdev_in_examine_allowlist(bdev); 571 } 572 } 573 574 static void 575 bdev_examine(struct spdk_bdev *bdev) 576 { 577 struct spdk_bdev_module *module; 578 uint32_t action; 579 580 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 581 if (module->examine_config && bdev_ok_to_examine(bdev)) { 582 action = module->internal.action_in_progress; 583 module->internal.action_in_progress++; 584 module->examine_config(bdev); 585 if (action != module->internal.action_in_progress) { 586 SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n", 587 module->name); 588 } 589 } 590 } 591 592 if (bdev->internal.claim_module && bdev_ok_to_examine(bdev)) { 593 if (bdev->internal.claim_module->examine_disk) { 594 bdev->internal.claim_module->internal.action_in_progress++; 595 bdev->internal.claim_module->examine_disk(bdev); 596 } 597 return; 598 } 599 600 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 601 if (module->examine_disk && bdev_ok_to_examine(bdev)) { 602 module->internal.action_in_progress++; 603 module->examine_disk(bdev); 604 } 605 } 606 } 607 608 int 609 spdk_bdev_examine(const char *name) 610 { 611 struct spdk_bdev *bdev; 612 struct spdk_bdev_examine_item *item; 613 614 if (g_bdev_opts.bdev_auto_examine) { 615 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); 616 return -EINVAL; 617 } 618 619 if (bdev_examine_allowlist_check(name)) { 620 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 621 return -EEXIST; 622 } 623 624 item = calloc(1, sizeof(*item)); 625 if (!item) { 626 return -ENOMEM; 627 } 628 item->name = strdup(name); 629 if (!item->name) { 630 free(item); 631 return -ENOMEM; 632 } 633 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 634 635 bdev = spdk_bdev_get_by_name(name); 636 if (bdev) { 637 bdev_examine(bdev); 638 } 639 return 0; 640 } 641 642 static inline void 643 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 644 { 645 struct spdk_bdev_examine_item *item; 646 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 647 spdk_json_write_object_begin(w); 648 spdk_json_write_named_string(w, "method", "bdev_examine"); 649 spdk_json_write_named_object_begin(w, "params"); 650 spdk_json_write_named_string(w, "name", item->name); 651 spdk_json_write_object_end(w); 652 spdk_json_write_object_end(w); 653 } 654 } 655 656 struct spdk_bdev * 657 spdk_bdev_first(void) 658 { 659 struct spdk_bdev *bdev; 660 661 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 662 if (bdev) { 663 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 664 } 665 666 return bdev; 667 } 668 669 struct spdk_bdev * 670 spdk_bdev_next(struct spdk_bdev *prev) 671 { 672 struct spdk_bdev *bdev; 673 674 bdev = TAILQ_NEXT(prev, internal.link); 675 if (bdev) { 676 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 677 } 678 
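
	/*
	 * Usage sketch for the manual-examine path defined above (illustrative;
	 * assumes bdev_auto_examine was disabled via bdev_set_options and that a
	 * bdev named "Nvme0n1" exists):
	 *
	 *	static void examine_done(void *ctx)
	 *	{
	 *		// Every examine_config()/examine_disk() action has settled.
	 *	}
	 *
	 *	static void examine_one_bdev(void)
	 *	{
	 *		if (spdk_bdev_examine("Nvme0n1") == 0) {
	 *			spdk_bdev_wait_for_examine(examine_done, NULL);
	 *		}
	 *	}
	 */
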
679 return bdev; 680 } 681 682 static struct spdk_bdev * 683 _bdev_next_leaf(struct spdk_bdev *bdev) 684 { 685 while (bdev != NULL) { 686 if (bdev->internal.claim_module == NULL) { 687 return bdev; 688 } else { 689 bdev = TAILQ_NEXT(bdev, internal.link); 690 } 691 } 692 693 return bdev; 694 } 695 696 struct spdk_bdev * 697 spdk_bdev_first_leaf(void) 698 { 699 struct spdk_bdev *bdev; 700 701 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 702 703 if (bdev) { 704 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 705 } 706 707 return bdev; 708 } 709 710 struct spdk_bdev * 711 spdk_bdev_next_leaf(struct spdk_bdev *prev) 712 { 713 struct spdk_bdev *bdev; 714 715 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 716 717 if (bdev) { 718 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 719 } 720 721 return bdev; 722 } 723 724 static inline bool 725 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 726 { 727 return bdev_io->internal.ext_opts && bdev_io->internal.ext_opts->memory_domain; 728 } 729 730 void 731 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 732 { 733 struct iovec *iovs; 734 735 if (bdev_io->u.bdev.iovs == NULL) { 736 bdev_io->u.bdev.iovs = &bdev_io->iov; 737 bdev_io->u.bdev.iovcnt = 1; 738 } 739 740 iovs = bdev_io->u.bdev.iovs; 741 742 assert(iovs != NULL); 743 assert(bdev_io->u.bdev.iovcnt >= 1); 744 745 iovs[0].iov_base = buf; 746 iovs[0].iov_len = len; 747 } 748 749 void 750 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 751 { 752 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 753 bdev_io->u.bdev.md_buf = md_buf; 754 } 755 756 static bool 757 _is_buf_allocated(const struct iovec *iovs) 758 { 759 if (iovs == NULL) { 760 return false; 761 } 762 763 return iovs[0].iov_base != NULL; 764 } 765 766 static bool 767 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 768 { 769 int i; 770 uintptr_t iov_base; 771 772 if (spdk_likely(alignment == 1)) { 773 return true; 774 } 775 776 for (i = 0; i < iovcnt; i++) { 777 iov_base = (uintptr_t)iovs[i].iov_base; 778 if ((iov_base & (alignment - 1)) != 0) { 779 return false; 780 } 781 } 782 783 return true; 784 } 785 786 static void 787 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 788 { 789 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 790 void *buf; 791 792 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 793 buf = bdev_io->internal.buf; 794 bdev_io->internal.buf = NULL; 795 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 796 bdev_io->internal.get_aux_buf_cb = NULL; 797 } else { 798 assert(bdev_io->internal.get_buf_cb != NULL); 799 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 800 bdev_io->internal.get_buf_cb = NULL; 801 } 802 } 803 804 static void 805 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 806 { 807 struct spdk_bdev_io *bdev_io = ctx; 808 809 if (rc) { 810 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 811 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 812 } 813 bdev_io_get_buf_complete(bdev_io, !rc); 814 } 815 816 static void 817 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 818 { 819 int rc = 0; 820 821 /* save original md_buf */ 822 bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 823 bdev_io->internal.orig_md_iov.iov_len = len; 824 bdev_io->internal.bounce_md_iov.iov_base = md_buf; 825 bdev_io->internal.bounce_md_iov.iov_len = 
len; 826 /* set bounce md_buf */ 827 bdev_io->u.bdev.md_buf = md_buf; 828 829 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 830 if (bdev_io_use_memory_domain(bdev_io)) { 831 rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain, 832 bdev_io->internal.ext_opts->memory_domain_ctx, 833 &bdev_io->internal.orig_md_iov, 1, 834 &bdev_io->internal.bounce_md_iov, 1, 835 bdev_io->internal.data_transfer_cpl, 836 bdev_io); 837 if (rc == 0) { 838 /* Continue to submit IO in completion callback */ 839 return; 840 } 841 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 842 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain), rc); 843 } else { 844 memcpy(md_buf, bdev_io->internal.orig_md_iov.iov_base, bdev_io->internal.orig_md_iov.iov_len); 845 } 846 } 847 848 assert(bdev_io->internal.data_transfer_cpl); 849 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 850 } 851 852 static void 853 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 854 { 855 struct spdk_bdev *bdev = bdev_io->bdev; 856 uint64_t md_len; 857 void *buf; 858 859 if (spdk_bdev_is_md_separate(bdev)) { 860 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 861 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 862 863 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 864 865 if (bdev_io->u.bdev.md_buf != NULL) { 866 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 867 return; 868 } else { 869 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 870 } 871 } 872 873 bdev_io_get_buf_complete(bdev_io, true); 874 } 875 876 static void 877 _bdev_io_pull_bounce_data_buf_done(void *ctx, int rc) 878 { 879 struct spdk_bdev_io *bdev_io = ctx; 880 881 if (rc) { 882 SPDK_ERRLOG("Failed to get data buffer\n"); 883 assert(bdev_io->internal.data_transfer_cpl); 884 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 885 return; 886 } 887 888 _bdev_io_set_md_buf(bdev_io); 889 } 890 891 static void 892 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 893 bdev_copy_bounce_buffer_cpl cpl_cb) 894 { 895 int rc = 0; 896 897 bdev_io->internal.data_transfer_cpl = cpl_cb; 898 /* save original iovec */ 899 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 900 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 901 /* set bounce iov */ 902 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 903 bdev_io->u.bdev.iovcnt = 1; 904 /* set bounce buffer for this operation */ 905 bdev_io->u.bdev.iovs[0].iov_base = buf; 906 bdev_io->u.bdev.iovs[0].iov_len = len; 907 /* if this is write path, copy data from original buffer to bounce buffer */ 908 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 909 if (bdev_io_use_memory_domain(bdev_io)) { 910 rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain, 911 bdev_io->internal.ext_opts->memory_domain_ctx, 912 bdev_io->internal.orig_iovs, 913 (uint32_t) bdev_io->internal.orig_iovcnt, 914 bdev_io->u.bdev.iovs, 1, 915 _bdev_io_pull_bounce_data_buf_done, 916 bdev_io); 917 if (rc == 0) { 918 /* Continue to submit IO in completion callback */ 919 return; 920 } 921 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 922 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 923 } else { 924 spdk_copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt); 925 } 926 } 927 928 _bdev_io_pull_bounce_data_buf_done(bdev_io, rc); 929 } 930 931 static void 932 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, 
uint64_t len) 933 { 934 struct spdk_bdev *bdev = bdev_io->bdev; 935 bool buf_allocated; 936 uint64_t alignment; 937 void *aligned_buf; 938 939 bdev_io->internal.buf = buf; 940 941 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 942 bdev_io_get_buf_complete(bdev_io, true); 943 return; 944 } 945 946 alignment = spdk_bdev_get_buf_align(bdev); 947 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 948 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 949 950 if (buf_allocated) { 951 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 952 /* Continue in completion callback */ 953 return; 954 } else { 955 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 956 } 957 958 _bdev_io_set_md_buf(bdev_io); 959 } 960 961 static void 962 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 963 { 964 struct spdk_bdev *bdev = bdev_io->bdev; 965 struct spdk_mempool *pool; 966 struct spdk_bdev_io *tmp; 967 bdev_io_stailq_t *stailq; 968 struct spdk_bdev_mgmt_channel *ch; 969 uint64_t md_len, alignment; 970 971 md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 972 alignment = spdk_bdev_get_buf_align(bdev); 973 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 974 975 if (buf_len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + 976 SPDK_BDEV_POOL_ALIGNMENT) { 977 pool = g_bdev_mgr.buf_small_pool; 978 stailq = &ch->need_buf_small; 979 } else { 980 pool = g_bdev_mgr.buf_large_pool; 981 stailq = &ch->need_buf_large; 982 } 983 984 if (STAILQ_EMPTY(stailq)) { 985 spdk_mempool_put(pool, buf); 986 } else { 987 tmp = STAILQ_FIRST(stailq); 988 STAILQ_REMOVE_HEAD(stailq, internal.buf_link); 989 _bdev_io_set_buf(tmp, buf, tmp->internal.buf_len); 990 } 991 } 992 993 static void 994 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 995 { 996 assert(bdev_io->internal.buf != NULL); 997 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len); 998 bdev_io->internal.buf = NULL; 999 } 1000 1001 void 1002 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1003 { 1004 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1005 1006 assert(buf != NULL); 1007 _bdev_io_put_buf(bdev_io, buf, len); 1008 } 1009 1010 static void 1011 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 1012 { 1013 struct spdk_bdev *bdev = bdev_ch->bdev; 1014 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1015 struct spdk_bdev_io *bdev_io; 1016 1017 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 1018 /* 1019 * Allow some more I/O to complete before retrying the nomem_io queue. 1020 * Some drivers (such as nvme) cannot immediately take a new I/O in 1021 * the context of a completion, because the resources for the I/O are 1022 * not released until control returns to the bdev poller. Also, we 1023 * may require several small I/O to complete before a larger I/O 1024 * (that requires splitting) can be submitted. 
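
 * To make the threshold concrete (numbers are only an example; the formula is
 * computed in _bdev_io_handle_no_mem() below as
 * max(io_outstanding / 2, io_outstanding - NOMEM_THRESHOLD_COUNT)): with 100
 * I/O outstanding when NOMEM is hit, the threshold is max(50, 92) = 92, so
 * retries start after 8 completions; on a shallow channel with 10 outstanding
 * it is max(5, 2) = 5, i.e. wait for half of them to complete.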
1025 */ 1026 return; 1027 } 1028 1029 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1030 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 1031 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 1032 bdev_io->internal.ch->io_outstanding++; 1033 shared_resource->io_outstanding++; 1034 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1035 bdev_io->internal.error.nvme.cdw0 = 0; 1036 bdev_io->num_retries++; 1037 bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 1038 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 1039 break; 1040 } 1041 } 1042 } 1043 1044 static inline void 1045 _bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1046 struct spdk_bdev_shared_resource *shared_resource) 1047 { 1048 assert(bdev_ch->io_outstanding > 0); 1049 assert(shared_resource->io_outstanding > 0); 1050 bdev_ch->io_outstanding--; 1051 shared_resource->io_outstanding--; 1052 } 1053 1054 static inline bool 1055 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io) 1056 { 1057 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1058 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1059 1060 if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) { 1061 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 1062 /* 1063 * Wait for some of the outstanding I/O to complete before we 1064 * retry any of the nomem_io. Normally we will wait for 1065 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 1066 * depth channels we will instead wait for half to complete. 1067 */ 1068 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 1069 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 1070 return true; 1071 } 1072 1073 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1074 bdev_ch_retry_io(bdev_ch); 1075 } 1076 1077 return false; 1078 } 1079 1080 static void 1081 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1082 { 1083 struct spdk_bdev_io *bdev_io = ctx; 1084 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1085 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1086 1087 if (rc) { 1088 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1089 } 1090 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1091 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). 
1092 */ 1093 bdev_io_put_buf(bdev_io); 1094 1095 /* Continue with IO completion flow */ 1096 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 1097 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 1098 return; 1099 } 1100 1101 bdev_io_complete(bdev_io); 1102 } 1103 1104 static inline void 1105 _bdev_io_push_bounce_md_buffer(struct spdk_bdev_io *bdev_io) 1106 { 1107 int rc = 0; 1108 1109 /* do the same for metadata buffer */ 1110 if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) { 1111 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1112 1113 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1114 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 1115 if (bdev_io_use_memory_domain(bdev_io)) { 1116 /* If memory domain is used then we need to call async push function */ 1117 rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain, 1118 bdev_io->internal.ext_opts->memory_domain_ctx, 1119 &bdev_io->internal.orig_md_iov, 1120 (uint32_t)bdev_io->internal.orig_iovcnt, 1121 &bdev_io->internal.bounce_md_iov, 1, 1122 bdev_io->internal.data_transfer_cpl, 1123 bdev_io); 1124 if (rc == 0) { 1125 /* Continue IO completion in async callback */ 1126 return; 1127 } 1128 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1129 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 1130 } else { 1131 memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1132 bdev_io->internal.orig_md_iov.iov_len); 1133 } 1134 } 1135 } 1136 1137 assert(bdev_io->internal.data_transfer_cpl); 1138 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1139 } 1140 1141 static void 1142 _bdev_io_push_bounce_data_buffer_done(void *ctx, int rc) 1143 { 1144 struct spdk_bdev_io *bdev_io = ctx; 1145 1146 assert(bdev_io->internal.data_transfer_cpl); 1147 1148 if (rc) { 1149 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1150 return; 1151 } 1152 1153 /* set original buffer for this io */ 1154 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 1155 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 1156 /* disable bouncing buffer for this io */ 1157 bdev_io->internal.orig_iovcnt = 0; 1158 bdev_io->internal.orig_iovs = NULL; 1159 1160 _bdev_io_push_bounce_md_buffer(bdev_io); 1161 } 1162 1163 static inline void 1164 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1165 { 1166 int rc = 0; 1167 1168 bdev_io->internal.data_transfer_cpl = cpl_cb; 1169 1170 /* if this is read path, copy data from bounce buffer to original buffer */ 1171 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1172 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 1173 if (bdev_io_use_memory_domain(bdev_io)) { 1174 /* If memory domain is used then we need to call async push function */ 1175 rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain, 1176 bdev_io->internal.ext_opts->memory_domain_ctx, 1177 bdev_io->internal.orig_iovs, 1178 (uint32_t)bdev_io->internal.orig_iovcnt, 1179 &bdev_io->internal.bounce_iov, 1, 1180 _bdev_io_push_bounce_data_buffer_done, 1181 bdev_io); 1182 if (rc == 0) { 1183 /* Continue IO completion in async callback */ 1184 return; 1185 } 1186 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1187 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 1188 } else { 1189 spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs, 1190 bdev_io->internal.orig_iovcnt, 1191 bdev_io->internal.bounce_iov.iov_base, 1192 
bdev_io->internal.bounce_iov.iov_len); 1193 } 1194 } 1195 1196 _bdev_io_push_bounce_data_buffer_done(bdev_io, rc); 1197 } 1198 1199 static void 1200 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1201 { 1202 struct spdk_bdev *bdev = bdev_io->bdev; 1203 struct spdk_mempool *pool; 1204 bdev_io_stailq_t *stailq; 1205 struct spdk_bdev_mgmt_channel *mgmt_ch; 1206 uint64_t alignment, md_len; 1207 void *buf; 1208 1209 alignment = spdk_bdev_get_buf_align(bdev); 1210 md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1211 1212 if (len + alignment + md_len > SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) + 1213 SPDK_BDEV_POOL_ALIGNMENT) { 1214 SPDK_ERRLOG("Length + alignment %" PRIu64 " is larger than allowed\n", 1215 len + alignment); 1216 bdev_io_get_buf_complete(bdev_io, false); 1217 return; 1218 } 1219 1220 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1221 1222 bdev_io->internal.buf_len = len; 1223 1224 if (len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + 1225 SPDK_BDEV_POOL_ALIGNMENT) { 1226 pool = g_bdev_mgr.buf_small_pool; 1227 stailq = &mgmt_ch->need_buf_small; 1228 } else { 1229 pool = g_bdev_mgr.buf_large_pool; 1230 stailq = &mgmt_ch->need_buf_large; 1231 } 1232 1233 buf = spdk_mempool_get(pool); 1234 if (!buf) { 1235 STAILQ_INSERT_TAIL(stailq, bdev_io, internal.buf_link); 1236 } else { 1237 _bdev_io_set_buf(bdev_io, buf, len); 1238 } 1239 } 1240 1241 void 1242 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1243 { 1244 struct spdk_bdev *bdev = bdev_io->bdev; 1245 uint64_t alignment; 1246 1247 assert(cb != NULL); 1248 bdev_io->internal.get_buf_cb = cb; 1249 1250 alignment = spdk_bdev_get_buf_align(bdev); 1251 1252 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1253 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1254 /* Buffer already present and aligned */ 1255 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1256 return; 1257 } 1258 1259 bdev_io_get_buf(bdev_io, len); 1260 } 1261 1262 static void 1263 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1264 bool success) 1265 { 1266 if (!success) { 1267 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1268 bdev_io_complete(bdev_io); 1269 } else { 1270 bdev_io_submit(bdev_io); 1271 } 1272 } 1273 1274 static void 1275 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1276 uint64_t len) 1277 { 1278 assert(cb != NULL); 1279 bdev_io->internal.get_buf_cb = cb; 1280 1281 bdev_io_get_buf(bdev_io, len); 1282 } 1283 1284 void 1285 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1286 { 1287 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1288 1289 assert(cb != NULL); 1290 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1291 bdev_io->internal.get_aux_buf_cb = cb; 1292 bdev_io_get_buf(bdev_io, len); 1293 } 1294 1295 static int 1296 bdev_module_get_max_ctx_size(void) 1297 { 1298 struct spdk_bdev_module *bdev_module; 1299 int max_bdev_module_size = 0; 1300 1301 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1302 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1303 max_bdev_module_size = bdev_module->get_ctx_size(); 1304 } 1305 } 1306 1307 return max_bdev_module_size; 1308 } 1309 1310 static void 1311 bdev_qos_config_json(struct spdk_bdev *bdev, struct 
spdk_json_write_ctx *w) 1312 { 1313 int i; 1314 struct spdk_bdev_qos *qos = bdev->internal.qos; 1315 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1316 1317 if (!qos) { 1318 return; 1319 } 1320 1321 spdk_bdev_get_qos_rate_limits(bdev, limits); 1322 1323 spdk_json_write_object_begin(w); 1324 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1325 1326 spdk_json_write_named_object_begin(w, "params"); 1327 spdk_json_write_named_string(w, "name", bdev->name); 1328 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1329 if (limits[i] > 0) { 1330 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1331 } 1332 } 1333 spdk_json_write_object_end(w); 1334 1335 spdk_json_write_object_end(w); 1336 } 1337 1338 void 1339 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1340 { 1341 struct spdk_bdev_module *bdev_module; 1342 struct spdk_bdev *bdev; 1343 1344 assert(w != NULL); 1345 1346 spdk_json_write_array_begin(w); 1347 1348 spdk_json_write_object_begin(w); 1349 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1350 spdk_json_write_named_object_begin(w, "params"); 1351 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1352 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1353 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1354 spdk_json_write_object_end(w); 1355 spdk_json_write_object_end(w); 1356 1357 bdev_examine_allowlist_config_json(w); 1358 1359 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1360 if (bdev_module->config_json) { 1361 bdev_module->config_json(w); 1362 } 1363 } 1364 1365 pthread_mutex_lock(&g_bdev_mgr.mutex); 1366 1367 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1368 if (bdev->fn_table->write_config_json) { 1369 bdev->fn_table->write_config_json(bdev, w); 1370 } 1371 1372 bdev_qos_config_json(bdev, w); 1373 } 1374 1375 pthread_mutex_unlock(&g_bdev_mgr.mutex); 1376 1377 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1378 spdk_json_write_object_begin(w); 1379 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1380 spdk_json_write_object_end(w); 1381 1382 spdk_json_write_array_end(w); 1383 } 1384 1385 static void 1386 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1387 { 1388 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1389 struct spdk_bdev_io *bdev_io; 1390 1391 if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) { 1392 SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n"); 1393 } 1394 1395 if (!TAILQ_EMPTY(&ch->shared_resources)) { 1396 SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n"); 1397 } 1398 1399 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1400 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1401 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1402 ch->per_thread_cache_count--; 1403 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1404 } 1405 1406 assert(ch->per_thread_cache_count == 0); 1407 } 1408 1409 static int 1410 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1411 { 1412 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1413 struct spdk_bdev_io *bdev_io; 1414 uint32_t i; 1415 1416 STAILQ_INIT(&ch->need_buf_small); 1417 STAILQ_INIT(&ch->need_buf_large); 1418 1419 STAILQ_INIT(&ch->per_thread_cache); 1420 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 1421 1422 /* Pre-populate bdev_io cache to ensure this 
thread cannot be starved. */ 1423 ch->per_thread_cache_count = 0; 1424 for (i = 0; i < ch->bdev_io_cache_size; i++) { 1425 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1426 if (bdev_io == NULL) { 1427 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 1428 assert(false); 1429 bdev_mgmt_channel_destroy(io_device, ctx_buf); 1430 return -1; 1431 } 1432 ch->per_thread_cache_count++; 1433 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1434 } 1435 1436 TAILQ_INIT(&ch->shared_resources); 1437 TAILQ_INIT(&ch->io_wait_queue); 1438 1439 return 0; 1440 } 1441 1442 static void 1443 bdev_init_complete(int rc) 1444 { 1445 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 1446 void *cb_arg = g_init_cb_arg; 1447 struct spdk_bdev_module *m; 1448 1449 g_bdev_mgr.init_complete = true; 1450 g_init_cb_fn = NULL; 1451 g_init_cb_arg = NULL; 1452 1453 /* 1454 * For modules that need to know when subsystem init is complete, 1455 * inform them now. 1456 */ 1457 if (rc == 0) { 1458 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1459 if (m->init_complete) { 1460 m->init_complete(); 1461 } 1462 } 1463 } 1464 1465 cb_fn(cb_arg, rc); 1466 } 1467 1468 static bool 1469 bdev_module_all_actions_completed(void) 1470 { 1471 struct spdk_bdev_module *m; 1472 1473 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1474 if (m->internal.action_in_progress > 0) { 1475 return false; 1476 } 1477 } 1478 return true; 1479 } 1480 1481 static void 1482 bdev_module_action_complete(void) 1483 { 1484 /* 1485 * Don't finish bdev subsystem initialization if 1486 * module pre-initialization is still in progress, or 1487 * the subsystem been already initialized. 1488 */ 1489 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 1490 return; 1491 } 1492 1493 /* 1494 * Check all bdev modules for inits/examinations in progress. If any 1495 * exist, return immediately since we cannot finish bdev subsystem 1496 * initialization until all are completed. 1497 */ 1498 if (!bdev_module_all_actions_completed()) { 1499 return; 1500 } 1501 1502 /* 1503 * Modules already finished initialization - now that all 1504 * the bdev modules have finished their asynchronous I/O 1505 * processing, the entire bdev layer can be marked as complete. 
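
 * From the module side, an asynchronously-initializing module reports that
 * completion via spdk_bdev_module_init_done(). A minimal sketch (the module
 * name and callback are illustrative, and SPDK_BDEV_MODULE_REGISTER() is
 * assumed to be used the same way as in other bdev modules):
 *
 *	static struct spdk_bdev_module example_if;
 *
 *	static int example_module_init(void)
 *	{
 *		// Kick off background probing; completion is reported later.
 *		return 0;
 *	}
 *
 *	static void example_probe_done(void)
 *	{
 *		spdk_bdev_module_init_done(&example_if);
 *	}
 *
 *	static struct spdk_bdev_module example_if = {
 *		.name = "example",
 *		.module_init = example_module_init,
 *		.async_init = true,
 *	};
 *	SPDK_BDEV_MODULE_REGISTER(example, &example_if)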
1506 */ 1507 bdev_init_complete(0); 1508 } 1509 1510 static void 1511 bdev_module_action_done(struct spdk_bdev_module *module) 1512 { 1513 assert(module->internal.action_in_progress > 0); 1514 module->internal.action_in_progress--; 1515 bdev_module_action_complete(); 1516 } 1517 1518 void 1519 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 1520 { 1521 bdev_module_action_done(module); 1522 } 1523 1524 void 1525 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 1526 { 1527 bdev_module_action_done(module); 1528 } 1529 1530 /** The last initialized bdev module */ 1531 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 1532 1533 static void 1534 bdev_init_failed(void *cb_arg) 1535 { 1536 struct spdk_bdev_module *module = cb_arg; 1537 1538 module->internal.action_in_progress--; 1539 bdev_init_complete(-1); 1540 } 1541 1542 static int 1543 bdev_modules_init(void) 1544 { 1545 struct spdk_bdev_module *module; 1546 int rc = 0; 1547 1548 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1549 g_resume_bdev_module = module; 1550 if (module->async_init) { 1551 module->internal.action_in_progress = 1; 1552 } 1553 rc = module->module_init(); 1554 if (rc != 0) { 1555 /* Bump action_in_progress to prevent other modules from completion of modules_init 1556 * Send message to defer application shutdown until resources are cleaned up */ 1557 module->internal.action_in_progress = 1; 1558 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 1559 return rc; 1560 } 1561 } 1562 1563 g_resume_bdev_module = NULL; 1564 return 0; 1565 } 1566 1567 void 1568 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 1569 { 1570 int cache_size; 1571 int rc = 0; 1572 char mempool_name[32]; 1573 1574 assert(cb_fn != NULL); 1575 1576 g_init_cb_fn = cb_fn; 1577 g_init_cb_arg = cb_arg; 1578 1579 spdk_notify_type_register("bdev_register"); 1580 spdk_notify_type_register("bdev_unregister"); 1581 1582 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 1583 1584 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 1585 g_bdev_opts.bdev_io_pool_size, 1586 sizeof(struct spdk_bdev_io) + 1587 bdev_module_get_max_ctx_size(), 1588 0, 1589 SPDK_ENV_SOCKET_ID_ANY); 1590 1591 if (g_bdev_mgr.bdev_io_pool == NULL) { 1592 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 1593 bdev_init_complete(-1); 1594 return; 1595 } 1596 1597 /** 1598 * Ensure no more than half of the total buffers end up local caches, by 1599 * using spdk_env_get_core_count() to determine how many local caches we need 1600 * to account for. 
1601 */ 1602 cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_env_get_core_count()); 1603 snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid()); 1604 1605 g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name, 1606 g_bdev_opts.small_buf_pool_size, 1607 SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + 1608 SPDK_BDEV_POOL_ALIGNMENT, 1609 cache_size, 1610 SPDK_ENV_SOCKET_ID_ANY); 1611 if (!g_bdev_mgr.buf_small_pool) { 1612 SPDK_ERRLOG("create rbuf small pool failed\n"); 1613 bdev_init_complete(-1); 1614 return; 1615 } 1616 1617 cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_env_get_core_count()); 1618 snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid()); 1619 1620 g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name, 1621 g_bdev_opts.large_buf_pool_size, 1622 SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) + 1623 SPDK_BDEV_POOL_ALIGNMENT, 1624 cache_size, 1625 SPDK_ENV_SOCKET_ID_ANY); 1626 if (!g_bdev_mgr.buf_large_pool) { 1627 SPDK_ERRLOG("create rbuf large pool failed\n"); 1628 bdev_init_complete(-1); 1629 return; 1630 } 1631 1632 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 1633 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 1634 if (!g_bdev_mgr.zero_buffer) { 1635 SPDK_ERRLOG("create bdev zero buffer failed\n"); 1636 bdev_init_complete(-1); 1637 return; 1638 } 1639 1640 #ifdef SPDK_CONFIG_VTUNE 1641 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 1642 #endif 1643 1644 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 1645 bdev_mgmt_channel_destroy, 1646 sizeof(struct spdk_bdev_mgmt_channel), 1647 "bdev_mgr"); 1648 1649 rc = bdev_modules_init(); 1650 g_bdev_mgr.module_init_complete = true; 1651 if (rc != 0) { 1652 SPDK_ERRLOG("bdev modules init failed\n"); 1653 return; 1654 } 1655 1656 bdev_module_action_complete(); 1657 } 1658 1659 static void 1660 bdev_mgr_unregister_cb(void *io_device) 1661 { 1662 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 1663 1664 if (g_bdev_mgr.bdev_io_pool) { 1665 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 1666 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 1667 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 1668 g_bdev_opts.bdev_io_pool_size); 1669 } 1670 1671 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 1672 } 1673 1674 if (g_bdev_mgr.buf_small_pool) { 1675 if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != g_bdev_opts.small_buf_pool_size) { 1676 SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n", 1677 spdk_mempool_count(g_bdev_mgr.buf_small_pool), 1678 g_bdev_opts.small_buf_pool_size); 1679 assert(false); 1680 } 1681 1682 spdk_mempool_free(g_bdev_mgr.buf_small_pool); 1683 } 1684 1685 if (g_bdev_mgr.buf_large_pool) { 1686 if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != g_bdev_opts.large_buf_pool_size) { 1687 SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n", 1688 spdk_mempool_count(g_bdev_mgr.buf_large_pool), 1689 g_bdev_opts.large_buf_pool_size); 1690 assert(false); 1691 } 1692 1693 spdk_mempool_free(g_bdev_mgr.buf_large_pool); 1694 } 1695 1696 spdk_free(g_bdev_mgr.zero_buffer); 1697 1698 bdev_examine_allowlist_free(); 1699 1700 cb_fn(g_fini_cb_arg); 1701 g_fini_cb_fn = NULL; 1702 g_fini_cb_arg = NULL; 1703 g_bdev_mgr.init_complete = false; 1704 g_bdev_mgr.module_init_complete = false; 1705 } 1706 1707 static void 1708 bdev_module_fini_iter(void *arg) 1709 { 1710 struct spdk_bdev_module *bdev_module; 1711 1712 /* FIXME: Handling initialization failures is broken 
now, 1713 * so we won't even try cleaning up after successfully 1714 * initialized modules. if module_init_complete is false, 1715 * just call spdk_bdev_mgr_unregister_cb 1716 */ 1717 if (!g_bdev_mgr.module_init_complete) { 1718 bdev_mgr_unregister_cb(NULL); 1719 return; 1720 } 1721 1722 /* Start iterating from the last touched module */ 1723 if (!g_resume_bdev_module) { 1724 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1725 } else { 1726 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 1727 internal.tailq); 1728 } 1729 1730 while (bdev_module) { 1731 if (bdev_module->async_fini) { 1732 /* Save our place so we can resume later. We must 1733 * save the variable here, before calling module_fini() 1734 * below, because in some cases the module may immediately 1735 * call spdk_bdev_module_fini_done() and re-enter 1736 * this function to continue iterating. */ 1737 g_resume_bdev_module = bdev_module; 1738 } 1739 1740 if (bdev_module->module_fini) { 1741 bdev_module->module_fini(); 1742 } 1743 1744 if (bdev_module->async_fini) { 1745 return; 1746 } 1747 1748 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 1749 internal.tailq); 1750 } 1751 1752 g_resume_bdev_module = NULL; 1753 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 1754 } 1755 1756 void 1757 spdk_bdev_module_fini_done(void) 1758 { 1759 if (spdk_get_thread() != g_fini_thread) { 1760 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 1761 } else { 1762 bdev_module_fini_iter(NULL); 1763 } 1764 } 1765 1766 static void 1767 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 1768 { 1769 struct spdk_bdev *bdev = cb_arg; 1770 1771 if (bdeverrno && bdev) { 1772 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 1773 bdev->name); 1774 1775 /* 1776 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 1777 * bdev; try to continue by manually removing this bdev from the list and continue 1778 * with the next bdev in the list. 1779 */ 1780 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 1781 } 1782 1783 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 1784 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 1785 /* 1786 * Bdev module finish need to be deferred as we might be in the middle of some context 1787 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 1788 * after returning. 1789 */ 1790 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 1791 return; 1792 } 1793 1794 /* 1795 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 1796 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 1797 * to detect clean shutdown as opposed to run-time hot removal of the underlying 1798 * base bdevs. 1799 * 1800 * Also, walk the list in the reverse order. 
1801 */ 1802 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1803 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1804 if (bdev->internal.claim_module != NULL) { 1805 SPDK_DEBUGLOG(bdev, "Skipping claimed bdev '%s'(<-'%s').\n", 1806 bdev->name, bdev->internal.claim_module->name); 1807 continue; 1808 } 1809 1810 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 1811 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 1812 return; 1813 } 1814 1815 /* 1816 * If any bdev fails to unclaim underlying bdev properly, we may face the 1817 * case of bdev list consisting of claimed bdevs only (if claims are managed 1818 * correctly, this would mean there's a loop in the claims graph which is 1819 * clearly impossible). Warn and unregister last bdev on the list then. 1820 */ 1821 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1822 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1823 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 1824 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 1825 return; 1826 } 1827 } 1828 1829 static void 1830 bdev_module_fini_start_iter(void *arg) 1831 { 1832 struct spdk_bdev_module *bdev_module; 1833 1834 if (!g_resume_bdev_module) { 1835 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1836 } else { 1837 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 1838 } 1839 1840 while (bdev_module) { 1841 if (bdev_module->async_fini_start) { 1842 /* Save our place so we can resume later. We must 1843 * save the variable here, before calling fini_start() 1844 * below, because in some cases the module may immediately 1845 * call spdk_bdev_module_fini_start_done() and re-enter 1846 * this function to continue iterating. 
*/ 1847 g_resume_bdev_module = bdev_module; 1848 } 1849 1850 if (bdev_module->fini_start) { 1851 bdev_module->fini_start(); 1852 } 1853 1854 if (bdev_module->async_fini_start) { 1855 return; 1856 } 1857 1858 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 1859 } 1860 1861 g_resume_bdev_module = NULL; 1862 1863 bdev_finish_unregister_bdevs_iter(NULL, 0); 1864 } 1865 1866 void 1867 spdk_bdev_module_fini_start_done(void) 1868 { 1869 if (spdk_get_thread() != g_fini_thread) { 1870 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 1871 } else { 1872 bdev_module_fini_start_iter(NULL); 1873 } 1874 } 1875 1876 static void 1877 bdev_finish_wait_for_examine_done(void *cb_arg) 1878 { 1879 bdev_module_fini_start_iter(NULL); 1880 } 1881 1882 void 1883 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 1884 { 1885 int rc; 1886 1887 assert(cb_fn != NULL); 1888 1889 g_fini_thread = spdk_get_thread(); 1890 1891 g_fini_cb_fn = cb_fn; 1892 g_fini_cb_arg = cb_arg; 1893 1894 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 1895 if (rc != 0) { 1896 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 1897 bdev_finish_wait_for_examine_done(NULL); 1898 } 1899 } 1900 1901 struct spdk_bdev_io * 1902 bdev_channel_get_io(struct spdk_bdev_channel *channel) 1903 { 1904 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 1905 struct spdk_bdev_io *bdev_io; 1906 1907 if (ch->per_thread_cache_count > 0) { 1908 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1909 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1910 ch->per_thread_cache_count--; 1911 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 1912 /* 1913 * Don't try to look for bdev_ios in the global pool if there are 1914 * waiters on bdev_ios - we don't want this caller to jump the line. 1915 */ 1916 bdev_io = NULL; 1917 } else { 1918 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1919 } 1920 1921 return bdev_io; 1922 } 1923 1924 void 1925 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 1926 { 1927 struct spdk_bdev_mgmt_channel *ch; 1928 1929 assert(bdev_io != NULL); 1930 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 1931 1932 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1933 1934 if (bdev_io->internal.buf != NULL) { 1935 bdev_io_put_buf(bdev_io); 1936 } 1937 1938 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 1939 ch->per_thread_cache_count++; 1940 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1941 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 1942 struct spdk_bdev_io_wait_entry *entry; 1943 1944 entry = TAILQ_FIRST(&ch->io_wait_queue); 1945 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 1946 entry->cb_fn(entry->cb_arg); 1947 } 1948 } else { 1949 /* We should never have a full cache with entries on the io wait queue. 
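
 * The io_wait_queue waiters drained earlier in this function are registered
 * with the public spdk_bdev_queue_io_wait() API. A typical caller-side pattern
 * when a submit call returns -ENOMEM (sketch; "ctx" and retry_submit() are
 * illustrative, and bdev_io_wait is a struct spdk_bdev_io_wait_entry embedded
 * in the caller's context):
 *
 *	ctx->bdev_io_wait.bdev = bdev;
 *	ctx->bdev_io_wait.cb_fn = retry_submit;   // re-issues the same spdk_bdev_read()/write()
 *	ctx->bdev_io_wait.cb_arg = ctx;
 *	spdk_bdev_queue_io_wait(bdev, io_ch, &ctx->bdev_io_wait);
 *
 * Once spdk_bdev_free_io() returns a bdev_io to this thread's cache, cb_fn is
 * invoked and the submission can be retried.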
*/ 1950 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 1951 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1952 } 1953 } 1954 1955 static bool 1956 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 1957 { 1958 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 1959 1960 switch (limit) { 1961 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 1962 return true; 1963 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 1964 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 1965 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 1966 return false; 1967 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 1968 default: 1969 return false; 1970 } 1971 } 1972 1973 static bool 1974 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 1975 { 1976 switch (bdev_io->type) { 1977 case SPDK_BDEV_IO_TYPE_NVME_IO: 1978 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1979 case SPDK_BDEV_IO_TYPE_READ: 1980 case SPDK_BDEV_IO_TYPE_WRITE: 1981 return true; 1982 case SPDK_BDEV_IO_TYPE_ZCOPY: 1983 if (bdev_io->u.bdev.zcopy.start) { 1984 return true; 1985 } else { 1986 return false; 1987 } 1988 default: 1989 return false; 1990 } 1991 } 1992 1993 static bool 1994 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 1995 { 1996 switch (bdev_io->type) { 1997 case SPDK_BDEV_IO_TYPE_NVME_IO: 1998 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1999 /* Bit 1 (0x2) set for read operation */ 2000 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2001 return true; 2002 } else { 2003 return false; 2004 } 2005 case SPDK_BDEV_IO_TYPE_READ: 2006 return true; 2007 case SPDK_BDEV_IO_TYPE_ZCOPY: 2008 /* Populate to read from disk */ 2009 if (bdev_io->u.bdev.zcopy.populate) { 2010 return true; 2011 } else { 2012 return false; 2013 } 2014 default: 2015 return false; 2016 } 2017 } 2018 2019 static uint64_t 2020 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2021 { 2022 struct spdk_bdev *bdev = bdev_io->bdev; 2023 2024 switch (bdev_io->type) { 2025 case SPDK_BDEV_IO_TYPE_NVME_IO: 2026 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2027 return bdev_io->u.nvme_passthru.nbytes; 2028 case SPDK_BDEV_IO_TYPE_READ: 2029 case SPDK_BDEV_IO_TYPE_WRITE: 2030 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2031 case SPDK_BDEV_IO_TYPE_ZCOPY: 2032 /* Track the data in the start phase only */ 2033 if (bdev_io->u.bdev.zcopy.start) { 2034 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2035 } else { 2036 return 0; 2037 } 2038 default: 2039 return 0; 2040 } 2041 } 2042 2043 static bool 2044 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2045 { 2046 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 2047 return true; 2048 } else { 2049 return false; 2050 } 2051 } 2052 2053 static bool 2054 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2055 { 2056 if (bdev_is_read_io(io) == false) { 2057 return false; 2058 } 2059 2060 return bdev_qos_rw_queue_io(limit, io); 2061 } 2062 2063 static bool 2064 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2065 { 2066 if (bdev_is_read_io(io) == true) { 2067 return false; 2068 } 2069 2070 return bdev_qos_rw_queue_io(limit, io); 2071 } 2072 2073 static void 2074 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2075 { 2076 limit->remaining_this_timeslice--; 2077 } 2078 2079 static void 2080 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2081 { 2082 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 2083 } 2084 2085 static void 2086 
bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2087 { 2088 if (bdev_is_read_io(io) == false) { 2089 return; 2090 } 2091 2092 return bdev_qos_rw_bps_update_quota(limit, io); 2093 } 2094 2095 static void 2096 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2097 { 2098 if (bdev_is_read_io(io) == true) { 2099 return; 2100 } 2101 2102 return bdev_qos_rw_bps_update_quota(limit, io); 2103 } 2104 2105 static void 2106 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2107 { 2108 int i; 2109 2110 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2111 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2112 qos->rate_limits[i].queue_io = NULL; 2113 qos->rate_limits[i].update_quota = NULL; 2114 continue; 2115 } 2116 2117 switch (i) { 2118 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2119 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2120 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2121 break; 2122 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2123 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2124 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2125 break; 2126 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2127 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2128 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2129 break; 2130 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2131 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2132 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2133 break; 2134 default: 2135 break; 2136 } 2137 } 2138 } 2139 2140 static void 2141 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2142 struct spdk_bdev_io *bdev_io, 2143 enum spdk_bdev_io_status status) 2144 { 2145 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2146 2147 bdev_io->internal.in_submit_request = true; 2148 bdev_ch->io_outstanding++; 2149 shared_resource->io_outstanding++; 2150 spdk_bdev_io_complete(bdev_io, status); 2151 bdev_io->internal.in_submit_request = false; 2152 } 2153 2154 static inline void 2155 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2156 { 2157 struct spdk_bdev *bdev = bdev_io->bdev; 2158 struct spdk_io_channel *ch = bdev_ch->channel; 2159 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2160 2161 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2162 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2163 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2164 2165 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2166 bdev_abort_buf_io(&mgmt_channel->need_buf_small, bio_to_abort) || 2167 bdev_abort_buf_io(&mgmt_channel->need_buf_large, bio_to_abort)) { 2168 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2169 SPDK_BDEV_IO_STATUS_SUCCESS); 2170 return; 2171 } 2172 } 2173 2174 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2175 bdev_io->bdev->split_on_write_unit && 2176 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2177 SPDK_ERRLOG("IO does not match the write_unit_size\n"); 2178 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2179 return; 2180 } 2181 2182 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2183 bdev_ch->io_outstanding++; 2184 shared_resource->io_outstanding++; 2185 bdev_io->internal.in_submit_request = true; 2186 bdev->fn_table->submit_request(ch, bdev_io); 2187 
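/* Note: the module may complete the I/O synchronously from within submit_request(); in_submit_request remains set across the call, which lets callers and completion paths distinguish an in-submit completion from an asynchronous one. */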
bdev_io->internal.in_submit_request = false; 2188 } else { 2189 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 2190 } 2191 } 2192 2193 static bool 2194 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2195 { 2196 int i; 2197 2198 if (bdev_qos_io_to_limit(bdev_io) == true) { 2199 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2200 if (!qos->rate_limits[i].queue_io) { 2201 continue; 2202 } 2203 2204 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2205 bdev_io) == true) { 2206 return true; 2207 } 2208 } 2209 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2210 if (!qos->rate_limits[i].update_quota) { 2211 continue; 2212 } 2213 2214 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2215 } 2216 } 2217 2218 return false; 2219 } 2220 2221 static int 2222 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2223 { 2224 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2225 int submitted_ios = 0; 2226 2227 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2228 if (!bdev_qos_queue_io(qos, bdev_io)) { 2229 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2230 bdev_io_do_submit(ch, bdev_io); 2231 submitted_ios++; 2232 } 2233 } 2234 2235 return submitted_ios; 2236 } 2237 2238 static void 2239 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2240 { 2241 int rc; 2242 2243 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2244 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2245 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2246 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2247 &bdev_io->internal.waitq_entry); 2248 if (rc != 0) { 2249 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2250 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2251 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2252 } 2253 } 2254 2255 static bool 2256 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2257 { 2258 uint32_t io_boundary; 2259 struct spdk_bdev *bdev = bdev_io->bdev; 2260 uint32_t max_size = bdev->max_segment_size; 2261 int max_segs = bdev->max_num_segments; 2262 2263 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2264 io_boundary = bdev->write_unit_size; 2265 } else if (bdev->split_on_optimal_io_boundary) { 2266 io_boundary = bdev->optimal_io_boundary; 2267 } else { 2268 io_boundary = 0; 2269 } 2270 2271 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2272 return false; 2273 } 2274 2275 if (io_boundary) { 2276 uint64_t start_stripe, end_stripe; 2277 2278 start_stripe = bdev_io->u.bdev.offset_blocks; 2279 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2280 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
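* For example, with io_boundary = 8 an I/O covering blocks 6 through 9 maps to start_stripe 0 and end_stripe 1 (6 >> 3 != 9 >> 3), so it crosses a boundary and must be split.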
*/ 2281 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2282 start_stripe >>= spdk_u32log2(io_boundary); 2283 end_stripe >>= spdk_u32log2(io_boundary); 2284 } else { 2285 start_stripe /= io_boundary; 2286 end_stripe /= io_boundary; 2287 } 2288 2289 if (start_stripe != end_stripe) { 2290 return true; 2291 } 2292 } 2293 2294 if (max_segs) { 2295 if (bdev_io->u.bdev.iovcnt > max_segs) { 2296 return true; 2297 } 2298 } 2299 2300 if (max_size) { 2301 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2302 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2303 return true; 2304 } 2305 } 2306 } 2307 2308 return false; 2309 } 2310 2311 static bool 2312 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2313 { 2314 uint32_t num_unmap_segments; 2315 2316 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2317 return false; 2318 } 2319 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2320 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2321 return true; 2322 } 2323 2324 return false; 2325 } 2326 2327 static bool 2328 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2329 { 2330 if (!bdev_io->bdev->max_write_zeroes) { 2331 return false; 2332 } 2333 2334 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2335 return true; 2336 } 2337 2338 return false; 2339 } 2340 2341 static bool 2342 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2343 { 2344 switch (bdev_io->type) { 2345 case SPDK_BDEV_IO_TYPE_READ: 2346 case SPDK_BDEV_IO_TYPE_WRITE: 2347 return bdev_rw_should_split(bdev_io); 2348 case SPDK_BDEV_IO_TYPE_UNMAP: 2349 return bdev_unmap_should_split(bdev_io); 2350 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2351 return bdev_write_zeroes_should_split(bdev_io); 2352 default: 2353 return false; 2354 } 2355 } 2356 2357 static uint32_t 2358 _to_next_boundary(uint64_t offset, uint32_t boundary) 2359 { 2360 return (boundary - (offset % boundary)); 2361 } 2362 2363 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2364 2365 static void _bdev_rw_split(void *_bdev_io); 2366 2367 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2368 2369 static void 2370 _bdev_unmap_split(void *_bdev_io) 2371 { 2372 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2373 } 2374 2375 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2376 2377 static void 2378 _bdev_write_zeroes_split(void *_bdev_io) 2379 { 2380 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2381 } 2382 2383 static int 2384 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2385 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2386 { 2387 int rc; 2388 uint64_t current_offset, current_remaining; 2389 spdk_bdev_io_wait_cb io_wait_fn; 2390 2391 current_offset = *offset; 2392 current_remaining = *remaining; 2393 2394 bdev_io->u.bdev.split_outstanding++; 2395 2396 io_wait_fn = _bdev_rw_split; 2397 switch (bdev_io->type) { 2398 case SPDK_BDEV_IO_TYPE_READ: 2399 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2400 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2401 iov, iovcnt, md_buf, current_offset, 2402 num_blocks, 2403 bdev_io_split_done, bdev_io, 2404 bdev_io->internal.ext_opts, true); 2405 break; 2406 case SPDK_BDEV_IO_TYPE_WRITE: 2407 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2408 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2409 iov, iovcnt, md_buf, current_offset, 2410 num_blocks, 
2411 bdev_io_split_done, bdev_io, 2412 bdev_io->internal.ext_opts, true); 2413 break; 2414 case SPDK_BDEV_IO_TYPE_UNMAP: 2415 io_wait_fn = _bdev_unmap_split; 2416 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2417 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2418 current_offset, num_blocks, 2419 bdev_io_split_done, bdev_io); 2420 break; 2421 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2422 io_wait_fn = _bdev_write_zeroes_split; 2423 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2424 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2425 current_offset, num_blocks, 2426 bdev_io_split_done, bdev_io); 2427 break; 2428 default: 2429 assert(false); 2430 rc = -EINVAL; 2431 break; 2432 } 2433 2434 if (rc == 0) { 2435 current_offset += num_blocks; 2436 current_remaining -= num_blocks; 2437 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2438 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2439 *offset = current_offset; 2440 *remaining = current_remaining; 2441 } else { 2442 bdev_io->u.bdev.split_outstanding--; 2443 if (rc == -ENOMEM) { 2444 if (bdev_io->u.bdev.split_outstanding == 0) { 2445 /* No I/O is outstanding. Hence we should wait here. */ 2446 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2447 } 2448 } else { 2449 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2450 if (bdev_io->u.bdev.split_outstanding == 0) { 2451 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2452 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2453 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2454 } 2455 } 2456 } 2457 2458 return rc; 2459 } 2460 2461 static void 2462 _bdev_rw_split(void *_bdev_io) 2463 { 2464 struct iovec *parent_iov, *iov; 2465 struct spdk_bdev_io *bdev_io = _bdev_io; 2466 struct spdk_bdev *bdev = bdev_io->bdev; 2467 uint64_t parent_offset, current_offset, remaining; 2468 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2469 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2470 uint32_t iovcnt, iov_len, child_iovsize; 2471 uint32_t blocklen = bdev->blocklen; 2472 uint32_t io_boundary; 2473 uint32_t max_segment_size = bdev->max_segment_size; 2474 uint32_t max_child_iovcnt = bdev->max_num_segments; 2475 void *md_buf = NULL; 2476 int rc; 2477 2478 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 2479 max_child_iovcnt = max_child_iovcnt ? 
spdk_min(max_child_iovcnt, BDEV_IO_NUM_CHILD_IOV) : 2480 BDEV_IO_NUM_CHILD_IOV; 2481 2482 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2483 io_boundary = bdev->write_unit_size; 2484 } else if (bdev->split_on_optimal_io_boundary) { 2485 io_boundary = bdev->optimal_io_boundary; 2486 } else { 2487 io_boundary = UINT32_MAX; 2488 } 2489 2490 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2491 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 2492 parent_offset = bdev_io->u.bdev.offset_blocks; 2493 parent_iov_offset = (current_offset - parent_offset) * blocklen; 2494 parent_iovcnt = bdev_io->u.bdev.iovcnt; 2495 2496 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 2497 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2498 if (parent_iov_offset < parent_iov->iov_len) { 2499 break; 2500 } 2501 parent_iov_offset -= parent_iov->iov_len; 2502 } 2503 2504 child_iovcnt = 0; 2505 while (remaining > 0 && parent_iovpos < parent_iovcnt && child_iovcnt < BDEV_IO_NUM_CHILD_IOV) { 2506 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 2507 to_next_boundary = spdk_min(remaining, to_next_boundary); 2508 to_next_boundary_bytes = to_next_boundary * blocklen; 2509 2510 iov = &bdev_io->child_iov[child_iovcnt]; 2511 iovcnt = 0; 2512 2513 if (bdev_io->u.bdev.md_buf) { 2514 md_buf = (char *)bdev_io->u.bdev.md_buf + 2515 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 2516 } 2517 2518 child_iovsize = spdk_min(BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 2519 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 2520 iovcnt < child_iovsize) { 2521 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2522 iov_len = parent_iov->iov_len - parent_iov_offset; 2523 2524 iov_len = spdk_min(iov_len, max_segment_size); 2525 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 2526 to_next_boundary_bytes -= iov_len; 2527 2528 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 2529 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 2530 2531 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 2532 parent_iov_offset += iov_len; 2533 } else { 2534 parent_iovpos++; 2535 parent_iov_offset = 0; 2536 } 2537 child_iovcnt++; 2538 iovcnt++; 2539 } 2540 2541 if (to_next_boundary_bytes > 0) { 2542 /* We had to stop this child I/O early because we ran out of 2543 * child_iov space or were limited by max_num_segments. 2544 * Ensure the iovs to be aligned with block size and 2545 * then adjust to_next_boundary before starting the 2546 * child I/O. 2547 */ 2548 assert(child_iovcnt == BDEV_IO_NUM_CHILD_IOV || 2549 iovcnt == child_iovsize); 2550 to_last_block_bytes = to_next_boundary_bytes % blocklen; 2551 if (to_last_block_bytes != 0) { 2552 uint32_t child_iovpos = child_iovcnt - 1; 2553 /* don't decrease child_iovcnt when it equals to BDEV_IO_NUM_CHILD_IOV 2554 * so the loop will naturally end 2555 */ 2556 2557 to_last_block_bytes = blocklen - to_last_block_bytes; 2558 to_next_boundary_bytes += to_last_block_bytes; 2559 while (to_last_block_bytes > 0 && iovcnt > 0) { 2560 iov_len = spdk_min(to_last_block_bytes, 2561 bdev_io->child_iov[child_iovpos].iov_len); 2562 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 2563 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 2564 child_iovpos--; 2565 if (--iovcnt == 0) { 2566 /* If the child IO is less than a block size just return. 
2567 * If the first child I/O of any split round is less than 2568 * a block size, fail the parent I/O and return. 2569 */ 2570 if (bdev_io->u.bdev.split_outstanding == 0) { 2571 SPDK_ERRLOG("The first child io was less than a block size\n"); 2572 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2573 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2574 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2575 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2576 } 2577 2578 return; 2579 } 2580 } 2581 2582 to_last_block_bytes -= iov_len; 2583 2584 if (parent_iov_offset == 0) { 2585 parent_iovpos--; 2586 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 2587 } 2588 parent_iov_offset -= iov_len; 2589 } 2590 2591 assert(to_last_block_bytes == 0); 2592 } 2593 to_next_boundary -= to_next_boundary_bytes / blocklen; 2594 } 2595 2596 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 2597 &current_offset, &remaining); 2598 if (spdk_unlikely(rc)) { 2599 return; 2600 } 2601 } 2602 } 2603 2604 static void 2605 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 2606 { 2607 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 2608 uint32_t num_children_reqs = 0; 2609 int rc; 2610 2611 offset = bdev_io->u.bdev.split_current_offset_blocks; 2612 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2613 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 2614 2615 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2616 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 2617 2618 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 2619 &offset, &remaining); 2620 if (spdk_likely(rc == 0)) { 2621 num_children_reqs++; 2622 } else { 2623 return; 2624 } 2625 } 2626 } 2627 2628 static void 2629 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 2630 { 2631 uint64_t offset, write_zeroes_blocks, remaining; 2632 uint32_t num_children_reqs = 0; 2633 int rc; 2634 2635 offset = bdev_io->u.bdev.split_current_offset_blocks; 2636 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2637 2638 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2639 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 2640 2641 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 2642 &offset, &remaining); 2643 if (spdk_likely(rc == 0)) { 2644 num_children_reqs++; 2645 } else { 2646 return; 2647 } 2648 } 2649 } 2650 2651 static void 2652 parent_bdev_io_complete(void *ctx, int rc) 2653 { 2654 struct spdk_bdev_io *parent_io = ctx; 2655 2656 if (rc) { 2657 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2658 } 2659 2660 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 2661 parent_io->internal.caller_ctx); 2662 } 2663 2664 static void 2665 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 2666 { 2667 struct spdk_bdev_io *parent_io = cb_arg; 2668 2669 spdk_bdev_free_io(bdev_io); 2670 2671 if (!success) { 2672 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2673 /* If any child I/O failed, stop further splitting process.
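* Children that are already outstanding still complete normally; we only stop issuing new ones by treating all remaining blocks as consumed.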
*/ 2674 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 2675 parent_io->u.bdev.split_remaining_num_blocks = 0; 2676 } 2677 parent_io->u.bdev.split_outstanding--; 2678 if (parent_io->u.bdev.split_outstanding != 0) { 2679 return; 2680 } 2681 2682 /* 2683 * Parent I/O finishes when all blocks are consumed. 2684 */ 2685 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 2686 assert(parent_io->internal.cb != bdev_io_split_done); 2687 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 2688 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 2689 2690 if (parent_io->internal.orig_iovcnt != 0) { 2691 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 2692 /* bdev IO will be completed in the callback */ 2693 } else { 2694 parent_bdev_io_complete(parent_io, 0); 2695 } 2696 return; 2697 } 2698 2699 /* 2700 * Continue with the splitting process. This function will complete the parent I/O if the 2701 * splitting is done. 2702 */ 2703 switch (parent_io->type) { 2704 case SPDK_BDEV_IO_TYPE_READ: 2705 case SPDK_BDEV_IO_TYPE_WRITE: 2706 _bdev_rw_split(parent_io); 2707 break; 2708 case SPDK_BDEV_IO_TYPE_UNMAP: 2709 bdev_unmap_split(parent_io); 2710 break; 2711 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2712 bdev_write_zeroes_split(parent_io); 2713 break; 2714 default: 2715 assert(false); 2716 break; 2717 } 2718 } 2719 2720 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2721 bool success); 2722 2723 static void 2724 bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 2725 { 2726 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 2727 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 2728 bdev_io->u.bdev.split_outstanding = 0; 2729 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 2730 2731 switch (bdev_io->type) { 2732 case SPDK_BDEV_IO_TYPE_READ: 2733 case SPDK_BDEV_IO_TYPE_WRITE: 2734 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 2735 _bdev_rw_split(bdev_io); 2736 } else { 2737 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 2738 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 2739 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 2740 } 2741 break; 2742 case SPDK_BDEV_IO_TYPE_UNMAP: 2743 bdev_unmap_split(bdev_io); 2744 break; 2745 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2746 bdev_write_zeroes_split(bdev_io); 2747 break; 2748 default: 2749 assert(false); 2750 break; 2751 } 2752 } 2753 2754 static void 2755 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 2756 { 2757 if (!success) { 2758 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2759 return; 2760 } 2761 2762 _bdev_rw_split(bdev_io); 2763 } 2764 2765 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 2766 * be inlined, at least on some compilers. 
2767 */ 2768 static inline void 2769 _bdev_io_submit(void *ctx) 2770 { 2771 struct spdk_bdev_io *bdev_io = ctx; 2772 struct spdk_bdev *bdev = bdev_io->bdev; 2773 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2774 uint64_t tsc; 2775 2776 tsc = spdk_get_ticks(); 2777 bdev_io->internal.submit_tsc = tsc; 2778 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_START, 0, 0, (uintptr_t)bdev_io, 2779 (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 2780 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks); 2781 2782 if (spdk_likely(bdev_ch->flags == 0)) { 2783 bdev_io_do_submit(bdev_ch, bdev_io); 2784 return; 2785 } 2786 2787 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 2788 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 2789 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 2790 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 2791 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 2792 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2793 } else { 2794 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 2795 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 2796 } 2797 } else { 2798 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 2799 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2800 } 2801 } 2802 2803 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 2804 2805 bool 2806 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 2807 { 2808 if (range1->length == 0 || range2->length == 0) { 2809 return false; 2810 } 2811 2812 if (range1->offset + range1->length <= range2->offset) { 2813 return false; 2814 } 2815 2816 if (range2->offset + range2->length <= range1->offset) { 2817 return false; 2818 } 2819 2820 return true; 2821 } 2822 2823 static bool 2824 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 2825 { 2826 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2827 struct lba_range r; 2828 2829 switch (bdev_io->type) { 2830 case SPDK_BDEV_IO_TYPE_NVME_IO: 2831 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2832 /* Don't try to decode the NVMe command - just assume worst-case and that 2833 * it overlaps a locked range. 2834 */ 2835 return true; 2836 case SPDK_BDEV_IO_TYPE_WRITE: 2837 case SPDK_BDEV_IO_TYPE_UNMAP: 2838 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2839 case SPDK_BDEV_IO_TYPE_ZCOPY: 2840 r.offset = bdev_io->u.bdev.offset_blocks; 2841 r.length = bdev_io->u.bdev.num_blocks; 2842 if (!bdev_lba_range_overlapped(range, &r)) { 2843 /* This I/O doesn't overlap the specified LBA range. */ 2844 return false; 2845 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 2846 /* This I/O overlaps, but the I/O is on the same channel that locked this 2847 * range, and the caller_ctx is the same as the locked_ctx. This means 2848 * that this I/O is associated with the lock, and is allowed to execute. 
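* Overlapping I/O from any other channel or context is instead queued on the channel's io_locked list until the range is unlocked (see bdev_io_submit()).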
2849 */ 2850 return false; 2851 } else { 2852 return true; 2853 } 2854 default: 2855 return false; 2856 } 2857 } 2858 2859 void 2860 bdev_io_submit(struct spdk_bdev_io *bdev_io) 2861 { 2862 struct spdk_bdev *bdev = bdev_io->bdev; 2863 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 2864 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2865 2866 assert(thread != NULL); 2867 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2868 2869 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 2870 struct lba_range *range; 2871 2872 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 2873 if (bdev_io_range_is_locked(bdev_io, range)) { 2874 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 2875 return; 2876 } 2877 } 2878 } 2879 2880 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 2881 2882 if (bdev_io_should_split(bdev_io)) { 2883 bdev_io->internal.submit_tsc = spdk_get_ticks(); 2884 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 2885 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 2886 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks); 2887 bdev_io_split(NULL, bdev_io); 2888 return; 2889 } 2890 2891 if (ch->flags & BDEV_CH_QOS_ENABLED) { 2892 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 2893 _bdev_io_submit(bdev_io); 2894 } else { 2895 bdev_io->internal.io_submit_ch = ch; 2896 bdev_io->internal.ch = bdev->internal.qos->ch; 2897 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 2898 } 2899 } else { 2900 _bdev_io_submit(bdev_io); 2901 } 2902 } 2903 2904 static inline void 2905 _bdev_io_copy_ext_opts(struct spdk_bdev_io *bdev_io, struct spdk_bdev_ext_io_opts *opts) 2906 { 2907 struct spdk_bdev_ext_io_opts *opts_copy = &bdev_io->internal.ext_opts_copy; 2908 2909 /* Zero part we don't copy */ 2910 memset(((char *)opts_copy) + opts->size, 0, sizeof(*opts) - opts->size); 2911 memcpy(opts_copy, opts, opts->size); 2912 opts_copy->size = sizeof(*opts_copy); 2913 opts_copy->metadata = bdev_io->u.bdev.md_buf; 2914 /* Save pointer to the copied ext_opts which will be used by bdev modules */ 2915 bdev_io->u.bdev.ext_opts = opts_copy; 2916 } 2917 2918 static inline void 2919 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 2920 { 2921 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 2922 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 2923 * For write operation we need to pull buffers from memory domain before submitting IO. 
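* (pulling means copying the data from the caller's memory domain into a locally accessible bounce buffer).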
2924 * Once read operation completes, we need to use memory_domain push functionality to 2925 * update data in original memory domain IO buffer 2926 * This IO request will go through a regular IO flow, so clear memory domains pointers in 2927 * the copied ext_opts */ 2928 bdev_io->internal.ext_opts_copy.memory_domain = NULL; 2929 bdev_io->internal.ext_opts_copy.memory_domain_ctx = NULL; 2930 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 2931 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 2932 } 2933 2934 static inline void 2935 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io, 2936 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 2937 { 2938 if (opts) { 2939 bool use_pull_push = opts->memory_domain && !desc->memory_domains_supported; 2940 assert(opts->size <= sizeof(*opts)); 2941 /* 2942 * copy if size is smaller than opts struct to avoid having to check size 2943 * on every access to bdev_io->u.bdev.ext_opts 2944 */ 2945 if (copy_opts || use_pull_push || opts->size < sizeof(*opts)) { 2946 _bdev_io_copy_ext_opts(bdev_io, opts); 2947 if (use_pull_push) { 2948 _bdev_io_ext_use_bounce_buffer(bdev_io); 2949 return; 2950 } 2951 } 2952 } 2953 bdev_io_submit(bdev_io); 2954 } 2955 2956 static void 2957 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 2958 { 2959 struct spdk_bdev *bdev = bdev_io->bdev; 2960 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2961 struct spdk_io_channel *ch = bdev_ch->channel; 2962 2963 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2964 2965 bdev_io->internal.in_submit_request = true; 2966 bdev->fn_table->submit_request(ch, bdev_io); 2967 bdev_io->internal.in_submit_request = false; 2968 } 2969 2970 void 2971 bdev_io_init(struct spdk_bdev_io *bdev_io, 2972 struct spdk_bdev *bdev, void *cb_arg, 2973 spdk_bdev_io_completion_cb cb) 2974 { 2975 bdev_io->bdev = bdev; 2976 bdev_io->internal.caller_ctx = cb_arg; 2977 bdev_io->internal.cb = cb; 2978 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 2979 bdev_io->internal.in_submit_request = false; 2980 bdev_io->internal.buf = NULL; 2981 bdev_io->internal.io_submit_ch = NULL; 2982 bdev_io->internal.orig_iovs = NULL; 2983 bdev_io->internal.orig_iovcnt = 0; 2984 bdev_io->internal.orig_md_iov.iov_base = NULL; 2985 bdev_io->internal.error.nvme.cdw0 = 0; 2986 bdev_io->num_retries = 0; 2987 bdev_io->internal.get_buf_cb = NULL; 2988 bdev_io->internal.get_aux_buf_cb = NULL; 2989 bdev_io->internal.ext_opts = NULL; 2990 bdev_io->internal.data_transfer_cpl = NULL; 2991 } 2992 2993 static bool 2994 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 2995 { 2996 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 2997 } 2998 2999 bool 3000 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3001 { 3002 bool supported; 3003 3004 supported = bdev_io_type_supported(bdev, io_type); 3005 3006 if (!supported) { 3007 switch (io_type) { 3008 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3009 /* The bdev layer will emulate write zeroes as long as write is supported. 
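* The emulation submits regular WRITEs of the bdev layer's internal zero buffer.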
*/ 3010 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3011 break; 3012 default: 3013 break; 3014 } 3015 } 3016 3017 return supported; 3018 } 3019 3020 int 3021 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3022 { 3023 if (bdev->fn_table->dump_info_json) { 3024 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3025 } 3026 3027 return 0; 3028 } 3029 3030 static void 3031 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3032 { 3033 uint32_t max_per_timeslice = 0; 3034 int i; 3035 3036 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3037 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3038 qos->rate_limits[i].max_per_timeslice = 0; 3039 continue; 3040 } 3041 3042 max_per_timeslice = qos->rate_limits[i].limit * 3043 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3044 3045 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3046 qos->rate_limits[i].min_per_timeslice); 3047 3048 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 3049 } 3050 3051 bdev_qos_set_ops(qos); 3052 } 3053 3054 static int 3055 bdev_channel_poll_qos(void *arg) 3056 { 3057 struct spdk_bdev_qos *qos = arg; 3058 uint64_t now = spdk_get_ticks(); 3059 int i; 3060 3061 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3062 /* We received our callback earlier than expected - return 3063 * immediately and wait to do accounting until at least one 3064 * timeslice has actually expired. This should never happen 3065 * with a well-behaved timer implementation. 3066 */ 3067 return SPDK_POLLER_IDLE; 3068 } 3069 3070 /* Reset for next round of rate limiting */ 3071 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3072 /* We may have allowed the IOs or bytes to slightly overrun in the last 3073 * timeslice. remaining_this_timeslice is signed, so if it's negative 3074 * here, we'll account for the overrun so that the next timeslice will 3075 * be appropriately reduced. 3076 */ 3077 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 3078 qos->rate_limits[i].remaining_this_timeslice = 0; 3079 } 3080 } 3081 3082 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3083 qos->last_timeslice += qos->timeslice_size; 3084 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3085 qos->rate_limits[i].remaining_this_timeslice += 3086 qos->rate_limits[i].max_per_timeslice; 3087 } 3088 } 3089 3090 return bdev_qos_io_submit(qos->ch, qos); 3091 } 3092 3093 static void 3094 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3095 { 3096 struct spdk_bdev_shared_resource *shared_resource; 3097 struct lba_range *range; 3098 3099 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3100 range = TAILQ_FIRST(&ch->locked_ranges); 3101 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3102 free(range); 3103 } 3104 3105 spdk_put_io_channel(ch->channel); 3106 3107 shared_resource = ch->shared_resource; 3108 3109 assert(TAILQ_EMPTY(&ch->io_locked)); 3110 assert(TAILQ_EMPTY(&ch->io_submitted)); 3111 assert(ch->io_outstanding == 0); 3112 assert(shared_resource->ref > 0); 3113 shared_resource->ref--; 3114 if (shared_resource->ref == 0) { 3115 assert(shared_resource->io_outstanding == 0); 3116 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3117 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3118 free(shared_resource); 3119 } 3120 } 3121 3122 /* Caller must hold bdev->internal.mutex. 
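* This keeps checking and assigning qos->ch atomic while channels are being created on other threads. The setup below also derives the per-timeslice quota from the configured limit, e.g. an rw_ios_per_sec limit of 10000 with the default 1000 us timeslice yields 10000 * 1000 / 1000000 = 10 I/Os per timeslice (clamped to at least min_per_timeslice).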
*/ 3123 static void 3124 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3125 { 3126 struct spdk_bdev_qos *qos = bdev->internal.qos; 3127 int i; 3128 3129 /* Rate limiting on this bdev enabled */ 3130 if (qos) { 3131 if (qos->ch == NULL) { 3132 struct spdk_io_channel *io_ch; 3133 3134 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3135 bdev->name, spdk_get_thread()); 3136 3137 /* No qos channel has been selected, so set one up */ 3138 3139 /* Take another reference to ch */ 3140 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3141 assert(io_ch != NULL); 3142 qos->ch = ch; 3143 3144 qos->thread = spdk_io_channel_get_thread(io_ch); 3145 3146 TAILQ_INIT(&qos->queued); 3147 3148 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3149 if (bdev_qos_is_iops_rate_limit(i) == true) { 3150 qos->rate_limits[i].min_per_timeslice = 3151 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3152 } else { 3153 qos->rate_limits[i].min_per_timeslice = 3154 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3155 } 3156 3157 if (qos->rate_limits[i].limit == 0) { 3158 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3159 } 3160 } 3161 bdev_qos_update_max_quota_per_timeslice(qos); 3162 qos->timeslice_size = 3163 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3164 qos->last_timeslice = spdk_get_ticks(); 3165 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3166 qos, 3167 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3168 } 3169 3170 ch->flags |= BDEV_CH_QOS_ENABLED; 3171 } 3172 } 3173 3174 struct poll_timeout_ctx { 3175 struct spdk_bdev_desc *desc; 3176 uint64_t timeout_in_sec; 3177 spdk_bdev_io_timeout_cb cb_fn; 3178 void *cb_arg; 3179 }; 3180 3181 static void 3182 bdev_desc_free(struct spdk_bdev_desc *desc) 3183 { 3184 pthread_mutex_destroy(&desc->mutex); 3185 free(desc->media_events_buffer); 3186 free(desc); 3187 } 3188 3189 static void 3190 bdev_channel_poll_timeout_io_done(struct spdk_io_channel_iter *i, int status) 3191 { 3192 struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3193 struct spdk_bdev_desc *desc = ctx->desc; 3194 3195 free(ctx); 3196 3197 pthread_mutex_lock(&desc->mutex); 3198 desc->refs--; 3199 if (desc->closed == true && desc->refs == 0) { 3200 pthread_mutex_unlock(&desc->mutex); 3201 bdev_desc_free(desc); 3202 return; 3203 } 3204 pthread_mutex_unlock(&desc->mutex); 3205 } 3206 3207 static void 3208 bdev_channel_poll_timeout_io(struct spdk_io_channel_iter *i) 3209 { 3210 struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3211 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 3212 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch); 3213 struct spdk_bdev_desc *desc = ctx->desc; 3214 struct spdk_bdev_io *bdev_io; 3215 uint64_t now; 3216 3217 pthread_mutex_lock(&desc->mutex); 3218 if (desc->closed == true) { 3219 pthread_mutex_unlock(&desc->mutex); 3220 spdk_for_each_channel_continue(i, -1); 3221 return; 3222 } 3223 pthread_mutex_unlock(&desc->mutex); 3224 3225 now = spdk_get_ticks(); 3226 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3227 /* Exclude any I/O that are generated via splitting. */ 3228 if (bdev_io->internal.cb == bdev_io_split_done) { 3229 continue; 3230 } 3231 3232 /* Once we find an I/O that has not timed out, we can immediately 3233 * exit the loop. 
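* The io_submitted list is ordered by submission time (new I/O are appended at the tail), so every later entry is at least as recent as this one.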
3234 */ 3235 if (now < (bdev_io->internal.submit_tsc + 3236 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3237 goto end; 3238 } 3239 3240 if (bdev_io->internal.desc == desc) { 3241 ctx->cb_fn(ctx->cb_arg, bdev_io); 3242 } 3243 } 3244 3245 end: 3246 spdk_for_each_channel_continue(i, 0); 3247 } 3248 3249 static int 3250 bdev_poll_timeout_io(void *arg) 3251 { 3252 struct spdk_bdev_desc *desc = arg; 3253 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3254 struct poll_timeout_ctx *ctx; 3255 3256 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3257 if (!ctx) { 3258 SPDK_ERRLOG("failed to allocate memory\n"); 3259 return SPDK_POLLER_BUSY; 3260 } 3261 ctx->desc = desc; 3262 ctx->cb_arg = desc->cb_arg; 3263 ctx->cb_fn = desc->cb_fn; 3264 ctx->timeout_in_sec = desc->timeout_in_sec; 3265 3266 /* Take a ref on the descriptor in case it gets closed while we are checking 3267 * all of the channels. 3268 */ 3269 pthread_mutex_lock(&desc->mutex); 3270 desc->refs++; 3271 pthread_mutex_unlock(&desc->mutex); 3272 3273 spdk_for_each_channel(__bdev_to_io_dev(bdev), 3274 bdev_channel_poll_timeout_io, 3275 ctx, 3276 bdev_channel_poll_timeout_io_done); 3277 3278 return SPDK_POLLER_BUSY; 3279 } 3280 3281 int 3282 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3283 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3284 { 3285 assert(desc->thread == spdk_get_thread()); 3286 3287 spdk_poller_unregister(&desc->io_timeout_poller); 3288 3289 if (timeout_in_sec) { 3290 assert(cb_fn != NULL); 3291 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3292 desc, 3293 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3294 1000); 3295 if (desc->io_timeout_poller == NULL) { 3296 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3297 return -1; 3298 } 3299 } 3300 3301 desc->cb_fn = cb_fn; 3302 desc->cb_arg = cb_arg; 3303 desc->timeout_in_sec = timeout_in_sec; 3304 3305 return 0; 3306 } 3307 3308 static int 3309 bdev_channel_create(void *io_device, void *ctx_buf) 3310 { 3311 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3312 struct spdk_bdev_channel *ch = ctx_buf; 3313 struct spdk_io_channel *mgmt_io_ch; 3314 struct spdk_bdev_mgmt_channel *mgmt_ch; 3315 struct spdk_bdev_shared_resource *shared_resource; 3316 struct lba_range *range; 3317 3318 ch->bdev = bdev; 3319 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3320 if (!ch->channel) { 3321 return -1; 3322 } 3323 3324 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3325 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3326 3327 assert(ch->histogram == NULL); 3328 if (bdev->internal.histogram_enabled) { 3329 ch->histogram = spdk_histogram_data_alloc(); 3330 if (ch->histogram == NULL) { 3331 SPDK_ERRLOG("Could not allocate histogram\n"); 3332 } 3333 } 3334 3335 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3336 if (!mgmt_io_ch) { 3337 spdk_put_io_channel(ch->channel); 3338 return -1; 3339 } 3340 3341 mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch); 3342 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3343 if (shared_resource->shared_ch == ch->channel) { 3344 spdk_put_io_channel(mgmt_io_ch); 3345 shared_resource->ref++; 3346 break; 3347 } 3348 } 3349 3350 if (shared_resource == NULL) { 3351 shared_resource = calloc(1, sizeof(*shared_resource)); 3352 if (shared_resource == NULL) { 3353 spdk_put_io_channel(ch->channel); 3354 spdk_put_io_channel(mgmt_io_ch); 3355 return -1; 3356 } 3357 3358 shared_resource->mgmt_ch = mgmt_ch; 3359 
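/* Initialize the rest of the new shared resource; bdev channels created later on the same underlying channel will reuse it and only bump its refcount. */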
shared_resource->io_outstanding = 0; 3360 TAILQ_INIT(&shared_resource->nomem_io); 3361 shared_resource->nomem_threshold = 0; 3362 shared_resource->shared_ch = ch->channel; 3363 shared_resource->ref = 1; 3364 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3365 } 3366 3367 memset(&ch->stat, 0, sizeof(ch->stat)); 3368 ch->stat.ticks_rate = spdk_get_ticks_hz(); 3369 ch->io_outstanding = 0; 3370 TAILQ_INIT(&ch->queued_resets); 3371 TAILQ_INIT(&ch->locked_ranges); 3372 ch->flags = 0; 3373 ch->shared_resource = shared_resource; 3374 3375 TAILQ_INIT(&ch->io_submitted); 3376 TAILQ_INIT(&ch->io_locked); 3377 3378 #ifdef SPDK_CONFIG_VTUNE 3379 { 3380 char *name; 3381 __itt_init_ittlib(NULL, 0); 3382 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3383 if (!name) { 3384 bdev_channel_destroy_resource(ch); 3385 return -1; 3386 } 3387 ch->handle = __itt_string_handle_create(name); 3388 free(name); 3389 ch->start_tsc = spdk_get_ticks(); 3390 ch->interval_tsc = spdk_get_ticks_hz() / 100; 3391 memset(&ch->prev_stat, 0, sizeof(ch->prev_stat)); 3392 } 3393 #endif 3394 3395 pthread_mutex_lock(&bdev->internal.mutex); 3396 bdev_enable_qos(bdev, ch); 3397 3398 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 3399 struct lba_range *new_range; 3400 3401 new_range = calloc(1, sizeof(*new_range)); 3402 if (new_range == NULL) { 3403 pthread_mutex_unlock(&bdev->internal.mutex); 3404 bdev_channel_destroy_resource(ch); 3405 return -1; 3406 } 3407 new_range->length = range->length; 3408 new_range->offset = range->offset; 3409 new_range->locked_ctx = range->locked_ctx; 3410 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 3411 } 3412 3413 pthread_mutex_unlock(&bdev->internal.mutex); 3414 3415 return 0; 3416 } 3417 3418 /* 3419 * Abort I/O that are waiting on a data buffer. These types of I/O are 3420 * linked using the spdk_bdev_io internal.buf_link TAILQ_ENTRY. 3421 */ 3422 static void 3423 bdev_abort_all_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch) 3424 { 3425 bdev_io_stailq_t tmp; 3426 struct spdk_bdev_io *bdev_io; 3427 3428 STAILQ_INIT(&tmp); 3429 3430 while (!STAILQ_EMPTY(queue)) { 3431 bdev_io = STAILQ_FIRST(queue); 3432 STAILQ_REMOVE_HEAD(queue, internal.buf_link); 3433 if (bdev_io->internal.ch == ch) { 3434 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3435 } else { 3436 STAILQ_INSERT_TAIL(&tmp, bdev_io, internal.buf_link); 3437 } 3438 } 3439 3440 STAILQ_SWAP(&tmp, queue, spdk_bdev_io); 3441 } 3442 3443 /* 3444 * Abort I/O that are queued waiting for submission. These types of I/O are 3445 * linked using the spdk_bdev_io link TAILQ_ENTRY. 3446 */ 3447 static void 3448 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 3449 { 3450 struct spdk_bdev_io *bdev_io, *tmp; 3451 3452 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 3453 if (bdev_io->internal.ch == ch) { 3454 TAILQ_REMOVE(queue, bdev_io, internal.link); 3455 /* 3456 * spdk_bdev_io_complete() assumes that the completed I/O had 3457 * been submitted to the bdev module. Since in this case it 3458 * hadn't, bump io_outstanding to account for the decrement 3459 * that spdk_bdev_io_complete() will do. 
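* Resets are not counted in io_outstanding at submit time, so no adjustment is needed for them below.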
3460 */ 3461 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 3462 ch->io_outstanding++; 3463 ch->shared_resource->io_outstanding++; 3464 } 3465 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3466 } 3467 } 3468 } 3469 3470 static bool 3471 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3472 { 3473 struct spdk_bdev_io *bdev_io; 3474 3475 TAILQ_FOREACH(bdev_io, queue, internal.link) { 3476 if (bdev_io == bio_to_abort) { 3477 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 3478 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3479 return true; 3480 } 3481 } 3482 3483 return false; 3484 } 3485 3486 static bool 3487 bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3488 { 3489 struct spdk_bdev_io *bdev_io; 3490 3491 STAILQ_FOREACH(bdev_io, queue, internal.buf_link) { 3492 if (bdev_io == bio_to_abort) { 3493 STAILQ_REMOVE(queue, bio_to_abort, spdk_bdev_io, internal.buf_link); 3494 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3495 return true; 3496 } 3497 } 3498 3499 return false; 3500 } 3501 3502 static void 3503 bdev_qos_channel_destroy(void *cb_arg) 3504 { 3505 struct spdk_bdev_qos *qos = cb_arg; 3506 3507 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 3508 spdk_poller_unregister(&qos->poller); 3509 3510 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 3511 3512 free(qos); 3513 } 3514 3515 static int 3516 bdev_qos_destroy(struct spdk_bdev *bdev) 3517 { 3518 int i; 3519 3520 /* 3521 * Cleanly shutting down the QoS poller is tricky, because 3522 * during the asynchronous operation the user could open 3523 * a new descriptor and create a new channel, spawning 3524 * a new QoS poller. 3525 * 3526 * The strategy is to create a new QoS structure here and swap it 3527 * in. The shutdown path then continues to refer to the old one 3528 * until it completes and then releases it. 3529 */ 3530 struct spdk_bdev_qos *new_qos, *old_qos; 3531 3532 old_qos = bdev->internal.qos; 3533 3534 new_qos = calloc(1, sizeof(*new_qos)); 3535 if (!new_qos) { 3536 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 3537 return -ENOMEM; 3538 } 3539 3540 /* Copy the old QoS data into the newly allocated structure */ 3541 memcpy(new_qos, old_qos, sizeof(*new_qos)); 3542 3543 /* Zero out the key parts of the QoS structure */ 3544 new_qos->ch = NULL; 3545 new_qos->thread = NULL; 3546 new_qos->poller = NULL; 3547 TAILQ_INIT(&new_qos->queued); 3548 /* 3549 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 3550 * It will be used later for the new QoS structure. 3551 */ 3552 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3553 new_qos->rate_limits[i].remaining_this_timeslice = 0; 3554 new_qos->rate_limits[i].min_per_timeslice = 0; 3555 new_qos->rate_limits[i].max_per_timeslice = 0; 3556 } 3557 3558 bdev->internal.qos = new_qos; 3559 3560 if (old_qos->thread == NULL) { 3561 free(old_qos); 3562 } else { 3563 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 3564 } 3565 3566 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 3567 * been destroyed yet. The destruction path will end up waiting for the final 3568 * channel to be put before it releases resources. 
*/ 3569 3570 return 0; 3571 } 3572 3573 static void 3574 bdev_io_stat_add(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 3575 { 3576 total->bytes_read += add->bytes_read; 3577 total->num_read_ops += add->num_read_ops; 3578 total->bytes_written += add->bytes_written; 3579 total->num_write_ops += add->num_write_ops; 3580 total->bytes_unmapped += add->bytes_unmapped; 3581 total->num_unmap_ops += add->num_unmap_ops; 3582 total->read_latency_ticks += add->read_latency_ticks; 3583 total->write_latency_ticks += add->write_latency_ticks; 3584 total->unmap_latency_ticks += add->unmap_latency_ticks; 3585 } 3586 3587 static void 3588 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 3589 { 3590 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 3591 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 3592 3593 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 3594 bdev_abort_all_buf_io(&mgmt_ch->need_buf_small, ch); 3595 bdev_abort_all_buf_io(&mgmt_ch->need_buf_large, ch); 3596 } 3597 3598 static void 3599 bdev_channel_destroy(void *io_device, void *ctx_buf) 3600 { 3601 struct spdk_bdev_channel *ch = ctx_buf; 3602 3603 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 3604 spdk_get_thread()); 3605 3606 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 3607 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3608 3609 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 3610 pthread_mutex_lock(&ch->bdev->internal.mutex); 3611 bdev_io_stat_add(&ch->bdev->internal.stat, &ch->stat); 3612 pthread_mutex_unlock(&ch->bdev->internal.mutex); 3613 3614 bdev_abort_all_queued_io(&ch->queued_resets, ch); 3615 3616 bdev_channel_abort_queued_ios(ch); 3617 3618 if (ch->histogram) { 3619 spdk_histogram_data_free(ch->histogram); 3620 } 3621 3622 bdev_channel_destroy_resource(ch); 3623 } 3624 3625 /* 3626 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 3627 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
3628 */ 3629 static int 3630 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 3631 { 3632 struct spdk_bdev_name *tmp; 3633 3634 bdev_name->name = strdup(name); 3635 if (bdev_name->name == NULL) { 3636 SPDK_ERRLOG("Unable to allocate bdev name\n"); 3637 return -ENOMEM; 3638 } 3639 3640 bdev_name->bdev = bdev; 3641 3642 pthread_mutex_lock(&g_bdev_mgr.mutex); 3643 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3644 pthread_mutex_unlock(&g_bdev_mgr.mutex); 3645 3646 if (tmp != NULL) { 3647 SPDK_ERRLOG("Bdev name %s already exists\n", name); 3648 free(bdev_name->name); 3649 return -EEXIST; 3650 } 3651 3652 return 0; 3653 } 3654 3655 static void 3656 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 3657 { 3658 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3659 free(bdev_name->name); 3660 } 3661 3662 static void 3663 bdev_name_del(struct spdk_bdev_name *bdev_name) 3664 { 3665 pthread_mutex_lock(&g_bdev_mgr.mutex); 3666 bdev_name_del_unsafe(bdev_name); 3667 pthread_mutex_unlock(&g_bdev_mgr.mutex); 3668 } 3669 3670 int 3671 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 3672 { 3673 struct spdk_bdev_alias *tmp; 3674 int ret; 3675 3676 if (alias == NULL) { 3677 SPDK_ERRLOG("Empty alias passed\n"); 3678 return -EINVAL; 3679 } 3680 3681 tmp = calloc(1, sizeof(*tmp)); 3682 if (tmp == NULL) { 3683 SPDK_ERRLOG("Unable to allocate alias\n"); 3684 return -ENOMEM; 3685 } 3686 3687 ret = bdev_name_add(&tmp->alias, bdev, alias); 3688 if (ret != 0) { 3689 free(tmp); 3690 return ret; 3691 } 3692 3693 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 3694 3695 return 0; 3696 } 3697 3698 static int 3699 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 3700 void (*alias_del_fn)(struct spdk_bdev_name *n)) 3701 { 3702 struct spdk_bdev_alias *tmp; 3703 3704 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 3705 if (strcmp(alias, tmp->alias.name) == 0) { 3706 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 3707 alias_del_fn(&tmp->alias); 3708 free(tmp); 3709 return 0; 3710 } 3711 } 3712 3713 return -ENOENT; 3714 } 3715 3716 int 3717 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 3718 { 3719 int rc; 3720 3721 rc = bdev_alias_del(bdev, alias, bdev_name_del); 3722 if (rc == -ENOENT) { 3723 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 3724 } 3725 3726 return rc; 3727 } 3728 3729 void 3730 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 3731 { 3732 struct spdk_bdev_alias *p, *tmp; 3733 3734 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 3735 TAILQ_REMOVE(&bdev->aliases, p, tailq); 3736 bdev_name_del(&p->alias); 3737 free(p); 3738 } 3739 } 3740 3741 struct spdk_io_channel * 3742 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 3743 { 3744 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 3745 } 3746 3747 void * 3748 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 3749 { 3750 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3751 void *ctx = NULL; 3752 3753 if (bdev->fn_table->get_module_ctx) { 3754 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 3755 } 3756 3757 return ctx; 3758 } 3759 3760 const char * 3761 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 3762 { 3763 return bdev->module->name; 3764 } 3765 3766 const char * 3767 spdk_bdev_get_name(const struct spdk_bdev *bdev) 3768 { 3769 return bdev->name; 3770 } 3771 3772 const char * 3773 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 3774 { 3775 return bdev->product_name; 3776 } 3777 
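/*
 * Usage sketch (hypothetical caller code, not part of this file): the total
 * capacity of a bdev in bytes can be derived from the getters below, e.g.
 *
 *	uint64_t capacity = spdk_bdev_get_num_blocks(bdev) * spdk_bdev_get_block_size(bdev);
 */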
3778 const struct spdk_bdev_aliases_list * 3779 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 3780 { 3781 return &bdev->aliases; 3782 } 3783 3784 uint32_t 3785 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 3786 { 3787 return bdev->blocklen; 3788 } 3789 3790 uint32_t 3791 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 3792 { 3793 return bdev->write_unit_size; 3794 } 3795 3796 uint64_t 3797 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 3798 { 3799 return bdev->blockcnt; 3800 } 3801 3802 const char * 3803 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 3804 { 3805 return qos_rpc_type[type]; 3806 } 3807 3808 void 3809 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 3810 { 3811 int i; 3812 3813 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 3814 3815 pthread_mutex_lock(&bdev->internal.mutex); 3816 if (bdev->internal.qos) { 3817 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3818 if (bdev->internal.qos->rate_limits[i].limit != 3819 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3820 limits[i] = bdev->internal.qos->rate_limits[i].limit; 3821 if (bdev_qos_is_iops_rate_limit(i) == false) { 3822 /* Change from Byte to Megabyte which is user visible. */ 3823 limits[i] = limits[i] / 1024 / 1024; 3824 } 3825 } 3826 } 3827 } 3828 pthread_mutex_unlock(&bdev->internal.mutex); 3829 } 3830 3831 size_t 3832 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 3833 { 3834 return 1 << bdev->required_alignment; 3835 } 3836 3837 uint32_t 3838 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 3839 { 3840 return bdev->optimal_io_boundary; 3841 } 3842 3843 bool 3844 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 3845 { 3846 return bdev->write_cache; 3847 } 3848 3849 const struct spdk_uuid * 3850 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 3851 { 3852 return &bdev->uuid; 3853 } 3854 3855 uint16_t 3856 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 3857 { 3858 return bdev->acwu; 3859 } 3860 3861 uint32_t 3862 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 3863 { 3864 return bdev->md_len; 3865 } 3866 3867 bool 3868 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 3869 { 3870 return (bdev->md_len != 0) && bdev->md_interleave; 3871 } 3872 3873 bool 3874 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 3875 { 3876 return (bdev->md_len != 0) && !bdev->md_interleave; 3877 } 3878 3879 bool 3880 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 3881 { 3882 return bdev->zoned; 3883 } 3884 3885 uint32_t 3886 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 3887 { 3888 if (spdk_bdev_is_md_interleaved(bdev)) { 3889 return bdev->blocklen - bdev->md_len; 3890 } else { 3891 return bdev->blocklen; 3892 } 3893 } 3894 3895 uint32_t 3896 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 3897 { 3898 return bdev->phys_blocklen; 3899 } 3900 3901 static uint32_t 3902 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 3903 { 3904 if (!spdk_bdev_is_md_interleaved(bdev)) { 3905 return bdev->blocklen + bdev->md_len; 3906 } else { 3907 return bdev->blocklen; 3908 } 3909 } 3910 3911 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 3912 typedef enum spdk_dif_type spdk_dif_type_t; 3913 3914 spdk_dif_type_t 3915 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 3916 { 3917 if (bdev->md_len != 0) { 3918 return bdev->dif_type; 3919 } else { 3920 return SPDK_DIF_DISABLE; 3921 } 3922 } 3923 3924 bool 3925 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 3926 { 3927 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 3928 return bdev->dif_is_head_of_md; 3929 } else { 3930 return false; 3931 } 3932 } 3933 3934 bool 3935 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 3936 enum spdk_dif_check_type check_type) 3937 { 3938 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 3939 return false; 3940 } 3941 3942 switch (check_type) { 3943 case SPDK_DIF_CHECK_TYPE_REFTAG: 3944 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 3945 case SPDK_DIF_CHECK_TYPE_APPTAG: 3946 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 3947 case SPDK_DIF_CHECK_TYPE_GUARD: 3948 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 3949 default: 3950 return false; 3951 } 3952 } 3953 3954 uint64_t 3955 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 3956 { 3957 return bdev->internal.measured_queue_depth; 3958 } 3959 3960 uint64_t 3961 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 3962 { 3963 return bdev->internal.period; 3964 } 3965 3966 uint64_t 3967 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 3968 { 3969 return bdev->internal.weighted_io_time; 3970 } 3971 3972 uint64_t 3973 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 3974 { 3975 return bdev->internal.io_time; 3976 } 3977 3978 static void bdev_update_qd_sampling_period(void *ctx); 3979 3980 static void 3981 _calculate_measured_qd_cpl(struct spdk_io_channel_iter *i, int status) 3982 { 3983 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 3984 3985 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 3986 3987 if (bdev->internal.measured_queue_depth) { 3988 bdev->internal.io_time += bdev->internal.period; 3989 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 3990 } 3991 3992 bdev->internal.qd_poll_in_progress = false; 3993 3994 bdev_update_qd_sampling_period(bdev); 3995 } 3996 3997 static void 3998 _calculate_measured_qd(struct spdk_io_channel_iter *i) 3999 { 4000 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 4001 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 4002 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(io_ch); 4003 4004 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4005 spdk_for_each_channel_continue(i, 0); 4006 } 4007 4008 static int 4009 bdev_calculate_measured_queue_depth(void *ctx) 4010 { 4011 struct spdk_bdev *bdev = ctx; 4012 4013 bdev->internal.qd_poll_in_progress = true; 4014 bdev->internal.temporary_queue_depth = 0; 4015 spdk_for_each_channel(__bdev_to_io_dev(bdev), _calculate_measured_qd, bdev, 4016 _calculate_measured_qd_cpl); 4017 return SPDK_POLLER_BUSY; 4018 } 4019 4020 static void 4021 bdev_update_qd_sampling_period(void *ctx) 4022 { 4023 struct spdk_bdev *bdev = ctx; 4024 4025 if (bdev->internal.period == bdev->internal.new_period) { 4026 return; 4027 } 4028 4029 if (bdev->internal.qd_poll_in_progress) { 4030 return; 4031 } 4032 4033 bdev->internal.period = bdev->internal.new_period; 4034 4035 spdk_poller_unregister(&bdev->internal.qd_poller); 4036 if (bdev->internal.period != 0) { 4037 bdev->internal.qd_poller = 
SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4038 bdev, bdev->internal.period); 4039 } else { 4040 spdk_bdev_close(bdev->internal.qd_desc); 4041 bdev->internal.qd_desc = NULL; 4042 } 4043 } 4044 4045 static void 4046 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4047 { 4048 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4049 } 4050 4051 void 4052 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4053 { 4054 int rc; 4055 4056 if (bdev->internal.new_period == period) { 4057 return; 4058 } 4059 4060 bdev->internal.new_period = period; 4061 4062 if (bdev->internal.qd_desc != NULL) { 4063 assert(bdev->internal.period != 0); 4064 4065 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4066 bdev_update_qd_sampling_period, bdev); 4067 return; 4068 } 4069 4070 assert(bdev->internal.period == 0); 4071 4072 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4073 NULL, &bdev->internal.qd_desc); 4074 if (rc != 0) { 4075 return; 4076 } 4077 4078 bdev->internal.period = period; 4079 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4080 bdev, period); 4081 } 4082 4083 struct bdev_get_current_qd_ctx { 4084 uint64_t current_qd; 4085 spdk_bdev_get_current_qd_cb cb_fn; 4086 void *cb_arg; 4087 }; 4088 4089 static void 4090 bdev_get_current_qd_done(struct spdk_io_channel_iter *i, int status) 4091 { 4092 struct bdev_get_current_qd_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4093 void *io_dev = spdk_io_channel_iter_get_io_device(i); 4094 4095 ctx->cb_fn(__bdev_from_io_dev(io_dev), ctx->current_qd, ctx->cb_arg, 0); 4096 4097 free(ctx); 4098 } 4099 4100 static void 4101 bdev_get_current_qd(struct spdk_io_channel_iter *i) 4102 { 4103 struct bdev_get_current_qd_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4104 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 4105 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch); 4106 4107 ctx->current_qd += bdev_ch->io_outstanding; 4108 4109 spdk_for_each_channel_continue(i, 0); 4110 } 4111 4112 void 4113 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 4114 void *cb_arg) 4115 { 4116 struct bdev_get_current_qd_ctx *ctx; 4117 4118 assert(cb_fn != NULL); 4119 4120 ctx = calloc(1, sizeof(*ctx)); 4121 if (ctx == NULL) { 4122 cb_fn(bdev, 0, cb_arg, -ENOMEM); 4123 return; 4124 } 4125 4126 ctx->cb_fn = cb_fn; 4127 ctx->cb_arg = cb_arg; 4128 4129 spdk_for_each_channel(__bdev_to_io_dev(bdev), 4130 bdev_get_current_qd, 4131 ctx, 4132 bdev_get_current_qd_done); 4133 } 4134 4135 static void 4136 _resize_notify(void *arg) 4137 { 4138 struct spdk_bdev_desc *desc = arg; 4139 4140 pthread_mutex_lock(&desc->mutex); 4141 desc->refs--; 4142 if (!desc->closed) { 4143 pthread_mutex_unlock(&desc->mutex); 4144 desc->callback.event_fn(SPDK_BDEV_EVENT_RESIZE, 4145 desc->bdev, 4146 desc->callback.ctx); 4147 return; 4148 } else if (0 == desc->refs) { 4149 /* This descriptor was closed after this resize_notify message was sent. 4150 * spdk_bdev_close() could not free the descriptor since this message was 4151 * in flight, so we free it now using bdev_desc_free(). 
4152 */ 4153 pthread_mutex_unlock(&desc->mutex); 4154 bdev_desc_free(desc); 4155 return; 4156 } 4157 pthread_mutex_unlock(&desc->mutex); 4158 } 4159 4160 int 4161 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 4162 { 4163 struct spdk_bdev_desc *desc; 4164 int ret; 4165 4166 if (size == bdev->blockcnt) { 4167 return 0; 4168 } 4169 4170 pthread_mutex_lock(&bdev->internal.mutex); 4171 4172 /* bdev has open descriptors */ 4173 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 4174 bdev->blockcnt > size) { 4175 ret = -EBUSY; 4176 } else { 4177 bdev->blockcnt = size; 4178 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 4179 pthread_mutex_lock(&desc->mutex); 4180 if (!desc->closed) { 4181 desc->refs++; 4182 spdk_thread_send_msg(desc->thread, _resize_notify, desc); 4183 } 4184 pthread_mutex_unlock(&desc->mutex); 4185 } 4186 ret = 0; 4187 } 4188 4189 pthread_mutex_unlock(&bdev->internal.mutex); 4190 4191 return ret; 4192 } 4193 4194 /* 4195 * Convert I/O offset and length from bytes to blocks. 4196 * 4197 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 4198 */ 4199 static uint64_t 4200 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 4201 uint64_t num_bytes, uint64_t *num_blocks) 4202 { 4203 uint32_t block_size = bdev->blocklen; 4204 uint8_t shift_cnt; 4205 4206 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 4207 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 4208 shift_cnt = spdk_u32log2(block_size); 4209 *offset_blocks = offset_bytes >> shift_cnt; 4210 *num_blocks = num_bytes >> shift_cnt; 4211 return (offset_bytes - (*offset_blocks << shift_cnt)) | 4212 (num_bytes - (*num_blocks << shift_cnt)); 4213 } else { 4214 *offset_blocks = offset_bytes / block_size; 4215 *num_blocks = num_bytes / block_size; 4216 return (offset_bytes % block_size) | (num_bytes % block_size); 4217 } 4218 } 4219 4220 static bool 4221 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 4222 { 4223 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 4224 * has been an overflow and hence the offset has been wrapped around */ 4225 if (offset_blocks + num_blocks < offset_blocks) { 4226 return false; 4227 } 4228 4229 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 4230 if (offset_blocks + num_blocks > bdev->blockcnt) { 4231 return false; 4232 } 4233 4234 return true; 4235 } 4236 4237 static void 4238 bdev_seek_complete_cb(void *ctx) 4239 { 4240 struct spdk_bdev_io *bdev_io = ctx; 4241 4242 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4243 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 4244 } 4245 4246 static int 4247 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4248 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 4249 spdk_bdev_io_completion_cb cb, void *cb_arg) 4250 { 4251 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4252 struct spdk_bdev_io *bdev_io; 4253 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4254 4255 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 4256 4257 /* Check if offset_blocks is valid looking at the validity of one block */ 4258 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 4259 return -EINVAL; 4260 } 4261 4262 bdev_io = bdev_channel_get_io(channel); 4263 if (!bdev_io) { 4264 return -ENOMEM; 4265 } 
4266 4267 bdev_io->internal.ch = channel; 4268 bdev_io->internal.desc = desc; 4269 bdev_io->type = io_type; 4270 bdev_io->u.bdev.offset_blocks = offset_blocks; 4271 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4272 4273 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 4274 /* In case bdev doesn't support seek to next data/hole offset, 4275 * it is assumed that only data and no holes are present */ 4276 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 4277 bdev_io->u.bdev.seek.offset = offset_blocks; 4278 } else { 4279 bdev_io->u.bdev.seek.offset = UINT64_MAX; 4280 } 4281 4282 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 4283 return 0; 4284 } 4285 4286 bdev_io_submit(bdev_io); 4287 return 0; 4288 } 4289 4290 int 4291 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4292 uint64_t offset_blocks, 4293 spdk_bdev_io_completion_cb cb, void *cb_arg) 4294 { 4295 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 4296 } 4297 4298 int 4299 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4300 uint64_t offset_blocks, 4301 spdk_bdev_io_completion_cb cb, void *cb_arg) 4302 { 4303 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 4304 } 4305 4306 uint64_t 4307 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io) 4308 { 4309 return bdev_io->u.bdev.seek.offset; 4310 } 4311 4312 static int 4313 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 4314 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4315 spdk_bdev_io_completion_cb cb, void *cb_arg) 4316 { 4317 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4318 struct spdk_bdev_io *bdev_io; 4319 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4320 4321 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4322 return -EINVAL; 4323 } 4324 4325 bdev_io = bdev_channel_get_io(channel); 4326 if (!bdev_io) { 4327 return -ENOMEM; 4328 } 4329 4330 bdev_io->internal.ch = channel; 4331 bdev_io->internal.desc = desc; 4332 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4333 bdev_io->u.bdev.iovs = &bdev_io->iov; 4334 bdev_io->u.bdev.iovs[0].iov_base = buf; 4335 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4336 bdev_io->u.bdev.iovcnt = 1; 4337 bdev_io->u.bdev.md_buf = md_buf; 4338 bdev_io->u.bdev.num_blocks = num_blocks; 4339 bdev_io->u.bdev.offset_blocks = offset_blocks; 4340 bdev_io->u.bdev.ext_opts = NULL; 4341 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4342 4343 bdev_io_submit(bdev_io); 4344 return 0; 4345 } 4346 4347 int 4348 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4349 void *buf, uint64_t offset, uint64_t nbytes, 4350 spdk_bdev_io_completion_cb cb, void *cb_arg) 4351 { 4352 uint64_t offset_blocks, num_blocks; 4353 4354 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4355 nbytes, &num_blocks) != 0) { 4356 return -EINVAL; 4357 } 4358 4359 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4360 } 4361 4362 int 4363 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4364 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4365 spdk_bdev_io_completion_cb cb, void *cb_arg) 4366 { 4367 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 4368 } 4369 4370 int 4371 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4372 void 
*buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4373 spdk_bdev_io_completion_cb cb, void *cb_arg) 4374 { 4375 struct iovec iov = { 4376 .iov_base = buf, 4377 }; 4378 4379 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4380 return -EINVAL; 4381 } 4382 4383 if (md_buf && !_is_buf_allocated(&iov)) { 4384 return -EINVAL; 4385 } 4386 4387 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4388 cb, cb_arg); 4389 } 4390 4391 int 4392 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4393 struct iovec *iov, int iovcnt, 4394 uint64_t offset, uint64_t nbytes, 4395 spdk_bdev_io_completion_cb cb, void *cb_arg) 4396 { 4397 uint64_t offset_blocks, num_blocks; 4398 4399 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4400 nbytes, &num_blocks) != 0) { 4401 return -EINVAL; 4402 } 4403 4404 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4405 } 4406 4407 static int 4408 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4409 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 4410 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 4411 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 4412 { 4413 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4414 struct spdk_bdev_io *bdev_io; 4415 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4416 4417 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4418 return -EINVAL; 4419 } 4420 4421 bdev_io = bdev_channel_get_io(channel); 4422 if (!bdev_io) { 4423 return -ENOMEM; 4424 } 4425 4426 bdev_io->internal.ch = channel; 4427 bdev_io->internal.desc = desc; 4428 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4429 bdev_io->u.bdev.iovs = iov; 4430 bdev_io->u.bdev.iovcnt = iovcnt; 4431 bdev_io->u.bdev.md_buf = md_buf; 4432 bdev_io->u.bdev.num_blocks = num_blocks; 4433 bdev_io->u.bdev.offset_blocks = offset_blocks; 4434 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4435 bdev_io->internal.ext_opts = opts; 4436 bdev_io->u.bdev.ext_opts = opts; 4437 4438 _bdev_io_submit_ext(desc, bdev_io, opts, copy_opts); 4439 4440 return 0; 4441 } 4442 4443 int 4444 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4445 struct iovec *iov, int iovcnt, 4446 uint64_t offset_blocks, uint64_t num_blocks, 4447 spdk_bdev_io_completion_cb cb, void *cb_arg) 4448 { 4449 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4450 num_blocks, cb, cb_arg, NULL, false); 4451 } 4452 4453 int 4454 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4455 struct iovec *iov, int iovcnt, void *md_buf, 4456 uint64_t offset_blocks, uint64_t num_blocks, 4457 spdk_bdev_io_completion_cb cb, void *cb_arg) 4458 { 4459 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4460 return -EINVAL; 4461 } 4462 4463 if (md_buf && !_is_buf_allocated(iov)) { 4464 return -EINVAL; 4465 } 4466 4467 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4468 num_blocks, cb, cb_arg, NULL, false); 4469 } 4470 4471 static inline bool 4472 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 4473 { 4474 /* 4475 * We check that opts->size is at least the size the structure had when 4476 * spdk_bdev_ext_io_opts was first introduced (ac6f2bdd8d), since access to 4477 * those members is not checked anywhere else internally.
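 *
 * For example (an illustrative sketch, not code from this file; all variables are
 * placeholders), a caller built against the original definition, which ended at
 * 'metadata', passes that smaller size:
 *
 *	struct spdk_bdev_ext_io_opts opts = {};
 *
 *	opts.size = sizeof(opts);	- with the old header this covers up to 'metadata'
 *	opts.metadata = md_buf;
 *	spdk_bdev_readv_blocks_ext(desc, ch, iov, iovcnt, offset_blocks, num_blocks,
 *				   cb, cb_arg, &opts);
 *
 * Members added after 'metadata' are then never read for such a caller, while an
 * opts->size larger than the current structure is rejected as invalid.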
4478 */ 4479 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 4480 sizeof(opts->metadata) && 4481 opts->size <= sizeof(*opts) && 4482 /* When memory domain is used, the user must provide data buffers */ 4483 (!opts->memory_domain || (iov && iov[0].iov_base)); 4484 } 4485 4486 int 4487 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4488 struct iovec *iov, int iovcnt, 4489 uint64_t offset_blocks, uint64_t num_blocks, 4490 spdk_bdev_io_completion_cb cb, void *cb_arg, 4491 struct spdk_bdev_ext_io_opts *opts) 4492 { 4493 void *md = NULL; 4494 4495 if (opts) { 4496 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 4497 return -EINVAL; 4498 } 4499 md = opts->metadata; 4500 } 4501 4502 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4503 return -EINVAL; 4504 } 4505 4506 if (md && !_is_buf_allocated(iov)) { 4507 return -EINVAL; 4508 } 4509 4510 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4511 num_blocks, cb, cb_arg, opts, false); 4512 } 4513 4514 static int 4515 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4516 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4517 spdk_bdev_io_completion_cb cb, void *cb_arg) 4518 { 4519 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4520 struct spdk_bdev_io *bdev_io; 4521 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4522 4523 if (!desc->write) { 4524 return -EBADF; 4525 } 4526 4527 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4528 return -EINVAL; 4529 } 4530 4531 bdev_io = bdev_channel_get_io(channel); 4532 if (!bdev_io) { 4533 return -ENOMEM; 4534 } 4535 4536 bdev_io->internal.ch = channel; 4537 bdev_io->internal.desc = desc; 4538 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4539 bdev_io->u.bdev.iovs = &bdev_io->iov; 4540 bdev_io->u.bdev.iovs[0].iov_base = buf; 4541 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4542 bdev_io->u.bdev.iovcnt = 1; 4543 bdev_io->u.bdev.md_buf = md_buf; 4544 bdev_io->u.bdev.num_blocks = num_blocks; 4545 bdev_io->u.bdev.offset_blocks = offset_blocks; 4546 bdev_io->u.bdev.ext_opts = NULL; 4547 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4548 4549 bdev_io_submit(bdev_io); 4550 return 0; 4551 } 4552 4553 int 4554 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4555 void *buf, uint64_t offset, uint64_t nbytes, 4556 spdk_bdev_io_completion_cb cb, void *cb_arg) 4557 { 4558 uint64_t offset_blocks, num_blocks; 4559 4560 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4561 nbytes, &num_blocks) != 0) { 4562 return -EINVAL; 4563 } 4564 4565 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4566 } 4567 4568 int 4569 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4570 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4571 spdk_bdev_io_completion_cb cb, void *cb_arg) 4572 { 4573 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4574 cb, cb_arg); 4575 } 4576 4577 int 4578 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4579 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4580 spdk_bdev_io_completion_cb cb, void *cb_arg) 4581 { 4582 struct iovec iov = { 4583 .iov_base = buf, 4584 }; 4585 4586 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4587 return -EINVAL; 4588 } 4589 4590 if 
(md_buf && !_is_buf_allocated(&iov)) { 4591 return -EINVAL; 4592 } 4593 4594 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4595 cb, cb_arg); 4596 } 4597 4598 static int 4599 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4600 struct iovec *iov, int iovcnt, void *md_buf, 4601 uint64_t offset_blocks, uint64_t num_blocks, 4602 spdk_bdev_io_completion_cb cb, void *cb_arg, 4603 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 4604 { 4605 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4606 struct spdk_bdev_io *bdev_io; 4607 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4608 4609 if (!desc->write) { 4610 return -EBADF; 4611 } 4612 4613 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4614 return -EINVAL; 4615 } 4616 4617 bdev_io = bdev_channel_get_io(channel); 4618 if (!bdev_io) { 4619 return -ENOMEM; 4620 } 4621 4622 bdev_io->internal.ch = channel; 4623 bdev_io->internal.desc = desc; 4624 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4625 bdev_io->u.bdev.iovs = iov; 4626 bdev_io->u.bdev.iovcnt = iovcnt; 4627 bdev_io->u.bdev.md_buf = md_buf; 4628 bdev_io->u.bdev.num_blocks = num_blocks; 4629 bdev_io->u.bdev.offset_blocks = offset_blocks; 4630 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4631 bdev_io->internal.ext_opts = opts; 4632 bdev_io->u.bdev.ext_opts = opts; 4633 4634 _bdev_io_submit_ext(desc, bdev_io, opts, copy_opts); 4635 4636 return 0; 4637 } 4638 4639 int 4640 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4641 struct iovec *iov, int iovcnt, 4642 uint64_t offset, uint64_t len, 4643 spdk_bdev_io_completion_cb cb, void *cb_arg) 4644 { 4645 uint64_t offset_blocks, num_blocks; 4646 4647 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4648 len, &num_blocks) != 0) { 4649 return -EINVAL; 4650 } 4651 4652 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4653 } 4654 4655 int 4656 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4657 struct iovec *iov, int iovcnt, 4658 uint64_t offset_blocks, uint64_t num_blocks, 4659 spdk_bdev_io_completion_cb cb, void *cb_arg) 4660 { 4661 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4662 num_blocks, cb, cb_arg, NULL, false); 4663 } 4664 4665 int 4666 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4667 struct iovec *iov, int iovcnt, void *md_buf, 4668 uint64_t offset_blocks, uint64_t num_blocks, 4669 spdk_bdev_io_completion_cb cb, void *cb_arg) 4670 { 4671 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4672 return -EINVAL; 4673 } 4674 4675 if (md_buf && !_is_buf_allocated(iov)) { 4676 return -EINVAL; 4677 } 4678 4679 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4680 num_blocks, cb, cb_arg, NULL, false); 4681 } 4682 4683 int 4684 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4685 struct iovec *iov, int iovcnt, 4686 uint64_t offset_blocks, uint64_t num_blocks, 4687 spdk_bdev_io_completion_cb cb, void *cb_arg, 4688 struct spdk_bdev_ext_io_opts *opts) 4689 { 4690 void *md = NULL; 4691 4692 if (opts) { 4693 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 4694 return -EINVAL; 4695 } 4696 md = opts->metadata; 4697 } 4698 4699 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4700 return -EINVAL; 4701 } 4702 4703 if (md && 
!_is_buf_allocated(iov)) { 4704 return -EINVAL; 4705 } 4706 4707 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4708 num_blocks, cb, cb_arg, opts, false); 4709 } 4710 4711 static void 4712 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4713 { 4714 struct spdk_bdev_io *parent_io = cb_arg; 4715 struct spdk_bdev *bdev = parent_io->bdev; 4716 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 4717 int i, rc = 0; 4718 4719 if (!success) { 4720 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4721 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 4722 spdk_bdev_free_io(bdev_io); 4723 return; 4724 } 4725 4726 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 4727 rc = memcmp(read_buf, 4728 parent_io->u.bdev.iovs[i].iov_base, 4729 parent_io->u.bdev.iovs[i].iov_len); 4730 if (rc) { 4731 break; 4732 } 4733 read_buf += parent_io->u.bdev.iovs[i].iov_len; 4734 } 4735 4736 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 4737 rc = memcmp(bdev_io->u.bdev.md_buf, 4738 parent_io->u.bdev.md_buf, 4739 spdk_bdev_get_md_size(bdev)); 4740 } 4741 4742 spdk_bdev_free_io(bdev_io); 4743 4744 if (rc == 0) { 4745 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4746 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 4747 } else { 4748 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 4749 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 4750 } 4751 } 4752 4753 static void 4754 bdev_compare_do_read(void *_bdev_io) 4755 { 4756 struct spdk_bdev_io *bdev_io = _bdev_io; 4757 int rc; 4758 4759 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 4760 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 4761 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4762 bdev_compare_do_read_done, bdev_io); 4763 4764 if (rc == -ENOMEM) { 4765 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 4766 } else if (rc != 0) { 4767 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4768 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 4769 } 4770 } 4771 4772 static int 4773 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4774 struct iovec *iov, int iovcnt, void *md_buf, 4775 uint64_t offset_blocks, uint64_t num_blocks, 4776 spdk_bdev_io_completion_cb cb, void *cb_arg) 4777 { 4778 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4779 struct spdk_bdev_io *bdev_io; 4780 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4781 4782 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4783 return -EINVAL; 4784 } 4785 4786 bdev_io = bdev_channel_get_io(channel); 4787 if (!bdev_io) { 4788 return -ENOMEM; 4789 } 4790 4791 bdev_io->internal.ch = channel; 4792 bdev_io->internal.desc = desc; 4793 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 4794 bdev_io->u.bdev.iovs = iov; 4795 bdev_io->u.bdev.iovcnt = iovcnt; 4796 bdev_io->u.bdev.md_buf = md_buf; 4797 bdev_io->u.bdev.num_blocks = num_blocks; 4798 bdev_io->u.bdev.offset_blocks = offset_blocks; 4799 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4800 bdev_io->u.bdev.ext_opts = NULL; 4801 4802 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 4803 bdev_io_submit(bdev_io); 4804 return 0; 4805 } 4806 4807 bdev_compare_do_read(bdev_io); 4808 4809 return 0; 4810 } 4811 4812 int 4813 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4814 struct iovec *iov, int iovcnt, 4815 
uint64_t offset_blocks, uint64_t num_blocks, 4816 spdk_bdev_io_completion_cb cb, void *cb_arg) 4817 { 4818 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4819 num_blocks, cb, cb_arg); 4820 } 4821 4822 int 4823 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4824 struct iovec *iov, int iovcnt, void *md_buf, 4825 uint64_t offset_blocks, uint64_t num_blocks, 4826 spdk_bdev_io_completion_cb cb, void *cb_arg) 4827 { 4828 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4829 return -EINVAL; 4830 } 4831 4832 if (md_buf && !_is_buf_allocated(iov)) { 4833 return -EINVAL; 4834 } 4835 4836 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4837 num_blocks, cb, cb_arg); 4838 } 4839 4840 static int 4841 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4842 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4843 spdk_bdev_io_completion_cb cb, void *cb_arg) 4844 { 4845 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4846 struct spdk_bdev_io *bdev_io; 4847 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4848 4849 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4850 return -EINVAL; 4851 } 4852 4853 bdev_io = bdev_channel_get_io(channel); 4854 if (!bdev_io) { 4855 return -ENOMEM; 4856 } 4857 4858 bdev_io->internal.ch = channel; 4859 bdev_io->internal.desc = desc; 4860 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 4861 bdev_io->u.bdev.iovs = &bdev_io->iov; 4862 bdev_io->u.bdev.iovs[0].iov_base = buf; 4863 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4864 bdev_io->u.bdev.iovcnt = 1; 4865 bdev_io->u.bdev.md_buf = md_buf; 4866 bdev_io->u.bdev.num_blocks = num_blocks; 4867 bdev_io->u.bdev.offset_blocks = offset_blocks; 4868 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4869 bdev_io->u.bdev.ext_opts = NULL; 4870 4871 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 4872 bdev_io_submit(bdev_io); 4873 return 0; 4874 } 4875 4876 bdev_compare_do_read(bdev_io); 4877 4878 return 0; 4879 } 4880 4881 int 4882 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4883 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4884 spdk_bdev_io_completion_cb cb, void *cb_arg) 4885 { 4886 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4887 cb, cb_arg); 4888 } 4889 4890 int 4891 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4892 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4893 spdk_bdev_io_completion_cb cb, void *cb_arg) 4894 { 4895 struct iovec iov = { 4896 .iov_base = buf, 4897 }; 4898 4899 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4900 return -EINVAL; 4901 } 4902 4903 if (md_buf && !_is_buf_allocated(&iov)) { 4904 return -EINVAL; 4905 } 4906 4907 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4908 cb, cb_arg); 4909 } 4910 4911 static void 4912 bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) 4913 { 4914 struct spdk_bdev_io *bdev_io = ctx; 4915 4916 if (unlock_status) { 4917 SPDK_ERRLOG("LBA range unlock failed\n"); 4918 } 4919 4920 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? 
true : 4921 false, bdev_io->internal.caller_ctx); 4922 } 4923 4924 static void 4925 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 4926 { 4927 bdev_io->internal.status = status; 4928 4929 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 4930 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4931 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 4932 } 4933 4934 static void 4935 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4936 { 4937 struct spdk_bdev_io *parent_io = cb_arg; 4938 4939 if (!success) { 4940 SPDK_ERRLOG("Compare and write operation failed\n"); 4941 } 4942 4943 spdk_bdev_free_io(bdev_io); 4944 4945 bdev_comparev_and_writev_blocks_unlock(parent_io, 4946 success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 4947 } 4948 4949 static void 4950 bdev_compare_and_write_do_write(void *_bdev_io) 4951 { 4952 struct spdk_bdev_io *bdev_io = _bdev_io; 4953 int rc; 4954 4955 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 4956 spdk_io_channel_from_ctx(bdev_io->internal.ch), 4957 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 4958 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4959 bdev_compare_and_write_do_write_done, bdev_io); 4960 4961 4962 if (rc == -ENOMEM) { 4963 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 4964 } else if (rc != 0) { 4965 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 4966 } 4967 } 4968 4969 static void 4970 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4971 { 4972 struct spdk_bdev_io *parent_io = cb_arg; 4973 4974 spdk_bdev_free_io(bdev_io); 4975 4976 if (!success) { 4977 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 4978 return; 4979 } 4980 4981 bdev_compare_and_write_do_write(parent_io); 4982 } 4983 4984 static void 4985 bdev_compare_and_write_do_compare(void *_bdev_io) 4986 { 4987 struct spdk_bdev_io *bdev_io = _bdev_io; 4988 int rc; 4989 4990 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 4991 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 4992 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4993 bdev_compare_and_write_do_compare_done, bdev_io); 4994 4995 if (rc == -ENOMEM) { 4996 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 4997 } else if (rc != 0) { 4998 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 4999 } 5000 } 5001 5002 static void 5003 bdev_comparev_and_writev_blocks_locked(void *ctx, int status) 5004 { 5005 struct spdk_bdev_io *bdev_io = ctx; 5006 5007 if (status) { 5008 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 5009 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5010 return; 5011 } 5012 5013 bdev_compare_and_write_do_compare(bdev_io); 5014 } 5015 5016 int 5017 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5018 struct iovec *compare_iov, int compare_iovcnt, 5019 struct iovec *write_iov, int write_iovcnt, 5020 uint64_t offset_blocks, uint64_t num_blocks, 5021 spdk_bdev_io_completion_cb cb, void *cb_arg) 5022 { 5023 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5024 struct spdk_bdev_io *bdev_io; 5025 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5026 5027 if (!desc->write) { 5028 return 
-EBADF; 5029 } 5030 5031 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5032 return -EINVAL; 5033 } 5034 5035 if (num_blocks > bdev->acwu) { 5036 return -EINVAL; 5037 } 5038 5039 bdev_io = bdev_channel_get_io(channel); 5040 if (!bdev_io) { 5041 return -ENOMEM; 5042 } 5043 5044 bdev_io->internal.ch = channel; 5045 bdev_io->internal.desc = desc; 5046 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 5047 bdev_io->u.bdev.iovs = compare_iov; 5048 bdev_io->u.bdev.iovcnt = compare_iovcnt; 5049 bdev_io->u.bdev.fused_iovs = write_iov; 5050 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 5051 bdev_io->u.bdev.md_buf = NULL; 5052 bdev_io->u.bdev.num_blocks = num_blocks; 5053 bdev_io->u.bdev.offset_blocks = offset_blocks; 5054 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5055 bdev_io->u.bdev.ext_opts = NULL; 5056 5057 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 5058 bdev_io_submit(bdev_io); 5059 return 0; 5060 } 5061 5062 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 5063 bdev_comparev_and_writev_blocks_locked, bdev_io); 5064 } 5065 5066 int 5067 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5068 struct iovec *iov, int iovcnt, 5069 uint64_t offset_blocks, uint64_t num_blocks, 5070 bool populate, 5071 spdk_bdev_io_completion_cb cb, void *cb_arg) 5072 { 5073 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5074 struct spdk_bdev_io *bdev_io; 5075 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5076 5077 if (!desc->write) { 5078 return -EBADF; 5079 } 5080 5081 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5082 return -EINVAL; 5083 } 5084 5085 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 5086 return -ENOTSUP; 5087 } 5088 5089 bdev_io = bdev_channel_get_io(channel); 5090 if (!bdev_io) { 5091 return -ENOMEM; 5092 } 5093 5094 bdev_io->internal.ch = channel; 5095 bdev_io->internal.desc = desc; 5096 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 5097 bdev_io->u.bdev.num_blocks = num_blocks; 5098 bdev_io->u.bdev.offset_blocks = offset_blocks; 5099 bdev_io->u.bdev.iovs = iov; 5100 bdev_io->u.bdev.iovcnt = iovcnt; 5101 bdev_io->u.bdev.md_buf = NULL; 5102 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 5103 bdev_io->u.bdev.zcopy.commit = 0; 5104 bdev_io->u.bdev.zcopy.start = 1; 5105 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5106 bdev_io->u.bdev.ext_opts = NULL; 5107 5108 bdev_io_submit(bdev_io); 5109 5110 return 0; 5111 } 5112 5113 int 5114 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 5115 spdk_bdev_io_completion_cb cb, void *cb_arg) 5116 { 5117 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 5118 return -EINVAL; 5119 } 5120 5121 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 5122 bdev_io->u.bdev.zcopy.start = 0; 5123 bdev_io->internal.caller_ctx = cb_arg; 5124 bdev_io->internal.cb = cb; 5125 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 5126 5127 bdev_io_submit(bdev_io); 5128 5129 return 0; 5130 } 5131 5132 int 5133 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5134 uint64_t offset, uint64_t len, 5135 spdk_bdev_io_completion_cb cb, void *cb_arg) 5136 { 5137 uint64_t offset_blocks, num_blocks; 5138 5139 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5140 len, &num_blocks) != 0) { 5141 return -EINVAL; 5142 } 5143 5144 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5145 } 5146 5147 int 5148 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5149 uint64_t offset_blocks, uint64_t num_blocks, 5150 spdk_bdev_io_completion_cb cb, void *cb_arg) 5151 { 5152 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5153 struct spdk_bdev_io *bdev_io; 5154 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5155 5156 if (!desc->write) { 5157 return -EBADF; 5158 } 5159 5160 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5161 return -EINVAL; 5162 } 5163 5164 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 5165 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 5166 return -ENOTSUP; 5167 } 5168 5169 bdev_io = bdev_channel_get_io(channel); 5170 5171 if (!bdev_io) { 5172 return -ENOMEM; 5173 } 5174 5175 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 5176 bdev_io->internal.ch = channel; 5177 bdev_io->internal.desc = desc; 5178 bdev_io->u.bdev.offset_blocks = offset_blocks; 5179 bdev_io->u.bdev.num_blocks = num_blocks; 5180 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5181 bdev_io->u.bdev.ext_opts = NULL; 5182 5183 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 5184 bdev_io_submit(bdev_io); 5185 return 0; 5186 } 5187 5188 assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); 5189 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 5190 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 5191 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 5192 bdev_write_zero_buffer_next(bdev_io); 5193 5194 return 0; 5195 } 5196 5197 int 5198 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5199 uint64_t offset, uint64_t nbytes, 5200 spdk_bdev_io_completion_cb cb, void *cb_arg) 5201 { 5202 uint64_t offset_blocks, num_blocks; 5203 5204 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5205 nbytes, &num_blocks) != 0) { 5206 return -EINVAL; 5207 } 5208 5209 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5210 } 5211 5212 int 5213 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5214 uint64_t offset_blocks, uint64_t num_blocks, 5215 spdk_bdev_io_completion_cb cb, void *cb_arg) 5216 { 5217 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5218 struct spdk_bdev_io *bdev_io; 5219 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5220 5221 if (!desc->write) { 5222 return -EBADF; 5223 } 5224 5225 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5226 return -EINVAL; 5227 } 5228 5229 if (num_blocks == 0) { 5230 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 5231 return -EINVAL; 5232 } 5233 5234 bdev_io = bdev_channel_get_io(channel); 5235 if (!bdev_io) { 5236 return -ENOMEM; 5237 } 5238 5239 
bdev_io->internal.ch = channel; 5240 bdev_io->internal.desc = desc; 5241 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 5242 5243 bdev_io->u.bdev.iovs = &bdev_io->iov; 5244 bdev_io->u.bdev.iovs[0].iov_base = NULL; 5245 bdev_io->u.bdev.iovs[0].iov_len = 0; 5246 bdev_io->u.bdev.iovcnt = 1; 5247 5248 bdev_io->u.bdev.offset_blocks = offset_blocks; 5249 bdev_io->u.bdev.num_blocks = num_blocks; 5250 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5251 bdev_io->u.bdev.ext_opts = NULL; 5252 5253 bdev_io_submit(bdev_io); 5254 return 0; 5255 } 5256 5257 int 5258 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5259 uint64_t offset, uint64_t length, 5260 spdk_bdev_io_completion_cb cb, void *cb_arg) 5261 { 5262 uint64_t offset_blocks, num_blocks; 5263 5264 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5265 length, &num_blocks) != 0) { 5266 return -EINVAL; 5267 } 5268 5269 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5270 } 5271 5272 int 5273 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5274 uint64_t offset_blocks, uint64_t num_blocks, 5275 spdk_bdev_io_completion_cb cb, void *cb_arg) 5276 { 5277 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5278 struct spdk_bdev_io *bdev_io; 5279 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5280 5281 if (!desc->write) { 5282 return -EBADF; 5283 } 5284 5285 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5286 return -EINVAL; 5287 } 5288 5289 bdev_io = bdev_channel_get_io(channel); 5290 if (!bdev_io) { 5291 return -ENOMEM; 5292 } 5293 5294 bdev_io->internal.ch = channel; 5295 bdev_io->internal.desc = desc; 5296 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 5297 bdev_io->u.bdev.iovs = NULL; 5298 bdev_io->u.bdev.iovcnt = 0; 5299 bdev_io->u.bdev.offset_blocks = offset_blocks; 5300 bdev_io->u.bdev.num_blocks = num_blocks; 5301 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5302 5303 bdev_io_submit(bdev_io); 5304 return 0; 5305 } 5306 5307 static int bdev_reset_poll_for_outstanding_io(void *ctx); 5308 5309 static void 5310 bdev_reset_check_outstanding_io_done(struct spdk_io_channel_iter *i, int status) 5311 { 5312 struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i); 5313 struct spdk_bdev_io *bdev_io; 5314 5315 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5316 5317 if (status == -EBUSY) { 5318 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 5319 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 5320 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 5321 } else { 5322 /* If outstanding IOs are still present and reset_io_drain_timeout seconds passed, 5323 * start the reset. */ 5324 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5325 bdev_io_submit_reset(bdev_io); 5326 } 5327 } else { 5328 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5329 SPDK_DEBUGLOG(bdev, 5330 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 5331 ch->bdev->name); 5332 /* Mark the completion status as a SUCCESS and complete the reset. 
*/ 5333 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 5334 } 5335 } 5336 5337 static void 5338 bdev_reset_check_outstanding_io(struct spdk_io_channel_iter *i) 5339 { 5340 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 5341 struct spdk_bdev_channel *cur_ch = spdk_io_channel_get_ctx(io_ch); 5342 int status = 0; 5343 5344 if (cur_ch->io_outstanding > 0) { 5345 /* If a channel has outstanding IO, set status to -EBUSY code. This will stop 5346 * further iteration over the rest of the channels and pass non-zero status 5347 * to the callback function. */ 5348 status = -EBUSY; 5349 } 5350 spdk_for_each_channel_continue(i, status); 5351 } 5352 5353 static int 5354 bdev_reset_poll_for_outstanding_io(void *ctx) 5355 { 5356 struct spdk_bdev_channel *ch = ctx; 5357 struct spdk_bdev_io *bdev_io; 5358 5359 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5360 5361 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 5362 spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_check_outstanding_io, 5363 ch, bdev_reset_check_outstanding_io_done); 5364 5365 return SPDK_POLLER_BUSY; 5366 } 5367 5368 static void 5369 bdev_reset_freeze_channel_done(struct spdk_io_channel_iter *i, int status) 5370 { 5371 struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i); 5372 struct spdk_bdev *bdev = ch->bdev; 5373 struct spdk_bdev_io *bdev_io; 5374 5375 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5376 5377 if (bdev->reset_io_drain_timeout == 0) { 5378 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5379 5380 bdev_io_submit_reset(bdev_io); 5381 return; 5382 } 5383 5384 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 5385 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 5386 5387 /* In case bdev->reset_io_drain_timeout is not equal to zero, 5388 * submit the reset to the underlying module only if outstanding I/O 5389 * remain after reset_io_drain_timeout seconds have passed. */ 5390 spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_check_outstanding_io, 5391 ch, bdev_reset_check_outstanding_io_done); 5392 } 5393 5394 static void 5395 bdev_reset_freeze_channel(struct spdk_io_channel_iter *i) 5396 { 5397 struct spdk_io_channel *ch; 5398 struct spdk_bdev_channel *channel; 5399 struct spdk_bdev_mgmt_channel *mgmt_channel; 5400 struct spdk_bdev_shared_resource *shared_resource; 5401 bdev_io_tailq_t tmp_queued; 5402 5403 TAILQ_INIT(&tmp_queued); 5404 5405 ch = spdk_io_channel_iter_get_channel(i); 5406 channel = spdk_io_channel_get_ctx(ch); 5407 shared_resource = channel->shared_resource; 5408 mgmt_channel = shared_resource->mgmt_ch; 5409 5410 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 5411 5412 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 5413 /* The QoS object is always valid and readable while 5414 * the channel flag is set, so the lock here should not 5415 * be necessary. We're not in the fast path though, so 5416 * just take it anyway. 
*/ 5417 pthread_mutex_lock(&channel->bdev->internal.mutex); 5418 if (channel->bdev->internal.qos->ch == channel) { 5419 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 5420 } 5421 pthread_mutex_unlock(&channel->bdev->internal.mutex); 5422 } 5423 5424 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 5425 bdev_abort_all_buf_io(&mgmt_channel->need_buf_small, channel); 5426 bdev_abort_all_buf_io(&mgmt_channel->need_buf_large, channel); 5427 bdev_abort_all_queued_io(&tmp_queued, channel); 5428 5429 spdk_for_each_channel_continue(i, 0); 5430 } 5431 5432 static void 5433 bdev_start_reset(void *ctx) 5434 { 5435 struct spdk_bdev_channel *ch = ctx; 5436 5437 spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_freeze_channel, 5438 ch, bdev_reset_freeze_channel_done); 5439 } 5440 5441 static void 5442 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 5443 { 5444 struct spdk_bdev *bdev = ch->bdev; 5445 5446 assert(!TAILQ_EMPTY(&ch->queued_resets)); 5447 5448 pthread_mutex_lock(&bdev->internal.mutex); 5449 if (bdev->internal.reset_in_progress == NULL) { 5450 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 5451 /* 5452 * Take a channel reference for the target bdev for the life of this 5453 * reset. This guards against the channel getting destroyed while 5454 * spdk_for_each_channel() calls related to this reset IO are in 5455 * progress. We will release the reference when this reset is 5456 * completed. 5457 */ 5458 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 5459 bdev_start_reset(ch); 5460 } 5461 pthread_mutex_unlock(&bdev->internal.mutex); 5462 } 5463 5464 int 5465 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5466 spdk_bdev_io_completion_cb cb, void *cb_arg) 5467 { 5468 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5469 struct spdk_bdev_io *bdev_io; 5470 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5471 5472 bdev_io = bdev_channel_get_io(channel); 5473 if (!bdev_io) { 5474 return -ENOMEM; 5475 } 5476 5477 bdev_io->internal.ch = channel; 5478 bdev_io->internal.desc = desc; 5479 bdev_io->internal.submit_tsc = spdk_get_ticks(); 5480 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 5481 bdev_io->u.reset.ch_ref = NULL; 5482 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5483 5484 pthread_mutex_lock(&bdev->internal.mutex); 5485 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 5486 pthread_mutex_unlock(&bdev->internal.mutex); 5487 5488 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 5489 internal.ch_link); 5490 5491 bdev_channel_start_reset(channel); 5492 5493 return 0; 5494 } 5495 5496 void 5497 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 5498 struct spdk_bdev_io_stat *stat) 5499 { 5500 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5501 5502 *stat = channel->stat; 5503 } 5504 5505 static void 5506 bdev_get_device_stat_done(struct spdk_io_channel_iter *i, int status) 5507 { 5508 void *io_device = spdk_io_channel_iter_get_io_device(i); 5509 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 5510 5511 bdev_iostat_ctx->cb(__bdev_from_io_dev(io_device), bdev_iostat_ctx->stat, 5512 bdev_iostat_ctx->cb_arg, 0); 5513 free(bdev_iostat_ctx); 5514 } 5515 5516 static void 5517 bdev_get_each_channel_stat(struct spdk_io_channel_iter *i) 5518 { 5519 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = 
spdk_io_channel_iter_get_ctx(i); 5520 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 5521 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5522 5523 bdev_io_stat_add(bdev_iostat_ctx->stat, &channel->stat); 5524 spdk_for_each_channel_continue(i, 0); 5525 } 5526 5527 void 5528 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 5529 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 5530 { 5531 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 5532 5533 assert(bdev != NULL); 5534 assert(stat != NULL); 5535 assert(cb != NULL); 5536 5537 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 5538 if (bdev_iostat_ctx == NULL) { 5539 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 5540 cb(bdev, stat, cb_arg, -ENOMEM); 5541 return; 5542 } 5543 5544 bdev_iostat_ctx->stat = stat; 5545 bdev_iostat_ctx->cb = cb; 5546 bdev_iostat_ctx->cb_arg = cb_arg; 5547 5548 /* Start with the statistics from previously deleted channels. */ 5549 pthread_mutex_lock(&bdev->internal.mutex); 5550 bdev_io_stat_add(bdev_iostat_ctx->stat, &bdev->internal.stat); 5551 pthread_mutex_unlock(&bdev->internal.mutex); 5552 5553 /* Then iterate and add the statistics from each existing channel. */ 5554 spdk_for_each_channel(__bdev_to_io_dev(bdev), 5555 bdev_get_each_channel_stat, 5556 bdev_iostat_ctx, 5557 bdev_get_device_stat_done); 5558 } 5559 5560 int 5561 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5562 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 5563 spdk_bdev_io_completion_cb cb, void *cb_arg) 5564 { 5565 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5566 struct spdk_bdev_io *bdev_io; 5567 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5568 5569 if (!desc->write) { 5570 return -EBADF; 5571 } 5572 5573 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 5574 return -ENOTSUP; 5575 } 5576 5577 bdev_io = bdev_channel_get_io(channel); 5578 if (!bdev_io) { 5579 return -ENOMEM; 5580 } 5581 5582 bdev_io->internal.ch = channel; 5583 bdev_io->internal.desc = desc; 5584 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 5585 bdev_io->u.nvme_passthru.cmd = *cmd; 5586 bdev_io->u.nvme_passthru.buf = buf; 5587 bdev_io->u.nvme_passthru.nbytes = nbytes; 5588 bdev_io->u.nvme_passthru.md_buf = NULL; 5589 bdev_io->u.nvme_passthru.md_len = 0; 5590 5591 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5592 5593 bdev_io_submit(bdev_io); 5594 return 0; 5595 } 5596 5597 int 5598 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5599 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 5600 spdk_bdev_io_completion_cb cb, void *cb_arg) 5601 { 5602 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5603 struct spdk_bdev_io *bdev_io; 5604 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5605 5606 if (!desc->write) { 5607 /* 5608 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 5609 * to easily determine if the command is a read or write, but for now just 5610 * do not allow io_passthru with a read-only descriptor. 
5611 */ 5612 return -EBADF; 5613 } 5614 5615 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 5616 return -ENOTSUP; 5617 } 5618 5619 bdev_io = bdev_channel_get_io(channel); 5620 if (!bdev_io) { 5621 return -ENOMEM; 5622 } 5623 5624 bdev_io->internal.ch = channel; 5625 bdev_io->internal.desc = desc; 5626 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 5627 bdev_io->u.nvme_passthru.cmd = *cmd; 5628 bdev_io->u.nvme_passthru.buf = buf; 5629 bdev_io->u.nvme_passthru.nbytes = nbytes; 5630 bdev_io->u.nvme_passthru.md_buf = NULL; 5631 bdev_io->u.nvme_passthru.md_len = 0; 5632 5633 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5634 5635 bdev_io_submit(bdev_io); 5636 return 0; 5637 } 5638 5639 int 5640 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5641 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 5642 spdk_bdev_io_completion_cb cb, void *cb_arg) 5643 { 5644 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5645 struct spdk_bdev_io *bdev_io; 5646 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5647 5648 if (!desc->write) { 5649 /* 5650 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 5651 * to easily determine if the command is a read or write, but for now just 5652 * do not allow io_passthru with a read-only descriptor. 5653 */ 5654 return -EBADF; 5655 } 5656 5657 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 5658 return -ENOTSUP; 5659 } 5660 5661 bdev_io = bdev_channel_get_io(channel); 5662 if (!bdev_io) { 5663 return -ENOMEM; 5664 } 5665 5666 bdev_io->internal.ch = channel; 5667 bdev_io->internal.desc = desc; 5668 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 5669 bdev_io->u.nvme_passthru.cmd = *cmd; 5670 bdev_io->u.nvme_passthru.buf = buf; 5671 bdev_io->u.nvme_passthru.nbytes = nbytes; 5672 bdev_io->u.nvme_passthru.md_buf = md_buf; 5673 bdev_io->u.nvme_passthru.md_len = md_len; 5674 5675 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5676 5677 bdev_io_submit(bdev_io); 5678 return 0; 5679 } 5680 5681 static void bdev_abort_retry(void *ctx); 5682 static void bdev_abort(struct spdk_bdev_io *parent_io); 5683 5684 static void 5685 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5686 { 5687 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 5688 struct spdk_bdev_io *parent_io = cb_arg; 5689 struct spdk_bdev_io *bio_to_abort, *tmp_io; 5690 5691 bio_to_abort = bdev_io->u.abort.bio_to_abort; 5692 5693 spdk_bdev_free_io(bdev_io); 5694 5695 if (!success) { 5696 /* Check if the target I/O completed in the meantime. */ 5697 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 5698 if (tmp_io == bio_to_abort) { 5699 break; 5700 } 5701 } 5702 5703 /* If the target I/O still exists, set the parent to failed. 
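* The abort did not take effect and the target I/O is still outstanding, so the parent abort is reported as failed rather than silently succeeding.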
*/ 5704 if (tmp_io != NULL) { 5705 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5706 } 5707 } 5708 5709 parent_io->u.bdev.split_outstanding--; 5710 if (parent_io->u.bdev.split_outstanding == 0) { 5711 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 5712 bdev_abort_retry(parent_io); 5713 } else { 5714 bdev_io_complete(parent_io); 5715 } 5716 } 5717 } 5718 5719 static int 5720 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 5721 struct spdk_bdev_io *bio_to_abort, 5722 spdk_bdev_io_completion_cb cb, void *cb_arg) 5723 { 5724 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5725 struct spdk_bdev_io *bdev_io; 5726 5727 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 5728 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 5729 /* TODO: Abort reset or abort request. */ 5730 return -ENOTSUP; 5731 } 5732 5733 bdev_io = bdev_channel_get_io(channel); 5734 if (bdev_io == NULL) { 5735 return -ENOMEM; 5736 } 5737 5738 bdev_io->internal.ch = channel; 5739 bdev_io->internal.desc = desc; 5740 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 5741 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5742 5743 if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bio_to_abort)) { 5744 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 5745 5746 /* Parent abort request is not submitted directly, but to manage its 5747 * execution add it to the submitted list here. 5748 */ 5749 bdev_io->internal.submit_tsc = spdk_get_ticks(); 5750 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 5751 5752 bdev_abort(bdev_io); 5753 5754 return 0; 5755 } 5756 5757 bdev_io->u.abort.bio_to_abort = bio_to_abort; 5758 5759 /* Submit the abort request to the underlying bdev module. */ 5760 bdev_io_submit(bdev_io); 5761 5762 return 0; 5763 } 5764 5765 static uint32_t 5766 _bdev_abort(struct spdk_bdev_io *parent_io) 5767 { 5768 struct spdk_bdev_desc *desc = parent_io->internal.desc; 5769 struct spdk_bdev_channel *channel = parent_io->internal.ch; 5770 void *bio_cb_arg; 5771 struct spdk_bdev_io *bio_to_abort; 5772 uint32_t matched_ios; 5773 int rc; 5774 5775 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 5776 5777 /* matched_ios is returned and will be kept by the caller. 5778 * 5779 * This function is used for two cases: 1) the same cb_arg is used for 5780 * multiple I/Os, and 2) a single large I/O is split into smaller ones. 5781 * Incrementing split_outstanding directly here may confuse readers, especially 5782 * in the 1st case. 5783 * 5784 * Completion of I/O abort is processed after stack unwinding. Hence this trick 5785 * works as expected. 5786 */ 5787 matched_ios = 0; 5788 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5789 5790 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 5791 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 5792 continue; 5793 } 5794 5795 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 5796 /* Any I/O which was submitted after this abort command should be excluded.
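* The submit_tsc recorded at submission time is what provides this ordering check.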
*/ 5797 continue; 5798 } 5799 5800 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 5801 if (rc != 0) { 5802 if (rc == -ENOMEM) { 5803 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 5804 } else { 5805 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5806 } 5807 break; 5808 } 5809 matched_ios++; 5810 } 5811 5812 return matched_ios; 5813 } 5814 5815 static void 5816 bdev_abort_retry(void *ctx) 5817 { 5818 struct spdk_bdev_io *parent_io = ctx; 5819 uint32_t matched_ios; 5820 5821 matched_ios = _bdev_abort(parent_io); 5822 5823 if (matched_ios == 0) { 5824 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 5825 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 5826 } else { 5827 /* For retry, the case that no target I/O was found is success 5828 * because it means target I/Os completed in the meantime. 5829 */ 5830 bdev_io_complete(parent_io); 5831 } 5832 return; 5833 } 5834 5835 /* Use split_outstanding to manage the progress of aborting I/Os. */ 5836 parent_io->u.bdev.split_outstanding = matched_ios; 5837 } 5838 5839 static void 5840 bdev_abort(struct spdk_bdev_io *parent_io) 5841 { 5842 uint32_t matched_ios; 5843 5844 matched_ios = _bdev_abort(parent_io); 5845 5846 if (matched_ios == 0) { 5847 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 5848 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 5849 } else { 5850 /* The case where no target I/O was found is a failure. */ 5851 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5852 bdev_io_complete(parent_io); 5853 } 5854 return; 5855 } 5856 5857 /* Use split_outstanding to manage the progress of aborting I/Os. */ 5858 parent_io->u.bdev.split_outstanding = matched_ios; 5859 } 5860 5861 int 5862 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5863 void *bio_cb_arg, 5864 spdk_bdev_io_completion_cb cb, void *cb_arg) 5865 { 5866 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5867 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5868 struct spdk_bdev_io *bdev_io; 5869 5870 if (bio_cb_arg == NULL) { 5871 return -EINVAL; 5872 } 5873 5874 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 5875 return -ENOTSUP; 5876 } 5877 5878 bdev_io = bdev_channel_get_io(channel); 5879 if (bdev_io == NULL) { 5880 return -ENOMEM; 5881 } 5882 5883 bdev_io->internal.ch = channel; 5884 bdev_io->internal.desc = desc; 5885 bdev_io->internal.submit_tsc = spdk_get_ticks(); 5886 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 5887 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5888 5889 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 5890 5891 /* Parent abort request is not submitted directly, but to manage its execution, 5892 * add it to the submitted list here.
5893 */ 5894 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 5895 5896 bdev_abort(bdev_io); 5897 5898 return 0; 5899 } 5900 5901 int 5902 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 5903 struct spdk_bdev_io_wait_entry *entry) 5904 { 5905 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5906 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 5907 5908 if (bdev != entry->bdev) { 5909 SPDK_ERRLOG("bdevs do not match\n"); 5910 return -EINVAL; 5911 } 5912 5913 if (mgmt_ch->per_thread_cache_count > 0) { 5914 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 5915 return -EINVAL; 5916 } 5917 5918 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 5919 return 0; 5920 } 5921 5922 static inline void 5923 bdev_io_complete(void *ctx) 5924 { 5925 struct spdk_bdev_io *bdev_io = ctx; 5926 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 5927 uint64_t tsc, tsc_diff; 5928 5929 if (spdk_unlikely(bdev_io->internal.in_submit_request || bdev_io->internal.io_submit_ch)) { 5930 /* 5931 * Send the completion to the thread that originally submitted the I/O, 5932 * which may not be the current thread in the case of QoS. 5933 */ 5934 if (bdev_io->internal.io_submit_ch) { 5935 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 5936 bdev_io->internal.io_submit_ch = NULL; 5937 } 5938 5939 /* 5940 * Defer completion to avoid potential infinite recursion if the 5941 * user's completion callback issues a new I/O. 5942 */ 5943 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 5944 bdev_io_complete, bdev_io); 5945 return; 5946 } 5947 5948 tsc = spdk_get_ticks(); 5949 tsc_diff = tsc - bdev_io->internal.submit_tsc; 5950 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 5951 bdev_io->internal.caller_ctx); 5952 5953 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 5954 5955 if (bdev_io->internal.ch->histogram) { 5956 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 5957 } 5958 5959 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5960 switch (bdev_io->type) { 5961 case SPDK_BDEV_IO_TYPE_READ: 5962 bdev_io->internal.ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5963 bdev_io->internal.ch->stat.num_read_ops++; 5964 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 5965 break; 5966 case SPDK_BDEV_IO_TYPE_WRITE: 5967 bdev_io->internal.ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5968 bdev_io->internal.ch->stat.num_write_ops++; 5969 bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 5970 break; 5971 case SPDK_BDEV_IO_TYPE_UNMAP: 5972 bdev_io->internal.ch->stat.bytes_unmapped += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5973 bdev_io->internal.ch->stat.num_unmap_ops++; 5974 bdev_io->internal.ch->stat.unmap_latency_ticks += tsc_diff; 5975 break; 5976 case SPDK_BDEV_IO_TYPE_ZCOPY: 5977 /* Track the data in the start phase only */ 5978 if (bdev_io->u.bdev.zcopy.start) { 5979 if (bdev_io->u.bdev.zcopy.populate) { 5980 bdev_io->internal.ch->stat.bytes_read += 5981 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5982 bdev_io->internal.ch->stat.num_read_ops++; 5983 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 5984 } else { 5985 bdev_io->internal.ch->stat.bytes_written += 5986 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5987 bdev_io->internal.ch->stat.num_write_ops++; 5988 
bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 5989 } 5990 } 5991 break; 5992 default: 5993 break; 5994 } 5995 } 5996 5997 #ifdef SPDK_CONFIG_VTUNE 5998 uint64_t now_tsc = spdk_get_ticks(); 5999 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 6000 uint64_t data[5]; 6001 6002 data[0] = bdev_io->internal.ch->stat.num_read_ops - bdev_io->internal.ch->prev_stat.num_read_ops; 6003 data[1] = bdev_io->internal.ch->stat.bytes_read - bdev_io->internal.ch->prev_stat.bytes_read; 6004 data[2] = bdev_io->internal.ch->stat.num_write_ops - bdev_io->internal.ch->prev_stat.num_write_ops; 6005 data[3] = bdev_io->internal.ch->stat.bytes_written - bdev_io->internal.ch->prev_stat.bytes_written; 6006 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 6007 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 6008 6009 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 6010 __itt_metadata_u64, 5, data); 6011 6012 bdev_io->internal.ch->prev_stat = bdev_io->internal.ch->stat; 6013 bdev_io->internal.ch->start_tsc = now_tsc; 6014 } 6015 #endif 6016 6017 assert(bdev_io->internal.cb != NULL); 6018 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 6019 6020 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 6021 bdev_io->internal.caller_ctx); 6022 } 6023 6024 static void bdev_destroy_cb(void *io_device); 6025 6026 static void 6027 bdev_reset_complete(struct spdk_io_channel_iter *i, int status) 6028 { 6029 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 6030 struct spdk_bdev *bdev = bdev_io->bdev; 6031 6032 if (bdev_io->u.reset.ch_ref != NULL) { 6033 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 6034 bdev_io->u.reset.ch_ref = NULL; 6035 } 6036 6037 bdev_io_complete(bdev_io); 6038 6039 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 6040 TAILQ_EMPTY(&bdev->internal.open_descs)) { 6041 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6042 } 6043 } 6044 6045 static void 6046 bdev_unfreeze_channel(struct spdk_io_channel_iter *i) 6047 { 6048 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 6049 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6050 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6051 struct spdk_bdev_io *queued_reset; 6052 6053 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 6054 while (!TAILQ_EMPTY(&ch->queued_resets)) { 6055 queued_reset = TAILQ_FIRST(&ch->queued_resets); 6056 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 6057 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 6058 } 6059 6060 spdk_for_each_channel_continue(i, 0); 6061 } 6062 6063 void 6064 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 6065 { 6066 struct spdk_bdev *bdev = bdev_io->bdev; 6067 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6068 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 6069 6070 bdev_io->internal.status = status; 6071 6072 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 6073 bool unlock_channels = false; 6074 6075 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 6076 SPDK_ERRLOG("NOMEM returned for reset\n"); 6077 } 6078 pthread_mutex_lock(&bdev->internal.mutex); 6079 if (bdev_io == bdev->internal.reset_in_progress) { 6080 bdev->internal.reset_in_progress = NULL; 6081 unlock_channels = true; 6082 } 6083 pthread_mutex_unlock(&bdev->internal.mutex); 6084 6085 if 
(unlock_channels) { 6086 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unfreeze_channel, 6087 bdev_io, bdev_reset_complete); 6088 return; 6089 } 6090 } else { 6091 if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0)) { 6092 _bdev_io_push_bounce_data_buffer(bdev_io, _bdev_io_complete_push_bounce_done); 6093 /* bdev IO will be completed in the callback */ 6094 return; 6095 } 6096 6097 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 6098 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 6099 return; 6100 } 6101 } 6102 6103 bdev_io_complete(bdev_io); 6104 } 6105 6106 void 6107 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 6108 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 6109 { 6110 if (sc == SPDK_SCSI_STATUS_GOOD) { 6111 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6112 } else { 6113 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 6114 bdev_io->internal.error.scsi.sc = sc; 6115 bdev_io->internal.error.scsi.sk = sk; 6116 bdev_io->internal.error.scsi.asc = asc; 6117 bdev_io->internal.error.scsi.ascq = ascq; 6118 } 6119 6120 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6121 } 6122 6123 void 6124 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 6125 int *sc, int *sk, int *asc, int *ascq) 6126 { 6127 assert(sc != NULL); 6128 assert(sk != NULL); 6129 assert(asc != NULL); 6130 assert(ascq != NULL); 6131 6132 switch (bdev_io->internal.status) { 6133 case SPDK_BDEV_IO_STATUS_SUCCESS: 6134 *sc = SPDK_SCSI_STATUS_GOOD; 6135 *sk = SPDK_SCSI_SENSE_NO_SENSE; 6136 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 6137 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 6138 break; 6139 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 6140 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 6141 break; 6142 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 6143 *sc = bdev_io->internal.error.scsi.sc; 6144 *sk = bdev_io->internal.error.scsi.sk; 6145 *asc = bdev_io->internal.error.scsi.asc; 6146 *ascq = bdev_io->internal.error.scsi.ascq; 6147 break; 6148 default: 6149 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 6150 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 6151 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 6152 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 6153 break; 6154 } 6155 } 6156 6157 void 6158 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 6159 { 6160 if (aio_result == 0) { 6161 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6162 } else { 6163 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 6164 } 6165 6166 bdev_io->internal.error.aio_result = aio_result; 6167 6168 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6169 } 6170 6171 void 6172 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 6173 { 6174 assert(aio_result != NULL); 6175 6176 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 6177 *aio_result = bdev_io->internal.error.aio_result; 6178 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6179 *aio_result = 0; 6180 } else { 6181 *aio_result = -EIO; 6182 } 6183 } 6184 6185 void 6186 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 6187 { 6188 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 6189 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6190 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 6191 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 6192 } else { 6193 
bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 6194 } 6195 6196 bdev_io->internal.error.nvme.cdw0 = cdw0; 6197 bdev_io->internal.error.nvme.sct = sct; 6198 bdev_io->internal.error.nvme.sc = sc; 6199 6200 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6201 } 6202 6203 void 6204 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 6205 { 6206 assert(sct != NULL); 6207 assert(sc != NULL); 6208 assert(cdw0 != NULL); 6209 6210 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 6211 *sct = SPDK_NVME_SCT_GENERIC; 6212 *sc = SPDK_NVME_SC_SUCCESS; 6213 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6214 *cdw0 = 0; 6215 } else { 6216 *cdw0 = 1U; 6217 } 6218 return; 6219 } 6220 6221 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 6222 *sct = bdev_io->internal.error.nvme.sct; 6223 *sc = bdev_io->internal.error.nvme.sc; 6224 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6225 *sct = SPDK_NVME_SCT_GENERIC; 6226 *sc = SPDK_NVME_SC_SUCCESS; 6227 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 6228 *sct = SPDK_NVME_SCT_GENERIC; 6229 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6230 } else { 6231 *sct = SPDK_NVME_SCT_GENERIC; 6232 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6233 } 6234 6235 *cdw0 = bdev_io->internal.error.nvme.cdw0; 6236 } 6237 6238 void 6239 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 6240 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 6241 { 6242 assert(first_sct != NULL); 6243 assert(first_sc != NULL); 6244 assert(second_sct != NULL); 6245 assert(second_sc != NULL); 6246 assert(cdw0 != NULL); 6247 6248 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 6249 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 6250 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 6251 *first_sct = bdev_io->internal.error.nvme.sct; 6252 *first_sc = bdev_io->internal.error.nvme.sc; 6253 *second_sct = SPDK_NVME_SCT_GENERIC; 6254 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6255 } else { 6256 *first_sct = SPDK_NVME_SCT_GENERIC; 6257 *first_sc = SPDK_NVME_SC_SUCCESS; 6258 *second_sct = bdev_io->internal.error.nvme.sct; 6259 *second_sc = bdev_io->internal.error.nvme.sc; 6260 } 6261 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 6262 *first_sct = SPDK_NVME_SCT_GENERIC; 6263 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6264 *second_sct = SPDK_NVME_SCT_GENERIC; 6265 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6266 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6267 *first_sct = SPDK_NVME_SCT_GENERIC; 6268 *first_sc = SPDK_NVME_SC_SUCCESS; 6269 *second_sct = SPDK_NVME_SCT_GENERIC; 6270 *second_sc = SPDK_NVME_SC_SUCCESS; 6271 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 6272 *first_sct = SPDK_NVME_SCT_GENERIC; 6273 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6274 *second_sct = SPDK_NVME_SCT_GENERIC; 6275 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6276 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 6277 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 6278 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 6279 *second_sct = SPDK_NVME_SCT_GENERIC; 6280 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6281 } else { 6282 *first_sct = SPDK_NVME_SCT_GENERIC; 6283 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6284 *second_sct = SPDK_NVME_SCT_GENERIC; 6285 
*second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6286 } 6287 6288 *cdw0 = bdev_io->internal.error.nvme.cdw0; 6289 } 6290 6291 struct spdk_thread * 6292 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 6293 { 6294 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 6295 } 6296 6297 struct spdk_io_channel * 6298 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 6299 { 6300 return bdev_io->internal.ch->channel; 6301 } 6302 6303 static int 6304 bdev_register(struct spdk_bdev *bdev) 6305 { 6306 char *bdev_name; 6307 char uuid[SPDK_UUID_STRING_LEN]; 6308 int ret; 6309 6310 assert(bdev->module != NULL); 6311 6312 if (!bdev->name) { 6313 SPDK_ERRLOG("Bdev name is NULL\n"); 6314 return -EINVAL; 6315 } 6316 6317 if (!strlen(bdev->name)) { 6318 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 6319 return -EINVAL; 6320 } 6321 6322 /* Users often register their own I/O devices using the bdev name. In 6323 * order to avoid conflicts, prepend bdev_. */ 6324 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 6325 if (!bdev_name) { 6326 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 6327 return -ENOMEM; 6328 } 6329 6330 bdev->internal.status = SPDK_BDEV_STATUS_READY; 6331 bdev->internal.measured_queue_depth = UINT64_MAX; 6332 bdev->internal.claim_module = NULL; 6333 bdev->internal.qd_poller = NULL; 6334 bdev->internal.qos = NULL; 6335 6336 TAILQ_INIT(&bdev->internal.open_descs); 6337 TAILQ_INIT(&bdev->internal.locked_ranges); 6338 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 6339 TAILQ_INIT(&bdev->aliases); 6340 6341 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 6342 if (ret != 0) { 6343 free(bdev_name); 6344 return ret; 6345 } 6346 6347 /* If the user didn't specify a uuid, generate one. */ 6348 if (spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { 6349 spdk_uuid_generate(&bdev->uuid); 6350 } 6351 6352 /* Add the UUID alias only if it's different than the name */ 6353 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 6354 if (strcmp(bdev->name, uuid) != 0) { 6355 ret = spdk_bdev_alias_add(bdev, uuid); 6356 if (ret != 0) { 6357 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 6358 bdev_name_del(&bdev->internal.bdev_name); 6359 free(bdev_name); 6360 return ret; 6361 } 6362 } 6363 6364 if (spdk_bdev_get_buf_align(bdev) > 1) { 6365 if (bdev->split_on_optimal_io_boundary) { 6366 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 6367 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 6368 } else { 6369 bdev->split_on_optimal_io_boundary = true; 6370 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 6371 } 6372 } 6373 6374 /* If the user didn't specify a write unit size, set it to one. 
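* A write_unit_size of one block means writes carry no additional granularity requirement beyond the logical block size.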
*/ 6375 if (bdev->write_unit_size == 0) { 6376 bdev->write_unit_size = 1; 6377 } 6378 6379 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 6380 if (bdev->acwu == 0) { 6381 bdev->acwu = bdev->write_unit_size; 6382 } 6383 6384 if (bdev->phys_blocklen == 0) { 6385 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 6386 } 6387 6388 bdev->internal.reset_in_progress = NULL; 6389 bdev->internal.qd_poll_in_progress = false; 6390 bdev->internal.period = 0; 6391 bdev->internal.new_period = 0; 6392 6393 spdk_io_device_register(__bdev_to_io_dev(bdev), 6394 bdev_channel_create, bdev_channel_destroy, 6395 sizeof(struct spdk_bdev_channel), 6396 bdev_name); 6397 6398 free(bdev_name); 6399 6400 pthread_mutex_init(&bdev->internal.mutex, NULL); 6401 6402 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 6403 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 6404 6405 return 0; 6406 } 6407 6408 static void 6409 bdev_destroy_cb(void *io_device) 6410 { 6411 int rc; 6412 struct spdk_bdev *bdev; 6413 spdk_bdev_unregister_cb cb_fn; 6414 void *cb_arg; 6415 6416 bdev = __bdev_from_io_dev(io_device); 6417 cb_fn = bdev->internal.unregister_cb; 6418 cb_arg = bdev->internal.unregister_ctx; 6419 6420 pthread_mutex_destroy(&bdev->internal.mutex); 6421 free(bdev->internal.qos); 6422 6423 rc = bdev->fn_table->destruct(bdev->ctxt); 6424 if (rc < 0) { 6425 SPDK_ERRLOG("destruct failed\n"); 6426 } 6427 if (rc <= 0 && cb_fn != NULL) { 6428 cb_fn(cb_arg, rc); 6429 } 6430 } 6431 6432 void 6433 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 6434 { 6435 if (bdev->internal.unregister_cb != NULL) { 6436 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 6437 } 6438 } 6439 6440 static void 6441 _remove_notify(void *arg) 6442 { 6443 struct spdk_bdev_desc *desc = arg; 6444 6445 pthread_mutex_lock(&desc->mutex); 6446 desc->refs--; 6447 6448 if (!desc->closed) { 6449 pthread_mutex_unlock(&desc->mutex); 6450 desc->callback.event_fn(SPDK_BDEV_EVENT_REMOVE, desc->bdev, desc->callback.ctx); 6451 return; 6452 } else if (0 == desc->refs) { 6453 /* This descriptor was closed after this remove_notify message was sent. 6454 * spdk_bdev_close() could not free the descriptor since this message was 6455 * in flight, so we free it now using bdev_desc_free(). 6456 */ 6457 pthread_mutex_unlock(&desc->mutex); 6458 bdev_desc_free(desc); 6459 return; 6460 } 6461 pthread_mutex_unlock(&desc->mutex); 6462 } 6463 6464 /* Must be called while holding g_bdev_mgr.mutex and bdev->internal.mutex. 6465 * returns: 0 - bdev removed and ready to be destructed. 6466 * -EBUSY - bdev can't be destructed yet. */ 6467 static int 6468 bdev_unregister_unsafe(struct spdk_bdev *bdev) 6469 { 6470 struct spdk_bdev_desc *desc, *tmp; 6471 int rc = 0; 6472 char uuid[SPDK_UUID_STRING_LEN]; 6473 6474 /* Notify each descriptor about hotremoval */ 6475 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 6476 rc = -EBUSY; 6477 pthread_mutex_lock(&desc->mutex); 6478 /* 6479 * Defer invocation of the event_cb to a separate message that will 6480 * run later on its thread. This ensures this context unwinds and 6481 * we don't recursively unregister this bdev again if the event_cb 6482 * immediately closes its descriptor. 
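* The reference taken on the descriptor just below keeps it alive until _remove_notify() has run on the descriptor's owning thread.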
6483 */ 6484 desc->refs++; 6485 spdk_thread_send_msg(desc->thread, _remove_notify, desc); 6486 pthread_mutex_unlock(&desc->mutex); 6487 } 6488 6489 /* If there are no descriptors, proceed removing the bdev */ 6490 if (rc == 0) { 6491 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 6492 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 6493 6494 /* Delete the name and the UUID alias */ 6495 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 6496 bdev_name_del_unsafe(&bdev->internal.bdev_name); 6497 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 6498 6499 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 6500 6501 if (bdev->internal.reset_in_progress != NULL) { 6502 /* If reset is in progress, let the completion callback for reset 6503 * unregister the bdev. 6504 */ 6505 rc = -EBUSY; 6506 } 6507 } 6508 6509 return rc; 6510 } 6511 6512 static void 6513 bdev_unregister_abort_channel(struct spdk_io_channel_iter *i) 6514 { 6515 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 6516 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch); 6517 6518 bdev_channel_abort_queued_ios(bdev_ch); 6519 spdk_for_each_channel_continue(i, 0); 6520 } 6521 6522 static void 6523 bdev_unregister(struct spdk_io_channel_iter *i, int status) 6524 { 6525 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 6526 int rc; 6527 6528 pthread_mutex_lock(&g_bdev_mgr.mutex); 6529 pthread_mutex_lock(&bdev->internal.mutex); 6530 /* 6531 * Set the status to REMOVING after completing to abort channels. Otherwise, 6532 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 6533 * spdk_for_each_channel() is executed and spdk_io_device_unregister() may fail. 6534 */ 6535 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 6536 rc = bdev_unregister_unsafe(bdev); 6537 pthread_mutex_unlock(&bdev->internal.mutex); 6538 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6539 6540 if (rc == 0) { 6541 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6542 } 6543 } 6544 6545 void 6546 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 6547 { 6548 struct spdk_thread *thread; 6549 6550 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 6551 6552 thread = spdk_get_thread(); 6553 if (!thread) { 6554 /* The user called this from a non-SPDK thread. 
*/ 6555 if (cb_fn != NULL) { 6556 cb_fn(cb_arg, -ENOTSUP); 6557 } 6558 return; 6559 } 6560 6561 pthread_mutex_lock(&g_bdev_mgr.mutex); 6562 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 6563 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 6564 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6565 if (cb_fn) { 6566 cb_fn(cb_arg, -EBUSY); 6567 } 6568 return; 6569 } 6570 6571 pthread_mutex_lock(&bdev->internal.mutex); 6572 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 6573 bdev->internal.unregister_cb = cb_fn; 6574 bdev->internal.unregister_ctx = cb_arg; 6575 pthread_mutex_unlock(&bdev->internal.mutex); 6576 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6577 6578 spdk_bdev_set_qd_sampling_period(bdev, 0); 6579 6580 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6581 bdev_unregister_abort_channel, 6582 bdev, 6583 bdev_unregister); 6584 } 6585 6586 int 6587 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 6588 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 6589 { 6590 struct spdk_bdev_desc *desc; 6591 struct spdk_bdev *bdev; 6592 int rc; 6593 6594 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 6595 if (rc != 0) { 6596 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 6597 return rc; 6598 } 6599 6600 bdev = spdk_bdev_desc_get_bdev(desc); 6601 6602 if (bdev->module != module) { 6603 spdk_bdev_close(desc); 6604 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 6605 bdev_name); 6606 return -ENODEV; 6607 } 6608 6609 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 6610 6611 spdk_bdev_close(desc); 6612 6613 return 0; 6614 } 6615 6616 static int 6617 bdev_start_qos(struct spdk_bdev *bdev) 6618 { 6619 struct set_qos_limit_ctx *ctx; 6620 6621 /* Enable QoS */ 6622 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 6623 ctx = calloc(1, sizeof(*ctx)); 6624 if (ctx == NULL) { 6625 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 6626 return -ENOMEM; 6627 } 6628 ctx->bdev = bdev; 6629 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6630 bdev_enable_qos_msg, ctx, 6631 bdev_enable_qos_done); 6632 } 6633 6634 return 0; 6635 } 6636 6637 static int 6638 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 6639 { 6640 struct spdk_thread *thread; 6641 int rc = 0; 6642 6643 thread = spdk_get_thread(); 6644 if (!thread) { 6645 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 6646 return -ENOTSUP; 6647 } 6648 6649 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 6650 spdk_get_thread()); 6651 6652 desc->bdev = bdev; 6653 desc->thread = thread; 6654 desc->write = write; 6655 6656 pthread_mutex_lock(&bdev->internal.mutex); 6657 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 6658 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 6659 pthread_mutex_unlock(&bdev->internal.mutex); 6660 return -ENODEV; 6661 } 6662 6663 if (write && bdev->internal.claim_module) { 6664 SPDK_ERRLOG("Could not open %s - %s module already claimed it\n", 6665 bdev->name, bdev->internal.claim_module->name); 6666 pthread_mutex_unlock(&bdev->internal.mutex); 6667 return -EPERM; 6668 } 6669 6670 rc = bdev_start_qos(bdev); 6671 if (rc != 0) { 6672 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 6673 pthread_mutex_unlock(&bdev->internal.mutex); 6674 return rc; 6675 } 6676 6677 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 6678 6679 pthread_mutex_unlock(&bdev->internal.mutex); 6680 6681 return 0; 6682 } 6683 
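/*
 * Illustrative sketch only: a minimal consumer-side open/close sequence built on
 * spdk_bdev_open_ext(), spdk_bdev_desc_get_bdev() and spdk_bdev_close() as defined
 * in this file. The bdev name "Malloc0", the example_* identifiers and the global
 * descriptor are hypothetical and not part of the bdev layer itself.
 *
 *   static struct spdk_bdev_desc *g_example_desc;
 *
 *   static void
 *   example_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx)
 *   {
 *           struct spdk_bdev_desc **desc = event_ctx;
 *
 *           if (type == SPDK_BDEV_EVENT_REMOVE && *desc != NULL) {
 *                   // Hot remove: close on the same SPDK thread that opened the descriptor.
 *                   spdk_bdev_close(*desc);
 *                   *desc = NULL;
 *           }
 *   }
 *
 *   static int
 *   example_open(void)
 *   {
 *           int rc;
 *
 *           rc = spdk_bdev_open_ext("Malloc0", true, example_event_cb,
 *                                   &g_example_desc, &g_example_desc);
 *           if (rc != 0) {
 *                   return rc;
 *           }
 *
 *           // The descriptor can now be used to obtain an I/O channel and submit I/O
 *           // against spdk_bdev_desc_get_bdev(g_example_desc).
 *           return 0;
 *   }
 */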
6684 static int 6685 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 6686 struct spdk_bdev_desc **_desc) 6687 { 6688 struct spdk_bdev_desc *desc; 6689 unsigned int event_id; 6690 6691 desc = calloc(1, sizeof(*desc)); 6692 if (desc == NULL) { 6693 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 6694 return -ENOMEM; 6695 } 6696 6697 TAILQ_INIT(&desc->pending_media_events); 6698 TAILQ_INIT(&desc->free_media_events); 6699 6700 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 6701 desc->callback.event_fn = event_cb; 6702 desc->callback.ctx = event_ctx; 6703 pthread_mutex_init(&desc->mutex, NULL); 6704 6705 if (bdev->media_events) { 6706 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 6707 sizeof(*desc->media_events_buffer)); 6708 if (desc->media_events_buffer == NULL) { 6709 SPDK_ERRLOG("Failed to initialize media event pool\n"); 6710 bdev_desc_free(desc); 6711 return -ENOMEM; 6712 } 6713 6714 for (event_id = 0; event_id < MEDIA_EVENT_POOL_SIZE; ++event_id) { 6715 TAILQ_INSERT_TAIL(&desc->free_media_events, 6716 &desc->media_events_buffer[event_id], tailq); 6717 } 6718 } 6719 6720 *_desc = desc; 6721 6722 return 0; 6723 } 6724 6725 int 6726 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 6727 void *event_ctx, struct spdk_bdev_desc **_desc) 6728 { 6729 struct spdk_bdev_desc *desc; 6730 struct spdk_bdev *bdev; 6731 int rc; 6732 6733 if (event_cb == NULL) { 6734 SPDK_ERRLOG("Missing event callback function\n"); 6735 return -EINVAL; 6736 } 6737 6738 pthread_mutex_lock(&g_bdev_mgr.mutex); 6739 6740 bdev = bdev_get_by_name(bdev_name); 6741 6742 if (bdev == NULL) { 6743 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 6744 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6745 return -ENODEV; 6746 } 6747 6748 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 6749 if (rc != 0) { 6750 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6751 return rc; 6752 } 6753 6754 rc = bdev_open(bdev, write, desc); 6755 if (rc != 0) { 6756 bdev_desc_free(desc); 6757 desc = NULL; 6758 } 6759 6760 *_desc = desc; 6761 6762 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6763 6764 return rc; 6765 } 6766 6767 static void 6768 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 6769 { 6770 int rc; 6771 6772 pthread_mutex_lock(&bdev->internal.mutex); 6773 pthread_mutex_lock(&desc->mutex); 6774 6775 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 6776 6777 desc->closed = true; 6778 6779 if (0 == desc->refs) { 6780 pthread_mutex_unlock(&desc->mutex); 6781 bdev_desc_free(desc); 6782 } else { 6783 pthread_mutex_unlock(&desc->mutex); 6784 } 6785 6786 /* If no more descriptors, kill QoS channel */ 6787 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 6788 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 6789 bdev->name, spdk_get_thread()); 6790 6791 if (bdev_qos_destroy(bdev)) { 6792 /* There isn't anything we can do to recover here. Just let the 6793 * old QoS poller keep running. The QoS handling won't change 6794 * cores when the user allocates a new channel, but it won't break. */ 6795 SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); 6796 } 6797 } 6798 6799 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 6800 rc = bdev_unregister_unsafe(bdev); 6801 pthread_mutex_unlock(&bdev->internal.mutex); 6802 6803 if (rc == 0) { 6804 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6805 } 6806 } else { 6807 pthread_mutex_unlock(&bdev->internal.mutex); 6808 } 6809 } 6810 6811 void 6812 spdk_bdev_close(struct spdk_bdev_desc *desc) 6813 { 6814 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6815 6816 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 6817 spdk_get_thread()); 6818 6819 assert(desc->thread == spdk_get_thread()); 6820 6821 spdk_poller_unregister(&desc->io_timeout_poller); 6822 6823 pthread_mutex_lock(&g_bdev_mgr.mutex); 6824 6825 bdev_close(bdev, desc); 6826 6827 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6828 } 6829 6830 static void 6831 bdev_register_finished(void *arg) 6832 { 6833 struct spdk_bdev_desc *desc = arg; 6834 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6835 6836 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 6837 6838 bdev_close(bdev, desc); 6839 } 6840 6841 int 6842 spdk_bdev_register(struct spdk_bdev *bdev) 6843 { 6844 struct spdk_bdev_desc *desc; 6845 int rc; 6846 6847 rc = bdev_register(bdev); 6848 if (rc != 0) { 6849 return rc; 6850 } 6851 6852 /* A descriptor is opened to prevent bdev deletion during examination */ 6853 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 6854 if (rc != 0) { 6855 spdk_bdev_unregister(bdev, NULL, NULL); 6856 return rc; 6857 } 6858 6859 rc = bdev_open(bdev, false, desc); 6860 if (rc != 0) { 6861 bdev_desc_free(desc); 6862 spdk_bdev_unregister(bdev, NULL, NULL); 6863 return rc; 6864 } 6865 6866 /* Examine configuration before initializing I/O */ 6867 bdev_examine(bdev); 6868 6869 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 6870 if (rc != 0) { 6871 bdev_close(bdev, desc); 6872 spdk_bdev_unregister(bdev, NULL, NULL); 6873 } 6874 6875 return rc; 6876 } 6877 6878 int 6879 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 6880 struct spdk_bdev_module *module) 6881 { 6882 if (bdev->internal.claim_module != NULL) { 6883 SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, 6884 bdev->internal.claim_module->name); 6885 return -EPERM; 6886 } 6887 6888 if (desc && !desc->write) { 6889 desc->write = true; 6890 } 6891 6892 bdev->internal.claim_module = module; 6893 return 0; 6894 } 6895 6896 void 6897 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 6898 { 6899 assert(bdev->internal.claim_module != NULL); 6900 bdev->internal.claim_module = NULL; 6901 } 6902 6903 struct spdk_bdev * 6904 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 6905 { 6906 assert(desc != NULL); 6907 return desc->bdev; 6908 } 6909 6910 int 6911 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 6912 { 6913 struct spdk_bdev *bdev, *tmp; 6914 struct spdk_bdev_desc *desc; 6915 int rc = 0; 6916 6917 assert(fn != NULL); 6918 6919 pthread_mutex_lock(&g_bdev_mgr.mutex); 6920 bdev = spdk_bdev_first(); 6921 while (bdev != NULL) { 6922 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 6923 if (rc != 0) { 6924 break; 6925 } 6926 rc = bdev_open(bdev, false, desc); 6927 if (rc != 0) { 6928 bdev_desc_free(desc); 6929 break; 6930 } 6931 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6932 6933 rc = fn(ctx, bdev); 6934 6935 
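/* fn() ran with g_bdev_mgr.mutex released; reacquire it before advancing the
 * iterator and closing the temporary descriptor that kept this bdev open.
 */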
pthread_mutex_lock(&g_bdev_mgr.mutex); 6936 tmp = spdk_bdev_next(bdev); 6937 bdev_close(bdev, desc); 6938 if (rc != 0) { 6939 break; 6940 } 6941 bdev = tmp; 6942 } 6943 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6944 6945 return rc; 6946 } 6947 6948 int 6949 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 6950 { 6951 struct spdk_bdev *bdev, *tmp; 6952 struct spdk_bdev_desc *desc; 6953 int rc = 0; 6954 6955 assert(fn != NULL); 6956 6957 pthread_mutex_lock(&g_bdev_mgr.mutex); 6958 bdev = spdk_bdev_first_leaf(); 6959 while (bdev != NULL) { 6960 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 6961 if (rc != 0) { 6962 break; 6963 } 6964 rc = bdev_open(bdev, false, desc); 6965 if (rc != 0) { 6966 bdev_desc_free(desc); 6967 break; 6968 } 6969 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6970 6971 rc = fn(ctx, bdev); 6972 6973 pthread_mutex_lock(&g_bdev_mgr.mutex); 6974 tmp = spdk_bdev_next_leaf(bdev); 6975 bdev_close(bdev, desc); 6976 if (rc != 0) { 6977 break; 6978 } 6979 bdev = tmp; 6980 } 6981 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6982 6983 return rc; 6984 } 6985 6986 void 6987 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 6988 { 6989 struct iovec *iovs; 6990 int iovcnt; 6991 6992 if (bdev_io == NULL) { 6993 return; 6994 } 6995 6996 switch (bdev_io->type) { 6997 case SPDK_BDEV_IO_TYPE_READ: 6998 case SPDK_BDEV_IO_TYPE_WRITE: 6999 case SPDK_BDEV_IO_TYPE_ZCOPY: 7000 iovs = bdev_io->u.bdev.iovs; 7001 iovcnt = bdev_io->u.bdev.iovcnt; 7002 break; 7003 default: 7004 iovs = NULL; 7005 iovcnt = 0; 7006 break; 7007 } 7008 7009 if (iovp) { 7010 *iovp = iovs; 7011 } 7012 if (iovcntp) { 7013 *iovcntp = iovcnt; 7014 } 7015 } 7016 7017 void * 7018 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 7019 { 7020 if (bdev_io == NULL) { 7021 return NULL; 7022 } 7023 7024 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 7025 return NULL; 7026 } 7027 7028 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 7029 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 7030 return bdev_io->u.bdev.md_buf; 7031 } 7032 7033 return NULL; 7034 } 7035 7036 void * 7037 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 7038 { 7039 if (bdev_io == NULL) { 7040 assert(false); 7041 return NULL; 7042 } 7043 7044 return bdev_io->internal.caller_ctx; 7045 } 7046 7047 void 7048 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 7049 { 7050 7051 if (spdk_bdev_module_list_find(bdev_module->name)) { 7052 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 7053 assert(false); 7054 } 7055 7056 /* 7057 * Modules with examine callbacks must be initialized first, so they are 7058 * ready to handle examine callbacks from later modules that will 7059 * register physical bdevs. 
7060 */ 7061 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 7062 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 7063 } else { 7064 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 7065 } 7066 } 7067 7068 struct spdk_bdev_module * 7069 spdk_bdev_module_list_find(const char *name) 7070 { 7071 struct spdk_bdev_module *bdev_module; 7072 7073 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 7074 if (strcmp(name, bdev_module->name) == 0) { 7075 break; 7076 } 7077 } 7078 7079 return bdev_module; 7080 } 7081 7082 static void 7083 bdev_write_zero_buffer_next(void *_bdev_io) 7084 { 7085 struct spdk_bdev_io *bdev_io = _bdev_io; 7086 uint64_t num_bytes, num_blocks; 7087 void *md_buf = NULL; 7088 int rc; 7089 7090 num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * 7091 bdev_io->u.bdev.split_remaining_num_blocks, 7092 ZERO_BUFFER_SIZE); 7093 num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); 7094 num_blocks -= num_blocks % bdev_io->bdev->write_unit_size; 7095 7096 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 7097 md_buf = (char *)g_bdev_mgr.zero_buffer + 7098 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 7099 } 7100 7101 rc = bdev_write_blocks_with_md(bdev_io->internal.desc, 7102 spdk_io_channel_from_ctx(bdev_io->internal.ch), 7103 g_bdev_mgr.zero_buffer, md_buf, 7104 bdev_io->u.bdev.split_current_offset_blocks, num_blocks, 7105 bdev_write_zero_buffer_done, bdev_io); 7106 if (rc == 0) { 7107 bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; 7108 bdev_io->u.bdev.split_current_offset_blocks += num_blocks; 7109 } else if (rc == -ENOMEM) { 7110 bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); 7111 } else { 7112 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7113 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 7114 } 7115 } 7116 7117 static void 7118 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 7119 { 7120 struct spdk_bdev_io *parent_io = cb_arg; 7121 7122 spdk_bdev_free_io(bdev_io); 7123 7124 if (!success) { 7125 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7126 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 7127 return; 7128 } 7129 7130 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 7131 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 7132 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 7133 return; 7134 } 7135 7136 bdev_write_zero_buffer_next(parent_io); 7137 } 7138 7139 static void 7140 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 7141 { 7142 pthread_mutex_lock(&ctx->bdev->internal.mutex); 7143 ctx->bdev->internal.qos_mod_in_progress = false; 7144 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 7145 7146 if (ctx->cb_fn) { 7147 ctx->cb_fn(ctx->cb_arg, status); 7148 } 7149 free(ctx); 7150 } 7151 7152 static void 7153 bdev_disable_qos_done(void *cb_arg) 7154 { 7155 struct set_qos_limit_ctx *ctx = cb_arg; 7156 struct spdk_bdev *bdev = ctx->bdev; 7157 struct spdk_bdev_io *bdev_io; 7158 struct spdk_bdev_qos *qos; 7159 7160 pthread_mutex_lock(&bdev->internal.mutex); 7161 qos = bdev->internal.qos; 7162 bdev->internal.qos = NULL; 7163 pthread_mutex_unlock(&bdev->internal.mutex); 7164 7165 while (!TAILQ_EMPTY(&qos->queued)) { 7166 /* Send queued I/O back to their original thread for resubmission. 
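* QoS is being torn down for this bdev, so these I/O are resubmitted through the regular, non-rate-limited path.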
*/ 7167 bdev_io = TAILQ_FIRST(&qos->queued); 7168 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 7169 7170 if (bdev_io->internal.io_submit_ch) { 7171 /* 7172 * Channel was changed when sending it to the QoS thread - change it back 7173 * before sending it back to the original thread. 7174 */ 7175 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 7176 bdev_io->internal.io_submit_ch = NULL; 7177 } 7178 7179 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7180 _bdev_io_submit, bdev_io); 7181 } 7182 7183 if (qos->thread != NULL) { 7184 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 7185 spdk_poller_unregister(&qos->poller); 7186 } 7187 7188 free(qos); 7189 7190 bdev_set_qos_limit_done(ctx, 0); 7191 } 7192 7193 static void 7194 bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status) 7195 { 7196 void *io_device = spdk_io_channel_iter_get_io_device(i); 7197 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 7198 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7199 struct spdk_thread *thread; 7200 7201 pthread_mutex_lock(&bdev->internal.mutex); 7202 thread = bdev->internal.qos->thread; 7203 pthread_mutex_unlock(&bdev->internal.mutex); 7204 7205 if (thread != NULL) { 7206 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 7207 } else { 7208 bdev_disable_qos_done(ctx); 7209 } 7210 } 7211 7212 static void 7213 bdev_disable_qos_msg(struct spdk_io_channel_iter *i) 7214 { 7215 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 7216 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 7217 7218 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 7219 7220 spdk_for_each_channel_continue(i, 0); 7221 } 7222 7223 static void 7224 bdev_update_qos_rate_limit_msg(void *cb_arg) 7225 { 7226 struct set_qos_limit_ctx *ctx = cb_arg; 7227 struct spdk_bdev *bdev = ctx->bdev; 7228 7229 pthread_mutex_lock(&bdev->internal.mutex); 7230 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 7231 pthread_mutex_unlock(&bdev->internal.mutex); 7232 7233 bdev_set_qos_limit_done(ctx, 0); 7234 } 7235 7236 static void 7237 bdev_enable_qos_msg(struct spdk_io_channel_iter *i) 7238 { 7239 void *io_device = spdk_io_channel_iter_get_io_device(i); 7240 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 7241 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 7242 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 7243 7244 pthread_mutex_lock(&bdev->internal.mutex); 7245 bdev_enable_qos(bdev, bdev_ch); 7246 pthread_mutex_unlock(&bdev->internal.mutex); 7247 spdk_for_each_channel_continue(i, 0); 7248 } 7249 7250 static void 7251 bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status) 7252 { 7253 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7254 7255 bdev_set_qos_limit_done(ctx, status); 7256 } 7257 7258 static void 7259 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 7260 { 7261 int i; 7262 7263 assert(bdev->internal.qos != NULL); 7264 7265 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 7266 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 7267 bdev->internal.qos->rate_limits[i].limit = limits[i]; 7268 7269 if (limits[i] == 0) { 7270 bdev->internal.qos->rate_limits[i].limit = 7271 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 7272 } 7273 } 7274 } 7275 } 7276 7277 void 7278 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 7279 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 7280 { 7281 struct set_qos_limit_ctx *ctx; 7282 uint32_t 
limit_set_complement; 7283 uint64_t min_limit_per_sec; 7284 int i; 7285 bool disable_rate_limit = true; 7286 7287 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 7288 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 7289 continue; 7290 } 7291 7292 if (limits[i] > 0) { 7293 disable_rate_limit = false; 7294 } 7295 7296 if (bdev_qos_is_iops_rate_limit(i) == true) { 7297 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 7298 } else { 7299 /* Change from megabyte to byte rate limit */ 7300 limits[i] = limits[i] * 1024 * 1024; 7301 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 7302 } 7303 7304 limit_set_complement = limits[i] % min_limit_per_sec; 7305 if (limit_set_complement) { 7306 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 7307 limits[i], min_limit_per_sec); 7308 limits[i] += min_limit_per_sec - limit_set_complement; 7309 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 7310 } 7311 } 7312 7313 ctx = calloc(1, sizeof(*ctx)); 7314 if (ctx == NULL) { 7315 cb_fn(cb_arg, -ENOMEM); 7316 return; 7317 } 7318 7319 ctx->cb_fn = cb_fn; 7320 ctx->cb_arg = cb_arg; 7321 ctx->bdev = bdev; 7322 7323 pthread_mutex_lock(&bdev->internal.mutex); 7324 if (bdev->internal.qos_mod_in_progress) { 7325 pthread_mutex_unlock(&bdev->internal.mutex); 7326 free(ctx); 7327 cb_fn(cb_arg, -EAGAIN); 7328 return; 7329 } 7330 bdev->internal.qos_mod_in_progress = true; 7331 7332 if (disable_rate_limit == true && bdev->internal.qos) { 7333 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 7334 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 7335 (bdev->internal.qos->rate_limits[i].limit > 0 && 7336 bdev->internal.qos->rate_limits[i].limit != 7337 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 7338 disable_rate_limit = false; 7339 break; 7340 } 7341 } 7342 } 7343 7344 if (disable_rate_limit == false) { 7345 if (bdev->internal.qos == NULL) { 7346 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 7347 if (!bdev->internal.qos) { 7348 pthread_mutex_unlock(&bdev->internal.mutex); 7349 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 7350 bdev_set_qos_limit_done(ctx, -ENOMEM); 7351 return; 7352 } 7353 } 7354 7355 if (bdev->internal.qos->thread == NULL) { 7356 /* Enabling */ 7357 bdev_set_qos_rate_limits(bdev, limits); 7358 7359 spdk_for_each_channel(__bdev_to_io_dev(bdev), 7360 bdev_enable_qos_msg, ctx, 7361 bdev_enable_qos_done); 7362 } else { 7363 /* Updating */ 7364 bdev_set_qos_rate_limits(bdev, limits); 7365 7366 spdk_thread_send_msg(bdev->internal.qos->thread, 7367 bdev_update_qos_rate_limit_msg, ctx); 7368 } 7369 } else { 7370 if (bdev->internal.qos != NULL) { 7371 bdev_set_qos_rate_limits(bdev, limits); 7372 7373 /* Disabling */ 7374 spdk_for_each_channel(__bdev_to_io_dev(bdev), 7375 bdev_disable_qos_msg, ctx, 7376 bdev_disable_qos_msg_done); 7377 } else { 7378 pthread_mutex_unlock(&bdev->internal.mutex); 7379 bdev_set_qos_limit_done(ctx, 0); 7380 return; 7381 } 7382 } 7383 7384 pthread_mutex_unlock(&bdev->internal.mutex); 7385 } 7386 7387 struct spdk_bdev_histogram_ctx { 7388 spdk_bdev_histogram_status_cb cb_fn; 7389 void *cb_arg; 7390 struct spdk_bdev *bdev; 7391 int status; 7392 }; 7393 7394 static void 7395 bdev_histogram_disable_channel_cb(struct spdk_io_channel_iter *i, int status) 7396 { 7397 struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7398 7399 pthread_mutex_lock(&ctx->bdev->internal.mutex); 7400 ctx->bdev->internal.histogram_in_progress = false; 7401 
pthread_mutex_unlock(&ctx->bdev->internal.mutex); 7402 ctx->cb_fn(ctx->cb_arg, ctx->status); 7403 free(ctx); 7404 } 7405 7406 static void 7407 bdev_histogram_disable_channel(struct spdk_io_channel_iter *i) 7408 { 7409 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 7410 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7411 7412 if (ch->histogram != NULL) { 7413 spdk_histogram_data_free(ch->histogram); 7414 ch->histogram = NULL; 7415 } 7416 spdk_for_each_channel_continue(i, 0); 7417 } 7418 7419 static void 7420 bdev_histogram_enable_channel_cb(struct spdk_io_channel_iter *i, int status) 7421 { 7422 struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7423 7424 if (status != 0) { 7425 ctx->status = status; 7426 ctx->bdev->internal.histogram_enabled = false; 7427 spdk_for_each_channel(__bdev_to_io_dev(ctx->bdev), bdev_histogram_disable_channel, ctx, 7428 bdev_histogram_disable_channel_cb); 7429 } else { 7430 pthread_mutex_lock(&ctx->bdev->internal.mutex); 7431 ctx->bdev->internal.histogram_in_progress = false; 7432 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 7433 ctx->cb_fn(ctx->cb_arg, ctx->status); 7434 free(ctx); 7435 } 7436 } 7437 7438 static void 7439 bdev_histogram_enable_channel(struct spdk_io_channel_iter *i) 7440 { 7441 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 7442 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7443 int status = 0; 7444 7445 if (ch->histogram == NULL) { 7446 ch->histogram = spdk_histogram_data_alloc(); 7447 if (ch->histogram == NULL) { 7448 status = -ENOMEM; 7449 } 7450 } 7451 7452 spdk_for_each_channel_continue(i, status); 7453 } 7454 7455 void 7456 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 7457 void *cb_arg, bool enable) 7458 { 7459 struct spdk_bdev_histogram_ctx *ctx; 7460 7461 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 7462 if (ctx == NULL) { 7463 cb_fn(cb_arg, -ENOMEM); 7464 return; 7465 } 7466 7467 ctx->bdev = bdev; 7468 ctx->status = 0; 7469 ctx->cb_fn = cb_fn; 7470 ctx->cb_arg = cb_arg; 7471 7472 pthread_mutex_lock(&bdev->internal.mutex); 7473 if (bdev->internal.histogram_in_progress) { 7474 pthread_mutex_unlock(&bdev->internal.mutex); 7475 free(ctx); 7476 cb_fn(cb_arg, -EAGAIN); 7477 return; 7478 } 7479 7480 bdev->internal.histogram_in_progress = true; 7481 pthread_mutex_unlock(&bdev->internal.mutex); 7482 7483 bdev->internal.histogram_enabled = enable; 7484 7485 if (enable) { 7486 /* Allocate histogram for each channel */ 7487 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_enable_channel, ctx, 7488 bdev_histogram_enable_channel_cb); 7489 } else { 7490 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_disable_channel, ctx, 7491 bdev_histogram_disable_channel_cb); 7492 } 7493 } 7494 7495 struct spdk_bdev_histogram_data_ctx { 7496 spdk_bdev_histogram_data_cb cb_fn; 7497 void *cb_arg; 7498 struct spdk_bdev *bdev; 7499 /** merged histogram data from all channels */ 7500 struct spdk_histogram_data *histogram; 7501 }; 7502 7503 static void 7504 bdev_histogram_get_channel_cb(struct spdk_io_channel_iter *i, int status) 7505 { 7506 struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7507 7508 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 7509 free(ctx); 7510 } 7511 7512 static void 7513 bdev_histogram_get_channel(struct spdk_io_channel_iter *i) 7514 { 7515 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 7516 struct spdk_bdev_channel *ch = 

struct spdk_bdev_histogram_data_ctx {
	spdk_bdev_histogram_data_cb cb_fn;
	void *cb_arg;
	struct spdk_bdev *bdev;
	/** merged histogram data from all channels */
	struct spdk_histogram_data *histogram;
};

static void
bdev_histogram_get_channel_cb(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	ctx->cb_fn(ctx->cb_arg, status, ctx->histogram);
	free(ctx);
}

static void
bdev_histogram_get_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	int status = 0;

	if (ch->histogram == NULL) {
		status = -EFAULT;
	} else {
		spdk_histogram_data_merge(ctx->histogram, ch->histogram);
	}

	spdk_for_each_channel_continue(i, status);
}

void
spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram,
			spdk_bdev_histogram_data_cb cb_fn,
			void *cb_arg)
{
	struct spdk_bdev_histogram_data_ctx *ctx;

	ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx));
	if (ctx == NULL) {
		cb_fn(cb_arg, -ENOMEM, NULL);
		return;
	}

	ctx->bdev = bdev;
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	ctx->histogram = histogram;

	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_get_channel, ctx,
			      bdev_histogram_get_channel_cb);
}

size_t
spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events,
			   size_t max_events)
{
	struct media_event_entry *entry;
	size_t num_events = 0;

	for (; num_events < max_events; ++num_events) {
		entry = TAILQ_FIRST(&desc->pending_media_events);
		if (entry == NULL) {
			break;
		}

		events[num_events] = entry->event;
		TAILQ_REMOVE(&desc->pending_media_events, entry, tailq);
		TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq);
	}

	return num_events;
}

int
spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events,
			    size_t num_events)
{
	struct spdk_bdev_desc *desc;
	struct media_event_entry *entry;
	size_t event_id;
	int rc = 0;

	assert(bdev->media_events);

	pthread_mutex_lock(&bdev->internal.mutex);
	TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
		if (desc->write) {
			break;
		}
	}

	if (desc == NULL || desc->media_events_buffer == NULL) {
		rc = -ENODEV;
		goto out;
	}

	for (event_id = 0; event_id < num_events; ++event_id) {
		entry = TAILQ_FIRST(&desc->free_media_events);
		if (entry == NULL) {
			break;
		}

		TAILQ_REMOVE(&desc->free_media_events, entry, tailq);
		TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq);
		entry->event = events[event_id];
	}

	rc = event_id;
out:
	pthread_mutex_unlock(&bdev->internal.mutex);
	return rc;
}

void
spdk_bdev_notify_media_management(struct spdk_bdev *bdev)
{
	struct spdk_bdev_desc *desc;

	pthread_mutex_lock(&bdev->internal.mutex);
	TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
		if (!TAILQ_EMPTY(&desc->pending_media_events)) {
			desc->callback.event_fn(SPDK_BDEV_EVENT_MEDIA_MANAGEMENT, bdev,
						desc->callback.ctx);
		}
	}
	pthread_mutex_unlock(&bdev->internal.mutex);
}
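
/*
 * Illustrative sketch (not part of the library): the media event flow as
 * implemented above. A bdev module that declared media_events support queues
 * events on the first writable descriptor and then notifies it; the
 * application drains them from its event callback. The buffer name below is
 * an assumption for the example.
 *
 *	// bdev module side:
 *	spdk_bdev_push_media_events(bdev, events, num_events);
 *	spdk_bdev_notify_media_management(bdev);
 *
 *	// application side, inside its SPDK_BDEV_EVENT_MEDIA_MANAGEMENT handler:
 *	struct spdk_bdev_media_event buf[16];
 *	size_t n = spdk_bdev_get_media_events(desc, buf, SPDK_COUNTOF(buf));
 */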

struct locked_lba_range_ctx {
	struct lba_range range;
	struct spdk_bdev *bdev;
	struct lba_range *current_range;
	struct lba_range *owner_range;
	struct spdk_poller *poller;
	lock_range_cb cb_fn;
	void *cb_arg;
};

static void
bdev_lock_error_cleanup_cb(struct spdk_io_channel_iter *i, int status)
{
	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	ctx->cb_fn(ctx->cb_arg, -ENOMEM);
	free(ctx);
}

static void
bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i);

static void
bdev_lock_lba_range_cb(struct spdk_io_channel_iter *i, int status)
{
	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bdev *bdev = ctx->bdev;

	if (status == -ENOMEM) {
		/* One of the channels could not allocate a range object.
		 * So we have to go back and clean up any ranges that were
		 * allocated successfully before we return error status to
		 * the caller. We can reuse the unlock function to do that
		 * clean up.
		 */
		spdk_for_each_channel(__bdev_to_io_dev(bdev),
				      bdev_unlock_lba_range_get_channel, ctx,
				      bdev_lock_error_cleanup_cb);
		return;
	}

	/* All channels have locked this range and no I/O overlapping the range
	 * is outstanding! Set the owner_ch for the range object for the
	 * locking channel, so that this channel will know that it is allowed
	 * to write to this range.
	 */
	ctx->owner_range->owner_ch = ctx->range.owner_ch;
	ctx->cb_fn(ctx->cb_arg, status);

	/* Don't free the ctx here. Its range is still in the bdev's global list
	 * of locked ranges, and will be removed and freed when this range is
	 * later unlocked.
	 */
}

static int
bdev_lock_lba_range_check_io(void *_i)
{
	struct spdk_io_channel_iter *i = _i;
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct lba_range *range = ctx->current_range;
	struct spdk_bdev_io *bdev_io;

	spdk_poller_unregister(&ctx->poller);

	/* The range is now in the locked_ranges, so no new IO can be submitted to this
	 * range. But we need to wait until all outstanding IO overlapping with this range
	 * have completed.
	 */
	TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) {
		if (bdev_io_range_is_locked(bdev_io, range)) {
			ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100);
			return SPDK_POLLER_BUSY;
		}
	}

	spdk_for_each_channel_continue(i, 0);
	return SPDK_POLLER_BUSY;
}

static void
bdev_lock_lba_range_get_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct lba_range *range;

	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (range->length == ctx->range.length &&
		    range->offset == ctx->range.offset &&
		    range->locked_ctx == ctx->range.locked_ctx) {
			/* This range already exists on this channel, so don't add
			 * it again. This can happen when a new channel is created
			 * while the for_each_channel operation is in progress.
			 * Do not check for outstanding I/O in that case, since the
			 * range was locked before any I/O could be submitted to the
			 * new channel.
			 */
			spdk_for_each_channel_continue(i, 0);
			return;
		}
	}

	range = calloc(1, sizeof(*range));
	if (range == NULL) {
		spdk_for_each_channel_continue(i, -ENOMEM);
		return;
	}

	range->length = ctx->range.length;
	range->offset = ctx->range.offset;
	range->locked_ctx = ctx->range.locked_ctx;
	ctx->current_range = range;
	if (ctx->range.owner_ch == ch) {
		/* This is the range object for the channel that will hold
		 * the lock. Store it in the ctx object so that we can easily
		 * set its owner_ch after the lock is finally acquired.
		 */
		ctx->owner_range = range;
	}
	TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq);
	bdev_lock_lba_range_check_io(i);
}

static void
bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx)
{
	assert(spdk_get_thread() == spdk_io_channel_get_thread(ctx->range.owner_ch->channel));

	/* We will add a copy of this range to each channel now. */
	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_lock_lba_range_get_channel, ctx,
			      bdev_lock_lba_range_cb);
}

static bool
bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq)
{
	struct lba_range *r;

	TAILQ_FOREACH(r, tailq, tailq) {
		if (bdev_lba_range_overlapped(range, r)) {
			return true;
		}
	}
	return false;
}

static int
bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		    uint64_t offset, uint64_t length,
		    lock_range_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct locked_lba_range_ctx *ctx;

	if (cb_arg == NULL) {
		SPDK_ERRLOG("cb_arg must not be NULL\n");
		return -EINVAL;
	}

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}

	ctx->range.offset = offset;
	ctx->range.length = length;
	ctx->range.owner_ch = ch;
	ctx->range.locked_ctx = cb_arg;
	ctx->bdev = bdev;
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	pthread_mutex_lock(&bdev->internal.mutex);
	if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) {
		/* There is an active lock overlapping with this range.
		 * Put it on the pending list until this range no
		 * longer overlaps with another.
		 */
		TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq);
	} else {
		TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq);
		bdev_lock_lba_range_ctx(bdev, ctx);
	}
	pthread_mutex_unlock(&bdev->internal.mutex);
	return 0;
}

static void
bdev_lock_lba_range_ctx_msg(void *_ctx)
{
	struct locked_lba_range_ctx *ctx = _ctx;

	bdev_lock_lba_range_ctx(ctx->bdev, ctx);
}
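
/*
 * Illustrative sketch (not part of the library): bdev_lock_lba_range() above
 * and bdev_unlock_lba_range() below are used as a pair by code inside the bdev
 * layer (e.g. the compare-and-write emulation path) that needs temporary
 * exclusive access to a block range on one channel. The callback names below
 * are assumptions for the example; note that cb_arg doubles as the lock's
 * identity (locked_ctx), so the same pointer must be passed to the matching
 * unlock call.
 *
 *	static void
 *	range_locked_cb(void *cb_arg, int status)
 *	{
 *		// status == 0: the range is quiesced on every channel.
 *		// Perform the protected operation, then call
 *		// bdev_unlock_lba_range(desc, ch, offset, length,
 *		//			 range_unlocked_cb, cb_arg);
 *	}
 *
 *	rc = bdev_lock_lba_range(desc, ch, offset, length, range_locked_cb, cb_arg);
 */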

static void
bdev_unlock_lba_range_cb(struct spdk_io_channel_iter *i, int status)
{
	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct locked_lba_range_ctx *pending_ctx;
	struct spdk_bdev_channel *ch = ctx->range.owner_ch;
	struct spdk_bdev *bdev = ch->bdev;
	struct lba_range *range, *tmp;

	pthread_mutex_lock(&bdev->internal.mutex);
	/* Check if there are any pending locked ranges that overlap with this range
	 * that was just unlocked. If there are, check that they don't overlap with any
	 * other locked ranges before calling bdev_lock_lba_range_ctx which will start
	 * the lock process.
	 */
	TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) {
		if (bdev_lba_range_overlapped(range, &ctx->range) &&
		    !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) {
			TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq);
			pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
			TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq);
			spdk_thread_send_msg(spdk_io_channel_get_thread(pending_ctx->range.owner_ch->channel),
					     bdev_lock_lba_range_ctx_msg, pending_ctx);
		}
	}
	pthread_mutex_unlock(&bdev->internal.mutex);

	ctx->cb_fn(ctx->cb_arg, status);
	free(ctx);
}

static void
bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	TAILQ_HEAD(, spdk_bdev_io) io_locked;
	struct spdk_bdev_io *bdev_io;
	struct lba_range *range;

	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (ctx->range.offset == range->offset &&
		    ctx->range.length == range->length &&
		    ctx->range.locked_ctx == range->locked_ctx) {
			TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
			free(range);
			break;
		}
	}

	/* Note: we should almost always be able to assert that the range specified
	 * was found. But there are some very rare corner cases where a new channel
	 * gets created simultaneously with a range unlock, where this function
	 * would execute on that new channel and wouldn't have the range.
	 * We also use this to clean up range allocations when a later allocation
	 * fails in the locking path.
	 * So we can't actually assert() here.
	 */

	/* Swap the locked IO into a temporary list, and then try to submit them again.
	 * We could hyper-optimize this to only resubmit locked I/O that overlap
	 * with the range that was just unlocked, but this isn't a performance path so
	 * we go for simplicity here.
	 */
	TAILQ_INIT(&io_locked);
	TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link);
	while (!TAILQ_EMPTY(&io_locked)) {
		bdev_io = TAILQ_FIRST(&io_locked);
		TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link);
		bdev_io_submit(bdev_io);
	}

	spdk_for_each_channel_continue(i, 0);
}
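
/*
 * Unlock a range previously locked with bdev_lock_lba_range(). The unlock must
 * be issued from the channel that owns the lock, and offset, length and cb_arg
 * must match the original lock request exactly; otherwise -EINVAL is returned
 * below.
 */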
static int
bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		      uint64_t offset, uint64_t length,
		      lock_range_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct locked_lba_range_ctx *ctx;
	struct lba_range *range;
	bool range_found = false;

	/* Let's make sure the specified channel actually has a lock on
	 * the specified range. Note that the range must match exactly.
	 */
	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    range->owner_ch == ch && range->locked_ctx == cb_arg) {
			range_found = true;
			break;
		}
	}

	if (!range_found) {
		return -EINVAL;
	}

	pthread_mutex_lock(&bdev->internal.mutex);
	/* We confirmed that this channel has locked the specified range. To
	 * start the unlock process, we find the range in the bdev's locked_ranges
	 * and remove it. This ensures new channels don't inherit the locked range.
	 * Then we will send a message to each channel (including the one specified
	 * here) to remove the range from its per-channel list.
	 */
	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    range->locked_ctx == cb_arg) {
			break;
		}
	}
	if (range == NULL) {
		assert(false);
		pthread_mutex_unlock(&bdev->internal.mutex);
		return -EINVAL;
	}
	TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq);
	ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
	pthread_mutex_unlock(&bdev->internal.mutex);

	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unlock_lba_range_get_channel, ctx,
			      bdev_unlock_lba_range_cb);
	return 0;
}

int
spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains,
			     int array_size)
{
	if (!bdev) {
		return -EINVAL;
	}

	if (bdev->fn_table->get_memory_domains) {
		return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size);
	}

	return 0;
}

struct spdk_bdev_for_each_io_ctx {
	void *ctx;
	spdk_bdev_io_fn fn;
	spdk_bdev_for_each_io_cb cb;
};

static void
bdev_channel_for_each_io(struct spdk_io_channel_iter *i)
{
	struct spdk_bdev_for_each_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch);
	struct spdk_bdev_io *bdev_io;
	int rc = 0;

	TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
		rc = ctx->fn(ctx->ctx, bdev_io);
		if (rc != 0) {
			break;
		}
	}

	spdk_for_each_channel_continue(i, rc);
}

static void
bdev_for_each_io_done(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_for_each_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	ctx->cb(ctx->ctx, status);

	free(ctx);
}

void
spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn,
			   spdk_bdev_for_each_io_cb cb)
{
	struct spdk_bdev_for_each_io_ctx *ctx;

	assert(fn != NULL && cb != NULL);

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		SPDK_ERRLOG("Failed to allocate context.\n");
		cb(_ctx, -ENOMEM);
		return;
	}

	ctx->ctx = _ctx;
	ctx->fn = fn;
	ctx->cb = cb;

	spdk_for_each_channel(__bdev_to_io_dev(bdev),
			      bdev_channel_for_each_io,
			      ctx,
			      bdev_for_each_io_done);
}
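
/*
 * Illustrative sketch (not part of the library): walking every I/O currently
 * submitted to any channel of a bdev, e.g. to count them. The helper names
 * below are assumptions for the example. The per-I/O function runs on each
 * channel's thread; returning non-zero stops the walk and that value is
 * reported as the final status.
 *
 *	static int
 *	count_io(void *ctx, struct spdk_bdev_io *bdev_io)
 *	{
 *		(*(uint64_t *)ctx)++;
 *		return 0;
 *	}
 *
 *	static void
 *	count_io_done(void *ctx, int status)
 *	{
 *		// *(uint64_t *)ctx now holds the number of in-flight I/O
 *	}
 *
 *	spdk_bdev_for_each_bdev_io(bdev, &io_count, count_io, count_io_done);
 */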

SPDK_LOG_REGISTER_COMPONENT(bdev)

SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
{
	struct spdk_trace_tpoint_opts opts[] = {
		{
			"BDEV_IO_START", TRACE_BDEV_IO_START,
			OWNER_BDEV, OBJECT_BDEV_IO, 1,
			{
				{ "type", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
				{ "offset", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "len", SPDK_TRACE_ARG_TYPE_INT, 8 }
			}
		},
		{
			"BDEV_IO_DONE", TRACE_BDEV_IO_DONE,
			OWNER_BDEV, OBJECT_BDEV_IO, 0,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		},
		{
			"BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE,
			OWNER_BDEV, OBJECT_NONE, 1,
			{
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8 }
			}
		},
		{
			"BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY,
			OWNER_BDEV, OBJECT_NONE, 0,
			{
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8 }
			}
		},
	};

	spdk_trace_register_owner(OWNER_BDEV, 'b');
	spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0);
}