1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (c) Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #include "spdk/bdev.h" 10 11 #include "spdk/config.h" 12 #include "spdk/env.h" 13 #include "spdk/thread.h" 14 #include "spdk/likely.h" 15 #include "spdk/queue.h" 16 #include "spdk/nvme_spec.h" 17 #include "spdk/scsi_spec.h" 18 #include "spdk/notify.h" 19 #include "spdk/util.h" 20 #include "spdk/trace.h" 21 #include "spdk/dma.h" 22 23 #include "spdk/bdev_module.h" 24 #include "spdk/log.h" 25 #include "spdk/string.h" 26 27 #include "bdev_internal.h" 28 #include "spdk_internal/trace_defs.h" 29 30 #ifdef SPDK_CONFIG_VTUNE 31 #include "ittnotify.h" 32 #include "ittnotify_types.h" 33 int __itt_init_ittlib(const char *, __itt_group_id); 34 #endif 35 36 #define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1) 37 #define SPDK_BDEV_IO_CACHE_SIZE 256 38 #define SPDK_BDEV_AUTO_EXAMINE true 39 #define BUF_SMALL_POOL_SIZE 8191 40 #define BUF_LARGE_POOL_SIZE 1023 41 #define NOMEM_THRESHOLD_COUNT 8 42 43 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 44 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 45 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512 46 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000 47 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024) 48 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX 49 #define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000 50 51 #define SPDK_BDEV_POOL_ALIGNMENT 512 52 53 /* The maximum number of children requests for a UNMAP or WRITE ZEROES command 54 * when splitting into children requests at a time. 55 */ 56 #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8) 57 #define BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD 1000000 58 59 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 60 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 61 }; 62 63 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 64 65 RB_HEAD(bdev_name_tree, spdk_bdev_name); 66 67 static int 68 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2) 69 { 70 return strcmp(name1->name, name2->name); 71 } 72 73 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp); 74 75 struct spdk_bdev_mgr { 76 struct spdk_mempool *bdev_io_pool; 77 78 struct spdk_mempool *buf_small_pool; 79 struct spdk_mempool *buf_large_pool; 80 81 void *zero_buffer; 82 83 TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 84 85 struct spdk_bdev_list bdevs; 86 struct bdev_name_tree bdev_names; 87 88 bool init_complete; 89 bool module_init_complete; 90 91 pthread_mutex_t mutex; 92 93 #ifdef SPDK_CONFIG_VTUNE 94 __itt_domain *domain; 95 #endif 96 }; 97 98 static struct spdk_bdev_mgr g_bdev_mgr = { 99 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 100 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 101 .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names), 102 .init_complete = false, 103 .module_init_complete = false, 104 .mutex = PTHREAD_MUTEX_INITIALIZER, 105 }; 106 107 typedef void (*lock_range_cb)(void *ctx, int status); 108 109 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc); 110 111 struct lba_range { 112 uint64_t offset; 113 uint64_t length; 114 void *locked_ctx; 115 struct spdk_bdev_channel *owner_ch; 116 TAILQ_ENTRY(lba_range) tailq; 117 }; 118 119 static struct spdk_bdev_opts g_bdev_opts = { 120 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 121 .bdev_io_cache_size = 
					   SPDK_BDEV_IO_CACHE_SIZE,
	.bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE,
	.small_buf_pool_size = BUF_SMALL_POOL_SIZE,
	.large_buf_pool_size = BUF_LARGE_POOL_SIZE,
};

static spdk_bdev_init_cb g_init_cb_fn = NULL;
static void *g_init_cb_arg = NULL;

static spdk_bdev_fini_cb g_fini_cb_fn = NULL;
static void *g_fini_cb_arg = NULL;
static struct spdk_thread *g_fini_thread = NULL;

struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in the current timeslice (e.g., 1ms).
	 * For bytes, this may go negative: if an I/O is submitted while some bytes
	 * remain but the I/O is bigger than that amount, the excess is deducted
	 * from the next timeslice.
	 */
	int64_t remaining_this_timeslice;

	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO. */
	bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to update the quota for the submitted IO. */
	void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

struct spdk_bdev_qos {
	/** Rate limits, one entry per rate limit type. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Queue of I/O waiting to be issued. */
	bdev_io_tailq_t queued;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	bdev_io_stailq_t need_buf_small;
	bdev_io_stailq_t need_buf_large;

	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache.  Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t per_thread_cache_count;
	uint32_t bdev_io_cache_size;

	TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue;
};

/*
 * Per-module (or per-io_device) data.  Multiple bdevs built on the same io_device
 * queue their retry-pending I/O here, which makes it possible to retry sending
 * I/O to one bdev after I/O from another bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io.
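	 * The value is recomputed in _bdev_io_handle_no_mem() as
	 * spdk_max(io_outstanding / 2, io_outstanding - NOMEM_THRESHOLD_COUNT),
	 * e.g. 92 with 100 I/O outstanding, or 5 with only 10 outstanding.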
223 */ 224 uint64_t nomem_threshold; 225 226 /* I/O channel allocated by a bdev module */ 227 struct spdk_io_channel *shared_ch; 228 229 /* Refcount of bdev channels using this resource */ 230 uint32_t ref; 231 232 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 233 }; 234 235 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 236 #define BDEV_CH_QOS_ENABLED (1 << 1) 237 238 struct spdk_bdev_channel { 239 struct spdk_bdev *bdev; 240 241 /* The channel for the underlying device */ 242 struct spdk_io_channel *channel; 243 244 /* Per io_device per thread data */ 245 struct spdk_bdev_shared_resource *shared_resource; 246 247 struct spdk_bdev_io_stat stat; 248 249 /* 250 * Count of I/O submitted to the underlying dev module through this channel 251 * and waiting for completion. 252 */ 253 uint64_t io_outstanding; 254 255 /* 256 * List of all submitted I/Os including I/O that are generated via splitting. 257 */ 258 bdev_io_tailq_t io_submitted; 259 260 /* 261 * List of spdk_bdev_io that are currently queued because they write to a locked 262 * LBA range. 263 */ 264 bdev_io_tailq_t io_locked; 265 266 uint32_t flags; 267 268 struct spdk_histogram_data *histogram; 269 270 #ifdef SPDK_CONFIG_VTUNE 271 uint64_t start_tsc; 272 uint64_t interval_tsc; 273 __itt_string_handle *handle; 274 struct spdk_bdev_io_stat prev_stat; 275 #endif 276 277 bdev_io_tailq_t queued_resets; 278 279 lba_range_tailq_t locked_ranges; 280 }; 281 282 struct media_event_entry { 283 struct spdk_bdev_media_event event; 284 TAILQ_ENTRY(media_event_entry) tailq; 285 }; 286 287 #define MEDIA_EVENT_POOL_SIZE 64 288 289 struct spdk_bdev_desc { 290 struct spdk_bdev *bdev; 291 struct spdk_thread *thread; 292 struct { 293 spdk_bdev_event_cb_t event_fn; 294 void *ctx; 295 } callback; 296 bool closed; 297 bool write; 298 bool memory_domains_supported; 299 pthread_mutex_t mutex; 300 uint32_t refs; 301 TAILQ_HEAD(, media_event_entry) pending_media_events; 302 TAILQ_HEAD(, media_event_entry) free_media_events; 303 struct media_event_entry *media_events_buffer; 304 TAILQ_ENTRY(spdk_bdev_desc) link; 305 306 uint64_t timeout_in_sec; 307 spdk_bdev_io_timeout_cb cb_fn; 308 void *cb_arg; 309 struct spdk_poller *io_timeout_poller; 310 }; 311 312 struct spdk_bdev_iostat_ctx { 313 struct spdk_bdev_io_stat *stat; 314 spdk_bdev_get_device_stat_cb cb; 315 void *cb_arg; 316 }; 317 318 struct set_qos_limit_ctx { 319 void (*cb_fn)(void *cb_arg, int status); 320 void *cb_arg; 321 struct spdk_bdev *bdev; 322 }; 323 324 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 325 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 326 327 static inline void bdev_io_complete(void *ctx); 328 329 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 330 static void bdev_write_zero_buffer_next(void *_bdev_io); 331 332 static void bdev_enable_qos_msg(struct spdk_io_channel_iter *i); 333 static void bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status); 334 335 static int bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 336 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 337 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 338 struct spdk_bdev_ext_io_opts *opts, bool copy_opts); 339 static int bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 340 struct iovec *iov, int iovcnt, void *md_buf, 341 uint64_t offset_blocks, uint64_t num_blocks, 342 spdk_bdev_io_completion_cb cb, void *cb_arg, 343 
				       struct spdk_bdev_ext_io_opts *opts, bool copy_opts);

static int bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
			       uint64_t offset, uint64_t length,
			       lock_range_cb cb_fn, void *cb_arg);

static int bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
				 uint64_t offset, uint64_t length,
				 lock_range_cb cb_fn, void *cb_arg);

static inline void bdev_io_complete(void *ctx);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort);

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(small_buf_pool_size);
	SET_FIELD(large_buf_pool_size);

	/* Do not remove this statement. Always update it when adding a new field,
	 * and do not forget to add the SET_FIELD statement for the new field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}

int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	uint32_t min_pool_size;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
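	 * For example, with the default bdev_io_cache_size of 256 and an application
	 * running 4 SPDK threads, bdev_io_pool_size must be at least 256 * (4 + 1) = 1280
	 * (the thread count here is only illustrative).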
410 */ 411 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 412 if (opts->bdev_io_pool_size < min_pool_size) { 413 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 414 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 415 spdk_thread_get_count()); 416 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 417 return -1; 418 } 419 420 if (opts->small_buf_pool_size < BUF_SMALL_POOL_SIZE) { 421 SPDK_ERRLOG("small_buf_pool_size must be at least %" PRIu32 "\n", BUF_SMALL_POOL_SIZE); 422 return -1; 423 } 424 425 if (opts->large_buf_pool_size < BUF_LARGE_POOL_SIZE) { 426 SPDK_ERRLOG("large_buf_pool_size must be at least %" PRIu32 "\n", BUF_LARGE_POOL_SIZE); 427 return -1; 428 } 429 430 #define SET_FIELD(field) \ 431 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 432 g_bdev_opts.field = opts->field; \ 433 } \ 434 435 SET_FIELD(bdev_io_pool_size); 436 SET_FIELD(bdev_io_cache_size); 437 SET_FIELD(bdev_auto_examine); 438 SET_FIELD(small_buf_pool_size); 439 SET_FIELD(large_buf_pool_size); 440 441 g_bdev_opts.opts_size = opts->opts_size; 442 443 #undef SET_FIELD 444 445 return 0; 446 } 447 448 static struct spdk_bdev * 449 bdev_get_by_name(const char *bdev_name) 450 { 451 struct spdk_bdev_name find; 452 struct spdk_bdev_name *res; 453 454 find.name = (char *)bdev_name; 455 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 456 if (res != NULL) { 457 return res->bdev; 458 } 459 460 return NULL; 461 } 462 463 struct spdk_bdev * 464 spdk_bdev_get_by_name(const char *bdev_name) 465 { 466 struct spdk_bdev *bdev; 467 468 pthread_mutex_lock(&g_bdev_mgr.mutex); 469 bdev = bdev_get_by_name(bdev_name); 470 pthread_mutex_unlock(&g_bdev_mgr.mutex); 471 472 return bdev; 473 } 474 475 struct spdk_bdev_wait_for_examine_ctx { 476 struct spdk_poller *poller; 477 spdk_bdev_wait_for_examine_cb cb_fn; 478 void *cb_arg; 479 }; 480 481 static bool bdev_module_all_actions_completed(void); 482 483 static int 484 bdev_wait_for_examine_cb(void *arg) 485 { 486 struct spdk_bdev_wait_for_examine_ctx *ctx = arg; 487 488 if (!bdev_module_all_actions_completed()) { 489 return SPDK_POLLER_IDLE; 490 } 491 492 spdk_poller_unregister(&ctx->poller); 493 ctx->cb_fn(ctx->cb_arg); 494 free(ctx); 495 496 return SPDK_POLLER_BUSY; 497 } 498 499 int 500 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 501 { 502 struct spdk_bdev_wait_for_examine_ctx *ctx; 503 504 ctx = calloc(1, sizeof(*ctx)); 505 if (ctx == NULL) { 506 return -ENOMEM; 507 } 508 ctx->cb_fn = cb_fn; 509 ctx->cb_arg = cb_arg; 510 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 511 512 return 0; 513 } 514 515 struct spdk_bdev_examine_item { 516 char *name; 517 TAILQ_ENTRY(spdk_bdev_examine_item) link; 518 }; 519 520 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); 521 522 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 523 g_bdev_examine_allowlist); 524 525 static inline bool 526 bdev_examine_allowlist_check(const char *name) 527 { 528 struct spdk_bdev_examine_item *item; 529 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 530 if (strcmp(name, item->name) == 0) { 531 return true; 532 } 533 } 534 return false; 535 } 536 537 static inline void 538 bdev_examine_allowlist_free(void) 539 { 540 struct spdk_bdev_examine_item *item; 541 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 542 item = 
TAILQ_FIRST(&g_bdev_examine_allowlist); 543 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 544 free(item->name); 545 free(item); 546 } 547 } 548 549 static inline bool 550 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 551 { 552 struct spdk_bdev_alias *tmp; 553 if (bdev_examine_allowlist_check(bdev->name)) { 554 return true; 555 } 556 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 557 if (bdev_examine_allowlist_check(tmp->alias.name)) { 558 return true; 559 } 560 } 561 return false; 562 } 563 564 static inline bool 565 bdev_ok_to_examine(struct spdk_bdev *bdev) 566 { 567 if (g_bdev_opts.bdev_auto_examine) { 568 return true; 569 } else { 570 return bdev_in_examine_allowlist(bdev); 571 } 572 } 573 574 static void 575 bdev_examine(struct spdk_bdev *bdev) 576 { 577 struct spdk_bdev_module *module; 578 uint32_t action; 579 580 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 581 if (module->examine_config && bdev_ok_to_examine(bdev)) { 582 action = module->internal.action_in_progress; 583 module->internal.action_in_progress++; 584 module->examine_config(bdev); 585 if (action != module->internal.action_in_progress) { 586 SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n", 587 module->name); 588 } 589 } 590 } 591 592 if (bdev->internal.claim_module && bdev_ok_to_examine(bdev)) { 593 if (bdev->internal.claim_module->examine_disk) { 594 bdev->internal.claim_module->internal.action_in_progress++; 595 bdev->internal.claim_module->examine_disk(bdev); 596 } 597 return; 598 } 599 600 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 601 if (module->examine_disk && bdev_ok_to_examine(bdev)) { 602 module->internal.action_in_progress++; 603 module->examine_disk(bdev); 604 } 605 } 606 } 607 608 int 609 spdk_bdev_examine(const char *name) 610 { 611 struct spdk_bdev *bdev; 612 struct spdk_bdev_examine_item *item; 613 614 if (g_bdev_opts.bdev_auto_examine) { 615 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); 616 return -EINVAL; 617 } 618 619 if (bdev_examine_allowlist_check(name)) { 620 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 621 return -EEXIST; 622 } 623 624 item = calloc(1, sizeof(*item)); 625 if (!item) { 626 return -ENOMEM; 627 } 628 item->name = strdup(name); 629 if (!item->name) { 630 free(item); 631 return -ENOMEM; 632 } 633 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 634 635 bdev = spdk_bdev_get_by_name(name); 636 if (bdev) { 637 bdev_examine(bdev); 638 } 639 return 0; 640 } 641 642 static inline void 643 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 644 { 645 struct spdk_bdev_examine_item *item; 646 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 647 spdk_json_write_object_begin(w); 648 spdk_json_write_named_string(w, "method", "bdev_examine"); 649 spdk_json_write_named_object_begin(w, "params"); 650 spdk_json_write_named_string(w, "name", item->name); 651 spdk_json_write_object_end(w); 652 spdk_json_write_object_end(w); 653 } 654 } 655 656 struct spdk_bdev * 657 spdk_bdev_first(void) 658 { 659 struct spdk_bdev *bdev; 660 661 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 662 if (bdev) { 663 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 664 } 665 666 return bdev; 667 } 668 669 struct spdk_bdev * 670 spdk_bdev_next(struct spdk_bdev *prev) 671 { 672 struct spdk_bdev *bdev; 673 674 bdev = TAILQ_NEXT(prev, internal.link); 675 if (bdev) { 676 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 677 } 678 
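	/* Usage sketch (illustrative only, not part of this function's logic):
	 * a typical consumer walks the registered bdevs with this iterator pair,
	 * assuming spdk_bdev_get_name() from the public bdev API:
	 *
	 *   struct spdk_bdev *b;
	 *
	 *   for (b = spdk_bdev_first(); b != NULL; b = spdk_bdev_next(b)) {
	 *           SPDK_NOTICELOG("found bdev %s\n", spdk_bdev_get_name(b));
	 *   }
	 */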
679 return bdev; 680 } 681 682 static struct spdk_bdev * 683 _bdev_next_leaf(struct spdk_bdev *bdev) 684 { 685 while (bdev != NULL) { 686 if (bdev->internal.claim_module == NULL) { 687 return bdev; 688 } else { 689 bdev = TAILQ_NEXT(bdev, internal.link); 690 } 691 } 692 693 return bdev; 694 } 695 696 struct spdk_bdev * 697 spdk_bdev_first_leaf(void) 698 { 699 struct spdk_bdev *bdev; 700 701 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 702 703 if (bdev) { 704 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 705 } 706 707 return bdev; 708 } 709 710 struct spdk_bdev * 711 spdk_bdev_next_leaf(struct spdk_bdev *prev) 712 { 713 struct spdk_bdev *bdev; 714 715 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 716 717 if (bdev) { 718 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 719 } 720 721 return bdev; 722 } 723 724 static inline bool 725 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 726 { 727 return bdev_io->internal.ext_opts && bdev_io->internal.ext_opts->memory_domain; 728 } 729 730 void 731 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 732 { 733 struct iovec *iovs; 734 735 if (bdev_io->u.bdev.iovs == NULL) { 736 bdev_io->u.bdev.iovs = &bdev_io->iov; 737 bdev_io->u.bdev.iovcnt = 1; 738 } 739 740 iovs = bdev_io->u.bdev.iovs; 741 742 assert(iovs != NULL); 743 assert(bdev_io->u.bdev.iovcnt >= 1); 744 745 iovs[0].iov_base = buf; 746 iovs[0].iov_len = len; 747 } 748 749 void 750 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 751 { 752 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 753 bdev_io->u.bdev.md_buf = md_buf; 754 } 755 756 static bool 757 _is_buf_allocated(const struct iovec *iovs) 758 { 759 if (iovs == NULL) { 760 return false; 761 } 762 763 return iovs[0].iov_base != NULL; 764 } 765 766 static bool 767 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 768 { 769 int i; 770 uintptr_t iov_base; 771 772 if (spdk_likely(alignment == 1)) { 773 return true; 774 } 775 776 for (i = 0; i < iovcnt; i++) { 777 iov_base = (uintptr_t)iovs[i].iov_base; 778 if ((iov_base & (alignment - 1)) != 0) { 779 return false; 780 } 781 } 782 783 return true; 784 } 785 786 static void 787 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 788 { 789 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 790 void *buf; 791 792 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 793 buf = bdev_io->internal.buf; 794 bdev_io->internal.buf = NULL; 795 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 796 bdev_io->internal.get_aux_buf_cb = NULL; 797 } else { 798 assert(bdev_io->internal.get_buf_cb != NULL); 799 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 800 bdev_io->internal.get_buf_cb = NULL; 801 } 802 } 803 804 static void 805 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 806 { 807 struct spdk_bdev_io *bdev_io = ctx; 808 809 if (rc) { 810 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 811 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 812 } 813 bdev_io_get_buf_complete(bdev_io, !rc); 814 } 815 816 static void 817 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 818 { 819 int rc = 0; 820 821 /* save original md_buf */ 822 bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 823 bdev_io->internal.orig_md_iov.iov_len = len; 824 bdev_io->internal.bounce_md_iov.iov_base = md_buf; 825 bdev_io->internal.bounce_md_iov.iov_len = 
len; 826 /* set bounce md_buf */ 827 bdev_io->u.bdev.md_buf = md_buf; 828 829 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 830 if (bdev_io_use_memory_domain(bdev_io)) { 831 rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain, 832 bdev_io->internal.ext_opts->memory_domain_ctx, 833 &bdev_io->internal.orig_md_iov, 1, 834 &bdev_io->internal.bounce_md_iov, 1, 835 bdev_io->internal.data_transfer_cpl, 836 bdev_io); 837 if (rc == 0) { 838 /* Continue to submit IO in completion callback */ 839 return; 840 } 841 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 842 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain), rc); 843 } else { 844 memcpy(md_buf, bdev_io->internal.orig_md_iov.iov_base, bdev_io->internal.orig_md_iov.iov_len); 845 } 846 } 847 848 assert(bdev_io->internal.data_transfer_cpl); 849 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 850 } 851 852 static void 853 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 854 { 855 struct spdk_bdev *bdev = bdev_io->bdev; 856 uint64_t md_len; 857 void *buf; 858 859 if (spdk_bdev_is_md_separate(bdev)) { 860 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 861 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 862 863 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 864 865 if (bdev_io->u.bdev.md_buf != NULL) { 866 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 867 return; 868 } else { 869 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 870 } 871 } 872 873 bdev_io_get_buf_complete(bdev_io, true); 874 } 875 876 static void 877 _bdev_io_pull_bounce_data_buf_done(void *ctx, int rc) 878 { 879 struct spdk_bdev_io *bdev_io = ctx; 880 881 if (rc) { 882 SPDK_ERRLOG("Failed to get data buffer\n"); 883 assert(bdev_io->internal.data_transfer_cpl); 884 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 885 return; 886 } 887 888 _bdev_io_set_md_buf(bdev_io); 889 } 890 891 static void 892 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 893 bdev_copy_bounce_buffer_cpl cpl_cb) 894 { 895 int rc = 0; 896 897 bdev_io->internal.data_transfer_cpl = cpl_cb; 898 /* save original iovec */ 899 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 900 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 901 /* set bounce iov */ 902 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 903 bdev_io->u.bdev.iovcnt = 1; 904 /* set bounce buffer for this operation */ 905 bdev_io->u.bdev.iovs[0].iov_base = buf; 906 bdev_io->u.bdev.iovs[0].iov_len = len; 907 /* if this is write path, copy data from original buffer to bounce buffer */ 908 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 909 if (bdev_io_use_memory_domain(bdev_io)) { 910 rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain, 911 bdev_io->internal.ext_opts->memory_domain_ctx, 912 bdev_io->internal.orig_iovs, 913 (uint32_t) bdev_io->internal.orig_iovcnt, 914 bdev_io->u.bdev.iovs, 1, 915 _bdev_io_pull_bounce_data_buf_done, 916 bdev_io); 917 if (rc == 0) { 918 /* Continue to submit IO in completion callback */ 919 return; 920 } 921 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 922 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 923 } else { 924 spdk_copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt); 925 } 926 } 927 928 _bdev_io_pull_bounce_data_buf_done(bdev_io, rc); 929 } 930 931 static void 932 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, 
uint64_t len) 933 { 934 struct spdk_bdev *bdev = bdev_io->bdev; 935 bool buf_allocated; 936 uint64_t alignment; 937 void *aligned_buf; 938 939 bdev_io->internal.buf = buf; 940 941 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 942 bdev_io_get_buf_complete(bdev_io, true); 943 return; 944 } 945 946 alignment = spdk_bdev_get_buf_align(bdev); 947 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 948 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 949 950 if (buf_allocated) { 951 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 952 /* Continue in completion callback */ 953 return; 954 } else { 955 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 956 } 957 958 _bdev_io_set_md_buf(bdev_io); 959 } 960 961 static void 962 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 963 { 964 struct spdk_bdev *bdev = bdev_io->bdev; 965 struct spdk_mempool *pool; 966 struct spdk_bdev_io *tmp; 967 bdev_io_stailq_t *stailq; 968 struct spdk_bdev_mgmt_channel *ch; 969 uint64_t md_len, alignment; 970 971 md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 972 alignment = spdk_bdev_get_buf_align(bdev); 973 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 974 975 if (buf_len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + 976 SPDK_BDEV_POOL_ALIGNMENT) { 977 pool = g_bdev_mgr.buf_small_pool; 978 stailq = &ch->need_buf_small; 979 } else { 980 pool = g_bdev_mgr.buf_large_pool; 981 stailq = &ch->need_buf_large; 982 } 983 984 if (STAILQ_EMPTY(stailq)) { 985 spdk_mempool_put(pool, buf); 986 } else { 987 tmp = STAILQ_FIRST(stailq); 988 STAILQ_REMOVE_HEAD(stailq, internal.buf_link); 989 _bdev_io_set_buf(tmp, buf, tmp->internal.buf_len); 990 } 991 } 992 993 static void 994 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 995 { 996 assert(bdev_io->internal.buf != NULL); 997 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len); 998 bdev_io->internal.buf = NULL; 999 } 1000 1001 void 1002 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1003 { 1004 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1005 1006 assert(buf != NULL); 1007 _bdev_io_put_buf(bdev_io, buf, len); 1008 } 1009 1010 static void 1011 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 1012 { 1013 struct spdk_bdev *bdev = bdev_ch->bdev; 1014 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1015 struct spdk_bdev_io *bdev_io; 1016 1017 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 1018 /* 1019 * Allow some more I/O to complete before retrying the nomem_io queue. 1020 * Some drivers (such as nvme) cannot immediately take a new I/O in 1021 * the context of a completion, because the resources for the I/O are 1022 * not released until control returns to the bdev poller. Also, we 1023 * may require several small I/O to complete before a larger I/O 1024 * (that requires splitting) can be submitted. 
1025 */ 1026 return; 1027 } 1028 1029 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1030 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 1031 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 1032 bdev_io->internal.ch->io_outstanding++; 1033 shared_resource->io_outstanding++; 1034 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1035 bdev_io->internal.error.nvme.cdw0 = 0; 1036 bdev_io->num_retries++; 1037 bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 1038 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 1039 break; 1040 } 1041 } 1042 } 1043 1044 static inline void 1045 _bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1046 struct spdk_bdev_shared_resource *shared_resource) 1047 { 1048 assert(bdev_ch->io_outstanding > 0); 1049 assert(shared_resource->io_outstanding > 0); 1050 bdev_ch->io_outstanding--; 1051 shared_resource->io_outstanding--; 1052 } 1053 1054 static inline bool 1055 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io) 1056 { 1057 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1058 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1059 1060 if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) { 1061 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 1062 /* 1063 * Wait for some of the outstanding I/O to complete before we 1064 * retry any of the nomem_io. Normally we will wait for 1065 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 1066 * depth channels we will instead wait for half to complete. 1067 */ 1068 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 1069 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 1070 return true; 1071 } 1072 1073 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1074 bdev_ch_retry_io(bdev_ch); 1075 } 1076 1077 return false; 1078 } 1079 1080 static void 1081 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1082 { 1083 struct spdk_bdev_io *bdev_io = ctx; 1084 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1085 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1086 1087 if (rc) { 1088 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1089 } 1090 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1091 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). 
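	 * Returning the buffer now also lets any bdev_io parked on need_buf_small or
	 * need_buf_large pick it up immediately in _bdev_io_put_buf(), rather than
	 * only once the caller gets around to spdk_bdev_free_io().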
1092 */ 1093 bdev_io_put_buf(bdev_io); 1094 1095 /* Continue with IO completion flow */ 1096 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 1097 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 1098 return; 1099 } 1100 1101 bdev_io_complete(bdev_io); 1102 } 1103 1104 static inline void 1105 _bdev_io_push_bounce_md_buffer(struct spdk_bdev_io *bdev_io) 1106 { 1107 int rc = 0; 1108 1109 /* do the same for metadata buffer */ 1110 if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) { 1111 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1112 1113 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1114 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 1115 if (bdev_io_use_memory_domain(bdev_io)) { 1116 /* If memory domain is used then we need to call async push function */ 1117 rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain, 1118 bdev_io->internal.ext_opts->memory_domain_ctx, 1119 &bdev_io->internal.orig_md_iov, 1120 (uint32_t)bdev_io->internal.orig_iovcnt, 1121 &bdev_io->internal.bounce_md_iov, 1, 1122 bdev_io->internal.data_transfer_cpl, 1123 bdev_io); 1124 if (rc == 0) { 1125 /* Continue IO completion in async callback */ 1126 return; 1127 } 1128 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1129 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 1130 } else { 1131 memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1132 bdev_io->internal.orig_md_iov.iov_len); 1133 } 1134 } 1135 } 1136 1137 assert(bdev_io->internal.data_transfer_cpl); 1138 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1139 } 1140 1141 static void 1142 _bdev_io_push_bounce_data_buffer_done(void *ctx, int rc) 1143 { 1144 struct spdk_bdev_io *bdev_io = ctx; 1145 1146 assert(bdev_io->internal.data_transfer_cpl); 1147 1148 if (rc) { 1149 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1150 return; 1151 } 1152 1153 /* set original buffer for this io */ 1154 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 1155 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 1156 /* disable bouncing buffer for this io */ 1157 bdev_io->internal.orig_iovcnt = 0; 1158 bdev_io->internal.orig_iovs = NULL; 1159 1160 _bdev_io_push_bounce_md_buffer(bdev_io); 1161 } 1162 1163 static inline void 1164 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1165 { 1166 int rc = 0; 1167 1168 bdev_io->internal.data_transfer_cpl = cpl_cb; 1169 1170 /* if this is read path, copy data from bounce buffer to original buffer */ 1171 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1172 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 1173 if (bdev_io_use_memory_domain(bdev_io)) { 1174 /* If memory domain is used then we need to call async push function */ 1175 rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain, 1176 bdev_io->internal.ext_opts->memory_domain_ctx, 1177 bdev_io->internal.orig_iovs, 1178 (uint32_t)bdev_io->internal.orig_iovcnt, 1179 &bdev_io->internal.bounce_iov, 1, 1180 _bdev_io_push_bounce_data_buffer_done, 1181 bdev_io); 1182 if (rc == 0) { 1183 /* Continue IO completion in async callback */ 1184 return; 1185 } 1186 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1187 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 1188 } else { 1189 spdk_copy_buf_to_iovs(bdev_io->internal.orig_iovs, 1190 bdev_io->internal.orig_iovcnt, 1191 bdev_io->internal.bounce_iov.iov_base, 1192 
bdev_io->internal.bounce_iov.iov_len); 1193 } 1194 } 1195 1196 _bdev_io_push_bounce_data_buffer_done(bdev_io, rc); 1197 } 1198 1199 static void 1200 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1201 { 1202 struct spdk_bdev *bdev = bdev_io->bdev; 1203 struct spdk_mempool *pool; 1204 bdev_io_stailq_t *stailq; 1205 struct spdk_bdev_mgmt_channel *mgmt_ch; 1206 uint64_t alignment, md_len; 1207 void *buf; 1208 1209 alignment = spdk_bdev_get_buf_align(bdev); 1210 md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1211 1212 if (len + alignment + md_len > SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) + 1213 SPDK_BDEV_POOL_ALIGNMENT) { 1214 SPDK_ERRLOG("Length + alignment %" PRIu64 " is larger than allowed\n", 1215 len + alignment); 1216 bdev_io_get_buf_complete(bdev_io, false); 1217 return; 1218 } 1219 1220 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1221 1222 bdev_io->internal.buf_len = len; 1223 1224 if (len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + 1225 SPDK_BDEV_POOL_ALIGNMENT) { 1226 pool = g_bdev_mgr.buf_small_pool; 1227 stailq = &mgmt_ch->need_buf_small; 1228 } else { 1229 pool = g_bdev_mgr.buf_large_pool; 1230 stailq = &mgmt_ch->need_buf_large; 1231 } 1232 1233 buf = spdk_mempool_get(pool); 1234 if (!buf) { 1235 STAILQ_INSERT_TAIL(stailq, bdev_io, internal.buf_link); 1236 } else { 1237 _bdev_io_set_buf(bdev_io, buf, len); 1238 } 1239 } 1240 1241 void 1242 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1243 { 1244 struct spdk_bdev *bdev = bdev_io->bdev; 1245 uint64_t alignment; 1246 1247 assert(cb != NULL); 1248 bdev_io->internal.get_buf_cb = cb; 1249 1250 alignment = spdk_bdev_get_buf_align(bdev); 1251 1252 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1253 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1254 /* Buffer already present and aligned */ 1255 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1256 return; 1257 } 1258 1259 bdev_io_get_buf(bdev_io, len); 1260 } 1261 1262 static void 1263 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1264 bool success) 1265 { 1266 if (!success) { 1267 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1268 bdev_io_complete(bdev_io); 1269 } else { 1270 bdev_io_submit(bdev_io); 1271 } 1272 } 1273 1274 static void 1275 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1276 uint64_t len) 1277 { 1278 assert(cb != NULL); 1279 bdev_io->internal.get_buf_cb = cb; 1280 1281 bdev_io_get_buf(bdev_io, len); 1282 } 1283 1284 void 1285 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1286 { 1287 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1288 1289 assert(cb != NULL); 1290 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1291 bdev_io->internal.get_aux_buf_cb = cb; 1292 bdev_io_get_buf(bdev_io, len); 1293 } 1294 1295 static int 1296 bdev_module_get_max_ctx_size(void) 1297 { 1298 struct spdk_bdev_module *bdev_module; 1299 int max_bdev_module_size = 0; 1300 1301 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1302 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1303 max_bdev_module_size = bdev_module->get_ctx_size(); 1304 } 1305 } 1306 1307 return max_bdev_module_size; 1308 } 1309 1310 static void 1311 bdev_qos_config_json(struct spdk_bdev *bdev, struct 
spdk_json_write_ctx *w) 1312 { 1313 int i; 1314 struct spdk_bdev_qos *qos = bdev->internal.qos; 1315 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1316 1317 if (!qos) { 1318 return; 1319 } 1320 1321 spdk_bdev_get_qos_rate_limits(bdev, limits); 1322 1323 spdk_json_write_object_begin(w); 1324 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1325 1326 spdk_json_write_named_object_begin(w, "params"); 1327 spdk_json_write_named_string(w, "name", bdev->name); 1328 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1329 if (limits[i] > 0) { 1330 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1331 } 1332 } 1333 spdk_json_write_object_end(w); 1334 1335 spdk_json_write_object_end(w); 1336 } 1337 1338 void 1339 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1340 { 1341 struct spdk_bdev_module *bdev_module; 1342 struct spdk_bdev *bdev; 1343 1344 assert(w != NULL); 1345 1346 spdk_json_write_array_begin(w); 1347 1348 spdk_json_write_object_begin(w); 1349 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1350 spdk_json_write_named_object_begin(w, "params"); 1351 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1352 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1353 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1354 spdk_json_write_object_end(w); 1355 spdk_json_write_object_end(w); 1356 1357 bdev_examine_allowlist_config_json(w); 1358 1359 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1360 if (bdev_module->config_json) { 1361 bdev_module->config_json(w); 1362 } 1363 } 1364 1365 pthread_mutex_lock(&g_bdev_mgr.mutex); 1366 1367 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1368 if (bdev->fn_table->write_config_json) { 1369 bdev->fn_table->write_config_json(bdev, w); 1370 } 1371 1372 bdev_qos_config_json(bdev, w); 1373 } 1374 1375 pthread_mutex_unlock(&g_bdev_mgr.mutex); 1376 1377 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1378 spdk_json_write_object_begin(w); 1379 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1380 spdk_json_write_object_end(w); 1381 1382 spdk_json_write_array_end(w); 1383 } 1384 1385 static void 1386 bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf) 1387 { 1388 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1389 struct spdk_bdev_io *bdev_io; 1390 1391 if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) { 1392 SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n"); 1393 } 1394 1395 if (!TAILQ_EMPTY(&ch->shared_resources)) { 1396 SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n"); 1397 } 1398 1399 while (!STAILQ_EMPTY(&ch->per_thread_cache)) { 1400 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1401 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1402 ch->per_thread_cache_count--; 1403 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1404 } 1405 1406 assert(ch->per_thread_cache_count == 0); 1407 } 1408 1409 static int 1410 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1411 { 1412 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1413 struct spdk_bdev_io *bdev_io; 1414 uint32_t i; 1415 1416 STAILQ_INIT(&ch->need_buf_small); 1417 STAILQ_INIT(&ch->need_buf_large); 1418 1419 STAILQ_INIT(&ch->per_thread_cache); 1420 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 1421 1422 /* Pre-populate bdev_io cache to ensure this 
thread cannot be starved. */ 1423 ch->per_thread_cache_count = 0; 1424 for (i = 0; i < ch->bdev_io_cache_size; i++) { 1425 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1426 if (bdev_io == NULL) { 1427 SPDK_ERRLOG("You need to increase bdev_io_pool_size using bdev_set_options RPC.\n"); 1428 assert(false); 1429 bdev_mgmt_channel_destroy(io_device, ctx_buf); 1430 return -1; 1431 } 1432 ch->per_thread_cache_count++; 1433 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1434 } 1435 1436 TAILQ_INIT(&ch->shared_resources); 1437 TAILQ_INIT(&ch->io_wait_queue); 1438 1439 return 0; 1440 } 1441 1442 static void 1443 bdev_init_complete(int rc) 1444 { 1445 spdk_bdev_init_cb cb_fn = g_init_cb_fn; 1446 void *cb_arg = g_init_cb_arg; 1447 struct spdk_bdev_module *m; 1448 1449 g_bdev_mgr.init_complete = true; 1450 g_init_cb_fn = NULL; 1451 g_init_cb_arg = NULL; 1452 1453 /* 1454 * For modules that need to know when subsystem init is complete, 1455 * inform them now. 1456 */ 1457 if (rc == 0) { 1458 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1459 if (m->init_complete) { 1460 m->init_complete(); 1461 } 1462 } 1463 } 1464 1465 cb_fn(cb_arg, rc); 1466 } 1467 1468 static bool 1469 bdev_module_all_actions_completed(void) 1470 { 1471 struct spdk_bdev_module *m; 1472 1473 TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) { 1474 if (m->internal.action_in_progress > 0) { 1475 return false; 1476 } 1477 } 1478 return true; 1479 } 1480 1481 static void 1482 bdev_module_action_complete(void) 1483 { 1484 /* 1485 * Don't finish bdev subsystem initialization if 1486 * module pre-initialization is still in progress, or 1487 * the subsystem been already initialized. 1488 */ 1489 if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) { 1490 return; 1491 } 1492 1493 /* 1494 * Check all bdev modules for inits/examinations in progress. If any 1495 * exist, return immediately since we cannot finish bdev subsystem 1496 * initialization until all are completed. 1497 */ 1498 if (!bdev_module_all_actions_completed()) { 1499 return; 1500 } 1501 1502 /* 1503 * Modules already finished initialization - now that all 1504 * the bdev modules have finished their asynchronous I/O 1505 * processing, the entire bdev layer can be marked as complete. 
1506 */ 1507 bdev_init_complete(0); 1508 } 1509 1510 static void 1511 bdev_module_action_done(struct spdk_bdev_module *module) 1512 { 1513 assert(module->internal.action_in_progress > 0); 1514 module->internal.action_in_progress--; 1515 bdev_module_action_complete(); 1516 } 1517 1518 void 1519 spdk_bdev_module_init_done(struct spdk_bdev_module *module) 1520 { 1521 bdev_module_action_done(module); 1522 } 1523 1524 void 1525 spdk_bdev_module_examine_done(struct spdk_bdev_module *module) 1526 { 1527 bdev_module_action_done(module); 1528 } 1529 1530 /** The last initialized bdev module */ 1531 static struct spdk_bdev_module *g_resume_bdev_module = NULL; 1532 1533 static void 1534 bdev_init_failed(void *cb_arg) 1535 { 1536 struct spdk_bdev_module *module = cb_arg; 1537 1538 module->internal.action_in_progress--; 1539 bdev_init_complete(-1); 1540 } 1541 1542 static int 1543 bdev_modules_init(void) 1544 { 1545 struct spdk_bdev_module *module; 1546 int rc = 0; 1547 1548 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1549 g_resume_bdev_module = module; 1550 if (module->async_init) { 1551 module->internal.action_in_progress = 1; 1552 } 1553 rc = module->module_init(); 1554 if (rc != 0) { 1555 /* Bump action_in_progress to prevent other modules from completion of modules_init 1556 * Send message to defer application shutdown until resources are cleaned up */ 1557 module->internal.action_in_progress = 1; 1558 spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module); 1559 return rc; 1560 } 1561 } 1562 1563 g_resume_bdev_module = NULL; 1564 return 0; 1565 } 1566 1567 void 1568 spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg) 1569 { 1570 int cache_size; 1571 int rc = 0; 1572 char mempool_name[32]; 1573 1574 assert(cb_fn != NULL); 1575 1576 g_init_cb_fn = cb_fn; 1577 g_init_cb_arg = cb_arg; 1578 1579 spdk_notify_type_register("bdev_register"); 1580 spdk_notify_type_register("bdev_unregister"); 1581 1582 snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid()); 1583 1584 g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name, 1585 g_bdev_opts.bdev_io_pool_size, 1586 sizeof(struct spdk_bdev_io) + 1587 bdev_module_get_max_ctx_size(), 1588 0, 1589 SPDK_ENV_SOCKET_ID_ANY); 1590 1591 if (g_bdev_mgr.bdev_io_pool == NULL) { 1592 SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n"); 1593 bdev_init_complete(-1); 1594 return; 1595 } 1596 1597 /** 1598 * Ensure no more than half of the total buffers end up local caches, by 1599 * using spdk_env_get_core_count() to determine how many local caches we need 1600 * to account for. 
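	 * For example, with BUF_SMALL_POOL_SIZE of 8191 and 8 cores this gives
	 * 8191 / 16 = 511 buffers of per-core cache, or roughly half the pool
	 * across all cores (the core count is only illustrative).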
1601 */ 1602 cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_env_get_core_count()); 1603 snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid()); 1604 1605 g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name, 1606 g_bdev_opts.small_buf_pool_size, 1607 SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + 1608 SPDK_BDEV_POOL_ALIGNMENT, 1609 cache_size, 1610 SPDK_ENV_SOCKET_ID_ANY); 1611 if (!g_bdev_mgr.buf_small_pool) { 1612 SPDK_ERRLOG("create rbuf small pool failed\n"); 1613 bdev_init_complete(-1); 1614 return; 1615 } 1616 1617 cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_env_get_core_count()); 1618 snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid()); 1619 1620 g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name, 1621 g_bdev_opts.large_buf_pool_size, 1622 SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) + 1623 SPDK_BDEV_POOL_ALIGNMENT, 1624 cache_size, 1625 SPDK_ENV_SOCKET_ID_ANY); 1626 if (!g_bdev_mgr.buf_large_pool) { 1627 SPDK_ERRLOG("create rbuf large pool failed\n"); 1628 bdev_init_complete(-1); 1629 return; 1630 } 1631 1632 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 1633 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 1634 if (!g_bdev_mgr.zero_buffer) { 1635 SPDK_ERRLOG("create bdev zero buffer failed\n"); 1636 bdev_init_complete(-1); 1637 return; 1638 } 1639 1640 #ifdef SPDK_CONFIG_VTUNE 1641 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 1642 #endif 1643 1644 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 1645 bdev_mgmt_channel_destroy, 1646 sizeof(struct spdk_bdev_mgmt_channel), 1647 "bdev_mgr"); 1648 1649 rc = bdev_modules_init(); 1650 g_bdev_mgr.module_init_complete = true; 1651 if (rc != 0) { 1652 SPDK_ERRLOG("bdev modules init failed\n"); 1653 return; 1654 } 1655 1656 bdev_module_action_complete(); 1657 } 1658 1659 static void 1660 bdev_mgr_unregister_cb(void *io_device) 1661 { 1662 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 1663 1664 if (g_bdev_mgr.bdev_io_pool) { 1665 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 1666 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 1667 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 1668 g_bdev_opts.bdev_io_pool_size); 1669 } 1670 1671 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 1672 } 1673 1674 if (g_bdev_mgr.buf_small_pool) { 1675 if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != g_bdev_opts.small_buf_pool_size) { 1676 SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n", 1677 spdk_mempool_count(g_bdev_mgr.buf_small_pool), 1678 g_bdev_opts.small_buf_pool_size); 1679 assert(false); 1680 } 1681 1682 spdk_mempool_free(g_bdev_mgr.buf_small_pool); 1683 } 1684 1685 if (g_bdev_mgr.buf_large_pool) { 1686 if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != g_bdev_opts.large_buf_pool_size) { 1687 SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n", 1688 spdk_mempool_count(g_bdev_mgr.buf_large_pool), 1689 g_bdev_opts.large_buf_pool_size); 1690 assert(false); 1691 } 1692 1693 spdk_mempool_free(g_bdev_mgr.buf_large_pool); 1694 } 1695 1696 spdk_free(g_bdev_mgr.zero_buffer); 1697 1698 bdev_examine_allowlist_free(); 1699 1700 cb_fn(g_fini_cb_arg); 1701 g_fini_cb_fn = NULL; 1702 g_fini_cb_arg = NULL; 1703 g_bdev_mgr.init_complete = false; 1704 g_bdev_mgr.module_init_complete = false; 1705 } 1706 1707 static void 1708 bdev_module_fini_iter(void *arg) 1709 { 1710 struct spdk_bdev_module *bdev_module; 1711 1712 /* FIXME: Handling initialization failures is broken 
now, 1713 * so we won't even try cleaning up after successfully 1714 * initialized modules. if module_init_complete is false, 1715 * just call spdk_bdev_mgr_unregister_cb 1716 */ 1717 if (!g_bdev_mgr.module_init_complete) { 1718 bdev_mgr_unregister_cb(NULL); 1719 return; 1720 } 1721 1722 /* Start iterating from the last touched module */ 1723 if (!g_resume_bdev_module) { 1724 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1725 } else { 1726 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, 1727 internal.tailq); 1728 } 1729 1730 while (bdev_module) { 1731 if (bdev_module->async_fini) { 1732 /* Save our place so we can resume later. We must 1733 * save the variable here, before calling module_fini() 1734 * below, because in some cases the module may immediately 1735 * call spdk_bdev_module_fini_done() and re-enter 1736 * this function to continue iterating. */ 1737 g_resume_bdev_module = bdev_module; 1738 } 1739 1740 if (bdev_module->module_fini) { 1741 bdev_module->module_fini(); 1742 } 1743 1744 if (bdev_module->async_fini) { 1745 return; 1746 } 1747 1748 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, 1749 internal.tailq); 1750 } 1751 1752 g_resume_bdev_module = NULL; 1753 spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb); 1754 } 1755 1756 void 1757 spdk_bdev_module_fini_done(void) 1758 { 1759 if (spdk_get_thread() != g_fini_thread) { 1760 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL); 1761 } else { 1762 bdev_module_fini_iter(NULL); 1763 } 1764 } 1765 1766 static void 1767 bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno) 1768 { 1769 struct spdk_bdev *bdev = cb_arg; 1770 1771 if (bdeverrno && bdev) { 1772 SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n", 1773 bdev->name); 1774 1775 /* 1776 * Since the call to spdk_bdev_unregister() failed, we have no way to free this 1777 * bdev; try to continue by manually removing this bdev from the list and continue 1778 * with the next bdev in the list. 1779 */ 1780 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 1781 } 1782 1783 if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) { 1784 SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n"); 1785 /* 1786 * Bdev module finish need to be deferred as we might be in the middle of some context 1787 * (like bdev part free) that will use this bdev (or private bdev driver ctx data) 1788 * after returning. 1789 */ 1790 spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL); 1791 return; 1792 } 1793 1794 /* 1795 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem 1796 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity 1797 * to detect clean shutdown as opposed to run-time hot removal of the underlying 1798 * base bdevs. 1799 * 1800 * Also, walk the list in the reverse order. 
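	 * For example, if a logical-volume bdev is stacked on top of an NVMe bdev,
	 * the (unclaimed) logical volume is unregistered first; releasing it drops
	 * the claim on the NVMe bdev, which then becomes eligible on a later pass
	 * of this loop.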
1801 */ 1802 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1803 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1804 if (bdev->internal.claim_module != NULL) { 1805 SPDK_DEBUGLOG(bdev, "Skipping claimed bdev '%s'(<-'%s').\n", 1806 bdev->name, bdev->internal.claim_module->name); 1807 continue; 1808 } 1809 1810 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 1811 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 1812 return; 1813 } 1814 1815 /* 1816 * If any bdev fails to unclaim underlying bdev properly, we may face the 1817 * case of bdev list consisting of claimed bdevs only (if claims are managed 1818 * correctly, this would mean there's a loop in the claims graph which is 1819 * clearly impossible). Warn and unregister last bdev on the list then. 1820 */ 1821 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1822 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1823 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 1824 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 1825 return; 1826 } 1827 } 1828 1829 static void 1830 bdev_module_fini_start_iter(void *arg) 1831 { 1832 struct spdk_bdev_module *bdev_module; 1833 1834 if (!g_resume_bdev_module) { 1835 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1836 } else { 1837 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 1838 } 1839 1840 while (bdev_module) { 1841 if (bdev_module->async_fini_start) { 1842 /* Save our place so we can resume later. We must 1843 * save the variable here, before calling fini_start() 1844 * below, because in some cases the module may immediately 1845 * call spdk_bdev_module_fini_start_done() and re-enter 1846 * this function to continue iterating. 
*/ 1847 g_resume_bdev_module = bdev_module; 1848 } 1849 1850 if (bdev_module->fini_start) { 1851 bdev_module->fini_start(); 1852 } 1853 1854 if (bdev_module->async_fini_start) { 1855 return; 1856 } 1857 1858 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 1859 } 1860 1861 g_resume_bdev_module = NULL; 1862 1863 bdev_finish_unregister_bdevs_iter(NULL, 0); 1864 } 1865 1866 void 1867 spdk_bdev_module_fini_start_done(void) 1868 { 1869 if (spdk_get_thread() != g_fini_thread) { 1870 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 1871 } else { 1872 bdev_module_fini_start_iter(NULL); 1873 } 1874 } 1875 1876 static void 1877 bdev_finish_wait_for_examine_done(void *cb_arg) 1878 { 1879 bdev_module_fini_start_iter(NULL); 1880 } 1881 1882 void 1883 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 1884 { 1885 int rc; 1886 1887 assert(cb_fn != NULL); 1888 1889 g_fini_thread = spdk_get_thread(); 1890 1891 g_fini_cb_fn = cb_fn; 1892 g_fini_cb_arg = cb_arg; 1893 1894 rc = spdk_bdev_wait_for_examine(bdev_finish_wait_for_examine_done, NULL); 1895 if (rc != 0) { 1896 SPDK_ERRLOG("wait_for_examine failed: %s\n", spdk_strerror(-rc)); 1897 bdev_finish_wait_for_examine_done(NULL); 1898 } 1899 } 1900 1901 struct spdk_bdev_io * 1902 bdev_channel_get_io(struct spdk_bdev_channel *channel) 1903 { 1904 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 1905 struct spdk_bdev_io *bdev_io; 1906 1907 if (ch->per_thread_cache_count > 0) { 1908 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1909 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1910 ch->per_thread_cache_count--; 1911 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 1912 /* 1913 * Don't try to look for bdev_ios in the global pool if there are 1914 * waiters on bdev_ios - we don't want this caller to jump the line. 1915 */ 1916 bdev_io = NULL; 1917 } else { 1918 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1919 } 1920 1921 return bdev_io; 1922 } 1923 1924 void 1925 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 1926 { 1927 struct spdk_bdev_mgmt_channel *ch; 1928 1929 assert(bdev_io != NULL); 1930 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 1931 1932 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1933 1934 if (bdev_io->internal.buf != NULL) { 1935 bdev_io_put_buf(bdev_io); 1936 } 1937 1938 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 1939 ch->per_thread_cache_count++; 1940 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1941 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 1942 struct spdk_bdev_io_wait_entry *entry; 1943 1944 entry = TAILQ_FIRST(&ch->io_wait_queue); 1945 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 1946 entry->cb_fn(entry->cb_arg); 1947 } 1948 } else { 1949 /* We should never have a full cache with entries on the io wait queue. 
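	 * For reference, entries end up on io_wait_queue via spdk_bdev_queue_io_wait(),
	 * which callers use after a submission API returns -ENOMEM. A rough sketch
	 * (retry_fn and retry_ctx are placeholders that resubmit the original request):
	 *
	 *   struct spdk_bdev_io_wait_entry entry;
	 *
	 *   entry.bdev = bdev;
	 *   entry.cb_fn = retry_fn;
	 *   entry.cb_arg = retry_ctx;
	 *   spdk_bdev_queue_io_wait(bdev, io_ch, &entry);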
*/ 1950 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 1951 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1952 } 1953 } 1954 1955 static bool 1956 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 1957 { 1958 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 1959 1960 switch (limit) { 1961 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 1962 return true; 1963 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 1964 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 1965 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 1966 return false; 1967 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 1968 default: 1969 return false; 1970 } 1971 } 1972 1973 static bool 1974 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 1975 { 1976 switch (bdev_io->type) { 1977 case SPDK_BDEV_IO_TYPE_NVME_IO: 1978 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1979 case SPDK_BDEV_IO_TYPE_READ: 1980 case SPDK_BDEV_IO_TYPE_WRITE: 1981 return true; 1982 case SPDK_BDEV_IO_TYPE_ZCOPY: 1983 if (bdev_io->u.bdev.zcopy.start) { 1984 return true; 1985 } else { 1986 return false; 1987 } 1988 default: 1989 return false; 1990 } 1991 } 1992 1993 static bool 1994 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 1995 { 1996 switch (bdev_io->type) { 1997 case SPDK_BDEV_IO_TYPE_NVME_IO: 1998 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1999 /* Bit 1 (0x2) set for read operation */ 2000 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2001 return true; 2002 } else { 2003 return false; 2004 } 2005 case SPDK_BDEV_IO_TYPE_READ: 2006 return true; 2007 case SPDK_BDEV_IO_TYPE_ZCOPY: 2008 /* Populate to read from disk */ 2009 if (bdev_io->u.bdev.zcopy.populate) { 2010 return true; 2011 } else { 2012 return false; 2013 } 2014 default: 2015 return false; 2016 } 2017 } 2018 2019 static uint64_t 2020 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2021 { 2022 struct spdk_bdev *bdev = bdev_io->bdev; 2023 2024 switch (bdev_io->type) { 2025 case SPDK_BDEV_IO_TYPE_NVME_IO: 2026 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2027 return bdev_io->u.nvme_passthru.nbytes; 2028 case SPDK_BDEV_IO_TYPE_READ: 2029 case SPDK_BDEV_IO_TYPE_WRITE: 2030 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2031 case SPDK_BDEV_IO_TYPE_ZCOPY: 2032 /* Track the data in the start phase only */ 2033 if (bdev_io->u.bdev.zcopy.start) { 2034 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2035 } else { 2036 return 0; 2037 } 2038 default: 2039 return 0; 2040 } 2041 } 2042 2043 static bool 2044 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2045 { 2046 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 2047 return true; 2048 } else { 2049 return false; 2050 } 2051 } 2052 2053 static bool 2054 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2055 { 2056 if (bdev_is_read_io(io) == false) { 2057 return false; 2058 } 2059 2060 return bdev_qos_rw_queue_io(limit, io); 2061 } 2062 2063 static bool 2064 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2065 { 2066 if (bdev_is_read_io(io) == true) { 2067 return false; 2068 } 2069 2070 return bdev_qos_rw_queue_io(limit, io); 2071 } 2072 2073 static void 2074 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2075 { 2076 limit->remaining_this_timeslice--; 2077 } 2078 2079 static void 2080 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2081 { 2082 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 2083 } 2084 2085 static void 2086 
bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2087 { 2088 if (bdev_is_read_io(io) == false) { 2089 return; 2090 } 2091 2092 return bdev_qos_rw_bps_update_quota(limit, io); 2093 } 2094 2095 static void 2096 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2097 { 2098 if (bdev_is_read_io(io) == true) { 2099 return; 2100 } 2101 2102 return bdev_qos_rw_bps_update_quota(limit, io); 2103 } 2104 2105 static void 2106 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2107 { 2108 int i; 2109 2110 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2111 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2112 qos->rate_limits[i].queue_io = NULL; 2113 qos->rate_limits[i].update_quota = NULL; 2114 continue; 2115 } 2116 2117 switch (i) { 2118 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2119 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2120 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2121 break; 2122 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2123 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2124 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2125 break; 2126 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2127 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2128 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2129 break; 2130 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2131 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2132 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2133 break; 2134 default: 2135 break; 2136 } 2137 } 2138 } 2139 2140 static void 2141 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2142 struct spdk_bdev_io *bdev_io, 2143 enum spdk_bdev_io_status status) 2144 { 2145 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2146 2147 bdev_io->internal.in_submit_request = true; 2148 bdev_ch->io_outstanding++; 2149 shared_resource->io_outstanding++; 2150 spdk_bdev_io_complete(bdev_io, status); 2151 bdev_io->internal.in_submit_request = false; 2152 } 2153 2154 static inline void 2155 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2156 { 2157 struct spdk_bdev *bdev = bdev_io->bdev; 2158 struct spdk_io_channel *ch = bdev_ch->channel; 2159 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2160 2161 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2162 struct spdk_bdev_mgmt_channel *mgmt_channel = shared_resource->mgmt_ch; 2163 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2164 2165 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2166 bdev_abort_buf_io(&mgmt_channel->need_buf_small, bio_to_abort) || 2167 bdev_abort_buf_io(&mgmt_channel->need_buf_large, bio_to_abort)) { 2168 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2169 SPDK_BDEV_IO_STATUS_SUCCESS); 2170 return; 2171 } 2172 } 2173 2174 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && 2175 bdev_io->bdev->split_on_write_unit && 2176 bdev_io->u.bdev.num_blocks < bdev_io->bdev->write_unit_size)) { 2177 SPDK_ERRLOG("IO does not match the write_unit_size\n"); 2178 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2179 return; 2180 } 2181 2182 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2183 bdev_ch->io_outstanding++; 2184 shared_resource->io_outstanding++; 2185 bdev_io->internal.in_submit_request = true; 2186 bdev->fn_table->submit_request(ch, bdev_io); 2187 
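/* The module may have completed this I/O synchronously from within
 * submit_request(); in_submit_request marks that window for the completion
 * path, so it is cleared only after the callback has returned.
 */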
bdev_io->internal.in_submit_request = false; 2188 } else { 2189 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 2190 } 2191 } 2192 2193 static bool 2194 bdev_qos_queue_io(struct spdk_bdev_qos *qos, struct spdk_bdev_io *bdev_io) 2195 { 2196 int i; 2197 2198 if (bdev_qos_io_to_limit(bdev_io) == true) { 2199 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2200 if (!qos->rate_limits[i].queue_io) { 2201 continue; 2202 } 2203 2204 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2205 bdev_io) == true) { 2206 return true; 2207 } 2208 } 2209 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2210 if (!qos->rate_limits[i].update_quota) { 2211 continue; 2212 } 2213 2214 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2215 } 2216 } 2217 2218 return false; 2219 } 2220 2221 static int 2222 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2223 { 2224 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2225 int submitted_ios = 0; 2226 2227 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2228 if (!bdev_qos_queue_io(qos, bdev_io)) { 2229 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2230 bdev_io_do_submit(ch, bdev_io); 2231 submitted_ios++; 2232 } 2233 } 2234 2235 return submitted_ios; 2236 } 2237 2238 static void 2239 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2240 { 2241 int rc; 2242 2243 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2244 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2245 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2246 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2247 &bdev_io->internal.waitq_entry); 2248 if (rc != 0) { 2249 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2250 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2251 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2252 } 2253 } 2254 2255 static bool 2256 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2257 { 2258 uint32_t io_boundary; 2259 struct spdk_bdev *bdev = bdev_io->bdev; 2260 uint32_t max_size = bdev->max_segment_size; 2261 int max_segs = bdev->max_num_segments; 2262 2263 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2264 io_boundary = bdev->write_unit_size; 2265 } else if (bdev->split_on_optimal_io_boundary) { 2266 io_boundary = bdev->optimal_io_boundary; 2267 } else { 2268 io_boundary = 0; 2269 } 2270 2271 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2272 return false; 2273 } 2274 2275 if (io_boundary) { 2276 uint64_t start_stripe, end_stripe; 2277 2278 start_stripe = bdev_io->u.bdev.offset_blocks; 2279 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2280 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
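 * For example, an io_boundary of 128 blocks is a power of two, so both
 * divisions below reduce to a right shift by spdk_u32log2(128) = 7.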
*/ 2281 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2282 start_stripe >>= spdk_u32log2(io_boundary); 2283 end_stripe >>= spdk_u32log2(io_boundary); 2284 } else { 2285 start_stripe /= io_boundary; 2286 end_stripe /= io_boundary; 2287 } 2288 2289 if (start_stripe != end_stripe) { 2290 return true; 2291 } 2292 } 2293 2294 if (max_segs) { 2295 if (bdev_io->u.bdev.iovcnt > max_segs) { 2296 return true; 2297 } 2298 } 2299 2300 if (max_size) { 2301 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2302 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2303 return true; 2304 } 2305 } 2306 } 2307 2308 return false; 2309 } 2310 2311 static bool 2312 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2313 { 2314 uint32_t num_unmap_segments; 2315 2316 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2317 return false; 2318 } 2319 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2320 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2321 return true; 2322 } 2323 2324 return false; 2325 } 2326 2327 static bool 2328 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2329 { 2330 if (!bdev_io->bdev->max_write_zeroes) { 2331 return false; 2332 } 2333 2334 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2335 return true; 2336 } 2337 2338 return false; 2339 } 2340 2341 static bool 2342 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2343 { 2344 switch (bdev_io->type) { 2345 case SPDK_BDEV_IO_TYPE_READ: 2346 case SPDK_BDEV_IO_TYPE_WRITE: 2347 return bdev_rw_should_split(bdev_io); 2348 case SPDK_BDEV_IO_TYPE_UNMAP: 2349 return bdev_unmap_should_split(bdev_io); 2350 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2351 return bdev_write_zeroes_should_split(bdev_io); 2352 default: 2353 return false; 2354 } 2355 } 2356 2357 static uint32_t 2358 _to_next_boundary(uint64_t offset, uint32_t boundary) 2359 { 2360 return (boundary - (offset % boundary)); 2361 } 2362 2363 static void bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2364 2365 static void _bdev_rw_split(void *_bdev_io); 2366 2367 static void bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2368 2369 static void 2370 _bdev_unmap_split(void *_bdev_io) 2371 { 2372 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2373 } 2374 2375 static void bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2376 2377 static void 2378 _bdev_write_zeroes_split(void *_bdev_io) 2379 { 2380 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2381 } 2382 2383 static int 2384 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2385 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2386 { 2387 int rc; 2388 uint64_t current_offset, current_remaining; 2389 spdk_bdev_io_wait_cb io_wait_fn; 2390 2391 current_offset = *offset; 2392 current_remaining = *remaining; 2393 2394 bdev_io->u.bdev.split_outstanding++; 2395 2396 io_wait_fn = _bdev_rw_split; 2397 switch (bdev_io->type) { 2398 case SPDK_BDEV_IO_TYPE_READ: 2399 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2400 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2401 iov, iovcnt, md_buf, current_offset, 2402 num_blocks, 2403 bdev_io_split_done, bdev_io, 2404 bdev_io->internal.ext_opts, true); 2405 break; 2406 case SPDK_BDEV_IO_TYPE_WRITE: 2407 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2408 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2409 iov, iovcnt, md_buf, current_offset, 2410 num_blocks, 
2411 bdev_io_split_done, bdev_io, 2412 bdev_io->internal.ext_opts, true); 2413 break; 2414 case SPDK_BDEV_IO_TYPE_UNMAP: 2415 io_wait_fn = _bdev_unmap_split; 2416 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2417 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2418 current_offset, num_blocks, 2419 bdev_io_split_done, bdev_io); 2420 break; 2421 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2422 io_wait_fn = _bdev_write_zeroes_split; 2423 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2424 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2425 current_offset, num_blocks, 2426 bdev_io_split_done, bdev_io); 2427 break; 2428 default: 2429 assert(false); 2430 rc = -EINVAL; 2431 break; 2432 } 2433 2434 if (rc == 0) { 2435 current_offset += num_blocks; 2436 current_remaining -= num_blocks; 2437 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2438 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2439 *offset = current_offset; 2440 *remaining = current_remaining; 2441 } else { 2442 bdev_io->u.bdev.split_outstanding--; 2443 if (rc == -ENOMEM) { 2444 if (bdev_io->u.bdev.split_outstanding == 0) { 2445 /* No I/O is outstanding. Hence we should wait here. */ 2446 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2447 } 2448 } else { 2449 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2450 if (bdev_io->u.bdev.split_outstanding == 0) { 2451 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2452 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2453 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2454 } 2455 } 2456 } 2457 2458 return rc; 2459 } 2460 2461 static void 2462 _bdev_rw_split(void *_bdev_io) 2463 { 2464 struct iovec *parent_iov, *iov; 2465 struct spdk_bdev_io *bdev_io = _bdev_io; 2466 struct spdk_bdev *bdev = bdev_io->bdev; 2467 uint64_t parent_offset, current_offset, remaining; 2468 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2469 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2470 uint32_t iovcnt, iov_len, child_iovsize; 2471 uint32_t blocklen = bdev->blocklen; 2472 uint32_t io_boundary; 2473 uint32_t max_segment_size = bdev->max_segment_size; 2474 uint32_t max_child_iovcnt = bdev->max_num_segments; 2475 void *md_buf = NULL; 2476 int rc; 2477 2478 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 2479 max_child_iovcnt = max_child_iovcnt ? 
spdk_min(max_child_iovcnt, BDEV_IO_NUM_CHILD_IOV) : 2480 BDEV_IO_NUM_CHILD_IOV; 2481 2482 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE && bdev->split_on_write_unit) { 2483 io_boundary = bdev->write_unit_size; 2484 } else if (bdev->split_on_optimal_io_boundary) { 2485 io_boundary = bdev->optimal_io_boundary; 2486 } else { 2487 io_boundary = UINT32_MAX; 2488 } 2489 2490 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2491 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 2492 parent_offset = bdev_io->u.bdev.offset_blocks; 2493 parent_iov_offset = (current_offset - parent_offset) * blocklen; 2494 parent_iovcnt = bdev_io->u.bdev.iovcnt; 2495 2496 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 2497 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2498 if (parent_iov_offset < parent_iov->iov_len) { 2499 break; 2500 } 2501 parent_iov_offset -= parent_iov->iov_len; 2502 } 2503 2504 child_iovcnt = 0; 2505 while (remaining > 0 && parent_iovpos < parent_iovcnt && child_iovcnt < BDEV_IO_NUM_CHILD_IOV) { 2506 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 2507 to_next_boundary = spdk_min(remaining, to_next_boundary); 2508 to_next_boundary_bytes = to_next_boundary * blocklen; 2509 2510 iov = &bdev_io->child_iov[child_iovcnt]; 2511 iovcnt = 0; 2512 2513 if (bdev_io->u.bdev.md_buf) { 2514 md_buf = (char *)bdev_io->u.bdev.md_buf + 2515 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 2516 } 2517 2518 child_iovsize = spdk_min(BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 2519 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 2520 iovcnt < child_iovsize) { 2521 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2522 iov_len = parent_iov->iov_len - parent_iov_offset; 2523 2524 iov_len = spdk_min(iov_len, max_segment_size); 2525 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 2526 to_next_boundary_bytes -= iov_len; 2527 2528 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 2529 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 2530 2531 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 2532 parent_iov_offset += iov_len; 2533 } else { 2534 parent_iovpos++; 2535 parent_iov_offset = 0; 2536 } 2537 child_iovcnt++; 2538 iovcnt++; 2539 } 2540 2541 if (to_next_boundary_bytes > 0) { 2542 /* We had to stop this child I/O early because we ran out of 2543 * child_iov space or were limited by max_num_segments. 2544 * Ensure the iovs to be aligned with block size and 2545 * then adjust to_next_boundary before starting the 2546 * child I/O. 2547 */ 2548 assert(child_iovcnt == BDEV_IO_NUM_CHILD_IOV || 2549 iovcnt == child_iovsize); 2550 to_last_block_bytes = to_next_boundary_bytes % blocklen; 2551 if (to_last_block_bytes != 0) { 2552 uint32_t child_iovpos = child_iovcnt - 1; 2553 /* don't decrease child_iovcnt when it equals to BDEV_IO_NUM_CHILD_IOV 2554 * so the loop will naturally end 2555 */ 2556 2557 to_last_block_bytes = blocklen - to_last_block_bytes; 2558 to_next_boundary_bytes += to_last_block_bytes; 2559 while (to_last_block_bytes > 0 && iovcnt > 0) { 2560 iov_len = spdk_min(to_last_block_bytes, 2561 bdev_io->child_iov[child_iovpos].iov_len); 2562 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 2563 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 2564 child_iovpos--; 2565 if (--iovcnt == 0) { 2566 /* If the child IO is less than a block size just return. 
2567 * If the first child IO of any split round is less than 2568 * a block size, exit with an error. 2569 */ 2570 if (bdev_io->u.bdev.split_outstanding == 0) { 2571 SPDK_ERRLOG("The first child io was less than a block size\n"); 2572 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2573 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2574 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2575 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2576 } 2577 2578 return; 2579 } 2580 } 2581 2582 to_last_block_bytes -= iov_len; 2583 2584 if (parent_iov_offset == 0) { 2585 parent_iovpos--; 2586 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len; 2587 } 2588 parent_iov_offset -= iov_len; 2589 } 2590 2591 assert(to_last_block_bytes == 0); 2592 } 2593 to_next_boundary -= to_next_boundary_bytes / blocklen; 2594 } 2595 2596 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary, 2597 &current_offset, &remaining); 2598 if (spdk_unlikely(rc)) { 2599 return; 2600 } 2601 } 2602 } 2603 2604 static void 2605 bdev_unmap_split(struct spdk_bdev_io *bdev_io) 2606 { 2607 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks; 2608 uint32_t num_children_reqs = 0; 2609 int rc; 2610 2611 offset = bdev_io->u.bdev.split_current_offset_blocks; 2612 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2613 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments; 2614 2615 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2616 unmap_blocks = spdk_min(remaining, max_unmap_blocks); 2617 2618 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks, 2619 &offset, &remaining); 2620 if (spdk_likely(rc == 0)) { 2621 num_children_reqs++; 2622 } else { 2623 return; 2624 } 2625 } 2626 } 2627 2628 static void 2629 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io) 2630 { 2631 uint64_t offset, write_zeroes_blocks, remaining; 2632 uint32_t num_children_reqs = 0; 2633 int rc; 2634 2635 offset = bdev_io->u.bdev.split_current_offset_blocks; 2636 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2637 2638 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) { 2639 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes); 2640 2641 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks, 2642 &offset, &remaining); 2643 if (spdk_likely(rc == 0)) { 2644 num_children_reqs++; 2645 } else { 2646 return; 2647 } 2648 } 2649 } 2650 2651 static void 2652 parent_bdev_io_complete(void *ctx, int rc) 2653 { 2654 struct spdk_bdev_io *parent_io = ctx; 2655 2656 if (rc) { 2657 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2658 } 2659 2660 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 2661 parent_io->internal.caller_ctx); 2662 } 2663 2664 static void 2665 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 2666 { 2667 struct spdk_bdev_io *parent_io = cb_arg; 2668 2669 spdk_bdev_free_io(bdev_io); 2670 2671 if (!success) { 2672 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2673 /* If any child I/O failed, stop further splitting process.
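 * Consuming the remaining block count here lets the completion check below
 * treat the parent as finished as soon as the children that are already
 * outstanding complete.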
*/ 2674 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks; 2675 parent_io->u.bdev.split_remaining_num_blocks = 0; 2676 } 2677 parent_io->u.bdev.split_outstanding--; 2678 if (parent_io->u.bdev.split_outstanding != 0) { 2679 return; 2680 } 2681 2682 /* 2683 * Parent I/O finishes when all blocks are consumed. 2684 */ 2685 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 2686 assert(parent_io->internal.cb != bdev_io_split_done); 2687 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 2688 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 2689 2690 if (parent_io->internal.orig_iovcnt != 0) { 2691 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 2692 /* bdev IO will be completed in the callback */ 2693 } else { 2694 parent_bdev_io_complete(parent_io, 0); 2695 } 2696 return; 2697 } 2698 2699 /* 2700 * Continue with the splitting process. This function will complete the parent I/O if the 2701 * splitting is done. 2702 */ 2703 switch (parent_io->type) { 2704 case SPDK_BDEV_IO_TYPE_READ: 2705 case SPDK_BDEV_IO_TYPE_WRITE: 2706 _bdev_rw_split(parent_io); 2707 break; 2708 case SPDK_BDEV_IO_TYPE_UNMAP: 2709 bdev_unmap_split(parent_io); 2710 break; 2711 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2712 bdev_write_zeroes_split(parent_io); 2713 break; 2714 default: 2715 assert(false); 2716 break; 2717 } 2718 } 2719 2720 static void bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 2721 bool success); 2722 2723 static void 2724 bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 2725 { 2726 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 2727 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 2728 bdev_io->u.bdev.split_outstanding = 0; 2729 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 2730 2731 switch (bdev_io->type) { 2732 case SPDK_BDEV_IO_TYPE_READ: 2733 case SPDK_BDEV_IO_TYPE_WRITE: 2734 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 2735 _bdev_rw_split(bdev_io); 2736 } else { 2737 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 2738 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 2739 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 2740 } 2741 break; 2742 case SPDK_BDEV_IO_TYPE_UNMAP: 2743 bdev_unmap_split(bdev_io); 2744 break; 2745 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2746 bdev_write_zeroes_split(bdev_io); 2747 break; 2748 default: 2749 assert(false); 2750 break; 2751 } 2752 } 2753 2754 static void 2755 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 2756 { 2757 if (!success) { 2758 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2759 return; 2760 } 2761 2762 _bdev_rw_split(bdev_io); 2763 } 2764 2765 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 2766 * be inlined, at least on some compilers. 
2767 */ 2768 static inline void 2769 _bdev_io_submit(void *ctx) 2770 { 2771 struct spdk_bdev_io *bdev_io = ctx; 2772 struct spdk_bdev *bdev = bdev_io->bdev; 2773 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2774 uint64_t tsc; 2775 2776 tsc = spdk_get_ticks(); 2777 bdev_io->internal.submit_tsc = tsc; 2778 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_START, 0, 0, (uintptr_t)bdev_io, 2779 (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 2780 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 2781 spdk_bdev_get_name(bdev)); 2782 2783 if (spdk_likely(bdev_ch->flags == 0)) { 2784 bdev_io_do_submit(bdev_ch, bdev_io); 2785 return; 2786 } 2787 2788 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 2789 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 2790 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 2791 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 2792 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 2793 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2794 } else { 2795 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 2796 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 2797 } 2798 } else { 2799 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 2800 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2801 } 2802 } 2803 2804 bool bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 2805 2806 bool 2807 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 2808 { 2809 if (range1->length == 0 || range2->length == 0) { 2810 return false; 2811 } 2812 2813 if (range1->offset + range1->length <= range2->offset) { 2814 return false; 2815 } 2816 2817 if (range2->offset + range2->length <= range1->offset) { 2818 return false; 2819 } 2820 2821 return true; 2822 } 2823 2824 static bool 2825 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 2826 { 2827 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2828 struct lba_range r; 2829 2830 switch (bdev_io->type) { 2831 case SPDK_BDEV_IO_TYPE_NVME_IO: 2832 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2833 /* Don't try to decode the NVMe command - just assume worst-case and that 2834 * it overlaps a locked range. 2835 */ 2836 return true; 2837 case SPDK_BDEV_IO_TYPE_WRITE: 2838 case SPDK_BDEV_IO_TYPE_UNMAP: 2839 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2840 case SPDK_BDEV_IO_TYPE_ZCOPY: 2841 r.offset = bdev_io->u.bdev.offset_blocks; 2842 r.length = bdev_io->u.bdev.num_blocks; 2843 if (!bdev_lba_range_overlapped(range, &r)) { 2844 /* This I/O doesn't overlap the specified LBA range. */ 2845 return false; 2846 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 2847 /* This I/O overlaps, but the I/O is on the same channel that locked this 2848 * range, and the caller_ctx is the same as the locked_ctx. This means 2849 * that this I/O is associated with the lock, and is allowed to execute. 
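 * Any other overlapping I/O submitted on this channel is instead parked on
 * the channel's io_locked list by bdev_io_submit() until the range is
 * unlocked.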
2850 */ 2851 return false; 2852 } else { 2853 return true; 2854 } 2855 default: 2856 return false; 2857 } 2858 } 2859 2860 void 2861 bdev_io_submit(struct spdk_bdev_io *bdev_io) 2862 { 2863 struct spdk_bdev *bdev = bdev_io->bdev; 2864 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 2865 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2866 2867 assert(thread != NULL); 2868 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2869 2870 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 2871 struct lba_range *range; 2872 2873 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 2874 if (bdev_io_range_is_locked(bdev_io, range)) { 2875 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 2876 return; 2877 } 2878 } 2879 } 2880 2881 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 2882 2883 if (bdev_io_should_split(bdev_io)) { 2884 bdev_io->internal.submit_tsc = spdk_get_ticks(); 2885 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 2886 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 2887 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 2888 spdk_bdev_get_name(bdev)); 2889 bdev_io_split(NULL, bdev_io); 2890 return; 2891 } 2892 2893 if (ch->flags & BDEV_CH_QOS_ENABLED) { 2894 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 2895 _bdev_io_submit(bdev_io); 2896 } else { 2897 bdev_io->internal.io_submit_ch = ch; 2898 bdev_io->internal.ch = bdev->internal.qos->ch; 2899 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 2900 } 2901 } else { 2902 _bdev_io_submit(bdev_io); 2903 } 2904 } 2905 2906 static inline void 2907 _bdev_io_copy_ext_opts(struct spdk_bdev_io *bdev_io, struct spdk_bdev_ext_io_opts *opts) 2908 { 2909 struct spdk_bdev_ext_io_opts *opts_copy = &bdev_io->internal.ext_opts_copy; 2910 2911 /* Zero part we don't copy */ 2912 memset(((char *)opts_copy) + opts->size, 0, sizeof(*opts) - opts->size); 2913 memcpy(opts_copy, opts, opts->size); 2914 opts_copy->size = sizeof(*opts_copy); 2915 opts_copy->metadata = bdev_io->u.bdev.md_buf; 2916 /* Save pointer to the copied ext_opts which will be used by bdev modules */ 2917 bdev_io->u.bdev.ext_opts = opts_copy; 2918 } 2919 2920 static inline void 2921 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 2922 { 2923 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 2924 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 2925 * For write operation we need to pull buffers from memory domain before submitting IO. 
2926 * Once read operation completes, we need to use memory_domain push functionality to 2927 * update data in original memory domain IO buffer 2928 * This IO request will go through a regular IO flow, so clear memory domains pointers in 2929 * the copied ext_opts */ 2930 bdev_io->internal.ext_opts_copy.memory_domain = NULL; 2931 bdev_io->internal.ext_opts_copy.memory_domain_ctx = NULL; 2932 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 2933 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 2934 } 2935 2936 static inline void 2937 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io, 2938 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 2939 { 2940 if (opts) { 2941 bool use_pull_push = opts->memory_domain && !desc->memory_domains_supported; 2942 assert(opts->size <= sizeof(*opts)); 2943 /* 2944 * copy if size is smaller than opts struct to avoid having to check size 2945 * on every access to bdev_io->u.bdev.ext_opts 2946 */ 2947 if (copy_opts || use_pull_push || opts->size < sizeof(*opts)) { 2948 _bdev_io_copy_ext_opts(bdev_io, opts); 2949 if (use_pull_push) { 2950 _bdev_io_ext_use_bounce_buffer(bdev_io); 2951 return; 2952 } 2953 } 2954 } 2955 bdev_io_submit(bdev_io); 2956 } 2957 2958 static void 2959 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 2960 { 2961 struct spdk_bdev *bdev = bdev_io->bdev; 2962 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2963 struct spdk_io_channel *ch = bdev_ch->channel; 2964 2965 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2966 2967 bdev_io->internal.in_submit_request = true; 2968 bdev->fn_table->submit_request(ch, bdev_io); 2969 bdev_io->internal.in_submit_request = false; 2970 } 2971 2972 void 2973 bdev_io_init(struct spdk_bdev_io *bdev_io, 2974 struct spdk_bdev *bdev, void *cb_arg, 2975 spdk_bdev_io_completion_cb cb) 2976 { 2977 bdev_io->bdev = bdev; 2978 bdev_io->internal.caller_ctx = cb_arg; 2979 bdev_io->internal.cb = cb; 2980 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 2981 bdev_io->internal.in_submit_request = false; 2982 bdev_io->internal.buf = NULL; 2983 bdev_io->internal.io_submit_ch = NULL; 2984 bdev_io->internal.orig_iovs = NULL; 2985 bdev_io->internal.orig_iovcnt = 0; 2986 bdev_io->internal.orig_md_iov.iov_base = NULL; 2987 bdev_io->internal.error.nvme.cdw0 = 0; 2988 bdev_io->num_retries = 0; 2989 bdev_io->internal.get_buf_cb = NULL; 2990 bdev_io->internal.get_aux_buf_cb = NULL; 2991 bdev_io->internal.ext_opts = NULL; 2992 bdev_io->internal.data_transfer_cpl = NULL; 2993 } 2994 2995 static bool 2996 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 2997 { 2998 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 2999 } 3000 3001 bool 3002 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 3003 { 3004 bool supported; 3005 3006 supported = bdev_io_type_supported(bdev, io_type); 3007 3008 if (!supported) { 3009 switch (io_type) { 3010 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 3011 /* The bdev layer will emulate write zeroes as long as write is supported. 
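 * (When emulated, the request is serviced with regular writes drawn from the
 * shared zero buffer, g_bdev_mgr.zero_buffer.)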
*/ 3012 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 3013 break; 3014 default: 3015 break; 3016 } 3017 } 3018 3019 return supported; 3020 } 3021 3022 int 3023 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3024 { 3025 if (bdev->fn_table->dump_info_json) { 3026 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3027 } 3028 3029 return 0; 3030 } 3031 3032 static void 3033 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3034 { 3035 uint32_t max_per_timeslice = 0; 3036 int i; 3037 3038 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3039 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3040 qos->rate_limits[i].max_per_timeslice = 0; 3041 continue; 3042 } 3043 3044 max_per_timeslice = qos->rate_limits[i].limit * 3045 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3046 3047 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3048 qos->rate_limits[i].min_per_timeslice); 3049 3050 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 3051 } 3052 3053 bdev_qos_set_ops(qos); 3054 } 3055 3056 static int 3057 bdev_channel_poll_qos(void *arg) 3058 { 3059 struct spdk_bdev_qos *qos = arg; 3060 uint64_t now = spdk_get_ticks(); 3061 int i; 3062 3063 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3064 /* We received our callback earlier than expected - return 3065 * immediately and wait to do accounting until at least one 3066 * timeslice has actually expired. This should never happen 3067 * with a well-behaved timer implementation. 3068 */ 3069 return SPDK_POLLER_IDLE; 3070 } 3071 3072 /* Reset for next round of rate limiting */ 3073 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3074 /* We may have allowed the IOs or bytes to slightly overrun in the last 3075 * timeslice. remaining_this_timeslice is signed, so if it's negative 3076 * here, we'll account for the overrun so that the next timeslice will 3077 * be appropriately reduced. 3078 */ 3079 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 3080 qos->rate_limits[i].remaining_this_timeslice = 0; 3081 } 3082 } 3083 3084 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3085 qos->last_timeslice += qos->timeslice_size; 3086 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3087 qos->rate_limits[i].remaining_this_timeslice += 3088 qos->rate_limits[i].max_per_timeslice; 3089 } 3090 } 3091 3092 return bdev_qos_io_submit(qos->ch, qos); 3093 } 3094 3095 static void 3096 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3097 { 3098 struct spdk_bdev_shared_resource *shared_resource; 3099 struct lba_range *range; 3100 3101 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3102 range = TAILQ_FIRST(&ch->locked_ranges); 3103 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3104 free(range); 3105 } 3106 3107 spdk_put_io_channel(ch->channel); 3108 3109 shared_resource = ch->shared_resource; 3110 3111 assert(TAILQ_EMPTY(&ch->io_locked)); 3112 assert(TAILQ_EMPTY(&ch->io_submitted)); 3113 assert(ch->io_outstanding == 0); 3114 assert(shared_resource->ref > 0); 3115 shared_resource->ref--; 3116 if (shared_resource->ref == 0) { 3117 assert(shared_resource->io_outstanding == 0); 3118 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3119 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3120 free(shared_resource); 3121 } 3122 } 3123 3124 /* Caller must hold bdev->internal.mutex. 
*/ 3125 static void 3126 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3127 { 3128 struct spdk_bdev_qos *qos = bdev->internal.qos; 3129 int i; 3130 3131 /* Rate limiting on this bdev enabled */ 3132 if (qos) { 3133 if (qos->ch == NULL) { 3134 struct spdk_io_channel *io_ch; 3135 3136 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3137 bdev->name, spdk_get_thread()); 3138 3139 /* No qos channel has been selected, so set one up */ 3140 3141 /* Take another reference to ch */ 3142 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3143 assert(io_ch != NULL); 3144 qos->ch = ch; 3145 3146 qos->thread = spdk_io_channel_get_thread(io_ch); 3147 3148 TAILQ_INIT(&qos->queued); 3149 3150 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3151 if (bdev_qos_is_iops_rate_limit(i) == true) { 3152 qos->rate_limits[i].min_per_timeslice = 3153 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3154 } else { 3155 qos->rate_limits[i].min_per_timeslice = 3156 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3157 } 3158 3159 if (qos->rate_limits[i].limit == 0) { 3160 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3161 } 3162 } 3163 bdev_qos_update_max_quota_per_timeslice(qos); 3164 qos->timeslice_size = 3165 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3166 qos->last_timeslice = spdk_get_ticks(); 3167 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3168 qos, 3169 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3170 } 3171 3172 ch->flags |= BDEV_CH_QOS_ENABLED; 3173 } 3174 } 3175 3176 struct poll_timeout_ctx { 3177 struct spdk_bdev_desc *desc; 3178 uint64_t timeout_in_sec; 3179 spdk_bdev_io_timeout_cb cb_fn; 3180 void *cb_arg; 3181 }; 3182 3183 static void 3184 bdev_desc_free(struct spdk_bdev_desc *desc) 3185 { 3186 pthread_mutex_destroy(&desc->mutex); 3187 free(desc->media_events_buffer); 3188 free(desc); 3189 } 3190 3191 static void 3192 bdev_channel_poll_timeout_io_done(struct spdk_io_channel_iter *i, int status) 3193 { 3194 struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3195 struct spdk_bdev_desc *desc = ctx->desc; 3196 3197 free(ctx); 3198 3199 pthread_mutex_lock(&desc->mutex); 3200 desc->refs--; 3201 if (desc->closed == true && desc->refs == 0) { 3202 pthread_mutex_unlock(&desc->mutex); 3203 bdev_desc_free(desc); 3204 return; 3205 } 3206 pthread_mutex_unlock(&desc->mutex); 3207 } 3208 3209 static void 3210 bdev_channel_poll_timeout_io(struct spdk_io_channel_iter *i) 3211 { 3212 struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3213 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 3214 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch); 3215 struct spdk_bdev_desc *desc = ctx->desc; 3216 struct spdk_bdev_io *bdev_io; 3217 uint64_t now; 3218 3219 pthread_mutex_lock(&desc->mutex); 3220 if (desc->closed == true) { 3221 pthread_mutex_unlock(&desc->mutex); 3222 spdk_for_each_channel_continue(i, -1); 3223 return; 3224 } 3225 pthread_mutex_unlock(&desc->mutex); 3226 3227 now = spdk_get_ticks(); 3228 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3229 /* Exclude any I/O that are generated via splitting. */ 3230 if (bdev_io->internal.cb == bdev_io_split_done) { 3231 continue; 3232 } 3233 3234 /* Once we find an I/O that has not timed out, we can immediately 3235 * exit the loop. 
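 * io_submitted is kept in submission order (bdev_io_submit() appends new I/O
 * at the tail of the list), so every entry after the first one that has not
 * timed out was submitted even later.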
3236 */ 3237 if (now < (bdev_io->internal.submit_tsc + 3238 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3239 goto end; 3240 } 3241 3242 if (bdev_io->internal.desc == desc) { 3243 ctx->cb_fn(ctx->cb_arg, bdev_io); 3244 } 3245 } 3246 3247 end: 3248 spdk_for_each_channel_continue(i, 0); 3249 } 3250 3251 static int 3252 bdev_poll_timeout_io(void *arg) 3253 { 3254 struct spdk_bdev_desc *desc = arg; 3255 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3256 struct poll_timeout_ctx *ctx; 3257 3258 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3259 if (!ctx) { 3260 SPDK_ERRLOG("failed to allocate memory\n"); 3261 return SPDK_POLLER_BUSY; 3262 } 3263 ctx->desc = desc; 3264 ctx->cb_arg = desc->cb_arg; 3265 ctx->cb_fn = desc->cb_fn; 3266 ctx->timeout_in_sec = desc->timeout_in_sec; 3267 3268 /* Take a ref on the descriptor in case it gets closed while we are checking 3269 * all of the channels. 3270 */ 3271 pthread_mutex_lock(&desc->mutex); 3272 desc->refs++; 3273 pthread_mutex_unlock(&desc->mutex); 3274 3275 spdk_for_each_channel(__bdev_to_io_dev(bdev), 3276 bdev_channel_poll_timeout_io, 3277 ctx, 3278 bdev_channel_poll_timeout_io_done); 3279 3280 return SPDK_POLLER_BUSY; 3281 } 3282 3283 int 3284 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3285 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3286 { 3287 assert(desc->thread == spdk_get_thread()); 3288 3289 spdk_poller_unregister(&desc->io_timeout_poller); 3290 3291 if (timeout_in_sec) { 3292 assert(cb_fn != NULL); 3293 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3294 desc, 3295 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3296 1000); 3297 if (desc->io_timeout_poller == NULL) { 3298 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3299 return -1; 3300 } 3301 } 3302 3303 desc->cb_fn = cb_fn; 3304 desc->cb_arg = cb_arg; 3305 desc->timeout_in_sec = timeout_in_sec; 3306 3307 return 0; 3308 } 3309 3310 static int 3311 bdev_channel_create(void *io_device, void *ctx_buf) 3312 { 3313 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3314 struct spdk_bdev_channel *ch = ctx_buf; 3315 struct spdk_io_channel *mgmt_io_ch; 3316 struct spdk_bdev_mgmt_channel *mgmt_ch; 3317 struct spdk_bdev_shared_resource *shared_resource; 3318 struct lba_range *range; 3319 3320 ch->bdev = bdev; 3321 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3322 if (!ch->channel) { 3323 return -1; 3324 } 3325 3326 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3327 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3328 3329 assert(ch->histogram == NULL); 3330 if (bdev->internal.histogram_enabled) { 3331 ch->histogram = spdk_histogram_data_alloc(); 3332 if (ch->histogram == NULL) { 3333 SPDK_ERRLOG("Could not allocate histogram\n"); 3334 } 3335 } 3336 3337 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3338 if (!mgmt_io_ch) { 3339 spdk_put_io_channel(ch->channel); 3340 return -1; 3341 } 3342 3343 mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch); 3344 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3345 if (shared_resource->shared_ch == ch->channel) { 3346 spdk_put_io_channel(mgmt_io_ch); 3347 shared_resource->ref++; 3348 break; 3349 } 3350 } 3351 3352 if (shared_resource == NULL) { 3353 shared_resource = calloc(1, sizeof(*shared_resource)); 3354 if (shared_resource == NULL) { 3355 spdk_put_io_channel(ch->channel); 3356 spdk_put_io_channel(mgmt_io_ch); 3357 return -1; 3358 } 3359 3360 shared_resource->mgmt_ch = mgmt_ch; 3361 
shared_resource->io_outstanding = 0; 3362 TAILQ_INIT(&shared_resource->nomem_io); 3363 shared_resource->nomem_threshold = 0; 3364 shared_resource->shared_ch = ch->channel; 3365 shared_resource->ref = 1; 3366 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3367 } 3368 3369 memset(&ch->stat, 0, sizeof(ch->stat)); 3370 ch->stat.ticks_rate = spdk_get_ticks_hz(); 3371 ch->io_outstanding = 0; 3372 TAILQ_INIT(&ch->queued_resets); 3373 TAILQ_INIT(&ch->locked_ranges); 3374 ch->flags = 0; 3375 ch->shared_resource = shared_resource; 3376 3377 TAILQ_INIT(&ch->io_submitted); 3378 TAILQ_INIT(&ch->io_locked); 3379 3380 #ifdef SPDK_CONFIG_VTUNE 3381 { 3382 char *name; 3383 __itt_init_ittlib(NULL, 0); 3384 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3385 if (!name) { 3386 bdev_channel_destroy_resource(ch); 3387 return -1; 3388 } 3389 ch->handle = __itt_string_handle_create(name); 3390 free(name); 3391 ch->start_tsc = spdk_get_ticks(); 3392 ch->interval_tsc = spdk_get_ticks_hz() / 100; 3393 memset(&ch->prev_stat, 0, sizeof(ch->prev_stat)); 3394 } 3395 #endif 3396 3397 pthread_mutex_lock(&bdev->internal.mutex); 3398 bdev_enable_qos(bdev, ch); 3399 3400 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 3401 struct lba_range *new_range; 3402 3403 new_range = calloc(1, sizeof(*new_range)); 3404 if (new_range == NULL) { 3405 pthread_mutex_unlock(&bdev->internal.mutex); 3406 bdev_channel_destroy_resource(ch); 3407 return -1; 3408 } 3409 new_range->length = range->length; 3410 new_range->offset = range->offset; 3411 new_range->locked_ctx = range->locked_ctx; 3412 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 3413 } 3414 3415 pthread_mutex_unlock(&bdev->internal.mutex); 3416 3417 return 0; 3418 } 3419 3420 /* 3421 * Abort I/O that are waiting on a data buffer. These types of I/O are 3422 * linked using the spdk_bdev_io internal.buf_link TAILQ_ENTRY. 3423 */ 3424 static void 3425 bdev_abort_all_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch) 3426 { 3427 bdev_io_stailq_t tmp; 3428 struct spdk_bdev_io *bdev_io; 3429 3430 STAILQ_INIT(&tmp); 3431 3432 while (!STAILQ_EMPTY(queue)) { 3433 bdev_io = STAILQ_FIRST(queue); 3434 STAILQ_REMOVE_HEAD(queue, internal.buf_link); 3435 if (bdev_io->internal.ch == ch) { 3436 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3437 } else { 3438 STAILQ_INSERT_TAIL(&tmp, bdev_io, internal.buf_link); 3439 } 3440 } 3441 3442 STAILQ_SWAP(&tmp, queue, spdk_bdev_io); 3443 } 3444 3445 /* 3446 * Abort I/O that are queued waiting for submission. These types of I/O are 3447 * linked using the spdk_bdev_io link TAILQ_ENTRY. 3448 */ 3449 static void 3450 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 3451 { 3452 struct spdk_bdev_io *bdev_io, *tmp; 3453 3454 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 3455 if (bdev_io->internal.ch == ch) { 3456 TAILQ_REMOVE(queue, bdev_io, internal.link); 3457 /* 3458 * spdk_bdev_io_complete() assumes that the completed I/O had 3459 * been submitted to the bdev module. Since in this case it 3460 * hadn't, bump io_outstanding to account for the decrement 3461 * that spdk_bdev_io_complete() will do. 
3462 */ 3463 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 3464 ch->io_outstanding++; 3465 ch->shared_resource->io_outstanding++; 3466 } 3467 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3468 } 3469 } 3470 } 3471 3472 static bool 3473 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3474 { 3475 struct spdk_bdev_io *bdev_io; 3476 3477 TAILQ_FOREACH(bdev_io, queue, internal.link) { 3478 if (bdev_io == bio_to_abort) { 3479 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 3480 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3481 return true; 3482 } 3483 } 3484 3485 return false; 3486 } 3487 3488 static bool 3489 bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3490 { 3491 struct spdk_bdev_io *bdev_io; 3492 3493 STAILQ_FOREACH(bdev_io, queue, internal.buf_link) { 3494 if (bdev_io == bio_to_abort) { 3495 STAILQ_REMOVE(queue, bio_to_abort, spdk_bdev_io, internal.buf_link); 3496 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3497 return true; 3498 } 3499 } 3500 3501 return false; 3502 } 3503 3504 static void 3505 bdev_qos_channel_destroy(void *cb_arg) 3506 { 3507 struct spdk_bdev_qos *qos = cb_arg; 3508 3509 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 3510 spdk_poller_unregister(&qos->poller); 3511 3512 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 3513 3514 free(qos); 3515 } 3516 3517 static int 3518 bdev_qos_destroy(struct spdk_bdev *bdev) 3519 { 3520 int i; 3521 3522 /* 3523 * Cleanly shutting down the QoS poller is tricky, because 3524 * during the asynchronous operation the user could open 3525 * a new descriptor and create a new channel, spawning 3526 * a new QoS poller. 3527 * 3528 * The strategy is to create a new QoS structure here and swap it 3529 * in. The shutdown path then continues to refer to the old one 3530 * until it completes and then releases it. 3531 */ 3532 struct spdk_bdev_qos *new_qos, *old_qos; 3533 3534 old_qos = bdev->internal.qos; 3535 3536 new_qos = calloc(1, sizeof(*new_qos)); 3537 if (!new_qos) { 3538 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 3539 return -ENOMEM; 3540 } 3541 3542 /* Copy the old QoS data into the newly allocated structure */ 3543 memcpy(new_qos, old_qos, sizeof(*new_qos)); 3544 3545 /* Zero out the key parts of the QoS structure */ 3546 new_qos->ch = NULL; 3547 new_qos->thread = NULL; 3548 new_qos->poller = NULL; 3549 TAILQ_INIT(&new_qos->queued); 3550 /* 3551 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 3552 * It will be used later for the new QoS structure. 3553 */ 3554 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3555 new_qos->rate_limits[i].remaining_this_timeslice = 0; 3556 new_qos->rate_limits[i].min_per_timeslice = 0; 3557 new_qos->rate_limits[i].max_per_timeslice = 0; 3558 } 3559 3560 bdev->internal.qos = new_qos; 3561 3562 if (old_qos->thread == NULL) { 3563 free(old_qos); 3564 } else { 3565 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 3566 } 3567 3568 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 3569 * been destroyed yet. The destruction path will end up waiting for the final 3570 * channel to be put before it releases resources. 
*/ 3571 3572 return 0; 3573 } 3574 3575 static void 3576 bdev_io_stat_add(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 3577 { 3578 total->bytes_read += add->bytes_read; 3579 total->num_read_ops += add->num_read_ops; 3580 total->bytes_written += add->bytes_written; 3581 total->num_write_ops += add->num_write_ops; 3582 total->bytes_unmapped += add->bytes_unmapped; 3583 total->num_unmap_ops += add->num_unmap_ops; 3584 total->read_latency_ticks += add->read_latency_ticks; 3585 total->write_latency_ticks += add->write_latency_ticks; 3586 total->unmap_latency_ticks += add->unmap_latency_ticks; 3587 } 3588 3589 static void 3590 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 3591 { 3592 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 3593 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 3594 3595 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 3596 bdev_abort_all_buf_io(&mgmt_ch->need_buf_small, ch); 3597 bdev_abort_all_buf_io(&mgmt_ch->need_buf_large, ch); 3598 } 3599 3600 static void 3601 bdev_channel_destroy(void *io_device, void *ctx_buf) 3602 { 3603 struct spdk_bdev_channel *ch = ctx_buf; 3604 3605 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 3606 spdk_get_thread()); 3607 3608 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 3609 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3610 3611 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 3612 pthread_mutex_lock(&ch->bdev->internal.mutex); 3613 bdev_io_stat_add(&ch->bdev->internal.stat, &ch->stat); 3614 pthread_mutex_unlock(&ch->bdev->internal.mutex); 3615 3616 bdev_abort_all_queued_io(&ch->queued_resets, ch); 3617 3618 bdev_channel_abort_queued_ios(ch); 3619 3620 if (ch->histogram) { 3621 spdk_histogram_data_free(ch->histogram); 3622 } 3623 3624 bdev_channel_destroy_resource(ch); 3625 } 3626 3627 /* 3628 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 3629 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
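 * On success RB_INSERT() returns NULL; only a non-NULL return indicates a
 * duplicate name, which bdev_name_add() reports as -EEXIST.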
3630 */ 3631 static int 3632 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 3633 { 3634 struct spdk_bdev_name *tmp; 3635 3636 bdev_name->name = strdup(name); 3637 if (bdev_name->name == NULL) { 3638 SPDK_ERRLOG("Unable to allocate bdev name\n"); 3639 return -ENOMEM; 3640 } 3641 3642 bdev_name->bdev = bdev; 3643 3644 pthread_mutex_lock(&g_bdev_mgr.mutex); 3645 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3646 pthread_mutex_unlock(&g_bdev_mgr.mutex); 3647 3648 if (tmp != NULL) { 3649 SPDK_ERRLOG("Bdev name %s already exists\n", name); 3650 free(bdev_name->name); 3651 return -EEXIST; 3652 } 3653 3654 return 0; 3655 } 3656 3657 static void 3658 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 3659 { 3660 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3661 free(bdev_name->name); 3662 } 3663 3664 static void 3665 bdev_name_del(struct spdk_bdev_name *bdev_name) 3666 { 3667 pthread_mutex_lock(&g_bdev_mgr.mutex); 3668 bdev_name_del_unsafe(bdev_name); 3669 pthread_mutex_unlock(&g_bdev_mgr.mutex); 3670 } 3671 3672 int 3673 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 3674 { 3675 struct spdk_bdev_alias *tmp; 3676 int ret; 3677 3678 if (alias == NULL) { 3679 SPDK_ERRLOG("Empty alias passed\n"); 3680 return -EINVAL; 3681 } 3682 3683 tmp = calloc(1, sizeof(*tmp)); 3684 if (tmp == NULL) { 3685 SPDK_ERRLOG("Unable to allocate alias\n"); 3686 return -ENOMEM; 3687 } 3688 3689 ret = bdev_name_add(&tmp->alias, bdev, alias); 3690 if (ret != 0) { 3691 free(tmp); 3692 return ret; 3693 } 3694 3695 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 3696 3697 return 0; 3698 } 3699 3700 static int 3701 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 3702 void (*alias_del_fn)(struct spdk_bdev_name *n)) 3703 { 3704 struct spdk_bdev_alias *tmp; 3705 3706 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 3707 if (strcmp(alias, tmp->alias.name) == 0) { 3708 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 3709 alias_del_fn(&tmp->alias); 3710 free(tmp); 3711 return 0; 3712 } 3713 } 3714 3715 return -ENOENT; 3716 } 3717 3718 int 3719 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 3720 { 3721 int rc; 3722 3723 rc = bdev_alias_del(bdev, alias, bdev_name_del); 3724 if (rc == -ENOENT) { 3725 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 3726 } 3727 3728 return rc; 3729 } 3730 3731 void 3732 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 3733 { 3734 struct spdk_bdev_alias *p, *tmp; 3735 3736 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 3737 TAILQ_REMOVE(&bdev->aliases, p, tailq); 3738 bdev_name_del(&p->alias); 3739 free(p); 3740 } 3741 } 3742 3743 struct spdk_io_channel * 3744 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 3745 { 3746 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 3747 } 3748 3749 void * 3750 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 3751 { 3752 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3753 void *ctx = NULL; 3754 3755 if (bdev->fn_table->get_module_ctx) { 3756 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 3757 } 3758 3759 return ctx; 3760 } 3761 3762 const char * 3763 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 3764 { 3765 return bdev->module->name; 3766 } 3767 3768 const char * 3769 spdk_bdev_get_name(const struct spdk_bdev *bdev) 3770 { 3771 return bdev->name; 3772 } 3773 3774 const char * 3775 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 3776 { 3777 return bdev->product_name; 3778 } 3779 
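/*
 * Illustrative sketch (not part of this file): how a typical consumer opens a
 * bdev by name from an SPDK thread, takes an I/O channel and uses the getters
 * in this section to query the device geometry. It assumes the usual
 * application headers (spdk/bdev.h, spdk/thread.h plus stdio/inttypes via
 * spdk/stdinc.h); the function and callback names are made up for the example,
 * and the event callback is left empty where a real application would handle
 * SPDK_BDEV_EVENT_REMOVE.
 *
 *	static void
 *	example_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev,
 *			 void *event_ctx)
 *	{
 *	}
 *
 *	static int
 *	example_query_bdev(const char *name)
 *	{
 *		struct spdk_bdev_desc *desc;
 *		struct spdk_io_channel *ch;
 *		struct spdk_bdev *bdev;
 *		int rc;
 *
 *		rc = spdk_bdev_open_ext(name, false, example_event_cb, NULL, &desc);
 *		if (rc != 0) {
 *			return rc;
 *		}
 *
 *		bdev = spdk_bdev_desc_get_bdev(desc);
 *		ch = spdk_bdev_get_io_channel(desc);
 *		if (ch == NULL) {
 *			spdk_bdev_close(desc);
 *			return -ENOMEM;
 *		}
 *
 *		printf("%s: %u-byte blocks, %" PRIu64 " blocks\n",
 *		       spdk_bdev_get_name(bdev),
 *		       spdk_bdev_get_block_size(bdev),
 *		       spdk_bdev_get_num_blocks(bdev));
 *
 *		spdk_put_io_channel(ch);
 *		spdk_bdev_close(desc);
 *		return 0;
 *	}
 */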
3780 const struct spdk_bdev_aliases_list * 3781 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 3782 { 3783 return &bdev->aliases; 3784 } 3785 3786 uint32_t 3787 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 3788 { 3789 return bdev->blocklen; 3790 } 3791 3792 uint32_t 3793 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 3794 { 3795 return bdev->write_unit_size; 3796 } 3797 3798 uint64_t 3799 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 3800 { 3801 return bdev->blockcnt; 3802 } 3803 3804 const char * 3805 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 3806 { 3807 return qos_rpc_type[type]; 3808 } 3809 3810 void 3811 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 3812 { 3813 int i; 3814 3815 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 3816 3817 pthread_mutex_lock(&bdev->internal.mutex); 3818 if (bdev->internal.qos) { 3819 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3820 if (bdev->internal.qos->rate_limits[i].limit != 3821 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3822 limits[i] = bdev->internal.qos->rate_limits[i].limit; 3823 if (bdev_qos_is_iops_rate_limit(i) == false) { 3824 /* Change from Byte to Megabyte which is user visible. */ 3825 limits[i] = limits[i] / 1024 / 1024; 3826 } 3827 } 3828 } 3829 } 3830 pthread_mutex_unlock(&bdev->internal.mutex); 3831 } 3832 3833 size_t 3834 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 3835 { 3836 return 1 << bdev->required_alignment; 3837 } 3838 3839 uint32_t 3840 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 3841 { 3842 return bdev->optimal_io_boundary; 3843 } 3844 3845 bool 3846 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 3847 { 3848 return bdev->write_cache; 3849 } 3850 3851 const struct spdk_uuid * 3852 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 3853 { 3854 return &bdev->uuid; 3855 } 3856 3857 uint16_t 3858 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 3859 { 3860 return bdev->acwu; 3861 } 3862 3863 uint32_t 3864 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 3865 { 3866 return bdev->md_len; 3867 } 3868 3869 bool 3870 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 3871 { 3872 return (bdev->md_len != 0) && bdev->md_interleave; 3873 } 3874 3875 bool 3876 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 3877 { 3878 return (bdev->md_len != 0) && !bdev->md_interleave; 3879 } 3880 3881 bool 3882 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 3883 { 3884 return bdev->zoned; 3885 } 3886 3887 uint32_t 3888 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 3889 { 3890 if (spdk_bdev_is_md_interleaved(bdev)) { 3891 return bdev->blocklen - bdev->md_len; 3892 } else { 3893 return bdev->blocklen; 3894 } 3895 } 3896 3897 uint32_t 3898 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 3899 { 3900 return bdev->phys_blocklen; 3901 } 3902 3903 static uint32_t 3904 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 3905 { 3906 if (!spdk_bdev_is_md_interleaved(bdev)) { 3907 return bdev->blocklen + bdev->md_len; 3908 } else { 3909 return bdev->blocklen; 3910 } 3911 } 3912 3913 /* We have to use the typedef in the function declaration to appease astyle. 
*/ 3914 typedef enum spdk_dif_type spdk_dif_type_t; 3915 3916 spdk_dif_type_t 3917 spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 3918 { 3919 if (bdev->md_len != 0) { 3920 return bdev->dif_type; 3921 } else { 3922 return SPDK_DIF_DISABLE; 3923 } 3924 } 3925 3926 bool 3927 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 3928 { 3929 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 3930 return bdev->dif_is_head_of_md; 3931 } else { 3932 return false; 3933 } 3934 } 3935 3936 bool 3937 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 3938 enum spdk_dif_check_type check_type) 3939 { 3940 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 3941 return false; 3942 } 3943 3944 switch (check_type) { 3945 case SPDK_DIF_CHECK_TYPE_REFTAG: 3946 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 3947 case SPDK_DIF_CHECK_TYPE_APPTAG: 3948 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 3949 case SPDK_DIF_CHECK_TYPE_GUARD: 3950 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 3951 default: 3952 return false; 3953 } 3954 } 3955 3956 uint64_t 3957 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 3958 { 3959 return bdev->internal.measured_queue_depth; 3960 } 3961 3962 uint64_t 3963 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 3964 { 3965 return bdev->internal.period; 3966 } 3967 3968 uint64_t 3969 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 3970 { 3971 return bdev->internal.weighted_io_time; 3972 } 3973 3974 uint64_t 3975 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 3976 { 3977 return bdev->internal.io_time; 3978 } 3979 3980 static void bdev_update_qd_sampling_period(void *ctx); 3981 3982 static void 3983 _calculate_measured_qd_cpl(struct spdk_io_channel_iter *i, int status) 3984 { 3985 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 3986 3987 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 3988 3989 if (bdev->internal.measured_queue_depth) { 3990 bdev->internal.io_time += bdev->internal.period; 3991 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 3992 } 3993 3994 bdev->internal.qd_poll_in_progress = false; 3995 3996 bdev_update_qd_sampling_period(bdev); 3997 } 3998 3999 static void 4000 _calculate_measured_qd(struct spdk_io_channel_iter *i) 4001 { 4002 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 4003 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 4004 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(io_ch); 4005 4006 bdev->internal.temporary_queue_depth += ch->io_outstanding; 4007 spdk_for_each_channel_continue(i, 0); 4008 } 4009 4010 static int 4011 bdev_calculate_measured_queue_depth(void *ctx) 4012 { 4013 struct spdk_bdev *bdev = ctx; 4014 4015 bdev->internal.qd_poll_in_progress = true; 4016 bdev->internal.temporary_queue_depth = 0; 4017 spdk_for_each_channel(__bdev_to_io_dev(bdev), _calculate_measured_qd, bdev, 4018 _calculate_measured_qd_cpl); 4019 return SPDK_POLLER_BUSY; 4020 } 4021 4022 static void 4023 bdev_update_qd_sampling_period(void *ctx) 4024 { 4025 struct spdk_bdev *bdev = ctx; 4026 4027 if (bdev->internal.period == bdev->internal.new_period) { 4028 return; 4029 } 4030 4031 if (bdev->internal.qd_poll_in_progress) { 4032 return; 4033 } 4034 4035 bdev->internal.period = bdev->internal.new_period; 4036 4037 spdk_poller_unregister(&bdev->internal.qd_poller); 4038 if (bdev->internal.period != 0) { 4039 bdev->internal.qd_poller = 
SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4040 bdev, bdev->internal.period); 4041 } else { 4042 spdk_bdev_close(bdev->internal.qd_desc); 4043 bdev->internal.qd_desc = NULL; 4044 } 4045 } 4046 4047 static void 4048 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 4049 { 4050 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 4051 } 4052 4053 void 4054 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 4055 { 4056 int rc; 4057 4058 if (bdev->internal.new_period == period) { 4059 return; 4060 } 4061 4062 bdev->internal.new_period = period; 4063 4064 if (bdev->internal.qd_desc != NULL) { 4065 assert(bdev->internal.period != 0); 4066 4067 spdk_thread_send_msg(bdev->internal.qd_desc->thread, 4068 bdev_update_qd_sampling_period, bdev); 4069 return; 4070 } 4071 4072 assert(bdev->internal.period == 0); 4073 4074 rc = spdk_bdev_open_ext(spdk_bdev_get_name(bdev), false, _tmp_bdev_event_cb, 4075 NULL, &bdev->internal.qd_desc); 4076 if (rc != 0) { 4077 return; 4078 } 4079 4080 bdev->internal.period = period; 4081 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, 4082 bdev, period); 4083 } 4084 4085 struct bdev_get_current_qd_ctx { 4086 uint64_t current_qd; 4087 spdk_bdev_get_current_qd_cb cb_fn; 4088 void *cb_arg; 4089 }; 4090 4091 static void 4092 bdev_get_current_qd_done(struct spdk_io_channel_iter *i, int status) 4093 { 4094 struct bdev_get_current_qd_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4095 void *io_dev = spdk_io_channel_iter_get_io_device(i); 4096 4097 ctx->cb_fn(__bdev_from_io_dev(io_dev), ctx->current_qd, ctx->cb_arg, 0); 4098 4099 free(ctx); 4100 } 4101 4102 static void 4103 bdev_get_current_qd(struct spdk_io_channel_iter *i) 4104 { 4105 struct bdev_get_current_qd_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 4106 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 4107 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch); 4108 4109 ctx->current_qd += bdev_ch->io_outstanding; 4110 4111 spdk_for_each_channel_continue(i, 0); 4112 } 4113 4114 void 4115 spdk_bdev_get_current_qd(struct spdk_bdev *bdev, spdk_bdev_get_current_qd_cb cb_fn, 4116 void *cb_arg) 4117 { 4118 struct bdev_get_current_qd_ctx *ctx; 4119 4120 assert(cb_fn != NULL); 4121 4122 ctx = calloc(1, sizeof(*ctx)); 4123 if (ctx == NULL) { 4124 cb_fn(bdev, 0, cb_arg, -ENOMEM); 4125 return; 4126 } 4127 4128 ctx->cb_fn = cb_fn; 4129 ctx->cb_arg = cb_arg; 4130 4131 spdk_for_each_channel(__bdev_to_io_dev(bdev), 4132 bdev_get_current_qd, 4133 ctx, 4134 bdev_get_current_qd_done); 4135 } 4136 4137 static void 4138 _resize_notify(void *arg) 4139 { 4140 struct spdk_bdev_desc *desc = arg; 4141 4142 pthread_mutex_lock(&desc->mutex); 4143 desc->refs--; 4144 if (!desc->closed) { 4145 pthread_mutex_unlock(&desc->mutex); 4146 desc->callback.event_fn(SPDK_BDEV_EVENT_RESIZE, 4147 desc->bdev, 4148 desc->callback.ctx); 4149 return; 4150 } else if (0 == desc->refs) { 4151 /* This descriptor was closed after this resize_notify message was sent. 4152 * spdk_bdev_close() could not free the descriptor since this message was 4153 * in flight, so we free it now using bdev_desc_free(). 
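 * (The descriptor's refs count was incremented in spdk_bdev_notify_blockcnt_change()
 * before this message was sent and is dropped at the top of this function, which
 * is what allows the refs == 0 check above to detect this case.)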
4154 */ 4155 pthread_mutex_unlock(&desc->mutex); 4156 bdev_desc_free(desc); 4157 return; 4158 } 4159 pthread_mutex_unlock(&desc->mutex); 4160 } 4161 4162 int 4163 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 4164 { 4165 struct spdk_bdev_desc *desc; 4166 int ret; 4167 4168 if (size == bdev->blockcnt) { 4169 return 0; 4170 } 4171 4172 pthread_mutex_lock(&bdev->internal.mutex); 4173 4174 /* bdev has open descriptors */ 4175 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 4176 bdev->blockcnt > size) { 4177 ret = -EBUSY; 4178 } else { 4179 bdev->blockcnt = size; 4180 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 4181 pthread_mutex_lock(&desc->mutex); 4182 if (!desc->closed) { 4183 desc->refs++; 4184 spdk_thread_send_msg(desc->thread, _resize_notify, desc); 4185 } 4186 pthread_mutex_unlock(&desc->mutex); 4187 } 4188 ret = 0; 4189 } 4190 4191 pthread_mutex_unlock(&bdev->internal.mutex); 4192 4193 return ret; 4194 } 4195 4196 /* 4197 * Convert I/O offset and length from bytes to blocks. 4198 * 4199 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 4200 */ 4201 static uint64_t 4202 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 4203 uint64_t num_bytes, uint64_t *num_blocks) 4204 { 4205 uint32_t block_size = bdev->blocklen; 4206 uint8_t shift_cnt; 4207 4208 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 4209 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 4210 shift_cnt = spdk_u32log2(block_size); 4211 *offset_blocks = offset_bytes >> shift_cnt; 4212 *num_blocks = num_bytes >> shift_cnt; 4213 return (offset_bytes - (*offset_blocks << shift_cnt)) | 4214 (num_bytes - (*num_blocks << shift_cnt)); 4215 } else { 4216 *offset_blocks = offset_bytes / block_size; 4217 *num_blocks = num_bytes / block_size; 4218 return (offset_bytes % block_size) | (num_bytes % block_size); 4219 } 4220 } 4221 4222 static bool 4223 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 4224 { 4225 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 4226 * has been an overflow and hence the offset has been wrapped around */ 4227 if (offset_blocks + num_blocks < offset_blocks) { 4228 return false; 4229 } 4230 4231 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 4232 if (offset_blocks + num_blocks > bdev->blockcnt) { 4233 return false; 4234 } 4235 4236 return true; 4237 } 4238 4239 static void 4240 bdev_seek_complete_cb(void *ctx) 4241 { 4242 struct spdk_bdev_io *bdev_io = ctx; 4243 4244 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4245 bdev_io->internal.cb(bdev_io, true, bdev_io->internal.caller_ctx); 4246 } 4247 4248 static int 4249 bdev_seek(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4250 uint64_t offset_blocks, enum spdk_bdev_io_type io_type, 4251 spdk_bdev_io_completion_cb cb, void *cb_arg) 4252 { 4253 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4254 struct spdk_bdev_io *bdev_io; 4255 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4256 4257 assert(io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA || io_type == SPDK_BDEV_IO_TYPE_SEEK_HOLE); 4258 4259 /* Check if offset_blocks is valid looking at the validity of one block */ 4260 if (!bdev_io_valid_blocks(bdev, offset_blocks, 1)) { 4261 return -EINVAL; 4262 } 4263 4264 bdev_io = bdev_channel_get_io(channel); 4265 if (!bdev_io) { 4266 return -ENOMEM; 4267 } 
4268 4269 bdev_io->internal.ch = channel; 4270 bdev_io->internal.desc = desc; 4271 bdev_io->type = io_type; 4272 bdev_io->u.bdev.offset_blocks = offset_blocks; 4273 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4274 4275 if (!spdk_bdev_io_type_supported(bdev, io_type)) { 4276 /* In case bdev doesn't support seek to next data/hole offset, 4277 * it is assumed that only data and no holes are present */ 4278 if (io_type == SPDK_BDEV_IO_TYPE_SEEK_DATA) { 4279 bdev_io->u.bdev.seek.offset = offset_blocks; 4280 } else { 4281 bdev_io->u.bdev.seek.offset = UINT64_MAX; 4282 } 4283 4284 spdk_thread_send_msg(spdk_get_thread(), bdev_seek_complete_cb, bdev_io); 4285 return 0; 4286 } 4287 4288 bdev_io_submit(bdev_io); 4289 return 0; 4290 } 4291 4292 int 4293 spdk_bdev_seek_data(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4294 uint64_t offset_blocks, 4295 spdk_bdev_io_completion_cb cb, void *cb_arg) 4296 { 4297 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_DATA, cb, cb_arg); 4298 } 4299 4300 int 4301 spdk_bdev_seek_hole(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4302 uint64_t offset_blocks, 4303 spdk_bdev_io_completion_cb cb, void *cb_arg) 4304 { 4305 return bdev_seek(desc, ch, offset_blocks, SPDK_BDEV_IO_TYPE_SEEK_HOLE, cb, cb_arg); 4306 } 4307 4308 uint64_t 4309 spdk_bdev_io_get_seek_offset(const struct spdk_bdev_io *bdev_io) 4310 { 4311 return bdev_io->u.bdev.seek.offset; 4312 } 4313 4314 static int 4315 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 4316 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4317 spdk_bdev_io_completion_cb cb, void *cb_arg) 4318 { 4319 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4320 struct spdk_bdev_io *bdev_io; 4321 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4322 4323 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4324 return -EINVAL; 4325 } 4326 4327 bdev_io = bdev_channel_get_io(channel); 4328 if (!bdev_io) { 4329 return -ENOMEM; 4330 } 4331 4332 bdev_io->internal.ch = channel; 4333 bdev_io->internal.desc = desc; 4334 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4335 bdev_io->u.bdev.iovs = &bdev_io->iov; 4336 bdev_io->u.bdev.iovs[0].iov_base = buf; 4337 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4338 bdev_io->u.bdev.iovcnt = 1; 4339 bdev_io->u.bdev.md_buf = md_buf; 4340 bdev_io->u.bdev.num_blocks = num_blocks; 4341 bdev_io->u.bdev.offset_blocks = offset_blocks; 4342 bdev_io->u.bdev.ext_opts = NULL; 4343 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4344 4345 bdev_io_submit(bdev_io); 4346 return 0; 4347 } 4348 4349 int 4350 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4351 void *buf, uint64_t offset, uint64_t nbytes, 4352 spdk_bdev_io_completion_cb cb, void *cb_arg) 4353 { 4354 uint64_t offset_blocks, num_blocks; 4355 4356 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4357 nbytes, &num_blocks) != 0) { 4358 return -EINVAL; 4359 } 4360 4361 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4362 } 4363 4364 int 4365 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4366 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4367 spdk_bdev_io_completion_cb cb, void *cb_arg) 4368 { 4369 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 4370 } 4371 4372 int 4373 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4374 void 
*buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4375 spdk_bdev_io_completion_cb cb, void *cb_arg) 4376 { 4377 struct iovec iov = { 4378 .iov_base = buf, 4379 }; 4380 4381 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4382 return -EINVAL; 4383 } 4384 4385 if (md_buf && !_is_buf_allocated(&iov)) { 4386 return -EINVAL; 4387 } 4388 4389 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4390 cb, cb_arg); 4391 } 4392 4393 int 4394 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4395 struct iovec *iov, int iovcnt, 4396 uint64_t offset, uint64_t nbytes, 4397 spdk_bdev_io_completion_cb cb, void *cb_arg) 4398 { 4399 uint64_t offset_blocks, num_blocks; 4400 4401 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4402 nbytes, &num_blocks) != 0) { 4403 return -EINVAL; 4404 } 4405 4406 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4407 } 4408 4409 static int 4410 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4411 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 4412 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 4413 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 4414 { 4415 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4416 struct spdk_bdev_io *bdev_io; 4417 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4418 4419 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4420 return -EINVAL; 4421 } 4422 4423 bdev_io = bdev_channel_get_io(channel); 4424 if (!bdev_io) { 4425 return -ENOMEM; 4426 } 4427 4428 bdev_io->internal.ch = channel; 4429 bdev_io->internal.desc = desc; 4430 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4431 bdev_io->u.bdev.iovs = iov; 4432 bdev_io->u.bdev.iovcnt = iovcnt; 4433 bdev_io->u.bdev.md_buf = md_buf; 4434 bdev_io->u.bdev.num_blocks = num_blocks; 4435 bdev_io->u.bdev.offset_blocks = offset_blocks; 4436 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4437 bdev_io->internal.ext_opts = opts; 4438 bdev_io->u.bdev.ext_opts = opts; 4439 4440 _bdev_io_submit_ext(desc, bdev_io, opts, copy_opts); 4441 4442 return 0; 4443 } 4444 4445 int 4446 spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4447 struct iovec *iov, int iovcnt, 4448 uint64_t offset_blocks, uint64_t num_blocks, 4449 spdk_bdev_io_completion_cb cb, void *cb_arg) 4450 { 4451 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4452 num_blocks, cb, cb_arg, NULL, false); 4453 } 4454 4455 int 4456 spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4457 struct iovec *iov, int iovcnt, void *md_buf, 4458 uint64_t offset_blocks, uint64_t num_blocks, 4459 spdk_bdev_io_completion_cb cb, void *cb_arg) 4460 { 4461 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4462 return -EINVAL; 4463 } 4464 4465 if (md_buf && !_is_buf_allocated(iov)) { 4466 return -EINVAL; 4467 } 4468 4469 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4470 num_blocks, cb, cb_arg, NULL, false); 4471 } 4472 4473 static inline bool 4474 _bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov) 4475 { 4476 /* 4477 * We require opts->size to be at least as large as it was when spdk_bdev_ext_io_opts 4478 * was first introduced (ac6f2bdd8d), since access to those members 4479 * is not otherwise validated internally.
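 *
 * Caller-side sketch (an illustration, not code from this file): a caller of
 * the ext API typically zeroes the options struct and sets its size before
 * submission; md_buf and the other variables below are assumed to exist in the
 * caller.
 *
 *     struct spdk_bdev_ext_io_opts opts = {};
 *
 *     opts.size = sizeof(opts);
 *     opts.metadata = md_buf;   (only valid for separate-metadata bdevs)
 *     spdk_bdev_readv_blocks_ext(desc, ch, iov, iovcnt, offset_blocks,
 *                                num_blocks, cb, cb_arg, &opts);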
4480 */ 4481 return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) + 4482 sizeof(opts->metadata) && 4483 opts->size <= sizeof(*opts) && 4484 /* When memory domain is used, the user must provide data buffers */ 4485 (!opts->memory_domain || (iov && iov[0].iov_base)); 4486 } 4487 4488 int 4489 spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4490 struct iovec *iov, int iovcnt, 4491 uint64_t offset_blocks, uint64_t num_blocks, 4492 spdk_bdev_io_completion_cb cb, void *cb_arg, 4493 struct spdk_bdev_ext_io_opts *opts) 4494 { 4495 void *md = NULL; 4496 4497 if (opts) { 4498 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 4499 return -EINVAL; 4500 } 4501 md = opts->metadata; 4502 } 4503 4504 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4505 return -EINVAL; 4506 } 4507 4508 if (md && !_is_buf_allocated(iov)) { 4509 return -EINVAL; 4510 } 4511 4512 return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4513 num_blocks, cb, cb_arg, opts, false); 4514 } 4515 4516 static int 4517 bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4518 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4519 spdk_bdev_io_completion_cb cb, void *cb_arg) 4520 { 4521 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4522 struct spdk_bdev_io *bdev_io; 4523 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4524 4525 if (!desc->write) { 4526 return -EBADF; 4527 } 4528 4529 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4530 return -EINVAL; 4531 } 4532 4533 bdev_io = bdev_channel_get_io(channel); 4534 if (!bdev_io) { 4535 return -ENOMEM; 4536 } 4537 4538 bdev_io->internal.ch = channel; 4539 bdev_io->internal.desc = desc; 4540 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4541 bdev_io->u.bdev.iovs = &bdev_io->iov; 4542 bdev_io->u.bdev.iovs[0].iov_base = buf; 4543 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4544 bdev_io->u.bdev.iovcnt = 1; 4545 bdev_io->u.bdev.md_buf = md_buf; 4546 bdev_io->u.bdev.num_blocks = num_blocks; 4547 bdev_io->u.bdev.offset_blocks = offset_blocks; 4548 bdev_io->u.bdev.ext_opts = NULL; 4549 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4550 4551 bdev_io_submit(bdev_io); 4552 return 0; 4553 } 4554 4555 int 4556 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4557 void *buf, uint64_t offset, uint64_t nbytes, 4558 spdk_bdev_io_completion_cb cb, void *cb_arg) 4559 { 4560 uint64_t offset_blocks, num_blocks; 4561 4562 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4563 nbytes, &num_blocks) != 0) { 4564 return -EINVAL; 4565 } 4566 4567 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4568 } 4569 4570 int 4571 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4572 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4573 spdk_bdev_io_completion_cb cb, void *cb_arg) 4574 { 4575 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4576 cb, cb_arg); 4577 } 4578 4579 int 4580 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4581 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4582 spdk_bdev_io_completion_cb cb, void *cb_arg) 4583 { 4584 struct iovec iov = { 4585 .iov_base = buf, 4586 }; 4587 4588 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4589 return -EINVAL; 4590 } 4591 4592 if 
(md_buf && !_is_buf_allocated(&iov)) { 4593 return -EINVAL; 4594 } 4595 4596 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4597 cb, cb_arg); 4598 } 4599 4600 static int 4601 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4602 struct iovec *iov, int iovcnt, void *md_buf, 4603 uint64_t offset_blocks, uint64_t num_blocks, 4604 spdk_bdev_io_completion_cb cb, void *cb_arg, 4605 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 4606 { 4607 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4608 struct spdk_bdev_io *bdev_io; 4609 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4610 4611 if (!desc->write) { 4612 return -EBADF; 4613 } 4614 4615 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4616 return -EINVAL; 4617 } 4618 4619 bdev_io = bdev_channel_get_io(channel); 4620 if (!bdev_io) { 4621 return -ENOMEM; 4622 } 4623 4624 bdev_io->internal.ch = channel; 4625 bdev_io->internal.desc = desc; 4626 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4627 bdev_io->u.bdev.iovs = iov; 4628 bdev_io->u.bdev.iovcnt = iovcnt; 4629 bdev_io->u.bdev.md_buf = md_buf; 4630 bdev_io->u.bdev.num_blocks = num_blocks; 4631 bdev_io->u.bdev.offset_blocks = offset_blocks; 4632 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4633 bdev_io->internal.ext_opts = opts; 4634 bdev_io->u.bdev.ext_opts = opts; 4635 4636 _bdev_io_submit_ext(desc, bdev_io, opts, copy_opts); 4637 4638 return 0; 4639 } 4640 4641 int 4642 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4643 struct iovec *iov, int iovcnt, 4644 uint64_t offset, uint64_t len, 4645 spdk_bdev_io_completion_cb cb, void *cb_arg) 4646 { 4647 uint64_t offset_blocks, num_blocks; 4648 4649 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4650 len, &num_blocks) != 0) { 4651 return -EINVAL; 4652 } 4653 4654 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4655 } 4656 4657 int 4658 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4659 struct iovec *iov, int iovcnt, 4660 uint64_t offset_blocks, uint64_t num_blocks, 4661 spdk_bdev_io_completion_cb cb, void *cb_arg) 4662 { 4663 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4664 num_blocks, cb, cb_arg, NULL, false); 4665 } 4666 4667 int 4668 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4669 struct iovec *iov, int iovcnt, void *md_buf, 4670 uint64_t offset_blocks, uint64_t num_blocks, 4671 spdk_bdev_io_completion_cb cb, void *cb_arg) 4672 { 4673 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4674 return -EINVAL; 4675 } 4676 4677 if (md_buf && !_is_buf_allocated(iov)) { 4678 return -EINVAL; 4679 } 4680 4681 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4682 num_blocks, cb, cb_arg, NULL, false); 4683 } 4684 4685 int 4686 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4687 struct iovec *iov, int iovcnt, 4688 uint64_t offset_blocks, uint64_t num_blocks, 4689 spdk_bdev_io_completion_cb cb, void *cb_arg, 4690 struct spdk_bdev_ext_io_opts *opts) 4691 { 4692 void *md = NULL; 4693 4694 if (opts) { 4695 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 4696 return -EINVAL; 4697 } 4698 md = opts->metadata; 4699 } 4700 4701 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4702 return -EINVAL; 4703 } 4704 4705 if (md && 
!_is_buf_allocated(iov)) { 4706 return -EINVAL; 4707 } 4708 4709 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4710 num_blocks, cb, cb_arg, opts, false); 4711 } 4712 4713 static void 4714 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4715 { 4716 struct spdk_bdev_io *parent_io = cb_arg; 4717 struct spdk_bdev *bdev = parent_io->bdev; 4718 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 4719 int i, rc = 0; 4720 4721 if (!success) { 4722 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4723 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 4724 spdk_bdev_free_io(bdev_io); 4725 return; 4726 } 4727 4728 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 4729 rc = memcmp(read_buf, 4730 parent_io->u.bdev.iovs[i].iov_base, 4731 parent_io->u.bdev.iovs[i].iov_len); 4732 if (rc) { 4733 break; 4734 } 4735 read_buf += parent_io->u.bdev.iovs[i].iov_len; 4736 } 4737 4738 if (rc == 0 && parent_io->u.bdev.md_buf && spdk_bdev_is_md_separate(bdev)) { 4739 rc = memcmp(bdev_io->u.bdev.md_buf, 4740 parent_io->u.bdev.md_buf, 4741 spdk_bdev_get_md_size(bdev)); 4742 } 4743 4744 spdk_bdev_free_io(bdev_io); 4745 4746 if (rc == 0) { 4747 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4748 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 4749 } else { 4750 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 4751 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 4752 } 4753 } 4754 4755 static void 4756 bdev_compare_do_read(void *_bdev_io) 4757 { 4758 struct spdk_bdev_io *bdev_io = _bdev_io; 4759 int rc; 4760 4761 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 4762 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 4763 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4764 bdev_compare_do_read_done, bdev_io); 4765 4766 if (rc == -ENOMEM) { 4767 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 4768 } else if (rc != 0) { 4769 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4770 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 4771 } 4772 } 4773 4774 static int 4775 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4776 struct iovec *iov, int iovcnt, void *md_buf, 4777 uint64_t offset_blocks, uint64_t num_blocks, 4778 spdk_bdev_io_completion_cb cb, void *cb_arg) 4779 { 4780 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4781 struct spdk_bdev_io *bdev_io; 4782 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4783 4784 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4785 return -EINVAL; 4786 } 4787 4788 bdev_io = bdev_channel_get_io(channel); 4789 if (!bdev_io) { 4790 return -ENOMEM; 4791 } 4792 4793 bdev_io->internal.ch = channel; 4794 bdev_io->internal.desc = desc; 4795 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 4796 bdev_io->u.bdev.iovs = iov; 4797 bdev_io->u.bdev.iovcnt = iovcnt; 4798 bdev_io->u.bdev.md_buf = md_buf; 4799 bdev_io->u.bdev.num_blocks = num_blocks; 4800 bdev_io->u.bdev.offset_blocks = offset_blocks; 4801 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4802 bdev_io->u.bdev.ext_opts = NULL; 4803 4804 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 4805 bdev_io_submit(bdev_io); 4806 return 0; 4807 } 4808 4809 bdev_compare_do_read(bdev_io); 4810 4811 return 0; 4812 } 4813 4814 int 4815 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4816 struct iovec *iov, int iovcnt, 4817 
uint64_t offset_blocks, uint64_t num_blocks, 4818 spdk_bdev_io_completion_cb cb, void *cb_arg) 4819 { 4820 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4821 num_blocks, cb, cb_arg); 4822 } 4823 4824 int 4825 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4826 struct iovec *iov, int iovcnt, void *md_buf, 4827 uint64_t offset_blocks, uint64_t num_blocks, 4828 spdk_bdev_io_completion_cb cb, void *cb_arg) 4829 { 4830 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4831 return -EINVAL; 4832 } 4833 4834 if (md_buf && !_is_buf_allocated(iov)) { 4835 return -EINVAL; 4836 } 4837 4838 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4839 num_blocks, cb, cb_arg); 4840 } 4841 4842 static int 4843 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4844 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4845 spdk_bdev_io_completion_cb cb, void *cb_arg) 4846 { 4847 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4848 struct spdk_bdev_io *bdev_io; 4849 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4850 4851 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4852 return -EINVAL; 4853 } 4854 4855 bdev_io = bdev_channel_get_io(channel); 4856 if (!bdev_io) { 4857 return -ENOMEM; 4858 } 4859 4860 bdev_io->internal.ch = channel; 4861 bdev_io->internal.desc = desc; 4862 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 4863 bdev_io->u.bdev.iovs = &bdev_io->iov; 4864 bdev_io->u.bdev.iovs[0].iov_base = buf; 4865 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4866 bdev_io->u.bdev.iovcnt = 1; 4867 bdev_io->u.bdev.md_buf = md_buf; 4868 bdev_io->u.bdev.num_blocks = num_blocks; 4869 bdev_io->u.bdev.offset_blocks = offset_blocks; 4870 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4871 bdev_io->u.bdev.ext_opts = NULL; 4872 4873 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 4874 bdev_io_submit(bdev_io); 4875 return 0; 4876 } 4877 4878 bdev_compare_do_read(bdev_io); 4879 4880 return 0; 4881 } 4882 4883 int 4884 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4885 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4886 spdk_bdev_io_completion_cb cb, void *cb_arg) 4887 { 4888 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4889 cb, cb_arg); 4890 } 4891 4892 int 4893 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4894 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4895 spdk_bdev_io_completion_cb cb, void *cb_arg) 4896 { 4897 struct iovec iov = { 4898 .iov_base = buf, 4899 }; 4900 4901 if (md_buf && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4902 return -EINVAL; 4903 } 4904 4905 if (md_buf && !_is_buf_allocated(&iov)) { 4906 return -EINVAL; 4907 } 4908 4909 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4910 cb, cb_arg); 4911 } 4912 4913 static void 4914 bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) 4915 { 4916 struct spdk_bdev_io *bdev_io = ctx; 4917 4918 if (unlock_status) { 4919 SPDK_ERRLOG("LBA range unlock failed\n"); 4920 } 4921 4922 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? 
true : 4923 false, bdev_io->internal.caller_ctx); 4924 } 4925 4926 static void 4927 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 4928 { 4929 bdev_io->internal.status = status; 4930 4931 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 4932 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4933 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 4934 } 4935 4936 static void 4937 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4938 { 4939 struct spdk_bdev_io *parent_io = cb_arg; 4940 4941 if (!success) { 4942 SPDK_ERRLOG("Compare and write operation failed\n"); 4943 } 4944 4945 spdk_bdev_free_io(bdev_io); 4946 4947 bdev_comparev_and_writev_blocks_unlock(parent_io, 4948 success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 4949 } 4950 4951 static void 4952 bdev_compare_and_write_do_write(void *_bdev_io) 4953 { 4954 struct spdk_bdev_io *bdev_io = _bdev_io; 4955 int rc; 4956 4957 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 4958 spdk_io_channel_from_ctx(bdev_io->internal.ch), 4959 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 4960 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4961 bdev_compare_and_write_do_write_done, bdev_io); 4962 4963 4964 if (rc == -ENOMEM) { 4965 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 4966 } else if (rc != 0) { 4967 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 4968 } 4969 } 4970 4971 static void 4972 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4973 { 4974 struct spdk_bdev_io *parent_io = cb_arg; 4975 4976 spdk_bdev_free_io(bdev_io); 4977 4978 if (!success) { 4979 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 4980 return; 4981 } 4982 4983 bdev_compare_and_write_do_write(parent_io); 4984 } 4985 4986 static void 4987 bdev_compare_and_write_do_compare(void *_bdev_io) 4988 { 4989 struct spdk_bdev_io *bdev_io = _bdev_io; 4990 int rc; 4991 4992 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 4993 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 4994 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4995 bdev_compare_and_write_do_compare_done, bdev_io); 4996 4997 if (rc == -ENOMEM) { 4998 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 4999 } else if (rc != 0) { 5000 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 5001 } 5002 } 5003 5004 static void 5005 bdev_comparev_and_writev_blocks_locked(void *ctx, int status) 5006 { 5007 struct spdk_bdev_io *bdev_io = ctx; 5008 5009 if (status) { 5010 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 5011 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 5012 return; 5013 } 5014 5015 bdev_compare_and_write_do_compare(bdev_io); 5016 } 5017 5018 int 5019 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5020 struct iovec *compare_iov, int compare_iovcnt, 5021 struct iovec *write_iov, int write_iovcnt, 5022 uint64_t offset_blocks, uint64_t num_blocks, 5023 spdk_bdev_io_completion_cb cb, void *cb_arg) 5024 { 5025 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5026 struct spdk_bdev_io *bdev_io; 5027 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5028 5029 if (!desc->write) { 5030 return 
-EBADF; 5031 } 5032 5033 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5034 return -EINVAL; 5035 } 5036 5037 if (num_blocks > bdev->acwu) { 5038 return -EINVAL; 5039 } 5040 5041 bdev_io = bdev_channel_get_io(channel); 5042 if (!bdev_io) { 5043 return -ENOMEM; 5044 } 5045 5046 bdev_io->internal.ch = channel; 5047 bdev_io->internal.desc = desc; 5048 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 5049 bdev_io->u.bdev.iovs = compare_iov; 5050 bdev_io->u.bdev.iovcnt = compare_iovcnt; 5051 bdev_io->u.bdev.fused_iovs = write_iov; 5052 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 5053 bdev_io->u.bdev.md_buf = NULL; 5054 bdev_io->u.bdev.num_blocks = num_blocks; 5055 bdev_io->u.bdev.offset_blocks = offset_blocks; 5056 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5057 bdev_io->u.bdev.ext_opts = NULL; 5058 5059 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 5060 bdev_io_submit(bdev_io); 5061 return 0; 5062 } 5063 5064 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 5065 bdev_comparev_and_writev_blocks_locked, bdev_io); 5066 } 5067 5068 int 5069 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5070 struct iovec *iov, int iovcnt, 5071 uint64_t offset_blocks, uint64_t num_blocks, 5072 bool populate, 5073 spdk_bdev_io_completion_cb cb, void *cb_arg) 5074 { 5075 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5076 struct spdk_bdev_io *bdev_io; 5077 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5078 5079 if (!desc->write) { 5080 return -EBADF; 5081 } 5082 5083 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5084 return -EINVAL; 5085 } 5086 5087 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 5088 return -ENOTSUP; 5089 } 5090 5091 bdev_io = bdev_channel_get_io(channel); 5092 if (!bdev_io) { 5093 return -ENOMEM; 5094 } 5095 5096 bdev_io->internal.ch = channel; 5097 bdev_io->internal.desc = desc; 5098 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 5099 bdev_io->u.bdev.num_blocks = num_blocks; 5100 bdev_io->u.bdev.offset_blocks = offset_blocks; 5101 bdev_io->u.bdev.iovs = iov; 5102 bdev_io->u.bdev.iovcnt = iovcnt; 5103 bdev_io->u.bdev.md_buf = NULL; 5104 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 5105 bdev_io->u.bdev.zcopy.commit = 0; 5106 bdev_io->u.bdev.zcopy.start = 1; 5107 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5108 bdev_io->u.bdev.ext_opts = NULL; 5109 5110 bdev_io_submit(bdev_io); 5111 5112 return 0; 5113 } 5114 5115 int 5116 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 5117 spdk_bdev_io_completion_cb cb, void *cb_arg) 5118 { 5119 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 5120 return -EINVAL; 5121 } 5122 5123 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 5124 bdev_io->u.bdev.zcopy.start = 0; 5125 bdev_io->internal.caller_ctx = cb_arg; 5126 bdev_io->internal.cb = cb; 5127 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 5128 5129 bdev_io_submit(bdev_io); 5130 5131 return 0; 5132 } 5133 5134 int 5135 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5136 uint64_t offset, uint64_t len, 5137 spdk_bdev_io_completion_cb cb, void *cb_arg) 5138 { 5139 uint64_t offset_blocks, num_blocks; 5140 5141 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5142 len, &num_blocks) != 0) { 5143 return -EINVAL; 5144 } 5145 5146 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5147 } 5148 5149 int 5150 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5151 uint64_t offset_blocks, uint64_t num_blocks, 5152 spdk_bdev_io_completion_cb cb, void *cb_arg) 5153 { 5154 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5155 struct spdk_bdev_io *bdev_io; 5156 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5157 5158 if (!desc->write) { 5159 return -EBADF; 5160 } 5161 5162 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5163 return -EINVAL; 5164 } 5165 5166 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 5167 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 5168 return -ENOTSUP; 5169 } 5170 5171 bdev_io = bdev_channel_get_io(channel); 5172 5173 if (!bdev_io) { 5174 return -ENOMEM; 5175 } 5176 5177 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 5178 bdev_io->internal.ch = channel; 5179 bdev_io->internal.desc = desc; 5180 bdev_io->u.bdev.offset_blocks = offset_blocks; 5181 bdev_io->u.bdev.num_blocks = num_blocks; 5182 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5183 bdev_io->u.bdev.ext_opts = NULL; 5184 5185 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 5186 bdev_io_submit(bdev_io); 5187 return 0; 5188 } 5189 5190 assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); 5191 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 5192 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 5193 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 5194 bdev_write_zero_buffer_next(bdev_io); 5195 5196 return 0; 5197 } 5198 5199 int 5200 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5201 uint64_t offset, uint64_t nbytes, 5202 spdk_bdev_io_completion_cb cb, void *cb_arg) 5203 { 5204 uint64_t offset_blocks, num_blocks; 5205 5206 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5207 nbytes, &num_blocks) != 0) { 5208 return -EINVAL; 5209 } 5210 5211 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5212 } 5213 5214 int 5215 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5216 uint64_t offset_blocks, uint64_t num_blocks, 5217 spdk_bdev_io_completion_cb cb, void *cb_arg) 5218 { 5219 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5220 struct spdk_bdev_io *bdev_io; 5221 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5222 5223 if (!desc->write) { 5224 return -EBADF; 5225 } 5226 5227 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5228 return -EINVAL; 5229 } 5230 5231 if (num_blocks == 0) { 5232 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 5233 return -EINVAL; 5234 } 5235 5236 bdev_io = bdev_channel_get_io(channel); 5237 if (!bdev_io) { 5238 return -ENOMEM; 5239 } 5240 5241 
bdev_io->internal.ch = channel; 5242 bdev_io->internal.desc = desc; 5243 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 5244 5245 bdev_io->u.bdev.iovs = &bdev_io->iov; 5246 bdev_io->u.bdev.iovs[0].iov_base = NULL; 5247 bdev_io->u.bdev.iovs[0].iov_len = 0; 5248 bdev_io->u.bdev.iovcnt = 1; 5249 5250 bdev_io->u.bdev.offset_blocks = offset_blocks; 5251 bdev_io->u.bdev.num_blocks = num_blocks; 5252 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5253 bdev_io->u.bdev.ext_opts = NULL; 5254 5255 bdev_io_submit(bdev_io); 5256 return 0; 5257 } 5258 5259 int 5260 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5261 uint64_t offset, uint64_t length, 5262 spdk_bdev_io_completion_cb cb, void *cb_arg) 5263 { 5264 uint64_t offset_blocks, num_blocks; 5265 5266 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5267 length, &num_blocks) != 0) { 5268 return -EINVAL; 5269 } 5270 5271 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5272 } 5273 5274 int 5275 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5276 uint64_t offset_blocks, uint64_t num_blocks, 5277 spdk_bdev_io_completion_cb cb, void *cb_arg) 5278 { 5279 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5280 struct spdk_bdev_io *bdev_io; 5281 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5282 5283 if (!desc->write) { 5284 return -EBADF; 5285 } 5286 5287 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5288 return -EINVAL; 5289 } 5290 5291 bdev_io = bdev_channel_get_io(channel); 5292 if (!bdev_io) { 5293 return -ENOMEM; 5294 } 5295 5296 bdev_io->internal.ch = channel; 5297 bdev_io->internal.desc = desc; 5298 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 5299 bdev_io->u.bdev.iovs = NULL; 5300 bdev_io->u.bdev.iovcnt = 0; 5301 bdev_io->u.bdev.offset_blocks = offset_blocks; 5302 bdev_io->u.bdev.num_blocks = num_blocks; 5303 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5304 5305 bdev_io_submit(bdev_io); 5306 return 0; 5307 } 5308 5309 static int bdev_reset_poll_for_outstanding_io(void *ctx); 5310 5311 static void 5312 bdev_reset_check_outstanding_io_done(struct spdk_io_channel_iter *i, int status) 5313 { 5314 struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i); 5315 struct spdk_bdev_io *bdev_io; 5316 5317 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5318 5319 if (status == -EBUSY) { 5320 if (spdk_get_ticks() < bdev_io->u.reset.wait_poller.stop_time_tsc) { 5321 bdev_io->u.reset.wait_poller.poller = SPDK_POLLER_REGISTER(bdev_reset_poll_for_outstanding_io, 5322 ch, BDEV_RESET_CHECK_OUTSTANDING_IO_PERIOD); 5323 } else { 5324 /* If outstanding IOs are still present and reset_io_drain_timeout seconds passed, 5325 * start the reset. */ 5326 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5327 bdev_io_submit_reset(bdev_io); 5328 } 5329 } else { 5330 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5331 SPDK_DEBUGLOG(bdev, 5332 "Skipping reset for underlying device of bdev: %s - no outstanding I/O.\n", 5333 ch->bdev->name); 5334 /* Mark the completion status as a SUCCESS and complete the reset. 
*/ 5335 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 5336 } 5337 } 5338 5339 static void 5340 bdev_reset_check_outstanding_io(struct spdk_io_channel_iter *i) 5341 { 5342 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 5343 struct spdk_bdev_channel *cur_ch = spdk_io_channel_get_ctx(io_ch); 5344 int status = 0; 5345 5346 if (cur_ch->io_outstanding > 0) { 5347 /* If a channel has outstanding IO, set status to -EBUSY code. This will stop 5348 * further iteration over the rest of the channels and pass non-zero status 5349 * to the callback function. */ 5350 status = -EBUSY; 5351 } 5352 spdk_for_each_channel_continue(i, status); 5353 } 5354 5355 static int 5356 bdev_reset_poll_for_outstanding_io(void *ctx) 5357 { 5358 struct spdk_bdev_channel *ch = ctx; 5359 struct spdk_bdev_io *bdev_io; 5360 5361 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5362 5363 spdk_poller_unregister(&bdev_io->u.reset.wait_poller.poller); 5364 spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_check_outstanding_io, 5365 ch, bdev_reset_check_outstanding_io_done); 5366 5367 return SPDK_POLLER_BUSY; 5368 } 5369 5370 static void 5371 bdev_reset_freeze_channel_done(struct spdk_io_channel_iter *i, int status) 5372 { 5373 struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i); 5374 struct spdk_bdev *bdev = ch->bdev; 5375 struct spdk_bdev_io *bdev_io; 5376 5377 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5378 5379 if (bdev->reset_io_drain_timeout == 0) { 5380 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5381 5382 bdev_io_submit_reset(bdev_io); 5383 return; 5384 } 5385 5386 bdev_io->u.reset.wait_poller.stop_time_tsc = spdk_get_ticks() + 5387 (ch->bdev->reset_io_drain_timeout * spdk_get_ticks_hz()); 5388 5389 /* In case bdev->reset_io_drain_timeout is not equal to zero, 5390 * submit the reset to the underlying module only if outstanding I/O 5391 * remain after reset_io_drain_timeout seconds have passed. */ 5392 spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_check_outstanding_io, 5393 ch, bdev_reset_check_outstanding_io_done); 5394 } 5395 5396 static void 5397 bdev_reset_freeze_channel(struct spdk_io_channel_iter *i) 5398 { 5399 struct spdk_io_channel *ch; 5400 struct spdk_bdev_channel *channel; 5401 struct spdk_bdev_mgmt_channel *mgmt_channel; 5402 struct spdk_bdev_shared_resource *shared_resource; 5403 bdev_io_tailq_t tmp_queued; 5404 5405 TAILQ_INIT(&tmp_queued); 5406 5407 ch = spdk_io_channel_iter_get_channel(i); 5408 channel = spdk_io_channel_get_ctx(ch); 5409 shared_resource = channel->shared_resource; 5410 mgmt_channel = shared_resource->mgmt_ch; 5411 5412 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 5413 5414 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 5415 /* The QoS object is always valid and readable while 5416 * the channel flag is set, so the lock here should not 5417 * be necessary. We're not in the fast path though, so 5418 * just take it anyway. 
*/ 5419 pthread_mutex_lock(&channel->bdev->internal.mutex); 5420 if (channel->bdev->internal.qos->ch == channel) { 5421 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 5422 } 5423 pthread_mutex_unlock(&channel->bdev->internal.mutex); 5424 } 5425 5426 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 5427 bdev_abort_all_buf_io(&mgmt_channel->need_buf_small, channel); 5428 bdev_abort_all_buf_io(&mgmt_channel->need_buf_large, channel); 5429 bdev_abort_all_queued_io(&tmp_queued, channel); 5430 5431 spdk_for_each_channel_continue(i, 0); 5432 } 5433 5434 static void 5435 bdev_start_reset(void *ctx) 5436 { 5437 struct spdk_bdev_channel *ch = ctx; 5438 5439 spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_freeze_channel, 5440 ch, bdev_reset_freeze_channel_done); 5441 } 5442 5443 static void 5444 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 5445 { 5446 struct spdk_bdev *bdev = ch->bdev; 5447 5448 assert(!TAILQ_EMPTY(&ch->queued_resets)); 5449 5450 pthread_mutex_lock(&bdev->internal.mutex); 5451 if (bdev->internal.reset_in_progress == NULL) { 5452 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 5453 /* 5454 * Take a channel reference for the target bdev for the life of this 5455 * reset. This guards against the channel getting destroyed while 5456 * spdk_for_each_channel() calls related to this reset IO are in 5457 * progress. We will release the reference when this reset is 5458 * completed. 5459 */ 5460 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 5461 bdev_start_reset(ch); 5462 } 5463 pthread_mutex_unlock(&bdev->internal.mutex); 5464 } 5465 5466 int 5467 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5468 spdk_bdev_io_completion_cb cb, void *cb_arg) 5469 { 5470 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5471 struct spdk_bdev_io *bdev_io; 5472 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5473 5474 bdev_io = bdev_channel_get_io(channel); 5475 if (!bdev_io) { 5476 return -ENOMEM; 5477 } 5478 5479 bdev_io->internal.ch = channel; 5480 bdev_io->internal.desc = desc; 5481 bdev_io->internal.submit_tsc = spdk_get_ticks(); 5482 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 5483 bdev_io->u.reset.ch_ref = NULL; 5484 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5485 5486 pthread_mutex_lock(&bdev->internal.mutex); 5487 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 5488 pthread_mutex_unlock(&bdev->internal.mutex); 5489 5490 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 5491 internal.ch_link); 5492 5493 bdev_channel_start_reset(channel); 5494 5495 return 0; 5496 } 5497 5498 void 5499 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 5500 struct spdk_bdev_io_stat *stat) 5501 { 5502 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5503 5504 *stat = channel->stat; 5505 } 5506 5507 static void 5508 bdev_get_device_stat_done(struct spdk_io_channel_iter *i, int status) 5509 { 5510 void *io_device = spdk_io_channel_iter_get_io_device(i); 5511 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 5512 5513 bdev_iostat_ctx->cb(__bdev_from_io_dev(io_device), bdev_iostat_ctx->stat, 5514 bdev_iostat_ctx->cb_arg, 0); 5515 free(bdev_iostat_ctx); 5516 } 5517 5518 static void 5519 bdev_get_each_channel_stat(struct spdk_io_channel_iter *i) 5520 { 5521 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = 
spdk_io_channel_iter_get_ctx(i); 5522 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 5523 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5524 5525 bdev_io_stat_add(bdev_iostat_ctx->stat, &channel->stat); 5526 spdk_for_each_channel_continue(i, 0); 5527 } 5528 5529 void 5530 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 5531 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 5532 { 5533 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 5534 5535 assert(bdev != NULL); 5536 assert(stat != NULL); 5537 assert(cb != NULL); 5538 5539 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 5540 if (bdev_iostat_ctx == NULL) { 5541 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 5542 cb(bdev, stat, cb_arg, -ENOMEM); 5543 return; 5544 } 5545 5546 bdev_iostat_ctx->stat = stat; 5547 bdev_iostat_ctx->cb = cb; 5548 bdev_iostat_ctx->cb_arg = cb_arg; 5549 5550 /* Start with the statistics from previously deleted channels. */ 5551 pthread_mutex_lock(&bdev->internal.mutex); 5552 bdev_io_stat_add(bdev_iostat_ctx->stat, &bdev->internal.stat); 5553 pthread_mutex_unlock(&bdev->internal.mutex); 5554 5555 /* Then iterate and add the statistics from each existing channel. */ 5556 spdk_for_each_channel(__bdev_to_io_dev(bdev), 5557 bdev_get_each_channel_stat, 5558 bdev_iostat_ctx, 5559 bdev_get_device_stat_done); 5560 } 5561 5562 int 5563 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5564 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 5565 spdk_bdev_io_completion_cb cb, void *cb_arg) 5566 { 5567 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5568 struct spdk_bdev_io *bdev_io; 5569 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5570 5571 if (!desc->write) { 5572 return -EBADF; 5573 } 5574 5575 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 5576 return -ENOTSUP; 5577 } 5578 5579 bdev_io = bdev_channel_get_io(channel); 5580 if (!bdev_io) { 5581 return -ENOMEM; 5582 } 5583 5584 bdev_io->internal.ch = channel; 5585 bdev_io->internal.desc = desc; 5586 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 5587 bdev_io->u.nvme_passthru.cmd = *cmd; 5588 bdev_io->u.nvme_passthru.buf = buf; 5589 bdev_io->u.nvme_passthru.nbytes = nbytes; 5590 bdev_io->u.nvme_passthru.md_buf = NULL; 5591 bdev_io->u.nvme_passthru.md_len = 0; 5592 5593 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5594 5595 bdev_io_submit(bdev_io); 5596 return 0; 5597 } 5598 5599 int 5600 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5601 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 5602 spdk_bdev_io_completion_cb cb, void *cb_arg) 5603 { 5604 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5605 struct spdk_bdev_io *bdev_io; 5606 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5607 5608 if (!desc->write) { 5609 /* 5610 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 5611 * to easily determine if the command is a read or write, but for now just 5612 * do not allow io_passthru with a read-only descriptor. 
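 * In other words, the descriptor must have been opened writable (e.g. via
 * spdk_bdev_open_ext() with write == true); a read-only descriptor gets
 * -EBADF here.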
5613 */ 5614 return -EBADF; 5615 } 5616 5617 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 5618 return -ENOTSUP; 5619 } 5620 5621 bdev_io = bdev_channel_get_io(channel); 5622 if (!bdev_io) { 5623 return -ENOMEM; 5624 } 5625 5626 bdev_io->internal.ch = channel; 5627 bdev_io->internal.desc = desc; 5628 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 5629 bdev_io->u.nvme_passthru.cmd = *cmd; 5630 bdev_io->u.nvme_passthru.buf = buf; 5631 bdev_io->u.nvme_passthru.nbytes = nbytes; 5632 bdev_io->u.nvme_passthru.md_buf = NULL; 5633 bdev_io->u.nvme_passthru.md_len = 0; 5634 5635 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5636 5637 bdev_io_submit(bdev_io); 5638 return 0; 5639 } 5640 5641 int 5642 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5643 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 5644 spdk_bdev_io_completion_cb cb, void *cb_arg) 5645 { 5646 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5647 struct spdk_bdev_io *bdev_io; 5648 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5649 5650 if (!desc->write) { 5651 /* 5652 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 5653 * to easily determine if the command is a read or write, but for now just 5654 * do not allow io_passthru with a read-only descriptor. 5655 */ 5656 return -EBADF; 5657 } 5658 5659 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 5660 return -ENOTSUP; 5661 } 5662 5663 bdev_io = bdev_channel_get_io(channel); 5664 if (!bdev_io) { 5665 return -ENOMEM; 5666 } 5667 5668 bdev_io->internal.ch = channel; 5669 bdev_io->internal.desc = desc; 5670 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 5671 bdev_io->u.nvme_passthru.cmd = *cmd; 5672 bdev_io->u.nvme_passthru.buf = buf; 5673 bdev_io->u.nvme_passthru.nbytes = nbytes; 5674 bdev_io->u.nvme_passthru.md_buf = md_buf; 5675 bdev_io->u.nvme_passthru.md_len = md_len; 5676 5677 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5678 5679 bdev_io_submit(bdev_io); 5680 return 0; 5681 } 5682 5683 static void bdev_abort_retry(void *ctx); 5684 static void bdev_abort(struct spdk_bdev_io *parent_io); 5685 5686 static void 5687 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5688 { 5689 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 5690 struct spdk_bdev_io *parent_io = cb_arg; 5691 struct spdk_bdev_io *bio_to_abort, *tmp_io; 5692 5693 bio_to_abort = bdev_io->u.abort.bio_to_abort; 5694 5695 spdk_bdev_free_io(bdev_io); 5696 5697 if (!success) { 5698 /* Check if the target I/O completed in the meantime. */ 5699 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 5700 if (tmp_io == bio_to_abort) { 5701 break; 5702 } 5703 } 5704 5705 /* If the target I/O still exists, set the parent to failed. 
*/ 5706 if (tmp_io != NULL) { 5707 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5708 } 5709 } 5710 5711 parent_io->u.bdev.split_outstanding--; 5712 if (parent_io->u.bdev.split_outstanding == 0) { 5713 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 5714 bdev_abort_retry(parent_io); 5715 } else { 5716 bdev_io_complete(parent_io); 5717 } 5718 } 5719 } 5720 5721 static int 5722 bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel, 5723 struct spdk_bdev_io *bio_to_abort, 5724 spdk_bdev_io_completion_cb cb, void *cb_arg) 5725 { 5726 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5727 struct spdk_bdev_io *bdev_io; 5728 5729 if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT || 5730 bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) { 5731 /* TODO: Abort reset or abort request. */ 5732 return -ENOTSUP; 5733 } 5734 5735 bdev_io = bdev_channel_get_io(channel); 5736 if (bdev_io == NULL) { 5737 return -ENOMEM; 5738 } 5739 5740 bdev_io->internal.ch = channel; 5741 bdev_io->internal.desc = desc; 5742 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 5743 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5744 5745 if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bio_to_abort)) { 5746 bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort; 5747 5748 /* Parent abort request is not submitted directly, but to manage its 5749 * execution add it to the submitted list here. 5750 */ 5751 bdev_io->internal.submit_tsc = spdk_get_ticks(); 5752 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 5753 5754 bdev_abort(bdev_io); 5755 5756 return 0; 5757 } 5758 5759 bdev_io->u.abort.bio_to_abort = bio_to_abort; 5760 5761 /* Submit the abort request to the underlying bdev module. */ 5762 bdev_io_submit(bdev_io); 5763 5764 return 0; 5765 } 5766 5767 static uint32_t 5768 _bdev_abort(struct spdk_bdev_io *parent_io) 5769 { 5770 struct spdk_bdev_desc *desc = parent_io->internal.desc; 5771 struct spdk_bdev_channel *channel = parent_io->internal.ch; 5772 void *bio_cb_arg; 5773 struct spdk_bdev_io *bio_to_abort; 5774 uint32_t matched_ios; 5775 int rc; 5776 5777 bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg; 5778 5779 /* matched_ios is returned and will be kept by the caller. 5780 * 5781 * This function is used in two cases: 1) the same cb_arg is used for 5782 * multiple I/Os, 2) a single large I/O is split into smaller ones. 5783 * Incrementing split_outstanding directly here may confuse readers, especially 5784 * in the 1st case. 5785 * 5786 * Completion of I/O abort is processed after stack unwinding. Hence this trick 5787 * works as expected. 5788 */ 5789 matched_ios = 0; 5790 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5791 5792 TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) { 5793 if (bio_to_abort->internal.caller_ctx != bio_cb_arg) { 5794 continue; 5795 } 5796 5797 if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) { 5798 /* Any I/O which was submitted after this abort command should be excluded.
*/ 5799 continue; 5800 } 5801 5802 rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io); 5803 if (rc != 0) { 5804 if (rc == -ENOMEM) { 5805 parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM; 5806 } else { 5807 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5808 } 5809 break; 5810 } 5811 matched_ios++; 5812 } 5813 5814 return matched_ios; 5815 } 5816 5817 static void 5818 bdev_abort_retry(void *ctx) 5819 { 5820 struct spdk_bdev_io *parent_io = ctx; 5821 uint32_t matched_ios; 5822 5823 matched_ios = _bdev_abort(parent_io); 5824 5825 if (matched_ios == 0) { 5826 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 5827 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 5828 } else { 5829 /* For retry, the case where no target I/O was found is a success 5830 * because it means the target I/Os completed in the meantime. 5831 */ 5832 bdev_io_complete(parent_io); 5833 } 5834 return; 5835 } 5836 5837 /* Use split_outstanding to manage the progress of aborting I/Os. */ 5838 parent_io->u.bdev.split_outstanding = matched_ios; 5839 } 5840 5841 static void 5842 bdev_abort(struct spdk_bdev_io *parent_io) 5843 { 5844 uint32_t matched_ios; 5845 5846 matched_ios = _bdev_abort(parent_io); 5847 5848 if (matched_ios == 0) { 5849 if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 5850 bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry); 5851 } else { 5852 /* The case where no target I/O was found is a failure. */ 5853 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 5854 bdev_io_complete(parent_io); 5855 } 5856 return; 5857 } 5858 5859 /* Use split_outstanding to manage the progress of aborting I/Os. */ 5860 parent_io->u.bdev.split_outstanding = matched_ios; 5861 } 5862 5863 int 5864 spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5865 void *bio_cb_arg, 5866 spdk_bdev_io_completion_cb cb, void *cb_arg) 5867 { 5868 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5869 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5870 struct spdk_bdev_io *bdev_io; 5871 5872 if (bio_cb_arg == NULL) { 5873 return -EINVAL; 5874 } 5875 5876 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) { 5877 return -ENOTSUP; 5878 } 5879 5880 bdev_io = bdev_channel_get_io(channel); 5881 if (bdev_io == NULL) { 5882 return -ENOMEM; 5883 } 5884 5885 bdev_io->internal.ch = channel; 5886 bdev_io->internal.desc = desc; 5887 bdev_io->internal.submit_tsc = spdk_get_ticks(); 5888 bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT; 5889 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5890 5891 bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg; 5892 5893 /* Parent abort request is not submitted directly, but to manage its execution, 5894 * add it to the submitted list here.
5895 */ 5896 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 5897 5898 bdev_abort(bdev_io); 5899 5900 return 0; 5901 } 5902 5903 int 5904 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 5905 struct spdk_bdev_io_wait_entry *entry) 5906 { 5907 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5908 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 5909 5910 if (bdev != entry->bdev) { 5911 SPDK_ERRLOG("bdevs do not match\n"); 5912 return -EINVAL; 5913 } 5914 5915 if (mgmt_ch->per_thread_cache_count > 0) { 5916 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 5917 return -EINVAL; 5918 } 5919 5920 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 5921 return 0; 5922 } 5923 5924 static inline void 5925 bdev_io_complete(void *ctx) 5926 { 5927 struct spdk_bdev_io *bdev_io = ctx; 5928 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 5929 uint64_t tsc, tsc_diff; 5930 5931 if (spdk_unlikely(bdev_io->internal.in_submit_request || bdev_io->internal.io_submit_ch)) { 5932 /* 5933 * Send the completion to the thread that originally submitted the I/O, 5934 * which may not be the current thread in the case of QoS. 5935 */ 5936 if (bdev_io->internal.io_submit_ch) { 5937 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 5938 bdev_io->internal.io_submit_ch = NULL; 5939 } 5940 5941 /* 5942 * Defer completion to avoid potential infinite recursion if the 5943 * user's completion callback issues a new I/O. 5944 */ 5945 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 5946 bdev_io_complete, bdev_io); 5947 return; 5948 } 5949 5950 tsc = spdk_get_ticks(); 5951 tsc_diff = tsc - bdev_io->internal.submit_tsc; 5952 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 5953 bdev_io->internal.caller_ctx); 5954 5955 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 5956 5957 if (bdev_io->internal.ch->histogram) { 5958 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 5959 } 5960 5961 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5962 switch (bdev_io->type) { 5963 case SPDK_BDEV_IO_TYPE_READ: 5964 bdev_io->internal.ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5965 bdev_io->internal.ch->stat.num_read_ops++; 5966 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 5967 break; 5968 case SPDK_BDEV_IO_TYPE_WRITE: 5969 bdev_io->internal.ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5970 bdev_io->internal.ch->stat.num_write_ops++; 5971 bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 5972 break; 5973 case SPDK_BDEV_IO_TYPE_UNMAP: 5974 bdev_io->internal.ch->stat.bytes_unmapped += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5975 bdev_io->internal.ch->stat.num_unmap_ops++; 5976 bdev_io->internal.ch->stat.unmap_latency_ticks += tsc_diff; 5977 break; 5978 case SPDK_BDEV_IO_TYPE_ZCOPY: 5979 /* Track the data in the start phase only */ 5980 if (bdev_io->u.bdev.zcopy.start) { 5981 if (bdev_io->u.bdev.zcopy.populate) { 5982 bdev_io->internal.ch->stat.bytes_read += 5983 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5984 bdev_io->internal.ch->stat.num_read_ops++; 5985 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 5986 } else { 5987 bdev_io->internal.ch->stat.bytes_written += 5988 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5989 bdev_io->internal.ch->stat.num_write_ops++; 5990 
bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 5991 } 5992 } 5993 break; 5994 default: 5995 break; 5996 } 5997 } 5998 5999 #ifdef SPDK_CONFIG_VTUNE 6000 uint64_t now_tsc = spdk_get_ticks(); 6001 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 6002 uint64_t data[5]; 6003 6004 data[0] = bdev_io->internal.ch->stat.num_read_ops - bdev_io->internal.ch->prev_stat.num_read_ops; 6005 data[1] = bdev_io->internal.ch->stat.bytes_read - bdev_io->internal.ch->prev_stat.bytes_read; 6006 data[2] = bdev_io->internal.ch->stat.num_write_ops - bdev_io->internal.ch->prev_stat.num_write_ops; 6007 data[3] = bdev_io->internal.ch->stat.bytes_written - bdev_io->internal.ch->prev_stat.bytes_written; 6008 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 6009 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 6010 6011 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 6012 __itt_metadata_u64, 5, data); 6013 6014 bdev_io->internal.ch->prev_stat = bdev_io->internal.ch->stat; 6015 bdev_io->internal.ch->start_tsc = now_tsc; 6016 } 6017 #endif 6018 6019 assert(bdev_io->internal.cb != NULL); 6020 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 6021 6022 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 6023 bdev_io->internal.caller_ctx); 6024 } 6025 6026 static void bdev_destroy_cb(void *io_device); 6027 6028 static void 6029 bdev_reset_complete(struct spdk_io_channel_iter *i, int status) 6030 { 6031 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 6032 struct spdk_bdev *bdev = bdev_io->bdev; 6033 6034 if (bdev_io->u.reset.ch_ref != NULL) { 6035 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 6036 bdev_io->u.reset.ch_ref = NULL; 6037 } 6038 6039 bdev_io_complete(bdev_io); 6040 6041 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 6042 TAILQ_EMPTY(&bdev->internal.open_descs)) { 6043 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6044 } 6045 } 6046 6047 static void 6048 bdev_unfreeze_channel(struct spdk_io_channel_iter *i) 6049 { 6050 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 6051 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 6052 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 6053 struct spdk_bdev_io *queued_reset; 6054 6055 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 6056 while (!TAILQ_EMPTY(&ch->queued_resets)) { 6057 queued_reset = TAILQ_FIRST(&ch->queued_resets); 6058 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 6059 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 6060 } 6061 6062 spdk_for_each_channel_continue(i, 0); 6063 } 6064 6065 void 6066 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 6067 { 6068 struct spdk_bdev *bdev = bdev_io->bdev; 6069 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 6070 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 6071 6072 bdev_io->internal.status = status; 6073 6074 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 6075 bool unlock_channels = false; 6076 6077 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 6078 SPDK_ERRLOG("NOMEM returned for reset\n"); 6079 } 6080 pthread_mutex_lock(&bdev->internal.mutex); 6081 if (bdev_io == bdev->internal.reset_in_progress) { 6082 bdev->internal.reset_in_progress = NULL; 6083 unlock_channels = true; 6084 } 6085 pthread_mutex_unlock(&bdev->internal.mutex); 6086 6087 if 
(unlock_channels) { 6088 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unfreeze_channel, 6089 bdev_io, bdev_reset_complete); 6090 return; 6091 } 6092 } else { 6093 if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0)) { 6094 _bdev_io_push_bounce_data_buffer(bdev_io, _bdev_io_complete_push_bounce_done); 6095 /* bdev IO will be completed in the callback */ 6096 return; 6097 } 6098 6099 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 6100 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 6101 return; 6102 } 6103 } 6104 6105 bdev_io_complete(bdev_io); 6106 } 6107 6108 void 6109 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 6110 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 6111 { 6112 if (sc == SPDK_SCSI_STATUS_GOOD) { 6113 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6114 } else { 6115 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 6116 bdev_io->internal.error.scsi.sc = sc; 6117 bdev_io->internal.error.scsi.sk = sk; 6118 bdev_io->internal.error.scsi.asc = asc; 6119 bdev_io->internal.error.scsi.ascq = ascq; 6120 } 6121 6122 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6123 } 6124 6125 void 6126 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 6127 int *sc, int *sk, int *asc, int *ascq) 6128 { 6129 assert(sc != NULL); 6130 assert(sk != NULL); 6131 assert(asc != NULL); 6132 assert(ascq != NULL); 6133 6134 switch (bdev_io->internal.status) { 6135 case SPDK_BDEV_IO_STATUS_SUCCESS: 6136 *sc = SPDK_SCSI_STATUS_GOOD; 6137 *sk = SPDK_SCSI_SENSE_NO_SENSE; 6138 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 6139 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 6140 break; 6141 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 6142 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 6143 break; 6144 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 6145 *sc = bdev_io->internal.error.scsi.sc; 6146 *sk = bdev_io->internal.error.scsi.sk; 6147 *asc = bdev_io->internal.error.scsi.asc; 6148 *ascq = bdev_io->internal.error.scsi.ascq; 6149 break; 6150 default: 6151 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 6152 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 6153 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 6154 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 6155 break; 6156 } 6157 } 6158 6159 void 6160 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 6161 { 6162 if (aio_result == 0) { 6163 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6164 } else { 6165 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 6166 } 6167 6168 bdev_io->internal.error.aio_result = aio_result; 6169 6170 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6171 } 6172 6173 void 6174 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 6175 { 6176 assert(aio_result != NULL); 6177 6178 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 6179 *aio_result = bdev_io->internal.error.aio_result; 6180 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6181 *aio_result = 0; 6182 } else { 6183 *aio_result = -EIO; 6184 } 6185 } 6186 6187 void 6188 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 6189 { 6190 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 6191 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6192 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 6193 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 6194 } else { 6195 
bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 6196 } 6197 6198 bdev_io->internal.error.nvme.cdw0 = cdw0; 6199 bdev_io->internal.error.nvme.sct = sct; 6200 bdev_io->internal.error.nvme.sc = sc; 6201 6202 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 6203 } 6204 6205 void 6206 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 6207 { 6208 assert(sct != NULL); 6209 assert(sc != NULL); 6210 assert(cdw0 != NULL); 6211 6212 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 6213 *sct = SPDK_NVME_SCT_GENERIC; 6214 *sc = SPDK_NVME_SC_SUCCESS; 6215 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6216 *cdw0 = 0; 6217 } else { 6218 *cdw0 = 1U; 6219 } 6220 return; 6221 } 6222 6223 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 6224 *sct = bdev_io->internal.error.nvme.sct; 6225 *sc = bdev_io->internal.error.nvme.sc; 6226 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6227 *sct = SPDK_NVME_SCT_GENERIC; 6228 *sc = SPDK_NVME_SC_SUCCESS; 6229 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 6230 *sct = SPDK_NVME_SCT_GENERIC; 6231 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6232 } else { 6233 *sct = SPDK_NVME_SCT_GENERIC; 6234 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6235 } 6236 6237 *cdw0 = bdev_io->internal.error.nvme.cdw0; 6238 } 6239 6240 void 6241 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 6242 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 6243 { 6244 assert(first_sct != NULL); 6245 assert(first_sc != NULL); 6246 assert(second_sct != NULL); 6247 assert(second_sc != NULL); 6248 assert(cdw0 != NULL); 6249 6250 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 6251 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 6252 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 6253 *first_sct = bdev_io->internal.error.nvme.sct; 6254 *first_sc = bdev_io->internal.error.nvme.sc; 6255 *second_sct = SPDK_NVME_SCT_GENERIC; 6256 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6257 } else { 6258 *first_sct = SPDK_NVME_SCT_GENERIC; 6259 *first_sc = SPDK_NVME_SC_SUCCESS; 6260 *second_sct = bdev_io->internal.error.nvme.sct; 6261 *second_sc = bdev_io->internal.error.nvme.sc; 6262 } 6263 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 6264 *first_sct = SPDK_NVME_SCT_GENERIC; 6265 *first_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6266 *second_sct = SPDK_NVME_SCT_GENERIC; 6267 *second_sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 6268 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 6269 *first_sct = SPDK_NVME_SCT_GENERIC; 6270 *first_sc = SPDK_NVME_SC_SUCCESS; 6271 *second_sct = SPDK_NVME_SCT_GENERIC; 6272 *second_sc = SPDK_NVME_SC_SUCCESS; 6273 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 6274 *first_sct = SPDK_NVME_SCT_GENERIC; 6275 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6276 *second_sct = SPDK_NVME_SCT_GENERIC; 6277 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6278 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 6279 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 6280 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 6281 *second_sct = SPDK_NVME_SCT_GENERIC; 6282 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 6283 } else { 6284 *first_sct = SPDK_NVME_SCT_GENERIC; 6285 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6286 *second_sct = SPDK_NVME_SCT_GENERIC; 6287 
*second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 6288 } 6289 6290 *cdw0 = bdev_io->internal.error.nvme.cdw0; 6291 } 6292 6293 struct spdk_thread * 6294 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 6295 { 6296 return spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 6297 } 6298 6299 struct spdk_io_channel * 6300 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 6301 { 6302 return bdev_io->internal.ch->channel; 6303 } 6304 6305 static int 6306 bdev_register(struct spdk_bdev *bdev) 6307 { 6308 char *bdev_name; 6309 char uuid[SPDK_UUID_STRING_LEN]; 6310 int ret; 6311 6312 assert(bdev->module != NULL); 6313 6314 if (!bdev->name) { 6315 SPDK_ERRLOG("Bdev name is NULL\n"); 6316 return -EINVAL; 6317 } 6318 6319 if (!strlen(bdev->name)) { 6320 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 6321 return -EINVAL; 6322 } 6323 6324 /* Users often register their own I/O devices using the bdev name. In 6325 * order to avoid conflicts, prepend bdev_. */ 6326 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 6327 if (!bdev_name) { 6328 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 6329 return -ENOMEM; 6330 } 6331 6332 bdev->internal.status = SPDK_BDEV_STATUS_READY; 6333 bdev->internal.measured_queue_depth = UINT64_MAX; 6334 bdev->internal.claim_module = NULL; 6335 bdev->internal.qd_poller = NULL; 6336 bdev->internal.qos = NULL; 6337 6338 TAILQ_INIT(&bdev->internal.open_descs); 6339 TAILQ_INIT(&bdev->internal.locked_ranges); 6340 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 6341 TAILQ_INIT(&bdev->aliases); 6342 6343 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 6344 if (ret != 0) { 6345 free(bdev_name); 6346 return ret; 6347 } 6348 6349 /* If the user didn't specify a uuid, generate one. */ 6350 if (spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { 6351 spdk_uuid_generate(&bdev->uuid); 6352 } 6353 6354 /* Add the UUID alias only if it's different than the name */ 6355 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 6356 if (strcmp(bdev->name, uuid) != 0) { 6357 ret = spdk_bdev_alias_add(bdev, uuid); 6358 if (ret != 0) { 6359 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 6360 bdev_name_del(&bdev->internal.bdev_name); 6361 free(bdev_name); 6362 return ret; 6363 } 6364 } 6365 6366 if (spdk_bdev_get_buf_align(bdev) > 1) { 6367 if (bdev->split_on_optimal_io_boundary) { 6368 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 6369 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 6370 } else { 6371 bdev->split_on_optimal_io_boundary = true; 6372 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 6373 } 6374 } 6375 6376 /* If the user didn't specify a write unit size, set it to one. 
*/ 6377 if (bdev->write_unit_size == 0) { 6378 bdev->write_unit_size = 1; 6379 } 6380 6381 /* Set ACWU value to the write unit size if bdev module did not set it (does not support it natively) */ 6382 if (bdev->acwu == 0) { 6383 bdev->acwu = bdev->write_unit_size; 6384 } 6385 6386 if (bdev->phys_blocklen == 0) { 6387 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 6388 } 6389 6390 bdev->internal.reset_in_progress = NULL; 6391 bdev->internal.qd_poll_in_progress = false; 6392 bdev->internal.period = 0; 6393 bdev->internal.new_period = 0; 6394 6395 spdk_io_device_register(__bdev_to_io_dev(bdev), 6396 bdev_channel_create, bdev_channel_destroy, 6397 sizeof(struct spdk_bdev_channel), 6398 bdev_name); 6399 6400 free(bdev_name); 6401 6402 pthread_mutex_init(&bdev->internal.mutex, NULL); 6403 6404 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 6405 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 6406 6407 return 0; 6408 } 6409 6410 static void 6411 bdev_destroy_cb(void *io_device) 6412 { 6413 int rc; 6414 struct spdk_bdev *bdev; 6415 spdk_bdev_unregister_cb cb_fn; 6416 void *cb_arg; 6417 6418 bdev = __bdev_from_io_dev(io_device); 6419 cb_fn = bdev->internal.unregister_cb; 6420 cb_arg = bdev->internal.unregister_ctx; 6421 6422 pthread_mutex_destroy(&bdev->internal.mutex); 6423 free(bdev->internal.qos); 6424 6425 rc = bdev->fn_table->destruct(bdev->ctxt); 6426 if (rc < 0) { 6427 SPDK_ERRLOG("destruct failed\n"); 6428 } 6429 if (rc <= 0 && cb_fn != NULL) { 6430 cb_fn(cb_arg, rc); 6431 } 6432 } 6433 6434 void 6435 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 6436 { 6437 if (bdev->internal.unregister_cb != NULL) { 6438 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 6439 } 6440 } 6441 6442 static void 6443 _remove_notify(void *arg) 6444 { 6445 struct spdk_bdev_desc *desc = arg; 6446 6447 pthread_mutex_lock(&desc->mutex); 6448 desc->refs--; 6449 6450 if (!desc->closed) { 6451 pthread_mutex_unlock(&desc->mutex); 6452 desc->callback.event_fn(SPDK_BDEV_EVENT_REMOVE, desc->bdev, desc->callback.ctx); 6453 return; 6454 } else if (0 == desc->refs) { 6455 /* This descriptor was closed after this remove_notify message was sent. 6456 * spdk_bdev_close() could not free the descriptor since this message was 6457 * in flight, so we free it now using bdev_desc_free(). 6458 */ 6459 pthread_mutex_unlock(&desc->mutex); 6460 bdev_desc_free(desc); 6461 return; 6462 } 6463 pthread_mutex_unlock(&desc->mutex); 6464 } 6465 6466 /* Must be called while holding g_bdev_mgr.mutex and bdev->internal.mutex. 6467 * returns: 0 - bdev removed and ready to be destructed. 6468 * -EBUSY - bdev can't be destructed yet. */ 6469 static int 6470 bdev_unregister_unsafe(struct spdk_bdev *bdev) 6471 { 6472 struct spdk_bdev_desc *desc, *tmp; 6473 int rc = 0; 6474 char uuid[SPDK_UUID_STRING_LEN]; 6475 6476 /* Notify each descriptor about hotremoval */ 6477 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 6478 rc = -EBUSY; 6479 pthread_mutex_lock(&desc->mutex); 6480 /* 6481 * Defer invocation of the event_cb to a separate message that will 6482 * run later on its thread. This ensures this context unwinds and 6483 * we don't recursively unregister this bdev again if the event_cb 6484 * immediately closes its descriptor. 
6485 */ 6486 desc->refs++; 6487 spdk_thread_send_msg(desc->thread, _remove_notify, desc); 6488 pthread_mutex_unlock(&desc->mutex); 6489 } 6490 6491 /* If there are no descriptors, proceed with removing the bdev */ 6492 if (rc == 0) { 6493 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 6494 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 6495 6496 /* Delete the name and the UUID alias */ 6497 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 6498 bdev_name_del_unsafe(&bdev->internal.bdev_name); 6499 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 6500 6501 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 6502 6503 if (bdev->internal.reset_in_progress != NULL) { 6504 /* If a reset is in progress, let the reset's completion callback 6505 * unregister the bdev. 6506 */ 6507 rc = -EBUSY; 6508 } 6509 } 6510 6511 return rc; 6512 } 6513 6514 static void 6515 bdev_unregister_abort_channel(struct spdk_io_channel_iter *i) 6516 { 6517 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 6518 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch); 6519 6520 bdev_channel_abort_queued_ios(bdev_ch); 6521 spdk_for_each_channel_continue(i, 0); 6522 } 6523 6524 static void 6525 bdev_unregister(struct spdk_io_channel_iter *i, int status) 6526 { 6527 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 6528 int rc; 6529 6530 pthread_mutex_lock(&g_bdev_mgr.mutex); 6531 pthread_mutex_lock(&bdev->internal.mutex); 6532 /* 6533 * Set the status to REMOVING only after aborting the channels has completed. Otherwise, 6534 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 6535 * spdk_for_each_channel() is executed and spdk_io_device_unregister() may fail. 6536 */ 6537 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 6538 rc = bdev_unregister_unsafe(bdev); 6539 pthread_mutex_unlock(&bdev->internal.mutex); 6540 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6541 6542 if (rc == 0) { 6543 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6544 } 6545 } 6546 6547 void 6548 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 6549 { 6550 struct spdk_thread *thread; 6551 6552 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 6553 6554 thread = spdk_get_thread(); 6555 if (!thread) { 6556 /* The user called this from a non-SPDK thread.
*/ 6557 if (cb_fn != NULL) { 6558 cb_fn(cb_arg, -ENOTSUP); 6559 } 6560 return; 6561 } 6562 6563 pthread_mutex_lock(&g_bdev_mgr.mutex); 6564 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 6565 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 6566 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6567 if (cb_fn) { 6568 cb_fn(cb_arg, -EBUSY); 6569 } 6570 return; 6571 } 6572 6573 pthread_mutex_lock(&bdev->internal.mutex); 6574 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 6575 bdev->internal.unregister_cb = cb_fn; 6576 bdev->internal.unregister_ctx = cb_arg; 6577 pthread_mutex_unlock(&bdev->internal.mutex); 6578 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6579 6580 spdk_bdev_set_qd_sampling_period(bdev, 0); 6581 6582 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6583 bdev_unregister_abort_channel, 6584 bdev, 6585 bdev_unregister); 6586 } 6587 6588 int 6589 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 6590 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 6591 { 6592 struct spdk_bdev_desc *desc; 6593 struct spdk_bdev *bdev; 6594 int rc; 6595 6596 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 6597 if (rc != 0) { 6598 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 6599 return rc; 6600 } 6601 6602 bdev = spdk_bdev_desc_get_bdev(desc); 6603 6604 if (bdev->module != module) { 6605 spdk_bdev_close(desc); 6606 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 6607 bdev_name); 6608 return -ENODEV; 6609 } 6610 6611 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 6612 6613 spdk_bdev_close(desc); 6614 6615 return 0; 6616 } 6617 6618 static int 6619 bdev_start_qos(struct spdk_bdev *bdev) 6620 { 6621 struct set_qos_limit_ctx *ctx; 6622 6623 /* Enable QoS */ 6624 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 6625 ctx = calloc(1, sizeof(*ctx)); 6626 if (ctx == NULL) { 6627 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 6628 return -ENOMEM; 6629 } 6630 ctx->bdev = bdev; 6631 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6632 bdev_enable_qos_msg, ctx, 6633 bdev_enable_qos_done); 6634 } 6635 6636 return 0; 6637 } 6638 6639 static int 6640 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 6641 { 6642 struct spdk_thread *thread; 6643 int rc = 0; 6644 6645 thread = spdk_get_thread(); 6646 if (!thread) { 6647 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 6648 return -ENOTSUP; 6649 } 6650 6651 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 6652 spdk_get_thread()); 6653 6654 desc->bdev = bdev; 6655 desc->thread = thread; 6656 desc->write = write; 6657 6658 pthread_mutex_lock(&bdev->internal.mutex); 6659 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 6660 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 6661 pthread_mutex_unlock(&bdev->internal.mutex); 6662 return -ENODEV; 6663 } 6664 6665 if (write && bdev->internal.claim_module) { 6666 SPDK_ERRLOG("Could not open %s - %s module already claimed it\n", 6667 bdev->name, bdev->internal.claim_module->name); 6668 pthread_mutex_unlock(&bdev->internal.mutex); 6669 return -EPERM; 6670 } 6671 6672 rc = bdev_start_qos(bdev); 6673 if (rc != 0) { 6674 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 6675 pthread_mutex_unlock(&bdev->internal.mutex); 6676 return rc; 6677 } 6678 6679 TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 6680 6681 pthread_mutex_unlock(&bdev->internal.mutex); 6682 6683 return 0; 6684 } 6685 
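/*
 * Illustrative sketch only (guarded out so it is never compiled): roughly how a
 * caller is expected to obtain a descriptor and a channel through the public
 * spdk_bdev_open_ext()/spdk_bdev_get_io_channel() APIs that wrap bdev_open()
 * above. The names example_event_cb and example_open_bdev are hypothetical and
 * are not part of this file.
 */
#if 0
static void
example_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx)
{
	if (type == SPDK_BDEV_EVENT_REMOVE) {
		/* Hot remove: the application should release its channels and
		 * close the descriptor so that unregister can complete. */
	}
}

static int
example_open_bdev(const char *name)
{
	struct spdk_bdev_desc *desc;
	struct spdk_io_channel *ch;
	int rc;

	/* Open by name; "true" requests a writable descriptor. */
	rc = spdk_bdev_open_ext(name, true, example_event_cb, NULL, &desc);
	if (rc != 0) {
		return rc;
	}

	/* Each thread that submits I/O needs its own channel. */
	ch = spdk_bdev_get_io_channel(desc);
	if (ch == NULL) {
		spdk_bdev_close(desc);
		return -ENOMEM;
	}

	/* ... submit I/O on ch, e.g. spdk_bdev_read_blocks() or spdk_bdev_reset() ... */

	spdk_put_io_channel(ch);
	spdk_bdev_close(desc);
	return 0;
}
#endif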
6686 static int 6687 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 6688 struct spdk_bdev_desc **_desc) 6689 { 6690 struct spdk_bdev_desc *desc; 6691 unsigned int event_id; 6692 6693 desc = calloc(1, sizeof(*desc)); 6694 if (desc == NULL) { 6695 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 6696 return -ENOMEM; 6697 } 6698 6699 TAILQ_INIT(&desc->pending_media_events); 6700 TAILQ_INIT(&desc->free_media_events); 6701 6702 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 6703 desc->callback.event_fn = event_cb; 6704 desc->callback.ctx = event_ctx; 6705 pthread_mutex_init(&desc->mutex, NULL); 6706 6707 if (bdev->media_events) { 6708 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 6709 sizeof(*desc->media_events_buffer)); 6710 if (desc->media_events_buffer == NULL) { 6711 SPDK_ERRLOG("Failed to initialize media event pool\n"); 6712 bdev_desc_free(desc); 6713 return -ENOMEM; 6714 } 6715 6716 for (event_id = 0; event_id < MEDIA_EVENT_POOL_SIZE; ++event_id) { 6717 TAILQ_INSERT_TAIL(&desc->free_media_events, 6718 &desc->media_events_buffer[event_id], tailq); 6719 } 6720 } 6721 6722 *_desc = desc; 6723 6724 return 0; 6725 } 6726 6727 int 6728 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 6729 void *event_ctx, struct spdk_bdev_desc **_desc) 6730 { 6731 struct spdk_bdev_desc *desc; 6732 struct spdk_bdev *bdev; 6733 int rc; 6734 6735 if (event_cb == NULL) { 6736 SPDK_ERRLOG("Missing event callback function\n"); 6737 return -EINVAL; 6738 } 6739 6740 pthread_mutex_lock(&g_bdev_mgr.mutex); 6741 6742 bdev = bdev_get_by_name(bdev_name); 6743 6744 if (bdev == NULL) { 6745 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 6746 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6747 return -ENODEV; 6748 } 6749 6750 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 6751 if (rc != 0) { 6752 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6753 return rc; 6754 } 6755 6756 rc = bdev_open(bdev, write, desc); 6757 if (rc != 0) { 6758 bdev_desc_free(desc); 6759 desc = NULL; 6760 } 6761 6762 *_desc = desc; 6763 6764 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6765 6766 return rc; 6767 } 6768 6769 static void 6770 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 6771 { 6772 int rc; 6773 6774 pthread_mutex_lock(&bdev->internal.mutex); 6775 pthread_mutex_lock(&desc->mutex); 6776 6777 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 6778 6779 desc->closed = true; 6780 6781 if (0 == desc->refs) { 6782 pthread_mutex_unlock(&desc->mutex); 6783 bdev_desc_free(desc); 6784 } else { 6785 pthread_mutex_unlock(&desc->mutex); 6786 } 6787 6788 /* If no more descriptors, kill QoS channel */ 6789 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 6790 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 6791 bdev->name, spdk_get_thread()); 6792 6793 if (bdev_qos_destroy(bdev)) { 6794 /* There isn't anything we can do to recover here. Just let the 6795 * old QoS poller keep running. The QoS handling won't change 6796 * cores when the user allocates a new channel, but it won't break. */ 6797 SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); 6798 } 6799 } 6800 6801 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 6802 rc = bdev_unregister_unsafe(bdev); 6803 pthread_mutex_unlock(&bdev->internal.mutex); 6804 6805 if (rc == 0) { 6806 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6807 } 6808 } else { 6809 pthread_mutex_unlock(&bdev->internal.mutex); 6810 } 6811 } 6812 6813 void 6814 spdk_bdev_close(struct spdk_bdev_desc *desc) 6815 { 6816 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6817 6818 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 6819 spdk_get_thread()); 6820 6821 assert(desc->thread == spdk_get_thread()); 6822 6823 spdk_poller_unregister(&desc->io_timeout_poller); 6824 6825 pthread_mutex_lock(&g_bdev_mgr.mutex); 6826 6827 bdev_close(bdev, desc); 6828 6829 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6830 } 6831 6832 static void 6833 bdev_register_finished(void *arg) 6834 { 6835 struct spdk_bdev_desc *desc = arg; 6836 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6837 6838 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 6839 6840 bdev_close(bdev, desc); 6841 } 6842 6843 int 6844 spdk_bdev_register(struct spdk_bdev *bdev) 6845 { 6846 struct spdk_bdev_desc *desc; 6847 int rc; 6848 6849 rc = bdev_register(bdev); 6850 if (rc != 0) { 6851 return rc; 6852 } 6853 6854 /* A descriptor is opened to prevent bdev deletion during examination */ 6855 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 6856 if (rc != 0) { 6857 spdk_bdev_unregister(bdev, NULL, NULL); 6858 return rc; 6859 } 6860 6861 rc = bdev_open(bdev, false, desc); 6862 if (rc != 0) { 6863 bdev_desc_free(desc); 6864 spdk_bdev_unregister(bdev, NULL, NULL); 6865 return rc; 6866 } 6867 6868 /* Examine configuration before initializing I/O */ 6869 bdev_examine(bdev); 6870 6871 rc = spdk_bdev_wait_for_examine(bdev_register_finished, desc); 6872 if (rc != 0) { 6873 bdev_close(bdev, desc); 6874 spdk_bdev_unregister(bdev, NULL, NULL); 6875 } 6876 6877 return rc; 6878 } 6879 6880 int 6881 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 6882 struct spdk_bdev_module *module) 6883 { 6884 if (bdev->internal.claim_module != NULL) { 6885 SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, 6886 bdev->internal.claim_module->name); 6887 return -EPERM; 6888 } 6889 6890 if (desc && !desc->write) { 6891 desc->write = true; 6892 } 6893 6894 bdev->internal.claim_module = module; 6895 return 0; 6896 } 6897 6898 void 6899 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 6900 { 6901 assert(bdev->internal.claim_module != NULL); 6902 bdev->internal.claim_module = NULL; 6903 } 6904 6905 struct spdk_bdev * 6906 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 6907 { 6908 assert(desc != NULL); 6909 return desc->bdev; 6910 } 6911 6912 int 6913 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 6914 { 6915 struct spdk_bdev *bdev, *tmp; 6916 struct spdk_bdev_desc *desc; 6917 int rc = 0; 6918 6919 assert(fn != NULL); 6920 6921 pthread_mutex_lock(&g_bdev_mgr.mutex); 6922 bdev = spdk_bdev_first(); 6923 while (bdev != NULL) { 6924 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 6925 if (rc != 0) { 6926 break; 6927 } 6928 rc = bdev_open(bdev, false, desc); 6929 if (rc != 0) { 6930 bdev_desc_free(desc); 6931 break; 6932 } 6933 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6934 6935 rc = fn(ctx, bdev); 6936 6937 
pthread_mutex_lock(&g_bdev_mgr.mutex); 6938 tmp = spdk_bdev_next(bdev); 6939 bdev_close(bdev, desc); 6940 if (rc != 0) { 6941 break; 6942 } 6943 bdev = tmp; 6944 } 6945 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6946 6947 return rc; 6948 } 6949 6950 int 6951 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 6952 { 6953 struct spdk_bdev *bdev, *tmp; 6954 struct spdk_bdev_desc *desc; 6955 int rc = 0; 6956 6957 assert(fn != NULL); 6958 6959 pthread_mutex_lock(&g_bdev_mgr.mutex); 6960 bdev = spdk_bdev_first_leaf(); 6961 while (bdev != NULL) { 6962 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 6963 if (rc != 0) { 6964 break; 6965 } 6966 rc = bdev_open(bdev, false, desc); 6967 if (rc != 0) { 6968 bdev_desc_free(desc); 6969 break; 6970 } 6971 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6972 6973 rc = fn(ctx, bdev); 6974 6975 pthread_mutex_lock(&g_bdev_mgr.mutex); 6976 tmp = spdk_bdev_next_leaf(bdev); 6977 bdev_close(bdev, desc); 6978 if (rc != 0) { 6979 break; 6980 } 6981 bdev = tmp; 6982 } 6983 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6984 6985 return rc; 6986 } 6987 6988 void 6989 spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 6990 { 6991 struct iovec *iovs; 6992 int iovcnt; 6993 6994 if (bdev_io == NULL) { 6995 return; 6996 } 6997 6998 switch (bdev_io->type) { 6999 case SPDK_BDEV_IO_TYPE_READ: 7000 case SPDK_BDEV_IO_TYPE_WRITE: 7001 case SPDK_BDEV_IO_TYPE_ZCOPY: 7002 iovs = bdev_io->u.bdev.iovs; 7003 iovcnt = bdev_io->u.bdev.iovcnt; 7004 break; 7005 default: 7006 iovs = NULL; 7007 iovcnt = 0; 7008 break; 7009 } 7010 7011 if (iovp) { 7012 *iovp = iovs; 7013 } 7014 if (iovcntp) { 7015 *iovcntp = iovcnt; 7016 } 7017 } 7018 7019 void * 7020 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 7021 { 7022 if (bdev_io == NULL) { 7023 return NULL; 7024 } 7025 7026 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 7027 return NULL; 7028 } 7029 7030 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 7031 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 7032 return bdev_io->u.bdev.md_buf; 7033 } 7034 7035 return NULL; 7036 } 7037 7038 void * 7039 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 7040 { 7041 if (bdev_io == NULL) { 7042 assert(false); 7043 return NULL; 7044 } 7045 7046 return bdev_io->internal.caller_ctx; 7047 } 7048 7049 void 7050 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 7051 { 7052 7053 if (spdk_bdev_module_list_find(bdev_module->name)) { 7054 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 7055 assert(false); 7056 } 7057 7058 /* 7059 * Modules with examine callbacks must be initialized first, so they are 7060 * ready to handle examine callbacks from later modules that will 7061 * register physical bdevs. 
7062 */ 7063 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 7064 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 7065 } else { 7066 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 7067 } 7068 } 7069 7070 struct spdk_bdev_module * 7071 spdk_bdev_module_list_find(const char *name) 7072 { 7073 struct spdk_bdev_module *bdev_module; 7074 7075 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 7076 if (strcmp(name, bdev_module->name) == 0) { 7077 break; 7078 } 7079 } 7080 7081 return bdev_module; 7082 } 7083 7084 static void 7085 bdev_write_zero_buffer_next(void *_bdev_io) 7086 { 7087 struct spdk_bdev_io *bdev_io = _bdev_io; 7088 uint64_t num_bytes, num_blocks; 7089 void *md_buf = NULL; 7090 int rc; 7091 7092 num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * 7093 bdev_io->u.bdev.split_remaining_num_blocks, 7094 ZERO_BUFFER_SIZE); 7095 num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); 7096 num_blocks -= num_blocks % bdev_io->bdev->write_unit_size; 7097 7098 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 7099 md_buf = (char *)g_bdev_mgr.zero_buffer + 7100 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 7101 } 7102 7103 rc = bdev_write_blocks_with_md(bdev_io->internal.desc, 7104 spdk_io_channel_from_ctx(bdev_io->internal.ch), 7105 g_bdev_mgr.zero_buffer, md_buf, 7106 bdev_io->u.bdev.split_current_offset_blocks, num_blocks, 7107 bdev_write_zero_buffer_done, bdev_io); 7108 if (rc == 0) { 7109 bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; 7110 bdev_io->u.bdev.split_current_offset_blocks += num_blocks; 7111 } else if (rc == -ENOMEM) { 7112 bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); 7113 } else { 7114 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7115 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 7116 } 7117 } 7118 7119 static void 7120 bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 7121 { 7122 struct spdk_bdev_io *parent_io = cb_arg; 7123 7124 spdk_bdev_free_io(bdev_io); 7125 7126 if (!success) { 7127 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 7128 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 7129 return; 7130 } 7131 7132 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 7133 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 7134 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 7135 return; 7136 } 7137 7138 bdev_write_zero_buffer_next(parent_io); 7139 } 7140 7141 static void 7142 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 7143 { 7144 pthread_mutex_lock(&ctx->bdev->internal.mutex); 7145 ctx->bdev->internal.qos_mod_in_progress = false; 7146 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 7147 7148 if (ctx->cb_fn) { 7149 ctx->cb_fn(ctx->cb_arg, status); 7150 } 7151 free(ctx); 7152 } 7153 7154 static void 7155 bdev_disable_qos_done(void *cb_arg) 7156 { 7157 struct set_qos_limit_ctx *ctx = cb_arg; 7158 struct spdk_bdev *bdev = ctx->bdev; 7159 struct spdk_bdev_io *bdev_io; 7160 struct spdk_bdev_qos *qos; 7161 7162 pthread_mutex_lock(&bdev->internal.mutex); 7163 qos = bdev->internal.qos; 7164 bdev->internal.qos = NULL; 7165 pthread_mutex_unlock(&bdev->internal.mutex); 7166 7167 while (!TAILQ_EMPTY(&qos->queued)) { 7168 /* Send queued I/O back to their original thread for resubmission. 
*/ 7169 bdev_io = TAILQ_FIRST(&qos->queued); 7170 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 7171 7172 if (bdev_io->internal.io_submit_ch) { 7173 /* 7174 * Channel was changed when sending it to the QoS thread - change it back 7175 * before sending it back to the original thread. 7176 */ 7177 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 7178 bdev_io->internal.io_submit_ch = NULL; 7179 } 7180 7181 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 7182 _bdev_io_submit, bdev_io); 7183 } 7184 7185 if (qos->thread != NULL) { 7186 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 7187 spdk_poller_unregister(&qos->poller); 7188 } 7189 7190 free(qos); 7191 7192 bdev_set_qos_limit_done(ctx, 0); 7193 } 7194 7195 static void 7196 bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status) 7197 { 7198 void *io_device = spdk_io_channel_iter_get_io_device(i); 7199 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 7200 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7201 struct spdk_thread *thread; 7202 7203 pthread_mutex_lock(&bdev->internal.mutex); 7204 thread = bdev->internal.qos->thread; 7205 pthread_mutex_unlock(&bdev->internal.mutex); 7206 7207 if (thread != NULL) { 7208 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 7209 } else { 7210 bdev_disable_qos_done(ctx); 7211 } 7212 } 7213 7214 static void 7215 bdev_disable_qos_msg(struct spdk_io_channel_iter *i) 7216 { 7217 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 7218 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 7219 7220 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 7221 7222 spdk_for_each_channel_continue(i, 0); 7223 } 7224 7225 static void 7226 bdev_update_qos_rate_limit_msg(void *cb_arg) 7227 { 7228 struct set_qos_limit_ctx *ctx = cb_arg; 7229 struct spdk_bdev *bdev = ctx->bdev; 7230 7231 pthread_mutex_lock(&bdev->internal.mutex); 7232 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 7233 pthread_mutex_unlock(&bdev->internal.mutex); 7234 7235 bdev_set_qos_limit_done(ctx, 0); 7236 } 7237 7238 static void 7239 bdev_enable_qos_msg(struct spdk_io_channel_iter *i) 7240 { 7241 void *io_device = spdk_io_channel_iter_get_io_device(i); 7242 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 7243 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 7244 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 7245 7246 pthread_mutex_lock(&bdev->internal.mutex); 7247 bdev_enable_qos(bdev, bdev_ch); 7248 pthread_mutex_unlock(&bdev->internal.mutex); 7249 spdk_for_each_channel_continue(i, 0); 7250 } 7251 7252 static void 7253 bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status) 7254 { 7255 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7256 7257 bdev_set_qos_limit_done(ctx, status); 7258 } 7259 7260 static void 7261 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 7262 { 7263 int i; 7264 7265 assert(bdev->internal.qos != NULL); 7266 7267 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 7268 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 7269 bdev->internal.qos->rate_limits[i].limit = limits[i]; 7270 7271 if (limits[i] == 0) { 7272 bdev->internal.qos->rate_limits[i].limit = 7273 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 7274 } 7275 } 7276 } 7277 } 7278 7279 void 7280 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 7281 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 7282 { 7283 struct set_qos_limit_ctx *ctx; 7284 uint32_t 
limit_set_complement; 7285 uint64_t min_limit_per_sec; 7286 int i; 7287 bool disable_rate_limit = true; 7288 7289 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 7290 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 7291 continue; 7292 } 7293 7294 if (limits[i] > 0) { 7295 disable_rate_limit = false; 7296 } 7297 7298 if (bdev_qos_is_iops_rate_limit(i) == true) { 7299 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 7300 } else { 7301 /* Change from megabyte to byte rate limit */ 7302 limits[i] = limits[i] * 1024 * 1024; 7303 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 7304 } 7305 7306 limit_set_complement = limits[i] % min_limit_per_sec; 7307 if (limit_set_complement) { 7308 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 7309 limits[i], min_limit_per_sec); 7310 limits[i] += min_limit_per_sec - limit_set_complement; 7311 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 7312 } 7313 } 7314 7315 ctx = calloc(1, sizeof(*ctx)); 7316 if (ctx == NULL) { 7317 cb_fn(cb_arg, -ENOMEM); 7318 return; 7319 } 7320 7321 ctx->cb_fn = cb_fn; 7322 ctx->cb_arg = cb_arg; 7323 ctx->bdev = bdev; 7324 7325 pthread_mutex_lock(&bdev->internal.mutex); 7326 if (bdev->internal.qos_mod_in_progress) { 7327 pthread_mutex_unlock(&bdev->internal.mutex); 7328 free(ctx); 7329 cb_fn(cb_arg, -EAGAIN); 7330 return; 7331 } 7332 bdev->internal.qos_mod_in_progress = true; 7333 7334 if (disable_rate_limit == true && bdev->internal.qos) { 7335 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 7336 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 7337 (bdev->internal.qos->rate_limits[i].limit > 0 && 7338 bdev->internal.qos->rate_limits[i].limit != 7339 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 7340 disable_rate_limit = false; 7341 break; 7342 } 7343 } 7344 } 7345 7346 if (disable_rate_limit == false) { 7347 if (bdev->internal.qos == NULL) { 7348 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 7349 if (!bdev->internal.qos) { 7350 pthread_mutex_unlock(&bdev->internal.mutex); 7351 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 7352 bdev_set_qos_limit_done(ctx, -ENOMEM); 7353 return; 7354 } 7355 } 7356 7357 if (bdev->internal.qos->thread == NULL) { 7358 /* Enabling */ 7359 bdev_set_qos_rate_limits(bdev, limits); 7360 7361 spdk_for_each_channel(__bdev_to_io_dev(bdev), 7362 bdev_enable_qos_msg, ctx, 7363 bdev_enable_qos_done); 7364 } else { 7365 /* Updating */ 7366 bdev_set_qos_rate_limits(bdev, limits); 7367 7368 spdk_thread_send_msg(bdev->internal.qos->thread, 7369 bdev_update_qos_rate_limit_msg, ctx); 7370 } 7371 } else { 7372 if (bdev->internal.qos != NULL) { 7373 bdev_set_qos_rate_limits(bdev, limits); 7374 7375 /* Disabling */ 7376 spdk_for_each_channel(__bdev_to_io_dev(bdev), 7377 bdev_disable_qos_msg, ctx, 7378 bdev_disable_qos_msg_done); 7379 } else { 7380 pthread_mutex_unlock(&bdev->internal.mutex); 7381 bdev_set_qos_limit_done(ctx, 0); 7382 return; 7383 } 7384 } 7385 7386 pthread_mutex_unlock(&bdev->internal.mutex); 7387 } 7388 7389 struct spdk_bdev_histogram_ctx { 7390 spdk_bdev_histogram_status_cb cb_fn; 7391 void *cb_arg; 7392 struct spdk_bdev *bdev; 7393 int status; 7394 }; 7395 7396 static void 7397 bdev_histogram_disable_channel_cb(struct spdk_io_channel_iter *i, int status) 7398 { 7399 struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7400 7401 pthread_mutex_lock(&ctx->bdev->internal.mutex); 7402 ctx->bdev->internal.histogram_in_progress = false; 7403 
pthread_mutex_unlock(&ctx->bdev->internal.mutex); 7404 ctx->cb_fn(ctx->cb_arg, ctx->status); 7405 free(ctx); 7406 } 7407 7408 static void 7409 bdev_histogram_disable_channel(struct spdk_io_channel_iter *i) 7410 { 7411 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 7412 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7413 7414 if (ch->histogram != NULL) { 7415 spdk_histogram_data_free(ch->histogram); 7416 ch->histogram = NULL; 7417 } 7418 spdk_for_each_channel_continue(i, 0); 7419 } 7420 7421 static void 7422 bdev_histogram_enable_channel_cb(struct spdk_io_channel_iter *i, int status) 7423 { 7424 struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7425 7426 if (status != 0) { 7427 ctx->status = status; 7428 ctx->bdev->internal.histogram_enabled = false; 7429 spdk_for_each_channel(__bdev_to_io_dev(ctx->bdev), bdev_histogram_disable_channel, ctx, 7430 bdev_histogram_disable_channel_cb); 7431 } else { 7432 pthread_mutex_lock(&ctx->bdev->internal.mutex); 7433 ctx->bdev->internal.histogram_in_progress = false; 7434 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 7435 ctx->cb_fn(ctx->cb_arg, ctx->status); 7436 free(ctx); 7437 } 7438 } 7439 7440 static void 7441 bdev_histogram_enable_channel(struct spdk_io_channel_iter *i) 7442 { 7443 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 7444 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7445 int status = 0; 7446 7447 if (ch->histogram == NULL) { 7448 ch->histogram = spdk_histogram_data_alloc(); 7449 if (ch->histogram == NULL) { 7450 status = -ENOMEM; 7451 } 7452 } 7453 7454 spdk_for_each_channel_continue(i, status); 7455 } 7456 7457 void 7458 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 7459 void *cb_arg, bool enable) 7460 { 7461 struct spdk_bdev_histogram_ctx *ctx; 7462 7463 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 7464 if (ctx == NULL) { 7465 cb_fn(cb_arg, -ENOMEM); 7466 return; 7467 } 7468 7469 ctx->bdev = bdev; 7470 ctx->status = 0; 7471 ctx->cb_fn = cb_fn; 7472 ctx->cb_arg = cb_arg; 7473 7474 pthread_mutex_lock(&bdev->internal.mutex); 7475 if (bdev->internal.histogram_in_progress) { 7476 pthread_mutex_unlock(&bdev->internal.mutex); 7477 free(ctx); 7478 cb_fn(cb_arg, -EAGAIN); 7479 return; 7480 } 7481 7482 bdev->internal.histogram_in_progress = true; 7483 pthread_mutex_unlock(&bdev->internal.mutex); 7484 7485 bdev->internal.histogram_enabled = enable; 7486 7487 if (enable) { 7488 /* Allocate histogram for each channel */ 7489 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_enable_channel, ctx, 7490 bdev_histogram_enable_channel_cb); 7491 } else { 7492 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_disable_channel, ctx, 7493 bdev_histogram_disable_channel_cb); 7494 } 7495 } 7496 7497 struct spdk_bdev_histogram_data_ctx { 7498 spdk_bdev_histogram_data_cb cb_fn; 7499 void *cb_arg; 7500 struct spdk_bdev *bdev; 7501 /** merged histogram data from all channels */ 7502 struct spdk_histogram_data *histogram; 7503 }; 7504 7505 static void 7506 bdev_histogram_get_channel_cb(struct spdk_io_channel_iter *i, int status) 7507 { 7508 struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7509 7510 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 7511 free(ctx); 7512 } 7513 7514 static void 7515 bdev_histogram_get_channel(struct spdk_io_channel_iter *i) 7516 { 7517 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 7518 struct spdk_bdev_channel *ch = 
struct spdk_bdev_histogram_data_ctx {
	spdk_bdev_histogram_data_cb cb_fn;
	void *cb_arg;
	struct spdk_bdev *bdev;
	/** merged histogram data from all channels */
	struct spdk_histogram_data *histogram;
};

static void
bdev_histogram_get_channel_cb(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	ctx->cb_fn(ctx->cb_arg, status, ctx->histogram);
	free(ctx);
}

static void
bdev_histogram_get_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	int status = 0;

	if (ch->histogram == NULL) {
		status = -EFAULT;
	} else {
		spdk_histogram_data_merge(ctx->histogram, ch->histogram);
	}

	spdk_for_each_channel_continue(i, status);
}

void
spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram,
			spdk_bdev_histogram_data_cb cb_fn,
			void *cb_arg)
{
	struct spdk_bdev_histogram_data_ctx *ctx;

	ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx));
	if (ctx == NULL) {
		cb_fn(cb_arg, -ENOMEM, NULL);
		return;
	}

	ctx->bdev = bdev;
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	ctx->histogram = histogram;

	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_get_channel, ctx,
			      bdev_histogram_get_channel_cb);
}
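
/*
 * Illustrative sketch (hypothetical helper names): collecting the merged histogram.
 * The caller allocates an spdk_histogram_data, spdk_bdev_histogram_get() merges every
 * channel's data into it, and the data callback is then responsible for freeing it.
 *
 *	static void
 *	example_histogram_data_cb(void *cb_arg, int status, struct spdk_histogram_data *histogram)
 *	{
 *		if (status == 0) {
 *			// e.g. walk the buckets with spdk_histogram_data_iterate()
 *		}
 *		spdk_histogram_data_free(histogram);
 *	}
 *
 *	static void
 *	example_get_histogram(struct spdk_bdev *bdev)
 *	{
 *		struct spdk_histogram_data *histogram;
 *
 *		histogram = spdk_histogram_data_alloc();
 *		if (histogram == NULL) {
 *			return;
 *		}
 *		spdk_bdev_histogram_get(bdev, histogram, example_histogram_data_cb, NULL);
 *	}
 */
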
size_t
spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events,
			   size_t max_events)
{
	struct media_event_entry *entry;
	size_t num_events = 0;

	for (; num_events < max_events; ++num_events) {
		entry = TAILQ_FIRST(&desc->pending_media_events);
		if (entry == NULL) {
			break;
		}

		events[num_events] = entry->event;
		TAILQ_REMOVE(&desc->pending_media_events, entry, tailq);
		TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq);
	}

	return num_events;
}

int
spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events,
			    size_t num_events)
{
	struct spdk_bdev_desc *desc;
	struct media_event_entry *entry;
	size_t event_id;
	int rc = 0;

	assert(bdev->media_events);

	pthread_mutex_lock(&bdev->internal.mutex);
	TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
		if (desc->write) {
			break;
		}
	}

	if (desc == NULL || desc->media_events_buffer == NULL) {
		rc = -ENODEV;
		goto out;
	}

	for (event_id = 0; event_id < num_events; ++event_id) {
		entry = TAILQ_FIRST(&desc->free_media_events);
		if (entry == NULL) {
			break;
		}

		TAILQ_REMOVE(&desc->free_media_events, entry, tailq);
		TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq);
		entry->event = events[event_id];
	}

	rc = event_id;
out:
	pthread_mutex_unlock(&bdev->internal.mutex);
	return rc;
}

void
spdk_bdev_notify_media_management(struct spdk_bdev *bdev)
{
	struct spdk_bdev_desc *desc;

	pthread_mutex_lock(&bdev->internal.mutex);
	TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) {
		if (!TAILQ_EMPTY(&desc->pending_media_events)) {
			desc->callback.event_fn(SPDK_BDEV_EVENT_MEDIA_MANAGEMENT, bdev,
						desc->callback.ctx);
		}
	}
	pthread_mutex_unlock(&bdev->internal.mutex);
}
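
/*
 * Illustrative sketch (hypothetical names) of how the two halves of the media event
 * path fit together: a bdev module pushes events and then notifies consumers, and a
 * consumer that opened the bdev with write access drains the pending events from its
 * descriptor when its event callback fires with SPDK_BDEV_EVENT_MEDIA_MANAGEMENT.
 * struct example_ctx is assumed to hold the spdk_bdev_desc used at open time.
 *
 *	// Module side, with an events[] array already filled in:
 *	//	rc = spdk_bdev_push_media_events(bdev, events, num_events);
 *	//	if (rc >= 0) {
 *	//		spdk_bdev_notify_media_management(bdev);
 *	//	}
 *
 *	// Consumer side, inside the spdk_bdev_event_cb_t registered at open time:
 *	static void
 *	example_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *event_ctx)
 *	{
 *		struct example_ctx *ctx = event_ctx;
 *		struct spdk_bdev_media_event events[8];
 *		size_t num_events;
 *
 *		if (type != SPDK_BDEV_EVENT_MEDIA_MANAGEMENT) {
 *			return;
 *		}
 *
 *		do {
 *			num_events = spdk_bdev_get_media_events(ctx->desc, events, SPDK_COUNTOF(events));
 *			// handle events[0..num_events) here
 *		} while (num_events == SPDK_COUNTOF(events));
 *	}
 */
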
struct locked_lba_range_ctx {
	struct lba_range range;
	struct spdk_bdev *bdev;
	struct lba_range *current_range;
	struct lba_range *owner_range;
	struct spdk_poller *poller;
	lock_range_cb cb_fn;
	void *cb_arg;
};

static void
bdev_lock_error_cleanup_cb(struct spdk_io_channel_iter *i, int status)
{
	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	ctx->cb_fn(ctx->cb_arg, -ENOMEM);
	free(ctx);
}

static void
bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i);

static void
bdev_lock_lba_range_cb(struct spdk_io_channel_iter *i, int status)
{
	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct spdk_bdev *bdev = ctx->bdev;

	if (status == -ENOMEM) {
		/* One of the channels could not allocate a range object.
		 * So we have to go back and clean up any ranges that were
		 * allocated successfully before we return error status to
		 * the caller. We can reuse the unlock function to do that
		 * clean up.
		 */
		spdk_for_each_channel(__bdev_to_io_dev(bdev),
				      bdev_unlock_lba_range_get_channel, ctx,
				      bdev_lock_error_cleanup_cb);
		return;
	}

	/* All channels have locked this range and no I/O overlapping the range
	 * is outstanding! Set the owner_ch for the range object for the
	 * locking channel, so that this channel will know that it is allowed
	 * to write to this range.
	 */
	ctx->owner_range->owner_ch = ctx->range.owner_ch;
	ctx->cb_fn(ctx->cb_arg, status);

	/* Don't free the ctx here. Its range is in the bdev's global list of
	 * locked ranges still, and will be removed and freed when this range
	 * is later unlocked.
	 */
}

static int
bdev_lock_lba_range_check_io(void *_i)
{
	struct spdk_io_channel_iter *i = _i;
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct lba_range *range = ctx->current_range;
	struct spdk_bdev_io *bdev_io;

	spdk_poller_unregister(&ctx->poller);

	/* The range is now in the locked_ranges, so no new IO can be submitted to this
	 * range. But we need to wait until all outstanding IO overlapping with this range
	 * have completed.
	 */
	TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) {
		if (bdev_io_range_is_locked(bdev_io, range)) {
			ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100);
			return SPDK_POLLER_BUSY;
		}
	}

	spdk_for_each_channel_continue(i, 0);
	return SPDK_POLLER_BUSY;
}

static void
bdev_lock_lba_range_get_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct lba_range *range;

	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (range->length == ctx->range.length &&
		    range->offset == ctx->range.offset &&
		    range->locked_ctx == ctx->range.locked_ctx) {
			/* This range already exists on this channel, so don't add
			 * it again. This can happen when a new channel is created
			 * while the for_each_channel operation is in progress.
			 * Do not check for outstanding I/O in that case, since the
			 * range was locked before any I/O could be submitted to the
			 * new channel.
			 */
			spdk_for_each_channel_continue(i, 0);
			return;
		}
	}

	range = calloc(1, sizeof(*range));
	if (range == NULL) {
		spdk_for_each_channel_continue(i, -ENOMEM);
		return;
	}

	range->length = ctx->range.length;
	range->offset = ctx->range.offset;
	range->locked_ctx = ctx->range.locked_ctx;
	ctx->current_range = range;
	if (ctx->range.owner_ch == ch) {
		/* This is the range object for the channel that will hold
		 * the lock. Store it in the ctx object so that we can easily
		 * set its owner_ch after the lock is finally acquired.
		 */
		ctx->owner_range = range;
	}
	TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq);
	bdev_lock_lba_range_check_io(i);
}

static void
bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx)
{
	assert(spdk_get_thread() == spdk_io_channel_get_thread(ctx->range.owner_ch->channel));

	/* We will add a copy of this range to each channel now. */
	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_lock_lba_range_get_channel, ctx,
			      bdev_lock_lba_range_cb);
}

static bool
bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq)
{
	struct lba_range *r;

	TAILQ_FOREACH(r, tailq, tailq) {
		if (bdev_lba_range_overlapped(range, r)) {
			return true;
		}
	}
	return false;
}

static int
bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		    uint64_t offset, uint64_t length,
		    lock_range_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct locked_lba_range_ctx *ctx;

	if (cb_arg == NULL) {
		SPDK_ERRLOG("cb_arg must not be NULL\n");
		return -EINVAL;
	}

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		return -ENOMEM;
	}

	ctx->range.offset = offset;
	ctx->range.length = length;
	ctx->range.owner_ch = ch;
	ctx->range.locked_ctx = cb_arg;
	ctx->bdev = bdev;
	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	pthread_mutex_lock(&bdev->internal.mutex);
	if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) {
		/* There is an active lock overlapping with this range.
		 * Put it on the pending list until this range no
		 * longer overlaps with another.
		 */
		TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq);
	} else {
		TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq);
		bdev_lock_lba_range_ctx(bdev, ctx);
	}
	pthread_mutex_unlock(&bdev->internal.mutex);
	return 0;
}
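
/*
 * Illustrative sketch (hypothetical caller): bdev_lock_lba_range() is internal to this
 * file and quiesces a block range before an operation that must not race with other I/O.
 * The callback fires on the owning channel's thread once every channel has registered
 * the range and all overlapping outstanding I/O has completed. The matching release is
 * sketched after bdev_unlock_lba_range() below.
 *
 *	static void
 *	example_range_locked_cb(void *ctx, int status)
 *	{
 *		if (status != 0) {
 *			// Lock could not be taken (e.g. -ENOMEM); fail the operation.
 *			return;
 *		}
 *		// Safe to perform the protected operation on the locked range now.
 *	}
 *
 *	// From the thread owning io_ch, with op_ctx serving as both locked_ctx and cb_arg:
 *	//	rc = bdev_lock_lba_range(desc, io_ch, offset, num_blocks,
 *	//				 example_range_locked_cb, op_ctx);
 */
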
static void
bdev_lock_lba_range_ctx_msg(void *_ctx)
{
	struct locked_lba_range_ctx *ctx = _ctx;

	bdev_lock_lba_range_ctx(ctx->bdev, ctx);
}

static void
bdev_unlock_lba_range_cb(struct spdk_io_channel_iter *i, int status)
{
	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct locked_lba_range_ctx *pending_ctx;
	struct spdk_bdev_channel *ch = ctx->range.owner_ch;
	struct spdk_bdev *bdev = ch->bdev;
	struct lba_range *range, *tmp;

	pthread_mutex_lock(&bdev->internal.mutex);
	/* Check if there are any pending locked ranges that overlap with this range
	 * that was just unlocked. If there are, check that each of them no longer
	 * overlaps with any other locked range before calling bdev_lock_lba_range_ctx,
	 * which will start the lock process.
	 */
	TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) {
		if (bdev_lba_range_overlapped(range, &ctx->range) &&
		    !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) {
			TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq);
			pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
			TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq);
			spdk_thread_send_msg(spdk_io_channel_get_thread(pending_ctx->range.owner_ch->channel),
					     bdev_lock_lba_range_ctx_msg, pending_ctx);
		}
	}
	pthread_mutex_unlock(&bdev->internal.mutex);

	ctx->cb_fn(ctx->cb_arg, status);
	free(ctx);
}

static void
bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	TAILQ_HEAD(, spdk_bdev_io) io_locked;
	struct spdk_bdev_io *bdev_io;
	struct lba_range *range;

	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (ctx->range.offset == range->offset &&
		    ctx->range.length == range->length &&
		    ctx->range.locked_ctx == range->locked_ctx) {
			TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
			free(range);
			break;
		}
	}

	/* Note: we should almost always be able to assert that the range specified
	 * was found. But there are some very rare corner cases where a new channel
	 * gets created simultaneously with a range unlock, where this function
	 * would execute on that new channel and wouldn't have the range.
	 * We also use this to clean up range allocations when a later allocation
	 * fails in the locking path.
	 * So we can't actually assert() here.
	 */

	/* Swap the locked I/O into a temporary list, and then try to submit it again.
	 * We could hyper-optimize this to only resubmit locked I/O that overlap
	 * with the range that was just unlocked, but this isn't a performance path so
	 * we go for simplicity here.
	 */
	TAILQ_INIT(&io_locked);
	TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link);
	while (!TAILQ_EMPTY(&io_locked)) {
		bdev_io = TAILQ_FIRST(&io_locked);
		TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link);
		bdev_io_submit(bdev_io);
	}

	spdk_for_each_channel_continue(i, 0);
}

static int
bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		      uint64_t offset, uint64_t length,
		      lock_range_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct locked_lba_range_ctx *ctx;
	struct lba_range *range;
	bool range_found = false;

	/* Let's make sure the specified channel actually has a lock on
	 * the specified range. Note that the range must match exactly.
	 */
	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    range->owner_ch == ch && range->locked_ctx == cb_arg) {
			range_found = true;
			break;
		}
	}

	if (!range_found) {
		return -EINVAL;
	}

	pthread_mutex_lock(&bdev->internal.mutex);
	/* We confirmed that this channel has locked the specified range. To
	 * start the unlock process, we find the range in the bdev's locked_ranges
	 * and remove it. This ensures new channels don't inherit the locked range.
	 * Then we will send a message to each channel (including the one specified
	 * here) to remove the range from its per-channel list.
	 */
	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    range->locked_ctx == cb_arg) {
			break;
		}
	}
	if (range == NULL) {
		assert(false);
		pthread_mutex_unlock(&bdev->internal.mutex);
		return -EINVAL;
	}
	TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq);
	ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
	pthread_mutex_unlock(&bdev->internal.mutex);

	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unlock_lba_range_get_channel, ctx,
			      bdev_unlock_lba_range_cb);
	return 0;
}
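
/*
 * Illustrative sketch (hypothetical caller), continuing the example after
 * bdev_lock_lba_range() above: once the protected operation completes, the owning
 * channel releases the range with the same offset/length/cb_arg it locked with.
 * The callback fires after every channel has dropped its copy of the range and
 * resubmitted any I/O that was queued against it.
 *
 *	static void
 *	example_range_unlocked_cb(void *ctx, int status)
 *	{
 *		// status is expected to be 0 here; the operation can now be completed.
 *	}
 *
 *	// From the same thread and channel that took the lock:
 *	//	rc = bdev_unlock_lba_range(desc, io_ch, offset, num_blocks,
 *	//				   example_range_unlocked_cb, op_ctx);
 */
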
int
spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains,
			     int array_size)
{
	if (!bdev) {
		return -EINVAL;
	}

	if (bdev->fn_table->get_memory_domains) {
		return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size);
	}

	return 0;
}

struct spdk_bdev_for_each_io_ctx {
	void *ctx;
	spdk_bdev_io_fn fn;
	spdk_bdev_for_each_io_cb cb;
};

static void
bdev_channel_for_each_io(struct spdk_io_channel_iter *i)
{
	struct spdk_bdev_for_each_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch);
	struct spdk_bdev_io *bdev_io;
	int rc = 0;

	TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) {
		rc = ctx->fn(ctx->ctx, bdev_io);
		if (rc != 0) {
			break;
		}
	}

	spdk_for_each_channel_continue(i, rc);
}

static void
bdev_for_each_io_done(struct spdk_io_channel_iter *i, int status)
{
	struct spdk_bdev_for_each_io_ctx *ctx = spdk_io_channel_iter_get_ctx(i);

	ctx->cb(ctx->ctx, status);

	free(ctx);
}

void
spdk_bdev_for_each_bdev_io(struct spdk_bdev *bdev, void *_ctx, spdk_bdev_io_fn fn,
			   spdk_bdev_for_each_io_cb cb)
{
	struct spdk_bdev_for_each_io_ctx *ctx;

	assert(fn != NULL && cb != NULL);

	ctx = calloc(1, sizeof(*ctx));
	if (ctx == NULL) {
		SPDK_ERRLOG("Failed to allocate context.\n");
		cb(_ctx, -ENOMEM);
		return;
	}

	ctx->ctx = _ctx;
	ctx->fn = fn;
	ctx->cb = cb;

	spdk_for_each_channel(__bdev_to_io_dev(bdev),
			      bdev_channel_for_each_io,
			      ctx,
			      bdev_for_each_io_done);
}

SPDK_LOG_REGISTER_COMPONENT(bdev)
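
/*
 * Illustrative sketch (hypothetical helper names): walking every outstanding I/O on a
 * bdev with spdk_bdev_for_each_bdev_io(). The per-I/O function runs on each channel's
 * thread, one channel at a time; a non-zero return stops the walk and is reported to
 * the completion callback, which runs once iteration is finished.
 *
 *	static int
 *	example_count_io(void *ctx, struct spdk_bdev_io *bdev_io)
 *	{
 *		uint32_t *count = ctx;
 *
 *		(*count)++;
 *		return 0;
 *	}
 *
 *	static void
 *	example_count_done(void *ctx, int status)
 *	{
 *		uint32_t *count = ctx;
 *
 *		SPDK_NOTICELOG("Found %u outstanding I/O (status %d)\n", *count, status);
 *	}
 *
 *	// The counter must stay valid until example_count_done() runs:
 *	//	spdk_bdev_for_each_bdev_io(bdev, &example_counter, example_count_io, example_count_done);
 */
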
SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
{
	struct spdk_trace_tpoint_opts opts[] = {
		{
			"BDEV_IO_START", TRACE_BDEV_IO_START,
			OWNER_BDEV, OBJECT_BDEV_IO, 1,
			{
				{ "type", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
				{ "offset", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "len", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40}
			}
		},
		{
			"BDEV_IO_DONE", TRACE_BDEV_IO_DONE,
			OWNER_BDEV, OBJECT_BDEV_IO, 0,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		},
		{
			"BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE,
			OWNER_BDEV, OBJECT_NONE, 1,
			{
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8}
			}
		},
		{
			"BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY,
			OWNER_BDEV, OBJECT_NONE, 0,
			{
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8}
			}
		},
	};

	spdk_trace_register_owner(OWNER_BDEV, 'b');
	spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_START, OBJECT_BDEV_IO, 0);
	spdk_trace_tpoint_register_relation(TRACE_BDEV_NVME_IO_DONE, OBJECT_BDEV_IO, 0);
}