1 /* SPDX-License-Identifier: BSD-3-Clause 2 * Copyright (c) Intel Corporation. All rights reserved. 3 * Copyright (c) 2019 Mellanox Technologies LTD. All rights reserved. 4 * Copyright (c) 2021, 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 5 */ 6 7 #include "spdk/stdinc.h" 8 9 #include "spdk/bdev.h" 10 11 #include "spdk/config.h" 12 #include "spdk/env.h" 13 #include "spdk/thread.h" 14 #include "spdk/likely.h" 15 #include "spdk/queue.h" 16 #include "spdk/nvme_spec.h" 17 #include "spdk/scsi_spec.h" 18 #include "spdk/notify.h" 19 #include "spdk/util.h" 20 #include "spdk/trace.h" 21 #include "spdk/dma.h" 22 23 #include "spdk/bdev_module.h" 24 #include "spdk/log.h" 25 #include "spdk/string.h" 26 27 #include "bdev_internal.h" 28 #include "spdk_internal/trace_defs.h" 29 30 #ifdef SPDK_CONFIG_VTUNE 31 #include "ittnotify.h" 32 #include "ittnotify_types.h" 33 int __itt_init_ittlib(const char *, __itt_group_id); 34 #endif 35 36 #define SPDK_BDEV_IO_POOL_SIZE (64 * 1024 - 1) 37 #define SPDK_BDEV_IO_CACHE_SIZE 256 38 #define SPDK_BDEV_AUTO_EXAMINE true 39 #define BUF_SMALL_POOL_SIZE 8191 40 #define BUF_LARGE_POOL_SIZE 1023 41 #define NOMEM_THRESHOLD_COUNT 8 42 43 #define SPDK_BDEV_QOS_TIMESLICE_IN_USEC 1000 44 #define SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE 1 45 #define SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE 512 46 #define SPDK_BDEV_QOS_MIN_IOS_PER_SEC 1000 47 #define SPDK_BDEV_QOS_MIN_BYTES_PER_SEC (1024 * 1024) 48 #define SPDK_BDEV_QOS_LIMIT_NOT_DEFINED UINT64_MAX 49 #define SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC 1000 50 51 #define SPDK_BDEV_POOL_ALIGNMENT 512 52 53 /* The maximum number of children requests for a UNMAP or WRITE ZEROES command 54 * when splitting into children requests at a time. 55 */ 56 #define SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8) 57 58 static const char *qos_rpc_type[] = {"rw_ios_per_sec", 59 "rw_mbytes_per_sec", "r_mbytes_per_sec", "w_mbytes_per_sec" 60 }; 61 62 TAILQ_HEAD(spdk_bdev_list, spdk_bdev); 63 64 RB_HEAD(bdev_name_tree, spdk_bdev_name); 65 66 static int 67 bdev_name_cmp(struct spdk_bdev_name *name1, struct spdk_bdev_name *name2) 68 { 69 return strcmp(name1->name, name2->name); 70 } 71 72 RB_GENERATE_STATIC(bdev_name_tree, spdk_bdev_name, node, bdev_name_cmp); 73 74 struct spdk_bdev_mgr { 75 struct spdk_mempool *bdev_io_pool; 76 77 struct spdk_mempool *buf_small_pool; 78 struct spdk_mempool *buf_large_pool; 79 80 void *zero_buffer; 81 82 TAILQ_HEAD(bdev_module_list, spdk_bdev_module) bdev_modules; 83 84 struct spdk_bdev_list bdevs; 85 struct bdev_name_tree bdev_names; 86 87 bool init_complete; 88 bool module_init_complete; 89 90 pthread_mutex_t mutex; 91 92 #ifdef SPDK_CONFIG_VTUNE 93 __itt_domain *domain; 94 #endif 95 }; 96 97 static struct spdk_bdev_mgr g_bdev_mgr = { 98 .bdev_modules = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdev_modules), 99 .bdevs = TAILQ_HEAD_INITIALIZER(g_bdev_mgr.bdevs), 100 .bdev_names = RB_INITIALIZER(g_bdev_mgr.bdev_names), 101 .init_complete = false, 102 .module_init_complete = false, 103 .mutex = PTHREAD_MUTEX_INITIALIZER, 104 }; 105 106 typedef void (*lock_range_cb)(void *ctx, int status); 107 108 typedef void (*bdev_copy_bounce_buffer_cpl)(void *ctx, int rc); 109 110 struct lba_range { 111 uint64_t offset; 112 uint64_t length; 113 void *locked_ctx; 114 struct spdk_bdev_channel *owner_ch; 115 TAILQ_ENTRY(lba_range) tailq; 116 }; 117 118 static struct spdk_bdev_opts g_bdev_opts = { 119 .bdev_io_pool_size = SPDK_BDEV_IO_POOL_SIZE, 120 .bdev_io_cache_size = SPDK_BDEV_IO_CACHE_SIZE, 121 .bdev_auto_examine = SPDK_BDEV_AUTO_EXAMINE, 
	.small_buf_pool_size = BUF_SMALL_POOL_SIZE,
	.large_buf_pool_size = BUF_LARGE_POOL_SIZE,
};

static spdk_bdev_init_cb g_init_cb_fn = NULL;
static void *g_init_cb_arg = NULL;

static spdk_bdev_fini_cb g_fini_cb_fn = NULL;
static void *g_fini_cb_arg = NULL;
static struct spdk_thread *g_fini_thread = NULL;

struct spdk_bdev_qos_limit {
	/** IOs or bytes allowed per second (i.e., 1s). */
	uint64_t limit;

	/** Remaining IOs or bytes allowed in current timeslice (e.g., 1ms).
	 * For remaining bytes, allowed to run negative if an I/O is submitted when
	 * some bytes are remaining, but the I/O is bigger than that amount. The
	 * excess will be deducted from the next timeslice.
	 */
	int64_t remaining_this_timeslice;

	/** Minimum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t min_per_timeslice;

	/** Maximum allowed IOs or bytes to be issued in one timeslice (e.g., 1ms). */
	uint32_t max_per_timeslice;

	/** Function to check whether to queue the IO. */
	bool (*queue_io)(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);

	/** Function to update for the submitted IO. */
	void (*update_quota)(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io);
};

struct spdk_bdev_qos {
	/** Rate limits, one per supported QoS limit type. */
	struct spdk_bdev_qos_limit rate_limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES];

	/** The channel that all I/O are funneled through. */
	struct spdk_bdev_channel *ch;

	/** The thread on which the poller is running. */
	struct spdk_thread *thread;

	/** Queue of I/O waiting to be issued. */
	bdev_io_tailq_t queued;

	/** Size of a timeslice in tsc ticks. */
	uint64_t timeslice_size;

	/** Timestamp of start of last timeslice. */
	uint64_t last_timeslice;

	/** Poller that processes queued I/O commands each time slice. */
	struct spdk_poller *poller;
};

struct spdk_bdev_mgmt_channel {
	bdev_io_stailq_t need_buf_small;
	bdev_io_stailq_t need_buf_large;

	/*
	 * Each thread keeps a cache of bdev_io - this allows
	 * bdev threads which are *not* DPDK threads to still
	 * benefit from a per-thread bdev_io cache. Without
	 * this, non-DPDK threads fetching from the mempool
	 * incur a cmpxchg on get and put.
	 */
	bdev_io_stailq_t per_thread_cache;
	uint32_t per_thread_cache_count;
	uint32_t bdev_io_cache_size;

	TAILQ_HEAD(, spdk_bdev_shared_resource) shared_resources;
	TAILQ_HEAD(, spdk_bdev_io_wait_entry) io_wait_queue;
};

/*
 * Per-module (or per-io_device) data. Multiple bdevs built on the same io_device
 * queue their I/O awaiting retry here. This makes it possible to retry sending
 * I/O to one bdev after I/O from another bdev completes.
 */
struct spdk_bdev_shared_resource {
	/* The bdev management channel */
	struct spdk_bdev_mgmt_channel *mgmt_ch;

	/*
	 * Count of I/O submitted to bdev module and waiting for completion.
	 * Incremented before submit_request() is called on an spdk_bdev_io.
	 */
	uint64_t io_outstanding;

	/*
	 * Queue of IO awaiting retry because of a previous NOMEM status returned
	 * on this channel.
	 */
	bdev_io_tailq_t nomem_io;

	/*
	 * Threshold which io_outstanding must drop to before retrying nomem_io
222 */ 223 uint64_t nomem_threshold; 224 225 /* I/O channel allocated by a bdev module */ 226 struct spdk_io_channel *shared_ch; 227 228 /* Refcount of bdev channels using this resource */ 229 uint32_t ref; 230 231 TAILQ_ENTRY(spdk_bdev_shared_resource) link; 232 }; 233 234 #define BDEV_CH_RESET_IN_PROGRESS (1 << 0) 235 #define BDEV_CH_QOS_ENABLED (1 << 1) 236 237 struct spdk_bdev_channel { 238 struct spdk_bdev *bdev; 239 240 /* The channel for the underlying device */ 241 struct spdk_io_channel *channel; 242 243 /* Per io_device per thread data */ 244 struct spdk_bdev_shared_resource *shared_resource; 245 246 struct spdk_bdev_io_stat stat; 247 248 /* 249 * Count of I/O submitted to the underlying dev module through this channel 250 * and waiting for completion. 251 */ 252 uint64_t io_outstanding; 253 254 /* 255 * List of all submitted I/Os including I/O that are generated via splitting. 256 */ 257 bdev_io_tailq_t io_submitted; 258 259 /* 260 * List of spdk_bdev_io that are currently queued because they write to a locked 261 * LBA range. 262 */ 263 bdev_io_tailq_t io_locked; 264 265 uint32_t flags; 266 267 struct spdk_histogram_data *histogram; 268 269 #ifdef SPDK_CONFIG_VTUNE 270 uint64_t start_tsc; 271 uint64_t interval_tsc; 272 __itt_string_handle *handle; 273 struct spdk_bdev_io_stat prev_stat; 274 #endif 275 276 bdev_io_tailq_t queued_resets; 277 278 lba_range_tailq_t locked_ranges; 279 }; 280 281 struct media_event_entry { 282 struct spdk_bdev_media_event event; 283 TAILQ_ENTRY(media_event_entry) tailq; 284 }; 285 286 #define MEDIA_EVENT_POOL_SIZE 64 287 288 struct spdk_bdev_desc { 289 struct spdk_bdev *bdev; 290 struct spdk_thread *thread; 291 struct { 292 spdk_bdev_event_cb_t event_fn; 293 void *ctx; 294 } callback; 295 bool closed; 296 bool write; 297 bool memory_domains_supported; 298 pthread_mutex_t mutex; 299 uint32_t refs; 300 TAILQ_HEAD(, media_event_entry) pending_media_events; 301 TAILQ_HEAD(, media_event_entry) free_media_events; 302 struct media_event_entry *media_events_buffer; 303 TAILQ_ENTRY(spdk_bdev_desc) link; 304 305 uint64_t timeout_in_sec; 306 spdk_bdev_io_timeout_cb cb_fn; 307 void *cb_arg; 308 struct spdk_poller *io_timeout_poller; 309 }; 310 311 struct spdk_bdev_iostat_ctx { 312 struct spdk_bdev_io_stat *stat; 313 spdk_bdev_get_device_stat_cb cb; 314 void *cb_arg; 315 }; 316 317 struct set_qos_limit_ctx { 318 void (*cb_fn)(void *cb_arg, int status); 319 void *cb_arg; 320 struct spdk_bdev *bdev; 321 }; 322 323 #define __bdev_to_io_dev(bdev) (((char *)bdev) + 1) 324 #define __bdev_from_io_dev(io_dev) ((struct spdk_bdev *)(((char *)io_dev) - 1)) 325 326 static inline void bdev_io_complete(void *ctx); 327 328 static void bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 329 static void bdev_write_zero_buffer_next(void *_bdev_io); 330 331 static void bdev_enable_qos_msg(struct spdk_io_channel_iter *i); 332 static void bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status); 333 334 static int 335 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 336 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 337 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 338 struct spdk_bdev_ext_io_opts *opts, bool copy_opts); 339 static int 340 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 341 struct iovec *iov, int iovcnt, void *md_buf, 342 uint64_t offset_blocks, uint64_t num_blocks, 343 spdk_bdev_io_completion_cb cb, void *cb_arg, 
			   struct spdk_bdev_ext_io_opts *opts, bool copy_opts);

static int
bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		    uint64_t offset, uint64_t length,
		    lock_range_cb cb_fn, void *cb_arg);

static int
bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		      uint64_t offset, uint64_t length,
		      lock_range_cb cb_fn, void *cb_arg);

static inline void bdev_io_complete(void *ctx);

static bool bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort);
static bool bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort);

void
spdk_bdev_get_opts(struct spdk_bdev_opts *opts, size_t opts_size)
{
	if (!opts) {
		SPDK_ERRLOG("opts should not be NULL\n");
		return;
	}

	if (!opts_size) {
		SPDK_ERRLOG("opts_size should not be zero value\n");
		return;
	}

	opts->opts_size = opts_size;

#define SET_FIELD(field) \
	if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts_size) { \
		opts->field = g_bdev_opts.field; \
	} \

	SET_FIELD(bdev_io_pool_size);
	SET_FIELD(bdev_io_cache_size);
	SET_FIELD(bdev_auto_examine);
	SET_FIELD(small_buf_pool_size);
	SET_FIELD(large_buf_pool_size);

	/* Do not remove this statement. Always update it when adding a new field,
	 * and do not forget to add the SET_FIELD statement for your added field. */
	SPDK_STATIC_ASSERT(sizeof(struct spdk_bdev_opts) == 32, "Incorrect size");

#undef SET_FIELD
}

int
spdk_bdev_set_opts(struct spdk_bdev_opts *opts)
{
	uint32_t min_pool_size;

	if (!opts) {
		SPDK_ERRLOG("opts cannot be NULL\n");
		return -1;
	}

	if (!opts->opts_size) {
		SPDK_ERRLOG("opts_size inside opts cannot be zero value\n");
		return -1;
	}

	/*
	 * Add 1 to the thread count to account for the extra mgmt_ch that gets created during subsystem
	 * initialization. A second mgmt_ch will be created on the same thread when the application starts
	 * but before the deferred put_io_channel event is executed for the first mgmt_ch.
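	 * For example (illustrative numbers): with the default bdev_io_cache_size of 256 and
	 * 4 existing threads, min_pool_size works out to 256 * (4 + 1) = 1280 bdev_io entries.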
413 */ 414 min_pool_size = opts->bdev_io_cache_size * (spdk_thread_get_count() + 1); 415 if (opts->bdev_io_pool_size < min_pool_size) { 416 SPDK_ERRLOG("bdev_io_pool_size %" PRIu32 " is not compatible with bdev_io_cache_size %" PRIu32 417 " and %" PRIu32 " threads\n", opts->bdev_io_pool_size, opts->bdev_io_cache_size, 418 spdk_thread_get_count()); 419 SPDK_ERRLOG("bdev_io_pool_size must be at least %" PRIu32 "\n", min_pool_size); 420 return -1; 421 } 422 423 if (opts->small_buf_pool_size < BUF_SMALL_POOL_SIZE) { 424 SPDK_ERRLOG("small_buf_pool_size must be at least %" PRIu32 "\n", BUF_SMALL_POOL_SIZE); 425 return -1; 426 } 427 428 if (opts->large_buf_pool_size < BUF_LARGE_POOL_SIZE) { 429 SPDK_ERRLOG("large_buf_pool_size must be at least %" PRIu32 "\n", BUF_LARGE_POOL_SIZE); 430 return -1; 431 } 432 433 #define SET_FIELD(field) \ 434 if (offsetof(struct spdk_bdev_opts, field) + sizeof(opts->field) <= opts->opts_size) { \ 435 g_bdev_opts.field = opts->field; \ 436 } \ 437 438 SET_FIELD(bdev_io_pool_size); 439 SET_FIELD(bdev_io_cache_size); 440 SET_FIELD(bdev_auto_examine); 441 SET_FIELD(small_buf_pool_size); 442 SET_FIELD(large_buf_pool_size); 443 444 g_bdev_opts.opts_size = opts->opts_size; 445 446 #undef SET_FIELD 447 448 return 0; 449 } 450 451 static struct spdk_bdev * 452 bdev_get_by_name(const char *bdev_name) 453 { 454 struct spdk_bdev_name find; 455 struct spdk_bdev_name *res; 456 457 find.name = (char *)bdev_name; 458 res = RB_FIND(bdev_name_tree, &g_bdev_mgr.bdev_names, &find); 459 if (res != NULL) { 460 return res->bdev; 461 } 462 463 return NULL; 464 } 465 466 struct spdk_bdev * 467 spdk_bdev_get_by_name(const char *bdev_name) 468 { 469 struct spdk_bdev *bdev; 470 471 pthread_mutex_lock(&g_bdev_mgr.mutex); 472 bdev = bdev_get_by_name(bdev_name); 473 pthread_mutex_unlock(&g_bdev_mgr.mutex); 474 475 return bdev; 476 } 477 478 struct spdk_bdev_wait_for_examine_ctx { 479 struct spdk_poller *poller; 480 spdk_bdev_wait_for_examine_cb cb_fn; 481 void *cb_arg; 482 }; 483 484 static bool 485 bdev_module_all_actions_completed(void); 486 487 static int 488 bdev_wait_for_examine_cb(void *arg) 489 { 490 struct spdk_bdev_wait_for_examine_ctx *ctx = arg; 491 492 if (!bdev_module_all_actions_completed()) { 493 return SPDK_POLLER_IDLE; 494 } 495 496 spdk_poller_unregister(&ctx->poller); 497 ctx->cb_fn(ctx->cb_arg); 498 free(ctx); 499 500 return SPDK_POLLER_BUSY; 501 } 502 503 int 504 spdk_bdev_wait_for_examine(spdk_bdev_wait_for_examine_cb cb_fn, void *cb_arg) 505 { 506 struct spdk_bdev_wait_for_examine_ctx *ctx; 507 508 ctx = calloc(1, sizeof(*ctx)); 509 if (ctx == NULL) { 510 return -ENOMEM; 511 } 512 ctx->cb_fn = cb_fn; 513 ctx->cb_arg = cb_arg; 514 ctx->poller = SPDK_POLLER_REGISTER(bdev_wait_for_examine_cb, ctx, 0); 515 516 return 0; 517 } 518 519 struct spdk_bdev_examine_item { 520 char *name; 521 TAILQ_ENTRY(spdk_bdev_examine_item) link; 522 }; 523 524 TAILQ_HEAD(spdk_bdev_examine_allowlist, spdk_bdev_examine_item); 525 526 struct spdk_bdev_examine_allowlist g_bdev_examine_allowlist = TAILQ_HEAD_INITIALIZER( 527 g_bdev_examine_allowlist); 528 529 static inline bool 530 bdev_examine_allowlist_check(const char *name) 531 { 532 struct spdk_bdev_examine_item *item; 533 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 534 if (strcmp(name, item->name) == 0) { 535 return true; 536 } 537 } 538 return false; 539 } 540 541 static inline void 542 bdev_examine_allowlist_free(void) 543 { 544 struct spdk_bdev_examine_item *item; 545 while (!TAILQ_EMPTY(&g_bdev_examine_allowlist)) { 546 item = 
TAILQ_FIRST(&g_bdev_examine_allowlist); 547 TAILQ_REMOVE(&g_bdev_examine_allowlist, item, link); 548 free(item->name); 549 free(item); 550 } 551 } 552 553 static inline bool 554 bdev_in_examine_allowlist(struct spdk_bdev *bdev) 555 { 556 struct spdk_bdev_alias *tmp; 557 if (bdev_examine_allowlist_check(bdev->name)) { 558 return true; 559 } 560 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 561 if (bdev_examine_allowlist_check(tmp->alias.name)) { 562 return true; 563 } 564 } 565 return false; 566 } 567 568 static inline bool 569 bdev_ok_to_examine(struct spdk_bdev *bdev) 570 { 571 if (g_bdev_opts.bdev_auto_examine) { 572 return true; 573 } else { 574 return bdev_in_examine_allowlist(bdev); 575 } 576 } 577 578 static void 579 bdev_examine(struct spdk_bdev *bdev) 580 { 581 struct spdk_bdev_module *module; 582 uint32_t action; 583 584 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 585 if (module->examine_config && bdev_ok_to_examine(bdev)) { 586 action = module->internal.action_in_progress; 587 module->internal.action_in_progress++; 588 module->examine_config(bdev); 589 if (action != module->internal.action_in_progress) { 590 SPDK_ERRLOG("examine_config for module %s did not call spdk_bdev_module_examine_done()\n", 591 module->name); 592 } 593 } 594 } 595 596 if (bdev->internal.claim_module && bdev_ok_to_examine(bdev)) { 597 if (bdev->internal.claim_module->examine_disk) { 598 bdev->internal.claim_module->internal.action_in_progress++; 599 bdev->internal.claim_module->examine_disk(bdev); 600 } 601 return; 602 } 603 604 TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) { 605 if (module->examine_disk && bdev_ok_to_examine(bdev)) { 606 module->internal.action_in_progress++; 607 module->examine_disk(bdev); 608 } 609 } 610 } 611 612 int 613 spdk_bdev_examine(const char *name) 614 { 615 struct spdk_bdev *bdev; 616 struct spdk_bdev_examine_item *item; 617 618 if (g_bdev_opts.bdev_auto_examine) { 619 SPDK_ERRLOG("Manual examine is not allowed if auto examine is enabled"); 620 return -EINVAL; 621 } 622 623 if (bdev_examine_allowlist_check(name)) { 624 SPDK_ERRLOG("Duplicate bdev name for manual examine: %s\n", name); 625 return -EEXIST; 626 } 627 628 item = calloc(1, sizeof(*item)); 629 if (!item) { 630 return -ENOMEM; 631 } 632 item->name = strdup(name); 633 if (!item->name) { 634 free(item); 635 return -ENOMEM; 636 } 637 TAILQ_INSERT_TAIL(&g_bdev_examine_allowlist, item, link); 638 639 bdev = spdk_bdev_get_by_name(name); 640 if (bdev) { 641 bdev_examine(bdev); 642 } 643 return 0; 644 } 645 646 static inline void 647 bdev_examine_allowlist_config_json(struct spdk_json_write_ctx *w) 648 { 649 struct spdk_bdev_examine_item *item; 650 TAILQ_FOREACH(item, &g_bdev_examine_allowlist, link) { 651 spdk_json_write_object_begin(w); 652 spdk_json_write_named_string(w, "method", "bdev_examine"); 653 spdk_json_write_named_object_begin(w, "params"); 654 spdk_json_write_named_string(w, "name", item->name); 655 spdk_json_write_object_end(w); 656 spdk_json_write_object_end(w); 657 } 658 } 659 660 struct spdk_bdev * 661 spdk_bdev_first(void) 662 { 663 struct spdk_bdev *bdev; 664 665 bdev = TAILQ_FIRST(&g_bdev_mgr.bdevs); 666 if (bdev) { 667 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 668 } 669 670 return bdev; 671 } 672 673 struct spdk_bdev * 674 spdk_bdev_next(struct spdk_bdev *prev) 675 { 676 struct spdk_bdev *bdev; 677 678 bdev = TAILQ_NEXT(prev, internal.link); 679 if (bdev) { 680 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 681 } 682 
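	/*
	 * Illustrative sketch (not part of this file): how a caller might walk all
	 * registered bdevs with the iterators above. my_dump_bdevs() is a hypothetical
	 * name; the getters used are public spdk/bdev.h APIs.
	 *
	 *     static void
	 *     my_dump_bdevs(void)
	 *     {
	 *         struct spdk_bdev *b;
	 *
	 *         for (b = spdk_bdev_first(); b != NULL; b = spdk_bdev_next(b)) {
	 *             printf("%s: %" PRIu64 " blocks of %" PRIu32 " bytes\n",
	 *                    spdk_bdev_get_name(b), spdk_bdev_get_num_blocks(b),
	 *                    spdk_bdev_get_block_size(b));
	 *         }
	 *     }
	 */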
683 return bdev; 684 } 685 686 static struct spdk_bdev * 687 _bdev_next_leaf(struct spdk_bdev *bdev) 688 { 689 while (bdev != NULL) { 690 if (bdev->internal.claim_module == NULL) { 691 return bdev; 692 } else { 693 bdev = TAILQ_NEXT(bdev, internal.link); 694 } 695 } 696 697 return bdev; 698 } 699 700 struct spdk_bdev * 701 spdk_bdev_first_leaf(void) 702 { 703 struct spdk_bdev *bdev; 704 705 bdev = _bdev_next_leaf(TAILQ_FIRST(&g_bdev_mgr.bdevs)); 706 707 if (bdev) { 708 SPDK_DEBUGLOG(bdev, "Starting bdev iteration at %s\n", bdev->name); 709 } 710 711 return bdev; 712 } 713 714 struct spdk_bdev * 715 spdk_bdev_next_leaf(struct spdk_bdev *prev) 716 { 717 struct spdk_bdev *bdev; 718 719 bdev = _bdev_next_leaf(TAILQ_NEXT(prev, internal.link)); 720 721 if (bdev) { 722 SPDK_DEBUGLOG(bdev, "Continuing bdev iteration at %s\n", bdev->name); 723 } 724 725 return bdev; 726 } 727 728 static inline bool 729 bdev_io_use_memory_domain(struct spdk_bdev_io *bdev_io) 730 { 731 return bdev_io->internal.ext_opts && bdev_io->internal.ext_opts->memory_domain; 732 } 733 734 void 735 spdk_bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len) 736 { 737 struct iovec *iovs; 738 739 if (bdev_io->u.bdev.iovs == NULL) { 740 bdev_io->u.bdev.iovs = &bdev_io->iov; 741 bdev_io->u.bdev.iovcnt = 1; 742 } 743 744 iovs = bdev_io->u.bdev.iovs; 745 746 assert(iovs != NULL); 747 assert(bdev_io->u.bdev.iovcnt >= 1); 748 749 iovs[0].iov_base = buf; 750 iovs[0].iov_len = len; 751 } 752 753 void 754 spdk_bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 755 { 756 assert((len / spdk_bdev_get_md_size(bdev_io->bdev)) >= bdev_io->u.bdev.num_blocks); 757 bdev_io->u.bdev.md_buf = md_buf; 758 } 759 760 static bool 761 _is_buf_allocated(const struct iovec *iovs) 762 { 763 if (iovs == NULL) { 764 return false; 765 } 766 767 return iovs[0].iov_base != NULL; 768 } 769 770 static bool 771 _are_iovs_aligned(struct iovec *iovs, int iovcnt, uint32_t alignment) 772 { 773 int i; 774 uintptr_t iov_base; 775 776 if (spdk_likely(alignment == 1)) { 777 return true; 778 } 779 780 for (i = 0; i < iovcnt; i++) { 781 iov_base = (uintptr_t)iovs[i].iov_base; 782 if ((iov_base & (alignment - 1)) != 0) { 783 return false; 784 } 785 } 786 787 return true; 788 } 789 790 static void 791 _copy_iovs_to_buf(void *buf, size_t buf_len, struct iovec *iovs, int iovcnt) 792 { 793 int i; 794 size_t len; 795 796 for (i = 0; i < iovcnt; i++) { 797 len = spdk_min(iovs[i].iov_len, buf_len); 798 memcpy(buf, iovs[i].iov_base, len); 799 buf += len; 800 buf_len -= len; 801 } 802 } 803 804 static void 805 _copy_buf_to_iovs(struct iovec *iovs, int iovcnt, void *buf, size_t buf_len) 806 { 807 int i; 808 size_t len; 809 810 for (i = 0; i < iovcnt; i++) { 811 len = spdk_min(iovs[i].iov_len, buf_len); 812 memcpy(iovs[i].iov_base, buf, len); 813 buf += len; 814 buf_len -= len; 815 } 816 } 817 818 static void 819 bdev_io_get_buf_complete(struct spdk_bdev_io *bdev_io, bool status) 820 { 821 struct spdk_io_channel *ch = spdk_bdev_io_get_io_channel(bdev_io); 822 void *buf; 823 824 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 825 buf = bdev_io->internal.buf; 826 bdev_io->internal.buf = NULL; 827 bdev_io->internal.get_aux_buf_cb(ch, bdev_io, buf); 828 bdev_io->internal.get_aux_buf_cb = NULL; 829 } else { 830 assert(bdev_io->internal.get_buf_cb != NULL); 831 bdev_io->internal.get_buf_cb(ch, bdev_io, status); 832 bdev_io->internal.get_buf_cb = NULL; 833 } 834 } 835 836 static void 837 _bdev_io_pull_buffer_cpl(void *ctx, int rc) 838 { 839 
struct spdk_bdev_io *bdev_io = ctx; 840 841 if (rc) { 842 SPDK_ERRLOG("Set bounce buffer failed with rc %d\n", rc); 843 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 844 } 845 bdev_io_get_buf_complete(bdev_io, !rc); 846 } 847 848 static void 849 _bdev_io_pull_bounce_md_buf(struct spdk_bdev_io *bdev_io, void *md_buf, size_t len) 850 { 851 int rc = 0; 852 853 /* save original md_buf */ 854 bdev_io->internal.orig_md_iov.iov_base = bdev_io->u.bdev.md_buf; 855 bdev_io->internal.orig_md_iov.iov_len = len; 856 bdev_io->internal.bounce_md_iov.iov_base = md_buf; 857 bdev_io->internal.bounce_md_iov.iov_len = len; 858 /* set bounce md_buf */ 859 bdev_io->u.bdev.md_buf = md_buf; 860 861 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 862 if (bdev_io_use_memory_domain(bdev_io)) { 863 rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain, 864 bdev_io->internal.ext_opts->memory_domain_ctx, 865 &bdev_io->internal.orig_md_iov, 1, 866 &bdev_io->internal.bounce_md_iov, 1, 867 bdev_io->internal.data_transfer_cpl, 868 bdev_io); 869 if (rc == 0) { 870 /* Continue to submit IO in completion callback */ 871 return; 872 } 873 SPDK_ERRLOG("Failed to pull data from memory domain %s, rc %d\n", 874 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain), rc); 875 } else { 876 memcpy(md_buf, bdev_io->internal.orig_md_iov.iov_base, bdev_io->internal.orig_md_iov.iov_len); 877 } 878 } 879 880 assert(bdev_io->internal.data_transfer_cpl); 881 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 882 } 883 884 static void 885 _bdev_io_set_md_buf(struct spdk_bdev_io *bdev_io) 886 { 887 struct spdk_bdev *bdev = bdev_io->bdev; 888 uint64_t md_len; 889 void *buf; 890 891 if (spdk_bdev_is_md_separate(bdev)) { 892 buf = (char *)bdev_io->u.bdev.iovs[0].iov_base + bdev_io->u.bdev.iovs[0].iov_len; 893 md_len = bdev_io->u.bdev.num_blocks * bdev->md_len; 894 895 assert(((uintptr_t)buf & (spdk_bdev_get_buf_align(bdev) - 1)) == 0); 896 897 if (bdev_io->u.bdev.md_buf != NULL) { 898 _bdev_io_pull_bounce_md_buf(bdev_io, buf, md_len); 899 return; 900 } else { 901 spdk_bdev_io_set_md_buf(bdev_io, buf, md_len); 902 } 903 } 904 905 bdev_io_get_buf_complete(bdev_io, true); 906 } 907 908 static void 909 _bdev_io_pull_bounce_data_buf_done(void *ctx, int rc) 910 { 911 struct spdk_bdev_io *bdev_io = ctx; 912 913 if (rc) { 914 SPDK_ERRLOG("Failed to get data buffer\n"); 915 assert(bdev_io->internal.data_transfer_cpl); 916 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 917 return; 918 } 919 920 _bdev_io_set_md_buf(bdev_io); 921 } 922 923 static void 924 _bdev_io_pull_bounce_data_buf(struct spdk_bdev_io *bdev_io, void *buf, size_t len, 925 bdev_copy_bounce_buffer_cpl cpl_cb) 926 { 927 int rc = 0; 928 929 bdev_io->internal.data_transfer_cpl = cpl_cb; 930 /* save original iovec */ 931 bdev_io->internal.orig_iovs = bdev_io->u.bdev.iovs; 932 bdev_io->internal.orig_iovcnt = bdev_io->u.bdev.iovcnt; 933 /* set bounce iov */ 934 bdev_io->u.bdev.iovs = &bdev_io->internal.bounce_iov; 935 bdev_io->u.bdev.iovcnt = 1; 936 /* set bounce buffer for this operation */ 937 bdev_io->u.bdev.iovs[0].iov_base = buf; 938 bdev_io->u.bdev.iovs[0].iov_len = len; 939 /* if this is write path, copy data from original buffer to bounce buffer */ 940 if (bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 941 if (bdev_io_use_memory_domain(bdev_io)) { 942 rc = spdk_memory_domain_pull_data(bdev_io->internal.ext_opts->memory_domain, 943 bdev_io->internal.ext_opts->memory_domain_ctx, 944 bdev_io->internal.orig_iovs, 945 (uint32_t) 
bdev_io->internal.orig_iovcnt, 946 bdev_io->u.bdev.iovs, 1, 947 _bdev_io_pull_bounce_data_buf_done, 948 bdev_io); 949 if (rc == 0) { 950 /* Continue to submit IO in completion callback */ 951 return; 952 } 953 SPDK_ERRLOG("Failed to pull data from memory domain %s\n", 954 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 955 } else { 956 _copy_iovs_to_buf(buf, len, bdev_io->internal.orig_iovs, bdev_io->internal.orig_iovcnt); 957 } 958 } 959 960 _bdev_io_pull_bounce_data_buf_done(bdev_io, rc); 961 } 962 963 static void 964 _bdev_io_set_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t len) 965 { 966 struct spdk_bdev *bdev = bdev_io->bdev; 967 bool buf_allocated; 968 uint64_t alignment; 969 void *aligned_buf; 970 971 bdev_io->internal.buf = buf; 972 973 if (spdk_unlikely(bdev_io->internal.get_aux_buf_cb != NULL)) { 974 bdev_io_get_buf_complete(bdev_io, true); 975 return; 976 } 977 978 alignment = spdk_bdev_get_buf_align(bdev); 979 buf_allocated = _is_buf_allocated(bdev_io->u.bdev.iovs); 980 aligned_buf = (void *)(((uintptr_t)buf + (alignment - 1)) & ~(alignment - 1)); 981 982 if (buf_allocated) { 983 _bdev_io_pull_bounce_data_buf(bdev_io, aligned_buf, len, _bdev_io_pull_buffer_cpl); 984 /* Continue in completion callback */ 985 return; 986 } else { 987 spdk_bdev_io_set_buf(bdev_io, aligned_buf, len); 988 } 989 990 _bdev_io_set_md_buf(bdev_io); 991 } 992 993 static void 994 _bdev_io_put_buf(struct spdk_bdev_io *bdev_io, void *buf, uint64_t buf_len) 995 { 996 struct spdk_bdev *bdev = bdev_io->bdev; 997 struct spdk_mempool *pool; 998 struct spdk_bdev_io *tmp; 999 bdev_io_stailq_t *stailq; 1000 struct spdk_bdev_mgmt_channel *ch; 1001 uint64_t md_len, alignment; 1002 1003 md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1004 alignment = spdk_bdev_get_buf_align(bdev); 1005 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1006 1007 if (buf_len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + 1008 SPDK_BDEV_POOL_ALIGNMENT) { 1009 pool = g_bdev_mgr.buf_small_pool; 1010 stailq = &ch->need_buf_small; 1011 } else { 1012 pool = g_bdev_mgr.buf_large_pool; 1013 stailq = &ch->need_buf_large; 1014 } 1015 1016 if (STAILQ_EMPTY(stailq)) { 1017 spdk_mempool_put(pool, buf); 1018 } else { 1019 tmp = STAILQ_FIRST(stailq); 1020 STAILQ_REMOVE_HEAD(stailq, internal.buf_link); 1021 _bdev_io_set_buf(tmp, buf, tmp->internal.buf_len); 1022 } 1023 } 1024 1025 static void 1026 bdev_io_put_buf(struct spdk_bdev_io *bdev_io) 1027 { 1028 assert(bdev_io->internal.buf != NULL); 1029 _bdev_io_put_buf(bdev_io, bdev_io->internal.buf, bdev_io->internal.buf_len); 1030 bdev_io->internal.buf = NULL; 1031 } 1032 1033 void 1034 spdk_bdev_io_put_aux_buf(struct spdk_bdev_io *bdev_io, void *buf) 1035 { 1036 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1037 1038 assert(buf != NULL); 1039 _bdev_io_put_buf(bdev_io, buf, len); 1040 } 1041 1042 static void 1043 bdev_ch_retry_io(struct spdk_bdev_channel *bdev_ch) 1044 { 1045 struct spdk_bdev *bdev = bdev_ch->bdev; 1046 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1047 struct spdk_bdev_io *bdev_io; 1048 1049 if (shared_resource->io_outstanding > shared_resource->nomem_threshold) { 1050 /* 1051 * Allow some more I/O to complete before retrying the nomem_io queue. 
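		 * (For scale: _bdev_io_handle_no_mem() below sets nomem_threshold to
		 * spdk_max(n / 2, n - NOMEM_THRESHOLD_COUNT) for n I/O outstanding when NOMEM
		 * was seen, so with NOMEM_THRESHOLD_COUNT = 8 a channel that hit NOMEM at 100
		 * outstanding I/O retries once it drains to 92, while one that hit it at 4
		 * retries at 2.)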
1052 * Some drivers (such as nvme) cannot immediately take a new I/O in 1053 * the context of a completion, because the resources for the I/O are 1054 * not released until control returns to the bdev poller. Also, we 1055 * may require several small I/O to complete before a larger I/O 1056 * (that requires splitting) can be submitted. 1057 */ 1058 return; 1059 } 1060 1061 while (!TAILQ_EMPTY(&shared_resource->nomem_io)) { 1062 bdev_io = TAILQ_FIRST(&shared_resource->nomem_io); 1063 TAILQ_REMOVE(&shared_resource->nomem_io, bdev_io, internal.link); 1064 bdev_io->internal.ch->io_outstanding++; 1065 shared_resource->io_outstanding++; 1066 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 1067 bdev_io->internal.error.nvme.cdw0 = 0; 1068 bdev_io->num_retries++; 1069 bdev->fn_table->submit_request(spdk_bdev_io_get_io_channel(bdev_io), bdev_io); 1070 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) { 1071 break; 1072 } 1073 } 1074 } 1075 1076 static inline void 1077 _bdev_io_decrement_outstanding(struct spdk_bdev_channel *bdev_ch, 1078 struct spdk_bdev_shared_resource *shared_resource) 1079 { 1080 assert(bdev_ch->io_outstanding > 0); 1081 assert(shared_resource->io_outstanding > 0); 1082 bdev_ch->io_outstanding--; 1083 shared_resource->io_outstanding--; 1084 } 1085 1086 static inline bool 1087 _bdev_io_handle_no_mem(struct spdk_bdev_io *bdev_io) 1088 { 1089 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1090 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1091 1092 if (spdk_unlikely(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM)) { 1093 TAILQ_INSERT_HEAD(&shared_resource->nomem_io, bdev_io, internal.link); 1094 /* 1095 * Wait for some of the outstanding I/O to complete before we 1096 * retry any of the nomem_io. Normally we will wait for 1097 * NOMEM_THRESHOLD_COUNT I/O to complete but for low queue 1098 * depth channels we will instead wait for half to complete. 1099 */ 1100 shared_resource->nomem_threshold = spdk_max((int64_t)shared_resource->io_outstanding / 2, 1101 (int64_t)shared_resource->io_outstanding - NOMEM_THRESHOLD_COUNT); 1102 return true; 1103 } 1104 1105 if (spdk_unlikely(!TAILQ_EMPTY(&shared_resource->nomem_io))) { 1106 bdev_ch_retry_io(bdev_ch); 1107 } 1108 1109 return false; 1110 } 1111 1112 static void 1113 _bdev_io_complete_push_bounce_done(void *ctx, int rc) 1114 { 1115 struct spdk_bdev_io *bdev_io = ctx; 1116 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 1117 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 1118 1119 if (rc) { 1120 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 1121 } 1122 /* We want to free the bounce buffer here since we know we're done with it (as opposed 1123 * to waiting for the conditional free of internal.buf in spdk_bdev_free_io()). 
1124 */ 1125 bdev_io_put_buf(bdev_io); 1126 1127 /* Continue with IO completion flow */ 1128 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 1129 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 1130 return; 1131 } 1132 1133 bdev_io_complete(bdev_io); 1134 } 1135 1136 static inline void 1137 _bdev_io_push_bounce_md_buffer(struct spdk_bdev_io *bdev_io) 1138 { 1139 int rc = 0; 1140 1141 /* do the same for metadata buffer */ 1142 if (spdk_unlikely(bdev_io->internal.orig_md_iov.iov_base != NULL)) { 1143 assert(spdk_bdev_is_md_separate(bdev_io->bdev)); 1144 1145 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1146 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 1147 if (bdev_io_use_memory_domain(bdev_io)) { 1148 /* If memory domain is used then we need to call async push function */ 1149 rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain, 1150 bdev_io->internal.ext_opts->memory_domain_ctx, 1151 &bdev_io->internal.orig_md_iov, 1152 (uint32_t)bdev_io->internal.orig_iovcnt, 1153 &bdev_io->internal.bounce_md_iov, 1, 1154 bdev_io->internal.data_transfer_cpl, 1155 bdev_io); 1156 if (rc == 0) { 1157 /* Continue IO completion in async callback */ 1158 return; 1159 } 1160 SPDK_ERRLOG("Failed to push md to memory domain %s\n", 1161 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 1162 } else { 1163 memcpy(bdev_io->internal.orig_md_iov.iov_base, bdev_io->u.bdev.md_buf, 1164 bdev_io->internal.orig_md_iov.iov_len); 1165 } 1166 } 1167 } 1168 1169 assert(bdev_io->internal.data_transfer_cpl); 1170 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1171 } 1172 1173 static void 1174 _bdev_io_push_bounce_data_buffer_done(void *ctx, int rc) 1175 { 1176 struct spdk_bdev_io *bdev_io = ctx; 1177 1178 assert(bdev_io->internal.data_transfer_cpl); 1179 1180 if (rc) { 1181 bdev_io->internal.data_transfer_cpl(bdev_io, rc); 1182 return; 1183 } 1184 1185 /* set original buffer for this io */ 1186 bdev_io->u.bdev.iovcnt = bdev_io->internal.orig_iovcnt; 1187 bdev_io->u.bdev.iovs = bdev_io->internal.orig_iovs; 1188 /* disable bouncing buffer for this io */ 1189 bdev_io->internal.orig_iovcnt = 0; 1190 bdev_io->internal.orig_iovs = NULL; 1191 1192 _bdev_io_push_bounce_md_buffer(bdev_io); 1193 } 1194 1195 static inline void 1196 _bdev_io_push_bounce_data_buffer(struct spdk_bdev_io *bdev_io, bdev_copy_bounce_buffer_cpl cpl_cb) 1197 { 1198 int rc = 0; 1199 1200 bdev_io->internal.data_transfer_cpl = cpl_cb; 1201 1202 /* if this is read path, copy data from bounce buffer to original buffer */ 1203 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ && 1204 bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 1205 if (bdev_io_use_memory_domain(bdev_io)) { 1206 /* If memory domain is used then we need to call async push function */ 1207 rc = spdk_memory_domain_push_data(bdev_io->internal.ext_opts->memory_domain, 1208 bdev_io->internal.ext_opts->memory_domain_ctx, 1209 bdev_io->internal.orig_iovs, 1210 (uint32_t)bdev_io->internal.orig_iovcnt, 1211 &bdev_io->internal.bounce_iov, 1, 1212 _bdev_io_push_bounce_data_buffer_done, 1213 bdev_io); 1214 if (rc == 0) { 1215 /* Continue IO completion in async callback */ 1216 return; 1217 } 1218 SPDK_ERRLOG("Failed to push data to memory domain %s\n", 1219 spdk_memory_domain_get_dma_device_id(bdev_io->internal.ext_opts->memory_domain)); 1220 } else { 1221 _copy_buf_to_iovs(bdev_io->internal.orig_iovs, 1222 bdev_io->internal.orig_iovcnt, 1223 bdev_io->internal.bounce_iov.iov_base, 1224 
bdev_io->internal.bounce_iov.iov_len); 1225 } 1226 } 1227 1228 _bdev_io_push_bounce_data_buffer_done(bdev_io, rc); 1229 } 1230 1231 static void 1232 bdev_io_get_buf(struct spdk_bdev_io *bdev_io, uint64_t len) 1233 { 1234 struct spdk_bdev *bdev = bdev_io->bdev; 1235 struct spdk_mempool *pool; 1236 bdev_io_stailq_t *stailq; 1237 struct spdk_bdev_mgmt_channel *mgmt_ch; 1238 uint64_t alignment, md_len; 1239 void *buf; 1240 1241 alignment = spdk_bdev_get_buf_align(bdev); 1242 md_len = spdk_bdev_is_md_separate(bdev) ? bdev_io->u.bdev.num_blocks * bdev->md_len : 0; 1243 1244 if (len + alignment + md_len > SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) + 1245 SPDK_BDEV_POOL_ALIGNMENT) { 1246 SPDK_ERRLOG("Length + alignment %" PRIu64 " is larger than allowed\n", 1247 len + alignment); 1248 bdev_io_get_buf_complete(bdev_io, false); 1249 return; 1250 } 1251 1252 mgmt_ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1253 1254 bdev_io->internal.buf_len = len; 1255 1256 if (len + alignment + md_len <= SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + 1257 SPDK_BDEV_POOL_ALIGNMENT) { 1258 pool = g_bdev_mgr.buf_small_pool; 1259 stailq = &mgmt_ch->need_buf_small; 1260 } else { 1261 pool = g_bdev_mgr.buf_large_pool; 1262 stailq = &mgmt_ch->need_buf_large; 1263 } 1264 1265 buf = spdk_mempool_get(pool); 1266 if (!buf) { 1267 STAILQ_INSERT_TAIL(stailq, bdev_io, internal.buf_link); 1268 } else { 1269 _bdev_io_set_buf(bdev_io, buf, len); 1270 } 1271 } 1272 1273 void 1274 spdk_bdev_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, uint64_t len) 1275 { 1276 struct spdk_bdev *bdev = bdev_io->bdev; 1277 uint64_t alignment; 1278 1279 assert(cb != NULL); 1280 bdev_io->internal.get_buf_cb = cb; 1281 1282 alignment = spdk_bdev_get_buf_align(bdev); 1283 1284 if (_is_buf_allocated(bdev_io->u.bdev.iovs) && 1285 _are_iovs_aligned(bdev_io->u.bdev.iovs, bdev_io->u.bdev.iovcnt, alignment)) { 1286 /* Buffer already present and aligned */ 1287 cb(spdk_bdev_io_get_io_channel(bdev_io), bdev_io, true); 1288 return; 1289 } 1290 1291 bdev_io_get_buf(bdev_io, len); 1292 } 1293 1294 static void 1295 _bdev_memory_domain_get_io_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, 1296 bool success) 1297 { 1298 if (!success) { 1299 SPDK_ERRLOG("Failed to get data buffer, completing IO\n"); 1300 bdev_io_complete(bdev_io); 1301 } else { 1302 bdev_io_submit(bdev_io); 1303 } 1304 } 1305 1306 static void 1307 _bdev_memory_domain_io_get_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_buf_cb cb, 1308 uint64_t len) 1309 { 1310 assert(cb != NULL); 1311 bdev_io->internal.get_buf_cb = cb; 1312 1313 bdev_io_get_buf(bdev_io, len); 1314 } 1315 1316 void 1317 spdk_bdev_io_get_aux_buf(struct spdk_bdev_io *bdev_io, spdk_bdev_io_get_aux_buf_cb cb) 1318 { 1319 uint64_t len = bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 1320 1321 assert(cb != NULL); 1322 assert(bdev_io->internal.get_aux_buf_cb == NULL); 1323 bdev_io->internal.get_aux_buf_cb = cb; 1324 bdev_io_get_buf(bdev_io, len); 1325 } 1326 1327 static int 1328 bdev_module_get_max_ctx_size(void) 1329 { 1330 struct spdk_bdev_module *bdev_module; 1331 int max_bdev_module_size = 0; 1332 1333 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1334 if (bdev_module->get_ctx_size && bdev_module->get_ctx_size() > max_bdev_module_size) { 1335 max_bdev_module_size = bdev_module->get_ctx_size(); 1336 } 1337 } 1338 1339 return max_bdev_module_size; 1340 } 1341 1342 static void 1343 bdev_qos_config_json(struct spdk_bdev *bdev, struct 
spdk_json_write_ctx *w) 1344 { 1345 int i; 1346 struct spdk_bdev_qos *qos = bdev->internal.qos; 1347 uint64_t limits[SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES]; 1348 1349 if (!qos) { 1350 return; 1351 } 1352 1353 spdk_bdev_get_qos_rate_limits(bdev, limits); 1354 1355 spdk_json_write_object_begin(w); 1356 spdk_json_write_named_string(w, "method", "bdev_set_qos_limit"); 1357 1358 spdk_json_write_named_object_begin(w, "params"); 1359 spdk_json_write_named_string(w, "name", bdev->name); 1360 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 1361 if (limits[i] > 0) { 1362 spdk_json_write_named_uint64(w, qos_rpc_type[i], limits[i]); 1363 } 1364 } 1365 spdk_json_write_object_end(w); 1366 1367 spdk_json_write_object_end(w); 1368 } 1369 1370 void 1371 spdk_bdev_subsystem_config_json(struct spdk_json_write_ctx *w) 1372 { 1373 struct spdk_bdev_module *bdev_module; 1374 struct spdk_bdev *bdev; 1375 1376 assert(w != NULL); 1377 1378 spdk_json_write_array_begin(w); 1379 1380 spdk_json_write_object_begin(w); 1381 spdk_json_write_named_string(w, "method", "bdev_set_options"); 1382 spdk_json_write_named_object_begin(w, "params"); 1383 spdk_json_write_named_uint32(w, "bdev_io_pool_size", g_bdev_opts.bdev_io_pool_size); 1384 spdk_json_write_named_uint32(w, "bdev_io_cache_size", g_bdev_opts.bdev_io_cache_size); 1385 spdk_json_write_named_bool(w, "bdev_auto_examine", g_bdev_opts.bdev_auto_examine); 1386 spdk_json_write_object_end(w); 1387 spdk_json_write_object_end(w); 1388 1389 bdev_examine_allowlist_config_json(w); 1390 1391 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 1392 if (bdev_module->config_json) { 1393 bdev_module->config_json(w); 1394 } 1395 } 1396 1397 pthread_mutex_lock(&g_bdev_mgr.mutex); 1398 1399 TAILQ_FOREACH(bdev, &g_bdev_mgr.bdevs, internal.link) { 1400 if (bdev->fn_table->write_config_json) { 1401 bdev->fn_table->write_config_json(bdev, w); 1402 } 1403 1404 bdev_qos_config_json(bdev, w); 1405 } 1406 1407 pthread_mutex_unlock(&g_bdev_mgr.mutex); 1408 1409 /* This has to be last RPC in array to make sure all bdevs finished examine */ 1410 spdk_json_write_object_begin(w); 1411 spdk_json_write_named_string(w, "method", "bdev_wait_for_examine"); 1412 spdk_json_write_object_end(w); 1413 1414 spdk_json_write_array_end(w); 1415 } 1416 1417 static int 1418 bdev_mgmt_channel_create(void *io_device, void *ctx_buf) 1419 { 1420 struct spdk_bdev_mgmt_channel *ch = ctx_buf; 1421 struct spdk_bdev_io *bdev_io; 1422 uint32_t i; 1423 1424 STAILQ_INIT(&ch->need_buf_small); 1425 STAILQ_INIT(&ch->need_buf_large); 1426 1427 STAILQ_INIT(&ch->per_thread_cache); 1428 ch->bdev_io_cache_size = g_bdev_opts.bdev_io_cache_size; 1429 1430 /* Pre-populate bdev_io cache to ensure this thread cannot be starved. 
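	 * Each channel takes bdev_io_cache_size entries (256 by default) from the global
	 * pool up front; spdk_bdev_free_io() later returns entries to this per-thread cache
	 * first and only falls back to the global pool once the cache is full.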
	 */
	ch->per_thread_cache_count = 0;
	for (i = 0; i < ch->bdev_io_cache_size; i++) {
		bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool);
		assert(bdev_io != NULL);
		ch->per_thread_cache_count++;
		STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link);
	}

	TAILQ_INIT(&ch->shared_resources);
	TAILQ_INIT(&ch->io_wait_queue);

	return 0;
}

static void
bdev_mgmt_channel_destroy(void *io_device, void *ctx_buf)
{
	struct spdk_bdev_mgmt_channel *ch = ctx_buf;
	struct spdk_bdev_io *bdev_io;

	if (!STAILQ_EMPTY(&ch->need_buf_small) || !STAILQ_EMPTY(&ch->need_buf_large)) {
		SPDK_ERRLOG("Pending I/O list wasn't empty on mgmt channel free\n");
	}

	if (!TAILQ_EMPTY(&ch->shared_resources)) {
		SPDK_ERRLOG("Module channel list wasn't empty on mgmt channel free\n");
	}

	while (!STAILQ_EMPTY(&ch->per_thread_cache)) {
		bdev_io = STAILQ_FIRST(&ch->per_thread_cache);
		STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link);
		ch->per_thread_cache_count--;
		spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io);
	}

	assert(ch->per_thread_cache_count == 0);
}

static void
bdev_init_complete(int rc)
{
	spdk_bdev_init_cb cb_fn = g_init_cb_fn;
	void *cb_arg = g_init_cb_arg;
	struct spdk_bdev_module *m;

	g_bdev_mgr.init_complete = true;
	g_init_cb_fn = NULL;
	g_init_cb_arg = NULL;

	/*
	 * For modules that need to know when subsystem init is complete,
	 * inform them now.
	 */
	if (rc == 0) {
		TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
			if (m->init_complete) {
				m->init_complete();
			}
		}
	}

	cb_fn(cb_arg, rc);
}

static bool
bdev_module_all_actions_completed(void)
{
	struct spdk_bdev_module *m;

	TAILQ_FOREACH(m, &g_bdev_mgr.bdev_modules, internal.tailq) {
		if (m->internal.action_in_progress > 0) {
			return false;
		}
	}
	return true;
}

static void
bdev_module_action_complete(void)
{
	/*
	 * Don't finish bdev subsystem initialization if
	 * module pre-initialization is still in progress, or
	 * the subsystem has already been initialized.
	 */
	if (!g_bdev_mgr.module_init_complete || g_bdev_mgr.init_complete) {
		return;
	}

	/*
	 * Check all bdev modules for inits/examinations in progress. If any
	 * exist, return immediately since we cannot finish bdev subsystem
	 * initialization until all are completed.
	 */
	if (!bdev_module_all_actions_completed()) {
		return;
	}

	/*
	 * Modules already finished initialization - now that all
	 * the bdev modules have finished their asynchronous I/O
	 * processing, the entire bdev layer can be marked as complete.
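	 *
	 * A module that initializes asynchronously takes part in this accounting by setting
	 * .async_init and calling spdk_bdev_module_init_done() once its own setup completes.
	 * Illustrative sketch (hypothetical "example" module, not part of this file):
	 *
	 *     static struct spdk_bdev_module example_if = {
	 *         .name = "example",
	 *         .module_init = example_module_init,   // returns 0, finishes asynchronously
	 *         .async_init = true,
	 *     };
	 *     SPDK_BDEV_MODULE_REGISTER(example, &example_if)
	 *
	 *     static void
	 *     example_setup_done(void *ctx)
	 *     {
	 *         spdk_bdev_module_init_done(&example_if);
	 *     }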
	 */
	bdev_init_complete(0);
}

static void
bdev_module_action_done(struct spdk_bdev_module *module)
{
	assert(module->internal.action_in_progress > 0);
	module->internal.action_in_progress--;
	bdev_module_action_complete();
}

void
spdk_bdev_module_init_done(struct spdk_bdev_module *module)
{
	bdev_module_action_done(module);
}

void
spdk_bdev_module_examine_done(struct spdk_bdev_module *module)
{
	bdev_module_action_done(module);
}

/** The last initialized bdev module */
static struct spdk_bdev_module *g_resume_bdev_module = NULL;

static void
bdev_init_failed(void *cb_arg)
{
	struct spdk_bdev_module *module = cb_arg;

	module->internal.action_in_progress--;
	bdev_init_complete(-1);
}

static int
bdev_modules_init(void)
{
	struct spdk_bdev_module *module;
	int rc = 0;

	TAILQ_FOREACH(module, &g_bdev_mgr.bdev_modules, internal.tailq) {
		g_resume_bdev_module = module;
		if (module->async_init) {
			module->internal.action_in_progress = 1;
		}
		rc = module->module_init();
		if (rc != 0) {
			/* Bump action_in_progress to prevent other modules from completing modules_init.
			 * Send a message to defer application shutdown until resources are cleaned up. */
			module->internal.action_in_progress = 1;
			spdk_thread_send_msg(spdk_get_thread(), bdev_init_failed, module);
			return rc;
		}
	}

	g_resume_bdev_module = NULL;
	return 0;
}

void
spdk_bdev_initialize(spdk_bdev_init_cb cb_fn, void *cb_arg)
{
	int cache_size;
	int rc = 0;
	char mempool_name[32];

	assert(cb_fn != NULL);

	g_init_cb_fn = cb_fn;
	g_init_cb_arg = cb_arg;

	spdk_notify_type_register("bdev_register");
	spdk_notify_type_register("bdev_unregister");

	snprintf(mempool_name, sizeof(mempool_name), "bdev_io_%d", getpid());

	g_bdev_mgr.bdev_io_pool = spdk_mempool_create(mempool_name,
				  g_bdev_opts.bdev_io_pool_size,
				  sizeof(struct spdk_bdev_io) +
				  bdev_module_get_max_ctx_size(),
				  0,
				  SPDK_ENV_SOCKET_ID_ANY);

	if (g_bdev_mgr.bdev_io_pool == NULL) {
		SPDK_ERRLOG("could not allocate spdk_bdev_io pool\n");
		bdev_init_complete(-1);
		return;
	}

	/**
	 * Ensure no more than half of the total buffers end up in local caches, by
	 * using spdk_env_get_core_count() to determine how many local caches we need
	 * to account for.
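	 * For example (illustrative numbers): with BUF_SMALL_POOL_SIZE = 8191 and 4 cores,
	 * cache_size = 8191 / (2 * 4) = 1023 small buffers may be cached per core.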
1628 */ 1629 cache_size = BUF_SMALL_POOL_SIZE / (2 * spdk_env_get_core_count()); 1630 snprintf(mempool_name, sizeof(mempool_name), "buf_small_pool_%d", getpid()); 1631 1632 g_bdev_mgr.buf_small_pool = spdk_mempool_create(mempool_name, 1633 g_bdev_opts.small_buf_pool_size, 1634 SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_SMALL_BUF_MAX_SIZE) + 1635 SPDK_BDEV_POOL_ALIGNMENT, 1636 cache_size, 1637 SPDK_ENV_SOCKET_ID_ANY); 1638 if (!g_bdev_mgr.buf_small_pool) { 1639 SPDK_ERRLOG("create rbuf small pool failed\n"); 1640 bdev_init_complete(-1); 1641 return; 1642 } 1643 1644 cache_size = BUF_LARGE_POOL_SIZE / (2 * spdk_env_get_core_count()); 1645 snprintf(mempool_name, sizeof(mempool_name), "buf_large_pool_%d", getpid()); 1646 1647 g_bdev_mgr.buf_large_pool = spdk_mempool_create(mempool_name, 1648 g_bdev_opts.large_buf_pool_size, 1649 SPDK_BDEV_BUF_SIZE_WITH_MD(SPDK_BDEV_LARGE_BUF_MAX_SIZE) + 1650 SPDK_BDEV_POOL_ALIGNMENT, 1651 cache_size, 1652 SPDK_ENV_SOCKET_ID_ANY); 1653 if (!g_bdev_mgr.buf_large_pool) { 1654 SPDK_ERRLOG("create rbuf large pool failed\n"); 1655 bdev_init_complete(-1); 1656 return; 1657 } 1658 1659 g_bdev_mgr.zero_buffer = spdk_zmalloc(ZERO_BUFFER_SIZE, ZERO_BUFFER_SIZE, 1660 NULL, SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA); 1661 if (!g_bdev_mgr.zero_buffer) { 1662 SPDK_ERRLOG("create bdev zero buffer failed\n"); 1663 bdev_init_complete(-1); 1664 return; 1665 } 1666 1667 #ifdef SPDK_CONFIG_VTUNE 1668 g_bdev_mgr.domain = __itt_domain_create("spdk_bdev"); 1669 #endif 1670 1671 spdk_io_device_register(&g_bdev_mgr, bdev_mgmt_channel_create, 1672 bdev_mgmt_channel_destroy, 1673 sizeof(struct spdk_bdev_mgmt_channel), 1674 "bdev_mgr"); 1675 1676 rc = bdev_modules_init(); 1677 g_bdev_mgr.module_init_complete = true; 1678 if (rc != 0) { 1679 SPDK_ERRLOG("bdev modules init failed\n"); 1680 return; 1681 } 1682 1683 bdev_module_action_complete(); 1684 } 1685 1686 static void 1687 bdev_mgr_unregister_cb(void *io_device) 1688 { 1689 spdk_bdev_fini_cb cb_fn = g_fini_cb_fn; 1690 1691 if (g_bdev_mgr.bdev_io_pool) { 1692 if (spdk_mempool_count(g_bdev_mgr.bdev_io_pool) != g_bdev_opts.bdev_io_pool_size) { 1693 SPDK_ERRLOG("bdev IO pool count is %zu but should be %u\n", 1694 spdk_mempool_count(g_bdev_mgr.bdev_io_pool), 1695 g_bdev_opts.bdev_io_pool_size); 1696 } 1697 1698 spdk_mempool_free(g_bdev_mgr.bdev_io_pool); 1699 } 1700 1701 if (g_bdev_mgr.buf_small_pool) { 1702 if (spdk_mempool_count(g_bdev_mgr.buf_small_pool) != g_bdev_opts.small_buf_pool_size) { 1703 SPDK_ERRLOG("Small buffer pool count is %zu but should be %u\n", 1704 spdk_mempool_count(g_bdev_mgr.buf_small_pool), 1705 g_bdev_opts.small_buf_pool_size); 1706 assert(false); 1707 } 1708 1709 spdk_mempool_free(g_bdev_mgr.buf_small_pool); 1710 } 1711 1712 if (g_bdev_mgr.buf_large_pool) { 1713 if (spdk_mempool_count(g_bdev_mgr.buf_large_pool) != g_bdev_opts.large_buf_pool_size) { 1714 SPDK_ERRLOG("Large buffer pool count is %zu but should be %u\n", 1715 spdk_mempool_count(g_bdev_mgr.buf_large_pool), 1716 g_bdev_opts.large_buf_pool_size); 1717 assert(false); 1718 } 1719 1720 spdk_mempool_free(g_bdev_mgr.buf_large_pool); 1721 } 1722 1723 spdk_free(g_bdev_mgr.zero_buffer); 1724 1725 bdev_examine_allowlist_free(); 1726 1727 cb_fn(g_fini_cb_arg); 1728 g_fini_cb_fn = NULL; 1729 g_fini_cb_arg = NULL; 1730 g_bdev_mgr.init_complete = false; 1731 g_bdev_mgr.module_init_complete = false; 1732 } 1733 1734 static void 1735 bdev_module_fini_iter(void *arg) 1736 { 1737 struct spdk_bdev_module *bdev_module; 1738 1739 /* FIXME: Handling initialization failures is broken 
	 * now, so we won't even try cleaning up after successfully
	 * initialized modules. If module_init_complete is false,
	 * just call bdev_mgr_unregister_cb().
	 */
	if (!g_bdev_mgr.module_init_complete) {
		bdev_mgr_unregister_cb(NULL);
		return;
	}

	/* Start iterating from the last touched module */
	if (!g_resume_bdev_module) {
		bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list);
	} else {
		bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list,
					 internal.tailq);
	}

	while (bdev_module) {
		if (bdev_module->async_fini) {
			/* Save our place so we can resume later. We must
			 * save the variable here, before calling module_fini()
			 * below, because in some cases the module may immediately
			 * call spdk_bdev_module_fini_done() and re-enter
			 * this function to continue iterating. */
			g_resume_bdev_module = bdev_module;
		}

		if (bdev_module->module_fini) {
			bdev_module->module_fini();
		}

		if (bdev_module->async_fini) {
			return;
		}

		bdev_module = TAILQ_PREV(bdev_module, bdev_module_list,
					 internal.tailq);
	}

	g_resume_bdev_module = NULL;
	spdk_io_device_unregister(&g_bdev_mgr, bdev_mgr_unregister_cb);
}

void
spdk_bdev_module_fini_done(void)
{
	if (spdk_get_thread() != g_fini_thread) {
		spdk_thread_send_msg(g_fini_thread, bdev_module_fini_iter, NULL);
	} else {
		bdev_module_fini_iter(NULL);
	}
}

static void
bdev_finish_unregister_bdevs_iter(void *cb_arg, int bdeverrno)
{
	struct spdk_bdev *bdev = cb_arg;

	if (bdeverrno && bdev) {
		SPDK_WARNLOG("Unable to unregister bdev '%s' during spdk_bdev_finish()\n",
			     bdev->name);

		/*
		 * Since the call to spdk_bdev_unregister() failed, we have no way to free this
		 * bdev; try to continue by manually removing this bdev from the list and
		 * moving on to the next bdev in the list.
		 */
		TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link);
	}

	if (TAILQ_EMPTY(&g_bdev_mgr.bdevs)) {
		SPDK_DEBUGLOG(bdev, "Done unregistering bdevs\n");
		/*
		 * Bdev module finish needs to be deferred as we might be in the middle of some context
		 * (like bdev part free) that will use this bdev (or private bdev driver ctx data)
		 * after returning.
		 */
		spdk_thread_send_msg(spdk_get_thread(), bdev_module_fini_iter, NULL);
		return;
	}

	/*
	 * Unregister last unclaimed bdev in the list, to ensure that bdev subsystem
	 * shutdown proceeds top-down. The goal is to give virtual bdevs an opportunity
	 * to detect clean shutdown as opposed to run-time hot removal of the underlying
	 * base bdevs.
	 *
	 * Also, walk the list in the reverse order.
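	 * (Illustrative: if a RAID bdev claims two NVMe base bdevs, the RAID bdev is
	 * unregistered first; once it releases its claims, the base bdevs become
	 * unclaimed and are picked up on a later pass of this loop.)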
1828 */ 1829 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1830 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1831 if (bdev->internal.claim_module != NULL) { 1832 SPDK_DEBUGLOG(bdev, "Skipping claimed bdev '%s'(<-'%s').\n", 1833 bdev->name, bdev->internal.claim_module->name); 1834 continue; 1835 } 1836 1837 SPDK_DEBUGLOG(bdev, "Unregistering bdev '%s'\n", bdev->name); 1838 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 1839 return; 1840 } 1841 1842 /* 1843 * If any bdev fails to unclaim underlying bdev properly, we may face the 1844 * case of bdev list consisting of claimed bdevs only (if claims are managed 1845 * correctly, this would mean there's a loop in the claims graph which is 1846 * clearly impossible). Warn and unregister last bdev on the list then. 1847 */ 1848 for (bdev = TAILQ_LAST(&g_bdev_mgr.bdevs, spdk_bdev_list); 1849 bdev; bdev = TAILQ_PREV(bdev, spdk_bdev_list, internal.link)) { 1850 SPDK_WARNLOG("Unregistering claimed bdev '%s'!\n", bdev->name); 1851 spdk_bdev_unregister(bdev, bdev_finish_unregister_bdevs_iter, bdev); 1852 return; 1853 } 1854 } 1855 1856 static void 1857 bdev_module_fini_start_iter(void *arg) 1858 { 1859 struct spdk_bdev_module *bdev_module; 1860 1861 if (!g_resume_bdev_module) { 1862 bdev_module = TAILQ_LAST(&g_bdev_mgr.bdev_modules, bdev_module_list); 1863 } else { 1864 bdev_module = TAILQ_PREV(g_resume_bdev_module, bdev_module_list, internal.tailq); 1865 } 1866 1867 while (bdev_module) { 1868 if (bdev_module->async_fini_start) { 1869 /* Save our place so we can resume later. We must 1870 * save the variable here, before calling fini_start() 1871 * below, because in some cases the module may immediately 1872 * call spdk_bdev_module_fini_start_done() and re-enter 1873 * this function to continue iterating. */ 1874 g_resume_bdev_module = bdev_module; 1875 } 1876 1877 if (bdev_module->fini_start) { 1878 bdev_module->fini_start(); 1879 } 1880 1881 if (bdev_module->async_fini_start) { 1882 return; 1883 } 1884 1885 bdev_module = TAILQ_PREV(bdev_module, bdev_module_list, internal.tailq); 1886 } 1887 1888 g_resume_bdev_module = NULL; 1889 1890 bdev_finish_unregister_bdevs_iter(NULL, 0); 1891 } 1892 1893 void 1894 spdk_bdev_module_fini_start_done(void) 1895 { 1896 if (spdk_get_thread() != g_fini_thread) { 1897 spdk_thread_send_msg(g_fini_thread, bdev_module_fini_start_iter, NULL); 1898 } else { 1899 bdev_module_fini_start_iter(NULL); 1900 } 1901 } 1902 1903 void 1904 spdk_bdev_finish(spdk_bdev_fini_cb cb_fn, void *cb_arg) 1905 { 1906 assert(cb_fn != NULL); 1907 1908 g_fini_thread = spdk_get_thread(); 1909 1910 g_fini_cb_fn = cb_fn; 1911 g_fini_cb_arg = cb_arg; 1912 1913 bdev_module_fini_start_iter(NULL); 1914 } 1915 1916 struct spdk_bdev_io * 1917 bdev_channel_get_io(struct spdk_bdev_channel *channel) 1918 { 1919 struct spdk_bdev_mgmt_channel *ch = channel->shared_resource->mgmt_ch; 1920 struct spdk_bdev_io *bdev_io; 1921 1922 if (ch->per_thread_cache_count > 0) { 1923 bdev_io = STAILQ_FIRST(&ch->per_thread_cache); 1924 STAILQ_REMOVE_HEAD(&ch->per_thread_cache, internal.buf_link); 1925 ch->per_thread_cache_count--; 1926 } else if (spdk_unlikely(!TAILQ_EMPTY(&ch->io_wait_queue))) { 1927 /* 1928 * Don't try to look for bdev_ios in the global pool if there are 1929 * waiters on bdev_ios - we don't want this caller to jump the line. 
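		 *
		 * Callers see -ENOMEM from the submit path when this happens and are expected
		 * to register for a retry callback rather than spin. Illustrative sketch
		 * (hypothetical names my_retry_cb/my_ctx/io_ch, not part of this file):
		 *
		 *     struct spdk_bdev_io_wait_entry entry;
		 *
		 *     entry.bdev = bdev;
		 *     entry.cb_fn = my_retry_cb;    // re-submits the failed I/O
		 *     entry.cb_arg = my_ctx;
		 *     spdk_bdev_queue_io_wait(bdev, io_ch, &entry);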
1930 */ 1931 bdev_io = NULL; 1932 } else { 1933 bdev_io = spdk_mempool_get(g_bdev_mgr.bdev_io_pool); 1934 } 1935 1936 return bdev_io; 1937 } 1938 1939 void 1940 spdk_bdev_free_io(struct spdk_bdev_io *bdev_io) 1941 { 1942 struct spdk_bdev_mgmt_channel *ch; 1943 1944 assert(bdev_io != NULL); 1945 assert(bdev_io->internal.status != SPDK_BDEV_IO_STATUS_PENDING); 1946 1947 ch = bdev_io->internal.ch->shared_resource->mgmt_ch; 1948 1949 if (bdev_io->internal.buf != NULL) { 1950 bdev_io_put_buf(bdev_io); 1951 } 1952 1953 if (ch->per_thread_cache_count < ch->bdev_io_cache_size) { 1954 ch->per_thread_cache_count++; 1955 STAILQ_INSERT_HEAD(&ch->per_thread_cache, bdev_io, internal.buf_link); 1956 while (ch->per_thread_cache_count > 0 && !TAILQ_EMPTY(&ch->io_wait_queue)) { 1957 struct spdk_bdev_io_wait_entry *entry; 1958 1959 entry = TAILQ_FIRST(&ch->io_wait_queue); 1960 TAILQ_REMOVE(&ch->io_wait_queue, entry, link); 1961 entry->cb_fn(entry->cb_arg); 1962 } 1963 } else { 1964 /* We should never have a full cache with entries on the io wait queue. */ 1965 assert(TAILQ_EMPTY(&ch->io_wait_queue)); 1966 spdk_mempool_put(g_bdev_mgr.bdev_io_pool, (void *)bdev_io); 1967 } 1968 } 1969 1970 static bool 1971 bdev_qos_is_iops_rate_limit(enum spdk_bdev_qos_rate_limit_type limit) 1972 { 1973 assert(limit != SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 1974 1975 switch (limit) { 1976 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 1977 return true; 1978 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 1979 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 1980 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 1981 return false; 1982 case SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES: 1983 default: 1984 return false; 1985 } 1986 } 1987 1988 static bool 1989 bdev_qos_io_to_limit(struct spdk_bdev_io *bdev_io) 1990 { 1991 switch (bdev_io->type) { 1992 case SPDK_BDEV_IO_TYPE_NVME_IO: 1993 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 1994 case SPDK_BDEV_IO_TYPE_READ: 1995 case SPDK_BDEV_IO_TYPE_WRITE: 1996 return true; 1997 case SPDK_BDEV_IO_TYPE_ZCOPY: 1998 if (bdev_io->u.bdev.zcopy.start) { 1999 return true; 2000 } else { 2001 return false; 2002 } 2003 default: 2004 return false; 2005 } 2006 } 2007 2008 static bool 2009 bdev_is_read_io(struct spdk_bdev_io *bdev_io) 2010 { 2011 switch (bdev_io->type) { 2012 case SPDK_BDEV_IO_TYPE_NVME_IO: 2013 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2014 /* Bit 1 (0x2) set for read operation */ 2015 if (bdev_io->u.nvme_passthru.cmd.opc & SPDK_NVME_OPC_READ) { 2016 return true; 2017 } else { 2018 return false; 2019 } 2020 case SPDK_BDEV_IO_TYPE_READ: 2021 return true; 2022 case SPDK_BDEV_IO_TYPE_ZCOPY: 2023 /* Populate to read from disk */ 2024 if (bdev_io->u.bdev.zcopy.populate) { 2025 return true; 2026 } else { 2027 return false; 2028 } 2029 default: 2030 return false; 2031 } 2032 } 2033 2034 static uint64_t 2035 bdev_get_io_size_in_byte(struct spdk_bdev_io *bdev_io) 2036 { 2037 struct spdk_bdev *bdev = bdev_io->bdev; 2038 2039 switch (bdev_io->type) { 2040 case SPDK_BDEV_IO_TYPE_NVME_IO: 2041 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2042 return bdev_io->u.nvme_passthru.nbytes; 2043 case SPDK_BDEV_IO_TYPE_READ: 2044 case SPDK_BDEV_IO_TYPE_WRITE: 2045 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2046 case SPDK_BDEV_IO_TYPE_ZCOPY: 2047 /* Track the data in the start phase only */ 2048 if (bdev_io->u.bdev.zcopy.start) { 2049 return bdev_io->u.bdev.num_blocks * bdev->blocklen; 2050 } else { 2051 return 0; 2052 } 2053 default: 2054 return 0; 2055 } 2056 } 2057 2058 static bool 2059 bdev_qos_rw_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 
2060 { 2061 if (limit->max_per_timeslice > 0 && limit->remaining_this_timeslice <= 0) { 2062 return true; 2063 } else { 2064 return false; 2065 } 2066 } 2067 2068 static bool 2069 bdev_qos_r_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2070 { 2071 if (bdev_is_read_io(io) == false) { 2072 return false; 2073 } 2074 2075 return bdev_qos_rw_queue_io(limit, io); 2076 } 2077 2078 static bool 2079 bdev_qos_w_queue_io(const struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2080 { 2081 if (bdev_is_read_io(io) == true) { 2082 return false; 2083 } 2084 2085 return bdev_qos_rw_queue_io(limit, io); 2086 } 2087 2088 static void 2089 bdev_qos_rw_iops_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2090 { 2091 limit->remaining_this_timeslice--; 2092 } 2093 2094 static void 2095 bdev_qos_rw_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2096 { 2097 limit->remaining_this_timeslice -= bdev_get_io_size_in_byte(io); 2098 } 2099 2100 static void 2101 bdev_qos_r_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2102 { 2103 if (bdev_is_read_io(io) == false) { 2104 return; 2105 } 2106 2107 return bdev_qos_rw_bps_update_quota(limit, io); 2108 } 2109 2110 static void 2111 bdev_qos_w_bps_update_quota(struct spdk_bdev_qos_limit *limit, struct spdk_bdev_io *io) 2112 { 2113 if (bdev_is_read_io(io) == true) { 2114 return; 2115 } 2116 2117 return bdev_qos_rw_bps_update_quota(limit, io); 2118 } 2119 2120 static void 2121 bdev_qos_set_ops(struct spdk_bdev_qos *qos) 2122 { 2123 int i; 2124 2125 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2126 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 2127 qos->rate_limits[i].queue_io = NULL; 2128 qos->rate_limits[i].update_quota = NULL; 2129 continue; 2130 } 2131 2132 switch (i) { 2133 case SPDK_BDEV_QOS_RW_IOPS_RATE_LIMIT: 2134 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2135 qos->rate_limits[i].update_quota = bdev_qos_rw_iops_update_quota; 2136 break; 2137 case SPDK_BDEV_QOS_RW_BPS_RATE_LIMIT: 2138 qos->rate_limits[i].queue_io = bdev_qos_rw_queue_io; 2139 qos->rate_limits[i].update_quota = bdev_qos_rw_bps_update_quota; 2140 break; 2141 case SPDK_BDEV_QOS_R_BPS_RATE_LIMIT: 2142 qos->rate_limits[i].queue_io = bdev_qos_r_queue_io; 2143 qos->rate_limits[i].update_quota = bdev_qos_r_bps_update_quota; 2144 break; 2145 case SPDK_BDEV_QOS_W_BPS_RATE_LIMIT: 2146 qos->rate_limits[i].queue_io = bdev_qos_w_queue_io; 2147 qos->rate_limits[i].update_quota = bdev_qos_w_bps_update_quota; 2148 break; 2149 default: 2150 break; 2151 } 2152 } 2153 } 2154 2155 static void 2156 _bdev_io_complete_in_submit(struct spdk_bdev_channel *bdev_ch, 2157 struct spdk_bdev_io *bdev_io, 2158 enum spdk_bdev_io_status status) 2159 { 2160 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2161 2162 bdev_io->internal.in_submit_request = true; 2163 bdev_ch->io_outstanding++; 2164 shared_resource->io_outstanding++; 2165 spdk_bdev_io_complete(bdev_io, status); 2166 bdev_io->internal.in_submit_request = false; 2167 } 2168 2169 static inline void 2170 bdev_io_do_submit(struct spdk_bdev_channel *bdev_ch, struct spdk_bdev_io *bdev_io) 2171 { 2172 struct spdk_bdev *bdev = bdev_io->bdev; 2173 struct spdk_io_channel *ch = bdev_ch->channel; 2174 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 2175 2176 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 2177 struct spdk_bdev_mgmt_channel *mgmt_channel = 
shared_resource->mgmt_ch; 2178 struct spdk_bdev_io *bio_to_abort = bdev_io->u.abort.bio_to_abort; 2179 2180 if (bdev_abort_queued_io(&shared_resource->nomem_io, bio_to_abort) || 2181 bdev_abort_buf_io(&mgmt_channel->need_buf_small, bio_to_abort) || 2182 bdev_abort_buf_io(&mgmt_channel->need_buf_large, bio_to_abort)) { 2183 _bdev_io_complete_in_submit(bdev_ch, bdev_io, 2184 SPDK_BDEV_IO_STATUS_SUCCESS); 2185 return; 2186 } 2187 } 2188 2189 if (spdk_likely(TAILQ_EMPTY(&shared_resource->nomem_io))) { 2190 bdev_ch->io_outstanding++; 2191 shared_resource->io_outstanding++; 2192 bdev_io->internal.in_submit_request = true; 2193 bdev->fn_table->submit_request(ch, bdev_io); 2194 bdev_io->internal.in_submit_request = false; 2195 } else { 2196 TAILQ_INSERT_TAIL(&shared_resource->nomem_io, bdev_io, internal.link); 2197 } 2198 } 2199 2200 static int 2201 bdev_qos_io_submit(struct spdk_bdev_channel *ch, struct spdk_bdev_qos *qos) 2202 { 2203 struct spdk_bdev_io *bdev_io = NULL, *tmp = NULL; 2204 int i, submitted_ios = 0; 2205 2206 TAILQ_FOREACH_SAFE(bdev_io, &qos->queued, internal.link, tmp) { 2207 if (bdev_qos_io_to_limit(bdev_io) == true) { 2208 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2209 if (!qos->rate_limits[i].queue_io) { 2210 continue; 2211 } 2212 2213 if (qos->rate_limits[i].queue_io(&qos->rate_limits[i], 2214 bdev_io) == true) { 2215 return submitted_ios; 2216 } 2217 } 2218 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 2219 if (!qos->rate_limits[i].update_quota) { 2220 continue; 2221 } 2222 2223 qos->rate_limits[i].update_quota(&qos->rate_limits[i], bdev_io); 2224 } 2225 } 2226 2227 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 2228 bdev_io_do_submit(ch, bdev_io); 2229 submitted_ios++; 2230 } 2231 2232 return submitted_ios; 2233 } 2234 2235 static void 2236 bdev_queue_io_wait_with_cb(struct spdk_bdev_io *bdev_io, spdk_bdev_io_wait_cb cb_fn) 2237 { 2238 int rc; 2239 2240 bdev_io->internal.waitq_entry.bdev = bdev_io->bdev; 2241 bdev_io->internal.waitq_entry.cb_fn = cb_fn; 2242 bdev_io->internal.waitq_entry.cb_arg = bdev_io; 2243 rc = spdk_bdev_queue_io_wait(bdev_io->bdev, spdk_io_channel_from_ctx(bdev_io->internal.ch), 2244 &bdev_io->internal.waitq_entry); 2245 if (rc != 0) { 2246 SPDK_ERRLOG("Queue IO failed, rc=%d\n", rc); 2247 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2248 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2249 } 2250 } 2251 2252 static bool 2253 bdev_rw_should_split(struct spdk_bdev_io *bdev_io) 2254 { 2255 uint32_t io_boundary = bdev_io->bdev->optimal_io_boundary; 2256 uint32_t max_size = bdev_io->bdev->max_segment_size; 2257 int max_segs = bdev_io->bdev->max_num_segments; 2258 2259 io_boundary = bdev_io->bdev->split_on_optimal_io_boundary ? io_boundary : 0; 2260 2261 if (spdk_likely(!io_boundary && !max_segs && !max_size)) { 2262 return false; 2263 } 2264 2265 if (io_boundary) { 2266 uint64_t start_stripe, end_stripe; 2267 2268 start_stripe = bdev_io->u.bdev.offset_blocks; 2269 end_stripe = start_stripe + bdev_io->u.bdev.num_blocks - 1; 2270 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. 
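 *
 * Worked example (illustrative numbers only): with an optimal_io_boundary of
 * 128 blocks, an I/O at offset_blocks = 1000 for num_blocks = 100 yields
 * start_stripe = 1000 >> spdk_u32log2(128) = 7 and
 * end_stripe = (1000 + 100 - 1) >> 7 = 1099 >> 7 = 8, so the stripes differ
 * and the request is split. An I/O at offset_blocks = 1024 for 64 blocks maps
 * to start_stripe = end_stripe = 8 and is passed through unsplit.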
*/ 2271 if (spdk_likely(spdk_u32_is_pow2(io_boundary))) { 2272 start_stripe >>= spdk_u32log2(io_boundary); 2273 end_stripe >>= spdk_u32log2(io_boundary); 2274 } else { 2275 start_stripe /= io_boundary; 2276 end_stripe /= io_boundary; 2277 } 2278 2279 if (start_stripe != end_stripe) { 2280 return true; 2281 } 2282 } 2283 2284 if (max_segs) { 2285 if (bdev_io->u.bdev.iovcnt > max_segs) { 2286 return true; 2287 } 2288 } 2289 2290 if (max_size) { 2291 for (int i = 0; i < bdev_io->u.bdev.iovcnt; i++) { 2292 if (bdev_io->u.bdev.iovs[i].iov_len > max_size) { 2293 return true; 2294 } 2295 } 2296 } 2297 2298 return false; 2299 } 2300 2301 static bool 2302 bdev_unmap_should_split(struct spdk_bdev_io *bdev_io) 2303 { 2304 uint32_t num_unmap_segments; 2305 2306 if (!bdev_io->bdev->max_unmap || !bdev_io->bdev->max_unmap_segments) { 2307 return false; 2308 } 2309 num_unmap_segments = spdk_divide_round_up(bdev_io->u.bdev.num_blocks, bdev_io->bdev->max_unmap); 2310 if (num_unmap_segments > bdev_io->bdev->max_unmap_segments) { 2311 return true; 2312 } 2313 2314 return false; 2315 } 2316 2317 static bool 2318 bdev_write_zeroes_should_split(struct spdk_bdev_io *bdev_io) 2319 { 2320 if (!bdev_io->bdev->max_write_zeroes) { 2321 return false; 2322 } 2323 2324 if (bdev_io->u.bdev.num_blocks > bdev_io->bdev->max_write_zeroes) { 2325 return true; 2326 } 2327 2328 return false; 2329 } 2330 2331 static bool 2332 bdev_io_should_split(struct spdk_bdev_io *bdev_io) 2333 { 2334 switch (bdev_io->type) { 2335 case SPDK_BDEV_IO_TYPE_READ: 2336 case SPDK_BDEV_IO_TYPE_WRITE: 2337 return bdev_rw_should_split(bdev_io); 2338 case SPDK_BDEV_IO_TYPE_UNMAP: 2339 return bdev_unmap_should_split(bdev_io); 2340 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2341 return bdev_write_zeroes_should_split(bdev_io); 2342 default: 2343 return false; 2344 } 2345 } 2346 2347 static uint32_t 2348 _to_next_boundary(uint64_t offset, uint32_t boundary) 2349 { 2350 return (boundary - (offset % boundary)); 2351 } 2352 2353 static void 2354 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg); 2355 2356 static void 2357 _bdev_rw_split(void *_bdev_io); 2358 2359 static void 2360 bdev_unmap_split(struct spdk_bdev_io *bdev_io); 2361 2362 static void 2363 _bdev_unmap_split(void *_bdev_io) 2364 { 2365 return bdev_unmap_split((struct spdk_bdev_io *)_bdev_io); 2366 } 2367 2368 static void 2369 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io); 2370 2371 static void 2372 _bdev_write_zeroes_split(void *_bdev_io) 2373 { 2374 return bdev_write_zeroes_split((struct spdk_bdev_io *)_bdev_io); 2375 } 2376 2377 static int 2378 bdev_io_split_submit(struct spdk_bdev_io *bdev_io, struct iovec *iov, int iovcnt, void *md_buf, 2379 uint64_t num_blocks, uint64_t *offset, uint64_t *remaining) 2380 { 2381 int rc; 2382 uint64_t current_offset, current_remaining; 2383 spdk_bdev_io_wait_cb io_wait_fn; 2384 2385 current_offset = *offset; 2386 current_remaining = *remaining; 2387 2388 bdev_io->u.bdev.split_outstanding++; 2389 2390 io_wait_fn = _bdev_rw_split; 2391 switch (bdev_io->type) { 2392 case SPDK_BDEV_IO_TYPE_READ: 2393 rc = bdev_readv_blocks_with_md(bdev_io->internal.desc, 2394 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2395 iov, iovcnt, md_buf, current_offset, 2396 num_blocks, 2397 bdev_io_split_done, bdev_io, 2398 bdev_io->internal.ext_opts, true); 2399 break; 2400 case SPDK_BDEV_IO_TYPE_WRITE: 2401 rc = bdev_writev_blocks_with_md(bdev_io->internal.desc, 2402 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2403 iov, iovcnt, md_buf, current_offset, 
2404 num_blocks, 2405 bdev_io_split_done, bdev_io, 2406 bdev_io->internal.ext_opts, true); 2407 break; 2408 case SPDK_BDEV_IO_TYPE_UNMAP: 2409 io_wait_fn = _bdev_unmap_split; 2410 rc = spdk_bdev_unmap_blocks(bdev_io->internal.desc, 2411 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2412 current_offset, num_blocks, 2413 bdev_io_split_done, bdev_io); 2414 break; 2415 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2416 io_wait_fn = _bdev_write_zeroes_split; 2417 rc = spdk_bdev_write_zeroes_blocks(bdev_io->internal.desc, 2418 spdk_io_channel_from_ctx(bdev_io->internal.ch), 2419 current_offset, num_blocks, 2420 bdev_io_split_done, bdev_io); 2421 break; 2422 default: 2423 assert(false); 2424 rc = -EINVAL; 2425 break; 2426 } 2427 2428 if (rc == 0) { 2429 current_offset += num_blocks; 2430 current_remaining -= num_blocks; 2431 bdev_io->u.bdev.split_current_offset_blocks = current_offset; 2432 bdev_io->u.bdev.split_remaining_num_blocks = current_remaining; 2433 *offset = current_offset; 2434 *remaining = current_remaining; 2435 } else { 2436 bdev_io->u.bdev.split_outstanding--; 2437 if (rc == -ENOMEM) { 2438 if (bdev_io->u.bdev.split_outstanding == 0) { 2439 /* No I/O is outstanding. Hence we should wait here. */ 2440 bdev_queue_io_wait_with_cb(bdev_io, io_wait_fn); 2441 } 2442 } else { 2443 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 2444 if (bdev_io->u.bdev.split_outstanding == 0) { 2445 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx); 2446 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link); 2447 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 2448 } 2449 } 2450 } 2451 2452 return rc; 2453 } 2454 2455 static void 2456 _bdev_rw_split(void *_bdev_io) 2457 { 2458 struct iovec *parent_iov, *iov; 2459 struct spdk_bdev_io *bdev_io = _bdev_io; 2460 struct spdk_bdev *bdev = bdev_io->bdev; 2461 uint64_t parent_offset, current_offset, remaining; 2462 uint32_t parent_iov_offset, parent_iovcnt, parent_iovpos, child_iovcnt; 2463 uint32_t to_next_boundary, to_next_boundary_bytes, to_last_block_bytes; 2464 uint32_t iovcnt, iov_len, child_iovsize; 2465 uint32_t blocklen = bdev->blocklen; 2466 uint32_t io_boundary = bdev->optimal_io_boundary; 2467 uint32_t max_segment_size = bdev->max_segment_size; 2468 uint32_t max_child_iovcnt = bdev->max_num_segments; 2469 void *md_buf = NULL; 2470 int rc; 2471 2472 max_segment_size = max_segment_size ? max_segment_size : UINT32_MAX; 2473 max_child_iovcnt = max_child_iovcnt ? spdk_min(max_child_iovcnt, BDEV_IO_NUM_CHILD_IOV) : 2474 BDEV_IO_NUM_CHILD_IOV; 2475 io_boundary = bdev->split_on_optimal_io_boundary ? 
io_boundary : UINT32_MAX; 2476 2477 remaining = bdev_io->u.bdev.split_remaining_num_blocks; 2478 current_offset = bdev_io->u.bdev.split_current_offset_blocks; 2479 parent_offset = bdev_io->u.bdev.offset_blocks; 2480 parent_iov_offset = (current_offset - parent_offset) * blocklen; 2481 parent_iovcnt = bdev_io->u.bdev.iovcnt; 2482 2483 for (parent_iovpos = 0; parent_iovpos < parent_iovcnt; parent_iovpos++) { 2484 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2485 if (parent_iov_offset < parent_iov->iov_len) { 2486 break; 2487 } 2488 parent_iov_offset -= parent_iov->iov_len; 2489 } 2490 2491 child_iovcnt = 0; 2492 while (remaining > 0 && parent_iovpos < parent_iovcnt && child_iovcnt < BDEV_IO_NUM_CHILD_IOV) { 2493 to_next_boundary = _to_next_boundary(current_offset, io_boundary); 2494 to_next_boundary = spdk_min(remaining, to_next_boundary); 2495 to_next_boundary_bytes = to_next_boundary * blocklen; 2496 2497 iov = &bdev_io->child_iov[child_iovcnt]; 2498 iovcnt = 0; 2499 2500 if (bdev_io->u.bdev.md_buf) { 2501 md_buf = (char *)bdev_io->u.bdev.md_buf + 2502 (current_offset - parent_offset) * spdk_bdev_get_md_size(bdev); 2503 } 2504 2505 child_iovsize = spdk_min(BDEV_IO_NUM_CHILD_IOV - child_iovcnt, max_child_iovcnt); 2506 while (to_next_boundary_bytes > 0 && parent_iovpos < parent_iovcnt && 2507 iovcnt < child_iovsize) { 2508 parent_iov = &bdev_io->u.bdev.iovs[parent_iovpos]; 2509 iov_len = parent_iov->iov_len - parent_iov_offset; 2510 2511 iov_len = spdk_min(iov_len, max_segment_size); 2512 iov_len = spdk_min(iov_len, to_next_boundary_bytes); 2513 to_next_boundary_bytes -= iov_len; 2514 2515 bdev_io->child_iov[child_iovcnt].iov_base = parent_iov->iov_base + parent_iov_offset; 2516 bdev_io->child_iov[child_iovcnt].iov_len = iov_len; 2517 2518 if (iov_len < parent_iov->iov_len - parent_iov_offset) { 2519 parent_iov_offset += iov_len; 2520 } else { 2521 parent_iovpos++; 2522 parent_iov_offset = 0; 2523 } 2524 child_iovcnt++; 2525 iovcnt++; 2526 } 2527 2528 if (to_next_boundary_bytes > 0) { 2529 /* We had to stop this child I/O early because we ran out of 2530 * child_iov space or were limited by max_num_segments. 2531 * Ensure the iovs to be aligned with block size and 2532 * then adjust to_next_boundary before starting the 2533 * child I/O. 2534 */ 2535 assert(child_iovcnt == BDEV_IO_NUM_CHILD_IOV || 2536 iovcnt == child_iovsize); 2537 to_last_block_bytes = to_next_boundary_bytes % blocklen; 2538 if (to_last_block_bytes != 0) { 2539 uint32_t child_iovpos = child_iovcnt - 1; 2540 /* don't decrease child_iovcnt when it equals to BDEV_IO_NUM_CHILD_IOV 2541 * so the loop will naturally end 2542 */ 2543 2544 to_last_block_bytes = blocklen - to_last_block_bytes; 2545 to_next_boundary_bytes += to_last_block_bytes; 2546 while (to_last_block_bytes > 0 && iovcnt > 0) { 2547 iov_len = spdk_min(to_last_block_bytes, 2548 bdev_io->child_iov[child_iovpos].iov_len); 2549 bdev_io->child_iov[child_iovpos].iov_len -= iov_len; 2550 if (bdev_io->child_iov[child_iovpos].iov_len == 0) { 2551 child_iovpos--; 2552 if (--iovcnt == 0) { 2553 /* If the child IO is less than a block size just return. 2554 * If the first child IO of any split round is less than 2555 * a block size, an error exit. 
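 *
 * Worked example of this alignment fix-up (illustrative numbers only): with
 * blocklen = 512 and a boundary span of to_next_boundary = 24 blocks
 * (12288 bytes), suppose the child iov slots are exhausted after packing
 * 10000 bytes, leaving to_next_boundary_bytes = 2288. Then
 * to_last_block_bytes = 2288 % 512 = 240, so 512 - 240 = 272 bytes are
 * trimmed from the tail of the packed iovs (leaving 9728 bytes, a whole
 * number of blocks) and to_next_boundary is reduced by
 * (2288 + 272) / 512 = 5 blocks, giving a 19-block child I/O.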
2556 */
2557 if (bdev_io->u.bdev.split_outstanding == 0) {
2558 SPDK_ERRLOG("The first child io was less than a block size\n");
2559 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
2560 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, bdev_io->internal.caller_ctx);
2561 TAILQ_REMOVE(&bdev_io->internal.ch->io_submitted, bdev_io, internal.ch_link);
2562 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx);
2563 }
2564
2565 return;
2566 }
2567 }
2568
2569 to_last_block_bytes -= iov_len;
2570
2571 if (parent_iov_offset == 0) {
2572 parent_iovpos--;
2573 parent_iov_offset = bdev_io->u.bdev.iovs[parent_iovpos].iov_len;
2574 }
2575 parent_iov_offset -= iov_len;
2576 }
2577
2578 assert(to_last_block_bytes == 0);
2579 }
2580 to_next_boundary -= to_next_boundary_bytes / blocklen;
2581 }
2582
2583 rc = bdev_io_split_submit(bdev_io, iov, iovcnt, md_buf, to_next_boundary,
2584 &current_offset, &remaining);
2585 if (spdk_unlikely(rc)) {
2586 return;
2587 }
2588 }
2589 }
2590
2591 static void
2592 bdev_unmap_split(struct spdk_bdev_io *bdev_io)
2593 {
2594 uint64_t offset, unmap_blocks, remaining, max_unmap_blocks;
2595 uint32_t num_children_reqs = 0;
2596 int rc;
2597
2598 offset = bdev_io->u.bdev.split_current_offset_blocks;
2599 remaining = bdev_io->u.bdev.split_remaining_num_blocks;
2600 max_unmap_blocks = bdev_io->bdev->max_unmap * bdev_io->bdev->max_unmap_segments;
2601
2602 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
2603 unmap_blocks = spdk_min(remaining, max_unmap_blocks);
2604
2605 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, unmap_blocks,
2606 &offset, &remaining);
2607 if (spdk_likely(rc == 0)) {
2608 num_children_reqs++;
2609 } else {
2610 return;
2611 }
2612 }
2613 }
2614
2615 static void
2616 bdev_write_zeroes_split(struct spdk_bdev_io *bdev_io)
2617 {
2618 uint64_t offset, write_zeroes_blocks, remaining;
2619 uint32_t num_children_reqs = 0;
2620 int rc;
2621
2622 offset = bdev_io->u.bdev.split_current_offset_blocks;
2623 remaining = bdev_io->u.bdev.split_remaining_num_blocks;
2624
2625 while (remaining && (num_children_reqs < SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS)) {
2626 write_zeroes_blocks = spdk_min(remaining, bdev_io->bdev->max_write_zeroes);
2627
2628 rc = bdev_io_split_submit(bdev_io, NULL, 0, NULL, write_zeroes_blocks,
2629 &offset, &remaining);
2630 if (spdk_likely(rc == 0)) {
2631 num_children_reqs++;
2632 } else {
2633 return;
2634 }
2635 }
2636 }
2637
2638 static void
2639 parent_bdev_io_complete(void *ctx, int rc)
2640 {
2641 struct spdk_bdev_io *parent_io = ctx;
2642
2643 if (rc) {
2644 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
2645 }
2646
2647 parent_io->internal.cb(parent_io, parent_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS,
2648 parent_io->internal.caller_ctx);
2649 }
2650
2651 static void
2652 bdev_io_split_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg)
2653 {
2654 struct spdk_bdev_io *parent_io = cb_arg;
2655
2656 spdk_bdev_free_io(bdev_io);
2657
2658 if (!success) {
2659 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
2660 /* If any child I/O failed, stop further splitting process. */
2661 parent_io->u.bdev.split_current_offset_blocks += parent_io->u.bdev.split_remaining_num_blocks;
2662 parent_io->u.bdev.split_remaining_num_blocks = 0;
2663 }
2664 parent_io->u.bdev.split_outstanding--;
2665 if (parent_io->u.bdev.split_outstanding != 0) {
2666 return;
2667 }
2668
2669 /*
2670 * Parent I/O finishes when all blocks are consumed.
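 *
 * Worked example (illustrative numbers only): an unmap of 2,000,000 blocks on
 * a bdev with max_unmap = 65536 blocks and max_unmap_segments = 2 is split by
 * bdev_unmap_split() above into children of at most 131072 blocks, at most
 * SPDK_BDEV_MAX_CHILDREN_UNMAP_WRITE_ZEROES_REQS (8) of them per round. The
 * first round therefore covers 8 * 131072 = 1,048,576 blocks; when its last
 * child completes here with split_remaining_num_blocks still 951,424, the
 * switch below starts the next round, and the parent only completes once the
 * remaining count reaches zero and no children are outstanding.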
2671 */ 2672 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 2673 assert(parent_io->internal.cb != bdev_io_split_done); 2674 spdk_trace_record(TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)parent_io, bdev_io->internal.caller_ctx); 2675 TAILQ_REMOVE(&parent_io->internal.ch->io_submitted, parent_io, internal.ch_link); 2676 2677 if (parent_io->internal.orig_iovcnt != 0) { 2678 _bdev_io_push_bounce_data_buffer(parent_io, parent_bdev_io_complete); 2679 /* bdev IO will be completed in the callback */ 2680 } else { 2681 parent_bdev_io_complete(parent_io, 0); 2682 } 2683 return; 2684 } 2685 2686 /* 2687 * Continue with the splitting process. This function will complete the parent I/O if the 2688 * splitting is done. 2689 */ 2690 switch (parent_io->type) { 2691 case SPDK_BDEV_IO_TYPE_READ: 2692 case SPDK_BDEV_IO_TYPE_WRITE: 2693 _bdev_rw_split(parent_io); 2694 break; 2695 case SPDK_BDEV_IO_TYPE_UNMAP: 2696 bdev_unmap_split(parent_io); 2697 break; 2698 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2699 bdev_write_zeroes_split(parent_io); 2700 break; 2701 default: 2702 assert(false); 2703 break; 2704 } 2705 } 2706 2707 static void 2708 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success); 2709 2710 static void 2711 bdev_io_split(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io) 2712 { 2713 bdev_io->u.bdev.split_current_offset_blocks = bdev_io->u.bdev.offset_blocks; 2714 bdev_io->u.bdev.split_remaining_num_blocks = bdev_io->u.bdev.num_blocks; 2715 bdev_io->u.bdev.split_outstanding = 0; 2716 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 2717 2718 switch (bdev_io->type) { 2719 case SPDK_BDEV_IO_TYPE_READ: 2720 case SPDK_BDEV_IO_TYPE_WRITE: 2721 if (_is_buf_allocated(bdev_io->u.bdev.iovs)) { 2722 _bdev_rw_split(bdev_io); 2723 } else { 2724 assert(bdev_io->type == SPDK_BDEV_IO_TYPE_READ); 2725 spdk_bdev_io_get_buf(bdev_io, bdev_rw_split_get_buf_cb, 2726 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 2727 } 2728 break; 2729 case SPDK_BDEV_IO_TYPE_UNMAP: 2730 bdev_unmap_split(bdev_io); 2731 break; 2732 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2733 bdev_write_zeroes_split(bdev_io); 2734 break; 2735 default: 2736 assert(false); 2737 break; 2738 } 2739 } 2740 2741 static void 2742 bdev_rw_split_get_buf_cb(struct spdk_io_channel *ch, struct spdk_bdev_io *bdev_io, bool success) 2743 { 2744 if (!success) { 2745 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2746 return; 2747 } 2748 2749 _bdev_rw_split(bdev_io); 2750 } 2751 2752 /* Explicitly mark this inline, since it's used as a function pointer and otherwise won't 2753 * be inlined, at least on some compilers. 
2754 */ 2755 static inline void 2756 _bdev_io_submit(void *ctx) 2757 { 2758 struct spdk_bdev_io *bdev_io = ctx; 2759 struct spdk_bdev *bdev = bdev_io->bdev; 2760 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2761 uint64_t tsc; 2762 2763 tsc = spdk_get_ticks(); 2764 bdev_io->internal.submit_tsc = tsc; 2765 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_START, 0, 0, (uintptr_t)bdev_io, 2766 (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 2767 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks); 2768 2769 if (spdk_likely(bdev_ch->flags == 0)) { 2770 bdev_io_do_submit(bdev_ch, bdev_io); 2771 return; 2772 } 2773 2774 if (bdev_ch->flags & BDEV_CH_RESET_IN_PROGRESS) { 2775 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 2776 } else if (bdev_ch->flags & BDEV_CH_QOS_ENABLED) { 2777 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT) && 2778 bdev_abort_queued_io(&bdev->internal.qos->queued, bdev_io->u.abort.bio_to_abort)) { 2779 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_SUCCESS); 2780 } else { 2781 TAILQ_INSERT_TAIL(&bdev->internal.qos->queued, bdev_io, internal.link); 2782 bdev_qos_io_submit(bdev_ch, bdev->internal.qos); 2783 } 2784 } else { 2785 SPDK_ERRLOG("unknown bdev_ch flag %x found\n", bdev_ch->flags); 2786 _bdev_io_complete_in_submit(bdev_ch, bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 2787 } 2788 } 2789 2790 bool 2791 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2); 2792 2793 bool 2794 bdev_lba_range_overlapped(struct lba_range *range1, struct lba_range *range2) 2795 { 2796 if (range1->length == 0 || range2->length == 0) { 2797 return false; 2798 } 2799 2800 if (range1->offset + range1->length <= range2->offset) { 2801 return false; 2802 } 2803 2804 if (range2->offset + range2->length <= range1->offset) { 2805 return false; 2806 } 2807 2808 return true; 2809 } 2810 2811 static bool 2812 bdev_io_range_is_locked(struct spdk_bdev_io *bdev_io, struct lba_range *range) 2813 { 2814 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2815 struct lba_range r; 2816 2817 switch (bdev_io->type) { 2818 case SPDK_BDEV_IO_TYPE_NVME_IO: 2819 case SPDK_BDEV_IO_TYPE_NVME_IO_MD: 2820 /* Don't try to decode the NVMe command - just assume worst-case and that 2821 * it overlaps a locked range. 2822 */ 2823 return true; 2824 case SPDK_BDEV_IO_TYPE_WRITE: 2825 case SPDK_BDEV_IO_TYPE_UNMAP: 2826 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2827 case SPDK_BDEV_IO_TYPE_ZCOPY: 2828 r.offset = bdev_io->u.bdev.offset_blocks; 2829 r.length = bdev_io->u.bdev.num_blocks; 2830 if (!bdev_lba_range_overlapped(range, &r)) { 2831 /* This I/O doesn't overlap the specified LBA range. */ 2832 return false; 2833 } else if (range->owner_ch == ch && range->locked_ctx == bdev_io->internal.caller_ctx) { 2834 /* This I/O overlaps, but the I/O is on the same channel that locked this 2835 * range, and the caller_ctx is the same as the locked_ctx. This means 2836 * that this I/O is associated with the lock, and is allowed to execute. 
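 *
 * Worked example (illustrative numbers only): with a locked range of
 * offset = 0 and length = 100 (blocks 0..99), a write at offset_blocks = 100
 * does not overlap, because range->offset + range->length is not greater than
 * the I/O offset, and is submitted normally. A write at offset_blocks = 96
 * with num_blocks = 16 does overlap; bdev_io_submit() below queues it on the
 * channel's io_locked list unless it was issued on the owning channel with
 * the lock's locked_ctx as its caller_ctx.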
2837 */ 2838 return false; 2839 } else { 2840 return true; 2841 } 2842 default: 2843 return false; 2844 } 2845 } 2846 2847 void 2848 bdev_io_submit(struct spdk_bdev_io *bdev_io) 2849 { 2850 struct spdk_bdev *bdev = bdev_io->bdev; 2851 struct spdk_thread *thread = spdk_bdev_io_get_thread(bdev_io); 2852 struct spdk_bdev_channel *ch = bdev_io->internal.ch; 2853 2854 assert(thread != NULL); 2855 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2856 2857 if (!TAILQ_EMPTY(&ch->locked_ranges)) { 2858 struct lba_range *range; 2859 2860 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 2861 if (bdev_io_range_is_locked(bdev_io, range)) { 2862 TAILQ_INSERT_TAIL(&ch->io_locked, bdev_io, internal.ch_link); 2863 return; 2864 } 2865 } 2866 } 2867 2868 TAILQ_INSERT_TAIL(&ch->io_submitted, bdev_io, internal.ch_link); 2869 2870 if (bdev_io_should_split(bdev_io)) { 2871 bdev_io->internal.submit_tsc = spdk_get_ticks(); 2872 spdk_trace_record_tsc(bdev_io->internal.submit_tsc, TRACE_BDEV_IO_START, 0, 0, 2873 (uintptr_t)bdev_io, (uint64_t)bdev_io->type, bdev_io->internal.caller_ctx, 2874 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks); 2875 bdev_io_split(NULL, bdev_io); 2876 return; 2877 } 2878 2879 if (ch->flags & BDEV_CH_QOS_ENABLED) { 2880 if ((thread == bdev->internal.qos->thread) || !bdev->internal.qos->thread) { 2881 _bdev_io_submit(bdev_io); 2882 } else { 2883 bdev_io->internal.io_submit_ch = ch; 2884 bdev_io->internal.ch = bdev->internal.qos->ch; 2885 spdk_thread_send_msg(bdev->internal.qos->thread, _bdev_io_submit, bdev_io); 2886 } 2887 } else { 2888 _bdev_io_submit(bdev_io); 2889 } 2890 } 2891 2892 static inline void 2893 _bdev_io_copy_ext_opts(struct spdk_bdev_io *bdev_io, struct spdk_bdev_ext_io_opts *opts) 2894 { 2895 struct spdk_bdev_ext_io_opts *opts_copy = &bdev_io->internal.ext_opts_copy; 2896 2897 /* Zero part we don't copy */ 2898 memset(((char *)opts_copy) + opts->size, 0, sizeof(*opts) - opts->size); 2899 memcpy(opts_copy, opts, opts->size); 2900 opts_copy->size = sizeof(*opts_copy); 2901 opts_copy->metadata = bdev_io->u.bdev.md_buf; 2902 /* Save pointer to the copied ext_opts which will be used by bdev modules */ 2903 bdev_io->u.bdev.ext_opts = opts_copy; 2904 } 2905 2906 static inline void 2907 _bdev_io_ext_use_bounce_buffer(struct spdk_bdev_io *bdev_io) 2908 { 2909 /* bdev doesn't support memory domains, thereby buffers in this IO request can't 2910 * be accessed directly. It is needed to allocate buffers before issuing IO operation. 2911 * For write operation we need to pull buffers from memory domain before submitting IO. 
2912 * Once read operation completes, we need to use memory_domain push functionality to 2913 * update data in original memory domain IO buffer 2914 * This IO request will go through a regular IO flow, so clear memory domains pointers in 2915 * the copied ext_opts */ 2916 bdev_io->internal.ext_opts_copy.memory_domain = NULL; 2917 bdev_io->internal.ext_opts_copy.memory_domain_ctx = NULL; 2918 _bdev_memory_domain_io_get_buf(bdev_io, _bdev_memory_domain_get_io_cb, 2919 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen); 2920 } 2921 2922 static inline void 2923 _bdev_io_submit_ext(struct spdk_bdev_desc *desc, struct spdk_bdev_io *bdev_io, 2924 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 2925 { 2926 if (opts) { 2927 bool use_pull_push = opts->memory_domain && !desc->memory_domains_supported; 2928 assert(opts->size <= sizeof(*opts)); 2929 /* 2930 * copy if size is smaller than opts struct to avoid having to check size 2931 * on every access to bdev_io->u.bdev.ext_opts 2932 */ 2933 if (copy_opts || use_pull_push || opts->size < sizeof(*opts)) { 2934 _bdev_io_copy_ext_opts(bdev_io, opts); 2935 if (use_pull_push) { 2936 _bdev_io_ext_use_bounce_buffer(bdev_io); 2937 return; 2938 } 2939 } 2940 } 2941 bdev_io_submit(bdev_io); 2942 } 2943 2944 static void 2945 bdev_io_submit_reset(struct spdk_bdev_io *bdev_io) 2946 { 2947 struct spdk_bdev *bdev = bdev_io->bdev; 2948 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 2949 struct spdk_io_channel *ch = bdev_ch->channel; 2950 2951 assert(bdev_io->internal.status == SPDK_BDEV_IO_STATUS_PENDING); 2952 2953 bdev_io->internal.in_submit_request = true; 2954 bdev->fn_table->submit_request(ch, bdev_io); 2955 bdev_io->internal.in_submit_request = false; 2956 } 2957 2958 void 2959 bdev_io_init(struct spdk_bdev_io *bdev_io, 2960 struct spdk_bdev *bdev, void *cb_arg, 2961 spdk_bdev_io_completion_cb cb) 2962 { 2963 bdev_io->bdev = bdev; 2964 bdev_io->internal.caller_ctx = cb_arg; 2965 bdev_io->internal.cb = cb; 2966 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 2967 bdev_io->internal.in_submit_request = false; 2968 bdev_io->internal.buf = NULL; 2969 bdev_io->internal.io_submit_ch = NULL; 2970 bdev_io->internal.orig_iovs = NULL; 2971 bdev_io->internal.orig_iovcnt = 0; 2972 bdev_io->internal.orig_md_iov.iov_base = NULL; 2973 bdev_io->internal.error.nvme.cdw0 = 0; 2974 bdev_io->num_retries = 0; 2975 bdev_io->internal.get_buf_cb = NULL; 2976 bdev_io->internal.get_aux_buf_cb = NULL; 2977 bdev_io->internal.ext_opts = NULL; 2978 bdev_io->internal.data_transfer_cpl = NULL; 2979 } 2980 2981 static bool 2982 bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 2983 { 2984 return bdev->fn_table->io_type_supported(bdev->ctxt, io_type); 2985 } 2986 2987 bool 2988 spdk_bdev_io_type_supported(struct spdk_bdev *bdev, enum spdk_bdev_io_type io_type) 2989 { 2990 bool supported; 2991 2992 supported = bdev_io_type_supported(bdev, io_type); 2993 2994 if (!supported) { 2995 switch (io_type) { 2996 case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: 2997 /* The bdev layer will emulate write zeroes as long as write is supported. 
*/ 2998 supported = bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE); 2999 break; 3000 default: 3001 break; 3002 } 3003 } 3004 3005 return supported; 3006 } 3007 3008 int 3009 spdk_bdev_dump_info_json(struct spdk_bdev *bdev, struct spdk_json_write_ctx *w) 3010 { 3011 if (bdev->fn_table->dump_info_json) { 3012 return bdev->fn_table->dump_info_json(bdev->ctxt, w); 3013 } 3014 3015 return 0; 3016 } 3017 3018 static void 3019 bdev_qos_update_max_quota_per_timeslice(struct spdk_bdev_qos *qos) 3020 { 3021 uint32_t max_per_timeslice = 0; 3022 int i; 3023 3024 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3025 if (qos->rate_limits[i].limit == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3026 qos->rate_limits[i].max_per_timeslice = 0; 3027 continue; 3028 } 3029 3030 max_per_timeslice = qos->rate_limits[i].limit * 3031 SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC; 3032 3033 qos->rate_limits[i].max_per_timeslice = spdk_max(max_per_timeslice, 3034 qos->rate_limits[i].min_per_timeslice); 3035 3036 qos->rate_limits[i].remaining_this_timeslice = qos->rate_limits[i].max_per_timeslice; 3037 } 3038 3039 bdev_qos_set_ops(qos); 3040 } 3041 3042 static int 3043 bdev_channel_poll_qos(void *arg) 3044 { 3045 struct spdk_bdev_qos *qos = arg; 3046 uint64_t now = spdk_get_ticks(); 3047 int i; 3048 3049 if (now < (qos->last_timeslice + qos->timeslice_size)) { 3050 /* We received our callback earlier than expected - return 3051 * immediately and wait to do accounting until at least one 3052 * timeslice has actually expired. This should never happen 3053 * with a well-behaved timer implementation. 3054 */ 3055 return SPDK_POLLER_IDLE; 3056 } 3057 3058 /* Reset for next round of rate limiting */ 3059 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3060 /* We may have allowed the IOs or bytes to slightly overrun in the last 3061 * timeslice. remaining_this_timeslice is signed, so if it's negative 3062 * here, we'll account for the overrun so that the next timeslice will 3063 * be appropriately reduced. 3064 */ 3065 if (qos->rate_limits[i].remaining_this_timeslice > 0) { 3066 qos->rate_limits[i].remaining_this_timeslice = 0; 3067 } 3068 } 3069 3070 while (now >= (qos->last_timeslice + qos->timeslice_size)) { 3071 qos->last_timeslice += qos->timeslice_size; 3072 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3073 qos->rate_limits[i].remaining_this_timeslice += 3074 qos->rate_limits[i].max_per_timeslice; 3075 } 3076 } 3077 3078 return bdev_qos_io_submit(qos->ch, qos); 3079 } 3080 3081 static void 3082 bdev_channel_destroy_resource(struct spdk_bdev_channel *ch) 3083 { 3084 struct spdk_bdev_shared_resource *shared_resource; 3085 struct lba_range *range; 3086 3087 while (!TAILQ_EMPTY(&ch->locked_ranges)) { 3088 range = TAILQ_FIRST(&ch->locked_ranges); 3089 TAILQ_REMOVE(&ch->locked_ranges, range, tailq); 3090 free(range); 3091 } 3092 3093 spdk_put_io_channel(ch->channel); 3094 3095 shared_resource = ch->shared_resource; 3096 3097 assert(TAILQ_EMPTY(&ch->io_locked)); 3098 assert(TAILQ_EMPTY(&ch->io_submitted)); 3099 assert(ch->io_outstanding == 0); 3100 assert(shared_resource->ref > 0); 3101 shared_resource->ref--; 3102 if (shared_resource->ref == 0) { 3103 assert(shared_resource->io_outstanding == 0); 3104 TAILQ_REMOVE(&shared_resource->mgmt_ch->shared_resources, shared_resource, link); 3105 spdk_put_io_channel(spdk_io_channel_from_ctx(shared_resource->mgmt_ch)); 3106 free(shared_resource); 3107 } 3108 } 3109 3110 /* Caller must hold bdev->internal.mutex. 
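 *
 * Worked example for bdev_qos_update_max_quota_per_timeslice() above
 * (illustrative numbers only): an rw_ios_per_sec limit of 10000 gives
 * 10000 * SPDK_BDEV_QOS_TIMESLICE_IN_USEC / SPDK_SEC_TO_USEC =
 * 10000 * 1000 / 1000000 = 10 I/Os per 1000 us timeslice, and a 10 MB/s byte
 * limit (10485760 bytes/s) gives 10485 bytes per timeslice, both well above
 * the per-type minimums applied via spdk_max() there.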
*/ 3111 static void 3112 bdev_enable_qos(struct spdk_bdev *bdev, struct spdk_bdev_channel *ch) 3113 { 3114 struct spdk_bdev_qos *qos = bdev->internal.qos; 3115 int i; 3116 3117 /* Rate limiting on this bdev enabled */ 3118 if (qos) { 3119 if (qos->ch == NULL) { 3120 struct spdk_io_channel *io_ch; 3121 3122 SPDK_DEBUGLOG(bdev, "Selecting channel %p as QoS channel for bdev %s on thread %p\n", ch, 3123 bdev->name, spdk_get_thread()); 3124 3125 /* No qos channel has been selected, so set one up */ 3126 3127 /* Take another reference to ch */ 3128 io_ch = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 3129 assert(io_ch != NULL); 3130 qos->ch = ch; 3131 3132 qos->thread = spdk_io_channel_get_thread(io_ch); 3133 3134 TAILQ_INIT(&qos->queued); 3135 3136 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3137 if (bdev_qos_is_iops_rate_limit(i) == true) { 3138 qos->rate_limits[i].min_per_timeslice = 3139 SPDK_BDEV_QOS_MIN_IO_PER_TIMESLICE; 3140 } else { 3141 qos->rate_limits[i].min_per_timeslice = 3142 SPDK_BDEV_QOS_MIN_BYTE_PER_TIMESLICE; 3143 } 3144 3145 if (qos->rate_limits[i].limit == 0) { 3146 qos->rate_limits[i].limit = SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 3147 } 3148 } 3149 bdev_qos_update_max_quota_per_timeslice(qos); 3150 qos->timeslice_size = 3151 SPDK_BDEV_QOS_TIMESLICE_IN_USEC * spdk_get_ticks_hz() / SPDK_SEC_TO_USEC; 3152 qos->last_timeslice = spdk_get_ticks(); 3153 qos->poller = SPDK_POLLER_REGISTER(bdev_channel_poll_qos, 3154 qos, 3155 SPDK_BDEV_QOS_TIMESLICE_IN_USEC); 3156 } 3157 3158 ch->flags |= BDEV_CH_QOS_ENABLED; 3159 } 3160 } 3161 3162 struct poll_timeout_ctx { 3163 struct spdk_bdev_desc *desc; 3164 uint64_t timeout_in_sec; 3165 spdk_bdev_io_timeout_cb cb_fn; 3166 void *cb_arg; 3167 }; 3168 3169 static void 3170 bdev_desc_free(struct spdk_bdev_desc *desc) 3171 { 3172 pthread_mutex_destroy(&desc->mutex); 3173 free(desc->media_events_buffer); 3174 free(desc); 3175 } 3176 3177 static void 3178 bdev_channel_poll_timeout_io_done(struct spdk_io_channel_iter *i, int status) 3179 { 3180 struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3181 struct spdk_bdev_desc *desc = ctx->desc; 3182 3183 free(ctx); 3184 3185 pthread_mutex_lock(&desc->mutex); 3186 desc->refs--; 3187 if (desc->closed == true && desc->refs == 0) { 3188 pthread_mutex_unlock(&desc->mutex); 3189 bdev_desc_free(desc); 3190 return; 3191 } 3192 pthread_mutex_unlock(&desc->mutex); 3193 } 3194 3195 static void 3196 bdev_channel_poll_timeout_io(struct spdk_io_channel_iter *i) 3197 { 3198 struct poll_timeout_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 3199 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 3200 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch); 3201 struct spdk_bdev_desc *desc = ctx->desc; 3202 struct spdk_bdev_io *bdev_io; 3203 uint64_t now; 3204 3205 pthread_mutex_lock(&desc->mutex); 3206 if (desc->closed == true) { 3207 pthread_mutex_unlock(&desc->mutex); 3208 spdk_for_each_channel_continue(i, -1); 3209 return; 3210 } 3211 pthread_mutex_unlock(&desc->mutex); 3212 3213 now = spdk_get_ticks(); 3214 TAILQ_FOREACH(bdev_io, &bdev_ch->io_submitted, internal.ch_link) { 3215 /* Exclude any I/O that are generated via splitting. */ 3216 if (bdev_io->internal.cb == bdev_io_split_done) { 3217 continue; 3218 } 3219 3220 /* Once we find an I/O that has not timed out, we can immediately 3221 * exit the loop. 
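 *
 * Illustrative usage sketch (hypothetical caller, not part of this file): the
 * poller that drives this iteration is armed with spdk_bdev_set_timeout(),
 * which must be called on the thread that opened the descriptor:
 *
 *     static void
 *     io_timeout_cb(void *cb_arg, struct spdk_bdev_io *bdev_io)
 *     {
 *             // e.g. log, bump a counter, or abort the offending I/O
 *             SPDK_NOTICELOG("bdev I/O exceeded the configured timeout\n");
 *     }
 *
 *     ...
 *     rc = spdk_bdev_set_timeout(desc, 30, io_timeout_cb, NULL);
 *
 * Passing a timeout of 0 unregisters the poller again; while an I/O stays
 * outstanding past the timeout, the callback may fire for it on each poll
 * interval.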
3222 */ 3223 if (now < (bdev_io->internal.submit_tsc + 3224 ctx->timeout_in_sec * spdk_get_ticks_hz())) { 3225 goto end; 3226 } 3227 3228 if (bdev_io->internal.desc == desc) { 3229 ctx->cb_fn(ctx->cb_arg, bdev_io); 3230 } 3231 } 3232 3233 end: 3234 spdk_for_each_channel_continue(i, 0); 3235 } 3236 3237 static int 3238 bdev_poll_timeout_io(void *arg) 3239 { 3240 struct spdk_bdev_desc *desc = arg; 3241 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3242 struct poll_timeout_ctx *ctx; 3243 3244 ctx = calloc(1, sizeof(struct poll_timeout_ctx)); 3245 if (!ctx) { 3246 SPDK_ERRLOG("failed to allocate memory\n"); 3247 return SPDK_POLLER_BUSY; 3248 } 3249 ctx->desc = desc; 3250 ctx->cb_arg = desc->cb_arg; 3251 ctx->cb_fn = desc->cb_fn; 3252 ctx->timeout_in_sec = desc->timeout_in_sec; 3253 3254 /* Take a ref on the descriptor in case it gets closed while we are checking 3255 * all of the channels. 3256 */ 3257 pthread_mutex_lock(&desc->mutex); 3258 desc->refs++; 3259 pthread_mutex_unlock(&desc->mutex); 3260 3261 spdk_for_each_channel(__bdev_to_io_dev(bdev), 3262 bdev_channel_poll_timeout_io, 3263 ctx, 3264 bdev_channel_poll_timeout_io_done); 3265 3266 return SPDK_POLLER_BUSY; 3267 } 3268 3269 int 3270 spdk_bdev_set_timeout(struct spdk_bdev_desc *desc, uint64_t timeout_in_sec, 3271 spdk_bdev_io_timeout_cb cb_fn, void *cb_arg) 3272 { 3273 assert(desc->thread == spdk_get_thread()); 3274 3275 spdk_poller_unregister(&desc->io_timeout_poller); 3276 3277 if (timeout_in_sec) { 3278 assert(cb_fn != NULL); 3279 desc->io_timeout_poller = SPDK_POLLER_REGISTER(bdev_poll_timeout_io, 3280 desc, 3281 SPDK_BDEV_IO_POLL_INTERVAL_IN_MSEC * SPDK_SEC_TO_USEC / 3282 1000); 3283 if (desc->io_timeout_poller == NULL) { 3284 SPDK_ERRLOG("can not register the desc timeout IO poller\n"); 3285 return -1; 3286 } 3287 } 3288 3289 desc->cb_fn = cb_fn; 3290 desc->cb_arg = cb_arg; 3291 desc->timeout_in_sec = timeout_in_sec; 3292 3293 return 0; 3294 } 3295 3296 static int 3297 bdev_channel_create(void *io_device, void *ctx_buf) 3298 { 3299 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 3300 struct spdk_bdev_channel *ch = ctx_buf; 3301 struct spdk_io_channel *mgmt_io_ch; 3302 struct spdk_bdev_mgmt_channel *mgmt_ch; 3303 struct spdk_bdev_shared_resource *shared_resource; 3304 struct lba_range *range; 3305 3306 ch->bdev = bdev; 3307 ch->channel = bdev->fn_table->get_io_channel(bdev->ctxt); 3308 if (!ch->channel) { 3309 return -1; 3310 } 3311 3312 spdk_trace_record(TRACE_BDEV_IOCH_CREATE, 0, 0, 0, ch->bdev->name, 3313 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3314 3315 assert(ch->histogram == NULL); 3316 if (bdev->internal.histogram_enabled) { 3317 ch->histogram = spdk_histogram_data_alloc(); 3318 if (ch->histogram == NULL) { 3319 SPDK_ERRLOG("Could not allocate histogram\n"); 3320 } 3321 } 3322 3323 mgmt_io_ch = spdk_get_io_channel(&g_bdev_mgr); 3324 if (!mgmt_io_ch) { 3325 spdk_put_io_channel(ch->channel); 3326 return -1; 3327 } 3328 3329 mgmt_ch = spdk_io_channel_get_ctx(mgmt_io_ch); 3330 TAILQ_FOREACH(shared_resource, &mgmt_ch->shared_resources, link) { 3331 if (shared_resource->shared_ch == ch->channel) { 3332 spdk_put_io_channel(mgmt_io_ch); 3333 shared_resource->ref++; 3334 break; 3335 } 3336 } 3337 3338 if (shared_resource == NULL) { 3339 shared_resource = calloc(1, sizeof(*shared_resource)); 3340 if (shared_resource == NULL) { 3341 spdk_put_io_channel(ch->channel); 3342 spdk_put_io_channel(mgmt_io_ch); 3343 return -1; 3344 } 3345 3346 shared_resource->mgmt_ch = mgmt_ch; 3347 
shared_resource->io_outstanding = 0; 3348 TAILQ_INIT(&shared_resource->nomem_io); 3349 shared_resource->nomem_threshold = 0; 3350 shared_resource->shared_ch = ch->channel; 3351 shared_resource->ref = 1; 3352 TAILQ_INSERT_TAIL(&mgmt_ch->shared_resources, shared_resource, link); 3353 } 3354 3355 memset(&ch->stat, 0, sizeof(ch->stat)); 3356 ch->stat.ticks_rate = spdk_get_ticks_hz(); 3357 ch->io_outstanding = 0; 3358 TAILQ_INIT(&ch->queued_resets); 3359 TAILQ_INIT(&ch->locked_ranges); 3360 ch->flags = 0; 3361 ch->shared_resource = shared_resource; 3362 3363 TAILQ_INIT(&ch->io_submitted); 3364 TAILQ_INIT(&ch->io_locked); 3365 3366 #ifdef SPDK_CONFIG_VTUNE 3367 { 3368 char *name; 3369 __itt_init_ittlib(NULL, 0); 3370 name = spdk_sprintf_alloc("spdk_bdev_%s_%p", ch->bdev->name, ch); 3371 if (!name) { 3372 bdev_channel_destroy_resource(ch); 3373 return -1; 3374 } 3375 ch->handle = __itt_string_handle_create(name); 3376 free(name); 3377 ch->start_tsc = spdk_get_ticks(); 3378 ch->interval_tsc = spdk_get_ticks_hz() / 100; 3379 memset(&ch->prev_stat, 0, sizeof(ch->prev_stat)); 3380 } 3381 #endif 3382 3383 pthread_mutex_lock(&bdev->internal.mutex); 3384 bdev_enable_qos(bdev, ch); 3385 3386 TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) { 3387 struct lba_range *new_range; 3388 3389 new_range = calloc(1, sizeof(*new_range)); 3390 if (new_range == NULL) { 3391 pthread_mutex_unlock(&bdev->internal.mutex); 3392 bdev_channel_destroy_resource(ch); 3393 return -1; 3394 } 3395 new_range->length = range->length; 3396 new_range->offset = range->offset; 3397 new_range->locked_ctx = range->locked_ctx; 3398 TAILQ_INSERT_TAIL(&ch->locked_ranges, new_range, tailq); 3399 } 3400 3401 pthread_mutex_unlock(&bdev->internal.mutex); 3402 3403 return 0; 3404 } 3405 3406 /* 3407 * Abort I/O that are waiting on a data buffer. These types of I/O are 3408 * linked using the spdk_bdev_io internal.buf_link TAILQ_ENTRY. 3409 */ 3410 static void 3411 bdev_abort_all_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_channel *ch) 3412 { 3413 bdev_io_stailq_t tmp; 3414 struct spdk_bdev_io *bdev_io; 3415 3416 STAILQ_INIT(&tmp); 3417 3418 while (!STAILQ_EMPTY(queue)) { 3419 bdev_io = STAILQ_FIRST(queue); 3420 STAILQ_REMOVE_HEAD(queue, internal.buf_link); 3421 if (bdev_io->internal.ch == ch) { 3422 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3423 } else { 3424 STAILQ_INSERT_TAIL(&tmp, bdev_io, internal.buf_link); 3425 } 3426 } 3427 3428 STAILQ_SWAP(&tmp, queue, spdk_bdev_io); 3429 } 3430 3431 /* 3432 * Abort I/O that are queued waiting for submission. These types of I/O are 3433 * linked using the spdk_bdev_io link TAILQ_ENTRY. 3434 */ 3435 static void 3436 bdev_abort_all_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_channel *ch) 3437 { 3438 struct spdk_bdev_io *bdev_io, *tmp; 3439 3440 TAILQ_FOREACH_SAFE(bdev_io, queue, internal.link, tmp) { 3441 if (bdev_io->internal.ch == ch) { 3442 TAILQ_REMOVE(queue, bdev_io, internal.link); 3443 /* 3444 * spdk_bdev_io_complete() assumes that the completed I/O had 3445 * been submitted to the bdev module. Since in this case it 3446 * hadn't, bump io_outstanding to account for the decrement 3447 * that spdk_bdev_io_complete() will do. 
3448 */ 3449 if (bdev_io->type != SPDK_BDEV_IO_TYPE_RESET) { 3450 ch->io_outstanding++; 3451 ch->shared_resource->io_outstanding++; 3452 } 3453 spdk_bdev_io_complete(bdev_io, SPDK_BDEV_IO_STATUS_ABORTED); 3454 } 3455 } 3456 } 3457 3458 static bool 3459 bdev_abort_queued_io(bdev_io_tailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3460 { 3461 struct spdk_bdev_io *bdev_io; 3462 3463 TAILQ_FOREACH(bdev_io, queue, internal.link) { 3464 if (bdev_io == bio_to_abort) { 3465 TAILQ_REMOVE(queue, bio_to_abort, internal.link); 3466 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3467 return true; 3468 } 3469 } 3470 3471 return false; 3472 } 3473 3474 static bool 3475 bdev_abort_buf_io(bdev_io_stailq_t *queue, struct spdk_bdev_io *bio_to_abort) 3476 { 3477 struct spdk_bdev_io *bdev_io; 3478 3479 STAILQ_FOREACH(bdev_io, queue, internal.buf_link) { 3480 if (bdev_io == bio_to_abort) { 3481 STAILQ_REMOVE(queue, bio_to_abort, spdk_bdev_io, internal.buf_link); 3482 spdk_bdev_io_complete(bio_to_abort, SPDK_BDEV_IO_STATUS_ABORTED); 3483 return true; 3484 } 3485 } 3486 3487 return false; 3488 } 3489 3490 static void 3491 bdev_qos_channel_destroy(void *cb_arg) 3492 { 3493 struct spdk_bdev_qos *qos = cb_arg; 3494 3495 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 3496 spdk_poller_unregister(&qos->poller); 3497 3498 SPDK_DEBUGLOG(bdev, "Free QoS %p.\n", qos); 3499 3500 free(qos); 3501 } 3502 3503 static int 3504 bdev_qos_destroy(struct spdk_bdev *bdev) 3505 { 3506 int i; 3507 3508 /* 3509 * Cleanly shutting down the QoS poller is tricky, because 3510 * during the asynchronous operation the user could open 3511 * a new descriptor and create a new channel, spawning 3512 * a new QoS poller. 3513 * 3514 * The strategy is to create a new QoS structure here and swap it 3515 * in. The shutdown path then continues to refer to the old one 3516 * until it completes and then releases it. 3517 */ 3518 struct spdk_bdev_qos *new_qos, *old_qos; 3519 3520 old_qos = bdev->internal.qos; 3521 3522 new_qos = calloc(1, sizeof(*new_qos)); 3523 if (!new_qos) { 3524 SPDK_ERRLOG("Unable to allocate memory to shut down QoS.\n"); 3525 return -ENOMEM; 3526 } 3527 3528 /* Copy the old QoS data into the newly allocated structure */ 3529 memcpy(new_qos, old_qos, sizeof(*new_qos)); 3530 3531 /* Zero out the key parts of the QoS structure */ 3532 new_qos->ch = NULL; 3533 new_qos->thread = NULL; 3534 new_qos->poller = NULL; 3535 TAILQ_INIT(&new_qos->queued); 3536 /* 3537 * The limit member of spdk_bdev_qos_limit structure is not zeroed. 3538 * It will be used later for the new QoS structure. 3539 */ 3540 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3541 new_qos->rate_limits[i].remaining_this_timeslice = 0; 3542 new_qos->rate_limits[i].min_per_timeslice = 0; 3543 new_qos->rate_limits[i].max_per_timeslice = 0; 3544 } 3545 3546 bdev->internal.qos = new_qos; 3547 3548 if (old_qos->thread == NULL) { 3549 free(old_qos); 3550 } else { 3551 spdk_thread_send_msg(old_qos->thread, bdev_qos_channel_destroy, old_qos); 3552 } 3553 3554 /* It is safe to continue with destroying the bdev even though the QoS channel hasn't 3555 * been destroyed yet. The destruction path will end up waiting for the final 3556 * channel to be put before it releases resources. 
*/ 3557 3558 return 0; 3559 } 3560 3561 static void 3562 bdev_io_stat_add(struct spdk_bdev_io_stat *total, struct spdk_bdev_io_stat *add) 3563 { 3564 total->bytes_read += add->bytes_read; 3565 total->num_read_ops += add->num_read_ops; 3566 total->bytes_written += add->bytes_written; 3567 total->num_write_ops += add->num_write_ops; 3568 total->bytes_unmapped += add->bytes_unmapped; 3569 total->num_unmap_ops += add->num_unmap_ops; 3570 total->read_latency_ticks += add->read_latency_ticks; 3571 total->write_latency_ticks += add->write_latency_ticks; 3572 total->unmap_latency_ticks += add->unmap_latency_ticks; 3573 } 3574 3575 static void 3576 bdev_channel_abort_queued_ios(struct spdk_bdev_channel *ch) 3577 { 3578 struct spdk_bdev_shared_resource *shared_resource = ch->shared_resource; 3579 struct spdk_bdev_mgmt_channel *mgmt_ch = shared_resource->mgmt_ch; 3580 3581 bdev_abort_all_queued_io(&shared_resource->nomem_io, ch); 3582 bdev_abort_all_buf_io(&mgmt_ch->need_buf_small, ch); 3583 bdev_abort_all_buf_io(&mgmt_ch->need_buf_large, ch); 3584 } 3585 3586 static void 3587 bdev_channel_destroy(void *io_device, void *ctx_buf) 3588 { 3589 struct spdk_bdev_channel *ch = ctx_buf; 3590 3591 SPDK_DEBUGLOG(bdev, "Destroying channel %p for bdev %s on thread %p\n", ch, ch->bdev->name, 3592 spdk_get_thread()); 3593 3594 spdk_trace_record(TRACE_BDEV_IOCH_DESTROY, 0, 0, 0, ch->bdev->name, 3595 spdk_thread_get_id(spdk_io_channel_get_thread(ch->channel))); 3596 3597 /* This channel is going away, so add its statistics into the bdev so that they don't get lost. */ 3598 pthread_mutex_lock(&ch->bdev->internal.mutex); 3599 bdev_io_stat_add(&ch->bdev->internal.stat, &ch->stat); 3600 pthread_mutex_unlock(&ch->bdev->internal.mutex); 3601 3602 bdev_abort_all_queued_io(&ch->queued_resets, ch); 3603 3604 bdev_channel_abort_queued_ios(ch); 3605 3606 if (ch->histogram) { 3607 spdk_histogram_data_free(ch->histogram); 3608 } 3609 3610 bdev_channel_destroy_resource(ch); 3611 } 3612 3613 /* 3614 * If the name already exists in the global bdev name tree, RB_INSERT() returns a pointer 3615 * to it. Hence we do not have to call bdev_get_by_name() when using this function. 
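 *
 * Illustrative usage sketch (hypothetical alias string, not part of this
 * file): spdk_bdev_alias_add() below relies on this same uniqueness check, so
 * a duplicate name or alias simply surfaces as -EEXIST:
 *
 *     rc = spdk_bdev_alias_add(bdev, "disk0n1");
 *     if (rc == -EEXIST) {
 *             SPDK_ERRLOG("name or alias disk0n1 is already in use\n");
 *     }
 *
 * and spdk_bdev_alias_del(bdev, "disk0n1") removes the alias again.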
3616 */ 3617 static int 3618 bdev_name_add(struct spdk_bdev_name *bdev_name, struct spdk_bdev *bdev, const char *name) 3619 { 3620 struct spdk_bdev_name *tmp; 3621 3622 bdev_name->name = strdup(name); 3623 if (bdev_name->name == NULL) { 3624 SPDK_ERRLOG("Unable to allocate bdev name\n"); 3625 return -ENOMEM; 3626 } 3627 3628 bdev_name->bdev = bdev; 3629 3630 pthread_mutex_lock(&g_bdev_mgr.mutex); 3631 tmp = RB_INSERT(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3632 pthread_mutex_unlock(&g_bdev_mgr.mutex); 3633 3634 if (tmp != NULL) { 3635 SPDK_ERRLOG("Bdev name %s already exists\n", name); 3636 free(bdev_name->name); 3637 return -EEXIST; 3638 } 3639 3640 return 0; 3641 } 3642 3643 static void 3644 bdev_name_del_unsafe(struct spdk_bdev_name *bdev_name) 3645 { 3646 RB_REMOVE(bdev_name_tree, &g_bdev_mgr.bdev_names, bdev_name); 3647 free(bdev_name->name); 3648 } 3649 3650 static void 3651 bdev_name_del(struct spdk_bdev_name *bdev_name) 3652 { 3653 pthread_mutex_lock(&g_bdev_mgr.mutex); 3654 bdev_name_del_unsafe(bdev_name); 3655 pthread_mutex_unlock(&g_bdev_mgr.mutex); 3656 } 3657 3658 int 3659 spdk_bdev_alias_add(struct spdk_bdev *bdev, const char *alias) 3660 { 3661 struct spdk_bdev_alias *tmp; 3662 int ret; 3663 3664 if (alias == NULL) { 3665 SPDK_ERRLOG("Empty alias passed\n"); 3666 return -EINVAL; 3667 } 3668 3669 tmp = calloc(1, sizeof(*tmp)); 3670 if (tmp == NULL) { 3671 SPDK_ERRLOG("Unable to allocate alias\n"); 3672 return -ENOMEM; 3673 } 3674 3675 ret = bdev_name_add(&tmp->alias, bdev, alias); 3676 if (ret != 0) { 3677 free(tmp); 3678 return ret; 3679 } 3680 3681 TAILQ_INSERT_TAIL(&bdev->aliases, tmp, tailq); 3682 3683 return 0; 3684 } 3685 3686 static int 3687 bdev_alias_del(struct spdk_bdev *bdev, const char *alias, 3688 void (*alias_del_fn)(struct spdk_bdev_name *n)) 3689 { 3690 struct spdk_bdev_alias *tmp; 3691 3692 TAILQ_FOREACH(tmp, &bdev->aliases, tailq) { 3693 if (strcmp(alias, tmp->alias.name) == 0) { 3694 TAILQ_REMOVE(&bdev->aliases, tmp, tailq); 3695 alias_del_fn(&tmp->alias); 3696 free(tmp); 3697 return 0; 3698 } 3699 } 3700 3701 return -ENOENT; 3702 } 3703 3704 int 3705 spdk_bdev_alias_del(struct spdk_bdev *bdev, const char *alias) 3706 { 3707 int rc; 3708 3709 rc = bdev_alias_del(bdev, alias, bdev_name_del); 3710 if (rc == -ENOENT) { 3711 SPDK_INFOLOG(bdev, "Alias %s does not exist\n", alias); 3712 } 3713 3714 return rc; 3715 } 3716 3717 void 3718 spdk_bdev_alias_del_all(struct spdk_bdev *bdev) 3719 { 3720 struct spdk_bdev_alias *p, *tmp; 3721 3722 TAILQ_FOREACH_SAFE(p, &bdev->aliases, tailq, tmp) { 3723 TAILQ_REMOVE(&bdev->aliases, p, tailq); 3724 bdev_name_del(&p->alias); 3725 free(p); 3726 } 3727 } 3728 3729 struct spdk_io_channel * 3730 spdk_bdev_get_io_channel(struct spdk_bdev_desc *desc) 3731 { 3732 return spdk_get_io_channel(__bdev_to_io_dev(spdk_bdev_desc_get_bdev(desc))); 3733 } 3734 3735 void * 3736 spdk_bdev_get_module_ctx(struct spdk_bdev_desc *desc) 3737 { 3738 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 3739 void *ctx = NULL; 3740 3741 if (bdev->fn_table->get_module_ctx) { 3742 ctx = bdev->fn_table->get_module_ctx(bdev->ctxt); 3743 } 3744 3745 return ctx; 3746 } 3747 3748 const char * 3749 spdk_bdev_get_module_name(const struct spdk_bdev *bdev) 3750 { 3751 return bdev->module->name; 3752 } 3753 3754 const char * 3755 spdk_bdev_get_name(const struct spdk_bdev *bdev) 3756 { 3757 return bdev->name; 3758 } 3759 3760 const char * 3761 spdk_bdev_get_product_name(const struct spdk_bdev *bdev) 3762 { 3763 return bdev->product_name; 3764 } 3765 
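/*
 * Illustrative usage sketch (hypothetical caller variables, not part of this
 * file): the getters below are typically combined to size and align I/O
 * buffers before submitting reads or writes, e.g.:
 *
 *     uint32_t blocklen = spdk_bdev_get_block_size(bdev);
 *     size_t align = spdk_bdev_get_buf_align(bdev);
 *     void *buf = spdk_dma_zmalloc(8 * blocklen, align, NULL);
 *
 *     if (buf == NULL) {
 *             // handle allocation failure
 *     }
 *
 * where bdev was obtained from an open descriptor via spdk_bdev_desc_get_bdev().
 */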
3766 const struct spdk_bdev_aliases_list * 3767 spdk_bdev_get_aliases(const struct spdk_bdev *bdev) 3768 { 3769 return &bdev->aliases; 3770 } 3771 3772 uint32_t 3773 spdk_bdev_get_block_size(const struct spdk_bdev *bdev) 3774 { 3775 return bdev->blocklen; 3776 } 3777 3778 uint32_t 3779 spdk_bdev_get_write_unit_size(const struct spdk_bdev *bdev) 3780 { 3781 return bdev->write_unit_size; 3782 } 3783 3784 uint64_t 3785 spdk_bdev_get_num_blocks(const struct spdk_bdev *bdev) 3786 { 3787 return bdev->blockcnt; 3788 } 3789 3790 const char * 3791 spdk_bdev_get_qos_rpc_type(enum spdk_bdev_qos_rate_limit_type type) 3792 { 3793 return qos_rpc_type[type]; 3794 } 3795 3796 void 3797 spdk_bdev_get_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 3798 { 3799 int i; 3800 3801 memset(limits, 0, sizeof(*limits) * SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES); 3802 3803 pthread_mutex_lock(&bdev->internal.mutex); 3804 if (bdev->internal.qos) { 3805 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 3806 if (bdev->internal.qos->rate_limits[i].limit != 3807 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 3808 limits[i] = bdev->internal.qos->rate_limits[i].limit; 3809 if (bdev_qos_is_iops_rate_limit(i) == false) { 3810 /* Change from Byte to Megabyte which is user visible. */ 3811 limits[i] = limits[i] / 1024 / 1024; 3812 } 3813 } 3814 } 3815 } 3816 pthread_mutex_unlock(&bdev->internal.mutex); 3817 } 3818 3819 size_t 3820 spdk_bdev_get_buf_align(const struct spdk_bdev *bdev) 3821 { 3822 return 1 << bdev->required_alignment; 3823 } 3824 3825 uint32_t 3826 spdk_bdev_get_optimal_io_boundary(const struct spdk_bdev *bdev) 3827 { 3828 return bdev->optimal_io_boundary; 3829 } 3830 3831 bool 3832 spdk_bdev_has_write_cache(const struct spdk_bdev *bdev) 3833 { 3834 return bdev->write_cache; 3835 } 3836 3837 const struct spdk_uuid * 3838 spdk_bdev_get_uuid(const struct spdk_bdev *bdev) 3839 { 3840 return &bdev->uuid; 3841 } 3842 3843 uint16_t 3844 spdk_bdev_get_acwu(const struct spdk_bdev *bdev) 3845 { 3846 return bdev->acwu; 3847 } 3848 3849 uint32_t 3850 spdk_bdev_get_md_size(const struct spdk_bdev *bdev) 3851 { 3852 return bdev->md_len; 3853 } 3854 3855 bool 3856 spdk_bdev_is_md_interleaved(const struct spdk_bdev *bdev) 3857 { 3858 return (bdev->md_len != 0) && bdev->md_interleave; 3859 } 3860 3861 bool 3862 spdk_bdev_is_md_separate(const struct spdk_bdev *bdev) 3863 { 3864 return (bdev->md_len != 0) && !bdev->md_interleave; 3865 } 3866 3867 bool 3868 spdk_bdev_is_zoned(const struct spdk_bdev *bdev) 3869 { 3870 return bdev->zoned; 3871 } 3872 3873 uint32_t 3874 spdk_bdev_get_data_block_size(const struct spdk_bdev *bdev) 3875 { 3876 if (spdk_bdev_is_md_interleaved(bdev)) { 3877 return bdev->blocklen - bdev->md_len; 3878 } else { 3879 return bdev->blocklen; 3880 } 3881 } 3882 3883 uint32_t 3884 spdk_bdev_get_physical_block_size(const struct spdk_bdev *bdev) 3885 { 3886 return bdev->phys_blocklen; 3887 } 3888 3889 static uint32_t 3890 _bdev_get_block_size_with_md(const struct spdk_bdev *bdev) 3891 { 3892 if (!spdk_bdev_is_md_interleaved(bdev)) { 3893 return bdev->blocklen + bdev->md_len; 3894 } else { 3895 return bdev->blocklen; 3896 } 3897 } 3898 3899 enum spdk_dif_type spdk_bdev_get_dif_type(const struct spdk_bdev *bdev) 3900 { 3901 if (bdev->md_len != 0) { 3902 return bdev->dif_type; 3903 } else { 3904 return SPDK_DIF_DISABLE; 3905 } 3906 } 3907 3908 bool 3909 spdk_bdev_is_dif_head_of_md(const struct spdk_bdev *bdev) 3910 { 3911 if (spdk_bdev_get_dif_type(bdev) != SPDK_DIF_DISABLE) { 3912 return bdev->dif_is_head_of_md; 3913 
} else { 3914 return false; 3915 } 3916 } 3917 3918 bool 3919 spdk_bdev_is_dif_check_enabled(const struct spdk_bdev *bdev, 3920 enum spdk_dif_check_type check_type) 3921 { 3922 if (spdk_bdev_get_dif_type(bdev) == SPDK_DIF_DISABLE) { 3923 return false; 3924 } 3925 3926 switch (check_type) { 3927 case SPDK_DIF_CHECK_TYPE_REFTAG: 3928 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_REFTAG_CHECK) != 0; 3929 case SPDK_DIF_CHECK_TYPE_APPTAG: 3930 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_APPTAG_CHECK) != 0; 3931 case SPDK_DIF_CHECK_TYPE_GUARD: 3932 return (bdev->dif_check_flags & SPDK_DIF_FLAGS_GUARD_CHECK) != 0; 3933 default: 3934 return false; 3935 } 3936 } 3937 3938 uint64_t 3939 spdk_bdev_get_qd(const struct spdk_bdev *bdev) 3940 { 3941 return bdev->internal.measured_queue_depth; 3942 } 3943 3944 uint64_t 3945 spdk_bdev_get_qd_sampling_period(const struct spdk_bdev *bdev) 3946 { 3947 return bdev->internal.period; 3948 } 3949 3950 uint64_t 3951 spdk_bdev_get_weighted_io_time(const struct spdk_bdev *bdev) 3952 { 3953 return bdev->internal.weighted_io_time; 3954 } 3955 3956 uint64_t 3957 spdk_bdev_get_io_time(const struct spdk_bdev *bdev) 3958 { 3959 return bdev->internal.io_time; 3960 } 3961 3962 static void 3963 _calculate_measured_qd_cpl(struct spdk_io_channel_iter *i, int status) 3964 { 3965 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 3966 3967 bdev->internal.measured_queue_depth = bdev->internal.temporary_queue_depth; 3968 3969 if (bdev->internal.measured_queue_depth) { 3970 bdev->internal.io_time += bdev->internal.period; 3971 bdev->internal.weighted_io_time += bdev->internal.period * bdev->internal.measured_queue_depth; 3972 } 3973 } 3974 3975 static void 3976 _calculate_measured_qd(struct spdk_io_channel_iter *i) 3977 { 3978 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 3979 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 3980 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(io_ch); 3981 3982 bdev->internal.temporary_queue_depth += ch->io_outstanding; 3983 spdk_for_each_channel_continue(i, 0); 3984 } 3985 3986 static int 3987 bdev_calculate_measured_queue_depth(void *ctx) 3988 { 3989 struct spdk_bdev *bdev = ctx; 3990 bdev->internal.temporary_queue_depth = 0; 3991 spdk_for_each_channel(__bdev_to_io_dev(bdev), _calculate_measured_qd, bdev, 3992 _calculate_measured_qd_cpl); 3993 return SPDK_POLLER_BUSY; 3994 } 3995 3996 void 3997 spdk_bdev_set_qd_sampling_period(struct spdk_bdev *bdev, uint64_t period) 3998 { 3999 bdev->internal.period = period; 4000 4001 if (bdev->internal.qd_poller != NULL) { 4002 spdk_poller_unregister(&bdev->internal.qd_poller); 4003 bdev->internal.measured_queue_depth = UINT64_MAX; 4004 } 4005 4006 if (period != 0) { 4007 bdev->internal.qd_poller = SPDK_POLLER_REGISTER(bdev_calculate_measured_queue_depth, bdev, 4008 period); 4009 } 4010 } 4011 4012 static void 4013 _resize_notify(void *arg) 4014 { 4015 struct spdk_bdev_desc *desc = arg; 4016 4017 pthread_mutex_lock(&desc->mutex); 4018 desc->refs--; 4019 if (!desc->closed) { 4020 pthread_mutex_unlock(&desc->mutex); 4021 desc->callback.event_fn(SPDK_BDEV_EVENT_RESIZE, 4022 desc->bdev, 4023 desc->callback.ctx); 4024 return; 4025 } else if (0 == desc->refs) { 4026 /* This descriptor was closed after this resize_notify message was sent. 4027 * spdk_bdev_close() could not free the descriptor since this message was 4028 * in flight, so we free it now using bdev_desc_free(). 
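 * (desc->refs counts such in-flight event messages; the descriptor may only be
 * freed once the count drops to zero after the descriptor has been closed.)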
4029 */ 4030 pthread_mutex_unlock(&desc->mutex); 4031 bdev_desc_free(desc); 4032 return; 4033 } 4034 pthread_mutex_unlock(&desc->mutex); 4035 } 4036 4037 int 4038 spdk_bdev_notify_blockcnt_change(struct spdk_bdev *bdev, uint64_t size) 4039 { 4040 struct spdk_bdev_desc *desc; 4041 int ret; 4042 4043 if (size == bdev->blockcnt) { 4044 return 0; 4045 } 4046 4047 pthread_mutex_lock(&bdev->internal.mutex); 4048 4049 /* bdev has open descriptors */ 4050 if (!TAILQ_EMPTY(&bdev->internal.open_descs) && 4051 bdev->blockcnt > size) { 4052 ret = -EBUSY; 4053 } else { 4054 bdev->blockcnt = size; 4055 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 4056 pthread_mutex_lock(&desc->mutex); 4057 if (!desc->closed) { 4058 desc->refs++; 4059 spdk_thread_send_msg(desc->thread, _resize_notify, desc); 4060 } 4061 pthread_mutex_unlock(&desc->mutex); 4062 } 4063 ret = 0; 4064 } 4065 4066 pthread_mutex_unlock(&bdev->internal.mutex); 4067 4068 return ret; 4069 } 4070 4071 /* 4072 * Convert I/O offset and length from bytes to blocks. 4073 * 4074 * Returns zero on success or non-zero if the byte parameters aren't divisible by the block size. 4075 */ 4076 static uint64_t 4077 bdev_bytes_to_blocks(struct spdk_bdev *bdev, uint64_t offset_bytes, uint64_t *offset_blocks, 4078 uint64_t num_bytes, uint64_t *num_blocks) 4079 { 4080 uint32_t block_size = bdev->blocklen; 4081 uint8_t shift_cnt; 4082 4083 /* Avoid expensive div operations if possible. These spdk_u32 functions are very cheap. */ 4084 if (spdk_likely(spdk_u32_is_pow2(block_size))) { 4085 shift_cnt = spdk_u32log2(block_size); 4086 *offset_blocks = offset_bytes >> shift_cnt; 4087 *num_blocks = num_bytes >> shift_cnt; 4088 return (offset_bytes - (*offset_blocks << shift_cnt)) | 4089 (num_bytes - (*num_blocks << shift_cnt)); 4090 } else { 4091 *offset_blocks = offset_bytes / block_size; 4092 *num_blocks = num_bytes / block_size; 4093 return (offset_bytes % block_size) | (num_bytes % block_size); 4094 } 4095 } 4096 4097 static bool 4098 bdev_io_valid_blocks(struct spdk_bdev *bdev, uint64_t offset_blocks, uint64_t num_blocks) 4099 { 4100 /* Return failure if offset_blocks + num_blocks is less than offset_blocks; indicates there 4101 * has been an overflow and hence the offset has been wrapped around */ 4102 if (offset_blocks + num_blocks < offset_blocks) { 4103 return false; 4104 } 4105 4106 /* Return failure if offset_blocks + num_blocks exceeds the size of the bdev */ 4107 if (offset_blocks + num_blocks > bdev->blockcnt) { 4108 return false; 4109 } 4110 4111 return true; 4112 } 4113 4114 static bool 4115 _bdev_io_check_md_buf(const struct iovec *iovs, const void *md_buf) 4116 { 4117 return _is_buf_allocated(iovs) == (md_buf != NULL); 4118 } 4119 4120 static int 4121 bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, void *buf, 4122 void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4123 spdk_bdev_io_completion_cb cb, void *cb_arg) 4124 { 4125 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4126 struct spdk_bdev_io *bdev_io; 4127 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4128 4129 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4130 return -EINVAL; 4131 } 4132 4133 bdev_io = bdev_channel_get_io(channel); 4134 if (!bdev_io) { 4135 return -ENOMEM; 4136 } 4137 4138 bdev_io->internal.ch = channel; 4139 bdev_io->internal.desc = desc; 4140 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4141 bdev_io->u.bdev.iovs = &bdev_io->iov; 4142 bdev_io->u.bdev.iovs[0].iov_base = buf; 4143 
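	/* The caller's single contiguous buffer is presented to the backend as a
	 * one-element iovec embedded in the bdev_io itself, so backends always
	 * receive the iovec form of the request.
	 */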
bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4144 bdev_io->u.bdev.iovcnt = 1; 4145 bdev_io->u.bdev.md_buf = md_buf; 4146 bdev_io->u.bdev.num_blocks = num_blocks; 4147 bdev_io->u.bdev.offset_blocks = offset_blocks; 4148 bdev_io->u.bdev.ext_opts = NULL; 4149 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4150 4151 bdev_io_submit(bdev_io); 4152 return 0; 4153 } 4154 4155 int 4156 spdk_bdev_read(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4157 void *buf, uint64_t offset, uint64_t nbytes, 4158 spdk_bdev_io_completion_cb cb, void *cb_arg) 4159 { 4160 uint64_t offset_blocks, num_blocks; 4161 4162 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4163 nbytes, &num_blocks) != 0) { 4164 return -EINVAL; 4165 } 4166 4167 return spdk_bdev_read_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4168 } 4169 4170 int 4171 spdk_bdev_read_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4172 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4173 spdk_bdev_io_completion_cb cb, void *cb_arg) 4174 { 4175 return bdev_read_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, cb, cb_arg); 4176 } 4177 4178 int 4179 spdk_bdev_read_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4180 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4181 spdk_bdev_io_completion_cb cb, void *cb_arg) 4182 { 4183 struct iovec iov = { 4184 .iov_base = buf, 4185 }; 4186 4187 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4188 return -EINVAL; 4189 } 4190 4191 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 4192 return -EINVAL; 4193 } 4194 4195 return bdev_read_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4196 cb, cb_arg); 4197 } 4198 4199 int 4200 spdk_bdev_readv(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4201 struct iovec *iov, int iovcnt, 4202 uint64_t offset, uint64_t nbytes, 4203 spdk_bdev_io_completion_cb cb, void *cb_arg) 4204 { 4205 uint64_t offset_blocks, num_blocks; 4206 4207 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4208 nbytes, &num_blocks) != 0) { 4209 return -EINVAL; 4210 } 4211 4212 return spdk_bdev_readv_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4213 } 4214 4215 static int 4216 bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4217 struct iovec *iov, int iovcnt, void *md_buf, uint64_t offset_blocks, 4218 uint64_t num_blocks, spdk_bdev_io_completion_cb cb, void *cb_arg, 4219 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 4220 { 4221 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4222 struct spdk_bdev_io *bdev_io; 4223 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4224 4225 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4226 return -EINVAL; 4227 } 4228 4229 bdev_io = bdev_channel_get_io(channel); 4230 if (!bdev_io) { 4231 return -ENOMEM; 4232 } 4233 4234 bdev_io->internal.ch = channel; 4235 bdev_io->internal.desc = desc; 4236 bdev_io->type = SPDK_BDEV_IO_TYPE_READ; 4237 bdev_io->u.bdev.iovs = iov; 4238 bdev_io->u.bdev.iovcnt = iovcnt; 4239 bdev_io->u.bdev.md_buf = md_buf; 4240 bdev_io->u.bdev.num_blocks = num_blocks; 4241 bdev_io->u.bdev.offset_blocks = offset_blocks; 4242 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4243 bdev_io->internal.ext_opts = opts; 4244 bdev_io->u.bdev.ext_opts = opts; 4245 4246 _bdev_io_submit_ext(desc, bdev_io, opts, copy_opts); 4247 4248 return 0; 4249 } 4250 4251 int 
spdk_bdev_readv_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		       struct iovec *iov, int iovcnt,
		       uint64_t offset_blocks, uint64_t num_blocks,
		       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks,
					 num_blocks, cb, cb_arg, NULL, false);
}

int
spdk_bdev_readv_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			       struct iovec *iov, int iovcnt, void *md_buf,
			       uint64_t offset_blocks, uint64_t num_blocks,
			       spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
		return -EINVAL;
	}

	if (!_bdev_io_check_md_buf(iov, md_buf)) {
		return -EINVAL;
	}

	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks,
					 num_blocks, cb, cb_arg, NULL, false);
}

static inline bool
_bdev_io_check_opts(struct spdk_bdev_ext_io_opts *opts, struct iovec *iov)
{
	/*
	 * We check that the size of opts is at least the size this structure had when
	 * spdk_bdev_ext_io_opts was first introduced (ac6f2bdd8d), since access to those
	 * members is not checked internally.
	 */
	return opts->size >= offsetof(struct spdk_bdev_ext_io_opts, metadata) +
	       sizeof(opts->metadata) &&
	       opts->size <= sizeof(*opts) &&
	       /* When memory domain is used, the user must provide data buffers */
	       (!opts->memory_domain || (iov && iov[0].iov_base));
}

int
spdk_bdev_readv_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			   struct iovec *iov, int iovcnt,
			   uint64_t offset_blocks, uint64_t num_blocks,
			   spdk_bdev_io_completion_cb cb, void *cb_arg,
			   struct spdk_bdev_ext_io_opts *opts)
{
	void *md = NULL;

	if (opts) {
		if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) {
			return -EINVAL;
		}
		md = opts->metadata;
	}

	if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) {
		return -EINVAL;
	}

	if (md && !_bdev_io_check_md_buf(iov, md)) {
		return -EINVAL;
	}

	return bdev_readv_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks,
					 num_blocks, cb, cb_arg, opts, false);
}

static int
bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
			  void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks,
			  spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_io *bdev_io;
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);

	if (!desc->write) {
		return -EBADF;
	}

	if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) {
		return -EINVAL;
	}

	bdev_io = bdev_channel_get_io(channel);
	if (!bdev_io) {
		return -ENOMEM;
	}

	bdev_io->internal.ch = channel;
	bdev_io->internal.desc = desc;
	bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE;
	bdev_io->u.bdev.iovs = &bdev_io->iov;
	bdev_io->u.bdev.iovs[0].iov_base = buf;
	bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen;
	bdev_io->u.bdev.iovcnt = 1;
	bdev_io->u.bdev.md_buf = md_buf;
	bdev_io->u.bdev.num_blocks = num_blocks;
	bdev_io->u.bdev.offset_blocks = offset_blocks;
	bdev_io->u.bdev.ext_opts = NULL;
	bdev_io_init(bdev_io, bdev, cb_arg, cb);

	bdev_io_submit(bdev_io);
	return 0;
4358 } 4359 4360 int 4361 spdk_bdev_write(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4362 void *buf, uint64_t offset, uint64_t nbytes, 4363 spdk_bdev_io_completion_cb cb, void *cb_arg) 4364 { 4365 uint64_t offset_blocks, num_blocks; 4366 4367 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4368 nbytes, &num_blocks) != 0) { 4369 return -EINVAL; 4370 } 4371 4372 return spdk_bdev_write_blocks(desc, ch, buf, offset_blocks, num_blocks, cb, cb_arg); 4373 } 4374 4375 int 4376 spdk_bdev_write_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4377 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4378 spdk_bdev_io_completion_cb cb, void *cb_arg) 4379 { 4380 return bdev_write_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4381 cb, cb_arg); 4382 } 4383 4384 int 4385 spdk_bdev_write_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4386 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4387 spdk_bdev_io_completion_cb cb, void *cb_arg) 4388 { 4389 struct iovec iov = { 4390 .iov_base = buf, 4391 }; 4392 4393 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4394 return -EINVAL; 4395 } 4396 4397 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 4398 return -EINVAL; 4399 } 4400 4401 return bdev_write_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4402 cb, cb_arg); 4403 } 4404 4405 static int 4406 bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4407 struct iovec *iov, int iovcnt, void *md_buf, 4408 uint64_t offset_blocks, uint64_t num_blocks, 4409 spdk_bdev_io_completion_cb cb, void *cb_arg, 4410 struct spdk_bdev_ext_io_opts *opts, bool copy_opts) 4411 { 4412 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4413 struct spdk_bdev_io *bdev_io; 4414 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4415 4416 if (!desc->write) { 4417 return -EBADF; 4418 } 4419 4420 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4421 return -EINVAL; 4422 } 4423 4424 bdev_io = bdev_channel_get_io(channel); 4425 if (!bdev_io) { 4426 return -ENOMEM; 4427 } 4428 4429 bdev_io->internal.ch = channel; 4430 bdev_io->internal.desc = desc; 4431 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE; 4432 bdev_io->u.bdev.iovs = iov; 4433 bdev_io->u.bdev.iovcnt = iovcnt; 4434 bdev_io->u.bdev.md_buf = md_buf; 4435 bdev_io->u.bdev.num_blocks = num_blocks; 4436 bdev_io->u.bdev.offset_blocks = offset_blocks; 4437 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4438 bdev_io->internal.ext_opts = opts; 4439 bdev_io->u.bdev.ext_opts = opts; 4440 4441 _bdev_io_submit_ext(desc, bdev_io, opts, copy_opts); 4442 4443 return 0; 4444 } 4445 4446 int 4447 spdk_bdev_writev(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4448 struct iovec *iov, int iovcnt, 4449 uint64_t offset, uint64_t len, 4450 spdk_bdev_io_completion_cb cb, void *cb_arg) 4451 { 4452 uint64_t offset_blocks, num_blocks; 4453 4454 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4455 len, &num_blocks) != 0) { 4456 return -EINVAL; 4457 } 4458 4459 return spdk_bdev_writev_blocks(desc, ch, iov, iovcnt, offset_blocks, num_blocks, cb, cb_arg); 4460 } 4461 4462 int 4463 spdk_bdev_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4464 struct iovec *iov, int iovcnt, 4465 uint64_t offset_blocks, uint64_t num_blocks, 4466 spdk_bdev_io_completion_cb cb, void *cb_arg) 4467 { 4468 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, NULL, 
offset_blocks, 4469 num_blocks, cb, cb_arg, NULL, false); 4470 } 4471 4472 int 4473 spdk_bdev_writev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4474 struct iovec *iov, int iovcnt, void *md_buf, 4475 uint64_t offset_blocks, uint64_t num_blocks, 4476 spdk_bdev_io_completion_cb cb, void *cb_arg) 4477 { 4478 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4479 return -EINVAL; 4480 } 4481 4482 if (!_bdev_io_check_md_buf(iov, md_buf)) { 4483 return -EINVAL; 4484 } 4485 4486 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4487 num_blocks, cb, cb_arg, NULL, false); 4488 } 4489 4490 int 4491 spdk_bdev_writev_blocks_ext(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4492 struct iovec *iov, int iovcnt, 4493 uint64_t offset_blocks, uint64_t num_blocks, 4494 spdk_bdev_io_completion_cb cb, void *cb_arg, 4495 struct spdk_bdev_ext_io_opts *opts) 4496 { 4497 void *md = NULL; 4498 4499 if (opts) { 4500 if (spdk_unlikely(!_bdev_io_check_opts(opts, iov))) { 4501 return -EINVAL; 4502 } 4503 md = opts->metadata; 4504 } 4505 4506 if (md && !spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4507 return -EINVAL; 4508 } 4509 4510 if (md && !_bdev_io_check_md_buf(iov, md)) { 4511 return -EINVAL; 4512 } 4513 4514 return bdev_writev_blocks_with_md(desc, ch, iov, iovcnt, md, offset_blocks, 4515 num_blocks, cb, cb_arg, opts, false); 4516 } 4517 4518 static void 4519 bdev_compare_do_read_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4520 { 4521 struct spdk_bdev_io *parent_io = cb_arg; 4522 uint8_t *read_buf = bdev_io->u.bdev.iovs[0].iov_base; 4523 int i, rc = 0; 4524 4525 if (!success) { 4526 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4527 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 4528 spdk_bdev_free_io(bdev_io); 4529 return; 4530 } 4531 4532 for (i = 0; i < parent_io->u.bdev.iovcnt; i++) { 4533 rc = memcmp(read_buf, 4534 parent_io->u.bdev.iovs[i].iov_base, 4535 parent_io->u.bdev.iovs[i].iov_len); 4536 if (rc) { 4537 break; 4538 } 4539 read_buf += parent_io->u.bdev.iovs[i].iov_len; 4540 } 4541 4542 spdk_bdev_free_io(bdev_io); 4543 4544 if (rc == 0) { 4545 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 4546 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 4547 } else { 4548 parent_io->internal.status = SPDK_BDEV_IO_STATUS_MISCOMPARE; 4549 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 4550 } 4551 } 4552 4553 static void 4554 bdev_compare_do_read(void *_bdev_io) 4555 { 4556 struct spdk_bdev_io *bdev_io = _bdev_io; 4557 int rc; 4558 4559 rc = spdk_bdev_read_blocks(bdev_io->internal.desc, 4560 spdk_io_channel_from_ctx(bdev_io->internal.ch), NULL, 4561 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4562 bdev_compare_do_read_done, bdev_io); 4563 4564 if (rc == -ENOMEM) { 4565 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_do_read); 4566 } else if (rc != 0) { 4567 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 4568 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 4569 } 4570 } 4571 4572 static int 4573 bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4574 struct iovec *iov, int iovcnt, void *md_buf, 4575 uint64_t offset_blocks, uint64_t num_blocks, 4576 spdk_bdev_io_completion_cb cb, void *cb_arg) 4577 { 4578 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4579 struct spdk_bdev_io *bdev_io; 4580 struct spdk_bdev_channel 
*channel = spdk_io_channel_get_ctx(ch); 4581 4582 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4583 return -EINVAL; 4584 } 4585 4586 bdev_io = bdev_channel_get_io(channel); 4587 if (!bdev_io) { 4588 return -ENOMEM; 4589 } 4590 4591 bdev_io->internal.ch = channel; 4592 bdev_io->internal.desc = desc; 4593 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 4594 bdev_io->u.bdev.iovs = iov; 4595 bdev_io->u.bdev.iovcnt = iovcnt; 4596 bdev_io->u.bdev.md_buf = md_buf; 4597 bdev_io->u.bdev.num_blocks = num_blocks; 4598 bdev_io->u.bdev.offset_blocks = offset_blocks; 4599 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4600 4601 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 4602 bdev_io_submit(bdev_io); 4603 return 0; 4604 } 4605 4606 bdev_compare_do_read(bdev_io); 4607 4608 return 0; 4609 } 4610 4611 int 4612 spdk_bdev_comparev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4613 struct iovec *iov, int iovcnt, 4614 uint64_t offset_blocks, uint64_t num_blocks, 4615 spdk_bdev_io_completion_cb cb, void *cb_arg) 4616 { 4617 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, NULL, offset_blocks, 4618 num_blocks, cb, cb_arg); 4619 } 4620 4621 int 4622 spdk_bdev_comparev_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4623 struct iovec *iov, int iovcnt, void *md_buf, 4624 uint64_t offset_blocks, uint64_t num_blocks, 4625 spdk_bdev_io_completion_cb cb, void *cb_arg) 4626 { 4627 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4628 return -EINVAL; 4629 } 4630 4631 if (!_bdev_io_check_md_buf(iov, md_buf)) { 4632 return -EINVAL; 4633 } 4634 4635 return bdev_comparev_blocks_with_md(desc, ch, iov, iovcnt, md_buf, offset_blocks, 4636 num_blocks, cb, cb_arg); 4637 } 4638 4639 static int 4640 bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4641 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t num_blocks, 4642 spdk_bdev_io_completion_cb cb, void *cb_arg) 4643 { 4644 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4645 struct spdk_bdev_io *bdev_io; 4646 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4647 4648 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4649 return -EINVAL; 4650 } 4651 4652 bdev_io = bdev_channel_get_io(channel); 4653 if (!bdev_io) { 4654 return -ENOMEM; 4655 } 4656 4657 bdev_io->internal.ch = channel; 4658 bdev_io->internal.desc = desc; 4659 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE; 4660 bdev_io->u.bdev.iovs = &bdev_io->iov; 4661 bdev_io->u.bdev.iovs[0].iov_base = buf; 4662 bdev_io->u.bdev.iovs[0].iov_len = num_blocks * bdev->blocklen; 4663 bdev_io->u.bdev.iovcnt = 1; 4664 bdev_io->u.bdev.md_buf = md_buf; 4665 bdev_io->u.bdev.num_blocks = num_blocks; 4666 bdev_io->u.bdev.offset_blocks = offset_blocks; 4667 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4668 4669 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE)) { 4670 bdev_io_submit(bdev_io); 4671 return 0; 4672 } 4673 4674 bdev_compare_do_read(bdev_io); 4675 4676 return 0; 4677 } 4678 4679 int 4680 spdk_bdev_compare_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4681 void *buf, uint64_t offset_blocks, uint64_t num_blocks, 4682 spdk_bdev_io_completion_cb cb, void *cb_arg) 4683 { 4684 return bdev_compare_blocks_with_md(desc, ch, buf, NULL, offset_blocks, num_blocks, 4685 cb, cb_arg); 4686 } 4687 4688 int 4689 spdk_bdev_compare_blocks_with_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4690 void *buf, void *md_buf, uint64_t offset_blocks, uint64_t 
num_blocks, 4691 spdk_bdev_io_completion_cb cb, void *cb_arg) 4692 { 4693 struct iovec iov = { 4694 .iov_base = buf, 4695 }; 4696 4697 if (!spdk_bdev_is_md_separate(spdk_bdev_desc_get_bdev(desc))) { 4698 return -EINVAL; 4699 } 4700 4701 if (!_bdev_io_check_md_buf(&iov, md_buf)) { 4702 return -EINVAL; 4703 } 4704 4705 return bdev_compare_blocks_with_md(desc, ch, buf, md_buf, offset_blocks, num_blocks, 4706 cb, cb_arg); 4707 } 4708 4709 static void 4710 bdev_comparev_and_writev_blocks_unlocked(void *ctx, int unlock_status) 4711 { 4712 struct spdk_bdev_io *bdev_io = ctx; 4713 4714 if (unlock_status) { 4715 SPDK_ERRLOG("LBA range unlock failed\n"); 4716 } 4717 4718 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS ? true : 4719 false, bdev_io->internal.caller_ctx); 4720 } 4721 4722 static void 4723 bdev_comparev_and_writev_blocks_unlock(struct spdk_bdev_io *bdev_io, int status) 4724 { 4725 bdev_io->internal.status = status; 4726 4727 bdev_unlock_lba_range(bdev_io->internal.desc, spdk_io_channel_from_ctx(bdev_io->internal.ch), 4728 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4729 bdev_comparev_and_writev_blocks_unlocked, bdev_io); 4730 } 4731 4732 static void 4733 bdev_compare_and_write_do_write_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4734 { 4735 struct spdk_bdev_io *parent_io = cb_arg; 4736 4737 if (!success) { 4738 SPDK_ERRLOG("Compare and write operation failed\n"); 4739 } 4740 4741 spdk_bdev_free_io(bdev_io); 4742 4743 bdev_comparev_and_writev_blocks_unlock(parent_io, 4744 success ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED); 4745 } 4746 4747 static void 4748 bdev_compare_and_write_do_write(void *_bdev_io) 4749 { 4750 struct spdk_bdev_io *bdev_io = _bdev_io; 4751 int rc; 4752 4753 rc = spdk_bdev_writev_blocks(bdev_io->internal.desc, 4754 spdk_io_channel_from_ctx(bdev_io->internal.ch), 4755 bdev_io->u.bdev.fused_iovs, bdev_io->u.bdev.fused_iovcnt, 4756 bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4757 bdev_compare_and_write_do_write_done, bdev_io); 4758 4759 4760 if (rc == -ENOMEM) { 4761 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_write); 4762 } else if (rc != 0) { 4763 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FAILED); 4764 } 4765 } 4766 4767 static void 4768 bdev_compare_and_write_do_compare_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 4769 { 4770 struct spdk_bdev_io *parent_io = cb_arg; 4771 4772 spdk_bdev_free_io(bdev_io); 4773 4774 if (!success) { 4775 bdev_comparev_and_writev_blocks_unlock(parent_io, SPDK_BDEV_IO_STATUS_MISCOMPARE); 4776 return; 4777 } 4778 4779 bdev_compare_and_write_do_write(parent_io); 4780 } 4781 4782 static void 4783 bdev_compare_and_write_do_compare(void *_bdev_io) 4784 { 4785 struct spdk_bdev_io *bdev_io = _bdev_io; 4786 int rc; 4787 4788 rc = spdk_bdev_comparev_blocks(bdev_io->internal.desc, 4789 spdk_io_channel_from_ctx(bdev_io->internal.ch), bdev_io->u.bdev.iovs, 4790 bdev_io->u.bdev.iovcnt, bdev_io->u.bdev.offset_blocks, bdev_io->u.bdev.num_blocks, 4791 bdev_compare_and_write_do_compare_done, bdev_io); 4792 4793 if (rc == -ENOMEM) { 4794 bdev_queue_io_wait_with_cb(bdev_io, bdev_compare_and_write_do_compare); 4795 } else if (rc != 0) { 4796 bdev_comparev_and_writev_blocks_unlock(bdev_io, SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED); 4797 } 4798 } 4799 4800 static void 4801 bdev_comparev_and_writev_blocks_locked(void *ctx, int status) 4802 { 4803 struct spdk_bdev_io *bdev_io = ctx; 4804 4805 if 
(status) { 4806 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED; 4807 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 4808 return; 4809 } 4810 4811 bdev_compare_and_write_do_compare(bdev_io); 4812 } 4813 4814 int 4815 spdk_bdev_comparev_and_writev_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4816 struct iovec *compare_iov, int compare_iovcnt, 4817 struct iovec *write_iov, int write_iovcnt, 4818 uint64_t offset_blocks, uint64_t num_blocks, 4819 spdk_bdev_io_completion_cb cb, void *cb_arg) 4820 { 4821 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4822 struct spdk_bdev_io *bdev_io; 4823 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4824 4825 if (!desc->write) { 4826 return -EBADF; 4827 } 4828 4829 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4830 return -EINVAL; 4831 } 4832 4833 if (num_blocks > bdev->acwu) { 4834 return -EINVAL; 4835 } 4836 4837 bdev_io = bdev_channel_get_io(channel); 4838 if (!bdev_io) { 4839 return -ENOMEM; 4840 } 4841 4842 bdev_io->internal.ch = channel; 4843 bdev_io->internal.desc = desc; 4844 bdev_io->type = SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE; 4845 bdev_io->u.bdev.iovs = compare_iov; 4846 bdev_io->u.bdev.iovcnt = compare_iovcnt; 4847 bdev_io->u.bdev.fused_iovs = write_iov; 4848 bdev_io->u.bdev.fused_iovcnt = write_iovcnt; 4849 bdev_io->u.bdev.md_buf = NULL; 4850 bdev_io->u.bdev.num_blocks = num_blocks; 4851 bdev_io->u.bdev.offset_blocks = offset_blocks; 4852 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4853 4854 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_COMPARE_AND_WRITE)) { 4855 bdev_io_submit(bdev_io); 4856 return 0; 4857 } 4858 4859 return bdev_lock_lba_range(desc, ch, offset_blocks, num_blocks, 4860 bdev_comparev_and_writev_blocks_locked, bdev_io); 4861 } 4862 4863 int 4864 spdk_bdev_zcopy_start(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4865 struct iovec *iov, int iovcnt, 4866 uint64_t offset_blocks, uint64_t num_blocks, 4867 bool populate, 4868 spdk_bdev_io_completion_cb cb, void *cb_arg) 4869 { 4870 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4871 struct spdk_bdev_io *bdev_io; 4872 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4873 4874 if (!desc->write) { 4875 return -EBADF; 4876 } 4877 4878 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4879 return -EINVAL; 4880 } 4881 4882 if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ZCOPY)) { 4883 return -ENOTSUP; 4884 } 4885 4886 bdev_io = bdev_channel_get_io(channel); 4887 if (!bdev_io) { 4888 return -ENOMEM; 4889 } 4890 4891 bdev_io->internal.ch = channel; 4892 bdev_io->internal.desc = desc; 4893 bdev_io->type = SPDK_BDEV_IO_TYPE_ZCOPY; 4894 bdev_io->u.bdev.num_blocks = num_blocks; 4895 bdev_io->u.bdev.offset_blocks = offset_blocks; 4896 bdev_io->u.bdev.iovs = iov; 4897 bdev_io->u.bdev.iovcnt = iovcnt; 4898 bdev_io->u.bdev.md_buf = NULL; 4899 bdev_io->u.bdev.zcopy.populate = populate ? 1 : 0; 4900 bdev_io->u.bdev.zcopy.commit = 0; 4901 bdev_io->u.bdev.zcopy.start = 1; 4902 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4903 4904 bdev_io_submit(bdev_io); 4905 4906 return 0; 4907 } 4908 4909 int 4910 spdk_bdev_zcopy_end(struct spdk_bdev_io *bdev_io, bool commit, 4911 spdk_bdev_io_completion_cb cb, void *cb_arg) 4912 { 4913 if (bdev_io->type != SPDK_BDEV_IO_TYPE_ZCOPY) { 4914 return -EINVAL; 4915 } 4916 4917 bdev_io->u.bdev.zcopy.commit = commit ? 
1 : 0; 4918 bdev_io->u.bdev.zcopy.start = 0; 4919 bdev_io->internal.caller_ctx = cb_arg; 4920 bdev_io->internal.cb = cb; 4921 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_PENDING; 4922 4923 bdev_io_submit(bdev_io); 4924 4925 return 0; 4926 } 4927 4928 int 4929 spdk_bdev_write_zeroes(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4930 uint64_t offset, uint64_t len, 4931 spdk_bdev_io_completion_cb cb, void *cb_arg) 4932 { 4933 uint64_t offset_blocks, num_blocks; 4934 4935 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 4936 len, &num_blocks) != 0) { 4937 return -EINVAL; 4938 } 4939 4940 return spdk_bdev_write_zeroes_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 4941 } 4942 4943 int 4944 spdk_bdev_write_zeroes_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4945 uint64_t offset_blocks, uint64_t num_blocks, 4946 spdk_bdev_io_completion_cb cb, void *cb_arg) 4947 { 4948 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 4949 struct spdk_bdev_io *bdev_io; 4950 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 4951 4952 if (!desc->write) { 4953 return -EBADF; 4954 } 4955 4956 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 4957 return -EINVAL; 4958 } 4959 4960 if (!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES) && 4961 !bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)) { 4962 return -ENOTSUP; 4963 } 4964 4965 bdev_io = bdev_channel_get_io(channel); 4966 4967 if (!bdev_io) { 4968 return -ENOMEM; 4969 } 4970 4971 bdev_io->type = SPDK_BDEV_IO_TYPE_WRITE_ZEROES; 4972 bdev_io->internal.ch = channel; 4973 bdev_io->internal.desc = desc; 4974 bdev_io->u.bdev.offset_blocks = offset_blocks; 4975 bdev_io->u.bdev.num_blocks = num_blocks; 4976 bdev_io_init(bdev_io, bdev, cb_arg, cb); 4977 4978 if (bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE_ZEROES)) { 4979 bdev_io_submit(bdev_io); 4980 return 0; 4981 } 4982 4983 assert(bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_WRITE)); 4984 assert(_bdev_get_block_size_with_md(bdev) <= ZERO_BUFFER_SIZE); 4985 bdev_io->u.bdev.split_remaining_num_blocks = num_blocks; 4986 bdev_io->u.bdev.split_current_offset_blocks = offset_blocks; 4987 bdev_write_zero_buffer_next(bdev_io); 4988 4989 return 0; 4990 } 4991 4992 int 4993 spdk_bdev_unmap(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 4994 uint64_t offset, uint64_t nbytes, 4995 spdk_bdev_io_completion_cb cb, void *cb_arg) 4996 { 4997 uint64_t offset_blocks, num_blocks; 4998 4999 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5000 nbytes, &num_blocks) != 0) { 5001 return -EINVAL; 5002 } 5003 5004 return spdk_bdev_unmap_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5005 } 5006 5007 int 5008 spdk_bdev_unmap_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5009 uint64_t offset_blocks, uint64_t num_blocks, 5010 spdk_bdev_io_completion_cb cb, void *cb_arg) 5011 { 5012 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5013 struct spdk_bdev_io *bdev_io; 5014 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5015 5016 if (!desc->write) { 5017 return -EBADF; 5018 } 5019 5020 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5021 return -EINVAL; 5022 } 5023 5024 if (num_blocks == 0) { 5025 SPDK_ERRLOG("Can't unmap 0 bytes\n"); 5026 return -EINVAL; 5027 } 5028 5029 bdev_io = bdev_channel_get_io(channel); 5030 if (!bdev_io) { 5031 return -ENOMEM; 5032 } 5033 5034 bdev_io->internal.ch = channel; 5035 
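	/* An unmap describes a block range only; no data payload is transferred.
	 * The single empty iovec set up below presumably exists so that code paths
	 * which inspect u.bdev.iovs always see a well-defined (zero-length) vector.
	 */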
bdev_io->internal.desc = desc; 5036 bdev_io->type = SPDK_BDEV_IO_TYPE_UNMAP; 5037 5038 bdev_io->u.bdev.iovs = &bdev_io->iov; 5039 bdev_io->u.bdev.iovs[0].iov_base = NULL; 5040 bdev_io->u.bdev.iovs[0].iov_len = 0; 5041 bdev_io->u.bdev.iovcnt = 1; 5042 5043 bdev_io->u.bdev.offset_blocks = offset_blocks; 5044 bdev_io->u.bdev.num_blocks = num_blocks; 5045 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5046 5047 bdev_io_submit(bdev_io); 5048 return 0; 5049 } 5050 5051 int 5052 spdk_bdev_flush(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5053 uint64_t offset, uint64_t length, 5054 spdk_bdev_io_completion_cb cb, void *cb_arg) 5055 { 5056 uint64_t offset_blocks, num_blocks; 5057 5058 if (bdev_bytes_to_blocks(spdk_bdev_desc_get_bdev(desc), offset, &offset_blocks, 5059 length, &num_blocks) != 0) { 5060 return -EINVAL; 5061 } 5062 5063 return spdk_bdev_flush_blocks(desc, ch, offset_blocks, num_blocks, cb, cb_arg); 5064 } 5065 5066 int 5067 spdk_bdev_flush_blocks(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5068 uint64_t offset_blocks, uint64_t num_blocks, 5069 spdk_bdev_io_completion_cb cb, void *cb_arg) 5070 { 5071 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5072 struct spdk_bdev_io *bdev_io; 5073 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5074 5075 if (!desc->write) { 5076 return -EBADF; 5077 } 5078 5079 if (!bdev_io_valid_blocks(bdev, offset_blocks, num_blocks)) { 5080 return -EINVAL; 5081 } 5082 5083 bdev_io = bdev_channel_get_io(channel); 5084 if (!bdev_io) { 5085 return -ENOMEM; 5086 } 5087 5088 bdev_io->internal.ch = channel; 5089 bdev_io->internal.desc = desc; 5090 bdev_io->type = SPDK_BDEV_IO_TYPE_FLUSH; 5091 bdev_io->u.bdev.iovs = NULL; 5092 bdev_io->u.bdev.iovcnt = 0; 5093 bdev_io->u.bdev.offset_blocks = offset_blocks; 5094 bdev_io->u.bdev.num_blocks = num_blocks; 5095 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5096 5097 bdev_io_submit(bdev_io); 5098 return 0; 5099 } 5100 5101 static void 5102 bdev_reset_dev(struct spdk_io_channel_iter *i, int status) 5103 { 5104 struct spdk_bdev_channel *ch = spdk_io_channel_iter_get_ctx(i); 5105 struct spdk_bdev_io *bdev_io; 5106 5107 bdev_io = TAILQ_FIRST(&ch->queued_resets); 5108 TAILQ_REMOVE(&ch->queued_resets, bdev_io, internal.link); 5109 bdev_io_submit_reset(bdev_io); 5110 } 5111 5112 static void 5113 bdev_reset_freeze_channel(struct spdk_io_channel_iter *i) 5114 { 5115 struct spdk_io_channel *ch; 5116 struct spdk_bdev_channel *channel; 5117 struct spdk_bdev_mgmt_channel *mgmt_channel; 5118 struct spdk_bdev_shared_resource *shared_resource; 5119 bdev_io_tailq_t tmp_queued; 5120 5121 TAILQ_INIT(&tmp_queued); 5122 5123 ch = spdk_io_channel_iter_get_channel(i); 5124 channel = spdk_io_channel_get_ctx(ch); 5125 shared_resource = channel->shared_resource; 5126 mgmt_channel = shared_resource->mgmt_ch; 5127 5128 channel->flags |= BDEV_CH_RESET_IN_PROGRESS; 5129 5130 if ((channel->flags & BDEV_CH_QOS_ENABLED) != 0) { 5131 /* The QoS object is always valid and readable while 5132 * the channel flag is set, so the lock here should not 5133 * be necessary. We're not in the fast path though, so 5134 * just take it anyway. 
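 * Swapping the QoS queue into tmp_queued while holding the lock lets those
 * queued I/Os be aborted further below without keeping the mutex held.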
*/ 5135 pthread_mutex_lock(&channel->bdev->internal.mutex); 5136 if (channel->bdev->internal.qos->ch == channel) { 5137 TAILQ_SWAP(&channel->bdev->internal.qos->queued, &tmp_queued, spdk_bdev_io, internal.link); 5138 } 5139 pthread_mutex_unlock(&channel->bdev->internal.mutex); 5140 } 5141 5142 bdev_abort_all_queued_io(&shared_resource->nomem_io, channel); 5143 bdev_abort_all_buf_io(&mgmt_channel->need_buf_small, channel); 5144 bdev_abort_all_buf_io(&mgmt_channel->need_buf_large, channel); 5145 bdev_abort_all_queued_io(&tmp_queued, channel); 5146 5147 spdk_for_each_channel_continue(i, 0); 5148 } 5149 5150 static void 5151 bdev_start_reset(void *ctx) 5152 { 5153 struct spdk_bdev_channel *ch = ctx; 5154 5155 spdk_for_each_channel(__bdev_to_io_dev(ch->bdev), bdev_reset_freeze_channel, 5156 ch, bdev_reset_dev); 5157 } 5158 5159 static void 5160 bdev_channel_start_reset(struct spdk_bdev_channel *ch) 5161 { 5162 struct spdk_bdev *bdev = ch->bdev; 5163 5164 assert(!TAILQ_EMPTY(&ch->queued_resets)); 5165 5166 pthread_mutex_lock(&bdev->internal.mutex); 5167 if (bdev->internal.reset_in_progress == NULL) { 5168 bdev->internal.reset_in_progress = TAILQ_FIRST(&ch->queued_resets); 5169 /* 5170 * Take a channel reference for the target bdev for the life of this 5171 * reset. This guards against the channel getting destroyed while 5172 * spdk_for_each_channel() calls related to this reset IO are in 5173 * progress. We will release the reference when this reset is 5174 * completed. 5175 */ 5176 bdev->internal.reset_in_progress->u.reset.ch_ref = spdk_get_io_channel(__bdev_to_io_dev(bdev)); 5177 bdev_start_reset(ch); 5178 } 5179 pthread_mutex_unlock(&bdev->internal.mutex); 5180 } 5181 5182 int 5183 spdk_bdev_reset(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5184 spdk_bdev_io_completion_cb cb, void *cb_arg) 5185 { 5186 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5187 struct spdk_bdev_io *bdev_io; 5188 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5189 5190 bdev_io = bdev_channel_get_io(channel); 5191 if (!bdev_io) { 5192 return -ENOMEM; 5193 } 5194 5195 bdev_io->internal.ch = channel; 5196 bdev_io->internal.desc = desc; 5197 bdev_io->internal.submit_tsc = spdk_get_ticks(); 5198 bdev_io->type = SPDK_BDEV_IO_TYPE_RESET; 5199 bdev_io->u.reset.ch_ref = NULL; 5200 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5201 5202 pthread_mutex_lock(&bdev->internal.mutex); 5203 TAILQ_INSERT_TAIL(&channel->queued_resets, bdev_io, internal.link); 5204 pthread_mutex_unlock(&bdev->internal.mutex); 5205 5206 TAILQ_INSERT_TAIL(&bdev_io->internal.ch->io_submitted, bdev_io, 5207 internal.ch_link); 5208 5209 bdev_channel_start_reset(channel); 5210 5211 return 0; 5212 } 5213 5214 void 5215 spdk_bdev_get_io_stat(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 5216 struct spdk_bdev_io_stat *stat) 5217 { 5218 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5219 5220 *stat = channel->stat; 5221 } 5222 5223 static void 5224 bdev_get_device_stat_done(struct spdk_io_channel_iter *i, int status) 5225 { 5226 void *io_device = spdk_io_channel_iter_get_io_device(i); 5227 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 5228 5229 bdev_iostat_ctx->cb(__bdev_from_io_dev(io_device), bdev_iostat_ctx->stat, 5230 bdev_iostat_ctx->cb_arg, 0); 5231 free(bdev_iostat_ctx); 5232 } 5233 5234 static void 5235 bdev_get_each_channel_stat(struct spdk_io_channel_iter *i) 5236 { 5237 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx = spdk_io_channel_iter_get_ctx(i); 5238 struct 
spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 5239 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5240 5241 bdev_io_stat_add(bdev_iostat_ctx->stat, &channel->stat); 5242 spdk_for_each_channel_continue(i, 0); 5243 } 5244 5245 void 5246 spdk_bdev_get_device_stat(struct spdk_bdev *bdev, struct spdk_bdev_io_stat *stat, 5247 spdk_bdev_get_device_stat_cb cb, void *cb_arg) 5248 { 5249 struct spdk_bdev_iostat_ctx *bdev_iostat_ctx; 5250 5251 assert(bdev != NULL); 5252 assert(stat != NULL); 5253 assert(cb != NULL); 5254 5255 bdev_iostat_ctx = calloc(1, sizeof(struct spdk_bdev_iostat_ctx)); 5256 if (bdev_iostat_ctx == NULL) { 5257 SPDK_ERRLOG("Unable to allocate memory for spdk_bdev_iostat_ctx\n"); 5258 cb(bdev, stat, cb_arg, -ENOMEM); 5259 return; 5260 } 5261 5262 bdev_iostat_ctx->stat = stat; 5263 bdev_iostat_ctx->cb = cb; 5264 bdev_iostat_ctx->cb_arg = cb_arg; 5265 5266 /* Start with the statistics from previously deleted channels. */ 5267 pthread_mutex_lock(&bdev->internal.mutex); 5268 bdev_io_stat_add(bdev_iostat_ctx->stat, &bdev->internal.stat); 5269 pthread_mutex_unlock(&bdev->internal.mutex); 5270 5271 /* Then iterate and add the statistics from each existing channel. */ 5272 spdk_for_each_channel(__bdev_to_io_dev(bdev), 5273 bdev_get_each_channel_stat, 5274 bdev_iostat_ctx, 5275 bdev_get_device_stat_done); 5276 } 5277 5278 int 5279 spdk_bdev_nvme_admin_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5280 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 5281 spdk_bdev_io_completion_cb cb, void *cb_arg) 5282 { 5283 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5284 struct spdk_bdev_io *bdev_io; 5285 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5286 5287 if (!desc->write) { 5288 return -EBADF; 5289 } 5290 5291 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_ADMIN))) { 5292 return -ENOTSUP; 5293 } 5294 5295 bdev_io = bdev_channel_get_io(channel); 5296 if (!bdev_io) { 5297 return -ENOMEM; 5298 } 5299 5300 bdev_io->internal.ch = channel; 5301 bdev_io->internal.desc = desc; 5302 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_ADMIN; 5303 bdev_io->u.nvme_passthru.cmd = *cmd; 5304 bdev_io->u.nvme_passthru.buf = buf; 5305 bdev_io->u.nvme_passthru.nbytes = nbytes; 5306 bdev_io->u.nvme_passthru.md_buf = NULL; 5307 bdev_io->u.nvme_passthru.md_len = 0; 5308 5309 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5310 5311 bdev_io_submit(bdev_io); 5312 return 0; 5313 } 5314 5315 int 5316 spdk_bdev_nvme_io_passthru(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5317 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, 5318 spdk_bdev_io_completion_cb cb, void *cb_arg) 5319 { 5320 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5321 struct spdk_bdev_io *bdev_io; 5322 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5323 5324 if (!desc->write) { 5325 /* 5326 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 5327 * to easily determine if the command is a read or write, but for now just 5328 * do not allow io_passthru with a read-only descriptor. 
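 * As a consequence, NVMe passthru callers need a descriptor opened for writing
 * even if the command they submit only reads from the namespace.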
5329 */ 5330 return -EBADF; 5331 } 5332 5333 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO))) { 5334 return -ENOTSUP; 5335 } 5336 5337 bdev_io = bdev_channel_get_io(channel); 5338 if (!bdev_io) { 5339 return -ENOMEM; 5340 } 5341 5342 bdev_io->internal.ch = channel; 5343 bdev_io->internal.desc = desc; 5344 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO; 5345 bdev_io->u.nvme_passthru.cmd = *cmd; 5346 bdev_io->u.nvme_passthru.buf = buf; 5347 bdev_io->u.nvme_passthru.nbytes = nbytes; 5348 bdev_io->u.nvme_passthru.md_buf = NULL; 5349 bdev_io->u.nvme_passthru.md_len = 0; 5350 5351 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5352 5353 bdev_io_submit(bdev_io); 5354 return 0; 5355 } 5356 5357 int 5358 spdk_bdev_nvme_io_passthru_md(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch, 5359 const struct spdk_nvme_cmd *cmd, void *buf, size_t nbytes, void *md_buf, size_t md_len, 5360 spdk_bdev_io_completion_cb cb, void *cb_arg) 5361 { 5362 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 5363 struct spdk_bdev_io *bdev_io; 5364 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5365 5366 if (!desc->write) { 5367 /* 5368 * Do not try to parse the NVMe command - we could maybe use bits in the opcode 5369 * to easily determine if the command is a read or write, but for now just 5370 * do not allow io_passthru with a read-only descriptor. 5371 */ 5372 return -EBADF; 5373 } 5374 5375 if (spdk_unlikely(!bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_NVME_IO_MD))) { 5376 return -ENOTSUP; 5377 } 5378 5379 bdev_io = bdev_channel_get_io(channel); 5380 if (!bdev_io) { 5381 return -ENOMEM; 5382 } 5383 5384 bdev_io->internal.ch = channel; 5385 bdev_io->internal.desc = desc; 5386 bdev_io->type = SPDK_BDEV_IO_TYPE_NVME_IO_MD; 5387 bdev_io->u.nvme_passthru.cmd = *cmd; 5388 bdev_io->u.nvme_passthru.buf = buf; 5389 bdev_io->u.nvme_passthru.nbytes = nbytes; 5390 bdev_io->u.nvme_passthru.md_buf = md_buf; 5391 bdev_io->u.nvme_passthru.md_len = md_len; 5392 5393 bdev_io_init(bdev_io, bdev, cb_arg, cb); 5394 5395 bdev_io_submit(bdev_io); 5396 return 0; 5397 } 5398 5399 static void bdev_abort_retry(void *ctx); 5400 static void bdev_abort(struct spdk_bdev_io *parent_io); 5401 5402 static void 5403 bdev_abort_io_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 5404 { 5405 struct spdk_bdev_channel *channel = bdev_io->internal.ch; 5406 struct spdk_bdev_io *parent_io = cb_arg; 5407 struct spdk_bdev_io *bio_to_abort, *tmp_io; 5408 5409 bio_to_abort = bdev_io->u.abort.bio_to_abort; 5410 5411 spdk_bdev_free_io(bdev_io); 5412 5413 if (!success) { 5414 /* Check if the target I/O completed in the meantime. */ 5415 TAILQ_FOREACH(tmp_io, &channel->io_submitted, internal.ch_link) { 5416 if (tmp_io == bio_to_abort) { 5417 break; 5418 } 5419 } 5420 5421 /* If the target I/O still exists, set the parent to failed. 
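 * If it is no longer on io_submitted, it completed on its own in the meantime,
 * so the failed abort attempt is simply ignored.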
		 */
		if (tmp_io != NULL) {
			parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
		}
	}

	parent_io->u.bdev.split_outstanding--;
	if (parent_io->u.bdev.split_outstanding == 0) {
		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
			bdev_abort_retry(parent_io);
		} else {
			bdev_io_complete(parent_io);
		}
	}
}

static int
bdev_abort_io(struct spdk_bdev_desc *desc, struct spdk_bdev_channel *channel,
	      struct spdk_bdev_io *bio_to_abort,
	      spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_io *bdev_io;

	if (bio_to_abort->type == SPDK_BDEV_IO_TYPE_ABORT ||
	    bio_to_abort->type == SPDK_BDEV_IO_TYPE_RESET) {
		/* TODO: Abort reset or abort request. */
		return -ENOTSUP;
	}

	bdev_io = bdev_channel_get_io(channel);
	if (bdev_io == NULL) {
		return -ENOMEM;
	}

	bdev_io->internal.ch = channel;
	bdev_io->internal.desc = desc;
	bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
	bdev_io_init(bdev_io, bdev, cb_arg, cb);

	if (bdev->split_on_optimal_io_boundary && bdev_io_should_split(bio_to_abort)) {
		bdev_io->u.bdev.abort.bio_cb_arg = bio_to_abort;

		/* Parent abort request is not submitted directly, but to manage its
		 * execution add it to the submitted list here.
		 */
		bdev_io->internal.submit_tsc = spdk_get_ticks();
		TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link);

		bdev_abort(bdev_io);

		return 0;
	}

	bdev_io->u.abort.bio_to_abort = bio_to_abort;

	/* Submit the abort request to the underlying bdev module. */
	bdev_io_submit(bdev_io);

	return 0;
}

static uint32_t
_bdev_abort(struct spdk_bdev_io *parent_io)
{
	struct spdk_bdev_desc *desc = parent_io->internal.desc;
	struct spdk_bdev_channel *channel = parent_io->internal.ch;
	void *bio_cb_arg;
	struct spdk_bdev_io *bio_to_abort;
	uint32_t matched_ios;
	int rc;

	bio_cb_arg = parent_io->u.bdev.abort.bio_cb_arg;

	/* matched_ios is returned and will be kept by the caller.
	 *
	 * This function is used in two cases: 1) the same cb_arg is used for
	 * multiple I/Os, and 2) a single large I/O is split into smaller ones.
	 * Incrementing split_outstanding directly here could confuse readers,
	 * especially in the first case.
	 *
	 * Completion of an I/O abort is processed only after the stack unwinds,
	 * hence this trick works as expected.
	 */
	matched_ios = 0;
	parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS;

	TAILQ_FOREACH(bio_to_abort, &channel->io_submitted, internal.ch_link) {
		if (bio_to_abort->internal.caller_ctx != bio_cb_arg) {
			continue;
		}

		if (bio_to_abort->internal.submit_tsc > parent_io->internal.submit_tsc) {
			/* Any I/O which was submitted after this abort command should be excluded.
			 */
			continue;
		}

		rc = bdev_abort_io(desc, channel, bio_to_abort, bdev_abort_io_done, parent_io);
		if (rc != 0) {
			if (rc == -ENOMEM) {
				parent_io->internal.status = SPDK_BDEV_IO_STATUS_NOMEM;
			} else {
				parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
			}
			break;
		}
		matched_ios++;
	}

	return matched_ios;
}

static void
bdev_abort_retry(void *ctx)
{
	struct spdk_bdev_io *parent_io = ctx;
	uint32_t matched_ios;

	matched_ios = _bdev_abort(parent_io);

	if (matched_ios == 0) {
		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
			bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
		} else {
			/* For retry, the case that no target I/O was found is success
			 * because it means the target I/Os completed in the meantime.
			 */
			bdev_io_complete(parent_io);
		}
		return;
	}

	/* Use split_outstanding to manage the progress of aborting I/Os. */
	parent_io->u.bdev.split_outstanding = matched_ios;
}

static void
bdev_abort(struct spdk_bdev_io *parent_io)
{
	uint32_t matched_ios;

	matched_ios = _bdev_abort(parent_io);

	if (matched_ios == 0) {
		if (parent_io->internal.status == SPDK_BDEV_IO_STATUS_NOMEM) {
			bdev_queue_io_wait_with_cb(parent_io, bdev_abort_retry);
		} else {
			/* The case where no target I/O was found is a failure. */
			parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED;
			bdev_io_complete(parent_io);
		}
		return;
	}

	/* Use split_outstanding to manage the progress of aborting I/Os. */
	parent_io->u.bdev.split_outstanding = matched_ios;
}

int
spdk_bdev_abort(struct spdk_bdev_desc *desc, struct spdk_io_channel *ch,
		void *bio_cb_arg,
		spdk_bdev_io_completion_cb cb, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch);
	struct spdk_bdev_io *bdev_io;

	if (bio_cb_arg == NULL) {
		return -EINVAL;
	}

	if (!spdk_bdev_io_type_supported(bdev, SPDK_BDEV_IO_TYPE_ABORT)) {
		return -ENOTSUP;
	}

	bdev_io = bdev_channel_get_io(channel);
	if (bdev_io == NULL) {
		return -ENOMEM;
	}

	bdev_io->internal.ch = channel;
	bdev_io->internal.desc = desc;
	bdev_io->internal.submit_tsc = spdk_get_ticks();
	bdev_io->type = SPDK_BDEV_IO_TYPE_ABORT;
	bdev_io_init(bdev_io, bdev, cb_arg, cb);

	bdev_io->u.bdev.abort.bio_cb_arg = bio_cb_arg;

	/* Parent abort request is not submitted directly, but to manage its execution,
	 * add it to the submitted list here.
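	 * The individual abort requests are then generated by bdev_abort(), which walks
	 * this channel's io_submitted list and issues one child abort per I/O whose
	 * caller_ctx matches bio_cb_arg.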
5611 */ 5612 TAILQ_INSERT_TAIL(&channel->io_submitted, bdev_io, internal.ch_link); 5613 5614 bdev_abort(bdev_io); 5615 5616 return 0; 5617 } 5618 5619 int 5620 spdk_bdev_queue_io_wait(struct spdk_bdev *bdev, struct spdk_io_channel *ch, 5621 struct spdk_bdev_io_wait_entry *entry) 5622 { 5623 struct spdk_bdev_channel *channel = spdk_io_channel_get_ctx(ch); 5624 struct spdk_bdev_mgmt_channel *mgmt_ch = channel->shared_resource->mgmt_ch; 5625 5626 if (bdev != entry->bdev) { 5627 SPDK_ERRLOG("bdevs do not match\n"); 5628 return -EINVAL; 5629 } 5630 5631 if (mgmt_ch->per_thread_cache_count > 0) { 5632 SPDK_ERRLOG("Cannot queue io_wait if spdk_bdev_io available in per-thread cache\n"); 5633 return -EINVAL; 5634 } 5635 5636 TAILQ_INSERT_TAIL(&mgmt_ch->io_wait_queue, entry, link); 5637 return 0; 5638 } 5639 5640 static inline void 5641 bdev_io_complete(void *ctx) 5642 { 5643 struct spdk_bdev_io *bdev_io = ctx; 5644 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 5645 uint64_t tsc, tsc_diff; 5646 5647 if (spdk_unlikely(bdev_io->internal.in_submit_request || bdev_io->internal.io_submit_ch)) { 5648 /* 5649 * Send the completion to the thread that originally submitted the I/O, 5650 * which may not be the current thread in the case of QoS. 5651 */ 5652 if (bdev_io->internal.io_submit_ch) { 5653 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 5654 bdev_io->internal.io_submit_ch = NULL; 5655 } 5656 5657 /* 5658 * Defer completion to avoid potential infinite recursion if the 5659 * user's completion callback issues a new I/O. 5660 */ 5661 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 5662 bdev_io_complete, bdev_io); 5663 return; 5664 } 5665 5666 tsc = spdk_get_ticks(); 5667 tsc_diff = tsc - bdev_io->internal.submit_tsc; 5668 spdk_trace_record_tsc(tsc, TRACE_BDEV_IO_DONE, 0, 0, (uintptr_t)bdev_io, 5669 bdev_io->internal.caller_ctx); 5670 5671 TAILQ_REMOVE(&bdev_ch->io_submitted, bdev_io, internal.ch_link); 5672 5673 if (bdev_io->internal.ch->histogram) { 5674 spdk_histogram_data_tally(bdev_io->internal.ch->histogram, tsc_diff); 5675 } 5676 5677 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5678 switch (bdev_io->type) { 5679 case SPDK_BDEV_IO_TYPE_READ: 5680 bdev_io->internal.ch->stat.bytes_read += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5681 bdev_io->internal.ch->stat.num_read_ops++; 5682 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 5683 break; 5684 case SPDK_BDEV_IO_TYPE_WRITE: 5685 bdev_io->internal.ch->stat.bytes_written += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5686 bdev_io->internal.ch->stat.num_write_ops++; 5687 bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 5688 break; 5689 case SPDK_BDEV_IO_TYPE_UNMAP: 5690 bdev_io->internal.ch->stat.bytes_unmapped += bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5691 bdev_io->internal.ch->stat.num_unmap_ops++; 5692 bdev_io->internal.ch->stat.unmap_latency_ticks += tsc_diff; 5693 break; 5694 case SPDK_BDEV_IO_TYPE_ZCOPY: 5695 /* Track the data in the start phase only */ 5696 if (bdev_io->u.bdev.zcopy.start) { 5697 if (bdev_io->u.bdev.zcopy.populate) { 5698 bdev_io->internal.ch->stat.bytes_read += 5699 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5700 bdev_io->internal.ch->stat.num_read_ops++; 5701 bdev_io->internal.ch->stat.read_latency_ticks += tsc_diff; 5702 } else { 5703 bdev_io->internal.ch->stat.bytes_written += 5704 bdev_io->u.bdev.num_blocks * bdev_io->bdev->blocklen; 5705 bdev_io->internal.ch->stat.num_write_ops++; 5706 
bdev_io->internal.ch->stat.write_latency_ticks += tsc_diff; 5707 } 5708 } 5709 break; 5710 default: 5711 break; 5712 } 5713 } 5714 5715 #ifdef SPDK_CONFIG_VTUNE 5716 uint64_t now_tsc = spdk_get_ticks(); 5717 if (now_tsc > (bdev_io->internal.ch->start_tsc + bdev_io->internal.ch->interval_tsc)) { 5718 uint64_t data[5]; 5719 5720 data[0] = bdev_io->internal.ch->stat.num_read_ops - bdev_io->internal.ch->prev_stat.num_read_ops; 5721 data[1] = bdev_io->internal.ch->stat.bytes_read - bdev_io->internal.ch->prev_stat.bytes_read; 5722 data[2] = bdev_io->internal.ch->stat.num_write_ops - bdev_io->internal.ch->prev_stat.num_write_ops; 5723 data[3] = bdev_io->internal.ch->stat.bytes_written - bdev_io->internal.ch->prev_stat.bytes_written; 5724 data[4] = bdev_io->bdev->fn_table->get_spin_time ? 5725 bdev_io->bdev->fn_table->get_spin_time(spdk_bdev_io_get_io_channel(bdev_io)) : 0; 5726 5727 __itt_metadata_add(g_bdev_mgr.domain, __itt_null, bdev_io->internal.ch->handle, 5728 __itt_metadata_u64, 5, data); 5729 5730 bdev_io->internal.ch->prev_stat = bdev_io->internal.ch->stat; 5731 bdev_io->internal.ch->start_tsc = now_tsc; 5732 } 5733 #endif 5734 5735 assert(bdev_io->internal.cb != NULL); 5736 assert(spdk_get_thread() == spdk_bdev_io_get_thread(bdev_io)); 5737 5738 bdev_io->internal.cb(bdev_io, bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS, 5739 bdev_io->internal.caller_ctx); 5740 } 5741 5742 static void bdev_destroy_cb(void *io_device); 5743 5744 static void 5745 bdev_reset_complete(struct spdk_io_channel_iter *i, int status) 5746 { 5747 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 5748 struct spdk_bdev *bdev = bdev_io->bdev; 5749 5750 if (bdev_io->u.reset.ch_ref != NULL) { 5751 spdk_put_io_channel(bdev_io->u.reset.ch_ref); 5752 bdev_io->u.reset.ch_ref = NULL; 5753 } 5754 5755 bdev_io_complete(bdev_io); 5756 5757 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && 5758 TAILQ_EMPTY(&bdev->internal.open_descs)) { 5759 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 5760 } 5761 } 5762 5763 static void 5764 bdev_unfreeze_channel(struct spdk_io_channel_iter *i) 5765 { 5766 struct spdk_bdev_io *bdev_io = spdk_io_channel_iter_get_ctx(i); 5767 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 5768 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 5769 struct spdk_bdev_io *queued_reset; 5770 5771 ch->flags &= ~BDEV_CH_RESET_IN_PROGRESS; 5772 while (!TAILQ_EMPTY(&ch->queued_resets)) { 5773 queued_reset = TAILQ_FIRST(&ch->queued_resets); 5774 TAILQ_REMOVE(&ch->queued_resets, queued_reset, internal.link); 5775 spdk_bdev_io_complete(queued_reset, bdev_io->internal.status); 5776 } 5777 5778 spdk_for_each_channel_continue(i, 0); 5779 } 5780 5781 void 5782 spdk_bdev_io_complete(struct spdk_bdev_io *bdev_io, enum spdk_bdev_io_status status) 5783 { 5784 struct spdk_bdev *bdev = bdev_io->bdev; 5785 struct spdk_bdev_channel *bdev_ch = bdev_io->internal.ch; 5786 struct spdk_bdev_shared_resource *shared_resource = bdev_ch->shared_resource; 5787 5788 bdev_io->internal.status = status; 5789 5790 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_RESET)) { 5791 bool unlock_channels = false; 5792 5793 if (status == SPDK_BDEV_IO_STATUS_NOMEM) { 5794 SPDK_ERRLOG("NOMEM returned for reset\n"); 5795 } 5796 pthread_mutex_lock(&bdev->internal.mutex); 5797 if (bdev_io == bdev->internal.reset_in_progress) { 5798 bdev->internal.reset_in_progress = NULL; 5799 unlock_channels = true; 5800 } 5801 pthread_mutex_unlock(&bdev->internal.mutex); 5802 5803 if 
(unlock_channels) { 5804 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unfreeze_channel, 5805 bdev_io, bdev_reset_complete); 5806 return; 5807 } 5808 } else { 5809 if (spdk_unlikely(bdev_io->internal.orig_iovcnt != 0)) { 5810 _bdev_io_push_bounce_data_buffer(bdev_io, _bdev_io_complete_push_bounce_done); 5811 /* bdev IO will be completed in the callback */ 5812 return; 5813 } 5814 5815 _bdev_io_decrement_outstanding(bdev_ch, shared_resource); 5816 if (spdk_unlikely(_bdev_io_handle_no_mem(bdev_io))) { 5817 return; 5818 } 5819 } 5820 5821 bdev_io_complete(bdev_io); 5822 } 5823 5824 void 5825 spdk_bdev_io_complete_scsi_status(struct spdk_bdev_io *bdev_io, enum spdk_scsi_status sc, 5826 enum spdk_scsi_sense sk, uint8_t asc, uint8_t ascq) 5827 { 5828 if (sc == SPDK_SCSI_STATUS_GOOD) { 5829 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5830 } else { 5831 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SCSI_ERROR; 5832 bdev_io->internal.error.scsi.sc = sc; 5833 bdev_io->internal.error.scsi.sk = sk; 5834 bdev_io->internal.error.scsi.asc = asc; 5835 bdev_io->internal.error.scsi.ascq = ascq; 5836 } 5837 5838 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 5839 } 5840 5841 void 5842 spdk_bdev_io_get_scsi_status(const struct spdk_bdev_io *bdev_io, 5843 int *sc, int *sk, int *asc, int *ascq) 5844 { 5845 assert(sc != NULL); 5846 assert(sk != NULL); 5847 assert(asc != NULL); 5848 assert(ascq != NULL); 5849 5850 switch (bdev_io->internal.status) { 5851 case SPDK_BDEV_IO_STATUS_SUCCESS: 5852 *sc = SPDK_SCSI_STATUS_GOOD; 5853 *sk = SPDK_SCSI_SENSE_NO_SENSE; 5854 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 5855 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 5856 break; 5857 case SPDK_BDEV_IO_STATUS_NVME_ERROR: 5858 spdk_scsi_nvme_translate(bdev_io, sc, sk, asc, ascq); 5859 break; 5860 case SPDK_BDEV_IO_STATUS_SCSI_ERROR: 5861 *sc = bdev_io->internal.error.scsi.sc; 5862 *sk = bdev_io->internal.error.scsi.sk; 5863 *asc = bdev_io->internal.error.scsi.asc; 5864 *ascq = bdev_io->internal.error.scsi.ascq; 5865 break; 5866 default: 5867 *sc = SPDK_SCSI_STATUS_CHECK_CONDITION; 5868 *sk = SPDK_SCSI_SENSE_ABORTED_COMMAND; 5869 *asc = SPDK_SCSI_ASC_NO_ADDITIONAL_SENSE; 5870 *ascq = SPDK_SCSI_ASCQ_CAUSE_NOT_REPORTABLE; 5871 break; 5872 } 5873 } 5874 5875 void 5876 spdk_bdev_io_complete_aio_status(struct spdk_bdev_io *bdev_io, int aio_result) 5877 { 5878 if (aio_result == 0) { 5879 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5880 } else { 5881 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_AIO_ERROR; 5882 } 5883 5884 bdev_io->internal.error.aio_result = aio_result; 5885 5886 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 5887 } 5888 5889 void 5890 spdk_bdev_io_get_aio_status(const struct spdk_bdev_io *bdev_io, int *aio_result) 5891 { 5892 assert(aio_result != NULL); 5893 5894 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_AIO_ERROR) { 5895 *aio_result = bdev_io->internal.error.aio_result; 5896 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5897 *aio_result = 0; 5898 } else { 5899 *aio_result = -EIO; 5900 } 5901 } 5902 5903 void 5904 spdk_bdev_io_complete_nvme_status(struct spdk_bdev_io *bdev_io, uint32_t cdw0, int sct, int sc) 5905 { 5906 if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_SUCCESS) { 5907 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 5908 } else if (sct == SPDK_NVME_SCT_GENERIC && sc == SPDK_NVME_SC_ABORTED_BY_REQUEST) { 5909 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_ABORTED; 5910 } else { 5911 
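/* Any other status type/code combination is reported as a generic NVMe error. */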
bdev_io->internal.status = SPDK_BDEV_IO_STATUS_NVME_ERROR; 5912 } 5913 5914 bdev_io->internal.error.nvme.cdw0 = cdw0; 5915 bdev_io->internal.error.nvme.sct = sct; 5916 bdev_io->internal.error.nvme.sc = sc; 5917 5918 spdk_bdev_io_complete(bdev_io, bdev_io->internal.status); 5919 } 5920 5921 void 5922 spdk_bdev_io_get_nvme_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, int *sct, int *sc) 5923 { 5924 assert(sct != NULL); 5925 assert(sc != NULL); 5926 assert(cdw0 != NULL); 5927 5928 if (spdk_unlikely(bdev_io->type == SPDK_BDEV_IO_TYPE_ABORT)) { 5929 *sct = SPDK_NVME_SCT_GENERIC; 5930 *sc = SPDK_NVME_SC_SUCCESS; 5931 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5932 *cdw0 = 0; 5933 } else { 5934 *cdw0 = 1U; 5935 } 5936 return; 5937 } 5938 5939 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 5940 *sct = bdev_io->internal.error.nvme.sct; 5941 *sc = bdev_io->internal.error.nvme.sc; 5942 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5943 *sct = SPDK_NVME_SCT_GENERIC; 5944 *sc = SPDK_NVME_SC_SUCCESS; 5945 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_ABORTED) { 5946 *sct = SPDK_NVME_SCT_GENERIC; 5947 *sc = SPDK_NVME_SC_ABORTED_BY_REQUEST; 5948 } else { 5949 *sct = SPDK_NVME_SCT_GENERIC; 5950 *sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5951 } 5952 5953 *cdw0 = bdev_io->internal.error.nvme.cdw0; 5954 } 5955 5956 void 5957 spdk_bdev_io_get_nvme_fused_status(const struct spdk_bdev_io *bdev_io, uint32_t *cdw0, 5958 int *first_sct, int *first_sc, int *second_sct, int *second_sc) 5959 { 5960 assert(first_sct != NULL); 5961 assert(first_sc != NULL); 5962 assert(second_sct != NULL); 5963 assert(second_sc != NULL); 5964 assert(cdw0 != NULL); 5965 5966 if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_NVME_ERROR) { 5967 if (bdev_io->internal.error.nvme.sct == SPDK_NVME_SCT_MEDIA_ERROR && 5968 bdev_io->internal.error.nvme.sc == SPDK_NVME_SC_COMPARE_FAILURE) { 5969 *first_sct = bdev_io->internal.error.nvme.sct; 5970 *first_sc = bdev_io->internal.error.nvme.sc; 5971 *second_sct = SPDK_NVME_SCT_GENERIC; 5972 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 5973 } else { 5974 *first_sct = SPDK_NVME_SCT_GENERIC; 5975 *first_sc = SPDK_NVME_SC_SUCCESS; 5976 *second_sct = bdev_io->internal.error.nvme.sct; 5977 *second_sc = bdev_io->internal.error.nvme.sc; 5978 } 5979 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_SUCCESS) { 5980 *first_sct = SPDK_NVME_SCT_GENERIC; 5981 *first_sc = SPDK_NVME_SC_SUCCESS; 5982 *second_sct = SPDK_NVME_SCT_GENERIC; 5983 *second_sc = SPDK_NVME_SC_SUCCESS; 5984 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_FIRST_FUSED_FAILED) { 5985 *first_sct = SPDK_NVME_SCT_GENERIC; 5986 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5987 *second_sct = SPDK_NVME_SCT_GENERIC; 5988 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 5989 } else if (bdev_io->internal.status == SPDK_BDEV_IO_STATUS_MISCOMPARE) { 5990 *first_sct = SPDK_NVME_SCT_MEDIA_ERROR; 5991 *first_sc = SPDK_NVME_SC_COMPARE_FAILURE; 5992 *second_sct = SPDK_NVME_SCT_GENERIC; 5993 *second_sc = SPDK_NVME_SC_ABORTED_FAILED_FUSED; 5994 } else { 5995 *first_sct = SPDK_NVME_SCT_GENERIC; 5996 *first_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5997 *second_sct = SPDK_NVME_SCT_GENERIC; 5998 *second_sc = SPDK_NVME_SC_INTERNAL_DEVICE_ERROR; 5999 } 6000 6001 *cdw0 = bdev_io->internal.error.nvme.cdw0; 6002 } 6003 6004 struct spdk_thread * 6005 spdk_bdev_io_get_thread(struct spdk_bdev_io *bdev_io) 6006 { 6007 return 
spdk_io_channel_get_thread(bdev_io->internal.ch->channel); 6008 } 6009 6010 struct spdk_io_channel * 6011 spdk_bdev_io_get_io_channel(struct spdk_bdev_io *bdev_io) 6012 { 6013 return bdev_io->internal.ch->channel; 6014 } 6015 6016 static int 6017 bdev_register(struct spdk_bdev *bdev) 6018 { 6019 char *bdev_name; 6020 char uuid[SPDK_UUID_STRING_LEN]; 6021 int ret; 6022 6023 assert(bdev->module != NULL); 6024 6025 if (!bdev->name) { 6026 SPDK_ERRLOG("Bdev name is NULL\n"); 6027 return -EINVAL; 6028 } 6029 6030 if (!strlen(bdev->name)) { 6031 SPDK_ERRLOG("Bdev name must not be an empty string\n"); 6032 return -EINVAL; 6033 } 6034 6035 /* Users often register their own I/O devices using the bdev name. In 6036 * order to avoid conflicts, prepend bdev_. */ 6037 bdev_name = spdk_sprintf_alloc("bdev_%s", bdev->name); 6038 if (!bdev_name) { 6039 SPDK_ERRLOG("Unable to allocate memory for internal bdev name.\n"); 6040 return -ENOMEM; 6041 } 6042 6043 bdev->internal.status = SPDK_BDEV_STATUS_READY; 6044 bdev->internal.measured_queue_depth = UINT64_MAX; 6045 bdev->internal.claim_module = NULL; 6046 bdev->internal.qd_poller = NULL; 6047 bdev->internal.qos = NULL; 6048 6049 TAILQ_INIT(&bdev->internal.open_descs); 6050 TAILQ_INIT(&bdev->internal.locked_ranges); 6051 TAILQ_INIT(&bdev->internal.pending_locked_ranges); 6052 TAILQ_INIT(&bdev->aliases); 6053 6054 ret = bdev_name_add(&bdev->internal.bdev_name, bdev, bdev->name); 6055 if (ret != 0) { 6056 free(bdev_name); 6057 return ret; 6058 } 6059 6060 /* If the user didn't specify a uuid, generate one. */ 6061 if (spdk_mem_all_zero(&bdev->uuid, sizeof(bdev->uuid))) { 6062 spdk_uuid_generate(&bdev->uuid); 6063 } 6064 6065 /* Add the UUID alias only if it's different than the name */ 6066 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 6067 if (strcmp(bdev->name, uuid) != 0) { 6068 ret = spdk_bdev_alias_add(bdev, uuid); 6069 if (ret != 0) { 6070 SPDK_ERRLOG("Unable to add uuid:%s alias for bdev %s\n", uuid, bdev->name); 6071 bdev_name_del(&bdev->internal.bdev_name); 6072 free(bdev_name); 6073 return ret; 6074 } 6075 } 6076 6077 if (spdk_bdev_get_buf_align(bdev) > 1) { 6078 if (bdev->split_on_optimal_io_boundary) { 6079 bdev->optimal_io_boundary = spdk_min(bdev->optimal_io_boundary, 6080 SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen); 6081 } else { 6082 bdev->split_on_optimal_io_boundary = true; 6083 bdev->optimal_io_boundary = SPDK_BDEV_LARGE_BUF_MAX_SIZE / bdev->blocklen; 6084 } 6085 } 6086 6087 /* If the user didn't specify a write unit size, set it to one. 
*/ 6088 if (bdev->write_unit_size == 0) { 6089 bdev->write_unit_size = 1; 6090 } 6091 6092 /* Set ACWU value to 1 if bdev module did not set it (does not support it natively) */ 6093 if (bdev->acwu == 0) { 6094 bdev->acwu = 1; 6095 } 6096 6097 if (bdev->phys_blocklen == 0) { 6098 bdev->phys_blocklen = spdk_bdev_get_data_block_size(bdev); 6099 } 6100 6101 bdev->internal.reset_in_progress = NULL; 6102 6103 spdk_io_device_register(__bdev_to_io_dev(bdev), 6104 bdev_channel_create, bdev_channel_destroy, 6105 sizeof(struct spdk_bdev_channel), 6106 bdev_name); 6107 6108 free(bdev_name); 6109 6110 pthread_mutex_init(&bdev->internal.mutex, NULL); 6111 6112 SPDK_DEBUGLOG(bdev, "Inserting bdev %s into list\n", bdev->name); 6113 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdevs, bdev, internal.link); 6114 6115 return 0; 6116 } 6117 6118 static void 6119 bdev_destroy_cb(void *io_device) 6120 { 6121 int rc; 6122 struct spdk_bdev *bdev; 6123 spdk_bdev_unregister_cb cb_fn; 6124 void *cb_arg; 6125 6126 bdev = __bdev_from_io_dev(io_device); 6127 cb_fn = bdev->internal.unregister_cb; 6128 cb_arg = bdev->internal.unregister_ctx; 6129 6130 pthread_mutex_destroy(&bdev->internal.mutex); 6131 free(bdev->internal.qos); 6132 6133 rc = bdev->fn_table->destruct(bdev->ctxt); 6134 if (rc < 0) { 6135 SPDK_ERRLOG("destruct failed\n"); 6136 } 6137 if (rc <= 0 && cb_fn != NULL) { 6138 cb_fn(cb_arg, rc); 6139 } 6140 } 6141 6142 static void 6143 bdev_register_finished(void *arg) 6144 { 6145 struct spdk_bdev *bdev = arg; 6146 6147 spdk_notify_send("bdev_register", spdk_bdev_get_name(bdev)); 6148 } 6149 6150 int 6151 spdk_bdev_register(struct spdk_bdev *bdev) 6152 { 6153 int rc = bdev_register(bdev); 6154 6155 if (rc == 0) { 6156 /* Examine configuration before initializing I/O */ 6157 bdev_examine(bdev); 6158 6159 spdk_bdev_wait_for_examine(bdev_register_finished, bdev); 6160 } 6161 6162 return rc; 6163 } 6164 6165 void 6166 spdk_bdev_destruct_done(struct spdk_bdev *bdev, int bdeverrno) 6167 { 6168 if (bdev->internal.unregister_cb != NULL) { 6169 bdev->internal.unregister_cb(bdev->internal.unregister_ctx, bdeverrno); 6170 } 6171 } 6172 6173 static void 6174 _remove_notify(void *arg) 6175 { 6176 struct spdk_bdev_desc *desc = arg; 6177 6178 pthread_mutex_lock(&desc->mutex); 6179 desc->refs--; 6180 6181 if (!desc->closed) { 6182 pthread_mutex_unlock(&desc->mutex); 6183 desc->callback.event_fn(SPDK_BDEV_EVENT_REMOVE, desc->bdev, desc->callback.ctx); 6184 return; 6185 } else if (0 == desc->refs) { 6186 /* This descriptor was closed after this remove_notify message was sent. 6187 * spdk_bdev_close() could not free the descriptor since this message was 6188 * in flight, so we free it now using bdev_desc_free(). 6189 */ 6190 pthread_mutex_unlock(&desc->mutex); 6191 bdev_desc_free(desc); 6192 return; 6193 } 6194 pthread_mutex_unlock(&desc->mutex); 6195 } 6196 6197 /* Must be called while holding g_bdev_mgr.mutex and bdev->internal.mutex. 6198 * returns: 0 - bdev removed and ready to be destructed. 6199 * -EBUSY - bdev can't be destructed yet. */ 6200 static int 6201 bdev_unregister_unsafe(struct spdk_bdev *bdev) 6202 { 6203 struct spdk_bdev_desc *desc, *tmp; 6204 int rc = 0; 6205 char uuid[SPDK_UUID_STRING_LEN]; 6206 6207 /* Notify each descriptor about hotremoval */ 6208 TAILQ_FOREACH_SAFE(desc, &bdev->internal.open_descs, link, tmp) { 6209 rc = -EBUSY; 6210 pthread_mutex_lock(&desc->mutex); 6211 /* 6212 * Defer invocation of the event_cb to a separate message that will 6213 * run later on its thread. 
This ensures this context unwinds and 6214 * we don't recursively unregister this bdev again if the event_cb 6215 * immediately closes its descriptor. 6216 */ 6217 desc->refs++; 6218 spdk_thread_send_msg(desc->thread, _remove_notify, desc); 6219 pthread_mutex_unlock(&desc->mutex); 6220 } 6221 6222 /* If there are no descriptors, proceed removing the bdev */ 6223 if (rc == 0) { 6224 TAILQ_REMOVE(&g_bdev_mgr.bdevs, bdev, internal.link); 6225 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list done\n", bdev->name); 6226 6227 /* Delete the name and the UUID alias */ 6228 spdk_uuid_fmt_lower(uuid, sizeof(uuid), &bdev->uuid); 6229 bdev_name_del_unsafe(&bdev->internal.bdev_name); 6230 bdev_alias_del(bdev, uuid, bdev_name_del_unsafe); 6231 6232 spdk_notify_send("bdev_unregister", spdk_bdev_get_name(bdev)); 6233 6234 if (bdev->internal.reset_in_progress != NULL) { 6235 /* If reset is in progress, let the completion callback for reset 6236 * unregister the bdev. 6237 */ 6238 rc = -EBUSY; 6239 } 6240 } 6241 6242 return rc; 6243 } 6244 6245 static void 6246 bdev_unregister_abort_channel(struct spdk_io_channel_iter *i) 6247 { 6248 struct spdk_io_channel *io_ch = spdk_io_channel_iter_get_channel(i); 6249 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(io_ch); 6250 6251 bdev_channel_abort_queued_ios(bdev_ch); 6252 spdk_for_each_channel_continue(i, 0); 6253 } 6254 6255 static void 6256 bdev_unregister(struct spdk_io_channel_iter *i, int status) 6257 { 6258 struct spdk_bdev *bdev = spdk_io_channel_iter_get_ctx(i); 6259 int rc; 6260 6261 pthread_mutex_lock(&g_bdev_mgr.mutex); 6262 pthread_mutex_lock(&bdev->internal.mutex); 6263 /* 6264 * Set the status to REMOVING after completing to abort channels. Otherwise, 6265 * the last spdk_bdev_close() may call spdk_io_device_unregister() while 6266 * spdk_for_each_channel() is executed and spdk_io_device_unregister() may fail. 6267 */ 6268 bdev->internal.status = SPDK_BDEV_STATUS_REMOVING; 6269 rc = bdev_unregister_unsafe(bdev); 6270 pthread_mutex_unlock(&bdev->internal.mutex); 6271 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6272 6273 if (rc == 0) { 6274 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6275 } 6276 } 6277 6278 void 6279 spdk_bdev_unregister(struct spdk_bdev *bdev, spdk_bdev_unregister_cb cb_fn, void *cb_arg) 6280 { 6281 struct spdk_thread *thread; 6282 6283 SPDK_DEBUGLOG(bdev, "Removing bdev %s from list\n", bdev->name); 6284 6285 thread = spdk_get_thread(); 6286 if (!thread) { 6287 /* The user called this from a non-SPDK thread. 
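 * Unregistering requires iterating the bdev's channels from an SPDK thread, so report -ENOTSUP.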
*/ 6288 if (cb_fn != NULL) { 6289 cb_fn(cb_arg, -ENOTSUP); 6290 } 6291 return; 6292 } 6293 6294 pthread_mutex_lock(&g_bdev_mgr.mutex); 6295 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 6296 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 6297 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6298 if (cb_fn) { 6299 cb_fn(cb_arg, -EBUSY); 6300 } 6301 return; 6302 } 6303 6304 pthread_mutex_lock(&bdev->internal.mutex); 6305 bdev->internal.status = SPDK_BDEV_STATUS_UNREGISTERING; 6306 bdev->internal.unregister_cb = cb_fn; 6307 bdev->internal.unregister_ctx = cb_arg; 6308 pthread_mutex_unlock(&bdev->internal.mutex); 6309 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6310 6311 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6312 bdev_unregister_abort_channel, 6313 bdev, 6314 bdev_unregister); 6315 } 6316 6317 static void 6318 _tmp_bdev_event_cb(enum spdk_bdev_event_type type, struct spdk_bdev *bdev, void *ctx) 6319 { 6320 SPDK_NOTICELOG("Unexpected event type: %d\n", type); 6321 } 6322 6323 int 6324 spdk_bdev_unregister_by_name(const char *bdev_name, struct spdk_bdev_module *module, 6325 spdk_bdev_unregister_cb cb_fn, void *cb_arg) 6326 { 6327 struct spdk_bdev_desc *desc; 6328 struct spdk_bdev *bdev; 6329 int rc; 6330 6331 rc = spdk_bdev_open_ext(bdev_name, false, _tmp_bdev_event_cb, NULL, &desc); 6332 if (rc != 0) { 6333 SPDK_ERRLOG("Failed to open bdev with name: %s\n", bdev_name); 6334 return rc; 6335 } 6336 6337 bdev = spdk_bdev_desc_get_bdev(desc); 6338 6339 if (bdev->module != module) { 6340 spdk_bdev_close(desc); 6341 SPDK_ERRLOG("Bdev %s was not registered by the specified module.\n", 6342 bdev_name); 6343 return -ENODEV; 6344 } 6345 6346 spdk_bdev_unregister(bdev, cb_fn, cb_arg); 6347 6348 spdk_bdev_close(desc); 6349 6350 return 0; 6351 } 6352 6353 static int 6354 bdev_start_qos(struct spdk_bdev *bdev) 6355 { 6356 struct set_qos_limit_ctx *ctx; 6357 6358 /* Enable QoS */ 6359 if (bdev->internal.qos && bdev->internal.qos->thread == NULL) { 6360 ctx = calloc(1, sizeof(*ctx)); 6361 if (ctx == NULL) { 6362 SPDK_ERRLOG("Failed to allocate memory for QoS context\n"); 6363 return -ENOMEM; 6364 } 6365 ctx->bdev = bdev; 6366 spdk_for_each_channel(__bdev_to_io_dev(bdev), 6367 bdev_enable_qos_msg, ctx, 6368 bdev_enable_qos_done); 6369 } 6370 6371 return 0; 6372 } 6373 6374 static int 6375 bdev_open(struct spdk_bdev *bdev, bool write, struct spdk_bdev_desc *desc) 6376 { 6377 struct spdk_thread *thread; 6378 int rc = 0; 6379 6380 thread = spdk_get_thread(); 6381 if (!thread) { 6382 SPDK_ERRLOG("Cannot open bdev from non-SPDK thread.\n"); 6383 return -ENOTSUP; 6384 } 6385 6386 SPDK_DEBUGLOG(bdev, "Opening descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 6387 spdk_get_thread()); 6388 6389 desc->bdev = bdev; 6390 desc->thread = thread; 6391 desc->write = write; 6392 6393 pthread_mutex_lock(&bdev->internal.mutex); 6394 if (bdev->internal.status == SPDK_BDEV_STATUS_UNREGISTERING || 6395 bdev->internal.status == SPDK_BDEV_STATUS_REMOVING) { 6396 pthread_mutex_unlock(&bdev->internal.mutex); 6397 return -ENODEV; 6398 } 6399 6400 if (write && bdev->internal.claim_module) { 6401 SPDK_ERRLOG("Could not open %s - %s module already claimed it\n", 6402 bdev->name, bdev->internal.claim_module->name); 6403 pthread_mutex_unlock(&bdev->internal.mutex); 6404 return -EPERM; 6405 } 6406 6407 rc = bdev_start_qos(bdev); 6408 if (rc != 0) { 6409 SPDK_ERRLOG("Failed to start QoS on bdev %s\n", bdev->name); 6410 pthread_mutex_unlock(&bdev->internal.mutex); 6411 return rc; 6412 } 6413 6414 
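/* QoS (if configured) is now running; record this descriptor on the bdev's list of openers. */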
TAILQ_INSERT_TAIL(&bdev->internal.open_descs, desc, link); 6415 6416 pthread_mutex_unlock(&bdev->internal.mutex); 6417 6418 return 0; 6419 } 6420 6421 static int 6422 bdev_desc_alloc(struct spdk_bdev *bdev, spdk_bdev_event_cb_t event_cb, void *event_ctx, 6423 struct spdk_bdev_desc **_desc) 6424 { 6425 struct spdk_bdev_desc *desc; 6426 unsigned int event_id; 6427 6428 desc = calloc(1, sizeof(*desc)); 6429 if (desc == NULL) { 6430 SPDK_ERRLOG("Failed to allocate memory for bdev descriptor\n"); 6431 return -ENOMEM; 6432 } 6433 6434 TAILQ_INIT(&desc->pending_media_events); 6435 TAILQ_INIT(&desc->free_media_events); 6436 6437 desc->memory_domains_supported = spdk_bdev_get_memory_domains(bdev, NULL, 0) > 0; 6438 desc->callback.event_fn = event_cb; 6439 desc->callback.ctx = event_ctx; 6440 pthread_mutex_init(&desc->mutex, NULL); 6441 6442 if (bdev->media_events) { 6443 desc->media_events_buffer = calloc(MEDIA_EVENT_POOL_SIZE, 6444 sizeof(*desc->media_events_buffer)); 6445 if (desc->media_events_buffer == NULL) { 6446 SPDK_ERRLOG("Failed to initialize media event pool\n"); 6447 bdev_desc_free(desc); 6448 return -ENOMEM; 6449 } 6450 6451 for (event_id = 0; event_id < MEDIA_EVENT_POOL_SIZE; ++event_id) { 6452 TAILQ_INSERT_TAIL(&desc->free_media_events, 6453 &desc->media_events_buffer[event_id], tailq); 6454 } 6455 } 6456 6457 *_desc = desc; 6458 6459 return 0; 6460 } 6461 6462 int 6463 spdk_bdev_open_ext(const char *bdev_name, bool write, spdk_bdev_event_cb_t event_cb, 6464 void *event_ctx, struct spdk_bdev_desc **_desc) 6465 { 6466 struct spdk_bdev_desc *desc; 6467 struct spdk_bdev *bdev; 6468 int rc; 6469 6470 if (event_cb == NULL) { 6471 SPDK_ERRLOG("Missing event callback function\n"); 6472 return -EINVAL; 6473 } 6474 6475 pthread_mutex_lock(&g_bdev_mgr.mutex); 6476 6477 bdev = bdev_get_by_name(bdev_name); 6478 6479 if (bdev == NULL) { 6480 SPDK_NOTICELOG("Currently unable to find bdev with name: %s\n", bdev_name); 6481 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6482 return -ENODEV; 6483 } 6484 6485 rc = bdev_desc_alloc(bdev, event_cb, event_ctx, &desc); 6486 if (rc != 0) { 6487 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6488 return rc; 6489 } 6490 6491 rc = bdev_open(bdev, write, desc); 6492 if (rc != 0) { 6493 bdev_desc_free(desc); 6494 desc = NULL; 6495 } 6496 6497 *_desc = desc; 6498 6499 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6500 6501 return rc; 6502 } 6503 6504 static void 6505 bdev_close(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc) 6506 { 6507 int rc; 6508 6509 pthread_mutex_lock(&bdev->internal.mutex); 6510 pthread_mutex_lock(&desc->mutex); 6511 6512 TAILQ_REMOVE(&bdev->internal.open_descs, desc, link); 6513 6514 desc->closed = true; 6515 6516 if (0 == desc->refs) { 6517 pthread_mutex_unlock(&desc->mutex); 6518 bdev_desc_free(desc); 6519 } else { 6520 pthread_mutex_unlock(&desc->mutex); 6521 } 6522 6523 /* If no more descriptors, kill QoS channel */ 6524 if (bdev->internal.qos && TAILQ_EMPTY(&bdev->internal.open_descs)) { 6525 SPDK_DEBUGLOG(bdev, "Closed last descriptor for bdev %s on thread %p. Stopping QoS.\n", 6526 bdev->name, spdk_get_thread()); 6527 6528 if (bdev_qos_destroy(bdev)) { 6529 /* There isn't anything we can do to recover here. Just let the 6530 * old QoS poller keep running. The QoS handling won't change 6531 * cores when the user allocates a new channel, but it won't break. */ 6532 SPDK_ERRLOG("Unable to shut down QoS poller. 
It will continue running on the current thread.\n"); 6533 } 6534 } 6535 6536 spdk_bdev_set_qd_sampling_period(bdev, 0); 6537 6538 if (bdev->internal.status == SPDK_BDEV_STATUS_REMOVING && TAILQ_EMPTY(&bdev->internal.open_descs)) { 6539 rc = bdev_unregister_unsafe(bdev); 6540 pthread_mutex_unlock(&bdev->internal.mutex); 6541 6542 if (rc == 0) { 6543 spdk_io_device_unregister(__bdev_to_io_dev(bdev), bdev_destroy_cb); 6544 } 6545 } else { 6546 pthread_mutex_unlock(&bdev->internal.mutex); 6547 } 6548 } 6549 6550 void 6551 spdk_bdev_close(struct spdk_bdev_desc *desc) 6552 { 6553 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 6554 6555 SPDK_DEBUGLOG(bdev, "Closing descriptor %p for bdev %s on thread %p\n", desc, bdev->name, 6556 spdk_get_thread()); 6557 6558 assert(desc->thread == spdk_get_thread()); 6559 6560 spdk_poller_unregister(&desc->io_timeout_poller); 6561 6562 pthread_mutex_lock(&g_bdev_mgr.mutex); 6563 6564 bdev_close(bdev, desc); 6565 6566 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6567 } 6568 6569 int 6570 spdk_bdev_module_claim_bdev(struct spdk_bdev *bdev, struct spdk_bdev_desc *desc, 6571 struct spdk_bdev_module *module) 6572 { 6573 if (bdev->internal.claim_module != NULL) { 6574 SPDK_ERRLOG("bdev %s already claimed by module %s\n", bdev->name, 6575 bdev->internal.claim_module->name); 6576 return -EPERM; 6577 } 6578 6579 if (desc && !desc->write) { 6580 desc->write = true; 6581 } 6582 6583 bdev->internal.claim_module = module; 6584 return 0; 6585 } 6586 6587 void 6588 spdk_bdev_module_release_bdev(struct spdk_bdev *bdev) 6589 { 6590 assert(bdev->internal.claim_module != NULL); 6591 bdev->internal.claim_module = NULL; 6592 } 6593 6594 struct spdk_bdev * 6595 spdk_bdev_desc_get_bdev(struct spdk_bdev_desc *desc) 6596 { 6597 assert(desc != NULL); 6598 return desc->bdev; 6599 } 6600 6601 int 6602 spdk_for_each_bdev(void *ctx, spdk_for_each_bdev_fn fn) 6603 { 6604 struct spdk_bdev *bdev, *tmp; 6605 struct spdk_bdev_desc *desc; 6606 int rc = 0; 6607 6608 assert(fn != NULL); 6609 6610 pthread_mutex_lock(&g_bdev_mgr.mutex); 6611 bdev = spdk_bdev_first(); 6612 while (bdev != NULL) { 6613 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 6614 if (rc != 0) { 6615 break; 6616 } 6617 rc = bdev_open(bdev, false, desc); 6618 if (rc != 0) { 6619 bdev_desc_free(desc); 6620 break; 6621 } 6622 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6623 6624 rc = fn(ctx, bdev); 6625 6626 pthread_mutex_lock(&g_bdev_mgr.mutex); 6627 tmp = spdk_bdev_next(bdev); 6628 bdev_close(bdev, desc); 6629 if (rc != 0) { 6630 break; 6631 } 6632 bdev = tmp; 6633 } 6634 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6635 6636 return rc; 6637 } 6638 6639 int 6640 spdk_for_each_bdev_leaf(void *ctx, spdk_for_each_bdev_fn fn) 6641 { 6642 struct spdk_bdev *bdev, *tmp; 6643 struct spdk_bdev_desc *desc; 6644 int rc = 0; 6645 6646 assert(fn != NULL); 6647 6648 pthread_mutex_lock(&g_bdev_mgr.mutex); 6649 bdev = spdk_bdev_first_leaf(); 6650 while (bdev != NULL) { 6651 rc = bdev_desc_alloc(bdev, _tmp_bdev_event_cb, NULL, &desc); 6652 if (rc != 0) { 6653 break; 6654 } 6655 rc = bdev_open(bdev, false, desc); 6656 if (rc != 0) { 6657 bdev_desc_free(desc); 6658 break; 6659 } 6660 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6661 6662 rc = fn(ctx, bdev); 6663 6664 pthread_mutex_lock(&g_bdev_mgr.mutex); 6665 tmp = spdk_bdev_next_leaf(bdev); 6666 bdev_close(bdev, desc); 6667 if (rc != 0) { 6668 break; 6669 } 6670 bdev = tmp; 6671 } 6672 pthread_mutex_unlock(&g_bdev_mgr.mutex); 6673 6674 return rc; 6675 } 6676 6677 void 6678 
spdk_bdev_io_get_iovec(struct spdk_bdev_io *bdev_io, struct iovec **iovp, int *iovcntp) 6679 { 6680 struct iovec *iovs; 6681 int iovcnt; 6682 6683 if (bdev_io == NULL) { 6684 return; 6685 } 6686 6687 switch (bdev_io->type) { 6688 case SPDK_BDEV_IO_TYPE_READ: 6689 case SPDK_BDEV_IO_TYPE_WRITE: 6690 case SPDK_BDEV_IO_TYPE_ZCOPY: 6691 iovs = bdev_io->u.bdev.iovs; 6692 iovcnt = bdev_io->u.bdev.iovcnt; 6693 break; 6694 default: 6695 iovs = NULL; 6696 iovcnt = 0; 6697 break; 6698 } 6699 6700 if (iovp) { 6701 *iovp = iovs; 6702 } 6703 if (iovcntp) { 6704 *iovcntp = iovcnt; 6705 } 6706 } 6707 6708 void * 6709 spdk_bdev_io_get_md_buf(struct spdk_bdev_io *bdev_io) 6710 { 6711 if (bdev_io == NULL) { 6712 return NULL; 6713 } 6714 6715 if (!spdk_bdev_is_md_separate(bdev_io->bdev)) { 6716 return NULL; 6717 } 6718 6719 if (bdev_io->type == SPDK_BDEV_IO_TYPE_READ || 6720 bdev_io->type == SPDK_BDEV_IO_TYPE_WRITE) { 6721 return bdev_io->u.bdev.md_buf; 6722 } 6723 6724 return NULL; 6725 } 6726 6727 void * 6728 spdk_bdev_io_get_cb_arg(struct spdk_bdev_io *bdev_io) 6729 { 6730 if (bdev_io == NULL) { 6731 assert(false); 6732 return NULL; 6733 } 6734 6735 return bdev_io->internal.caller_ctx; 6736 } 6737 6738 void 6739 spdk_bdev_module_list_add(struct spdk_bdev_module *bdev_module) 6740 { 6741 6742 if (spdk_bdev_module_list_find(bdev_module->name)) { 6743 SPDK_ERRLOG("ERROR: module '%s' already registered.\n", bdev_module->name); 6744 assert(false); 6745 } 6746 6747 /* 6748 * Modules with examine callbacks must be initialized first, so they are 6749 * ready to handle examine callbacks from later modules that will 6750 * register physical bdevs. 6751 */ 6752 if (bdev_module->examine_config != NULL || bdev_module->examine_disk != NULL) { 6753 TAILQ_INSERT_HEAD(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 6754 } else { 6755 TAILQ_INSERT_TAIL(&g_bdev_mgr.bdev_modules, bdev_module, internal.tailq); 6756 } 6757 } 6758 6759 struct spdk_bdev_module * 6760 spdk_bdev_module_list_find(const char *name) 6761 { 6762 struct spdk_bdev_module *bdev_module; 6763 6764 TAILQ_FOREACH(bdev_module, &g_bdev_mgr.bdev_modules, internal.tailq) { 6765 if (strcmp(name, bdev_module->name) == 0) { 6766 break; 6767 } 6768 } 6769 6770 return bdev_module; 6771 } 6772 6773 static void 6774 bdev_write_zero_buffer_next(void *_bdev_io) 6775 { 6776 struct spdk_bdev_io *bdev_io = _bdev_io; 6777 uint64_t num_bytes, num_blocks; 6778 void *md_buf = NULL; 6779 int rc; 6780 6781 num_bytes = spdk_min(_bdev_get_block_size_with_md(bdev_io->bdev) * 6782 bdev_io->u.bdev.split_remaining_num_blocks, 6783 ZERO_BUFFER_SIZE); 6784 num_blocks = num_bytes / _bdev_get_block_size_with_md(bdev_io->bdev); 6785 6786 if (spdk_bdev_is_md_separate(bdev_io->bdev)) { 6787 md_buf = (char *)g_bdev_mgr.zero_buffer + 6788 spdk_bdev_get_block_size(bdev_io->bdev) * num_blocks; 6789 } 6790 6791 rc = bdev_write_blocks_with_md(bdev_io->internal.desc, 6792 spdk_io_channel_from_ctx(bdev_io->internal.ch), 6793 g_bdev_mgr.zero_buffer, md_buf, 6794 bdev_io->u.bdev.split_current_offset_blocks, num_blocks, 6795 bdev_write_zero_buffer_done, bdev_io); 6796 if (rc == 0) { 6797 bdev_io->u.bdev.split_remaining_num_blocks -= num_blocks; 6798 bdev_io->u.bdev.split_current_offset_blocks += num_blocks; 6799 } else if (rc == -ENOMEM) { 6800 bdev_queue_io_wait_with_cb(bdev_io, bdev_write_zero_buffer_next); 6801 } else { 6802 bdev_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6803 bdev_io->internal.cb(bdev_io, false, bdev_io->internal.caller_ctx); 6804 } 6805 } 6806 6807 static void 6808 
bdev_write_zero_buffer_done(struct spdk_bdev_io *bdev_io, bool success, void *cb_arg) 6809 { 6810 struct spdk_bdev_io *parent_io = cb_arg; 6811 6812 spdk_bdev_free_io(bdev_io); 6813 6814 if (!success) { 6815 parent_io->internal.status = SPDK_BDEV_IO_STATUS_FAILED; 6816 parent_io->internal.cb(parent_io, false, parent_io->internal.caller_ctx); 6817 return; 6818 } 6819 6820 if (parent_io->u.bdev.split_remaining_num_blocks == 0) { 6821 parent_io->internal.status = SPDK_BDEV_IO_STATUS_SUCCESS; 6822 parent_io->internal.cb(parent_io, true, parent_io->internal.caller_ctx); 6823 return; 6824 } 6825 6826 bdev_write_zero_buffer_next(parent_io); 6827 } 6828 6829 static void 6830 bdev_set_qos_limit_done(struct set_qos_limit_ctx *ctx, int status) 6831 { 6832 pthread_mutex_lock(&ctx->bdev->internal.mutex); 6833 ctx->bdev->internal.qos_mod_in_progress = false; 6834 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 6835 6836 if (ctx->cb_fn) { 6837 ctx->cb_fn(ctx->cb_arg, status); 6838 } 6839 free(ctx); 6840 } 6841 6842 static void 6843 bdev_disable_qos_done(void *cb_arg) 6844 { 6845 struct set_qos_limit_ctx *ctx = cb_arg; 6846 struct spdk_bdev *bdev = ctx->bdev; 6847 struct spdk_bdev_io *bdev_io; 6848 struct spdk_bdev_qos *qos; 6849 6850 pthread_mutex_lock(&bdev->internal.mutex); 6851 qos = bdev->internal.qos; 6852 bdev->internal.qos = NULL; 6853 pthread_mutex_unlock(&bdev->internal.mutex); 6854 6855 while (!TAILQ_EMPTY(&qos->queued)) { 6856 /* Send queued I/O back to their original thread for resubmission. */ 6857 bdev_io = TAILQ_FIRST(&qos->queued); 6858 TAILQ_REMOVE(&qos->queued, bdev_io, internal.link); 6859 6860 if (bdev_io->internal.io_submit_ch) { 6861 /* 6862 * Channel was changed when sending it to the QoS thread - change it back 6863 * before sending it back to the original thread. 
6864 */ 6865 bdev_io->internal.ch = bdev_io->internal.io_submit_ch; 6866 bdev_io->internal.io_submit_ch = NULL; 6867 } 6868 6869 spdk_thread_send_msg(spdk_bdev_io_get_thread(bdev_io), 6870 _bdev_io_submit, bdev_io); 6871 } 6872 6873 if (qos->thread != NULL) { 6874 spdk_put_io_channel(spdk_io_channel_from_ctx(qos->ch)); 6875 spdk_poller_unregister(&qos->poller); 6876 } 6877 6878 free(qos); 6879 6880 bdev_set_qos_limit_done(ctx, 0); 6881 } 6882 6883 static void 6884 bdev_disable_qos_msg_done(struct spdk_io_channel_iter *i, int status) 6885 { 6886 void *io_device = spdk_io_channel_iter_get_io_device(i); 6887 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 6888 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6889 struct spdk_thread *thread; 6890 6891 pthread_mutex_lock(&bdev->internal.mutex); 6892 thread = bdev->internal.qos->thread; 6893 pthread_mutex_unlock(&bdev->internal.mutex); 6894 6895 if (thread != NULL) { 6896 spdk_thread_send_msg(thread, bdev_disable_qos_done, ctx); 6897 } else { 6898 bdev_disable_qos_done(ctx); 6899 } 6900 } 6901 6902 static void 6903 bdev_disable_qos_msg(struct spdk_io_channel_iter *i) 6904 { 6905 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 6906 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 6907 6908 bdev_ch->flags &= ~BDEV_CH_QOS_ENABLED; 6909 6910 spdk_for_each_channel_continue(i, 0); 6911 } 6912 6913 static void 6914 bdev_update_qos_rate_limit_msg(void *cb_arg) 6915 { 6916 struct set_qos_limit_ctx *ctx = cb_arg; 6917 struct spdk_bdev *bdev = ctx->bdev; 6918 6919 pthread_mutex_lock(&bdev->internal.mutex); 6920 bdev_qos_update_max_quota_per_timeslice(bdev->internal.qos); 6921 pthread_mutex_unlock(&bdev->internal.mutex); 6922 6923 bdev_set_qos_limit_done(ctx, 0); 6924 } 6925 6926 static void 6927 bdev_enable_qos_msg(struct spdk_io_channel_iter *i) 6928 { 6929 void *io_device = spdk_io_channel_iter_get_io_device(i); 6930 struct spdk_bdev *bdev = __bdev_from_io_dev(io_device); 6931 struct spdk_io_channel *ch = spdk_io_channel_iter_get_channel(i); 6932 struct spdk_bdev_channel *bdev_ch = spdk_io_channel_get_ctx(ch); 6933 6934 pthread_mutex_lock(&bdev->internal.mutex); 6935 bdev_enable_qos(bdev, bdev_ch); 6936 pthread_mutex_unlock(&bdev->internal.mutex); 6937 spdk_for_each_channel_continue(i, 0); 6938 } 6939 6940 static void 6941 bdev_enable_qos_done(struct spdk_io_channel_iter *i, int status) 6942 { 6943 struct set_qos_limit_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 6944 6945 bdev_set_qos_limit_done(ctx, status); 6946 } 6947 6948 static void 6949 bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits) 6950 { 6951 int i; 6952 6953 assert(bdev->internal.qos != NULL); 6954 6955 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 6956 if (limits[i] != SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 6957 bdev->internal.qos->rate_limits[i].limit = limits[i]; 6958 6959 if (limits[i] == 0) { 6960 bdev->internal.qos->rate_limits[i].limit = 6961 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED; 6962 } 6963 } 6964 } 6965 } 6966 6967 void 6968 spdk_bdev_set_qos_rate_limits(struct spdk_bdev *bdev, uint64_t *limits, 6969 void (*cb_fn)(void *cb_arg, int status), void *cb_arg) 6970 { 6971 struct set_qos_limit_ctx *ctx; 6972 uint32_t limit_set_complement; 6973 uint64_t min_limit_per_sec; 6974 int i; 6975 bool disable_rate_limit = true; 6976 6977 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 6978 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED) { 6979 continue; 6980 } 6981 6982 if (limits[i] > 0) { 6983 
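/* At least one requested limit is non-zero, so rate limiting stays enabled. */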
disable_rate_limit = false; 6984 } 6985 6986 if (bdev_qos_is_iops_rate_limit(i) == true) { 6987 min_limit_per_sec = SPDK_BDEV_QOS_MIN_IOS_PER_SEC; 6988 } else { 6989 /* Change from megabyte to byte rate limit */ 6990 limits[i] = limits[i] * 1024 * 1024; 6991 min_limit_per_sec = SPDK_BDEV_QOS_MIN_BYTES_PER_SEC; 6992 } 6993 6994 limit_set_complement = limits[i] % min_limit_per_sec; 6995 if (limit_set_complement) { 6996 SPDK_ERRLOG("Requested rate limit %" PRIu64 " is not a multiple of %" PRIu64 "\n", 6997 limits[i], min_limit_per_sec); 6998 limits[i] += min_limit_per_sec - limit_set_complement; 6999 SPDK_ERRLOG("Round up the rate limit to %" PRIu64 "\n", limits[i]); 7000 } 7001 } 7002 7003 ctx = calloc(1, sizeof(*ctx)); 7004 if (ctx == NULL) { 7005 cb_fn(cb_arg, -ENOMEM); 7006 return; 7007 } 7008 7009 ctx->cb_fn = cb_fn; 7010 ctx->cb_arg = cb_arg; 7011 ctx->bdev = bdev; 7012 7013 pthread_mutex_lock(&bdev->internal.mutex); 7014 if (bdev->internal.qos_mod_in_progress) { 7015 pthread_mutex_unlock(&bdev->internal.mutex); 7016 free(ctx); 7017 cb_fn(cb_arg, -EAGAIN); 7018 return; 7019 } 7020 bdev->internal.qos_mod_in_progress = true; 7021 7022 if (disable_rate_limit == true && bdev->internal.qos) { 7023 for (i = 0; i < SPDK_BDEV_QOS_NUM_RATE_LIMIT_TYPES; i++) { 7024 if (limits[i] == SPDK_BDEV_QOS_LIMIT_NOT_DEFINED && 7025 (bdev->internal.qos->rate_limits[i].limit > 0 && 7026 bdev->internal.qos->rate_limits[i].limit != 7027 SPDK_BDEV_QOS_LIMIT_NOT_DEFINED)) { 7028 disable_rate_limit = false; 7029 break; 7030 } 7031 } 7032 } 7033 7034 if (disable_rate_limit == false) { 7035 if (bdev->internal.qos == NULL) { 7036 bdev->internal.qos = calloc(1, sizeof(*bdev->internal.qos)); 7037 if (!bdev->internal.qos) { 7038 pthread_mutex_unlock(&bdev->internal.mutex); 7039 SPDK_ERRLOG("Unable to allocate memory for QoS tracking\n"); 7040 bdev_set_qos_limit_done(ctx, -ENOMEM); 7041 return; 7042 } 7043 } 7044 7045 if (bdev->internal.qos->thread == NULL) { 7046 /* Enabling */ 7047 bdev_set_qos_rate_limits(bdev, limits); 7048 7049 spdk_for_each_channel(__bdev_to_io_dev(bdev), 7050 bdev_enable_qos_msg, ctx, 7051 bdev_enable_qos_done); 7052 } else { 7053 /* Updating */ 7054 bdev_set_qos_rate_limits(bdev, limits); 7055 7056 spdk_thread_send_msg(bdev->internal.qos->thread, 7057 bdev_update_qos_rate_limit_msg, ctx); 7058 } 7059 } else { 7060 if (bdev->internal.qos != NULL) { 7061 bdev_set_qos_rate_limits(bdev, limits); 7062 7063 /* Disabling */ 7064 spdk_for_each_channel(__bdev_to_io_dev(bdev), 7065 bdev_disable_qos_msg, ctx, 7066 bdev_disable_qos_msg_done); 7067 } else { 7068 pthread_mutex_unlock(&bdev->internal.mutex); 7069 bdev_set_qos_limit_done(ctx, 0); 7070 return; 7071 } 7072 } 7073 7074 pthread_mutex_unlock(&bdev->internal.mutex); 7075 } 7076 7077 struct spdk_bdev_histogram_ctx { 7078 spdk_bdev_histogram_status_cb cb_fn; 7079 void *cb_arg; 7080 struct spdk_bdev *bdev; 7081 int status; 7082 }; 7083 7084 static void 7085 bdev_histogram_disable_channel_cb(struct spdk_io_channel_iter *i, int status) 7086 { 7087 struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7088 7089 pthread_mutex_lock(&ctx->bdev->internal.mutex); 7090 ctx->bdev->internal.histogram_in_progress = false; 7091 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 7092 ctx->cb_fn(ctx->cb_arg, ctx->status); 7093 free(ctx); 7094 } 7095 7096 static void 7097 bdev_histogram_disable_channel(struct spdk_io_channel_iter *i) 7098 { 7099 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 7100 struct spdk_bdev_channel *ch = 
spdk_io_channel_get_ctx(_ch); 7101 7102 if (ch->histogram != NULL) { 7103 spdk_histogram_data_free(ch->histogram); 7104 ch->histogram = NULL; 7105 } 7106 spdk_for_each_channel_continue(i, 0); 7107 } 7108 7109 static void 7110 bdev_histogram_enable_channel_cb(struct spdk_io_channel_iter *i, int status) 7111 { 7112 struct spdk_bdev_histogram_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7113 7114 if (status != 0) { 7115 ctx->status = status; 7116 ctx->bdev->internal.histogram_enabled = false; 7117 spdk_for_each_channel(__bdev_to_io_dev(ctx->bdev), bdev_histogram_disable_channel, ctx, 7118 bdev_histogram_disable_channel_cb); 7119 } else { 7120 pthread_mutex_lock(&ctx->bdev->internal.mutex); 7121 ctx->bdev->internal.histogram_in_progress = false; 7122 pthread_mutex_unlock(&ctx->bdev->internal.mutex); 7123 ctx->cb_fn(ctx->cb_arg, ctx->status); 7124 free(ctx); 7125 } 7126 } 7127 7128 static void 7129 bdev_histogram_enable_channel(struct spdk_io_channel_iter *i) 7130 { 7131 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 7132 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7133 int status = 0; 7134 7135 if (ch->histogram == NULL) { 7136 ch->histogram = spdk_histogram_data_alloc(); 7137 if (ch->histogram == NULL) { 7138 status = -ENOMEM; 7139 } 7140 } 7141 7142 spdk_for_each_channel_continue(i, status); 7143 } 7144 7145 void 7146 spdk_bdev_histogram_enable(struct spdk_bdev *bdev, spdk_bdev_histogram_status_cb cb_fn, 7147 void *cb_arg, bool enable) 7148 { 7149 struct spdk_bdev_histogram_ctx *ctx; 7150 7151 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_ctx)); 7152 if (ctx == NULL) { 7153 cb_fn(cb_arg, -ENOMEM); 7154 return; 7155 } 7156 7157 ctx->bdev = bdev; 7158 ctx->status = 0; 7159 ctx->cb_fn = cb_fn; 7160 ctx->cb_arg = cb_arg; 7161 7162 pthread_mutex_lock(&bdev->internal.mutex); 7163 if (bdev->internal.histogram_in_progress) { 7164 pthread_mutex_unlock(&bdev->internal.mutex); 7165 free(ctx); 7166 cb_fn(cb_arg, -EAGAIN); 7167 return; 7168 } 7169 7170 bdev->internal.histogram_in_progress = true; 7171 pthread_mutex_unlock(&bdev->internal.mutex); 7172 7173 bdev->internal.histogram_enabled = enable; 7174 7175 if (enable) { 7176 /* Allocate histogram for each channel */ 7177 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_enable_channel, ctx, 7178 bdev_histogram_enable_channel_cb); 7179 } else { 7180 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_disable_channel, ctx, 7181 bdev_histogram_disable_channel_cb); 7182 } 7183 } 7184 7185 struct spdk_bdev_histogram_data_ctx { 7186 spdk_bdev_histogram_data_cb cb_fn; 7187 void *cb_arg; 7188 struct spdk_bdev *bdev; 7189 /** merged histogram data from all channels */ 7190 struct spdk_histogram_data *histogram; 7191 }; 7192 7193 static void 7194 bdev_histogram_get_channel_cb(struct spdk_io_channel_iter *i, int status) 7195 { 7196 struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7197 7198 ctx->cb_fn(ctx->cb_arg, status, ctx->histogram); 7199 free(ctx); 7200 } 7201 7202 static void 7203 bdev_histogram_get_channel(struct spdk_io_channel_iter *i) 7204 { 7205 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 7206 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7207 struct spdk_bdev_histogram_data_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7208 int status = 0; 7209 7210 if (ch->histogram == NULL) { 7211 status = -EFAULT; 7212 } else { 7213 spdk_histogram_data_merge(ctx->histogram, ch->histogram); 7214 } 7215 7216 spdk_for_each_channel_continue(i, 
status); 7217 } 7218 7219 void 7220 spdk_bdev_histogram_get(struct spdk_bdev *bdev, struct spdk_histogram_data *histogram, 7221 spdk_bdev_histogram_data_cb cb_fn, 7222 void *cb_arg) 7223 { 7224 struct spdk_bdev_histogram_data_ctx *ctx; 7225 7226 ctx = calloc(1, sizeof(struct spdk_bdev_histogram_data_ctx)); 7227 if (ctx == NULL) { 7228 cb_fn(cb_arg, -ENOMEM, NULL); 7229 return; 7230 } 7231 7232 ctx->bdev = bdev; 7233 ctx->cb_fn = cb_fn; 7234 ctx->cb_arg = cb_arg; 7235 7236 ctx->histogram = histogram; 7237 7238 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_histogram_get_channel, ctx, 7239 bdev_histogram_get_channel_cb); 7240 } 7241 7242 size_t 7243 spdk_bdev_get_media_events(struct spdk_bdev_desc *desc, struct spdk_bdev_media_event *events, 7244 size_t max_events) 7245 { 7246 struct media_event_entry *entry; 7247 size_t num_events = 0; 7248 7249 for (; num_events < max_events; ++num_events) { 7250 entry = TAILQ_FIRST(&desc->pending_media_events); 7251 if (entry == NULL) { 7252 break; 7253 } 7254 7255 events[num_events] = entry->event; 7256 TAILQ_REMOVE(&desc->pending_media_events, entry, tailq); 7257 TAILQ_INSERT_TAIL(&desc->free_media_events, entry, tailq); 7258 } 7259 7260 return num_events; 7261 } 7262 7263 int 7264 spdk_bdev_push_media_events(struct spdk_bdev *bdev, const struct spdk_bdev_media_event *events, 7265 size_t num_events) 7266 { 7267 struct spdk_bdev_desc *desc; 7268 struct media_event_entry *entry; 7269 size_t event_id; 7270 int rc = 0; 7271 7272 assert(bdev->media_events); 7273 7274 pthread_mutex_lock(&bdev->internal.mutex); 7275 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 7276 if (desc->write) { 7277 break; 7278 } 7279 } 7280 7281 if (desc == NULL || desc->media_events_buffer == NULL) { 7282 rc = -ENODEV; 7283 goto out; 7284 } 7285 7286 for (event_id = 0; event_id < num_events; ++event_id) { 7287 entry = TAILQ_FIRST(&desc->free_media_events); 7288 if (entry == NULL) { 7289 break; 7290 } 7291 7292 TAILQ_REMOVE(&desc->free_media_events, entry, tailq); 7293 TAILQ_INSERT_TAIL(&desc->pending_media_events, entry, tailq); 7294 entry->event = events[event_id]; 7295 } 7296 7297 rc = event_id; 7298 out: 7299 pthread_mutex_unlock(&bdev->internal.mutex); 7300 return rc; 7301 } 7302 7303 void 7304 spdk_bdev_notify_media_management(struct spdk_bdev *bdev) 7305 { 7306 struct spdk_bdev_desc *desc; 7307 7308 pthread_mutex_lock(&bdev->internal.mutex); 7309 TAILQ_FOREACH(desc, &bdev->internal.open_descs, link) { 7310 if (!TAILQ_EMPTY(&desc->pending_media_events)) { 7311 desc->callback.event_fn(SPDK_BDEV_EVENT_MEDIA_MANAGEMENT, bdev, 7312 desc->callback.ctx); 7313 } 7314 } 7315 pthread_mutex_unlock(&bdev->internal.mutex); 7316 } 7317 7318 struct locked_lba_range_ctx { 7319 struct lba_range range; 7320 struct spdk_bdev *bdev; 7321 struct lba_range *current_range; 7322 struct lba_range *owner_range; 7323 struct spdk_poller *poller; 7324 lock_range_cb cb_fn; 7325 void *cb_arg; 7326 }; 7327 7328 static void 7329 bdev_lock_error_cleanup_cb(struct spdk_io_channel_iter *i, int status) 7330 { 7331 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7332 7333 ctx->cb_fn(ctx->cb_arg, -ENOMEM); 7334 free(ctx); 7335 } 7336 7337 static void 7338 bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i); 7339 7340 static void 7341 bdev_lock_lba_range_cb(struct spdk_io_channel_iter *i, int status) 7342 { 7343 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7344 struct spdk_bdev *bdev = ctx->bdev; 7345 7346 if (status == -ENOMEM) { 7347 /* One of 
the channels could not allocate a range object. 7348 * So we have to go back and clean up any ranges that were 7349 * allocated successfully before we return error status to 7350 * the caller. We can reuse the unlock function to do that 7351 * clean up. 7352 */ 7353 spdk_for_each_channel(__bdev_to_io_dev(bdev), 7354 bdev_unlock_lba_range_get_channel, ctx, 7355 bdev_lock_error_cleanup_cb); 7356 return; 7357 } 7358 7359 /* All channels have locked this range and no I/O overlapping the range 7360 * are outstanding! Set the owner_ch for the range object for the 7361 * locking channel, so that this channel will know that it is allowed 7362 * to write to this range. 7363 */ 7364 ctx->owner_range->owner_ch = ctx->range.owner_ch; 7365 ctx->cb_fn(ctx->cb_arg, status); 7366 7367 /* Don't free the ctx here. Its range is in the bdev's global list of 7368 * locked ranges still, and will be removed and freed when this range 7369 * is later unlocked. 7370 */ 7371 } 7372 7373 static int 7374 bdev_lock_lba_range_check_io(void *_i) 7375 { 7376 struct spdk_io_channel_iter *i = _i; 7377 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 7378 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7379 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7380 struct lba_range *range = ctx->current_range; 7381 struct spdk_bdev_io *bdev_io; 7382 7383 spdk_poller_unregister(&ctx->poller); 7384 7385 /* The range is now in the locked_ranges, so no new IO can be submitted to this 7386 * range. But we need to wait until any outstanding IO overlapping with this range 7387 * are completed. 7388 */ 7389 TAILQ_FOREACH(bdev_io, &ch->io_submitted, internal.ch_link) { 7390 if (bdev_io_range_is_locked(bdev_io, range)) { 7391 ctx->poller = SPDK_POLLER_REGISTER(bdev_lock_lba_range_check_io, i, 100); 7392 return SPDK_POLLER_BUSY; 7393 } 7394 } 7395 7396 spdk_for_each_channel_continue(i, 0); 7397 return SPDK_POLLER_BUSY; 7398 } 7399 7400 static void 7401 bdev_lock_lba_range_get_channel(struct spdk_io_channel_iter *i) 7402 { 7403 struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i); 7404 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7405 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7406 struct lba_range *range; 7407 7408 TAILQ_FOREACH(range, &ch->locked_ranges, tailq) { 7409 if (range->length == ctx->range.length && 7410 range->offset == ctx->range.offset && 7411 range->locked_ctx == ctx->range.locked_ctx) { 7412 /* This range already exists on this channel, so don't add 7413 * it again. This can happen when a new channel is created 7414 * while the for_each_channel operation is in progress. 7415 * Do not check for outstanding I/O in that case, since the 7416 * range was locked before any I/O could be submitted to the 7417 * new channel. 7418 */ 7419 spdk_for_each_channel_continue(i, 0); 7420 return; 7421 } 7422 } 7423 7424 range = calloc(1, sizeof(*range)); 7425 if (range == NULL) { 7426 spdk_for_each_channel_continue(i, -ENOMEM); 7427 return; 7428 } 7429 7430 range->length = ctx->range.length; 7431 range->offset = ctx->range.offset; 7432 range->locked_ctx = ctx->range.locked_ctx; 7433 ctx->current_range = range; 7434 if (ctx->range.owner_ch == ch) { 7435 /* This is the range object for the channel that will hold 7436 * the lock. Store it in the ctx object so that we can easily 7437 * set its owner_ch after the lock is finally acquired. 
7438 */ 7439 ctx->owner_range = range; 7440 } 7441 TAILQ_INSERT_TAIL(&ch->locked_ranges, range, tailq); 7442 bdev_lock_lba_range_check_io(i); 7443 } 7444 7445 static void 7446 bdev_lock_lba_range_ctx(struct spdk_bdev *bdev, struct locked_lba_range_ctx *ctx) 7447 { 7448 assert(spdk_get_thread() == spdk_io_channel_get_thread(ctx->range.owner_ch->channel)); 7449 7450 /* We will add a copy of this range to each channel now. */ 7451 spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_lock_lba_range_get_channel, ctx, 7452 bdev_lock_lba_range_cb); 7453 } 7454 7455 static bool 7456 bdev_lba_range_overlaps_tailq(struct lba_range *range, lba_range_tailq_t *tailq) 7457 { 7458 struct lba_range *r; 7459 7460 TAILQ_FOREACH(r, tailq, tailq) { 7461 if (bdev_lba_range_overlapped(range, r)) { 7462 return true; 7463 } 7464 } 7465 return false; 7466 } 7467 7468 static int 7469 bdev_lock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch, 7470 uint64_t offset, uint64_t length, 7471 lock_range_cb cb_fn, void *cb_arg) 7472 { 7473 struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc); 7474 struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch); 7475 struct locked_lba_range_ctx *ctx; 7476 7477 if (cb_arg == NULL) { 7478 SPDK_ERRLOG("cb_arg must not be NULL\n"); 7479 return -EINVAL; 7480 } 7481 7482 ctx = calloc(1, sizeof(*ctx)); 7483 if (ctx == NULL) { 7484 return -ENOMEM; 7485 } 7486 7487 ctx->range.offset = offset; 7488 ctx->range.length = length; 7489 ctx->range.owner_ch = ch; 7490 ctx->range.locked_ctx = cb_arg; 7491 ctx->bdev = bdev; 7492 ctx->cb_fn = cb_fn; 7493 ctx->cb_arg = cb_arg; 7494 7495 pthread_mutex_lock(&bdev->internal.mutex); 7496 if (bdev_lba_range_overlaps_tailq(&ctx->range, &bdev->internal.locked_ranges)) { 7497 /* There is an active lock overlapping with this range. 7498 * Put it on the pending list until this range no 7499 * longer overlaps with another. 7500 */ 7501 TAILQ_INSERT_TAIL(&bdev->internal.pending_locked_ranges, &ctx->range, tailq); 7502 } else { 7503 TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, &ctx->range, tailq); 7504 bdev_lock_lba_range_ctx(bdev, ctx); 7505 } 7506 pthread_mutex_unlock(&bdev->internal.mutex); 7507 return 0; 7508 } 7509 7510 static void 7511 bdev_lock_lba_range_ctx_msg(void *_ctx) 7512 { 7513 struct locked_lba_range_ctx *ctx = _ctx; 7514 7515 bdev_lock_lba_range_ctx(ctx->bdev, ctx); 7516 } 7517 7518 static void 7519 bdev_unlock_lba_range_cb(struct spdk_io_channel_iter *i, int status) 7520 { 7521 struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i); 7522 struct locked_lba_range_ctx *pending_ctx; 7523 struct spdk_bdev_channel *ch = ctx->range.owner_ch; 7524 struct spdk_bdev *bdev = ch->bdev; 7525 struct lba_range *range, *tmp; 7526 7527 pthread_mutex_lock(&bdev->internal.mutex); 7528 /* Check if there are any pending locked ranges that overlap with this range 7529 * that was just unlocked. If there are, check that it doesn't overlap with any 7530 * other locked ranges before calling bdev_lock_lba_range_ctx which will start 7531 * the lock process. 
static void
bdev_unlock_lba_range_cb(struct spdk_io_channel_iter *i, int status)
{
	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	struct locked_lba_range_ctx *pending_ctx;
	struct spdk_bdev_channel *ch = ctx->range.owner_ch;
	struct spdk_bdev *bdev = ch->bdev;
	struct lba_range *range, *tmp;

	pthread_mutex_lock(&bdev->internal.mutex);
	/* Check if there are any pending locked ranges that overlap with the range
	 * that was just unlocked.  If there are, check that each such pending range
	 * doesn't overlap with any other locked range before calling
	 * bdev_lock_lba_range_ctx, which will start the lock process.
	 */
	TAILQ_FOREACH_SAFE(range, &bdev->internal.pending_locked_ranges, tailq, tmp) {
		if (bdev_lba_range_overlapped(range, &ctx->range) &&
		    !bdev_lba_range_overlaps_tailq(range, &bdev->internal.locked_ranges)) {
			TAILQ_REMOVE(&bdev->internal.pending_locked_ranges, range, tailq);
			pending_ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
			TAILQ_INSERT_TAIL(&bdev->internal.locked_ranges, range, tailq);
			spdk_thread_send_msg(spdk_io_channel_get_thread(pending_ctx->range.owner_ch->channel),
					     bdev_lock_lba_range_ctx_msg, pending_ctx);
		}
	}
	pthread_mutex_unlock(&bdev->internal.mutex);

	ctx->cb_fn(ctx->cb_arg, status);
	free(ctx);
}

static void
bdev_unlock_lba_range_get_channel(struct spdk_io_channel_iter *i)
{
	struct spdk_io_channel *_ch = spdk_io_channel_iter_get_channel(i);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct locked_lba_range_ctx *ctx = spdk_io_channel_iter_get_ctx(i);
	TAILQ_HEAD(, spdk_bdev_io) io_locked;
	struct spdk_bdev_io *bdev_io;
	struct lba_range *range;

	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (ctx->range.offset == range->offset &&
		    ctx->range.length == range->length &&
		    ctx->range.locked_ctx == range->locked_ctx) {
			TAILQ_REMOVE(&ch->locked_ranges, range, tailq);
			free(range);
			break;
		}
	}

	/* Note: we should almost always be able to assert that the range specified
	 * was found.  But there are some very rare corner cases where a new channel
	 * gets created simultaneously with a range unlock, where this function
	 * would execute on that new channel and wouldn't have the range.
	 * We also use this to clean up range allocations when a later allocation
	 * fails in the locking path.
	 * So we can't actually assert() here.
	 */

	/* Swap the locked I/O into a temporary list, and then try to submit it again.
	 * We could hyper-optimize this to only resubmit locked I/O that overlap
	 * with the range that was just unlocked, but this isn't a performance path so
	 * we go for simplicity here.
	 */
	TAILQ_INIT(&io_locked);
	TAILQ_SWAP(&ch->io_locked, &io_locked, spdk_bdev_io, internal.ch_link);
	while (!TAILQ_EMPTY(&io_locked)) {
		bdev_io = TAILQ_FIRST(&io_locked);
		TAILQ_REMOVE(&io_locked, bdev_io, internal.ch_link);
		bdev_io_submit(bdev_io);
	}

	spdk_for_each_channel_continue(i, 0);
}
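/* Releases a range previously locked with bdev_lock_lba_range().  The offset,
 * length, and cb_arg must exactly match the values used when the lock was
 * taken, and the call must come from the channel that owns the lock; otherwise
 * -EINVAL is returned.
 */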
static int
bdev_unlock_lba_range(struct spdk_bdev_desc *desc, struct spdk_io_channel *_ch,
		      uint64_t offset, uint64_t length,
		      lock_range_cb cb_fn, void *cb_arg)
{
	struct spdk_bdev *bdev = spdk_bdev_desc_get_bdev(desc);
	struct spdk_bdev_channel *ch = spdk_io_channel_get_ctx(_ch);
	struct locked_lba_range_ctx *ctx;
	struct lba_range *range;
	bool range_found = false;

	/* Let's make sure the specified channel actually has a lock on
	 * the specified range.  Note that the range must match exactly.
	 */
	TAILQ_FOREACH(range, &ch->locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    range->owner_ch == ch && range->locked_ctx == cb_arg) {
			range_found = true;
			break;
		}
	}

	if (!range_found) {
		return -EINVAL;
	}

	pthread_mutex_lock(&bdev->internal.mutex);
	/* We confirmed that this channel has locked the specified range.  To
	 * start the unlock process, we find the range in the bdev's locked_ranges
	 * and remove it.  This ensures new channels don't inherit the locked range.
	 * Then we will send a message to each channel (including the one specified
	 * here) to remove the range from its per-channel list.
	 */
	TAILQ_FOREACH(range, &bdev->internal.locked_ranges, tailq) {
		if (range->offset == offset && range->length == length &&
		    range->locked_ctx == cb_arg) {
			break;
		}
	}
	if (range == NULL) {
		assert(false);
		pthread_mutex_unlock(&bdev->internal.mutex);
		return -EINVAL;
	}
	TAILQ_REMOVE(&bdev->internal.locked_ranges, range, tailq);
	ctx = SPDK_CONTAINEROF(range, struct locked_lba_range_ctx, range);
	pthread_mutex_unlock(&bdev->internal.mutex);

	ctx->cb_fn = cb_fn;
	ctx->cb_arg = cb_arg;

	spdk_for_each_channel(__bdev_to_io_dev(bdev), bdev_unlock_lba_range_get_channel, ctx,
			      bdev_unlock_lba_range_cb);
	return 0;
}

int
spdk_bdev_get_memory_domains(struct spdk_bdev *bdev, struct spdk_memory_domain **domains,
			     int array_size)
{
	if (!bdev) {
		return -EINVAL;
	}

	if (bdev->fn_table->get_memory_domains) {
		return bdev->fn_table->get_memory_domains(bdev->ctxt, domains, array_size);
	}

	return 0;
}

SPDK_LOG_REGISTER_COMPONENT(bdev)

SPDK_TRACE_REGISTER_FN(bdev_trace, "bdev", TRACE_GROUP_BDEV)
{
	struct spdk_trace_tpoint_opts opts[] = {
		{
			"BDEV_IO_START", TRACE_BDEV_IO_START,
			OWNER_BDEV, OBJECT_BDEV_IO, 1,
			{
				{ "type", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 },
				{ "offset", SPDK_TRACE_ARG_TYPE_INT, 8 },
				{ "len", SPDK_TRACE_ARG_TYPE_INT, 8 }
			}
		},
		{
			"BDEV_IO_DONE", TRACE_BDEV_IO_DONE,
			OWNER_BDEV, OBJECT_BDEV_IO, 0,
			{{ "ctx", SPDK_TRACE_ARG_TYPE_PTR, 8 }}
		},
		{
			"BDEV_IOCH_CREATE", TRACE_BDEV_IOCH_CREATE,
			OWNER_BDEV, OBJECT_NONE, 1,
			{
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8}
			}
		},
		{
			"BDEV_IOCH_DESTROY", TRACE_BDEV_IOCH_DESTROY,
			OWNER_BDEV, OBJECT_NONE, 0,
			{
				{ "name", SPDK_TRACE_ARG_TYPE_STR, 40 },
				{ "thread_id", SPDK_TRACE_ARG_TYPE_INT, 8}
			}
		},
	};

	spdk_trace_register_owner(OWNER_BDEV, 'b');
	spdk_trace_register_object(OBJECT_BDEV_IO, 'i');
	spdk_trace_register_description_ext(opts, SPDK_COUNTOF(opts));
}
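/* Illustrative sketch (not part of this file's API surface): a hypothetical
 * caller that needs exclusive access to a block range, e.g. for an emulated
 * compare-and-write, would pair the two helpers above roughly as follows.
 * The names range_locked_cb, range_unlocked_cb, and my_ctx are placeholders.
 *
 *	static void range_locked_cb(void *cb_arg, int status)
 *	{
 *		// status == 0: no other channel can submit I/O overlapping the
 *		// range until bdev_unlock_lba_range() is called with the same
 *		// offset, length, and cb_arg from this same channel.
 *	}
 *
 *	rc = bdev_lock_lba_range(desc, ch, offset, num_blocks,
 *				 range_locked_cb, my_ctx);
 *	...
 *	rc = bdev_unlock_lba_range(desc, ch, offset, num_blocks,
 *				   range_unlocked_cb, my_ctx);
 *
 * Note that cb_arg doubles as the lock's locked_ctx, so the same pointer must
 * be passed to both calls.
 */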